Skip to content

Commit

Permalink
improved online covariance computation. TODO: reduction kernel
Browse files Browse the repository at this point in the history
  • Loading branch information
amock committed Oct 27, 2022
1 parent 250d787 commit 32ecd57
Show file tree
Hide file tree
Showing 6 changed files with 512 additions and 30 deletions.
8 changes: 8 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -493,6 +493,14 @@ endif(BUILD_EXAMPLES)

### TESTS ###

add_executable(rmcl_test_cov_online src/nodes/tests/cov_online.cpp)

## Specify libraries to link a library or executable target against
target_link_libraries(rmcl_test_cov_online
rmcl
rmcl_cuda
)


# ###### CORRECTION GPU #######
# if(${rmagine_optix_FOUND})
Expand Down
21 changes: 21 additions & 0 deletions include/rmcl/math/math_batched.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -186,6 +186,27 @@ void means_covs_online_batched(
rmagine::MemoryView<unsigned int, rmagine::VRAM_CUDA>& Ncorr);


/**
* @brief one-pass means and covariance computation
*
* @param dataset_points
* @param model_points
* @param mask
* @param dataset_center
* @param model_center
* @param Cs
* @param Ncorr
*/
void means_covs_online_approx_batched(
const rmagine::MemoryView<rmagine::Vector, rmagine::VRAM_CUDA>& dataset_points, // from
const rmagine::MemoryView<rmagine::Vector, rmagine::VRAM_CUDA>& model_points, // to
const rmagine::MemoryView<unsigned int, rmagine::VRAM_CUDA>& mask,
rmagine::MemoryView<rmagine::Vector, rmagine::VRAM_CUDA>& dataset_center,
rmagine::MemoryView<rmagine::Vector, rmagine::VRAM_CUDA>& model_center,
rmagine::MemoryView<rmagine::Matrix3x3, rmagine::VRAM_CUDA>& Cs,
rmagine::MemoryView<unsigned int, rmagine::VRAM_CUDA>& Ncorr);


} // namespace rmcl

#endif // RMCL_MATH_BATCHED_CUH
21 changes: 21 additions & 0 deletions include/rmcl/math/math_batched.h
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,27 @@ void means_covs_online_batched(
rmagine::MemoryView<unsigned int, rmagine::RAM>& Ncorr);


/**
* @brief one-pass means and covariance computation
*
* @param dataset_points
* @param model_points
* @param mask
* @param dataset_center
* @param model_center
* @param Cs
* @param Ncorr
*/
void means_covs_online_approx_batched(
const rmagine::MemoryView<rmagine::Vector, rmagine::RAM>& dataset_points, // from
const rmagine::MemoryView<rmagine::Vector, rmagine::RAM>& model_points, // to
const rmagine::MemoryView<unsigned int, rmagine::RAM>& mask,
rmagine::MemoryView<rmagine::Vector, rmagine::RAM>& dataset_center,
rmagine::MemoryView<rmagine::Vector, rmagine::RAM>& model_center,
rmagine::MemoryView<rmagine::Matrix3x3, rmagine::RAM>& Cs,
rmagine::MemoryView<unsigned int, rmagine::RAM>& Ncorr);


} // namespace rmcl

#endif // RMCL_MATH_BATCHED_CUH
159 changes: 159 additions & 0 deletions src/nodes/tests/cov_online.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,159 @@

#include <rmcl/math/math_batched.h>
#include <random>
#include <iostream>
#include <rmagine/util/prints.h>
#include <rmagine/util/StopWatch.hpp>
#include <rmcl/math/math_batched.cuh>

namespace rm = rmagine;

void fill_random(rm::MemoryView<rm::Vector, rm::RAM> points)
{
std::random_device rd; // Will be used to obtain a seed for the random number engine
std::mt19937 gen(rd()); // Standard mersenne_twister_engine seeded with rd()
std::uniform_real_distribution<> dis(0.0, 1.0);
for(unsigned int i = 0; i < points.size(); i++)
{
points[i].x = dis(gen);
points[i].y = dis(gen);
points[i].z = dis(gen);
}
}

void fill_sequence(rm::MemoryView<rm::Vector, rm::RAM> points)
{
for(unsigned int i = 0; i < points.size(); i++)
{
points[i].x = (float)i;
points[i].y = (float)i;
points[i].z = (float)i;
}
}

void fill_ones(rm::MemoryView<unsigned int> mask)
{
for(unsigned int i=0; i < mask.size(); i++)
{
mask[i] = i % 2;
}
}

int main(int argc, char** argv)
{
rm::Mem<rm::Vector> dataset(128);
// fill_random(dataset);
fill_sequence(dataset);

rm::Mem<rm::Vector> model(dataset.size());

rm::Mem<unsigned int> mask(dataset.size());
fill_ones(mask);


rm::Transform T;
T.t = {1.0, 2.0, 3.0};
T.R.set(rm::EulerAngles{0.0, 0.1, 0.1});

for(unsigned int i=0; i<dataset.size(); i++)
{
model[i] = T * dataset[i];
}


rm::StopWatchHR sw;
double el;
rm::Mem<rm::Vector> ds(1);
rm::Mem<rm::Vector> ms(1);
rm::Mem<rm::Matrix3x3> Cs(1);
rm::Mem<unsigned int> Ncorr(1);

std::cout << "CPU" << std::endl;
{
std::cout << "means_covs_batched" << std::endl;
sw();
rmcl::means_covs_batched(dataset, model, mask, ds, ms, Cs, Ncorr);
el = sw();
std::cout << "- runtime: " << el * 1000.0 << " ms" << std::endl;
std::cout << ds[0] << std::endl;
std::cout << ms[0] << std::endl;
std::cout << Cs[0] << std::endl;
std::cout << Ncorr[0] << std::endl;

std::cout << "means_covs_online_approx_batched" << std::endl;
sw();
rmcl::means_covs_online_approx_batched(dataset, model, mask, ds, ms, Cs, Ncorr);
el = sw();
std::cout << "- runtime: " << el * 1000.0 << " ms" << std::endl;
std::cout << ds[0] << std::endl;
std::cout << ms[0] << std::endl;
std::cout << Cs[0] << std::endl;
std::cout << Ncorr[0] << std::endl;

std::cout << "means_covs_online_batched" << std::endl;
sw();
rmcl::means_covs_online_batched(dataset, model, mask, ds, ms, Cs, Ncorr);
el = sw();
std::cout << "- runtime: " << el * 1000.0 << " ms" << std::endl;
std::cout << ds[0] << std::endl;
std::cout << ms[0] << std::endl;
std::cout << Cs[0] << std::endl;
std::cout << Ncorr[0] << std::endl;
}

std::cout << std::endl;
std::cout << "GPU" << std::endl;
{
rm::Mem<rm::Vector, rm::VRAM_CUDA> dataset_ = dataset;
rm::Mem<rm::Vector, rm::VRAM_CUDA> model_ = model;
rm::Mem<unsigned int, rm::VRAM_CUDA> mask_ = mask;

rm::Mem<rm::Vector, rm::VRAM_CUDA> ds_(1);
rm::Mem<rm::Vector, rm::VRAM_CUDA> ms_(1);
rm::Mem<rm::Matrix3x3, rm::VRAM_CUDA> Cs_(1);
rm::Mem<unsigned int, rm::VRAM_CUDA> Ncorr_(1);


std::cout << "means_covs_batched" << std::endl;
sw();
rmcl::means_covs_batched(dataset_, model_, mask_, ds_, ms_, Cs_, Ncorr_);
el = sw();
std::cout << "- runtime: " << el * 1000.0 << " ms" << std::endl;
ds = ds_; ms = ms_; Cs = Cs_; Ncorr = Ncorr_;
std::cout << ds[0] << std::endl;
std::cout << ms[0] << std::endl;
std::cout << Cs[0] << std::endl;
std::cout << Ncorr[0] << std::endl;

std::cout << "means_covs_online_approx_batched" << std::endl;
sw();
rmcl::means_covs_online_approx_batched(dataset_, model_, mask_, ds_, ms_, Cs_, Ncorr_);
el = sw();
std::cout << "- runtime: " << el * 1000.0 << " ms" << std::endl;
ds = ds_; ms = ms_; Cs = Cs_; Ncorr = Ncorr_;
std::cout << ds[0] << std::endl;
std::cout << ms[0] << std::endl;
std::cout << Cs[0] << std::endl;
std::cout << Ncorr[0] << std::endl;

std::cout << "means_covs_online_batched" << std::endl;
sw();
rmcl::means_covs_online_batched(dataset_, model_, mask_, ds_, ms_, Cs_, Ncorr_);
el = sw();
std::cout << "- runtime: " << el * 1000.0 << " ms" << std::endl;
ds = ds_; ms = ms_; Cs = Cs_; Ncorr = Ncorr_;
std::cout << ds[0] << std::endl;
std::cout << ms[0] << std::endl;
std::cout << Cs[0] << std::endl;
std::cout << Ncorr[0] << std::endl;
}








return 0;
}
94 changes: 83 additions & 11 deletions src/rmcl/math/math_batched.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@
#include <rmagine/math/math_batched.h>

namespace rm = rmagine;

namespace rmcl
{

Expand Down Expand Up @@ -34,29 +33,31 @@ void means_covs_batched(
{
if(mask_batch[j] > 0)
{
const rm::Vector dD = data_batch[j] - d_mean;
const rm::Vector dM = model_batch[j] - m_mean;
float N = static_cast<float>(n_corr + 1);

// reduction
// sum
n_corr++;
d_mean += dD / N;
m_mean += dM / N;
d_mean += data_batch[j];
m_mean += model_batch[j];
}
}

Ncorr[i] = n_corr;

if(n_corr > 0)
{
const float Ncorr_f = static_cast<float>(n_corr);
d_mean /= Ncorr_f;
m_mean /= Ncorr_f;

rm::Matrix3x3 C = rm::Matrix3x3::Zeros();

for(size_t j = 0; j < batchSize; j++)
{
C += (model_batch[j] - m_mean).multT(data_batch[j] - d_mean);
if(mask_batch[j] > 0)
{
C += (model_batch[j] - m_mean).multT(data_batch[j] - d_mean);
}
}

const float Ncorr_f = static_cast<float>(n_corr);

C /= Ncorr_f;

dataset_center[i] = d_mean;
Expand All @@ -82,6 +83,77 @@ void means_covs_online_batched(
unsigned int Nbatches = Ncorr.size();
unsigned int batchSize = dataset_points.size() / Nbatches;

#pragma omp parallel for default(shared) if(Nbatches > 4)
for(size_t i=0; i<Nbatches; i++)
{
const rm::MemoryView<rm::Vector> data_batch = dataset_points(i * batchSize, (i+1) * batchSize);
const rm::MemoryView<rm::Vector> model_batch = model_points(i * batchSize, (i+1) * batchSize);
const rm::MemoryView<unsigned int> mask_batch = mask(i * batchSize, (i+1) * batchSize);

rm::Vector d_mean = {0.0f, 0.0f, 0.0f};
rm::Vector m_mean = {0.0f, 0.0f, 0.0f};
rm::Matrix3x3 C = rm::Matrix3x3::Zeros();
unsigned int n_corr = 0;

for(size_t j=0; j<batchSize; j++)
{
if(mask_batch[j] > 0)
{
const float N_1 = static_cast<float>(n_corr);
const float N = static_cast<float>(n_corr + 1);

// update ncorr
n_corr++;

const rm::Vector Di = data_batch[j]; // read
const rm::Vector Mi = model_batch[j]; // read
const rm::Vector d_mean_old = d_mean; // read
const rm::Vector m_mean_old = m_mean; // read

// update means
// rm::Vector dD = Di - d_mean;
// rm::Vector dM = Mi - m_mean;

// save old means for covariance

const rm::Vector d_mean_new = d_mean_old + (Di - d_mean) / N;
const rm::Vector m_mean_new = m_mean_old + (Mi - m_mean) / N;

const float w1 = N_1/N;
const float w2 = 1.0/N;

auto P1 = (Mi - m_mean_new).multT(Di - d_mean_new);
auto P2 = (m_mean_old - m_mean_new).multT(d_mean_old - d_mean_new);

C = C * w1 + P1 * w2 + P2 * w1;

// C += ((Mi - m_mean_new).multT(Di - d_mean_new) - C) * 1.0 / N
// + (m_mean_old - m_mean_new).multT(d_mean_old - d_mean_new) * N_1/N;

d_mean = d_mean_new; // write
m_mean = m_mean_new; // write
}
}

Ncorr[i] = n_corr;
dataset_center[i] = d_mean;
model_center[i] = m_mean;
Cs[i] = C;
}
}

void means_covs_online_approx_batched(
const rm::MemoryView<rm::Vector, rm::RAM>& dataset_points, // from
const rm::MemoryView<rm::Vector, rm::RAM>& model_points, // to
const rm::MemoryView<unsigned int, rm::RAM>& mask,
rm::MemoryView<rm::Vector, rm::RAM>& dataset_center,
rm::MemoryView<rm::Vector, rm::RAM>& model_center,
rm::MemoryView<rm::Matrix3x3, rm::RAM>& Cs,
rm::MemoryView<unsigned int, rm::RAM>& Ncorr)
{
unsigned int Nbatches = Ncorr.size();
unsigned int batchSize = dataset_points.size() / Nbatches;

#pragma omp parallel for default(shared) if(Nbatches > 4)
for(size_t i=0; i<Nbatches; i++)
{
Expand Down
Loading

0 comments on commit 32ecd57

Please sign in to comment.