From 32ecd5764bddf6f60585a5099bf853ccf20e3dcd Mon Sep 17 00:00:00 2001 From: Alexander Mock Date: Thu, 27 Oct 2022 17:30:53 +0200 Subject: [PATCH] improved online covariance computation. TODO: reduction kernel --- CMakeLists.txt | 8 + include/rmcl/math/math_batched.cuh | 21 +++ include/rmcl/math/math_batched.h | 21 +++ src/nodes/tests/cov_online.cpp | 159 +++++++++++++++++++ src/rmcl/math/math_batched.cpp | 94 ++++++++++-- src/rmcl/math/math_batched.cu | 239 ++++++++++++++++++++++++++--- 6 files changed, 512 insertions(+), 30 deletions(-) create mode 100644 src/nodes/tests/cov_online.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 6b220a9..0e1a6fa 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -493,6 +493,14 @@ endif(BUILD_EXAMPLES) ### TESTS ### +add_executable(rmcl_test_cov_online src/nodes/tests/cov_online.cpp) + +## Specify libraries to link a library or executable target against +target_link_libraries(rmcl_test_cov_online + rmcl + rmcl_cuda +) + # ###### CORRECTION GPU ####### # if(${rmagine_optix_FOUND}) diff --git a/include/rmcl/math/math_batched.cuh b/include/rmcl/math/math_batched.cuh index 418493a..15c4228 100644 --- a/include/rmcl/math/math_batched.cuh +++ b/include/rmcl/math/math_batched.cuh @@ -186,6 +186,27 @@ void means_covs_online_batched( rmagine::MemoryView& Ncorr); +/** + * @brief one-pass means and covariance computation + * + * @param dataset_points + * @param model_points + * @param mask + * @param dataset_center + * @param model_center + * @param Cs + * @param Ncorr + */ +void means_covs_online_approx_batched( + const rmagine::MemoryView& dataset_points, // from + const rmagine::MemoryView& model_points, // to + const rmagine::MemoryView& mask, + rmagine::MemoryView& dataset_center, + rmagine::MemoryView& model_center, + rmagine::MemoryView& Cs, + rmagine::MemoryView& Ncorr); + + } // namespace rmcl #endif // RMCL_MATH_BATCHED_CUH \ No newline at end of file diff --git a/include/rmcl/math/math_batched.h b/include/rmcl/math/math_batched.h index 650352d..82d9cda 100644 --- a/include/rmcl/math/math_batched.h +++ b/include/rmcl/math/math_batched.h @@ -88,6 +88,27 @@ void means_covs_online_batched( rmagine::MemoryView& Ncorr); +/** + * @brief one-pass means and covariance computation + * + * @param dataset_points + * @param model_points + * @param mask + * @param dataset_center + * @param model_center + * @param Cs + * @param Ncorr + */ +void means_covs_online_approx_batched( + const rmagine::MemoryView& dataset_points, // from + const rmagine::MemoryView& model_points, // to + const rmagine::MemoryView& mask, + rmagine::MemoryView& dataset_center, + rmagine::MemoryView& model_center, + rmagine::MemoryView& Cs, + rmagine::MemoryView& Ncorr); + + } // namespace rmcl #endif // RMCL_MATH_BATCHED_CUH \ No newline at end of file diff --git a/src/nodes/tests/cov_online.cpp b/src/nodes/tests/cov_online.cpp new file mode 100644 index 0000000..5687f4d --- /dev/null +++ b/src/nodes/tests/cov_online.cpp @@ -0,0 +1,159 @@ + +#include +#include +#include +#include +#include +#include + +namespace rm = rmagine; + +void fill_random(rm::MemoryView points) +{ + std::random_device rd; // Will be used to obtain a seed for the random number engine + std::mt19937 gen(rd()); // Standard mersenne_twister_engine seeded with rd() + std::uniform_real_distribution<> dis(0.0, 1.0); + for(unsigned int i = 0; i < points.size(); i++) + { + points[i].x = dis(gen); + points[i].y = dis(gen); + points[i].z = dis(gen); + } +} + +void fill_sequence(rm::MemoryView points) +{ + for(unsigned int i = 0; i < points.size(); i++) + { + points[i].x = (float)i; + points[i].y = (float)i; + points[i].z = (float)i; + } +} + +void fill_ones(rm::MemoryView mask) +{ + for(unsigned int i=0; i < mask.size(); i++) + { + mask[i] = i % 2; + } +} + +int main(int argc, char** argv) +{ + rm::Mem dataset(128); + // fill_random(dataset); + fill_sequence(dataset); + + rm::Mem model(dataset.size()); + + rm::Mem mask(dataset.size()); + fill_ones(mask); + + + rm::Transform T; + T.t = {1.0, 2.0, 3.0}; + T.R.set(rm::EulerAngles{0.0, 0.1, 0.1}); + + for(unsigned int i=0; i ds(1); + rm::Mem ms(1); + rm::Mem Cs(1); + rm::Mem Ncorr(1); + + std::cout << "CPU" << std::endl; + { + std::cout << "means_covs_batched" << std::endl; + sw(); + rmcl::means_covs_batched(dataset, model, mask, ds, ms, Cs, Ncorr); + el = sw(); + std::cout << "- runtime: " << el * 1000.0 << " ms" << std::endl; + std::cout << ds[0] << std::endl; + std::cout << ms[0] << std::endl; + std::cout << Cs[0] << std::endl; + std::cout << Ncorr[0] << std::endl; + + std::cout << "means_covs_online_approx_batched" << std::endl; + sw(); + rmcl::means_covs_online_approx_batched(dataset, model, mask, ds, ms, Cs, Ncorr); + el = sw(); + std::cout << "- runtime: " << el * 1000.0 << " ms" << std::endl; + std::cout << ds[0] << std::endl; + std::cout << ms[0] << std::endl; + std::cout << Cs[0] << std::endl; + std::cout << Ncorr[0] << std::endl; + + std::cout << "means_covs_online_batched" << std::endl; + sw(); + rmcl::means_covs_online_batched(dataset, model, mask, ds, ms, Cs, Ncorr); + el = sw(); + std::cout << "- runtime: " << el * 1000.0 << " ms" << std::endl; + std::cout << ds[0] << std::endl; + std::cout << ms[0] << std::endl; + std::cout << Cs[0] << std::endl; + std::cout << Ncorr[0] << std::endl; + } + + std::cout << std::endl; + std::cout << "GPU" << std::endl; + { + rm::Mem dataset_ = dataset; + rm::Mem model_ = model; + rm::Mem mask_ = mask; + + rm::Mem ds_(1); + rm::Mem ms_(1); + rm::Mem Cs_(1); + rm::Mem Ncorr_(1); + + + std::cout << "means_covs_batched" << std::endl; + sw(); + rmcl::means_covs_batched(dataset_, model_, mask_, ds_, ms_, Cs_, Ncorr_); + el = sw(); + std::cout << "- runtime: " << el * 1000.0 << " ms" << std::endl; + ds = ds_; ms = ms_; Cs = Cs_; Ncorr = Ncorr_; + std::cout << ds[0] << std::endl; + std::cout << ms[0] << std::endl; + std::cout << Cs[0] << std::endl; + std::cout << Ncorr[0] << std::endl; + + std::cout << "means_covs_online_approx_batched" << std::endl; + sw(); + rmcl::means_covs_online_approx_batched(dataset_, model_, mask_, ds_, ms_, Cs_, Ncorr_); + el = sw(); + std::cout << "- runtime: " << el * 1000.0 << " ms" << std::endl; + ds = ds_; ms = ms_; Cs = Cs_; Ncorr = Ncorr_; + std::cout << ds[0] << std::endl; + std::cout << ms[0] << std::endl; + std::cout << Cs[0] << std::endl; + std::cout << Ncorr[0] << std::endl; + + std::cout << "means_covs_online_batched" << std::endl; + sw(); + rmcl::means_covs_online_batched(dataset_, model_, mask_, ds_, ms_, Cs_, Ncorr_); + el = sw(); + std::cout << "- runtime: " << el * 1000.0 << " ms" << std::endl; + ds = ds_; ms = ms_; Cs = Cs_; Ncorr = Ncorr_; + std::cout << ds[0] << std::endl; + std::cout << ms[0] << std::endl; + std::cout << Cs[0] << std::endl; + std::cout << Ncorr[0] << std::endl; + } + + + + + + + + + return 0; +} \ No newline at end of file diff --git a/src/rmcl/math/math_batched.cpp b/src/rmcl/math/math_batched.cpp index 07641a6..c541168 100644 --- a/src/rmcl/math/math_batched.cpp +++ b/src/rmcl/math/math_batched.cpp @@ -3,7 +3,6 @@ #include namespace rm = rmagine; - namespace rmcl { @@ -34,14 +33,10 @@ void means_covs_batched( { if(mask_batch[j] > 0) { - const rm::Vector dD = data_batch[j] - d_mean; - const rm::Vector dM = model_batch[j] - m_mean; - float N = static_cast(n_corr + 1); - - // reduction + // sum n_corr++; - d_mean += dD / N; - m_mean += dM / N; + d_mean += data_batch[j]; + m_mean += model_batch[j]; } } @@ -49,14 +44,20 @@ void means_covs_batched( if(n_corr > 0) { + const float Ncorr_f = static_cast(n_corr); + d_mean /= Ncorr_f; + m_mean /= Ncorr_f; + rm::Matrix3x3 C = rm::Matrix3x3::Zeros(); for(size_t j = 0; j < batchSize; j++) { - C += (model_batch[j] - m_mean).multT(data_batch[j] - d_mean); + if(mask_batch[j] > 0) + { + C += (model_batch[j] - m_mean).multT(data_batch[j] - d_mean); + } } - - const float Ncorr_f = static_cast(n_corr); + C /= Ncorr_f; dataset_center[i] = d_mean; @@ -82,6 +83,77 @@ void means_covs_online_batched( unsigned int Nbatches = Ncorr.size(); unsigned int batchSize = dataset_points.size() / Nbatches; + #pragma omp parallel for default(shared) if(Nbatches > 4) + for(size_t i=0; i data_batch = dataset_points(i * batchSize, (i+1) * batchSize); + const rm::MemoryView model_batch = model_points(i * batchSize, (i+1) * batchSize); + const rm::MemoryView mask_batch = mask(i * batchSize, (i+1) * batchSize); + + rm::Vector d_mean = {0.0f, 0.0f, 0.0f}; + rm::Vector m_mean = {0.0f, 0.0f, 0.0f}; + rm::Matrix3x3 C = rm::Matrix3x3::Zeros(); + unsigned int n_corr = 0; + + for(size_t j=0; j 0) + { + const float N_1 = static_cast(n_corr); + const float N = static_cast(n_corr + 1); + + // update ncorr + n_corr++; + + const rm::Vector Di = data_batch[j]; // read + const rm::Vector Mi = model_batch[j]; // read + const rm::Vector d_mean_old = d_mean; // read + const rm::Vector m_mean_old = m_mean; // read + + // update means + // rm::Vector dD = Di - d_mean; + // rm::Vector dM = Mi - m_mean; + + // save old means for covariance + + const rm::Vector d_mean_new = d_mean_old + (Di - d_mean) / N; + const rm::Vector m_mean_new = m_mean_old + (Mi - m_mean) / N; + + const float w1 = N_1/N; + const float w2 = 1.0/N; + + auto P1 = (Mi - m_mean_new).multT(Di - d_mean_new); + auto P2 = (m_mean_old - m_mean_new).multT(d_mean_old - d_mean_new); + + C = C * w1 + P1 * w2 + P2 * w1; + + // C += ((Mi - m_mean_new).multT(Di - d_mean_new) - C) * 1.0 / N + // + (m_mean_old - m_mean_new).multT(d_mean_old - d_mean_new) * N_1/N; + + d_mean = d_mean_new; // write + m_mean = m_mean_new; // write + } + } + + Ncorr[i] = n_corr; + dataset_center[i] = d_mean; + model_center[i] = m_mean; + Cs[i] = C; + } +} + +void means_covs_online_approx_batched( + const rm::MemoryView& dataset_points, // from + const rm::MemoryView& model_points, // to + const rm::MemoryView& mask, + rm::MemoryView& dataset_center, + rm::MemoryView& model_center, + rm::MemoryView& Cs, + rm::MemoryView& Ncorr) +{ + unsigned int Nbatches = Ncorr.size(); + unsigned int batchSize = dataset_points.size() / Nbatches; + #pragma omp parallel for default(shared) if(Nbatches > 4) for(size_t i=0; i 0) + { + // global read + const rm::Vector Di = dataset_points[globId + blockSize * i]; + const rm::Vector Mi = model_points[globId + blockSize * i]; + + // shared read + const rm::Vector D = sD[tid]; + const rm::Vector M = sM[tid]; + const rm::Matrix3x3 C = sC[tid]; + const unsigned int N_1 = sN[tid]; + const float N_1f = static_cast(N_1); + const unsigned int N = N_1 + 1; + const float Nf = static_cast(N); + + // compute + // rm::Vector dD = Di - D; + // rm::Vector dM = Mi - M; + + const rm::Vector Dnew = D + (Di - D) / Nf; + const rm::Vector Mnew = M + (Mi - M) / Nf; + + // dD = Di - Dnew; + // dM = Mi - Mnew; + + + const float w1 = N_1f / Nf; + const float w2 = 1.0 / Nf; + + auto P1 = (Mi - Mnew).multT(Di - Dnew); + auto P2 = (M - Mnew).multT(D - Dnew); + + // printf("w1: %f\n", w1); + // printf("w2: %f\n", w2); + // printf("P1: %f\n", P1(0,0)); + // printf("P2: %f\n", P2(0,0)); + // printf("P3: %f\n", P3(0,0)); + + // shared write + // auto Cnew = P1 + P2 + P3; + // sC[tid] = C * N_1f / Nf + (Mi - Mnew).multT(Di - Dnew) * 1.0 / Nf + // + (M - Mnew).multT(D - Dnew) * N_1f / Nf; + + auto Cnew = C * w1 + P1 * w2 + P2 * w1; + + sC[tid] = Cnew; + sD[tid] = Dnew; + sM[tid] = Mnew; + sN[tid] = N; + + printf("(%u,G) -> (%u,S) = %u, %f\n", globId + blockSize * i, tid, N, Cnew(0,0) ); + } + } + } + __syncthreads(); + + for(unsigned int s = blockSize / 2; s > 0; s >>= 1) + { + if(tid < s && tid < chunkSize / 2) + { + // if(tid < s) + // shared read + const rm::Vector D = sD[tid]; + const rm::Vector M = sM[tid]; + const rm::Matrix3x3 C = sC[tid]; + const unsigned int N = sN[tid]; + + const rm::Vector Dnext = sD[tid + s]; + const rm::Vector Mnext = sM[tid + s]; + const rm::Matrix3x3 Cnext = sC[tid + s]; + const unsigned int Nnext = sN[tid + s]; + + // compute + const unsigned int Ntot = N + Nnext; + + if(Ntot > 0) + { + const float w1 = static_cast(N) / static_cast(Ntot); + const float w2 = static_cast(Nnext) / static_cast(Ntot); + + + + // auto P1 = (Mi - Mnew).multT(Di - Dnew); + // auto P2 = (M - Mnew).multT(D - Dnew); + + const rm::Vector Dtot = D * w1 + Dnext * w2; + + + + // printf("(%u,S) %u, (%f, %f, %f) -> (%u,S) %u, (%f, %f, %f) = %u, (%f, %f, %f)\n", + // tid + s, Nnext, Dnext.x, Dnext.y, Dnext.z, + // tid, N, D.x, D.y, D.z, + // Ntot, Dtot.x, Dtot.y, Dtot.z); + + // shared write + sD[tid] = Dtot; + sM[tid] = M * w1 + Mnext * w2; + + + // TODO weighted covariance + auto Cnew = C * w1 + Cnext * w2; + + + sC[tid] = Cnew; // currently only a sum + sN[tid] = Ntot; + } + } + __syncthreads(); + } + + if(tid == 0) + { + const unsigned int N = sN[0]; + if(N > 0) + { + dataset_center[blockIdx.x] = sD[0]; + model_center[blockIdx.x] = sM[0]; + Cs[blockIdx.x] = sC[0]; + Ncorr[blockIdx.x] = N; + } else { + dataset_center[blockIdx.x].setZeros(); + model_center[blockIdx.x].setZeros(); + Cs[blockIdx.x].setZeros(); + Ncorr[blockIdx.x] = 0; + } + } +} + + +template +__global__ void means_covs_online_approx_batched_kernel( + const rm::Vector* dataset_points, // from, dataset + const rm::Vector* model_points, // to, model + const unsigned int* mask, + unsigned int chunkSize, + rm::Vector* dataset_center, + rm::Vector* model_center, + rm::Matrix3x3* Cs, + unsigned int* Ncorr) +{ + // Online update: Covariance and means + // - https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance + // + // Comment: + // I dont think the Wikipedia algorithm is completely correct (Covariances). + // But it is used in a lot of code -> Let everything be equally incorrect + + __shared__ rm::Vector sD[blockSize]; + __shared__ rm::Vector sM[blockSize]; + __shared__ rm::Matrix3x3 sC[blockSize]; + __shared__ unsigned int sN[blockSize]; + // __shared__ bool sMask[blockSize]; + + + const unsigned int tid = threadIdx.x; + const unsigned int globId = chunkSize * blockIdx.x + threadIdx.x; + const unsigned int rows = (chunkSize + blockSize - 1) / blockSize; + + // if(tid == 0) + // { + // printf("blockSize: %u, chunkSize %u\n", blockSize, chunkSize); + // } + + sD[tid] = {0.0f, 0.0f, 0.0f}; + sM[tid] = {0.0f, 0.0f, 0.0f}; sC[tid].setZeros(); sN[tid] = 0; @@ -365,13 +538,13 @@ __global__ void means_covs_online_batched_kernel( const rm::Vector dD = dataset_points[globId + blockSize * i] - D; const rm::Vector dM = model_points[globId + blockSize * i] - M; - // printf("Adding glob mem %u to shared mem %u\n", globId + blockSize * i, tid); - // shared write sD[tid] = D + dD / static_cast(N + 1); sM[tid] = M + dM / static_cast(N + 1); sC[tid] = C + dM.multT(dD); sN[tid] = N + 1; + + // printf("(%u,G) -> (%u,S) = %u\n", globId + blockSize * i, tid, N + 1); } } } @@ -395,23 +568,32 @@ __global__ void means_covs_online_batched_kernel( // compute const unsigned int Ntot = N + Nnext; - const float w1 = static_cast(N) / static_cast(Ntot); - const float w2 = static_cast(Nnext) / static_cast(Ntot); - // printf("Adding shared mem %u to shared mem %u\n", tid + s, tid); + if(Ntot > 0) + { + const float w1 = static_cast(N) / static_cast(Ntot); + const float w2 = static_cast(Nnext) / static_cast(Ntot); + - // shared write - sD[tid] = D * w1 + Dnext * w2; - sM[tid] = M * w1 + Mnext * w2; - sC[tid] = C + Cnext; // currently only a sum - sN[tid] = Ntot; + const rm::Vector Dtot = D * w1 + Dnext * w2; + // printf("(%u,S) %u, (%f, %f, %f) -> (%u,S) %u, (%f, %f, %f) = %u, (%f, %f, %f)\n", + // tid + s, Nnext, Dnext.x, Dnext.y, Dnext.z, + // tid, N, D.x, D.y, D.z, + // Ntot, Dtot.x, Dtot.y, Dtot.z); + + // shared write + sD[tid] = Dtot; + sM[tid] = M * w1 + Mnext * w2; + sC[tid] = C + Cnext; // currently only a sum + sN[tid] = Ntot; + } } __syncthreads(); } if(tid == 0) { - const unsigned int N = sN[blockIdx.x]; + const unsigned int N = sN[0]; if(N > 0) { dataset_center[blockIdx.x] = sD[0]; @@ -558,9 +740,10 @@ void means_covs_batched( rmagine::MemoryView& Cs, rmagine::MemoryView& Ncorr) { + rm::sumBatched(mask, Ncorr); + mean_batched(dataset_points, mask, Ncorr, dataset_center); mean_batched(model_points, mask, Ncorr, model_center); - rm::sumBatched(mask, Ncorr); cov_batched(dataset_points, dataset_center, model_points, model_center, @@ -576,13 +759,31 @@ void means_covs_online_batched( rmagine::MemoryView& Cs, rmagine::MemoryView& Ncorr) { - unsigned int Nchunks = Ncorr.size(); - unsigned int batchSize = dataset_points.size() / Nchunks; + unsigned int Nbatches = Ncorr.size(); + unsigned int batchSize = dataset_points.size() / Nbatches; + constexpr unsigned int blockSize = 64; // TODO: get best value for this one + + means_covs_online_batched_kernel <<>>( + dataset_points.raw(), model_points.raw(), mask.raw(), batchSize, + dataset_center.raw(), model_center.raw(), Cs.raw(), Ncorr.raw()); +} + + +void means_covs_online_approx_batched( + const rmagine::MemoryView& dataset_points, // from + const rmagine::MemoryView& model_points, // to + const rmagine::MemoryView& mask, + rmagine::MemoryView& dataset_center, + rmagine::MemoryView& model_center, + rmagine::MemoryView& Cs, + rmagine::MemoryView& Ncorr) +{ + unsigned int Nbatches = Ncorr.size(); + unsigned int batchSize = dataset_points.size() / Nbatches; constexpr unsigned int blockSize = 64; // TODO: get best value for this one - means_covs_online_batched_kernel <<>>( - dataset_points.raw(), model_points.raw(), mask.raw(), - batchSize, + means_covs_online_approx_batched_kernel <<>>( + dataset_points.raw(), model_points.raw(), mask.raw(), batchSize, dataset_center.raw(), model_center.raw(), Cs.raw(), Ncorr.raw()); }