From 32ecd5764bddf6f60585a5099bf853ccf20e3dcd Mon Sep 17 00:00:00 2001
From: Alexander Mock <amock@uos.de>
Date: Thu, 27 Oct 2022 17:30:53 +0200
Subject: [PATCH] improved online covariance computation. TODO: reduction
 kernel

---
 CMakeLists.txt                     |   8 +
 include/rmcl/math/math_batched.cuh |  21 +++
 include/rmcl/math/math_batched.h   |  21 +++
 src/nodes/tests/cov_online.cpp     | 159 +++++++++++++++++++
 src/rmcl/math/math_batched.cpp     |  94 ++++++++++--
 src/rmcl/math/math_batched.cu      | 239 ++++++++++++++++++++++++++---
 6 files changed, 512 insertions(+), 30 deletions(-)
 create mode 100644 src/nodes/tests/cov_online.cpp
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 6b220a9..0e1a6fa 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -493,6 +493,14 @@ endif(BUILD_EXAMPLES)
 
 ### TESTS ###
 
+add_executable(rmcl_test_cov_online src/nodes/tests/cov_online.cpp)
+
+## Specify libraries to link a library or executable target against
+target_link_libraries(rmcl_test_cov_online
+    rmcl
+    rmcl_cuda
+)
+
 
 # ###### CORRECTION GPU #######
 # if(${rmagine_optix_FOUND})
diff --git a/include/rmcl/math/math_batched.cuh b/include/rmcl/math/math_batched.cuh
index 418493a..15c4228 100644
--- a/include/rmcl/math/math_batched.cuh
+++ b/include/rmcl/math/math_batched.cuh
@@ -186,6 +186,27 @@ void means_covs_online_batched(
     rmagine::MemoryView<unsigned int, rmagine::VRAM_CUDA>& Ncorr);
 
 
+/**
+ * @brief one-pass means and covariance computation
+ * 
+ * @param dataset_points 
+ * @param model_points 
+ * @param mask 
+ * @param dataset_center 
+ * @param model_center 
+ * @param Cs 
+ * @param Ncorr 
+ */
+void means_covs_online_approx_batched(
+    const rmagine::MemoryView<rmagine::Vector, rmagine::VRAM_CUDA>& dataset_points, // from
+    const rmagine::MemoryView<rmagine::Vector, rmagine::VRAM_CUDA>& model_points, // to
+    const rmagine::MemoryView<unsigned int, rmagine::VRAM_CUDA>& mask,
+    rmagine::MemoryView<rmagine::Vector, rmagine::VRAM_CUDA>& dataset_center,
+    rmagine::MemoryView<rmagine::Vector, rmagine::VRAM_CUDA>& model_center,
+    rmagine::MemoryView<rmagine::Matrix3x3, rmagine::VRAM_CUDA>& Cs,
+    rmagine::MemoryView<unsigned int, rmagine::VRAM_CUDA>& Ncorr);
+
+
 } // namespace rmcl
 
 #endif // RMCL_MATH_BATCHED_CUH
\ No newline at end of file
diff --git a/include/rmcl/math/math_batched.h b/include/rmcl/math/math_batched.h
index 650352d..82d9cda 100644
--- a/include/rmcl/math/math_batched.h
+++ b/include/rmcl/math/math_batched.h
@@ -88,6 +88,27 @@ void means_covs_online_batched(
     rmagine::MemoryView<unsigned int, rmagine::RAM>& Ncorr);
 
 
+/**
+ * @brief one-pass means and covariance computation
+ * 
+ * @param dataset_points 
+ * @param model_points 
+ * @param mask 
+ * @param dataset_center 
+ * @param model_center 
+ * @param Cs 
+ * @param Ncorr 
+ */
+void means_covs_online_approx_batched(
+    const rmagine::MemoryView<rmagine::Vector, rmagine::RAM>& dataset_points, // from
+    const rmagine::MemoryView<rmagine::Vector, rmagine::RAM>& model_points, // to
+    const rmagine::MemoryView<unsigned int, rmagine::RAM>& mask,
+    rmagine::MemoryView<rmagine::Vector, rmagine::RAM>& dataset_center,
+    rmagine::MemoryView<rmagine::Vector, rmagine::RAM>& model_center,
+    rmagine::MemoryView<rmagine::Matrix3x3, rmagine::RAM>& Cs,
+    rmagine::MemoryView<unsigned int, rmagine::RAM>& Ncorr);
+
+
 } // namespace rmcl
 
 #endif // RMCL_MATH_BATCHED_CUH
\ No newline at end of file
diff --git a/src/nodes/tests/cov_online.cpp b/src/nodes/tests/cov_online.cpp
new file mode 100644
index 0000000..5687f4d
--- /dev/null
+++ b/src/nodes/tests/cov_online.cpp
@@ -0,0 +1,159 @@
+
+#include <rmcl/math/math_batched.h>
+#include <random>
+#include <iostream>
+#include <rmagine/util/prints.h>
+#include <rmagine/util/StopWatch.hpp>
+#include <rmcl/math/math_batched.cuh>
+
+namespace rm = rmagine;
+
+void fill_random(rm::MemoryView<rm::Vector, rm::RAM> points)
+{
+    std::random_device rd;  // Will be used to obtain a seed for the random number engine
+    std::mt19937 gen(rd()); // Standard mersenne_twister_engine seeded with rd()
+    std::uniform_real_distribution<> dis(0.0, 1.0);
+    for(unsigned int i = 0; i < points.size(); i++) 
+    {
+        points[i].x = dis(gen);
+        points[i].y = dis(gen);
+        points[i].z = dis(gen);
+    }
+}
+
+void fill_sequence(rm::MemoryView<rm::Vector, rm::RAM> points)
+{
+    for(unsigned int i = 0; i < points.size(); i++) 
+    {
+        points[i].x = (float)i;
+        points[i].y = (float)i;
+        points[i].z = (float)i;
+    }
+}
+
+void fill_ones(rm::MemoryView<unsigned int> mask)
+{
+    for(unsigned int i=0; i < mask.size(); i++)
+    {
+        mask[i] = i % 2;
+    }
+}
+
+int main(int argc, char** argv)
+{
+    rm::Mem<rm::Vector> dataset(128);
+    // fill_random(dataset);
+    fill_sequence(dataset);
+
+    rm::Mem<rm::Vector> model(dataset.size());
+
+    rm::Mem<unsigned int> mask(dataset.size());
+    fill_ones(mask);
+
+
+    rm::Transform T;
+    T.t = {1.0, 2.0, 3.0};
+    T.R.set(rm::EulerAngles{0.0, 0.1, 0.1});
+
+    for(unsigned int i=0; i<dataset.size(); i++)
+    {
+        model[i] = T * dataset[i];
+    }
+
+
+    rm::StopWatchHR sw;
+    double el;
+    rm::Mem<rm::Vector> ds(1);
+    rm::Mem<rm::Vector> ms(1);
+    rm::Mem<rm::Matrix3x3> Cs(1);
+    rm::Mem<unsigned int> Ncorr(1);
+
+    std::cout << "CPU" << std::endl;
+    {
+        std::cout << "means_covs_batched" << std::endl;
+        sw();
+        rmcl::means_covs_batched(dataset, model, mask, ds, ms, Cs, Ncorr);
+        el = sw();
+        std::cout << "- runtime: " << el * 1000.0 << " ms" << std::endl;
+        std::cout << ds[0] << std::endl;
+        std::cout << ms[0] << std::endl;
+        std::cout << Cs[0] << std::endl;
+        std::cout << Ncorr[0] << std::endl;
+
+        std::cout << "means_covs_online_approx_batched" << std::endl;
+        sw();
+        rmcl::means_covs_online_approx_batched(dataset, model, mask, ds, ms, Cs, Ncorr);
+        el = sw();
+        std::cout << "- runtime: " << el * 1000.0 << " ms" << std::endl;
+        std::cout << ds[0] << std::endl;
+        std::cout << ms[0] << std::endl;
+        std::cout << Cs[0] << std::endl;
+        std::cout << Ncorr[0] << std::endl;
+
+        std::cout << "means_covs_online_batched" << std::endl;
+        sw();
+        rmcl::means_covs_online_batched(dataset, model, mask, ds, ms, Cs, Ncorr);
+        el = sw();
+        std::cout << "- runtime: " << el * 1000.0 << " ms" << std::endl;
+        std::cout << ds[0] << std::endl;
+        std::cout << ms[0] << std::endl;
+        std::cout << Cs[0] << std::endl;
+        std::cout << Ncorr[0] << std::endl;
+    }
+
+    std::cout << std::endl;
+    std::cout << "GPU" << std::endl;
+    {
+        rm::Mem<rm::Vector, rm::VRAM_CUDA> dataset_ = dataset;
+        rm::Mem<rm::Vector, rm::VRAM_CUDA> model_ = model;
+        rm::Mem<unsigned int, rm::VRAM_CUDA> mask_ = mask;
+
+        rm::Mem<rm::Vector, rm::VRAM_CUDA> ds_(1);
+        rm::Mem<rm::Vector, rm::VRAM_CUDA> ms_(1);
+        rm::Mem<rm::Matrix3x3, rm::VRAM_CUDA> Cs_(1);
+        rm::Mem<unsigned int, rm::VRAM_CUDA> Ncorr_(1);
+
+
+        std::cout << "means_covs_batched" << std::endl;
+        sw();
+        rmcl::means_covs_batched(dataset_, model_, mask_, ds_, ms_, Cs_, Ncorr_);
+        el = sw();
+        std::cout << "- runtime: " << el * 1000.0 << " ms" << std::endl;
+        ds = ds_; ms = ms_; Cs = Cs_; Ncorr = Ncorr_;
+        std::cout << ds[0] << std::endl;
+        std::cout << ms[0] << std::endl;
+        std::cout << Cs[0] << std::endl;
+        std::cout << Ncorr[0] << std::endl;
+
+        std::cout << "means_covs_online_approx_batched" << std::endl;
+        sw();
+        rmcl::means_covs_online_approx_batched(dataset_, model_, mask_, ds_, ms_, Cs_, Ncorr_);
+        el = sw();
+        std::cout << "- runtime: " << el * 1000.0 << " ms" << std::endl;
+        ds = ds_; ms = ms_; Cs = Cs_; Ncorr = Ncorr_;
+        std::cout << ds[0] << std::endl;
+        std::cout << ms[0] << std::endl;
+        std::cout << Cs[0] << std::endl;
+        std::cout << Ncorr[0] << std::endl;
+
+        std::cout << "means_covs_online_batched" << std::endl;
+        sw();
+        rmcl::means_covs_online_batched(dataset_, model_, mask_, ds_, ms_, Cs_, Ncorr_);
+        el = sw();
+        std::cout << "- runtime: " << el * 1000.0 << " ms" << std::endl;
+        ds = ds_; ms = ms_; Cs = Cs_; Ncorr = Ncorr_;
+        std::cout << ds[0] << std::endl;
+        std::cout << ms[0] << std::endl;
+        std::cout << Cs[0] << std::endl;
+        std::cout << Ncorr[0] << std::endl;
+    }
+
+
+    
+
+
+
+
+
+    return 0;
+}
\ No newline at end of file
diff --git a/src/rmcl/math/math_batched.cpp b/src/rmcl/math/math_batched.cpp
index 07641a6..c541168 100644
--- a/src/rmcl/math/math_batched.cpp
+++ b/src/rmcl/math/math_batched.cpp
@@ -3,7 +3,6 @@
 #include <rmagine/math/math_batched.h>
 
 namespace rm = rmagine;
-
 namespace rmcl
 {
 
@@ -34,14 +33,10 @@ void means_covs_batched(
         {
             if(mask_batch[j] > 0)
             {
-                const rm::Vector dD = data_batch[j] - d_mean;
-                const rm::Vector dM = model_batch[j] - m_mean;
-                float N = static_cast<float>(n_corr + 1);
-
-                // reduction
+                // sum
                 n_corr++;
-                d_mean += dD / N;
-                m_mean += dM / N;
+                d_mean += data_batch[j];
+                m_mean += model_batch[j];
             }
         }
 
@@ -49,14 +44,20 @@ void means_covs_batched(
 
         if(n_corr > 0)
         {
+            const float Ncorr_f = static_cast<float>(n_corr);
+            d_mean /= Ncorr_f;
+            m_mean /= Ncorr_f;
+
             rm::Matrix3x3 C = rm::Matrix3x3::Zeros();
 
             for(size_t j = 0; j < batchSize; j++)
             {
-                C += (model_batch[j] - m_mean).multT(data_batch[j] - d_mean);
+                if(mask_batch[j] > 0)
+                {
+                    C += (model_batch[j] - m_mean).multT(data_batch[j] - d_mean);
+                }
             }
-
-            const float Ncorr_f = static_cast<float>(n_corr);
+            
             C /= Ncorr_f;
 
             dataset_center[i] = d_mean;
@@ -82,6 +83,77 @@ void means_covs_online_batched(
     unsigned int Nbatches = Ncorr.size();
     unsigned int batchSize = dataset_points.size() / Nbatches;
 
+    #pragma omp parallel for default(shared) if(Nbatches > 4)
+    for(size_t i=0; i<Nbatches; i++)
+    {
+        const rm::MemoryView<rm::Vector> data_batch = dataset_points(i * batchSize, (i+1) * batchSize);
+        const rm::MemoryView<rm::Vector> model_batch = model_points(i * batchSize, (i+1) * batchSize);
+        const rm::MemoryView<unsigned int> mask_batch = mask(i * batchSize, (i+1) * batchSize);
+
+        rm::Vector d_mean = {0.0f, 0.0f, 0.0f};
+        rm::Vector m_mean = {0.0f, 0.0f, 0.0f};
+        rm::Matrix3x3 C = rm::Matrix3x3::Zeros();
+        unsigned int n_corr = 0;
+
+        for(size_t j=0; j<batchSize; j++)
+        {
+            if(mask_batch[j] > 0)
+            {
+                const float N_1 = static_cast<float>(n_corr);
+                const float N = static_cast<float>(n_corr + 1);
+
+                // update ncorr
+                n_corr++;
+
+                const rm::Vector Di = data_batch[j]; // read
+                const rm::Vector Mi = model_batch[j]; // read
+                const rm::Vector d_mean_old = d_mean; // read
+                const rm::Vector m_mean_old = m_mean; // read
+
+                // update means
+                // rm::Vector dD = Di - d_mean;
+                // rm::Vector dM = Mi - m_mean;
+                
+                // save old means for covariance
+                
+                const rm::Vector d_mean_new = d_mean_old + (Di - d_mean) / N; 
+                const rm::Vector m_mean_new = m_mean_old + (Mi - m_mean) / N; 
+
+                const float w1 = N_1/N;
+                const float w2 = 1.0/N;
+
+                auto P1 = (Mi - m_mean_new).multT(Di - d_mean_new);
+                auto P2 = (m_mean_old - m_mean_new).multT(d_mean_old - d_mean_new);
+
+                C = C * w1 + P1 * w2 + P2 * w1;
+
+                // C += ((Mi - m_mean_new).multT(Di - d_mean_new) - C) * 1.0 / N 
+                //     + (m_mean_old - m_mean_new).multT(d_mean_old - d_mean_new) * N_1/N;
+
+                d_mean = d_mean_new; // write
+                m_mean = m_mean_new; // write
+            }
+        }
+
+        Ncorr[i] = n_corr;
+        dataset_center[i] = d_mean;
+        model_center[i] = m_mean;
+        Cs[i] = C;
+    }
+}
+
+void means_covs_online_approx_batched(
+    const rm::MemoryView<rm::Vector, rm::RAM>& dataset_points, // from
+    const rm::MemoryView<rm::Vector, rm::RAM>& model_points, // to
+    const rm::MemoryView<unsigned int, rm::RAM>& mask,
+    rm::MemoryView<rm::Vector, rm::RAM>& dataset_center,
+    rm::MemoryView<rm::Vector, rm::RAM>& model_center,
+    rm::MemoryView<rm::Matrix3x3, rm::RAM>& Cs,
+    rm::MemoryView<unsigned int, rm::RAM>& Ncorr)
+{
+    unsigned int Nbatches = Ncorr.size();
+    unsigned int batchSize = dataset_points.size() / Nbatches;
+
     #pragma omp parallel for default(shared) if(Nbatches > 4)
     for(size_t i=0; i<Nbatches; i++)
     {
diff --git a/src/rmcl/math/math_batched.cu b/src/rmcl/math/math_batched.cu
index 3f0cbba..3fabb89 100644
--- a/src/rmcl/math/math_batched.cu
+++ b/src/rmcl/math/math_batched.cu
@@ -333,6 +333,7 @@ __global__ void means_covs_online_batched_kernel(
     __shared__ rm::Vector sM[blockSize];
     __shared__ rm::Matrix3x3 sC[blockSize];
     __shared__ unsigned int sN[blockSize];
+    // __shared__ bool sMask[blockSize];
     
 
     const unsigned int tid = threadIdx.x;
@@ -344,8 +345,180 @@ __global__ void means_covs_online_batched_kernel(
     //     printf("blockSize: %u, chunkSize %u\n", blockSize, chunkSize);
     // }
 
-    sD[tid].setZeros();
-    sM[tid].setZeros();
+    sD[tid] = {0.0f, 0.0f, 0.0f};
+    sM[tid] = {0.0f, 0.0f, 0.0f};
+    sC[tid].setZeros();
+    sN[tid] = 0;
+
+    for(unsigned int i=0; i<rows; i++)
+    {
+        if(tid + blockSize * i < chunkSize)
+        {
+            if(mask[globId + blockSize * i] > 0)
+            {
+                // global read
+                const rm::Vector Di = dataset_points[globId + blockSize * i];
+                const rm::Vector Mi = model_points[globId + blockSize * i];
+
+                // shared read 
+                const rm::Vector D      = sD[tid];
+                const rm::Vector M      = sM[tid];
+                const rm::Matrix3x3 C   = sC[tid];
+                const unsigned int N_1    = sN[tid];
+                const float N_1f = static_cast<float>(N_1);
+                const unsigned int N = N_1 + 1;
+                const float Nf = static_cast<float>(N);
+
+                // compute
+                // rm::Vector dD = Di - D;
+                // rm::Vector dM = Mi - M;
+
+                const rm::Vector Dnew = D + (Di - D) / Nf;
+                const rm::Vector Mnew = M + (Mi - M) / Nf;
+
+                // dD = Di - Dnew;
+                // dM = Mi - Mnew;
+
+
+                const float w1 = N_1f / Nf;
+                const float w2 = 1.0 / Nf;
+
+                auto P1 = (Mi - Mnew).multT(Di - Dnew);
+                auto P2 = (M - Mnew).multT(D - Dnew);
+
+                // printf("w1: %f\n", w1);
+                // printf("w2: %f\n", w2);
+                // printf("P1: %f\n", P1(0,0));
+                // printf("P2: %f\n", P2(0,0));
+                // printf("P3: %f\n", P3(0,0));
+
+                // shared write
+                // auto Cnew = P1 + P2 + P3;
+                // sC[tid] = C * N_1f / Nf + (Mi - Mnew).multT(Di - Dnew) * 1.0 / Nf
+                //     + (M - Mnew).multT(D - Dnew) * N_1f / Nf;
+                
+                auto Cnew = C * w1 + P1 * w2 + P2 * w1;
+
+                sC[tid] = Cnew;
+                sD[tid] = Dnew;
+                sM[tid] = Mnew;
+                sN[tid] = N;
+
+                printf("(%u,G) -> (%u,S) = %u, %f\n", globId + blockSize * i, tid, N, Cnew(0,0) );
+            }
+        }
+    }
+    __syncthreads();
+    
+    for(unsigned int s = blockSize / 2; s > 0; s >>= 1)
+    {
+        if(tid < s && tid < chunkSize / 2)
+        {
+            // if(tid < s)
+            // shared read 
+            const rm::Vector D      = sD[tid];
+            const rm::Vector M      = sM[tid];
+            const rm::Matrix3x3 C   = sC[tid];
+            const unsigned int N    = sN[tid];
+
+            const rm::Vector Dnext      = sD[tid + s];
+            const rm::Vector Mnext      = sM[tid + s];
+            const rm::Matrix3x3 Cnext   = sC[tid + s];
+            const unsigned int Nnext    = sN[tid + s];
+
+            // compute
+            const unsigned int Ntot = N + Nnext;
+
+            if(Ntot > 0)
+            {
+                const float w1 = static_cast<float>(N) / static_cast<float>(Ntot);
+                const float w2 = static_cast<float>(Nnext) / static_cast<float>(Ntot);
+
+
+
+                // auto P1 = (Mi - Mnew).multT(Di - Dnew);
+                // auto P2 = (M - Mnew).multT(D - Dnew);
+
+                const rm::Vector Dtot = D * w1 + Dnext * w2;
+
+
+
+                // printf("(%u,S) %u, (%f, %f, %f) -> (%u,S) %u, (%f, %f, %f) = %u, (%f, %f, %f)\n", 
+                //     tid + s, Nnext, Dnext.x, Dnext.y, Dnext.z,
+                //     tid, N, D.x, D.y, D.z,
+                //     Ntot, Dtot.x, Dtot.y, Dtot.z);
+                
+                // shared write
+                sD[tid] = Dtot;
+                sM[tid] = M * w1 + Mnext * w2;
+                
+
+                // TODO weighted covariance
+                auto Cnew = C * w1 + Cnext * w2;
+
+
+                sC[tid] = Cnew; // currently only a sum
+                sN[tid] = Ntot;
+            }
+        }
+        __syncthreads();   
+    }
+
+    if(tid == 0)
+    {
+        const unsigned int N = sN[0];
+        if(N > 0)
+        {
+            dataset_center[blockIdx.x] = sD[0];
+            model_center[blockIdx.x] = sM[0];
+            Cs[blockIdx.x] = sC[0];
+            Ncorr[blockIdx.x] = N;
+        } else {
+            dataset_center[blockIdx.x].setZeros();
+            model_center[blockIdx.x].setZeros();
+            Cs[blockIdx.x].setZeros();
+            Ncorr[blockIdx.x] = 0;
+        }
+    }
+}
+
+
+template<unsigned int blockSize>
+__global__ void means_covs_online_approx_batched_kernel(
+    const rm::Vector* dataset_points, // from, dataset
+    const rm::Vector* model_points, // to, model
+    const unsigned int* mask,
+    unsigned int chunkSize,
+    rm::Vector* dataset_center,
+    rm::Vector* model_center,
+    rm::Matrix3x3* Cs,
+    unsigned int* Ncorr)
+{
+    // Online update: Covariance and means 
+    // - https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance
+    // 
+    // Comment:
+    // I dont think the Wikipedia algorithm is completely correct (Covariances).
+    // But it is used in a lot of code -> Let everything be equally incorrect
+
+    __shared__ rm::Vector sD[blockSize];
+    __shared__ rm::Vector sM[blockSize];
+    __shared__ rm::Matrix3x3 sC[blockSize];
+    __shared__ unsigned int sN[blockSize];
+    // __shared__ bool sMask[blockSize];
+    
+
+    const unsigned int tid = threadIdx.x;
+    const unsigned int globId = chunkSize * blockIdx.x + threadIdx.x;
+    const unsigned int rows = (chunkSize + blockSize - 1) / blockSize;
+
+    // if(tid == 0)
+    // {
+    //     printf("blockSize: %u, chunkSize %u\n", blockSize, chunkSize);
+    // }
+
+    sD[tid] = {0.0f, 0.0f, 0.0f};
+    sM[tid] = {0.0f, 0.0f, 0.0f};
     sC[tid].setZeros();
     sN[tid] = 0;
 
@@ -365,13 +538,13 @@ __global__ void means_covs_online_batched_kernel(
                 const rm::Vector dD = dataset_points[globId + blockSize * i] - D;
                 const rm::Vector dM = model_points[globId + blockSize * i] - M;
 
-                // printf("Adding glob mem %u to shared mem %u\n", globId + blockSize * i, tid);
-
                 // shared write
                 sD[tid] = D + dD / static_cast<float>(N + 1);
                 sM[tid] = M + dM / static_cast<float>(N + 1);
                 sC[tid] = C + dM.multT(dD);
                 sN[tid] = N + 1;
+
+                // printf("(%u,G) -> (%u,S) = %u\n", globId + blockSize * i, tid, N + 1);
             }
         }
     }
@@ -395,23 +568,32 @@ __global__ void means_covs_online_batched_kernel(
 
             // compute
             const unsigned int Ntot = N + Nnext;
-            const float w1 = static_cast<float>(N) / static_cast<float>(Ntot);
-            const float w2 = static_cast<float>(Nnext) / static_cast<float>(Ntot);
 
-            // printf("Adding shared mem %u to shared mem %u\n", tid + s, tid);
+            if(Ntot > 0)
+            {
+                const float w1 = static_cast<float>(N) / static_cast<float>(Ntot);
+                const float w2 = static_cast<float>(Nnext) / static_cast<float>(Ntot);
+
 
-            // shared write
-            sD[tid] = D * w1 + Dnext * w2;
-            sM[tid] = M * w1 + Mnext * w2;
-            sC[tid] = C + Cnext; // currently only a sum
-            sN[tid] = Ntot;
+                const rm::Vector Dtot = D * w1 + Dnext * w2;
+                // printf("(%u,S) %u, (%f, %f, %f) -> (%u,S) %u, (%f, %f, %f) = %u, (%f, %f, %f)\n", 
+                //     tid + s, Nnext, Dnext.x, Dnext.y, Dnext.z,
+                //     tid, N, D.x, D.y, D.z,
+                //     Ntot, Dtot.x, Dtot.y, Dtot.z);
+                
+                // shared write
+                sD[tid] = Dtot;
+                sM[tid] = M * w1 + Mnext * w2;
+                sC[tid] = C + Cnext; // currently only a sum
+                sN[tid] = Ntot;
+            }
         }
         __syncthreads();   
     }
 
     if(tid == 0)
     {
-        const unsigned int N = sN[blockIdx.x];
+        const unsigned int N = sN[0];
         if(N > 0)
         {
             dataset_center[blockIdx.x] = sD[0];
@@ -558,9 +740,10 @@ void means_covs_batched(
     rmagine::MemoryView<rmagine::Matrix3x3, rmagine::VRAM_CUDA>& Cs,
     rmagine::MemoryView<unsigned int, rmagine::VRAM_CUDA>& Ncorr)
 {
+    rm::sumBatched(mask, Ncorr);
+
     mean_batched(dataset_points, mask, Ncorr, dataset_center);
     mean_batched(model_points, mask, Ncorr, model_center);
-    rm::sumBatched(mask, Ncorr);
 
     cov_batched(dataset_points, dataset_center,
             model_points, model_center,
@@ -576,13 +759,31 @@ void means_covs_online_batched(
     rmagine::MemoryView<rmagine::Matrix3x3, rmagine::VRAM_CUDA>& Cs,
     rmagine::MemoryView<unsigned int, rmagine::VRAM_CUDA>& Ncorr)
 {
-    unsigned int Nchunks = Ncorr.size();
-    unsigned int batchSize = dataset_points.size() / Nchunks;
+    unsigned int Nbatches = Ncorr.size();
+    unsigned int batchSize = dataset_points.size() / Nbatches;
+    constexpr unsigned int blockSize = 64; // TODO: get best value for this one
+
+    means_covs_online_batched_kernel<blockSize> <<<Nbatches, blockSize>>>(
+        dataset_points.raw(), model_points.raw(), mask.raw(), batchSize, 
+        dataset_center.raw(), model_center.raw(), Cs.raw(), Ncorr.raw());
+}
+
+
+void means_covs_online_approx_batched(
+    const rmagine::MemoryView<rmagine::Vector, rmagine::VRAM_CUDA>& dataset_points, // from
+    const rmagine::MemoryView<rmagine::Vector, rmagine::VRAM_CUDA>& model_points, // to
+    const rmagine::MemoryView<unsigned int, rmagine::VRAM_CUDA>& mask,
+    rmagine::MemoryView<rmagine::Vector, rmagine::VRAM_CUDA>& dataset_center,
+    rmagine::MemoryView<rmagine::Vector, rmagine::VRAM_CUDA>& model_center,
+    rmagine::MemoryView<rmagine::Matrix3x3, rmagine::VRAM_CUDA>& Cs,
+    rmagine::MemoryView<unsigned int, rmagine::VRAM_CUDA>& Ncorr)
+{
+    unsigned int Nbatches = Ncorr.size();
+    unsigned int batchSize = dataset_points.size() / Nbatches;
     constexpr unsigned int blockSize = 64; // TODO: get best value for this one
 
-    means_covs_online_batched_kernel<blockSize> <<<Nchunks, blockSize>>>(
-        dataset_points.raw(), model_points.raw(), mask.raw(), 
-        batchSize, 
+    means_covs_online_approx_batched_kernel<blockSize> <<<Nbatches, blockSize>>>(
+        dataset_points.raw(), model_points.raw(), mask.raw(), batchSize, 
         dataset_center.raw(), model_center.raw(), Cs.raw(), Ncorr.raw());
 }