diff --git a/include/rmcl/correction/CorrectionResults.hpp b/include/rmcl/correction/CorrectionResults.hpp
index 83bcbff..840f90c 100644
--- a/include/rmcl/correction/CorrectionResults.hpp
+++ b/include/rmcl/correction/CorrectionResults.hpp
@@ -40,6 +40,66 @@
 
 namespace rmcl {
 
+template<typename MemT>
+struct CorrespondencesView
+{
+    rmagine::MemoryView<rmagine::Vector, MemT>  dataset_points;
+    rmagine::MemoryView<rmagine::Vector, MemT>  model_points;
+    rmagine::MemoryView<unsigned int, MemT>     corr_valid;
+};
+
+
+template<typename MemT>
+struct Correspondences
+{
+    rmagine::Memory<rmagine::Vector, MemT>  dataset_points;
+    rmagine::Memory<rmagine::Vector, MemT>  model_points;
+    rmagine::Memory<unsigned int, MemT>     corr_valid;
+
+
+
+    
+    inline CorrespondencesView<MemT> slice(
+        unsigned int start, unsigned int end) const
+    {
+        return {
+            dataset_points.slice(start, end),
+            model_points.slice(start, end),
+            corr_valid.slice(start, end)
+        };
+    }
+
+    inline CorrespondencesView<MemT> view() const
+    {
+        return {
+            dataset_points,
+            model_points,
+            corr_valid
+        };
+    }
+
+    // Enable this function?
+    // inline std::vector<CorrespondencesView<MemT> > split(
+    //     unsigned int N) const
+    // {
+    //     std::vector<CorrespondencesView<MemT> > ret;
+
+    //     unsigned int batch_size = dataset_points.size() / N;
+
+    //     for(unsigned int i=0; i<N; i++)
+    //     {
+    //         ret.push_back({
+    //             dataset_points(i * batch_size, (i+1) * batch_size),
+    //             model_points(i * batch_size, (i+1) * batch_size),
+    //             corr_valid(i * batch_size, (i+1) * batch_size)
+    //         });
+    //     }
+
+    //     return ret;
+    // }
+};
+
+
 template<typename MemT>
 struct CorrectionPreResults 
 {
diff --git a/include/rmcl/correction/PinholeCorrectorEmbree.hpp b/include/rmcl/correction/PinholeCorrectorEmbree.hpp
index f855841..8edbacb 100644
--- a/include/rmcl/correction/PinholeCorrectorEmbree.hpp
+++ b/include/rmcl/correction/PinholeCorrectorEmbree.hpp
@@ -111,15 +111,6 @@ class PinholeCorrectorEmbree
         rmagine::MemoryView<unsigned int, rmagine::RAM>& Ncorr
     );
 
-    // Next: reuse the already computed correspondences
-    // void computeCovs(
-    //     const rmagine::MemoryView<rmagine::Point> dataset_points,
-    //     const rmagine::MemoryView<rmagine::Point> model_points,
-    //     const rmagine::MemoryView<unsigned int> corr_valid,
-    //     rmagine::MemoryView<rmagine::Matrix3x3, rmagine::RAM>& Cs,
-    //     rmagine::MemoryView<unsigned int, rmagine::RAM>& Ncorr
-    // );
-
     void computeCovs(
         const rmagine::MemoryView<rmagine::Transform, rmagine::RAM>& Tbms,
         CorrectionPreResults<rmagine::RAM>& res
@@ -172,24 +163,6 @@ class PinholeCorrectorEmbree
 
     // TODO: currently unused
     rmagine::SVDPtr m_svd;
-
-private:
-     // different implementations for testing
-    void computeCovsSequential(
-        const rmagine::MemoryView<rmagine::Transform, rmagine::RAM>& Tbms,
-        rmagine::MemoryView<rmagine::Vector, rmagine::RAM>& data_means,
-        rmagine::MemoryView<rmagine::Vector, rmagine::RAM>& model_means,
-        rmagine::MemoryView<rmagine::Matrix3x3, rmagine::RAM>& Cs,
-        rmagine::MemoryView<unsigned int, rmagine::RAM>& Ncorr
-    );
-
-    void computeCovsOuterInnerParallel(
-        const rmagine::MemoryView<rmagine::Transform, rmagine::RAM>& Tbms,
-        rmagine::MemoryView<rmagine::Vector, rmagine::RAM>& data_means,
-        rmagine::MemoryView<rmagine::Vector, rmagine::RAM>& model_means,
-        rmagine::MemoryView<rmagine::Matrix3x3, rmagine::RAM>& Cs,
-        rmagine::MemoryView<unsigned int, rmagine::RAM>& Ncorr
-    );
 };
 
 using PinholeCorrectorEmbreePtr = std::shared_ptr<PinholeCorrectorEmbree>;
diff --git a/include/rmcl/correction/SphereCorrectorEmbree.hpp b/include/rmcl/correction/SphereCorrectorEmbree.hpp
index 6b32435..e244643 100644
--- a/include/rmcl/correction/SphereCorrectorEmbree.hpp
+++ b/include/rmcl/correction/SphereCorrectorEmbree.hpp
@@ -140,6 +140,16 @@ class SphereCorrectorEmbree
         rmagine::Memory<unsigned int>& corr_valid
     );
 
+    // TODO: add them to other classes
+    void findSPC(
+        const rmagine::MemoryView<rmagine::Transform, rmagine::RAM>& Tbms,
+        Correspondences<rmagine::RAM>& corr
+    );
+
+    Correspondences<rmagine::RAM> findSPC(
+        const rmagine::MemoryView<rmagine::Transform, rmagine::RAM>& Tbms);
+
+
     // TODO: add properly - rmagine
     inline CorrectionParams params() const
     {
diff --git a/src/nodes/tests/cov_online.cpp b/src/nodes/tests/cov_online.cpp
index 5687f4d..51a4593 100644
--- a/src/nodes/tests/cov_online.cpp
+++ b/src/nodes/tests/cov_online.cpp
@@ -41,9 +41,9 @@ void fill_ones(rm::MemoryView<unsigned int> mask)
 
 int main(int argc, char** argv)
 {
-    rm::Mem<rm::Vector> dataset(128);
-    // fill_random(dataset);
-    fill_sequence(dataset);
+    rm::Mem<rm::Vector> dataset(10000);
+    fill_random(dataset);
+    // fill_sequence(dataset);
 
     rm::Mem<rm::Vector> model(dataset.size());
 
@@ -60,7 +60,6 @@ int main(int argc, char** argv)
         model[i] = T * dataset[i];
     }
 
-
     rm::StopWatchHR sw;
     double el;
     rm::Mem<rm::Vector> ds(1);
@@ -113,7 +112,6 @@ int main(int argc, char** argv)
         rm::Mem<rm::Matrix3x3, rm::VRAM_CUDA> Cs_(1);
         rm::Mem<unsigned int, rm::VRAM_CUDA> Ncorr_(1);
 
-
         std::cout << "means_covs_batched" << std::endl;
         sw();
         rmcl::means_covs_batched(dataset_, model_, mask_, ds_, ms_, Cs_, Ncorr_);
@@ -148,12 +146,5 @@ int main(int argc, char** argv)
         std::cout << Ncorr[0] << std::endl;
     }
 
-
-    
-
-
-
-
-
     return 0;
 }
\ No newline at end of file
diff --git a/src/rmcl/correction/O1DnCorrectorEmbree.cpp b/src/rmcl/correction/O1DnCorrectorEmbree.cpp
index d391b63..b7e60f3 100644
--- a/src/rmcl/correction/O1DnCorrectorEmbree.cpp
+++ b/src/rmcl/correction/O1DnCorrectorEmbree.cpp
@@ -128,18 +128,28 @@ CorrectionResults<rm::RAM> O1DnCorrectorEmbree::correct(
                         const Vector pmesh_b = Tsb * pmesh_s;
 
                         // Online update: Covariance and means 
-                        // - https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance
+                        // - wrong: https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance
+                        // use the following equations instead
                         {
-                            Ncorr++;
-                            const float N = static_cast<float>(Ncorr);
+                            const float N_1 = static_cast<float>(Ncorr);
+                            const float N = static_cast<float>(Ncorr + 1);
+                            const float w1 = N_1/N;
+                            const float w2 = 1.0/N;
 
-                            const Vector dD = preal_b - Dmean;
-                            const Vector dM = pmesh_b - Mmean;
+                            const rm::Vector d_mean_old = Dmean;
+                            const rm::Vector m_mean_old = Mmean;
 
-                            // reduction
-                            Dmean += dD / N;
-                            Mmean += dM / N;
-                            C += dM.multT(dD);
+                            const rm::Vector d_mean_new = d_mean_old * w1 + preal_b * w2; 
+                            const rm::Vector m_mean_new = m_mean_old * w1 + pmesh_b * w2;
+
+                            auto P1 = (pmesh_b - m_mean_new).multT(preal_b - d_mean_new);
+                            auto P2 = (m_mean_old - m_mean_new).multT(d_mean_old - d_mean_new);
+
+                            // write
+                            Dmean = d_mean_new;
+                            Mmean = m_mean_new;
+                            C = C * w1 + P1 * w2 + P2 * w1;
+                            Ncorr = Ncorr + 1;
                         }
                     }
                 }
@@ -150,9 +160,6 @@ CorrectionResults<rm::RAM> O1DnCorrectorEmbree::correct(
 
         if(Ncorr > 0)
         {
-            const float Ncorr_f = static_cast<float>(Ncorr);
-            C /= Ncorr_f;
-
             Matrix3x3 U, V, S;
 
             { // the only Eigen code left
@@ -285,18 +292,28 @@ void O1DnCorrectorEmbree::computeCovs(
                         const Vector pmesh_b = Tsb * pmesh_s;
 
                         // Online update: Covariance and means 
-                        // - https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance
-
+                        // - wrong: https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance
+                        // use the following equations instead
                         {
-                            const Vector dD = preal_b - Dmean;
-                            const Vector dM = pmesh_b - Mmean;
-                            float N = static_cast<float>(Ncorr_ + 1);
-
-                            // reduction
-                            Ncorr_++;
-                            Mmean += dM / N;
-                            Dmean += dD / N;
-                            C += dM.multT(dD);
+                            const float N_1 = static_cast<float>(Ncorr_);
+                            const float N = static_cast<float>(Ncorr_ + 1);
+                            const float w1 = N_1/N;
+                            const float w2 = 1.0/N;
+
+                            const rm::Vector d_mean_old = Dmean;
+                            const rm::Vector m_mean_old = Mmean;
+
+                            const rm::Vector d_mean_new = d_mean_old * w1 + preal_b * w2; 
+                            const rm::Vector m_mean_new = m_mean_old * w1 + pmesh_b * w2;
+
+                            auto P1 = (pmesh_b - m_mean_new).multT(preal_b - d_mean_new);
+                            auto P2 = (m_mean_old - m_mean_new).multT(d_mean_old - d_mean_new);
+
+                            // write
+                            Dmean = d_mean_new;
+                            Mmean = m_mean_new;
+                            C = C * w1 + P1 * w2 + P2 * w1;
+                            Ncorr_ = Ncorr_ + 1;
                         }
                     }
                 }
@@ -304,20 +321,9 @@ void O1DnCorrectorEmbree::computeCovs(
         }
 
         Ncorr[pid] = Ncorr_;
-
-        if(Ncorr_ > 0)
-        {
-            const float Ncorr_f = static_cast<float>(Ncorr_);
-            C /= Ncorr_f;
-
-            ds[pid] = Dmean;
-            ms[pid] = Mmean;
-            Cs[pid] = C;
-        } else {
-            ds[pid] = {0.0, 0.0, 0.0};
-            ms[pid] = {0.0, 0.0, 0.0};
-            Cs[pid].setZeros();
-        }
+        ds[pid] = Dmean;
+        ms[pid] = Mmean;
+        Cs[pid] = C;
     }
 }
 
@@ -376,6 +382,8 @@ void O1DnCorrectorEmbree::findSPC(
                 if(range_real < m_model->range.min 
                     || range_real > m_model->range.max)
                 {
+                    dataset_points[glob_id] = {0.0f, 0.0f, 0.0f};
+                    model_points[glob_id] = {0.0f, 0.0f, 0.0f};
                     corr_valid[glob_id] = 0;
                     continue;
                 }
@@ -434,19 +442,22 @@ void O1DnCorrectorEmbree::findSPC(
 
                     const float distance = (pmesh_s - preal_s).l2norm();
 
-                    if(distance < max_distance)
-                    {
-                        // convert back to base (sensor shared coordinate system)
-                        const Vector preal_b = Tsb * preal_s;
-                        const Vector pmesh_b = Tsb * pmesh_s;
+                    // convert back to base (sensor shared coordinate system)
+                    const Vector preal_b = Tsb * preal_s;
+                    const Vector pmesh_b = Tsb * pmesh_s;
 
-                        dataset_points[glob_id] = preal_b;
-                        model_points[glob_id] = pmesh_b;
+                    dataset_points[glob_id] = preal_b;
+                    model_points[glob_id] = pmesh_b;
+
+                    if(distance < max_distance)
+                    {   
                         corr_valid[glob_id] = 1;
                     } else {
                         corr_valid[glob_id] = 0;
                     }
                 } else {
+                    dataset_points[glob_id] = {0.0f, 0.0f, 0.0f};
+                    model_points[glob_id] = {0.0f, 0.0f, 0.0f};
                     corr_valid[glob_id] = 0;
                 }
             }
diff --git a/src/rmcl/correction/OnDnCorrectorEmbree.cpp b/src/rmcl/correction/OnDnCorrectorEmbree.cpp
index c8b5829..5135c7c 100644
--- a/src/rmcl/correction/OnDnCorrectorEmbree.cpp
+++ b/src/rmcl/correction/OnDnCorrectorEmbree.cpp
@@ -129,18 +129,28 @@ CorrectionResults<rm::RAM> OnDnCorrectorEmbree::correct(
                         const Vector pmesh_b = Tsb * pmesh_s;
 
                         // Online update: Covariance and means 
-                        // - https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance
+                        // - wrong: https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance
+                        // use the following equations instead
                         {
-                            Ncorr++;
-                            const float N = static_cast<float>(Ncorr);
+                            const float N_1 = static_cast<float>(Ncorr);
+                            const float N = static_cast<float>(Ncorr + 1);
+                            const float w1 = N_1 / N;
+                            const float w2 = 1.0 / N;
 
-                            const Vector dD = preal_b - Dmean;
-                            const Vector dM = pmesh_b - Mmean;
+                            const rm::Vector d_mean_old = Dmean;
+                            const rm::Vector m_mean_old = Mmean;
 
-                            // reduction
-                            Dmean += dD / N;
-                            Mmean += dM / N;
-                            C += dM.multT(dD);
+                            const rm::Vector d_mean_new = d_mean_old * w1 + preal_b * w2; 
+                            const rm::Vector m_mean_new = m_mean_old * w1 + pmesh_b * w2;
+
+                            auto P1 = (pmesh_b - m_mean_new).multT(preal_b - d_mean_new);
+                            auto P2 = (m_mean_old - m_mean_new).multT(d_mean_old - d_mean_new);
+
+                            // write
+                            Dmean = d_mean_new;
+                            Mmean = m_mean_new;
+                            C = C * w1 + P1 * w2 + P2 * w1;
+                            Ncorr = Ncorr + 1;
                         }
                     }
                 }
@@ -151,9 +161,6 @@ CorrectionResults<rm::RAM> OnDnCorrectorEmbree::correct(
 
         if(Ncorr > 0)
         {
-            const float Ncorr_f = static_cast<float>(Ncorr);
-            C /= Ncorr_f;
-
             Matrix3x3 U, V, S;
 
             { // the only Eigen code left
@@ -286,18 +293,28 @@ void OnDnCorrectorEmbree::computeCovs(
                         const Vector pmesh_b = Tsb * pmesh_s;
 
                         // Online update: Covariance and means 
-                        // - https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance
-
+                        // - wrong: https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance
+                        // use the following equations instead
                         {
-                            const Vector dD = preal_b - Dmean;
-                            const Vector dM = pmesh_b - Mmean;
-                            float N = static_cast<float>(Ncorr_ + 1);
-
-                            // reduction
-                            Ncorr_++;
-                            Mmean += dM / N;
-                            Dmean += dD / N;
-                            C += dM.multT(dD);
+                            const float N_1 = static_cast<float>(Ncorr_);
+                            const float N = static_cast<float>(Ncorr_ + 1);
+                            const float w1 = N_1/N;
+                            const float w2 = 1.0/N;
+
+                            const rm::Vector d_mean_old = Dmean;
+                            const rm::Vector m_mean_old = Mmean;
+
+                            const rm::Vector d_mean_new = d_mean_old * w1 + preal_b * w2; 
+                            const rm::Vector m_mean_new = m_mean_old * w1 + pmesh_b * w2;
+
+                            auto P1 = (pmesh_b - m_mean_new).multT(preal_b - d_mean_new);
+                            auto P2 = (m_mean_old - m_mean_new).multT(d_mean_old - d_mean_new);
+
+                            // write
+                            Dmean = d_mean_new;
+                            Mmean = m_mean_new;
+                            C = C * w1 + P1 * w2 + P2 * w1;
+                            Ncorr_ = Ncorr_ + 1;
                         }
                     }
                 }
@@ -377,6 +394,8 @@ void OnDnCorrectorEmbree::findSPC(
                 if(range_real < m_model->range.min 
                     || range_real > m_model->range.max)
                 {
+                    dataset_points[glob_id] = {0.0f, 0.0f, 0.0f};
+                    model_points[glob_id] = {0.0f, 0.0f, 0.0f};
                     corr_valid[glob_id] = 0;
                     continue;
                 }
@@ -435,19 +454,22 @@ void OnDnCorrectorEmbree::findSPC(
 
                     const float distance = (pmesh_s - preal_s).l2norm();
 
+                    // convert back to base (sensor shared coordinate system)
+                    const Vector preal_b = Tsb * preal_s;
+                    const Vector pmesh_b = Tsb * pmesh_s;
+
+                    dataset_points[glob_id] = preal_b;
+                    model_points[glob_id] = pmesh_b;
+
                     if(distance < max_distance)
                     {
-                        // convert back to base (sensor shared coordinate system)
-                        const Vector preal_b = Tsb * preal_s;
-                        const Vector pmesh_b = Tsb * pmesh_s;
-
-                        dataset_points[glob_id] = preal_b;
-                        model_points[glob_id] = pmesh_b;
                         corr_valid[glob_id] = 1;
                     } else {
                         corr_valid[glob_id] = 0;
                     }
                 } else {
+                    dataset_points[glob_id] = {0.0f, 0.0f, 0.0f};
+                    model_points[glob_id] = {0.0f, 0.0f, 0.0f};
                     corr_valid[glob_id] = 0;
                 }
             }
diff --git a/src/rmcl/correction/PinholeCorrectorEmbree.cpp b/src/rmcl/correction/PinholeCorrectorEmbree.cpp
index 5b15419..dffd147 100644
--- a/src/rmcl/correction/PinholeCorrectorEmbree.cpp
+++ b/src/rmcl/correction/PinholeCorrectorEmbree.cpp
@@ -139,18 +139,28 @@ CorrectionResults<rmagine::RAM> PinholeCorrectorEmbree::correct(
                         const Vector pmesh_b = Tsb * pmesh_s;
 
                         // Online update: Covariance and means 
-                        // - https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance
+                        // - wrong: https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance
+                        // use the following equations instead
                         {
-                            Ncorr++;
-                            const float N = static_cast<float>(Ncorr);
+                            const float N_1 = static_cast<float>(Ncorr);
+                            const float N = static_cast<float>(Ncorr + 1);
+                            const float w1 = N_1/N;
+                            const float w2 = 1.0/N;
 
-                            const Vector dD = preal_b - Dmean;
-                            const Vector dM = pmesh_b - Mmean;
+                            const rm::Vector d_mean_old = Dmean;
+                            const rm::Vector m_mean_old = Mmean;
 
-                            // reduction
-                            Dmean += dD / N;
-                            Mmean += dM / N;
-                            C += dM.multT(dD);
+                            const rm::Vector d_mean_new = d_mean_old * w1 + preal_b * w2; 
+                            const rm::Vector m_mean_new = m_mean_old * w1 + pmesh_b * w2;
+
+                            auto P1 = (pmesh_b - m_mean_new).multT(preal_b - d_mean_new);
+                            auto P2 = (m_mean_old - m_mean_new).multT(d_mean_old - d_mean_new);
+
+                            // write
+                            Dmean = d_mean_new;
+                            Mmean = m_mean_new;
+                            C = C * w1 + P1 * w2 + P2 * w1;
+                            Ncorr = Ncorr + 1;
                         }
                     }
                 }
@@ -161,8 +171,6 @@ CorrectionResults<rmagine::RAM> PinholeCorrectorEmbree::correct(
 
         if(Ncorr > 0)
         {
-            C /= static_cast<float>(Ncorr);
-
             Matrix3x3 U, V, S;
 
             { // the only Eigen code left
@@ -301,19 +309,28 @@ void PinholeCorrectorEmbree::computeCovs(
                         const Vector pmesh_b = Tsb * pmesh_s;
 
                         // Online update: Covariance and means 
-                        // - https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance
-
-                        // #pragma omp critical
+                        // - wrong: https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance
+                        // use the following equations instead
                         {
-                            const Vector dD = preal_b - Dmean;
-                            const Vector dM = pmesh_b - Mmean;
-                            float N = static_cast<float>(Ncorr_ + 1);
-
-                            // reduction
-                            Ncorr_++;
-                            Mmean += dM / N;
-                            Dmean += dD / N;
-                            C += dM.multT(dD);
+                            const float N_1 = static_cast<float>(Ncorr_);
+                            const float N = static_cast<float>(Ncorr_ + 1);
+                            const float w1 = N_1/N;
+                            const float w2 = 1.0/N;
+
+                            const rm::Vector d_mean_old = Dmean;
+                            const rm::Vector m_mean_old = Mmean;
+
+                            const rm::Vector d_mean_new = d_mean_old * w1 + preal_b * w2; 
+                            const rm::Vector m_mean_new = m_mean_old * w1 + pmesh_b * w2;
+
+                            auto P1 = (pmesh_b - m_mean_new).multT(preal_b - d_mean_new);
+                            auto P2 = (m_mean_old - m_mean_new).multT(d_mean_old - d_mean_new);
+
+                            // write
+                            Dmean = d_mean_new;
+                            Mmean = m_mean_new;
+                            C = C * w1 + P1 * w2 + P2 * w1;
+                            Ncorr_ = Ncorr_ + 1;
                         }
                     }
                 }
@@ -393,6 +410,8 @@ void PinholeCorrectorEmbree::findSPC(
                 if(range_real < m_model->range.min 
                     || range_real > m_model->range.max)
                 {
+                    dataset_points[glob_id] = {0.0f, 0.0f, 0.0f};
+                    model_points[glob_id] = {0.0f, 0.0f, 0.0f};
                     corr_valid[glob_id] = 0;
                     continue;
                 }
@@ -456,19 +475,22 @@ void PinholeCorrectorEmbree::findSPC(
 
                     const float distance = (pmesh_s - preal_s).l2norm();
 
+                    // convert back to base (sensor shared coordinate system)
+                    const Vector preal_b = Tsb * preal_s;
+                    const Vector pmesh_b = Tsb * pmesh_s;
+
+                    dataset_points[glob_id] = preal_b;
+                    model_points[glob_id] = pmesh_b;
+
                     if(distance < max_distance)
                     {
-                        // convert back to base (sensor shared coordinate system)
-                        const Vector preal_b = Tsb * preal_s;
-                        const Vector pmesh_b = Tsb * pmesh_s;
-
-                        dataset_points[glob_id] = preal_b;
-                        model_points[glob_id] = pmesh_b;
                         corr_valid[glob_id] = 1;
                     } else {
                         corr_valid[glob_id] = 0;
                     }
                 } else {
+                    dataset_points[glob_id] = {0.0f, 0.0f, 0.0f};
+                    model_points[glob_id] = {0.0f, 0.0f, 0.0f};
                     corr_valid[glob_id] = 0;
                 }
             }
@@ -501,269 +523,4 @@ void PinholeCorrectorEmbree::findSPC(
     findSPC(Tbms, dataset_points(0, Nrays), model_points(0, Nrays), corr_valid(0, Nrays));
 }
 
-
-void PinholeCorrectorEmbree::computeCovsSequential(
-    const rmagine::MemoryView<rmagine::Transform, rmagine::RAM>& Tbms,
-    rmagine::MemoryView<rmagine::Vector, rmagine::RAM>& ds,
-    rmagine::MemoryView<rmagine::Vector, rmagine::RAM>& ms,
-    rmagine::MemoryView<rmagine::Matrix3x3, rmagine::RAM>& Cs,
-    rmagine::MemoryView<unsigned int, rmagine::RAM>& Ncorr)
-{
-    const float max_distance = m_params.max_distance;
-
-    auto scene = m_map->scene->handle();
-
-    const rmagine::Transform Tsb = m_Tsb[0];
-
-    // #pragma omp parallel for default(shared) if(Tbms.size() > 4)
-    for(size_t pid=0; pid < Tbms.size(); pid++)
-    {
-        auto Tbms0 = Tbms(pid, pid+1);
-        const rmagine::Transform Tbm = Tbms[pid];
-        const rmagine::Transform Tsm = Tbm * Tsb;
-        const rmagine::Transform Tms = ~Tsm;
-
-        const unsigned int glob_shift = pid * m_model->size();
-
-        Memory<Vector, RAM> dataset_points;
-        Memory<Vector, RAM> model_points;
-        Memory<unsigned int, RAM> corr_valid;
-        findSPC(Tbms0, dataset_points, model_points, corr_valid);
-
-        Vector Dmean = {0.0, 0.0, 0.0};
-        Vector Mmean = {0.0, 0.0, 0.0};
-        unsigned int Ncorr_ = 0;
-        
-        for(size_t i=0; i<dataset_points.size(); i++)
-        {
-            if(corr_valid[i] > 0)
-            {
-                Dmean += dataset_points[i];
-                Mmean += model_points[i];
-                Ncorr_++;
-            }
-        }
-
-        Ncorr[pid] = Ncorr_;
-
-        if(Ncorr_ > 0)
-        {
-            const float Ncorr_f = static_cast<float>(Ncorr_);
-
-            Dmean /= Ncorr_f;
-            Mmean /= Ncorr_f;
-
-            Matrix3x3 C;
-            C.setZeros();
-
-            for(size_t i=0; i<dataset_points.size(); i++)
-            {
-                if(corr_valid[i] > 0)
-                {
-                    C += (model_points[i] - Mmean).multT(dataset_points[i] - Dmean);
-                }
-            }
-
-            C /= Ncorr_f;
-
-            // Bessel's correction for sample variance
-            // C /= (Ncorr_ - 1);
-            
-            ds[pid] = Dmean;
-            ms[pid] = Mmean;
-            Cs[pid] = C;
-        } else {
-            ds[pid] = {0.0, 0.0, 0.0};
-            ms[pid] = {0.0, 0.0, 0.0};
-            Cs[pid].setZeros();
-        }
-    }
-}
-
-void PinholeCorrectorEmbree::computeCovsOuterInnerParallel(
-    const rmagine::MemoryView<rmagine::Transform, rmagine::RAM>& Tbms,
-    rmagine::MemoryView<rmagine::Vector, rmagine::RAM>& ms,
-    rmagine::MemoryView<rmagine::Vector, rmagine::RAM>& ds,
-    rmagine::MemoryView<rmagine::Matrix3x3, rmagine::RAM>& Cs,
-    rmagine::MemoryView<unsigned int, rmagine::RAM>& Ncorr)
-{
-    // std::cout << "computeCovsOuterInnerParallel" << std::endl;
-
-    const float max_distance = m_params.max_distance;
-
-    auto scene = m_map->scene->handle();
-
-    const rmagine::Transform Tsb = m_Tsb[0];
-
-    // #pragma omp parallel for default(shared) if(Tbms.size() > 4)
-    for(size_t pid=0; pid < Tbms.size(); pid++)
-    {
-        const rmagine::Transform Tbm = Tbms[pid];
-        
-        const rmagine::Transform Tsm = Tbm * Tsb;
-        const rmagine::Transform Tms = ~Tsm;
-
-        const unsigned int glob_shift = pid * m_model->size();
-
-        Vector Dmean = {0.0, 0.0, 0.0};
-        Vector Mmean = {0.0, 0.0, 0.0};
-        unsigned int Ncorr_ = 0;
-        Matrix3x3 C;
-        C.setZeros();
-
-        // #pragma omp parallel for default(shared) reduction(+:Dmean,Mmean,Ncorr_,C)
-        for(unsigned int vid = 0; vid < m_model->getHeight(); vid++)
-        {
-            Vector Dmean_inner = {0.0, 0.0, 0.0};
-            Vector Mmean_inner = {0.0, 0.0, 0.0};
-            unsigned int Ncorr_inner = 0;
-            Matrix3x3 C_inner;
-            C_inner.setZeros();
-
-            for(unsigned int hid = 0; hid < m_model->getWidth(); hid++)
-            {
-                const unsigned int loc_id = m_model->getBufferId(vid, hid);
-                const unsigned int glob_id = glob_shift + loc_id;
-
-                const float range_real = m_ranges[loc_id];
-                if(range_real < m_model->range.min 
-                    || range_real > m_model->range.max)
-                {
-                    continue;
-                }
-
-                Vector ray_dir_s;
-                if(m_optical)
-                {
-                    ray_dir_s = m_model->getDirectionOptical(vid, hid);
-                } else {
-                    ray_dir_s = m_model->getDirection(vid, hid);
-                }
-
-                const Vector ray_dir_m = Tsm.R * ray_dir_s;
-
-                RTCRayHit rayhit;
-                rayhit.ray.org_x = Tsm.t.x;
-                rayhit.ray.org_y = Tsm.t.y;
-                rayhit.ray.org_z = Tsm.t.z;
-                rayhit.ray.dir_x = ray_dir_m.x;
-                rayhit.ray.dir_y = ray_dir_m.y;
-                rayhit.ray.dir_z = ray_dir_m.z;
-                rayhit.ray.tnear = 0;
-                rayhit.ray.tfar = INFINITY;
-                rayhit.ray.mask = 0;
-                rayhit.ray.flags = 0;
-                rayhit.hit.geomID = RTC_INVALID_GEOMETRY_ID;
-                rayhit.hit.instID[0] = RTC_INVALID_GEOMETRY_ID;
-
-                rtcIntersect1(scene, &m_context, &rayhit);
-
-                bool sim_valid = rayhit.hit.geomID != RTC_INVALID_GEOMETRY_ID;
-                if(sim_valid)
-                {
-                    Vector nint_m;
-                    nint_m.x = rayhit.hit.Ng_x;
-                    nint_m.y = rayhit.hit.Ng_y;
-                    nint_m.z = rayhit.hit.Ng_z;
-                    nint_m.normalizeInplace();
-
-                    // sensor space
-                    // Do point to plane ICP here
-                    Vector preal_s, pint_s, nint_s;
-                    preal_s = ray_dir_s * range_real;
-
-                    // search point on surface that is more nearby
-                    pint_s = ray_dir_s * rayhit.ray.tfar;
-                    
-                    // transform normal from global to local
-                    nint_s = Tms.R * nint_m;
-
-                    // flip to base: check if this is really needed: no
-                    // if(ray_dir_s.dot(nint_s) > 0.0)
-                    // {
-                    //     nint_s = -nint_s;
-                    // }
-
-                    // distance of real point to plane at simulated point
-                    float signed_plane_dist = (pint_s - preal_s).dot(nint_s);
-                    // project point to plane results in correspondence
-                    const Vector pmesh_s = preal_s + nint_s * signed_plane_dist;  
-
-                    const float distance = (pmesh_s - preal_s).l2norm();
-
-                    if(distance < max_distance)
-                    {
-                        const Vector preal_b = Tsb * preal_s;
-                        const Vector pmesh_b = Tsb * pmesh_s;
-
-                        // Online update: Covariance and means 
-                        // - https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance
-
-                        Ncorr_inner++;
-                        const float N = static_cast<float>(Ncorr_inner);
-
-                        const Vector dD = preal_b - Dmean_inner;
-                        const Vector dM = pmesh_b - Mmean_inner;
-
-                        // reduction
-                        Dmean_inner += dD / N;
-                        Mmean_inner += dM / N;
-                        C_inner += dM.multT(dD);
-                    }
-                }
-            }
-
-            // reduce
-            // must be done sequentially
-            // #pragma omp critical
-            // {
-                // combining covariances of two sets
-                // - https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance
-                const double Nouter = static_cast<double>(Ncorr_);
-                const double Ninner = static_cast<double>(Ncorr_inner);
-                const double Ntot = static_cast<double>(Ncorr_ + Ncorr_inner);
-
-
-                auto dD = Dmean_inner.cast<double>() - Dmean.cast<double>();
-                auto dM = Mmean_inner.cast<double>() - Mmean.cast<double>();
-
-                Dmean = (Dmean * Nouter / Ntot + Dmean_inner * Ninner / Ntot).cast<float>();
-                Mmean = (Mmean * Nouter / Ntot + Mmean_inner * Ninner / Ntot).cast<float>();            
-
-                auto Cd = C.cast<double>();
-                auto C_innerd = C_inner.cast<double>();
-                auto dC = dM.multT(dD);
-
-                // const Matrix3x3 dC = (Mmean_inner - Mmean).multT(Dmean_inner - Dmean);
-
-                // const float factor = Ninner / Ntot * N;
-                Cd = Cd + C_innerd + dC * Ninner / Ntot * Nouter;
-                // C = C_inner + C;
-
-                C = Cd.cast<float>();
-                
-                Ncorr_ += Ncorr_inner;
-            // }
-        }
-
-        Ncorr[pid] = Ncorr_;
-
-        if(Ncorr_ > 0)
-        {
-            C /= static_cast<float>(Ncorr_);
-            // C *= 0.95;
-            // Bessel's correction for sample variance
-            // C /= (Ncorr_ - 1);
-            
-            ms[pid] = Mmean;
-            ds[pid] = Dmean;
-            Cs[pid] = C;
-        } else {
-            ms[pid] = {0.0, 0.0, 0.0};
-            ds[pid] = {0.0, 0.0, 0.0};
-            Cs[pid].setZeros();
-        }
-    }
-}
-
 } // namespace rmcl
\ No newline at end of file
diff --git a/src/rmcl/correction/PinholeCorrectorOptix.cpp b/src/rmcl/correction/PinholeCorrectorOptix.cpp
index c57d3a4..5411f9d 100644
--- a/src/rmcl/correction/PinholeCorrectorOptix.cpp
+++ b/src/rmcl/correction/PinholeCorrectorOptix.cpp
@@ -162,8 +162,8 @@ void PinholeCorrectorOptix::computeCovs(
         if(Tbms.size() < max_poses_rw)
         {
             // std::cout << "Raywise" << std::endl;
-            rm::StopWatchHR sw;
-            double el;
+            // rm::StopWatchHR sw;
+            // double el;
 
             // sw();
             computeMeansCovsRW(Tbms, ds, ms, Cs, Ncorr);
@@ -359,15 +359,6 @@ void PinholeCorrectorOptix::computeMeansCovsRW(
             means_dataset, means_model, // outputs
             Cs, Ncorr
         );
-
-    // old: two pass
-    // mean_batched(dataset_points, corr_valid, Ncorr, means_dataset);
-    // mean_batched(model_points, corr_valid, Ncorr, means_model);
-    // rm::sumBatched(corr_valid, Ncorr);
-
-    // cov_batched(dataset_points, means_dataset,
-    //         model_points, means_model,
-    //         corr_valid, Ncorr, Cs);
 }
 
 void PinholeCorrectorOptix::computeMeansCovsSW(
diff --git a/src/rmcl/correction/SphereCorrectorEmbree.cpp b/src/rmcl/correction/SphereCorrectorEmbree.cpp
index cae36ea..3b5f542 100644
--- a/src/rmcl/correction/SphereCorrectorEmbree.cpp
+++ b/src/rmcl/correction/SphereCorrectorEmbree.cpp
@@ -129,18 +129,28 @@ CorrectionResults<rmagine::RAM> SphereCorrectorEmbree::correct(
                         const Vector pmesh_b = Tsb * pmesh_s;
 
                         // Online update: Covariance and means 
-                        // - https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance
+                        // - wrong: https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance
+                        // use the following equations instead
                         {
-                            Ncorr++;
-                            const float N = static_cast<float>(Ncorr);
+                            const float N_1 = static_cast<float>(Ncorr);
+                            const float N = static_cast<float>(Ncorr + 1);
+                            const float w1 = N_1/N;
+                            const float w2 = 1.0/N;
 
-                            const Vector dD = preal_b - Dmean;
-                            const Vector dM = pmesh_b - Mmean;
+                            const rm::Vector d_mean_old = Dmean;
+                            const rm::Vector m_mean_old = Mmean;
 
-                            // reduction
-                            Dmean += dD / N;
-                            Mmean += dM / N;
-                            C += dM.multT(dD);
+                            const rm::Vector d_mean_new = d_mean_old * w1 + preal_b * w2; 
+                            const rm::Vector m_mean_new = m_mean_old * w1 + pmesh_b * w2;
+
+                            auto P1 = (pmesh_b - m_mean_new).multT(preal_b - d_mean_new);
+                            auto P2 = (m_mean_old - m_mean_new).multT(d_mean_old - d_mean_new);
+
+                            // write
+                            Dmean = d_mean_new;
+                            Mmean = m_mean_new;
+                            C = C * w1 + P1 * w2 + P2 * w1;
+                            Ncorr = Ncorr + 1;
                         }
                     }
                 }
@@ -151,9 +161,6 @@ CorrectionResults<rmagine::RAM> SphereCorrectorEmbree::correct(
 
         if(Ncorr > 0)
         {
-            const float Ncorr_f = static_cast<float>(Ncorr);
-            C /= Ncorr_f;
-
             Matrix3x3 U, V, S;
 
             { // the only Eigen code left
@@ -283,39 +290,38 @@ void SphereCorrectorEmbree::computeCovs(
                         const Vector pmesh_b = Tsb * pmesh_s;
 
                         // Online update: Covariance and means 
-                        // - https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance
-
+                        // - wrong: https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance
+                        // use the following equations instead
                         {
-                            const Vector dD = preal_b - Dmean;
-                            const Vector dM = pmesh_b - Mmean;
-                            float N = static_cast<float>(Ncorr_ + 1);
-
-                            // reduction
-                            Ncorr_++;
-                            Mmean += dM / N;
-                            Dmean += dD / N;
-                            C += dM.multT(dD);
+                            const float N_1 = static_cast<float>(Ncorr_);
+                            const float N = static_cast<float>(Ncorr_ + 1);
+                            const float w1 = N_1/N;
+                            const float w2 = 1.0/N;
+
+                            const rm::Vector d_mean_old = Dmean;
+                            const rm::Vector m_mean_old = Mmean;
+
+                            const rm::Vector d_mean_new = d_mean_old * w1 + preal_b * w2; 
+                            const rm::Vector m_mean_new = m_mean_old * w1 + pmesh_b * w2;
+
+                            auto P1 = (pmesh_b - m_mean_new).multT(preal_b - d_mean_new);
+                            auto P2 = (m_mean_old - m_mean_new).multT(d_mean_old - d_mean_new);
+
+                            // write
+                            Dmean = d_mean_new;
+                            Mmean = m_mean_new;
+                            C = C * w1 + P1 * w2 + P2 * w1;
+                            Ncorr_ = Ncorr_ + 1;
                         }
                     }
                 }
             }
         }
 
+        ds[pid] = Dmean;
+        ms[pid] = Mmean;
+        Cs[pid] = C;
         Ncorr[pid] = Ncorr_;
-
-        if(Ncorr_ > 0)
-        {
-            const float Ncorr_f = static_cast<float>(Ncorr_);
-            C /= Ncorr_f;
-            
-            ds[pid] = Dmean;
-            ms[pid] = Mmean;
-            Cs[pid] = C;
-        } else {
-            ds[pid] = {0.0f, 0.0f, 0.0f};
-            ms[pid] = {0.0f, 0.0f, 0.0f};
-            Cs[pid].setZeros();
-        }
     }
 }
 
@@ -373,6 +379,8 @@ void SphereCorrectorEmbree::findSPC(
                 if(range_real < m_model->range.min 
                     || range_real > m_model->range.max)
                 {
+                    dataset_points[glob_id] = {0.0f, 0.0f, 0.0f};
+                    model_points[glob_id] = {0.0f, 0.0f, 0.0f};
                     corr_valid[glob_id] = 0;
                     continue;
                 }
@@ -429,19 +437,22 @@ void SphereCorrectorEmbree::findSPC(
 
                     const float distance = (pmesh_s - preal_s).l2norm();
 
+                    // convert back to base (sensor shared coordinate system)
+                    const Vector preal_b = Tsb * preal_s;
+                    const Vector pmesh_b = Tsb * pmesh_s;
+
+                    dataset_points[glob_id] = preal_b;
+                    model_points[glob_id] = pmesh_b;
+
                     if(distance < max_distance)
                     {
-                        // convert back to base (sensor shared coordinate system)
-                        const Vector preal_b = Tsb * preal_s;
-                        const Vector pmesh_b = Tsb * pmesh_s;
-
-                        dataset_points[glob_id] = preal_b;
-                        model_points[glob_id] = pmesh_b;
                         corr_valid[glob_id] = 1;
                     } else {
                         corr_valid[glob_id] = 0;
                     }
                 } else {
+                    dataset_points[glob_id] = {0.0f, 0.0f, 0.0f};
+                    model_points[glob_id] = {0.0f, 0.0f, 0.0f};
                     corr_valid[glob_id] = 0;
                 }
             }
@@ -474,4 +485,20 @@ void SphereCorrectorEmbree::findSPC(
     findSPC(Tbms, dataset_points(0, Nrays), model_points(0, Nrays), corr_valid(0, Nrays));
 }
 
+void SphereCorrectorEmbree::findSPC(
+    const rmagine::MemoryView<rmagine::Transform, rmagine::RAM>& Tbms,
+    Correspondences<rmagine::RAM>& corr)
+{
+    findSPC(Tbms, corr.dataset_points, corr.model_points, corr.corr_valid);
+}
+
+Correspondences<rmagine::RAM> SphereCorrectorEmbree::findSPC(
+    const rmagine::MemoryView<rmagine::Transform, rmagine::RAM>& Tbms)
+{
+    Correspondences<rmagine::RAM> ret;
+    findSPC(Tbms, ret);
+    return ret;
+}
+
+
 } // namespace rmcl
\ No newline at end of file
diff --git a/src/rmcl/correction/SphereCorrectorOptix.cpp b/src/rmcl/correction/SphereCorrectorOptix.cpp
index f031419..c907970 100644
--- a/src/rmcl/correction/SphereCorrectorOptix.cpp
+++ b/src/rmcl/correction/SphereCorrectorOptix.cpp
@@ -261,13 +261,6 @@ void SphereCorrectorOptix::computeMeansCovsRW(
         means_dataset, means_model, // outputs
         Cs, Ncorr
     );
-
-    // old: two-pass
-    // means_covs_batched(
-    //     dataset_points, model_points, corr_valid, // input
-    //     means_dataset, means_model, // outputs
-    //     Cs, Ncorr
-    // );
 }
 
 void SphereCorrectorOptix::computeMeansCovsSW(
diff --git a/src/rmcl/correction/optix/O1DnCorrectProgramSW.cu b/src/rmcl/correction/optix/O1DnCorrectProgramSW.cu
index 1fe1bd2..538aa16 100644
--- a/src/rmcl/correction/optix/O1DnCorrectProgramSW.cu
+++ b/src/rmcl/correction/optix/O1DnCorrectProgramSW.cu
@@ -108,30 +108,34 @@ extern "C" __global__ void __raygen__rg()
                     const rm::Vector preal_b = Tsb * preal_s;
                     const rm::Vector pmesh_b = Tsb * pmesh_s;
                     // Online update: Covariance and means 
-                    // - https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance
+                    // - wrong: https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance 
+                    // use the following equations instead
                     {
-                        Ncorr++;
-                        const float N = static_cast<float>(Ncorr);
+                        const float N_1 = static_cast<float>(Ncorr);
+                        const float N = static_cast<float>(Ncorr + 1);
+                        const float w1 = N_1/N;
+                        const float w2 = 1.0/N;
 
-                        const rm::Vector dD = preal_b - Dmean;
-                        const rm::Vector dM = pmesh_b - Mmean;
+                        const rm::Vector d_mean_old = Dmean;
+                        const rm::Vector m_mean_old = Mmean;
 
-                        // reduction
-                        Dmean += dD / N;
-                        Mmean += dM / N;
-                        C += dM.multT(dD);
+                        const rm::Vector d_mean_new = d_mean_old * w1 + preal_b * w2; 
+                        const rm::Vector m_mean_new = m_mean_old * w1 + pmesh_b * w2;
+
+                        auto P1 = (pmesh_b - m_mean_new).multT(preal_b - d_mean_new);
+                        auto P2 = (m_mean_old - m_mean_new).multT(d_mean_old - d_mean_new);
+
+                        // write
+                        Dmean = d_mean_new;
+                        Mmean = m_mean_new;
+                        C = C * w1 + P1 * w2 + P2 * w1;
+                        Ncorr = Ncorr + 1;
                     }
                 }
             }
         }
 
         mem.Ncorr[pid] = Ncorr;
-
-        if(Ncorr > 0)
-        {
-            C /= static_cast<float>(Ncorr);
-        }
-
         mem.C[pid] = C;
         mem.m1[pid] = Dmean;
         mem.m2[pid] = Mmean;
diff --git a/src/rmcl/correction/optix/OnDnCorrectProgramSW.cu b/src/rmcl/correction/optix/OnDnCorrectProgramSW.cu
index b483845..0a93380 100644
--- a/src/rmcl/correction/optix/OnDnCorrectProgramSW.cu
+++ b/src/rmcl/correction/optix/OnDnCorrectProgramSW.cu
@@ -93,11 +93,7 @@ extern "C" __global__ void __raygen__rg()
                 const rm::Vector pint_s = ray_orig_s + ray_dir_s * range;
 
                 rm::Vector nint_s = Rms * nint_m;
-                
-                // if(nint_s.dot(ray_dir_s) > 0.0 )
-                // {
-                //     nint_s *= -1.0;
-                // }
+
 
                 const float signed_plane_dist = (pint_s - preal_s).dot(nint_s);
                 const rm::Vector pmesh_s = preal_s + nint_s * signed_plane_dist;
@@ -109,32 +105,34 @@ extern "C" __global__ void __raygen__rg()
                     const rm::Vector preal_b = Tsb * preal_s;
                     const rm::Vector pmesh_b = Tsb * pmesh_s;
                     // Online update: Covariance and means 
-                    // - https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance
-                    // -- The means shoud be exact (except of floating point issues)
-                    // -- Are the covariances exact? I dont think so
+                    // - wrong: https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance 
+                    // use the following equations instead
                     {
-                        Ncorr++;
-                        const float N = static_cast<float>(Ncorr);
+                        const float N_1 = static_cast<float>(Ncorr);
+                        const float N = static_cast<float>(Ncorr + 1);
+                        const float w1 = N_1/N;
+                        const float w2 = 1.0/N;
+
+                        const rm::Vector d_mean_old = Dmean;
+                        const rm::Vector m_mean_old = Mmean;
+
+                        const rm::Vector d_mean_new = d_mean_old * w1 + preal_b * w2; 
+                        const rm::Vector m_mean_new = m_mean_old * w1 + pmesh_b * w2;
 
-                        const rm::Vector dD = preal_b - Dmean;
-                        const rm::Vector dM = pmesh_b - Mmean;
+                        auto P1 = (pmesh_b - m_mean_new).multT(preal_b - d_mean_new);
+                        auto P2 = (m_mean_old - m_mean_new).multT(d_mean_old - d_mean_new);
 
-                        // reduction
-                        Dmean += dD / N;
-                        Mmean += dM / N;
-                        C += dM.multT(dD);
+                        // write
+                        Dmean = d_mean_new;
+                        Mmean = m_mean_new;
+                        C = C * w1 + P1 * w2 + P2 * w1;
+                        Ncorr = Ncorr + 1;
                     }
                 }
             }
         }
 
         mem.Ncorr[pid] = Ncorr;
-
-        if(Ncorr > 0)
-        {
-            C /= static_cast<float>(Ncorr);
-        }
-
         mem.C[pid] = C;
         mem.m1[pid] = Dmean;
         mem.m2[pid] = Mmean;
diff --git a/src/rmcl/correction/optix/PinholeCorrectProgramSW.cu b/src/rmcl/correction/optix/PinholeCorrectProgramSW.cu
index 0084b0c..c75667e 100644
--- a/src/rmcl/correction/optix/PinholeCorrectProgramSW.cu
+++ b/src/rmcl/correction/optix/PinholeCorrectProgramSW.cu
@@ -106,31 +106,36 @@ extern "C" __global__ void __raygen__rg()
                 {
                     const rm::Vector preal_b = Tsb * preal_s;
                     const rm::Vector pmesh_b = Tsb * pmesh_s;
+                    
                     // Online update: Covariance and means 
-                    // - https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance
+                    // - wrong: https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance
+                    // use the following equations instead
                     {
-                        Ncorr++;
-                        const float N = static_cast<float>(Ncorr);
+                        const float N_1 = static_cast<float>(Ncorr);
+                        const float N = static_cast<float>(Ncorr + 1);
+                        const float w1 = N_1/N;
+                        const float w2 = 1.0/N;
 
-                        const rm::Vector dD = preal_b - Dmean;
-                        const rm::Vector dM = pmesh_b - Mmean;
+                        const rm::Vector d_mean_old = Dmean;
+                        const rm::Vector m_mean_old = Mmean;
 
-                        // reduction
-                        Dmean += dD / N;
-                        Mmean += dM / N;
-                        C += dM.multT(dD);
+                        const rm::Vector d_mean_new = d_mean_old * w1 + preal_b * w2; 
+                        const rm::Vector m_mean_new = m_mean_old * w1 + pmesh_b * w2;
+
+                        auto P1 = (pmesh_b - m_mean_new).multT(preal_b - d_mean_new);
+                        auto P2 = (m_mean_old - m_mean_new).multT(d_mean_old - d_mean_new);
+
+                        // write
+                        Dmean = d_mean_new;
+                        Mmean = m_mean_new;
+                        C = C * w1 + P1 * w2 + P2 * w1;
+                        Ncorr = Ncorr + 1;
                     }
                 }
             }
         }
 
         mem.Ncorr[pid] = Ncorr;
-
-        if(Ncorr > 0)
-        {
-            C /= static_cast<float>(Ncorr);
-        }
-
         mem.C[pid] = C;
         mem.m1[pid] = Dmean;
         mem.m2[pid] = Mmean;
diff --git a/src/rmcl/correction/optix/SphereCorrectProgramSW.cu b/src/rmcl/correction/optix/SphereCorrectProgramSW.cu
index 48bb0b8..516ec9d 100644
--- a/src/rmcl/correction/optix/SphereCorrectProgramSW.cu
+++ b/src/rmcl/correction/optix/SphereCorrectProgramSW.cu
@@ -116,30 +116,34 @@ extern "C" __global__ void __raygen__rg()
                     const rm::Vector preal_b = Tsb * preal_s;
                     const rm::Vector pmesh_b = Tsb * pmesh_s;
                     // Online update: Covariance and means 
-                    // - https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance
+                    // - wrong: https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance 
+                    // use the following equations instead
                     {
-                        Ncorr++;
-                        const float N = static_cast<float>(Ncorr);
+                        const float N_1 = static_cast<float>(Ncorr);
+                        const float N = static_cast<float>(Ncorr + 1);
+                        const float w1 = N_1/N;
+                        const float w2 = 1.0/N;
 
-                        const rm::Vector dD = preal_b - Dmean;
-                        const rm::Vector dM = pmesh_b - Mmean;
+                        const rm::Vector d_mean_old = Dmean;
+                        const rm::Vector m_mean_old = Mmean;
 
-                        // reduction
-                        Dmean += dD / N;
-                        Mmean += dM / N;
-                        C += dM.multT(dD);
+                        const rm::Vector d_mean_new = d_mean_old * w1 + preal_b * w2; 
+                        const rm::Vector m_mean_new = m_mean_old * w1 + pmesh_b * w2;
+
+                        auto P1 = (pmesh_b - m_mean_new).multT(preal_b - d_mean_new);
+                        auto P2 = (m_mean_old - m_mean_new).multT(d_mean_old - d_mean_new);
+
+                        // write
+                        Dmean = d_mean_new;
+                        Mmean = m_mean_new;
+                        C = C * w1 + P1 * w2 + P2 * w1;
+                        Ncorr = Ncorr + 1;
                     }
                 }
             }
         }
 
         mem.Ncorr[pid] = Ncorr;
-
-        if(Ncorr > 0)
-        {
-            C /= static_cast<float>(Ncorr);
-        }
-
         mem.C[pid] = C;
         mem.m1[pid] = Dmean;
         mem.m2[pid] = Mmean;
diff --git a/src/rmcl/math/math.cpp b/src/rmcl/math/math.cpp
index 6251e8f..7b89705 100644
--- a/src/rmcl/math/math.cpp
+++ b/src/rmcl/math/math.cpp
@@ -3,6 +3,7 @@
 #include <rmagine/util/prints.h>
 
 using namespace rmagine;
+namespace rm = rmagine;
 
 namespace rmcl {
 
@@ -158,9 +159,16 @@ void weighted_average(
         const unsigned int Ncorr_ = Ncorr1[pid] + Ncorr2[pid];
         const float Ncorrf = static_cast<float>(Ncorr_);
 
-        ds[pid] = ds1[pid] * w1 + ds2[pid] * w2;
-        ms[pid] = ms1[pid] * w1 + ms2[pid] * w2;
-        Cs[pid] = Cs1[pid] * w1 + Cs2[pid] * w2;
+        const rm::Vector dsi = ds1[pid] * w1 + ds2[pid] * w2;
+        const rm::Vector msi = ms1[pid] * w1 + ms2[pid] * w2;
+        
+        // std::cout << "weighted_average 1 - NEW IMPL" << std::endl;
+        auto P1 = Cs1[pid] * w1 + Cs2[pid] * w2;
+        auto P2 = (ms1[pid] - msi).multT(ds1[pid] - dsi) * w1 + (ms2[pid] - msi).multT(ds2[pid] - dsi) * w2;
+
+        ds[pid] = dsi;
+        ms[pid] = msi;
+        Cs[pid] = P1 + P2;
         Ncorr[pid] = Ncorr_;
     }
 }
@@ -175,6 +183,8 @@ void weighted_average(
     rmagine::MemoryView<rmagine::Matrix3x3, rmagine::RAM>& Cs,
     rmagine::MemoryView<unsigned int, rmagine::RAM>& Ncorr)
 {
+    // std::cout << "weighted_average 2 - NEW IMPL" << std::endl;
+
     #pragma omp parallel for
     for(size_t pid=0; pid<ds.size(); pid++)
     {
@@ -222,33 +232,42 @@ void weighted_average(
     rmagine::MemoryView<rmagine::Matrix3x3, rmagine::RAM>& Cs,
     rmagine::MemoryView<unsigned int, rmagine::RAM>& Ncorr)
 {
+    // std::cout << "weighted_average 3 - NEW IMPL" << std::endl;
+
     #pragma omp parallel for if(ds.size() > 100)
     for(size_t pid=0; pid<ds.size(); pid++)
     {
-        unsigned int Ncorr_ = 0;
-        for(size_t i=0; i<dataset_means.size(); i++)
-        {
-            Ncorr_ += Ncorrs[i][pid];
-        }
-
-        // std::vector<float> weights(model_means.size());
-        // const float Ncorrf = static_cast<float>(Ncorr_);
-
-        // for(size_t i=0; i<model_means.size(); i++)
-        // {
-        //     weights[i] = static_cast<float>(Ncorrs[i][pid]) / Ncorrf;
-        // }
-
         Vector ms_ = {0.0, 0.0, 0.0};
         Vector ds_ = {0.0, 0.0, 0.0};
         Matrix3x3 C_;
         C_.setZeros();
+        unsigned int Ncorr_ = 0;
+        float w_ = 0.0;
 
         for(size_t i=0; i<dataset_means.size(); i++)
         {
-            ds_ += dataset_means[i][pid] * weights[i];
-            ms_ += model_means[i][pid] * weights[i];
-            C_ += covs[i][pid] * weights[i];
+            const float w =  weights[i];
+            const rm::Vector Di = dataset_means[i][pid];
+            const rm::Vector Mi = model_means[i][pid];
+            const rm::Matrix3x3 Ci = covs[i][pid];
+            
+            const float w_tot = w_ + w;
+            const float w1 = w_ / w_tot;
+            const float w2 = w  / w_tot;
+
+            const rm::Vector ds_old = ds_;
+            const rm::Vector ms_old = ms_;
+
+            // ds_ = w_ * ds_old + w * 
+            ds_ = ds_old * w1 + Di * w2;
+            ms_ = ms_old * w1 + Mi * w2;
+
+            auto P1 = C_ * w1 + Ci * w2;
+            auto P2 = (ms_old - ms_).multT(ds_old - ds_) * w1 + (Mi - ms_).multT(Di - ds_) * w2;
+
+            C_ = P1 + P2;
+            Ncorr_ += Ncorrs[i][pid];
+            w_ += w;
         }
 
         ds[pid] = ds_;
@@ -263,6 +282,8 @@ void weighted_average(
     const std::vector<CorrectionPreResults<rmagine::RAM> >& pre_results,
     CorrectionPreResults<rmagine::RAM>& pre_results_combined)
 {
+    // std::cout << "weighted_average 4 - NEW IMPL" << std::endl;
+
     // source: to fuse
     std::vector<MemoryView<Vector, RAM> > ds;
     std::vector<MemoryView<Vector, RAM> > ms;
@@ -305,6 +326,7 @@ void weighted_average(
     const std::vector<float>& weights,
     CorrectionPreResults<rmagine::RAM>& pre_results_combined)
 {
+    // std::cout << "weighted_average 5 - NEW IMPL" << std::endl;
     // source: to fuse
     std::vector<MemoryView<Vector, RAM> > ds;
     std::vector<MemoryView<Vector, RAM> > ms;
@@ -327,6 +349,8 @@ CorrectionPreResults<rmagine::RAM> weighted_average(
     const std::vector<CorrectionPreResults<rmagine::RAM> >& pre_results,
     const std::vector<float>& weights)
 {
+    // std::cout << "weighted_average 6 - NEW IMPL" << std::endl;
+
     CorrectionPreResults<rmagine::RAM> res;
     if(pre_results.size() > 0)
     {
diff --git a/src/rmcl/math/math.cu b/src/rmcl/math/math.cu
index 0669d6e..330b24e 100644
--- a/src/rmcl/math/math.cu
+++ b/src/rmcl/math/math.cu
@@ -174,12 +174,27 @@ __global__ void weighted_average_kernel(
     {
         const unsigned int Ncorr_ = Ncorr1[pid] + Ncorr2[pid];
         const float Ncorrf = static_cast<float>(Ncorr_);
-        float w1 = static_cast<float>(Ncorr1[pid]) / Ncorrf;
-        float w2 = static_cast<float>(Ncorr2[pid]) / Ncorrf;
-
-        ds[pid] = ds1[pid] * w1 + ds2[pid] * w2;
-        ms[pid] = ms1[pid] * w1 + ms2[pid] * w2;
-        Cs[pid] = Cs1[pid] * w1 + Cs2[pid] * w2;
+        const float w1 = static_cast<float>(Ncorr1[pid]) / Ncorrf;
+        const float w2 = static_cast<float>(Ncorr2[pid]) / Ncorrf;
+
+        const rm::Vector d1 = ds1[pid];
+        const rm::Vector m1 = ms1[pid];
+        const rm::Matrix3x3 C1 = Cs1[pid];
+        const rm::Vector d2 = ds2[pid];
+        const rm::Vector m2 = ms2[pid];
+        const rm::Matrix3x3 C2 = Cs2[pid];
+
+        // means
+        const rm::Vector d = d1 * w1 + d2 * w2;
+        const rm::Vector m = m1 * w1 + m2 * w2;
+
+        // covs
+        auto P1 = C1 * w1 + C2 * w2;
+        auto P2 = (m1 - m).multT(d1 - d) * w1 + (m2 - m).multT(d2 - d) * w2;
+        
+        ds[pid] = d;
+        ms[pid] = m;
+        Cs[pid] = P1 + P2;
         Ncorr[pid] = Ncorr_;
     }
 }
@@ -221,9 +236,24 @@ __global__ void weighted_average_kernel(
     {
         const unsigned int Ncorr_ = Ncorr1[pid] + Ncorr2[pid];
 
-        ds[pid] = ds1[pid] * w1 + ds2[pid] * w2;
-        ms[pid] = ms1[pid] * w1 + ms2[pid] * w2;
-        Cs[pid] = Cs1[pid] * w1 + Cs2[pid] * w2;
+        const rm::Vector d1 = ds1[pid];
+        const rm::Vector m1 = ms1[pid];
+        const rm::Matrix3x3 C1 = Cs1[pid];
+        const rm::Vector d2 = ds2[pid];
+        const rm::Vector m2 = ms2[pid];
+        const rm::Matrix3x3 C2 = Cs2[pid];
+
+        // means
+        const rm::Vector d = d1 * w1 + d2 * w2;
+        const rm::Vector m = m1 * w1 + m2 * w2;
+        
+        // covs
+        auto P1 = C1 * w1 + C2 * w2;
+        auto P2 = (m1 - m).multT(d1 - d) * w1 + (m2 - m).multT(d2 - d) * w2;
+
+        ds[pid] = d;
+        ms[pid] = m;
+        Cs[pid] = P1 + P2;
         Ncorr[pid] = Ncorr_;
     }
 }
diff --git a/src/rmcl/math/math_batched.cpp b/src/rmcl/math/math_batched.cpp
index c541168..530c88f 100644
--- a/src/rmcl/math/math_batched.cpp
+++ b/src/rmcl/math/math_batched.cpp
@@ -101,9 +101,8 @@ void means_covs_online_batched(
             {
                 const float N_1 = static_cast<float>(n_corr);
                 const float N = static_cast<float>(n_corr + 1);
-
-                // update ncorr
-                n_corr++;
+                const float w1 = N_1/N;
+                const float w2 = 1.0/N;
 
                 const rm::Vector Di = data_batch[j]; // read
                 const rm::Vector Mi = model_batch[j]; // read
@@ -111,27 +110,20 @@ void means_covs_online_batched(
                 const rm::Vector m_mean_old = m_mean; // read
 
                 // update means
-                // rm::Vector dD = Di - d_mean;
-                // rm::Vector dM = Mi - m_mean;
                 
                 // save old means for covariance
                 
-                const rm::Vector d_mean_new = d_mean_old + (Di - d_mean) / N; 
-                const rm::Vector m_mean_new = m_mean_old + (Mi - m_mean) / N; 
-
-                const float w1 = N_1/N;
-                const float w2 = 1.0/N;
+                const rm::Vector d_mean_new = d_mean_old * w1 + Di * w2; 
+                const rm::Vector m_mean_new = m_mean_old * w1 + Mi * w2;
 
                 auto P1 = (Mi - m_mean_new).multT(Di - d_mean_new);
                 auto P2 = (m_mean_old - m_mean_new).multT(d_mean_old - d_mean_new);
 
+                // Write
                 C = C * w1 + P1 * w2 + P2 * w1;
-
-                // C += ((Mi - m_mean_new).multT(Di - d_mean_new) - C) * 1.0 / N 
-                //     + (m_mean_old - m_mean_new).multT(d_mean_old - d_mean_new) * N_1/N;
-
                 d_mean = d_mean_new; // write
                 m_mean = m_mean_new; // write
+                n_corr = n_corr + 1;
             }
         }
 
diff --git a/src/rmcl/math/math_batched.cu b/src/rmcl/math/math_batched.cu
index 3fabb89..5693698 100644
--- a/src/rmcl/math/math_batched.cu
+++ b/src/rmcl/math/math_batched.cu
@@ -376,35 +376,25 @@ __global__ void means_covs_online_batched_kernel(
                 const rm::Vector Dnew = D + (Di - D) / Nf;
                 const rm::Vector Mnew = M + (Mi - M) / Nf;
 
-                // dD = Di - Dnew;
-                // dM = Mi - Mnew;
-
-
-                const float w1 = N_1f / Nf;
-                const float w2 = 1.0 / Nf;
-
-                auto P1 = (Mi - Mnew).multT(Di - Dnew);
-                auto P2 = (M - Mnew).multT(D - Dnew);
-
-                // printf("w1: %f\n", w1);
-                // printf("w2: %f\n", w2);
-                // printf("P1: %f\n", P1(0,0));
-                // printf("P2: %f\n", P2(0,0));
-                // printf("P3: %f\n", P3(0,0));
-
                 // shared write
                 // auto Cnew = P1 + P2 + P3;
                 // sC[tid] = C * N_1f / Nf + (Mi - Mnew).multT(Di - Dnew) * 1.0 / Nf
                 //     + (M - Mnew).multT(D - Dnew) * N_1f / Nf;
                 
+                const float w1 = N_1f / Nf;
+                const float w2 = 1.0 / Nf;
+
+                auto P1 = (Mi - Mnew).multT(Di - Dnew);
+                auto P2 = (M - Mnew).multT(D - Dnew);                
+                
                 auto Cnew = C * w1 + P1 * w2 + P2 * w1;
 
+
+
                 sC[tid] = Cnew;
                 sD[tid] = Dnew;
                 sM[tid] = Mnew;
                 sN[tid] = N;
-
-                printf("(%u,G) -> (%u,S) = %u, %f\n", globId + blockSize * i, tid, N, Cnew(0,0) );
             }
         }
     }
@@ -434,29 +424,16 @@ __global__ void means_covs_online_batched_kernel(
                 const float w1 = static_cast<float>(N) / static_cast<float>(Ntot);
                 const float w2 = static_cast<float>(Nnext) / static_cast<float>(Ntot);
 
-
-
-                // auto P1 = (Mi - Mnew).multT(Di - Dnew);
-                // auto P2 = (M - Mnew).multT(D - Dnew);
-
                 const rm::Vector Dtot = D * w1 + Dnext * w2;
-
-
-
-                // printf("(%u,S) %u, (%f, %f, %f) -> (%u,S) %u, (%f, %f, %f) = %u, (%f, %f, %f)\n", 
-                //     tid + s, Nnext, Dnext.x, Dnext.y, Dnext.z,
-                //     tid, N, D.x, D.y, D.z,
-                //     Ntot, Dtot.x, Dtot.y, Dtot.z);
+                const rm::Vector Mtot = M * w1 + Mnext * w2;
                 
+                auto P1 = C * w1 + Cnext * w2;
+                auto P2 = (M - Mtot).multT(D - Dtot) * w1 + (Mnext - Mtot).multT(Dnext - Dtot) * w2;
+                auto Cnew = P1 + P2;
+
                 // shared write
                 sD[tid] = Dtot;
-                sM[tid] = M * w1 + Mnext * w2;
-                
-
-                // TODO weighted covariance
-                auto Cnew = C * w1 + Cnext * w2;
-
-
+                sM[tid] = Mtot;
                 sC[tid] = Cnew; // currently only a sum
                 sN[tid] = Ntot;
             }
@@ -574,7 +551,6 @@ __global__ void means_covs_online_approx_batched_kernel(
                 const float w1 = static_cast<float>(N) / static_cast<float>(Ntot);
                 const float w2 = static_cast<float>(Nnext) / static_cast<float>(Ntot);
 
-
                 const rm::Vector Dtot = D * w1 + Dnext * w2;
                 // printf("(%u,S) %u, (%f, %f, %f) -> (%u,S) %u, (%f, %f, %f) = %u, (%f, %f, %f)\n", 
                 //     tid + s, Nnext, Dnext.x, Dnext.y, Dnext.z,