From 89e1dd0793c64614d2a1ae3c0d33096461767842 Mon Sep 17 00:00:00 2001
From: Matt Borland <matt@mattborland.com>
Date: Thu, 19 Sep 2024 10:07:28 -0400
Subject: [PATCH 1/7] Update packages

---
 conan/conanfile.py           | 2 +-
 ports/decimal/portfile.cmake | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/conan/conanfile.py b/conan/conanfile.py
index 7a0774c4b..b2ac89198 100644
--- a/conan/conanfile.py
+++ b/conan/conanfile.py
@@ -19,7 +19,7 @@
 
 class CharconvConan(ConanFile):
     name = "boost_decimal"
-    version = "1.1.0"
+    version = "2.3.0"
     description = "Boost provides free peer-reviewed portable C++ source libraries"
     url = "https://github.com/cppalliance/decimal"
     homepage = "https://github.com/cppalliance/decimal"
diff --git a/ports/decimal/portfile.cmake b/ports/decimal/portfile.cmake
index 881e719c4..6f0ae9e03 100644
--- a/ports/decimal/portfile.cmake
+++ b/ports/decimal/portfile.cmake
@@ -7,8 +7,8 @@
 vcpkg_from_github(
         OUT_SOURCE_PATH SOURCE_PATH
         REPO cppalliance/decimal
-        REF v1.1.0
-        SHA512 9ee10a32958a58e96ec4d1fc5f39e3c86352d36365547716e7903340435878b86b56f74e8abcaedfc227154c409d8ccfdb0563a2d98b605aa1799c071ba15cca
+        REF v2.3.0
+        SHA512 a5dac21e3f786e028b5e526c615c1ff7f671f2759a6968edaf97de90f5c5ede59e956dca730d14edabce4d86b58d5dacca2dab0706cb9e17474871f6433171eb
         HEAD_REF master
 )
 

From c995e2323054af12251776ff9db1ebba58ffe44a Mon Sep 17 00:00:00 2001
From: Matt Borland <matt@mattborland.com>
Date: Thu, 19 Sep 2024 13:07:54 -0400
Subject: [PATCH 2/7] Remove u128 x64 intrinsic optimizations

---
 include/boost/decimal/detail/emulated128.hpp | 90 +++++---------------
 1 file changed, 22 insertions(+), 68 deletions(-)

diff --git a/include/boost/decimal/detail/emulated128.hpp b/include/boost/decimal/detail/emulated128.hpp
index e99e25ec8..574ebe585 100644
--- a/include/boost/decimal/detail/emulated128.hpp
+++ b/include/boost/decimal/detail/emulated128.hpp
@@ -757,31 +757,15 @@ constexpr auto uint128::operator+=(std::uint64_t n) noexcept -> uint128&
 
 constexpr auto operator+(uint128 lhs, uint128 rhs) noexcept -> uint128
 {
-    #if (defined(BOOST_DECIMAL_HAS_X64_INTRINSICS) || defined(BOOST_DECIMAL_HAS_MSVC_64BIT_INTRINSICS)) && !defined(BOOST_DECIMAL_NO_CONSTEVAL_DETECTION)
-    if (!BOOST_DECIMAL_IS_CONSTANT_EVALUATED(lhs.low))
-    {
-        // Branchless version can be executed on x64 machines when available
-        unsigned long long low {};
-        unsigned long long high {};
-
-        const auto carry {BOOST_DECIMAL_ADD_CARRY(0, lhs.low, rhs.low, &low)};
-        BOOST_DECIMAL_ADD_CARRY(carry, lhs.high, rhs.high, &high);
+    uint128 temp {lhs.high + rhs.high, lhs.low + rhs.low};
 
-        return uint128{high, low};
-    }
-    else
-    #endif
+    // The low-word addition wrapped around, so carry one into the high word
+    if (temp.low < lhs.low)
     {
-        uint128 temp {lhs.high + rhs.high, lhs.low + rhs.low};
-
-        // Need to carry a bit into rhs
-        if (temp.low < lhs.low)
-        {
-            ++temp.high;
-        }
-
-        return temp;
+        ++temp.high;
     }
+
+    return temp;
 }
 
 constexpr auto uint128::operator+=(uint128 v) noexcept -> uint128&
@@ -807,31 +791,15 @@ constexpr auto uint128::operator++(int) noexcept -> uint128
 
 constexpr auto operator-(uint128 lhs, uint128 rhs) noexcept -> uint128
 {
-    #if (defined(BOOST_DECIMAL_HAS_X64_INTRINSICS) || defined(BOOST_DECIMAL_HAS_MSVC_64BIT_INTRINSICS)) && !defined(BOOST_DECIMAL_NO_CONSTEVAL_DETECTION)
-    if (!BOOST_DECIMAL_IS_CONSTANT_EVALUATED(lhs.low))
-    {
-        // Branchless version can be executed on x64 machines when available
-        unsigned long long low {};
-        unsigned long long high {};
-
-        const auto carry {_subborrow_u64(0, lhs.low, rhs.low, &low)};
-        _subborrow_u64(carry, lhs.high, rhs.high, &high);
+    uint128 temp {lhs.high - rhs.high, lhs.low - rhs.low};
 
-        return uint128{high, low};
-    }
-    else
-    #endif
+    // The low-word subtraction underflowed, so borrow one from the high word
+    if (lhs.low < rhs.low)
     {
-        uint128 temp {lhs.high - rhs.high, lhs.low - rhs.low};
-
-        // Check for carry
-        if (lhs.low < rhs.low)
-        {
-            --temp.high;
-        }
-
-        return temp;
+        --temp.high;
     }
+
+    return temp;
 }
 
 constexpr auto uint128::operator-=(uint128 v) noexcept -> uint128&
@@ -1328,35 +1296,21 @@ constexpr auto int128::operator>(int rhs) const noexcept -> bool
 
 constexpr auto operator+(const int128& lhs, const int128& rhs) noexcept -> int128
 {
-    #if (defined(BOOST_DECIMAL_HAS_X64_INTRINSICS) || defined(BOOST_DECIMAL_HAS_MSVC_64BIT_INTRINSICS)) && !defined(BOOST_DECIMAL_NO_CONSTEVAL_DETECTION)
-    if (!BOOST_DECIMAL_IS_CONSTANT_EVALUATED(lhs.low))
-    {
-        unsigned long long low {};
-        unsigned long long high {};
-
-        const auto carry {BOOST_DECIMAL_ADD_CARRY(0, lhs.low, rhs.low, &low)};
-        BOOST_DECIMAL_ADD_CARRY(carry, static_cast<std::uint64_t>(lhs.high), static_cast<std::uint64_t>(rhs.high), &high);
-
-        return {static_cast<std::int64_t>(high), low};
-    }
-    #endif
-    {
-        #ifdef BOOST_DECIMAL_HAS_INT128
+    #ifdef BOOST_DECIMAL_HAS_INT128
 
-        const auto lhs_full {(static_cast<__uint128_t>(lhs.high) << 64) | lhs.low};
-        const auto rhs_full {(static_cast<__uint128_t>(rhs.high) << 64) | rhs.low};
-        const auto result {lhs_full + rhs_full};
+    const auto lhs_full {(static_cast<__uint128_t>(lhs.high) << 64) | lhs.low};
+    const auto rhs_full {(static_cast<__uint128_t>(rhs.high) << 64) | rhs.low};
+    const auto result {lhs_full + rhs_full};
 
-        return {static_cast<std::int64_t>(result >> 64), static_cast<std::uint64_t>(result)};
+    return {static_cast<std::int64_t>(result >> 64), static_cast<std::uint64_t>(result)};
 
-        #else
+    #else
 
-        const auto new_low {lhs.low + rhs.low};
-        const auto new_high {lhs.high + rhs.high + static_cast<std::int64_t>(new_low < lhs.low)};
-        return int128{new_high, new_low};
+    const auto new_low {lhs.low + rhs.low};
+    const auto new_high {lhs.high + rhs.high + static_cast<std::int64_t>(new_low < lhs.low)};
+    return int128{new_high, new_low};
 
-        #endif
-    }
+    #endif
 }
 
 constexpr auto operator-(const int128& lhs, const int128& rhs) noexcept -> int128

From 81c80d50659ce13c917d3737a0f5fe71cf4a8b7c Mon Sep 17 00:00:00 2001
From: Matt Borland <matt@mattborland.com>
Date: Thu, 19 Sep 2024 14:51:23 -0400
Subject: [PATCH 3/7] Add x64 performance to the docs

---
 doc/decimal/benchmarks.adoc | 161 ++++++++++++++++++++++++++++++++++++
 1 file changed, 161 insertions(+)

diff --git a/doc/decimal/benchmarks.adoc b/doc/decimal/benchmarks.adoc
index 9a700ddae..b4f7e0db3 100644
--- a/doc/decimal/benchmarks.adoc
+++ b/doc/decimal/benchmarks.adoc
@@ -25,6 +25,38 @@ An example on Linux with b2: `../../../b2 cxxstd=20 toolset=gcc-13 define=BOOST_
 The benchmark for comparisons generates a random vector containing 2,000,000 elements and does operations `>`, `>=`, `<`, `\<=`, `==`, and `!=` between `vec[i] and vec[i + 1]`.
 This is repeated 5 times to generate stable results.
 
+=== x64 Linux Results
+
+Run using an Intel i9-11900k chipset running RHEL 9.4 and GCC 11.4.1-3
+
+|===
+| Type | Runtime (us) | Ratio to `double`
+| `float`
+| 35,581
+| 0.604
+| `double`
+| 58,848
+| 1.000
+| `decimal32`
+| 2,410,084
+| 40.954
+| `decimal64`
+| 4,233,175
+| 71.934
+| `decimal128`
+| 6,337,447
+| 107.692
+| `decimal32_fast`
+| 628,241
+| 10.676
+| `decimal64_fast`
+| 724,474
+| 12.311
+| `decimal128_fast`
+| 517,930
+| 8.801
+|===
+
 === M1 macOS Results
 
 Run using a Macbook pro with M1 pro chipset running macOS Sonoma 14.4.1 and homebrew Clang 18.1.4
@@ -62,6 +94,130 @@ Run using a Macbook pro with M1 pro chipset running macOS Sonoma 14.4.1 and home
 The benchmark for these operations generates a random vector containing 2,000,000 elements and does operations `+`, `-`, `*`, `/` between `vec[i] and vec[i + 1]`.
 This is repeated 5 times to generate stable results.
 
+=== x64 Linux Results
+
+Run using an Intel i9-11900k chipset running RHEL 9.4 and GCC 11.4.1-3
+
+==== Addition
+
+|===
+| Type | Runtime (us) | Ratio to `double`
+| `float`
+| 54,566
+| 1.077
+| `double`
+| 50,640
+| 1.000
+| `decimal32`
+| 3,639,957
+| 71.879
+| `decimal64`
+| 4,172,318
+| 82.392
+| `decimal128`
+| 10,936,595
+| 215.968
+| `decimal32_fast`
+| 1,148,249
+| 22.675
+| `decimal64_fast`
+| 1,149,203
+| 22.694
+| `decimal128_fast`
+| 7,424,598
+| 146.615
+|===
+
+==== Subtraction
+
+|===
+| Type | Runtime (us) | Ratio to `double`
+| `float`
+| 48,654
+| 0.912
+| `double`
+| 53,348
+| 1.000
+| `decimal32`
+| 2,850,709
+| 53.436
+| `decimal64`
+| 3,493,936
+| 65.493
+| `decimal128`
+| 10,492,728
+| 196.685
+| `decimal32_fast`
+| 1,012,199
+| 18.974
+| `decimal64_fast`
+| 1,055,476
+| 19.785
+| `decimal128_fast`
+| 2,114,185
+| 39.630
+|===
+
+==== Multiplication
+
+|===
+| Type | Runtime (us) | Ratio to `double`
+| `float`
+| 53,405
+| 1.101
+| `double`
+| 48,497
+| 1.000
+| `decimal32`
+| 2,708,779
+| 55.855
+| `decimal64`
+| 2,761,465
+| 56.941
+| `decimal128`
+| 8,509,678
+| 175.468
+| `decimal32_fast`
+| 451,679
+| 9.313
+| `decimal64_fast`
+| 777,927
+| 16.041
+| `decimal128_fast`
+| 13,970,509
+| 288.070
+|===
+
+==== Division
+
+|===
+| Type | Runtime (us) | Ratio to `double`
+| `float`
+| 58,955
+| 0.755
+| `double`
+| 78,046
+| 1.000
+| `decimal32`
+| 2,907,134
+| 37.249
+| `decimal64`
+| 3,464,841
+| 44.394
+| `decimal128`
+| 18,202,742
+| 233.231
+| `decimal32_fast`
+| 1,092,346
+| 13.996
+| `decimal64_fast`
+| 1,207,648
+| 15.474
+| `decimal128_fast`
+| 1,208,184
+| 15.480
+|===
+
 === M1 macOS Results
 
 Run using a Macbook pro with M1 pro chipset running macOS Sonoma 14.4.1 and homebrew Clang 18.1.4
@@ -186,6 +342,9 @@ Run using a Macbook pro with M1 pro chipset running macOS Sonoma 14.4.1 and home
 | 77.956
 |===
 
+////
+These are not available for the built-ins, so the section is commented out rather than deleted or incorporated.
+
 == Selected Special Functions
 
 The benchmark for these operations generates a random vector containing 2,000,000 elements and does operations `+`, `-`, `*`, `/` between `vec[i] and vec[i + 1]`.
@@ -377,3 +536,5 @@ Run using a Macbook pro with M1 pro chipset running macOS Sonoma 14.4.1 and home
 | 3,108,380
 | 9.724
 |===
+
+////

From bf087a2bf0f8df14f27f1326211916398b250377 Mon Sep 17 00:00:00 2001
From: Matt Borland <matt@mattborland.com>
Date: Thu, 19 Sep 2024 15:20:34 -0400
Subject: [PATCH 4/7] Update number of runs for C benchmarks

---
 test/benchmark_libdfp.c | 28 +++++++++++++++-------------
 1 file changed, 15 insertions(+), 13 deletions(-)

diff --git a/test/benchmark_libdfp.c b/test/benchmark_libdfp.c
index 16f2cb654..be0a0f24f 100644
--- a/test/benchmark_libdfp.c
+++ b/test/benchmark_libdfp.c
@@ -8,7 +8,7 @@
 #include <time.h>
 #include <inttypes.h>
 
-#define K 2000000
+#define K 20000000
 #define N 5
 
 double float_rand(double min, double max)
@@ -136,21 +136,21 @@ __attribute__ ((__noinline__)) void test_comparisons_128(_Decimal128* data, cons
 
 typedef _Decimal32 (*operation_32)(_Decimal32, _Decimal32);
 
-_Decimal32 add_32(_Decimal32 a, _Decimal32 b)
+__attribute__ ((__noinline__)) _Decimal32 add_32(_Decimal32 a, _Decimal32 b)
 {
     return a + b;
 }
-_Decimal32 sub_32(_Decimal32 a, _Decimal32 b)
+__attribute__ ((__noinline__)) _Decimal32 sub_32(_Decimal32 a, _Decimal32 b)
 {
     return a - b;
 }
 
-_Decimal32 mul_32(_Decimal32 a, _Decimal32 b)
+__attribute__ ((__noinline__)) _Decimal32 mul_32(_Decimal32 a, _Decimal32 b)
 {
     return a * b;
 }
 
-_Decimal32 div_32(_Decimal32 a, _Decimal32 b)
+__attribute__ ((__noinline__)) _Decimal32 div_32(_Decimal32 a, _Decimal32 b)
 {
     return a / b;
 }
@@ -181,21 +181,22 @@ __attribute__ ((__noinline__)) void test_two_element_operation_32(_Decimal32* da
 
 typedef _Decimal64 (*operation_64)(_Decimal64, _Decimal64);
 
-_Decimal64 add_64(_Decimal64 a, _Decimal64 b)
+__attribute__ ((__noinline__)) _Decimal64 add_64(_Decimal64 a, _Decimal64 b)
 {
     return a + b;
 }
-_Decimal64 sub_64(_Decimal64 a, _Decimal64 b)
+
+__attribute__ ((__noinline__)) _Decimal64 sub_64(_Decimal64 a, _Decimal64 b)
 {
     return a - b;
 }
 
-_Decimal64 mul_64(_Decimal64 a, _Decimal64 b)
+__attribute__ ((__noinline__)) _Decimal64 mul_64(_Decimal64 a, _Decimal64 b)
 {
     return a * b;
 }
 
-_Decimal64 div_64(_Decimal64 a, _Decimal64 b)
+__attribute__ ((__noinline__)) _Decimal64 div_64(_Decimal64 a, _Decimal64 b)
 {
     return a / b;
 }
@@ -226,21 +227,22 @@ __attribute__ ((__noinline__)) void test_two_element_operation_64(_Decimal64* da
 
 typedef _Decimal128 (*operation_128)(_Decimal128, _Decimal128);
 
-_Decimal128 add_128(_Decimal128 a, _Decimal128 b)
+__attribute__ ((__noinline__)) _Decimal128 add_128(_Decimal128 a, _Decimal128 b)
 {
     return a + b;
 }
-_Decimal128 sub_128(_Decimal128 a, _Decimal128 b)
+
+__attribute__ ((__noinline__)) _Decimal128 sub_128(_Decimal128 a, _Decimal128 b)
 {
     return a - b;
 }
 
-_Decimal128 mul_128(_Decimal128 a, _Decimal128 b)
+__attribute__ ((__noinline__)) _Decimal128 mul_128(_Decimal128 a, _Decimal128 b)
 {
     return a * b;
 }
 
-_Decimal128 div_128(_Decimal128 a, _Decimal128 b)
+__attribute__ ((__noinline__)) _Decimal128 div_128(_Decimal128 a, _Decimal128 b)
 {
     return a / b;
 }

From 693907fe349c5584ffc3e27bae449eebc6a08a06 Mon Sep 17 00:00:00 2001
From: Matt Borland <matt@mattborland.com>
Date: Thu, 19 Sep 2024 15:21:41 -0400
Subject: [PATCH 5/7] Add libdfp benchmarks to table

---
 doc/decimal/benchmarks.adoc | 45 +++++++++++++++++++++++++++++++++++++
 1 file changed, 45 insertions(+)

diff --git a/doc/decimal/benchmarks.adoc b/doc/decimal/benchmarks.adoc
index b4f7e0db3..b90946181 100644
--- a/doc/decimal/benchmarks.adoc
+++ b/doc/decimal/benchmarks.adoc
@@ -55,6 +55,15 @@ Run using an Intel i9-11900k chipset running RHEL 9.4 and GCC 11.4.1-3
 | `decimal128_fast`
 | 517,930
 | 8.801
+| GCC `_Decimal32`
+| 893,375
+| 15.181
+| GCC `_Decimal64`
+| 496,127
+| 8.431
+| GCC `_Decimal128`
+| 1,143,636
+| 19.434
 |===
 
 === M1 macOS Results
@@ -126,6 +135,15 @@ Run using an Intel i9-11900k chipset running RHEL 9.4 and GCC 11.4.1-3
 | `decimal128_fast`
 | 7,424,598
 | 146.615
+| GCC `_Decimal32`
+| 2,997,658
+| 59.195
+| GCC `_Decimal64`
+| 2,129,898
+| 42.060
+| GCC `_Decimal128`
+| 3,056,979
+| 60.367
 |===
 
 ==== Subtraction
@@ -156,6 +174,15 @@ Run using an Intel i9-11900k chipset running RHEL 9.4 and GCC 11.4.1-3
 | `decimal128_fast`
 | 2,114,185
 | 39.630
+| GCC `_Decimal32`
+| 2,006,964
+| 37.620
+| GCC `_Decimal64`
+| 1,324,796
+| 24.833
+| GCC `_Decimal128`
+| 2,783,553
+| 52.177
 |===
 
 ==== Multiplication
@@ -186,6 +213,15 @@ Run using an Intel i9-11900k chipset running RHEL 9.4 and GCC 11.4.1-3
 | `decimal128_fast`
 | 13,970,509
 | 288.070
+| GCC `_Decimal32`
+| 2,507,998
+| 51.714
+| GCC `_Decimal64`
+| 2,414,864
+| 49.794
+| GCC `_Decimal128`
+| 6,248,956
+| 128.852
 |===
 
 ==== Division
@@ -216,6 +252,15 @@ Run using an Intel i9-11900k chipset running RHEL 9.4 and GCC 11.4.1-3
 | `decimal128_fast`
 | 1,208,184
 | 15.480
+| GCC `_Decimal32`
+| 5,002,197
+| 64.093
+| GCC `_Decimal64`
+| 2,961,731
+| 37.949
+| GCC `_Decimal128`
+| 10,095,995
+| 129.360
 |===
 
 === M1 macOS Results

From 17be9c21f39cbb1e23b800aa4f5f0c9bc4129be3 Mon Sep 17 00:00:00 2001
From: Matt Borland <matt@mattborland.com>
Date: Fri, 20 Sep 2024 09:59:50 -0400
Subject: [PATCH 6/7] Fix ordering

---
 test/benchmarks.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/benchmarks.cpp b/test/benchmarks.cpp
index aa58485c0..4b47be4c9 100644
--- a/test/benchmarks.cpp
+++ b/test/benchmarks.cpp
@@ -378,8 +378,8 @@ int main()
 
     std::cout << "\n===== Subtraction =====\n";
 
-    test_two_element_operation(double_vector, std::minus<>(), "Subtraction", "double");
     test_two_element_operation(float_vector, std::minus<>(), "Subtraction", "float");
+    test_two_element_operation(double_vector, std::minus<>(), "Subtraction", "double");
     test_two_element_operation(dec32_vector, std::minus<>(), "Subtraction", "decimal32");
     test_two_element_operation(dec64_vector, std::minus<>(), "Subtraction", "decimal64");
     test_two_element_operation(dec128_vector, std::minus<>(), "Subtraction", "decimal128");

From 0478b0cbb8500b7726dfc04efb1d1a90a49e59a7 Mon Sep 17 00:00:00 2001
From: Matt Borland <matt@mattborland.com>
Date: Fri, 20 Sep 2024 10:21:01 -0400
Subject: [PATCH 7/7] Update apple benchmarks

---
 doc/decimal/benchmarks.adoc | 156 ++++++++++++++++++------------------
 1 file changed, 78 insertions(+), 78 deletions(-)

diff --git a/doc/decimal/benchmarks.adoc b/doc/decimal/benchmarks.adoc
index b90946181..b2ff69d33 100644
--- a/doc/decimal/benchmarks.adoc
+++ b/doc/decimal/benchmarks.adoc
@@ -22,7 +22,7 @@ An example on Linux with b2: `../../../b2 cxxstd=20 toolset=gcc-13 define=BOOST_
 
 == Comparisons
 
-The benchmark for comparisons generates a random vector containing 2,000,000 elements and does operations `>`, `>=`, `<`, `\<=`, `==`, and `!=` between `vec[i] and vec[i + 1]`.
+The benchmark for comparisons generates a random vector containing 20,000,000 elements and does operations `>`, `>=`, `<`, `\<=`, `==`, and `!=` between `vec[i]` and `vec[i + 1]`.
 This is repeated 5 times to generate stable results.
 
 === x64 Linux Results
@@ -68,39 +68,39 @@ Run using an Intel i9-11900k chipset running RHEL 9.4 and GCC 11.4.1-3
 
 === M1 macOS Results
 
-Run using a Macbook pro with M1 pro chipset running macOS Sonoma 14.4.1 and homebrew Clang 18.1.4
+Run using a Macbook pro with M1 pro chipset running macOS Sequoia 15.0 and homebrew Clang 18.1.8
 
 |===
 | Type | Runtime (us) | Ratio to `double`
 | `float`
-| 8587
-| 1.376
+| 146,976
+| 2.319
 | `double`
-| 6240
+| 63,382
 | 1.000
 | `decimal32`
-| 275,597
-| 44.166
+| 1,797,597
+| 28.361
 | `decimal64`
-| 296,929
-| 47.587
+| 2,799,376
+| 44.167
 | `decimal128`
-| 821,847
-| 131.706
+| 6,478,939
+| 102.220
 | `decimal32_fast`
-| 99,664
-| 15.972
+| 1,070,232
+| 16.885
 | `decimal64_fast`
-| 102,132
-| 16.367
+| 1,111,273
+| 17.533
 | `decimal128_fast`
-| 146,302
-| 23.446
+| 1,118,976
+| 17.654
 |===
 
 == Basic Operations
 
-The benchmark for these operations generates a random vector containing 2,000,000 elements and does operations `+`, `-`, `*`, `/` between `vec[i] and vec[i + 1]`.
+The benchmark for these operations generates a random vector containing 20,000,000 elements and does operations `+`, `-`, `*`, `/` between `vec[i]` and `vec[i + 1]`.
 This is repeated 5 times to generate stable results.
 
 === x64 Linux Results
@@ -272,29 +272,29 @@ Run using a Macbook pro with M1 pro chipset running macOS Sonoma 14.4.1 and home
 |===
 | Type | Runtime (us) | Ratio to `double`
 | `float`
-| 2705
-| 0.859
+| 16,685
+| 0.955
 | `double`
-| 3148
+| 17,476
 | 1.000
 | `decimal32`
-| 351,505
-| 111.660
+| 2,528,095
+| 144.661
 | `decimal64`
-| 359,425
-| 114.176
+| 2,713,507
+| 155.270
 | `decimal128`
-| 1,446,674
-| 459.553
+| 11,969,714
+| 684.923
 | `decimal32_fast`
-| 146,873
-| 46.656
+| 1,423,277
+| 81.442
 | `decimal64_fast`
-| 139,294
-| 44.248
+| 1,280,409
+| 73.267
 | `decimal128_fast`
-| 707,308
-| 224.685
+| 6,047,499
+| 346.046
 |===
 
 ==== Subtraction
@@ -302,29 +302,29 @@ Run using a Macbook pro with M1 pro chipset running macOS Sonoma 14.4.1 and home
 |===
 | Type | Runtime (us) | Ratio to `double`
 | `float`
-| 3339
-| 2.014
+| 16,302
+| 0.957
 | `double`
-| 1658
+| 17,033
 | 1.000
 | `decimal32`
-| 267,646
-| 161.427
+| 2,010,525
+| 118.037
 | `decimal64`
-| 303,589
-| 183.106
+| 2,237,729
+| 131.376
 | `decimal128`
-| 954,211
-| 575.519
+| 6,907,396
+| 405.530
 | `decimal32_fast`
-| 147,112
-| 88.729
+| 1,378,448
+| 80.928
 | `decimal64_fast`
-| 145,606
-| 87.820
+| 1,276,731
+| 74.956
 | `decimal128_fast`
-| 394,538
-| 2387.960
+| 2,970,586
+| 174.401
 |===
 
 ==== Multiplication
@@ -332,29 +332,29 @@ Run using a Macbook pro with M1 pro chipset running macOS Sonoma 14.4.1 and home
 |===
 | Type | Runtime (us) | Ratio to `double`
 | `float`
-| 1646
-| 0.957
+| 16,499
+| 0.926
 | `double`
-| 1720
+| 17,821
 | 1.000
 | `decimal32`
-| 313,219
-| 182.104
+| 1,951,504
+| 109.506
 | `decimal64`
-| 583,818
-| 339.429
+| 2,480,528
+| 139.191
 | `decimal128`
-| 1,881,936
-| 1094.149
+| 14,360,630
+| 805.826
 | `decimal32_fast`
-| 86,093
-| 50.054
+| 630,355
+| 35.371
 | `decimal64_fast`
-| 333,582
-| 193.943
+| 987,703
+| 55.424
 | `decimal128_fast`
-| 1,269,429
-| 738.040
+| 12,573,178
+| 705.526
 |===
 
 ==== Division
@@ -362,29 +362,29 @@ Run using a Macbook pro with M1 pro chipset running macOS Sonoma 14.4.1 and home
 |===
 | Type | Runtime (us) | Ratio to `double`
 | `float`
-| 2120
-| 0.547
+| 20,267
+| 0.841
 | `double`
-| 3874
+| 24,111
 | 1.000
 | `decimal32`
-| 307,337
-| 79.333
+| 1,757,506
+| 72.892
 | `decimal64`
-| 447,910
-| 115.620
+| 3,496,913
+| 145.033
 | `decimal128`
-| 2,544,798
-| 656.892
+| 20,017,989
+| 830.243
 | `decimal32_fast`
-| 105,796
-| 27.309
+| 846,727
+| 35.118
 | `decimal64_fast`
-| 291,671
-| 75.289
+| 2,484,985
+| 103.064
 | `decimal128_fast`
-| 302,003
-| 77.956
+| 2,490,175
+| 103.280
 |===
 
 ////