diff --git a/Bwdif/Bwdif.cpp b/Bwdif/Bwdif.cpp
index b83ceaa..c7b4114 100644
--- a/Bwdif/Bwdif.cpp
+++ b/Bwdif/Bwdif.cpp
@@ -26,53 +26,62 @@
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
+#include
#include
#include
#include
#include
-#include
#include
#include
-/*
- * Filter coefficients coef_lf and coef_hf taken from BBC PH-2071 (Weston 3 Field Deinterlacer).
- * Used when there is spatial and temporal interpolation.
- * Filter coefficients coef_sp are used when there is spatial interpolation only.
- * Adjusted for matching visual sharpness impression of spatial and temporal interpolation.
- */
-static constexpr uint16_t coef_lf[2] = { 4309, 213 };
-static constexpr uint16_t coef_hf[3] = { 5570, 3801, 1016 };
-static constexpr uint16_t coef_sp[2] = { 5077, 981 };
-static constexpr float coef_lf_f[2] = { 4309 / 8192.0f, 213 / 8192.0f };
-static constexpr float coef_hf_f[3] = { 5570 / 8192.0f, 3801 / 8192.0f, 1016 / 8192.0f };
-static constexpr float coef_sp_f[2] = { 5077 / 8192.0f, 981 / 8192.0f };
+#include "Bwdif.h"
+
+#ifdef BWDIF_X86
+template<typename pixel_t, bool spat> extern void filterEdge_sse2(const void * _prev2, const void * _prev, const void * _cur, const void * _next, const void * _next2, void * _dst, const int width, const int positiveStride, const int negativeStride, const int stride2, const int step, const int peak) noexcept;
+template<typename pixel_t, bool spat> extern void filterEdge_avx2(const void * _prev2, const void * _prev, const void * _cur, const void * _next, const void * _next2, void * _dst, const int width, const int positiveStride, const int negativeStride, const int stride2, const int step, const int peak) noexcept;
+template<typename pixel_t, bool spat> extern void filterEdge_avx512(const void * _prev2, const void * _prev, const void * _cur, const void * _next, const void * _next2, void * _dst, const int width, const int positiveStride, const int negativeStride, const int stride2, const int step, const int peak) noexcept;
+
+template<typename pixel_t> extern void filterLine_sse2(const void * _prev2, const void * _prev, const void * _cur, const void * _next, const void * _next2, void * _dst, const int width, const int stride, const int stride2, const int stride3, const int stride4, const int step, const int peak) noexcept;
+template<typename pixel_t> extern void filterLine_avx2(const void * _prev2, const void * _prev, const void * _cur, const void * _next, const void * _next2, void * _dst, const int width, const int stride, const int stride2, const int stride3, const int stride4, const int step, const int peak) noexcept;
+template<typename pixel_t> extern void filterLine_avx512(const void * _prev2, const void * _prev, const void * _cur, const void * _next, const void * _next2, void * _dst, const int width, const int stride, const int stride2, const int stride3, const int stride4, const int step, const int peak) noexcept;
+#endif
struct BwdifData final {
VSNodeRef * node;
VSVideoInfo vi;
const VSVideoInfo * viSaved;
int field;
- int peak;
+ int edgeStep, lineStep, peak;
+ void (*filterEdgeWithSpat)(const void * _prev2, const void * _prev, const void * _cur, const void * _next, const void * _next2, void * _dst, const int width, const int positiveStride, const int negativeStride, const int stride2, const int step, const int peak) noexcept;
+ void (*filterEdgeWithoutSpat)(const void * _prev2, const void * _prev, const void * _cur, const void * _next, const void * _next2, void * _dst, const int width, const int positiveStride, const int negativeStride, const int stride2, const int step, const int peak) noexcept;
+ void (*filterLine)(const void * _prev2, const void * _prev, const void * _cur, const void * _next, const void * _next2, void * _dst, const int width, const int stride, const int stride2, const int stride3, const int stride4, const int step, const int peak) noexcept;
};
-template<typename T, bool spat>
-static inline void filterEdge(const T * prev2, const T * prev, const T * cur, const T * next, const T * next2, T * VS_RESTRICT dst, const int width,
- const int positiveStride, const int negativeStride, const int stride2, const int peak) noexcept {
- const T * prev2Above2 = prev2 - stride2;
- const T * prev2Below2 = prev2 + stride2;
- const T * prevAbove = prev + negativeStride;
- const T * prevBelow = prev + positiveStride;
- const T * curAbove = cur + negativeStride;
- const T * curBelow = cur + positiveStride;
- const T * nextAbove = next + negativeStride;
- const T * nextBelow = next + positiveStride;
- const T * next2Above2 = next2 - stride2;
- const T * next2Below2 = next2 + stride2;
+template<typename pixel_t, bool spat>
+static inline void filterEdge_c(const void * _prev2, const void * _prev, const void * _cur, const void * _next, const void * _next2, void * _dst, const int width,
+ const int positiveStride, const int negativeStride, const int stride2, const int step, const int peak) noexcept {
+ const pixel_t * prev2 = reinterpret_cast<const pixel_t *>(_prev2);
+ const pixel_t * prev = reinterpret_cast<const pixel_t *>(_prev);
+ const pixel_t * cur = reinterpret_cast<const pixel_t *>(_cur);
+ const pixel_t * next = reinterpret_cast<const pixel_t *>(_next);
+ const pixel_t * next2 = reinterpret_cast<const pixel_t *>(_next2);
+ pixel_t * VS_RESTRICT dst = reinterpret_cast<pixel_t *>(_dst);
+
+ const pixel_t * prev2Above2 = prev2 - stride2;
+ const pixel_t * prev2Below2 = prev2 + stride2;
+ const pixel_t * prevAbove = prev + negativeStride;
+ const pixel_t * prevBelow = prev + positiveStride;
+ const pixel_t * curAbove = cur + negativeStride;
+ const pixel_t * curBelow = cur + positiveStride;
+ const pixel_t * nextAbove = next + negativeStride;
+ const pixel_t * nextBelow = next + positiveStride;
+ const pixel_t * next2Above2 = next2 - stride2;
+ const pixel_t * next2Below2 = next2 + stride2;
for (int x = 0; x < width; x++) {
- if constexpr (std::is_integral_v<T>) {
+ if constexpr (std::is_integral_v<pixel_t>) {
const int c = curAbove[x];
const int d = (prev2[x] + next2[x]) >> 1;
const int e = curBelow[x];
@@ -89,9 +98,9 @@ static inline void filterEdge(const T * prev2, const T * prev, const T * cur, co
const int f = ((prev2Below2[x] + next2Below2[x]) >> 1) - e;
const int dc = d - c;
const int de = d - e;
- const int max = std::max({ de, dc, std::min(b, f) });
- const int min = std::min({ de, dc, std::max(b, f) });
- diff = std::max({ diff, min, -max });
+ const int maximum = std::max({ de, dc, std::min(b, f) });
+ const int minimum = std::min({ de, dc, std::max(b, f) });
+ diff = std::max({ diff, minimum, -maximum });
}
const int interpol = std::clamp((c + e) >> 1, d - diff, d + diff);
@@ -114,9 +123,9 @@ static inline void filterEdge(const T * prev2, const T * prev, const T * cur, co
const float f = ((prev2Below2[x] + next2Below2[x]) * 0.5f) - e;
const float dc = d - c;
const float de = d - e;
- const float max = std::max({ de, dc, std::min(b, f) });
- const float min = std::min({ de, dc, std::max(b, f) });
- diff = std::max({ diff, min, -max });
+ const float maximum = std::max({ de, dc, std::min(b, f) });
+ const float minimum = std::min({ de, dc, std::max(b, f) });
+ diff = std::max({ diff, minimum, -maximum });
}
dst[x] = std::clamp((c + e) * 0.5f, d - diff, d + diff);
@@ -125,28 +134,35 @@ static inline void filterEdge(const T * prev2, const T * prev, const T * cur, co
}
}
-template<typename T>
-static inline void filterLine(const T * prev2, const T * prev, const T * cur, const T * next, const T * next2, T * VS_RESTRICT dst, const int width,
- const int stride, const int stride2, const int stride3, const int stride4, const int peak) noexcept {
- const T * prev2Above4 = prev2 - stride4;
- const T * prev2Above2 = prev2 - stride2;
- const T * prev2Below2 = prev2 + stride2;
- const T * prev2Below4 = prev2 + stride4;
- const T * prevAbove = prev - stride;
- const T * prevBelow = prev + stride;
- const T * curAbove3 = cur - stride3;
- const T * curAbove = cur - stride;
- const T * curBelow = cur + stride;
- const T * curBelow3 = cur + stride3;
- const T * nextAbove = next - stride;
- const T * nextBelow = next + stride;
- const T * next2Above4 = next2 - stride4;
- const T * next2Above2 = next2 - stride2;
- const T * next2Below2 = next2 + stride2;
- const T * next2Below4 = next2 + stride4;
+template<typename pixel_t>
+static inline void filterLine_c(const void * _prev2, const void * _prev, const void * _cur, const void * _next, const void * _next2, void * _dst, const int width,
+ const int stride, const int stride2, const int stride3, const int stride4, const int step, const int peak) noexcept {
+ const pixel_t * prev2 = reinterpret_cast<const pixel_t *>(_prev2);
+ const pixel_t * prev = reinterpret_cast<const pixel_t *>(_prev);
+ const pixel_t * cur = reinterpret_cast<const pixel_t *>(_cur);
+ const pixel_t * next = reinterpret_cast<const pixel_t *>(_next);
+ const pixel_t * next2 = reinterpret_cast<const pixel_t *>(_next2);
+ pixel_t * VS_RESTRICT dst = reinterpret_cast<pixel_t *>(_dst);
+
+ const pixel_t * prev2Above4 = prev2 - stride4;
+ const pixel_t * prev2Above2 = prev2 - stride2;
+ const pixel_t * prev2Below2 = prev2 + stride2;
+ const pixel_t * prev2Below4 = prev2 + stride4;
+ const pixel_t * prevAbove = prev - stride;
+ const pixel_t * prevBelow = prev + stride;
+ const pixel_t * curAbove3 = cur - stride3;
+ const pixel_t * curAbove = cur - stride;
+ const pixel_t * curBelow = cur + stride;
+ const pixel_t * curBelow3 = cur + stride3;
+ const pixel_t * nextAbove = next - stride;
+ const pixel_t * nextBelow = next + stride;
+ const pixel_t * next2Above4 = next2 - stride4;
+ const pixel_t * next2Above2 = next2 - stride2;
+ const pixel_t * next2Below2 = next2 + stride2;
+ const pixel_t * next2Below4 = next2 + stride4;
for (int x = 0; x < width; x++) {
- if constexpr (std::is_integral_v<T>) {
+ if constexpr (std::is_integral_v<pixel_t>) {
const int c = curAbove[x];
const int d = (prev2[x] + next2[x]) >> 1;
const int e = curBelow[x];
@@ -162,9 +178,9 @@ static inline void filterLine(const T * prev2, const T * prev, const T * cur, co
const int f = ((prev2Below2[x] + next2Below2[x]) >> 1) - e;
const int dc = d - c;
const int de = d - e;
- const int max = std::max({ de, dc, std::min(b, f) });
- const int min = std::min({ de, dc, std::max(b, f) });
- diff = std::max({ diff, min, -max });
+ const int maximum = std::max({ de, dc, std::min(b, f) });
+ const int minimum = std::min({ de, dc, std::max(b, f) });
+ diff = std::max({ diff, minimum, -maximum });
int interpol;
if (std::abs(c - e) > temporal_diff0)
@@ -194,9 +210,9 @@ static inline void filterLine(const T * prev2, const T * prev, const T * cur, co
const float f = ((prev2Below2[x] + next2Below2[x]) * 0.5f) - e;
const float dc = d - c;
const float de = d - e;
- const float max = std::max({ de, dc, std::min(b, f) });
- const float min = std::min({ de, dc, std::max(b, f) });
- diff = std::max({ diff, min, -max });
+ const float maximum = std::max({ de, dc, std::min(b, f) });
+ const float minimum = std::min({ de, dc, std::max(b, f) });
+ diff = std::max({ diff, minimum, -maximum });
float interpol;
if (std::abs(c - e) > temporal_diff0)
@@ -213,23 +229,23 @@ static inline void filterLine(const T * prev2, const T * prev, const T * cur, co
}
}
-template<typename T>
+template<typename pixel_t>
static void filter(const VSFrameRef * prevFrame, const VSFrameRef * curFrame, const VSFrameRef * nextFrame, VSFrameRef * dstFrame,
const int field, const BwdifData * const VS_RESTRICT d, const VSAPI * vsapi) noexcept {
for (int plane = 0; plane < d->vi.format->numPlanes; plane++) {
const int width = vsapi->getFrameWidth(curFrame, plane);
const int height = vsapi->getFrameHeight(curFrame, plane);
- const int stride = vsapi->getStride(curFrame, plane) / sizeof(T);
- const T * prev = reinterpret_cast<const T *>(vsapi->getReadPtr(prevFrame, plane));
- const T * cur = reinterpret_cast<const T *>(vsapi->getReadPtr(curFrame, plane));
- const T * next = reinterpret_cast<const T *>(vsapi->getReadPtr(nextFrame, plane));
- T * VS_RESTRICT dst = reinterpret_cast<T *>(vsapi->getWritePtr(dstFrame, plane));
+ const int stride = vsapi->getStride(curFrame, plane) / sizeof(pixel_t);
+ const pixel_t * prev = reinterpret_cast<const pixel_t *>(vsapi->getReadPtr(prevFrame, plane));
+ const pixel_t * cur = reinterpret_cast<const pixel_t *>(vsapi->getReadPtr(curFrame, plane));
+ const pixel_t * next = reinterpret_cast<const pixel_t *>(vsapi->getReadPtr(nextFrame, plane));
+ pixel_t * VS_RESTRICT dst = reinterpret_cast<pixel_t *>(vsapi->getWritePtr(dstFrame, plane));
vs_bitblt(dst + stride * (1 - field),
vsapi->getStride(dstFrame, plane) * 2,
cur + stride * (1 - field),
vsapi->getStride(curFrame, plane) * 2,
- width * sizeof(T),
+ width * sizeof(pixel_t),
height / 2);
prev += stride * field;
@@ -237,27 +253,27 @@ static void filter(const VSFrameRef * prevFrame, const VSFrameRef * curFrame, co
next += stride * field;
dst += stride * field;
- const T * prev2 = field ? prev : cur;
- const T * next2 = field ? cur : next;
+ const pixel_t * prev2 = field ? prev : cur;
+ const pixel_t * next2 = field ? cur : next;
for (int y = field; y < height; y += 2) {
if ((y < 4) || (y + 5 > height)) {
if ((y < 2) || (y + 3 > height))
- filterEdge<T, false>(prev2, prev, cur, next, next2, dst, width,
- y + 1 < height ? stride : -stride,
- y > 0 ? -stride : stride,
- stride * 2,
- d->peak);
+ d->filterEdgeWithoutSpat(prev2, prev, cur, next, next2, dst, width,
+ y + 1 < height ? stride : -stride,
+ y > 0 ? -stride : stride,
+ stride * 2,
+ d->edgeStep, d->peak);
else
- filterEdge<T, true>(prev2, prev, cur, next, next2, dst, width,
- y + 1 < height ? stride : -stride,
- y > 0 ? -stride : stride,
- stride * 2,
- d->peak);
+ d->filterEdgeWithSpat(prev2, prev, cur, next, next2, dst, width,
+ y + 1 < height ? stride : -stride,
+ y > 0 ? -stride : stride,
+ stride * 2,
+ d->edgeStep, d->peak);
} else {
- filterLine<T>(prev2, prev, cur, next, next2, dst, width,
- stride, stride * 2, stride * 3, stride * 4,
- d->peak);
+ d->filterLine(prev2, prev, cur, next, next2, dst, width,
+ stride, stride * 2, stride * 3, stride * 4,
+ d->lineStep, d->peak);
}
prev2 += stride * 2;
@@ -359,6 +375,7 @@ static void VS_CC bwdifCreate(const VSMap * in, VSMap * out, void * userData, VS
d->node = vsapi->propGetNode(in, "clip", 0, nullptr);
d->vi = *vsapi->getVideoInfo(d->node);
d->viSaved = vsapi->getVideoInfo(d->node);
+ int err;
if (!isConstantFormat(&d->vi) ||
(d->vi.format->sampleType == stInteger && d->vi.format->bitsPerSample > 16) ||
@@ -369,10 +386,89 @@ static void VS_CC bwdifCreate(const VSMap * in, VSMap * out, void * userData, VS
throw "height must be greater than or equal to 4"sv;
d->field = int64ToIntS(vsapi->propGetInt(in, "field", 0, nullptr));
+ const int opt = int64ToIntS(vsapi->propGetInt(in, "opt", 0, &err));
if (d->field < 0 || d->field > 3)
throw "field must be 0, 1, 2, or 3"sv;
+ if (opt < 0 || opt > 4)
+ throw "opt must be 0, 1, 2, 3, or 4"sv;
+
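+ // opt selects the implementation: 0 = auto-detect, 1 = C, 2 = SSE2, 3 = AVX2, 4 = AVX-512 (see the dispatch below).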
+ {
+ if (d->vi.format->bytesPerSample == 1) {
+ d->filterEdgeWithSpat = filterEdge_c<uint8_t, true>;
+ d->filterEdgeWithoutSpat = filterEdge_c<uint8_t, false>;
+ d->filterLine = filterLine_c<uint8_t>;
+ } else if (d->vi.format->bytesPerSample == 2) {
+ d->filterEdgeWithSpat = filterEdge_c<uint16_t, true>;
+ d->filterEdgeWithoutSpat = filterEdge_c<uint16_t, false>;
+ d->filterLine = filterLine_c<uint16_t>;
+ } else {
+ d->filterEdgeWithSpat = filterEdge_c<float, true>;
+ d->filterEdgeWithoutSpat = filterEdge_c<float, false>;
+ d->filterLine = filterLine_c<float>;
+ }
+
+#ifdef BWDIF_X86
+ const int iset = instrset_detect();
+ if ((opt == 0 && iset >= 10) || opt == 4) {
+ if (d->vi.format->bytesPerSample == 1) {
+ d->filterEdgeWithSpat = filterEdge_avx512<uint8_t, true>;
+ d->filterEdgeWithoutSpat = filterEdge_avx512<uint8_t, false>;
+ d->filterLine = filterLine_avx512<uint8_t>;
+ d->edgeStep = 32;
+ } else if (d->vi.format->bytesPerSample == 2) {
+ d->filterEdgeWithSpat = filterEdge_avx512<uint16_t, true>;
+ d->filterEdgeWithoutSpat = filterEdge_avx512<uint16_t, false>;
+ d->filterLine = filterLine_avx512<uint16_t>;
+ d->edgeStep = 16;
+ } else {
+ d->filterEdgeWithSpat = filterEdge_avx512<float, true>;
+ d->filterEdgeWithoutSpat = filterEdge_avx512<float, false>;
+ d->filterLine = filterLine_avx512<float>;
+ d->edgeStep = 16;
+ }
+ d->lineStep = 16;
+ } else if ((opt == 0 && iset >= 8) || opt == 3) {
+ if (d->vi.format->bytesPerSample == 1) {
+ d->filterEdgeWithSpat = filterEdge_avx2<uint8_t, true>;
+ d->filterEdgeWithoutSpat = filterEdge_avx2<uint8_t, false>;
+ d->filterLine = filterLine_avx2<uint8_t>;
+ d->edgeStep = 16;
+ } else if (d->vi.format->bytesPerSample == 2) {
+ d->filterEdgeWithSpat = filterEdge_avx2<uint16_t, true>;
+ d->filterEdgeWithoutSpat = filterEdge_avx2<uint16_t, false>;
+ d->filterLine = filterLine_avx2<uint16_t>;
+ d->edgeStep = 8;
+ } else {
+ d->filterEdgeWithSpat = filterEdge_avx2<float, true>;
+ d->filterEdgeWithoutSpat = filterEdge_avx2<float, false>;
+ d->filterLine = filterLine_avx2<float>;
+ d->edgeStep = 8;
+ }
+ d->lineStep = 8;
+ } else if ((opt == 0 && iset >= 2) || opt == 2) {
+ if (d->vi.format->bytesPerSample == 1) {
+ d->filterEdgeWithSpat = filterEdge_sse2<uint8_t, true>;
+ d->filterEdgeWithoutSpat = filterEdge_sse2<uint8_t, false>;
+ d->filterLine = filterLine_sse2<uint8_t>;
+ d->edgeStep = 8;
+ } else if (d->vi.format->bytesPerSample == 2) {
+ d->filterEdgeWithSpat = filterEdge_sse2<uint16_t, true>;
+ d->filterEdgeWithoutSpat = filterEdge_sse2<uint16_t, false>;
+ d->filterLine = filterLine_sse2<uint16_t>;
+ d->edgeStep = 4;
+ } else {
+ d->filterEdgeWithSpat = filterEdge_sse2<float, true>;
+ d->filterEdgeWithoutSpat = filterEdge_sse2<float, false>;
+ d->filterLine = filterLine_sse2<float>;
+ d->edgeStep = 4;
+ }
+ d->lineStep = 4;
+ }
+#endif
+ }
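+ // Note: edgeStep/lineStep are the number of pixels each SIMD routine processes per loop
+ // iteration; the C fallbacks ignore the step argument and advance one pixel at a time.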
+
if (d->field > 1) {
if (d->vi.numFrames > INT_MAX / 2)
throw "resulting clip is too long"sv;
@@ -398,6 +494,7 @@ VS_EXTERNAL_API(void) VapourSynthPluginInit(VSConfigPlugin configFunc, VSRegiste
configFunc("com.holywu.bwdif", "bwdif", "BobWeaver Deinterlacing Filter", VAPOURSYNTH_API_VERSION, 1, plugin);
registerFunc("Bwdif",
"clip:clip;"
- "field:int;",
+ "field:int;"
+ "opt:int:opt;",
bwdifCreate, nullptr, plugin);
}
diff --git a/Bwdif/Bwdif.h b/Bwdif/Bwdif.h
new file mode 100644
index 0000000..9d2e8b1
--- /dev/null
+++ b/Bwdif/Bwdif.h
@@ -0,0 +1,20 @@
+#pragma once
+
+#include
+
+#ifdef BWDIF_X86
+#include "VCL2/vectorclass.h"
+#endif
+
+/*
+ * Filter coefficients coef_lf and coef_hf taken from BBC PH-2071 (Weston 3 Field Deinterlacer).
+ * Used when there is spatial and temporal interpolation.
+ * Filter coefficients coef_sp are used when there is spatial interpolation only.
+ * Adjusted for matching visual sharpness impression of spatial and temporal interpolation.
+ */
+static constexpr uint16_t coef_lf[2] = { 4309, 213 };
+static constexpr uint16_t coef_hf[3] = { 5570, 3801, 1016 };
+static constexpr uint16_t coef_sp[2] = { 5077, 981 };
+static constexpr float coef_lf_f[2] = { 4309 / 8192.0f, 213 / 8192.0f };
+static constexpr float coef_hf_f[3] = { 5570 / 8192.0f, 3801 / 8192.0f, 1016 / 8192.0f };
+static constexpr float coef_sp_f[2] = { 5077 / 8192.0f, 981 / 8192.0f };
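+
+/*
+ * Sketch of how the fixed-point coefficients are applied (the scale is 8192 = 2^13, hence
+ * the ">> 13" in the integer code paths): for the spatial-only case, with c/e the lines
+ * directly above/below the one being interpolated and c3/e3 the lines three above/below
+ * (curAbove3/curBelow3 in the sources),
+ *   interpol = (coef_sp[0] * (c + e) - coef_sp[1] * (c3 + e3)) >> 13;
+ */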
diff --git a/Bwdif/Bwdif.vcxproj b/Bwdif/Bwdif.vcxproj
index 3b6a56b..17a75fd 100644
--- a/Bwdif/Bwdif.vcxproj
+++ b/Bwdif/Bwdif.vcxproj
@@ -35,8 +35,9 @@
- <PreprocessorDefinitions>NDEBUG;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+ <PreprocessorDefinitions>BWDIF_X86;NDEBUG;%(PreprocessorDefinitions)</PreprocessorDefinitions>
Level3
+ true
false
true
stdcpp17
@@ -49,6 +50,17 @@
+
+ AdvancedVectorExtensions2
+
+
+ AdvancedVectorExtensions512
+
+
+
+
+
+
diff --git a/Bwdif/Bwdif.vcxproj.filters b/Bwdif/Bwdif.vcxproj.filters
index 964a8c1..5aeb46a 100644
--- a/Bwdif/Bwdif.vcxproj.filters
+++ b/Bwdif/Bwdif.vcxproj.filters
@@ -18,5 +18,22 @@
Source Files
+
+ Source Files
+
+
+ Source Files
+
+
+ Source Files
+
+
+ Source Files
+
+
+
+
+ Header Files
+
\ No newline at end of file
diff --git a/Bwdif/Bwdif_AVX2.cpp b/Bwdif/Bwdif_AVX2.cpp
new file mode 100644
index 0000000..e500d7f
--- /dev/null
+++ b/Bwdif/Bwdif_AVX2.cpp
@@ -0,0 +1,224 @@
+#ifdef BWDIF_X86
+#include "Bwdif.h"
+
+template<typename pixel_t, bool spat>
+void filterEdge_avx2(const void * _prev2, const void * _prev, const void * _cur, const void * _next, const void * _next2, void * _dst, const int width,
+ const int positiveStride, const int negativeStride, const int stride2, const int step, const int peak) noexcept {
+ const pixel_t * prev2 = reinterpret_cast<const pixel_t *>(_prev2);
+ const pixel_t * prev = reinterpret_cast<const pixel_t *>(_prev);
+ const pixel_t * cur = reinterpret_cast<const pixel_t *>(_cur);
+ const pixel_t * next = reinterpret_cast<const pixel_t *>(_next);
+ const pixel_t * next2 = reinterpret_cast<const pixel_t *>(_next2);
+ pixel_t * dst = reinterpret_cast<pixel_t *>(_dst);
+
+ const pixel_t * prev2Above2 = prev2 - stride2;
+ const pixel_t * prev2Below2 = prev2 + stride2;
+ const pixel_t * prevAbove = prev + negativeStride;
+ const pixel_t * prevBelow = prev + positiveStride;
+ const pixel_t * curAbove = cur + negativeStride;
+ const pixel_t * curBelow = cur + positiveStride;
+ const pixel_t * nextAbove = next + negativeStride;
+ const pixel_t * nextBelow = next + positiveStride;
+ const pixel_t * next2Above2 = next2 - stride2;
+ const pixel_t * next2Below2 = next2 + stride2;
+
+ for (int x = 0; x < width; x += step) {
+ if constexpr (std::is_same_v<pixel_t, uint8_t>) {
+ const Vec16s c = Vec16s().load_16uc(curAbove + x);
+ const Vec16s d = (Vec16s().load_16uc(prev2 + x) + Vec16s().load_16uc(next2 + x)) >> 1;
+ const Vec16s e = Vec16s().load_16uc(curBelow + x);
+ const Vec16s temporal_diff0 = abs(Vec16s().load_16uc(prev2 + x) - Vec16s().load_16uc(next2 + x));
+ const Vec16s temporal_diff1 = (abs(Vec16s().load_16uc(prevAbove + x) - c) + abs(Vec16s().load_16uc(prevBelow + x) - e)) >> 1;
+ const Vec16s temporal_diff2 = (abs(Vec16s().load_16uc(nextAbove + x) - c) + abs(Vec16s().load_16uc(nextBelow + x) - e)) >> 1;
+ const Vec16s temporal_diff = max(max(temporal_diff0 >> 1, temporal_diff1), temporal_diff2);
+
+ Vec16s diff;
+ if constexpr (spat) {
+ const Vec16s b = ((Vec16s().load_16uc(prev2Above2 + x) + Vec16s().load_16uc(next2Above2 + x)) >> 1) - c;
+ const Vec16s f = ((Vec16s().load_16uc(prev2Below2 + x) + Vec16s().load_16uc(next2Below2 + x)) >> 1) - e;
+ const Vec16s dc = d - c;
+ const Vec16s de = d - e;
+ const Vec16s maximum = max(max(de, dc), min(b, f));
+ const Vec16s minimum = min(min(de, dc), max(b, f));
+ diff = max(max(temporal_diff, minimum), -maximum);
+ }
+
+ Vec16s interpol = min(max((c + e) >> 1, d - diff), d + diff);
+ interpol = select(temporal_diff == 0, d, interpol);
+ const auto result = compress_saturated_s2u(interpol, zero_si256()).get_low();
+ result.store_nt(dst + x);
+ } else if constexpr (std::is_same_v<pixel_t, uint16_t>) {
+ const Vec8i c = Vec8i().load_8us(curAbove + x);
+ const Vec8i d = (Vec8i().load_8us(prev2 + x) + Vec8i().load_8us(next2 + x)) >> 1;
+ const Vec8i e = Vec8i().load_8us(curBelow + x);
+ const Vec8i temporal_diff0 = abs(Vec8i().load_8us(prev2 + x) - Vec8i().load_8us(next2 + x));
+ const Vec8i temporal_diff1 = (abs(Vec8i().load_8us(prevAbove + x) - c) + abs(Vec8i().load_8us(prevBelow + x) - e)) >> 1;
+ const Vec8i temporal_diff2 = (abs(Vec8i().load_8us(nextAbove + x) - c) + abs(Vec8i().load_8us(nextBelow + x) - e)) >> 1;
+ const Vec8i temporal_diff = max(max(temporal_diff0 >> 1, temporal_diff1), temporal_diff2);
+
+ Vec8i diff;
+ if constexpr (spat) {
+ const Vec8i b = ((Vec8i().load_8us(prev2Above2 + x) + Vec8i().load_8us(next2Above2 + x)) >> 1) - c;
+ const Vec8i f = ((Vec8i().load_8us(prev2Below2 + x) + Vec8i().load_8us(next2Below2 + x)) >> 1) - e;
+ const Vec8i dc = d - c;
+ const Vec8i de = d - e;
+ const Vec8i maximum = max(max(de, dc), min(b, f));
+ const Vec8i minimum = min(min(de, dc), max(b, f));
+ diff = max(max(temporal_diff, minimum), -maximum);
+ }
+
+ Vec8i interpol = min(max((c + e) >> 1, d - diff), d + diff);
+ interpol = select(temporal_diff == 0, d, interpol);
+ const auto result = compress_saturated_s2u(interpol, zero_si256()).get_low();
+ min(result, peak).store_nt(dst + x);
+ } else {
+ const Vec8f c = Vec8f().load_a(curAbove + x);
+ const Vec8f d = (Vec8f().load_a(prev2 + x) + Vec8f().load_a(next2 + x)) * 0.5f;
+ const Vec8f e = Vec8f().load_a(curBelow + x);
+ const Vec8f temporal_diff0 = abs(Vec8f().load_a(prev2 + x) - Vec8f().load_a(next2 + x));
+ const Vec8f temporal_diff1 = (abs(Vec8f().load_a(prevAbove + x) - c) + abs(Vec8f().load_a(prevBelow + x) - e)) * 0.5f;
+ const Vec8f temporal_diff2 = (abs(Vec8f().load_a(nextAbove + x) - c) + abs(Vec8f().load_a(nextBelow + x) - e)) * 0.5f;
+ const Vec8f temporal_diff = max(max(temporal_diff0 * 0.5f, temporal_diff1), temporal_diff2);
+
+ Vec8f diff;
+ if constexpr (spat) {
+ const Vec8f b = ((Vec8f().load_a(prev2Above2 + x) + Vec8f().load_a(next2Above2 + x)) * 0.5f) - c;
+ const Vec8f f = ((Vec8f().load_a(prev2Below2 + x) + Vec8f().load_a(next2Below2 + x)) * 0.5f) - e;
+ const Vec8f dc = d - c;
+ const Vec8f de = d - e;
+ const Vec8f maximum = max(max(de, dc), min(b, f));
+ const Vec8f minimum = min(min(de, dc), max(b, f));
+ diff = max(max(temporal_diff, minimum), -maximum);
+ }
+
+ const Vec8f interpol = min(max((c + e) * 0.5f, d - diff), d + diff);
+ select(temporal_diff == 0, d, interpol).store_nt(dst + x);
+ }
+ }
+}
+
+template<typename pixel_t>
+void filterLine_avx2(const void * _prev2, const void * _prev, const void * _cur, const void * _next, const void * _next2, void * _dst, const int width,
+ const int stride, const int stride2, const int stride3, const int stride4, const int step, const int peak) noexcept {
+ const pixel_t * prev2 = reinterpret_cast<const pixel_t *>(_prev2);
+ const pixel_t * prev = reinterpret_cast<const pixel_t *>(_prev);
+ const pixel_t * cur = reinterpret_cast<const pixel_t *>(_cur);
+ const pixel_t * next = reinterpret_cast<const pixel_t *>(_next);
+ const pixel_t * next2 = reinterpret_cast<const pixel_t *>(_next2);
+ pixel_t * dst = reinterpret_cast<pixel_t *>(_dst);
+
+ const pixel_t * prev2Above4 = prev2 - stride4;
+ const pixel_t * prev2Above2 = prev2 - stride2;
+ const pixel_t * prev2Below2 = prev2 + stride2;
+ const pixel_t * prev2Below4 = prev2 + stride4;
+ const pixel_t * prevAbove = prev - stride;
+ const pixel_t * prevBelow = prev + stride;
+ const pixel_t * curAbove3 = cur - stride3;
+ const pixel_t * curAbove = cur - stride;
+ const pixel_t * curBelow = cur + stride;
+ const pixel_t * curBelow3 = cur + stride3;
+ const pixel_t * nextAbove = next - stride;
+ const pixel_t * nextBelow = next + stride;
+ const pixel_t * next2Above4 = next2 - stride4;
+ const pixel_t * next2Above2 = next2 - stride2;
+ const pixel_t * next2Below2 = next2 + stride2;
+ const pixel_t * next2Below4 = next2 + stride4;
+
+ for (int x = 0; x < width; x += step) {
+ if constexpr (std::is_same_v<pixel_t, uint8_t>) {
+ const Vec8i c = Vec8i().load_8uc(curAbove + x);
+ const Vec8i d = (Vec8i().load_8uc(prev2 + x) + Vec8i().load_8uc(next2 + x)) >> 1;
+ const Vec8i e = Vec8i().load_8uc(curBelow + x);
+ const Vec8i temporal_diff0 = abs(Vec8i().load_8uc(prev2 + x) - Vec8i().load_8uc(next2 + x));
+ const Vec8i temporal_diff1 = (abs(Vec8i().load_8uc(prevAbove + x) - c) + abs(Vec8i().load_8uc(prevBelow + x) - e)) >> 1;
+ const Vec8i temporal_diff2 = (abs(Vec8i().load_8uc(nextAbove + x) - c) + abs(Vec8i().load_8uc(nextBelow + x) - e)) >> 1;
+ const Vec8i temporal_diff = max(max(temporal_diff0 >> 1, temporal_diff1), temporal_diff2);
+
+ const Vec8i b = ((Vec8i().load_8uc(prev2Above2 + x) + Vec8i().load_8uc(next2Above2 + x)) >> 1) - c;
+ const Vec8i f = ((Vec8i().load_8uc(prev2Below2 + x) + Vec8i().load_8uc(next2Below2 + x)) >> 1) - e;
+ const Vec8i dc = d - c;
+ const Vec8i de = d - e;
+ const Vec8i maximum = max(max(de, dc), min(b, f));
+ const Vec8i minimum = min(min(de, dc), max(b, f));
+ const Vec8i diff = max(max(temporal_diff, minimum), -maximum);
+
+ const Vec8i interpol1 = (((coef_hf[0] * (Vec8i().load_8uc(prev2 + x) + Vec8i().load_8uc(next2 + x))
+ - coef_hf[1] * (Vec8i().load_8uc(prev2Above2 + x) + Vec8i().load_8uc(next2Above2 + x) + Vec8i().load_8uc(prev2Below2 + x) + Vec8i().load_8uc(next2Below2 + x))
+ + coef_hf[2] * (Vec8i().load_8uc(prev2Above4 + x) + Vec8i().load_8uc(next2Above4 + x) + Vec8i().load_8uc(prev2Below4 + x) + Vec8i().load_8uc(next2Below4 + x))) >> 2)
+ + coef_lf[0] * (c + e) - coef_lf[1] * (Vec8i().load_8uc(curAbove3 + x) + Vec8i().load_8uc(curBelow3 + x))) >> 13;
+ const Vec8i interpol2 = (coef_sp[0] * (c + e) - coef_sp[1] * (Vec8i().load_8uc(curAbove3 + x) + Vec8i().load_8uc(curBelow3 + x))) >> 13;
+
+ Vec8i interpol = select(abs(c - e) > temporal_diff0, interpol1, interpol2);
+ interpol = min(max(interpol, d - diff), d + diff);
+ interpol = select(temporal_diff == 0, d, interpol);
+ const auto result = compress_saturated_s2u(compress_saturated(interpol, zero_si256()), zero_si256()).get_low();
+ result.storel(dst + x);
+ } else if constexpr (std::is_same_v<pixel_t, uint16_t>) {
+ const Vec8i c = Vec8i().load_8us(curAbove + x);
+ const Vec8i d = (Vec8i().load_8us(prev2 + x) + Vec8i().load_8us(next2 + x)) >> 1;
+ const Vec8i e = Vec8i().load_8us(curBelow + x);
+ const Vec8i temporal_diff0 = abs(Vec8i().load_8us(prev2 + x) - Vec8i().load_8us(next2 + x));
+ const Vec8i temporal_diff1 = (abs(Vec8i().load_8us(prevAbove + x) - c) + abs(Vec8i().load_8us(prevBelow + x) - e)) >> 1;
+ const Vec8i temporal_diff2 = (abs(Vec8i().load_8us(nextAbove + x) - c) + abs(Vec8i().load_8us(nextBelow + x) - e)) >> 1;
+ const Vec8i temporal_diff = max(max(temporal_diff0 >> 1, temporal_diff1), temporal_diff2);
+
+ const Vec8i b = ((Vec8i().load_8us(prev2Above2 + x) + Vec8i().load_8us(next2Above2 + x)) >> 1) - c;
+ const Vec8i f = ((Vec8i().load_8us(prev2Below2 + x) + Vec8i().load_8us(next2Below2 + x)) >> 1) - e;
+ const Vec8i dc = d - c;
+ const Vec8i de = d - e;
+ const Vec8i maximum = max(max(de, dc), min(b, f));
+ const Vec8i minimum = min(min(de, dc), max(b, f));
+ const Vec8i diff = max(max(temporal_diff, minimum), -maximum);
+
+ const Vec8i interpol1 = (((coef_hf[0] * (Vec8i().load_8us(prev2 + x) + Vec8i().load_8us(next2 + x))
+ - coef_hf[1] * (Vec8i().load_8us(prev2Above2 + x) + Vec8i().load_8us(next2Above2 + x) + Vec8i().load_8us(prev2Below2 + x) + Vec8i().load_8us(next2Below2 + x))
+ + coef_hf[2] * (Vec8i().load_8us(prev2Above4 + x) + Vec8i().load_8us(next2Above4 + x) + Vec8i().load_8us(prev2Below4 + x) + Vec8i().load_8us(next2Below4 + x))) >> 2)
+ + coef_lf[0] * (c + e) - coef_lf[1] * (Vec8i().load_8us(curAbove3 + x) + Vec8i().load_8us(curBelow3 + x))) >> 13;
+ const Vec8i interpol2 = (coef_sp[0] * (c + e) - coef_sp[1] * (Vec8i().load_8us(curAbove3 + x) + Vec8i().load_8us(curBelow3 + x))) >> 13;
+
+ Vec8i interpol = select(abs(c - e) > temporal_diff0, interpol1, interpol2);
+ interpol = min(max(interpol, d - diff), d + diff);
+ interpol = select(temporal_diff == 0, d, interpol);
+ const auto result = compress_saturated_s2u(interpol, zero_si256()).get_low();
+ min(result, peak).store_nt(dst + x);
+ } else {
+ const Vec8f c = Vec8f().load_a(curAbove + x);
+ const Vec8f d = (Vec8f().load_a(prev2 + x) + Vec8f().load_a(next2 + x)) * 0.5f;
+ const Vec8f e = Vec8f().load_a(curBelow + x);
+ const Vec8f temporal_diff0 = abs(Vec8f().load_a(prev2 + x) - Vec8f().load_a(next2 + x));
+ const Vec8f temporal_diff1 = (abs(Vec8f().load_a(prevAbove + x) - c) + abs(Vec8f().load_a(prevBelow + x) - e)) * 0.5f;
+ const Vec8f temporal_diff2 = (abs(Vec8f().load_a(nextAbove + x) - c) + abs(Vec8f().load_a(nextBelow + x) - e)) * 0.5f;
+ const Vec8f temporal_diff = max(max(temporal_diff0 * 0.5f, temporal_diff1), temporal_diff2);
+
+ const Vec8f b = ((Vec8f().load_a(prev2Above2 + x) + Vec8f().load_a(next2Above2 + x)) * 0.5f) - c;
+ const Vec8f f = ((Vec8f().load_a(prev2Below2 + x) + Vec8f().load_a(next2Below2 + x)) * 0.5f) - e;
+ const Vec8f dc = d - c;
+ const Vec8f de = d - e;
+ const Vec8f maximum = max(max(de, dc), min(b, f));
+ const Vec8f minimum = min(min(de, dc), max(b, f));
+ const Vec8f diff = max(max(temporal_diff, minimum), -maximum);
+
+ const Vec8f interpol1 = ((coef_hf_f[0] * (Vec8f().load_a(prev2 + x) + Vec8f().load_a(next2 + x))
+ - coef_hf_f[1] * (Vec8f().load_a(prev2Above2 + x) + Vec8f().load_a(next2Above2 + x) + Vec8f().load_a(prev2Below2 + x) + Vec8f().load_a(next2Below2 + x))
+ + coef_hf_f[2] * (Vec8f().load_a(prev2Above4 + x) + Vec8f().load_a(next2Above4 + x) + Vec8f().load_a(prev2Below4 + x) + Vec8f().load_a(next2Below4 + x))) * 0.25f
+ + coef_lf_f[0] * (c + e) - coef_lf_f[1] * (Vec8f().load_a(curAbove3 + x) + Vec8f().load_a(curBelow3 + x)));
+ const Vec8f interpol2 = coef_sp_f[0] * (c + e) - coef_sp_f[1] * (Vec8f().load_a(curAbove3 + x) + Vec8f().load_a(curBelow3 + x));
+
+ Vec8f interpol = select(abs(c - e) > temporal_diff0, interpol1, interpol2);
+ interpol = min(max(interpol, d - diff), d + diff);
+ select(temporal_diff == 0, d, interpol).store_nt(dst + x);
+ }
+ }
+}
+
+template void filterEdge_avx2<uint8_t, false>(const void * _prev2, const void * _prev, const void * _cur, const void * _next, const void * _next2, void * _dst, const int width, const int positiveStride, const int negativeStride, const int stride2, const int step, const int peak) noexcept;
+template void filterEdge_avx2<uint8_t, true>(const void * _prev2, const void * _prev, const void * _cur, const void * _next, const void * _next2, void * _dst, const int width, const int positiveStride, const int negativeStride, const int stride2, const int step, const int peak) noexcept;
+template void filterEdge_avx2<uint16_t, false>(const void * _prev2, const void * _prev, const void * _cur, const void * _next, const void * _next2, void * _dst, const int width, const int positiveStride, const int negativeStride, const int stride2, const int step, const int peak) noexcept;
+template void filterEdge_avx2<uint16_t, true>(const void * _prev2, const void * _prev, const void * _cur, const void * _next, const void * _next2, void * _dst, const int width, const int positiveStride, const int negativeStride, const int stride2, const int step, const int peak) noexcept;
+template void filterEdge_avx2<float, false>(const void * _prev2, const void * _prev, const void * _cur, const void * _next, const void * _next2, void * _dst, const int width, const int positiveStride, const int negativeStride, const int stride2, const int step, const int peak) noexcept;
+template void filterEdge_avx2<float, true>(const void * _prev2, const void * _prev, const void * _cur, const void * _next, const void * _next2, void * _dst, const int width, const int positiveStride, const int negativeStride, const int stride2, const int step, const int peak) noexcept;
+
+template void filterLine_avx2<uint8_t>(const void * _prev2, const void * _prev, const void * _cur, const void * _next, const void * _next2, void * _dst, const int width, const int stride, const int stride2, const int stride3, const int stride4, const int step, const int peak) noexcept;
+template void filterLine_avx2<uint16_t>(const void * _prev2, const void * _prev, const void * _cur, const void * _next, const void * _next2, void * _dst, const int width, const int stride, const int stride2, const int stride3, const int stride4, const int step, const int peak) noexcept;
+template void filterLine_avx2<float>(const void * _prev2, const void * _prev, const void * _cur, const void * _next, const void * _next2, void * _dst, const int width, const int stride, const int stride2, const int stride3, const int stride4, const int step, const int peak) noexcept;
+#endif
diff --git a/Bwdif/Bwdif_AVX512.cpp b/Bwdif/Bwdif_AVX512.cpp
new file mode 100644
index 0000000..0390fc5
--- /dev/null
+++ b/Bwdif/Bwdif_AVX512.cpp
@@ -0,0 +1,224 @@
+#ifdef BWDIF_X86
+#include "Bwdif.h"
+
+template<typename pixel_t, bool spat>
+void filterEdge_avx512(const void * _prev2, const void * _prev, const void * _cur, const void * _next, const void * _next2, void * _dst, const int width,
+ const int positiveStride, const int negativeStride, const int stride2, const int step, const int peak) noexcept {
+ const pixel_t * prev2 = reinterpret_cast<const pixel_t *>(_prev2);
+ const pixel_t * prev = reinterpret_cast<const pixel_t *>(_prev);
+ const pixel_t * cur = reinterpret_cast<const pixel_t *>(_cur);
+ const pixel_t * next = reinterpret_cast<const pixel_t *>(_next);
+ const pixel_t * next2 = reinterpret_cast<const pixel_t *>(_next2);
+ pixel_t * dst = reinterpret_cast<pixel_t *>(_dst);
+
+ const pixel_t * prev2Above2 = prev2 - stride2;
+ const pixel_t * prev2Below2 = prev2 + stride2;
+ const pixel_t * prevAbove = prev + negativeStride;
+ const pixel_t * prevBelow = prev + positiveStride;
+ const pixel_t * curAbove = cur + negativeStride;
+ const pixel_t * curBelow = cur + positiveStride;
+ const pixel_t * nextAbove = next + negativeStride;
+ const pixel_t * nextBelow = next + positiveStride;
+ const pixel_t * next2Above2 = next2 - stride2;
+ const pixel_t * next2Below2 = next2 + stride2;
+
+ for (int x = 0; x < width; x += step) {
+ if constexpr (std::is_same_v<pixel_t, uint8_t>) {
+ const Vec32s c = Vec32s().load_32uc(curAbove + x);
+ const Vec32s d = (Vec32s().load_32uc(prev2 + x) + Vec32s().load_32uc(next2 + x)) >> 1;
+ const Vec32s e = Vec32s().load_32uc(curBelow + x);
+ const Vec32s temporal_diff0 = abs(Vec32s().load_32uc(prev2 + x) - Vec32s().load_32uc(next2 + x));
+ const Vec32s temporal_diff1 = (abs(Vec32s().load_32uc(prevAbove + x) - c) + abs(Vec32s().load_32uc(prevBelow + x) - e)) >> 1;
+ const Vec32s temporal_diff2 = (abs(Vec32s().load_32uc(nextAbove + x) - c) + abs(Vec32s().load_32uc(nextBelow + x) - e)) >> 1;
+ const Vec32s temporal_diff = max(max(temporal_diff0 >> 1, temporal_diff1), temporal_diff2);
+
+ Vec32s diff;
+ if constexpr (spat) {
+ const Vec32s b = ((Vec32s().load_32uc(prev2Above2 + x) + Vec32s().load_32uc(next2Above2 + x)) >> 1) - c;
+ const Vec32s f = ((Vec32s().load_32uc(prev2Below2 + x) + Vec32s().load_32uc(next2Below2 + x)) >> 1) - e;
+ const Vec32s dc = d - c;
+ const Vec32s de = d - e;
+ const Vec32s maximum = max(max(de, dc), min(b, f));
+ const Vec32s minimum = min(min(de, dc), max(b, f));
+ diff = max(max(temporal_diff, minimum), -maximum);
+ }
+
+ Vec32s interpol = min(max((c + e) >> 1, d - diff), d + diff);
+ interpol = select(temporal_diff == 0, d, interpol);
+ const auto result = compress_saturated_s2u(interpol, zero_si512()).get_low();
+ result.store_nt(dst + x);
+ } else if constexpr (std::is_same_v<pixel_t, uint16_t>) {
+ const Vec16i c = Vec16i().load_16us(curAbove + x);
+ const Vec16i d = (Vec16i().load_16us(prev2 + x) + Vec16i().load_16us(next2 + x)) >> 1;
+ const Vec16i e = Vec16i().load_16us(curBelow + x);
+ const Vec16i temporal_diff0 = abs(Vec16i().load_16us(prev2 + x) - Vec16i().load_16us(next2 + x));
+ const Vec16i temporal_diff1 = (abs(Vec16i().load_16us(prevAbove + x) - c) + abs(Vec16i().load_16us(prevBelow + x) - e)) >> 1;
+ const Vec16i temporal_diff2 = (abs(Vec16i().load_16us(nextAbove + x) - c) + abs(Vec16i().load_16us(nextBelow + x) - e)) >> 1;
+ const Vec16i temporal_diff = max(max(temporal_diff0 >> 1, temporal_diff1), temporal_diff2);
+
+ Vec16i diff;
+ if constexpr (spat) {
+ const Vec16i b = ((Vec16i().load_16us(prev2Above2 + x) + Vec16i().load_16us(next2Above2 + x)) >> 1) - c;
+ const Vec16i f = ((Vec16i().load_16us(prev2Below2 + x) + Vec16i().load_16us(next2Below2 + x)) >> 1) - e;
+ const Vec16i dc = d - c;
+ const Vec16i de = d - e;
+ const Vec16i maximum = max(max(de, dc), min(b, f));
+ const Vec16i minimum = min(min(de, dc), max(b, f));
+ diff = max(max(temporal_diff, minimum), -maximum);
+ }
+
+ Vec16i interpol = min(max((c + e) >> 1, d - diff), d + diff);
+ interpol = select(temporal_diff == 0, d, interpol);
+ const auto result = compress_saturated_s2u(interpol, zero_si512()).get_low();
+ min(result, peak).store_nt(dst + x);
+ } else {
+ const Vec16f c = Vec16f().load_a(curAbove + x);
+ const Vec16f d = (Vec16f().load_a(prev2 + x) + Vec16f().load_a(next2 + x)) * 0.5f;
+ const Vec16f e = Vec16f().load_a(curBelow + x);
+ const Vec16f temporal_diff0 = abs(Vec16f().load_a(prev2 + x) - Vec16f().load_a(next2 + x));
+ const Vec16f temporal_diff1 = (abs(Vec16f().load_a(prevAbove + x) - c) + abs(Vec16f().load_a(prevBelow + x) - e)) * 0.5f;
+ const Vec16f temporal_diff2 = (abs(Vec16f().load_a(nextAbove + x) - c) + abs(Vec16f().load_a(nextBelow + x) - e)) * 0.5f;
+ const Vec16f temporal_diff = max(max(temporal_diff0 * 0.5f, temporal_diff1), temporal_diff2);
+
+ Vec16f diff;
+ if constexpr (spat) {
+ const Vec16f b = ((Vec16f().load_a(prev2Above2 + x) + Vec16f().load_a(next2Above2 + x)) * 0.5f) - c;
+ const Vec16f f = ((Vec16f().load_a(prev2Below2 + x) + Vec16f().load_a(next2Below2 + x)) * 0.5f) - e;
+ const Vec16f dc = d - c;
+ const Vec16f de = d - e;
+ const Vec16f maximum = max(max(de, dc), min(b, f));
+ const Vec16f minimum = min(min(de, dc), max(b, f));
+ diff = max(max(temporal_diff, minimum), -maximum);
+ }
+
+ const Vec16f interpol = min(max((c + e) * 0.5f, d - diff), d + diff);
+ select(temporal_diff == 0, d, interpol).store_nt(dst + x);
+ }
+ }
+}
+
+template<typename pixel_t>
+void filterLine_avx512(const void * _prev2, const void * _prev, const void * _cur, const void * _next, const void * _next2, void * _dst, const int width,
+ const int stride, const int stride2, const int stride3, const int stride4, const int step, const int peak) noexcept {
+ const pixel_t * prev2 = reinterpret_cast<const pixel_t *>(_prev2);
+ const pixel_t * prev = reinterpret_cast<const pixel_t *>(_prev);
+ const pixel_t * cur = reinterpret_cast<const pixel_t *>(_cur);
+ const pixel_t * next = reinterpret_cast<const pixel_t *>(_next);
+ const pixel_t * next2 = reinterpret_cast<const pixel_t *>(_next2);
+ pixel_t * dst = reinterpret_cast<pixel_t *>(_dst);
+
+ const pixel_t * prev2Above4 = prev2 - stride4;
+ const pixel_t * prev2Above2 = prev2 - stride2;
+ const pixel_t * prev2Below2 = prev2 + stride2;
+ const pixel_t * prev2Below4 = prev2 + stride4;
+ const pixel_t * prevAbove = prev - stride;
+ const pixel_t * prevBelow = prev + stride;
+ const pixel_t * curAbove3 = cur - stride3;
+ const pixel_t * curAbove = cur - stride;
+ const pixel_t * curBelow = cur + stride;
+ const pixel_t * curBelow3 = cur + stride3;
+ const pixel_t * nextAbove = next - stride;
+ const pixel_t * nextBelow = next + stride;
+ const pixel_t * next2Above4 = next2 - stride4;
+ const pixel_t * next2Above2 = next2 - stride2;
+ const pixel_t * next2Below2 = next2 + stride2;
+ const pixel_t * next2Below4 = next2 + stride4;
+
+ for (int x = 0; x < width; x += step) {
+ if constexpr (std::is_same_v<pixel_t, uint8_t>) {
+ const Vec16i c = Vec16i().load_16uc(curAbove + x);
+ const Vec16i d = (Vec16i().load_16uc(prev2 + x) + Vec16i().load_16uc(next2 + x)) >> 1;
+ const Vec16i e = Vec16i().load_16uc(curBelow + x);
+ const Vec16i temporal_diff0 = abs(Vec16i().load_16uc(prev2 + x) - Vec16i().load_16uc(next2 + x));
+ const Vec16i temporal_diff1 = (abs(Vec16i().load_16uc(prevAbove + x) - c) + abs(Vec16i().load_16uc(prevBelow + x) - e)) >> 1;
+ const Vec16i temporal_diff2 = (abs(Vec16i().load_16uc(nextAbove + x) - c) + abs(Vec16i().load_16uc(nextBelow + x) - e)) >> 1;
+ const Vec16i temporal_diff = max(max(temporal_diff0 >> 1, temporal_diff1), temporal_diff2);
+
+ const Vec16i b = ((Vec16i().load_16uc(prev2Above2 + x) + Vec16i().load_16uc(next2Above2 + x)) >> 1) - c;
+ const Vec16i f = ((Vec16i().load_16uc(prev2Below2 + x) + Vec16i().load_16uc(next2Below2 + x)) >> 1) - e;
+ const Vec16i dc = d - c;
+ const Vec16i de = d - e;
+ const Vec16i maximum = max(max(de, dc), min(b, f));
+ const Vec16i minimum = min(min(de, dc), max(b, f));
+ const Vec16i diff = max(max(temporal_diff, minimum), -maximum);
+
+ const Vec16i interpol1 = (((coef_hf[0] * (Vec16i().load_16uc(prev2 + x) + Vec16i().load_16uc(next2 + x))
+ - coef_hf[1] * (Vec16i().load_16uc(prev2Above2 + x) + Vec16i().load_16uc(next2Above2 + x) + Vec16i().load_16uc(prev2Below2 + x) + Vec16i().load_16uc(next2Below2 + x))
+ + coef_hf[2] * (Vec16i().load_16uc(prev2Above4 + x) + Vec16i().load_16uc(next2Above4 + x) + Vec16i().load_16uc(prev2Below4 + x) + Vec16i().load_16uc(next2Below4 + x))) >> 2)
+ + coef_lf[0] * (c + e) - coef_lf[1] * (Vec16i().load_16uc(curAbove3 + x) + Vec16i().load_16uc(curBelow3 + x))) >> 13;
+ const Vec16i interpol2 = (coef_sp[0] * (c + e) - coef_sp[1] * (Vec16i().load_16uc(curAbove3 + x) + Vec16i().load_16uc(curBelow3 + x))) >> 13;
+
+ Vec16i interpol = select(abs(c - e) > temporal_diff0, interpol1, interpol2);
+ interpol = min(max(interpol, d - diff), d + diff);
+ interpol = select(temporal_diff == 0, d, interpol);
+ const auto result = compress_saturated_s2u(compress_saturated(interpol, zero_si512()), zero_si512()).get_low().get_low();
+ result.store_nt(dst + x);
+ } else if constexpr (std::is_same_v<pixel_t, uint16_t>) {
+ const Vec16i c = Vec16i().load_16us(curAbove + x);
+ const Vec16i d = (Vec16i().load_16us(prev2 + x) + Vec16i().load_16us(next2 + x)) >> 1;
+ const Vec16i e = Vec16i().load_16us(curBelow + x);
+ const Vec16i temporal_diff0 = abs(Vec16i().load_16us(prev2 + x) - Vec16i().load_16us(next2 + x));
+ const Vec16i temporal_diff1 = (abs(Vec16i().load_16us(prevAbove + x) - c) + abs(Vec16i().load_16us(prevBelow + x) - e)) >> 1;
+ const Vec16i temporal_diff2 = (abs(Vec16i().load_16us(nextAbove + x) - c) + abs(Vec16i().load_16us(nextBelow + x) - e)) >> 1;
+ const Vec16i temporal_diff = max(max(temporal_diff0 >> 1, temporal_diff1), temporal_diff2);
+
+ const Vec16i b = ((Vec16i().load_16us(prev2Above2 + x) + Vec16i().load_16us(next2Above2 + x)) >> 1) - c;
+ const Vec16i f = ((Vec16i().load_16us(prev2Below2 + x) + Vec16i().load_16us(next2Below2 + x)) >> 1) - e;
+ const Vec16i dc = d - c;
+ const Vec16i de = d - e;
+ const Vec16i maximum = max(max(de, dc), min(b, f));
+ const Vec16i minimum = min(min(de, dc), max(b, f));
+ const Vec16i diff = max(max(temporal_diff, minimum), -maximum);
+
+ const Vec16i interpol1 = (((coef_hf[0] * (Vec16i().load_16us(prev2 + x) + Vec16i().load_16us(next2 + x))
+ - coef_hf[1] * (Vec16i().load_16us(prev2Above2 + x) + Vec16i().load_16us(next2Above2 + x) + Vec16i().load_16us(prev2Below2 + x) + Vec16i().load_16us(next2Below2 + x))
+ + coef_hf[2] * (Vec16i().load_16us(prev2Above4 + x) + Vec16i().load_16us(next2Above4 + x) + Vec16i().load_16us(prev2Below4 + x) + Vec16i().load_16us(next2Below4 + x))) >> 2)
+ + coef_lf[0] * (c + e) - coef_lf[1] * (Vec16i().load_16us(curAbove3 + x) + Vec16i().load_16us(curBelow3 + x))) >> 13;
+ const Vec16i interpol2 = (coef_sp[0] * (c + e) - coef_sp[1] * (Vec16i().load_16us(curAbove3 + x) + Vec16i().load_16us(curBelow3 + x))) >> 13;
+
+ Vec16i interpol = select(abs(c - e) > temporal_diff0, interpol1, interpol2);
+ interpol = min(max(interpol, d - diff), d + diff);
+ interpol = select(temporal_diff == 0, d, interpol);
+ const auto result = compress_saturated_s2u(interpol, zero_si512()).get_low();
+ min(result, peak).store_nt(dst + x);
+ } else {
+ const Vec16f c = Vec16f().load_a(curAbove + x);
+ const Vec16f d = (Vec16f().load_a(prev2 + x) + Vec16f().load_a(next2 + x)) * 0.5f;
+ const Vec16f e = Vec16f().load_a(curBelow + x);
+ const Vec16f temporal_diff0 = abs(Vec16f().load_a(prev2 + x) - Vec16f().load_a(next2 + x));
+ const Vec16f temporal_diff1 = (abs(Vec16f().load_a(prevAbove + x) - c) + abs(Vec16f().load_a(prevBelow + x) - e)) * 0.5f;
+ const Vec16f temporal_diff2 = (abs(Vec16f().load_a(nextAbove + x) - c) + abs(Vec16f().load_a(nextBelow + x) - e)) * 0.5f;
+ const Vec16f temporal_diff = max(max(temporal_diff0 * 0.5f, temporal_diff1), temporal_diff2);
+
+ const Vec16f b = ((Vec16f().load_a(prev2Above2 + x) + Vec16f().load_a(next2Above2 + x)) * 0.5f) - c;
+ const Vec16f f = ((Vec16f().load_a(prev2Below2 + x) + Vec16f().load_a(next2Below2 + x)) * 0.5f) - e;
+ const Vec16f dc = d - c;
+ const Vec16f de = d - e;
+ const Vec16f maximum = max(max(de, dc), min(b, f));
+ const Vec16f minimum = min(min(de, dc), max(b, f));
+ const Vec16f diff = max(max(temporal_diff, minimum), -maximum);
+
+ const Vec16f interpol1 = ((coef_hf_f[0] * (Vec16f().load_a(prev2 + x) + Vec16f().load_a(next2 + x))
+ - coef_hf_f[1] * (Vec16f().load_a(prev2Above2 + x) + Vec16f().load_a(next2Above2 + x) + Vec16f().load_a(prev2Below2 + x) + Vec16f().load_a(next2Below2 + x))
+ + coef_hf_f[2] * (Vec16f().load_a(prev2Above4 + x) + Vec16f().load_a(next2Above4 + x) + Vec16f().load_a(prev2Below4 + x) + Vec16f().load_a(next2Below4 + x))) * 0.25f
+ + coef_lf_f[0] * (c + e) - coef_lf_f[1] * (Vec16f().load_a(curAbove3 + x) + Vec16f().load_a(curBelow3 + x)));
+ const Vec16f interpol2 = coef_sp_f[0] * (c + e) - coef_sp_f[1] * (Vec16f().load_a(curAbove3 + x) + Vec16f().load_a(curBelow3 + x));
+
+ Vec16f interpol = select(abs(c - e) > temporal_diff0, interpol1, interpol2);
+ interpol = min(max(interpol, d - diff), d + diff);
+ select(temporal_diff == 0, d, interpol).store_nt(dst + x);
+ }
+ }
+}
+
+template void filterEdge_avx512<uint8_t, false>(const void * _prev2, const void * _prev, const void * _cur, const void * _next, const void * _next2, void * _dst, const int width, const int positiveStride, const int negativeStride, const int stride2, const int step, const int peak) noexcept;
+template void filterEdge_avx512<uint8_t, true>(const void * _prev2, const void * _prev, const void * _cur, const void * _next, const void * _next2, void * _dst, const int width, const int positiveStride, const int negativeStride, const int stride2, const int step, const int peak) noexcept;
+template void filterEdge_avx512<uint16_t, false>(const void * _prev2, const void * _prev, const void * _cur, const void * _next, const void * _next2, void * _dst, const int width, const int positiveStride, const int negativeStride, const int stride2, const int step, const int peak) noexcept;
+template void filterEdge_avx512<uint16_t, true>(const void * _prev2, const void * _prev, const void * _cur, const void * _next, const void * _next2, void * _dst, const int width, const int positiveStride, const int negativeStride, const int stride2, const int step, const int peak) noexcept;
+template void filterEdge_avx512<float, false>(const void * _prev2, const void * _prev, const void * _cur, const void * _next, const void * _next2, void * _dst, const int width, const int positiveStride, const int negativeStride, const int stride2, const int step, const int peak) noexcept;
+template void filterEdge_avx512<float, true>(const void * _prev2, const void * _prev, const void * _cur, const void * _next, const void * _next2, void * _dst, const int width, const int positiveStride, const int negativeStride, const int stride2, const int step, const int peak) noexcept;
+
+template void filterLine_avx512<uint8_t>(const void * _prev2, const void * _prev, const void * _cur, const void * _next, const void * _next2, void * _dst, const int width, const int stride, const int stride2, const int stride3, const int stride4, const int step, const int peak) noexcept;
+template void filterLine_avx512<uint16_t>(const void * _prev2, const void * _prev, const void * _cur, const void * _next, const void * _next2, void * _dst, const int width, const int stride, const int stride2, const int stride3, const int stride4, const int step, const int peak) noexcept;
+template void filterLine_avx512<float>(const void * _prev2, const void * _prev, const void * _cur, const void * _next, const void * _next2, void * _dst, const int width, const int stride, const int stride2, const int stride3, const int stride4, const int step, const int peak) noexcept;
+#endif
diff --git a/Bwdif/Bwdif_SSE2.cpp b/Bwdif/Bwdif_SSE2.cpp
new file mode 100644
index 0000000..e54a6e7
--- /dev/null
+++ b/Bwdif/Bwdif_SSE2.cpp
@@ -0,0 +1,224 @@
+#ifdef BWDIF_X86
+#include "Bwdif.h"
+
+template<typename pixel_t, bool spat>
+void filterEdge_sse2(const void * _prev2, const void * _prev, const void * _cur, const void * _next, const void * _next2, void * _dst, const int width,
+ const int positiveStride, const int negativeStride, const int stride2, const int step, const int peak) noexcept {
+ const pixel_t * prev2 = reinterpret_cast<const pixel_t *>(_prev2);
+ const pixel_t * prev = reinterpret_cast<const pixel_t *>(_prev);
+ const pixel_t * cur = reinterpret_cast<const pixel_t *>(_cur);
+ const pixel_t * next = reinterpret_cast<const pixel_t *>(_next);
+ const pixel_t * next2 = reinterpret_cast<const pixel_t *>(_next2);
+ pixel_t * dst = reinterpret_cast<pixel_t *>(_dst);
+
+ const pixel_t * prev2Above2 = prev2 - stride2;
+ const pixel_t * prev2Below2 = prev2 + stride2;
+ const pixel_t * prevAbove = prev + negativeStride;
+ const pixel_t * prevBelow = prev + positiveStride;
+ const pixel_t * curAbove = cur + negativeStride;
+ const pixel_t * curBelow = cur + positiveStride;
+ const pixel_t * nextAbove = next + negativeStride;
+ const pixel_t * nextBelow = next + positiveStride;
+ const pixel_t * next2Above2 = next2 - stride2;
+ const pixel_t * next2Below2 = next2 + stride2;
+
+ for (int x = 0; x < width; x += step) {
+ if constexpr (std::is_same_v<pixel_t, uint8_t>) {
+ const Vec8s c = Vec8s().load_8uc(curAbove + x);
+ const Vec8s d = (Vec8s().load_8uc(prev2 + x) + Vec8s().load_8uc(next2 + x)) >> 1;
+ const Vec8s e = Vec8s().load_8uc(curBelow + x);
+ const Vec8s temporal_diff0 = abs(Vec8s().load_8uc(prev2 + x) - Vec8s().load_8uc(next2 + x));
+ const Vec8s temporal_diff1 = (abs(Vec8s().load_8uc(prevAbove + x) - c) + abs(Vec8s().load_8uc(prevBelow + x) - e)) >> 1;
+ const Vec8s temporal_diff2 = (abs(Vec8s().load_8uc(nextAbove + x) - c) + abs(Vec8s().load_8uc(nextBelow + x) - e)) >> 1;
+ const Vec8s temporal_diff = max(max(temporal_diff0 >> 1, temporal_diff1), temporal_diff2);
+
+ Vec8s diff;
+ if constexpr (spat) {
+ const Vec8s b = ((Vec8s().load_8uc(prev2Above2 + x) + Vec8s().load_8uc(next2Above2 + x)) >> 1) - c;
+ const Vec8s f = ((Vec8s().load_8uc(prev2Below2 + x) + Vec8s().load_8uc(next2Below2 + x)) >> 1) - e;
+ const Vec8s dc = d - c;
+ const Vec8s de = d - e;
+ const Vec8s maximum = max(max(de, dc), min(b, f));
+ const Vec8s minimum = min(min(de, dc), max(b, f));
+ diff = max(max(temporal_diff, minimum), -maximum);
+ }
+
+ Vec8s interpol = min(max((c + e) >> 1, d - diff), d + diff);
+ interpol = select(temporal_diff == 0, d, interpol);
+ const auto result = compress_saturated_s2u(interpol, zero_si128());
+ result.storel(dst + x);
+ } else if constexpr (std::is_same_v<pixel_t, uint16_t>) {
+ const Vec4i c = Vec4i().load_4us(curAbove + x);
+ const Vec4i d = (Vec4i().load_4us(prev2 + x) + Vec4i().load_4us(next2 + x)) >> 1;
+ const Vec4i e = Vec4i().load_4us(curBelow + x);
+ const Vec4i temporal_diff0 = abs(Vec4i().load_4us(prev2 + x) - Vec4i().load_4us(next2 + x));
+ const Vec4i temporal_diff1 = (abs(Vec4i().load_4us(prevAbove + x) - c) + abs(Vec4i().load_4us(prevBelow + x) - e)) >> 1;
+ const Vec4i temporal_diff2 = (abs(Vec4i().load_4us(nextAbove + x) - c) + abs(Vec4i().load_4us(nextBelow + x) - e)) >> 1;
+ const Vec4i temporal_diff = max(max(temporal_diff0 >> 1, temporal_diff1), temporal_diff2);
+
+ Vec4i diff;
+ if constexpr (spat) {
+ const Vec4i b = ((Vec4i().load_4us(prev2Above2 + x) + Vec4i().load_4us(next2Above2 + x)) >> 1) - c;
+ const Vec4i f = ((Vec4i().load_4us(prev2Below2 + x) + Vec4i().load_4us(next2Below2 + x)) >> 1) - e;
+ const Vec4i dc = d - c;
+ const Vec4i de = d - e;
+ const Vec4i maximum = max(max(de, dc), min(b, f));
+ const Vec4i minimum = min(min(de, dc), max(b, f));
+ diff = max(max(temporal_diff, minimum), -maximum);
+ }
+
+ Vec4i interpol = min(max((c + e) >> 1, d - diff), d + diff);
+ interpol = select(temporal_diff == 0, d, interpol);
+ const auto result = compress_saturated_s2u(interpol, zero_si128());
+ min(result, peak).storel(dst + x);
+ } else {
+ const Vec4f c = Vec4f().load_a(curAbove + x);
+ const Vec4f d = (Vec4f().load_a(prev2 + x) + Vec4f().load_a(next2 + x)) * 0.5f;
+ const Vec4f e = Vec4f().load_a(curBelow + x);
+ const Vec4f temporal_diff0 = abs(Vec4f().load_a(prev2 + x) - Vec4f().load_a(next2 + x));
+ const Vec4f temporal_diff1 = (abs(Vec4f().load_a(prevAbove + x) - c) + abs(Vec4f().load_a(prevBelow + x) - e)) * 0.5f;
+ const Vec4f temporal_diff2 = (abs(Vec4f().load_a(nextAbove + x) - c) + abs(Vec4f().load_a(nextBelow + x) - e)) * 0.5f;
+ const Vec4f temporal_diff = max(max(temporal_diff0 * 0.5f, temporal_diff1), temporal_diff2);
+
+ Vec4f diff;
+ if constexpr (spat) {
+ const Vec4f b = ((Vec4f().load_a(prev2Above2 + x) + Vec4f().load_a(next2Above2 + x)) * 0.5f) - c;
+ const Vec4f f = ((Vec4f().load_a(prev2Below2 + x) + Vec4f().load_a(next2Below2 + x)) * 0.5f) - e;
+ const Vec4f dc = d - c;
+ const Vec4f de = d - e;
+ const Vec4f maximum = max(max(de, dc), min(b, f));
+ const Vec4f minimum = min(min(de, dc), max(b, f));
+ diff = max(max(temporal_diff, minimum), -maximum);
+ }
+
+ const Vec4f interpol = min(max((c + e) * 0.5f, d - diff), d + diff);
+ select(temporal_diff == 0, d, interpol).store_nt(dst + x);
+ }
+ }
+}
+
+template<typename pixel_t>
+void filterLine_sse2(const void * _prev2, const void * _prev, const void * _cur, const void * _next, const void * _next2, void * _dst, const int width,
+ const int stride, const int stride2, const int stride3, const int stride4, const int step, const int peak) noexcept {
+ const pixel_t * prev2 = reinterpret_cast<const pixel_t *>(_prev2);
+ const pixel_t * prev = reinterpret_cast<const pixel_t *>(_prev);
+ const pixel_t * cur = reinterpret_cast<const pixel_t *>(_cur);
+ const pixel_t * next = reinterpret_cast<const pixel_t *>(_next);
+ const pixel_t * next2 = reinterpret_cast<const pixel_t *>(_next2);
+ pixel_t * dst = reinterpret_cast<pixel_t *>(_dst);
+
+ const pixel_t * prev2Above4 = prev2 - stride4;
+ const pixel_t * prev2Above2 = prev2 - stride2;
+ const pixel_t * prev2Below2 = prev2 + stride2;
+ const pixel_t * prev2Below4 = prev2 + stride4;
+ const pixel_t * prevAbove = prev - stride;
+ const pixel_t * prevBelow = prev + stride;
+ const pixel_t * curAbove3 = cur - stride3;
+ const pixel_t * curAbove = cur - stride;
+ const pixel_t * curBelow = cur + stride;
+ const pixel_t * curBelow3 = cur + stride3;
+ const pixel_t * nextAbove = next - stride;
+ const pixel_t * nextBelow = next + stride;
+ const pixel_t * next2Above4 = next2 - stride4;
+ const pixel_t * next2Above2 = next2 - stride2;
+ const pixel_t * next2Below2 = next2 + stride2;
+ const pixel_t * next2Below4 = next2 + stride4;
+
+ for (int x = 0; x < width; x += step) {
+ if constexpr (std::is_same_v<pixel_t, uint8_t>) {
+ const Vec4i c = Vec4i().load_4uc(curAbove + x);
+ const Vec4i d = (Vec4i().load_4uc(prev2 + x) + Vec4i().load_4uc(next2 + x)) >> 1;
+ const Vec4i e = Vec4i().load_4uc(curBelow + x);
+ const Vec4i temporal_diff0 = abs(Vec4i().load_4uc(prev2 + x) - Vec4i().load_4uc(next2 + x));
+ const Vec4i temporal_diff1 = (abs(Vec4i().load_4uc(prevAbove + x) - c) + abs(Vec4i().load_4uc(prevBelow + x) - e)) >> 1;
+ const Vec4i temporal_diff2 = (abs(Vec4i().load_4uc(nextAbove + x) - c) + abs(Vec4i().load_4uc(nextBelow + x) - e)) >> 1;
+ const Vec4i temporal_diff = max(max(temporal_diff0 >> 1, temporal_diff1), temporal_diff2);
+
+ const Vec4i b = ((Vec4i().load_4uc(prev2Above2 + x) + Vec4i().load_4uc(next2Above2 + x)) >> 1) - c;
+ const Vec4i f = ((Vec4i().load_4uc(prev2Below2 + x) + Vec4i().load_4uc(next2Below2 + x)) >> 1) - e;
+ const Vec4i dc = d - c;
+ const Vec4i de = d - e;
+ const Vec4i maximum = max(max(de, dc), min(b, f));
+ const Vec4i minimum = min(min(de, dc), max(b, f));
+ const Vec4i diff = max(max(temporal_diff, minimum), -maximum);
+
+ const Vec4i interpol1 = (((coef_hf[0] * (Vec4i().load_4uc(prev2 + x) + Vec4i().load_4uc(next2 + x))
+ - coef_hf[1] * (Vec4i().load_4uc(prev2Above2 + x) + Vec4i().load_4uc(next2Above2 + x) + Vec4i().load_4uc(prev2Below2 + x) + Vec4i().load_4uc(next2Below2 + x))
+ + coef_hf[2] * (Vec4i().load_4uc(prev2Above4 + x) + Vec4i().load_4uc(next2Above4 + x) + Vec4i().load_4uc(prev2Below4 + x) + Vec4i().load_4uc(next2Below4 + x))) >> 2)
+ + coef_lf[0] * (c + e) - coef_lf[1] * (Vec4i().load_4uc(curAbove3 + x) + Vec4i().load_4uc(curBelow3 + x))) >> 13;
+ const Vec4i interpol2 = (coef_sp[0] * (c + e) - coef_sp[1] * (Vec4i().load_4uc(curAbove3 + x) + Vec4i().load_4uc(curBelow3 + x))) >> 13;
+
+ Vec4i interpol = select(abs(c - e) > temporal_diff0, interpol1, interpol2);
+ interpol = min(max(interpol, d - diff), d + diff);
+ interpol = select(temporal_diff == 0, d, interpol);
+ const auto result = compress_saturated_s2u(compress_saturated(interpol, zero_si128()), zero_si128());
+ result.store_si32(dst + x);
+ } else if constexpr (std::is_same_v<pixel_t, uint16_t>) {
+ const Vec4i c = Vec4i().load_4us(curAbove + x);
+ const Vec4i d = (Vec4i().load_4us(prev2 + x) + Vec4i().load_4us(next2 + x)) >> 1;
+ const Vec4i e = Vec4i().load_4us(curBelow + x);
+ const Vec4i temporal_diff0 = abs(Vec4i().load_4us(prev2 + x) - Vec4i().load_4us(next2 + x));
+ const Vec4i temporal_diff1 = (abs(Vec4i().load_4us(prevAbove + x) - c) + abs(Vec4i().load_4us(prevBelow + x) - e)) >> 1;
+ const Vec4i temporal_diff2 = (abs(Vec4i().load_4us(nextAbove + x) - c) + abs(Vec4i().load_4us(nextBelow + x) - e)) >> 1;
+ const Vec4i temporal_diff = max(max(temporal_diff0 >> 1, temporal_diff1), temporal_diff2);
+
+ const Vec4i b = ((Vec4i().load_4us(prev2Above2 + x) + Vec4i().load_4us(next2Above2 + x)) >> 1) - c;
+ const Vec4i f = ((Vec4i().load_4us(prev2Below2 + x) + Vec4i().load_4us(next2Below2 + x)) >> 1) - e;
+ const Vec4i dc = d - c;
+ const Vec4i de = d - e;
+ const Vec4i maximum = max(max(de, dc), min(b, f));
+ const Vec4i minimum = min(min(de, dc), max(b, f));
+ const Vec4i diff = max(max(temporal_diff, minimum), -maximum);
+
+ const Vec4i interpol1 = (((coef_hf[0] * (Vec4i().load_4us(prev2 + x) + Vec4i().load_4us(next2 + x))
+ - coef_hf[1] * (Vec4i().load_4us(prev2Above2 + x) + Vec4i().load_4us(next2Above2 + x) + Vec4i().load_4us(prev2Below2 + x) + Vec4i().load_4us(next2Below2 + x))
+ + coef_hf[2] * (Vec4i().load_4us(prev2Above4 + x) + Vec4i().load_4us(next2Above4 + x) + Vec4i().load_4us(prev2Below4 + x) + Vec4i().load_4us(next2Below4 + x))) >> 2)
+ + coef_lf[0] * (c + e) - coef_lf[1] * (Vec4i().load_4us(curAbove3 + x) + Vec4i().load_4us(curBelow3 + x))) >> 13;
+ const Vec4i interpol2 = (coef_sp[0] * (c + e) - coef_sp[1] * (Vec4i().load_4us(curAbove3 + x) + Vec4i().load_4us(curBelow3 + x))) >> 13;
+
+ Vec4i interpol = select(abs(c - e) > temporal_diff0, interpol1, interpol2);
+ interpol = min(max(interpol, d - diff), d + diff);
+ interpol = select(temporal_diff == 0, d, interpol);
+ const auto result = compress_saturated_s2u(interpol, zero_si128());
+ min(result, peak).storel(dst + x);
+ } else {
+ const Vec4f c = Vec4f().load_a(curAbove + x);
+ const Vec4f d = (Vec4f().load_a(prev2 + x) + Vec4f().load_a(next2 + x)) * 0.5f;
+ const Vec4f e = Vec4f().load_a(curBelow + x);
+ const Vec4f temporal_diff0 = abs(Vec4f().load_a(prev2 + x) - Vec4f().load_a(next2 + x));
+ const Vec4f temporal_diff1 = (abs(Vec4f().load_a(prevAbove + x) - c) + abs(Vec4f().load_a(prevBelow + x) - e)) * 0.5f;
+ const Vec4f temporal_diff2 = (abs(Vec4f().load_a(nextAbove + x) - c) + abs(Vec4f().load_a(nextBelow + x) - e)) * 0.5f;
+ const Vec4f temporal_diff = max(max(temporal_diff0 * 0.5f, temporal_diff1), temporal_diff2);
+
+ const Vec4f b = ((Vec4f().load_a(prev2Above2 + x) + Vec4f().load_a(next2Above2 + x)) * 0.5f) - c;
+ const Vec4f f = ((Vec4f().load_a(prev2Below2 + x) + Vec4f().load_a(next2Below2 + x)) * 0.5f) - e;
+ const Vec4f dc = d - c;
+ const Vec4f de = d - e;
+ const Vec4f maximum = max(max(de, dc), min(b, f));
+ const Vec4f minimum = min(min(de, dc), max(b, f));
+ const Vec4f diff = max(max(temporal_diff, minimum), -maximum);
+
+ const Vec4f interpol1 = ((coef_hf_f[0] * (Vec4f().load_a(prev2 + x) + Vec4f().load_a(next2 + x))
+ - coef_hf_f[1] * (Vec4f().load_a(prev2Above2 + x) + Vec4f().load_a(next2Above2 + x) + Vec4f().load_a(prev2Below2 + x) + Vec4f().load_a(next2Below2 + x))
+ + coef_hf_f[2] * (Vec4f().load_a(prev2Above4 + x) + Vec4f().load_a(next2Above4 + x) + Vec4f().load_a(prev2Below4 + x) + Vec4f().load_a(next2Below4 + x))) * 0.25f
+ + coef_lf_f[0] * (c + e) - coef_lf_f[1] * (Vec4f().load_a(curAbove3 + x) + Vec4f().load_a(curBelow3 + x)));
+ const Vec4f interpol2 = coef_sp_f[0] * (c + e) - coef_sp_f[1] * (Vec4f().load_a(curAbove3 + x) + Vec4f().load_a(curBelow3 + x));
+
+ Vec4f interpol = select(abs(c - e) > temporal_diff0, interpol1, interpol2);
+ interpol = min(max(interpol, d - diff), d + diff);
+ select(temporal_diff == 0, d, interpol).store_nt(dst + x);
+ }
+ }
+}
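+
+/* Illustrative scalar summary of the per-pixel step implemented by filterLine above
+   (names match the SIMD variables; this is pseudocode, not compiled):
+     d             = (prev2 + next2) / 2;                    // temporal average for the missing line
+     temporal_diff = max(temporal_diff0 / 2, temporal_diff1, temporal_diff2);
+     diff          = max(temporal_diff, minimum, -maximum);  // tightened by the spatial check (b, f)
+     interpol      = |c - e| > temporal_diff0 ? interpol1    // sharp vertical filter (coef_lf/coef_hf)
+                                              : interpol2;   // spatial-only filter (coef_sp)
+     interpol      = clamp(interpol, d - diff, d + diff);
+     dst           = temporal_diff == 0 ? d : interpol;
+*/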
+
+template void filterEdge_sse2<uint8_t, true>(const void * _prev2, const void * _prev, const void * _cur, const void * _next, const void * _next2, void * _dst, const int width, const int positiveStride, const int negativeStride, const int stride2, const int step, const int peak) noexcept;
+template void filterEdge_sse2<uint8_t, false>(const void * _prev2, const void * _prev, const void * _cur, const void * _next, const void * _next2, void * _dst, const int width, const int positiveStride, const int negativeStride, const int stride2, const int step, const int peak) noexcept;
+template void filterEdge_sse2<uint16_t, true>(const void * _prev2, const void * _prev, const void * _cur, const void * _next, const void * _next2, void * _dst, const int width, const int positiveStride, const int negativeStride, const int stride2, const int step, const int peak) noexcept;
+template void filterEdge_sse2<uint16_t, false>(const void * _prev2, const void * _prev, const void * _cur, const void * _next, const void * _next2, void * _dst, const int width, const int positiveStride, const int negativeStride, const int stride2, const int step, const int peak) noexcept;
+template void filterEdge_sse2<float, true>(const void * _prev2, const void * _prev, const void * _cur, const void * _next, const void * _next2, void * _dst, const int width, const int positiveStride, const int negativeStride, const int stride2, const int step, const int peak) noexcept;
+template void filterEdge_sse2<float, false>(const void * _prev2, const void * _prev, const void * _cur, const void * _next, const void * _next2, void * _dst, const int width, const int positiveStride, const int negativeStride, const int stride2, const int step, const int peak) noexcept;
+
+template void filterLine_sse2<uint8_t>(const void * _prev2, const void * _prev, const void * _cur, const void * _next, const void * _next2, void * _dst, const int width, const int stride, const int stride2, const int stride3, const int stride4, const int step, const int peak) noexcept;
+template void filterLine_sse2<uint16_t>(const void * _prev2, const void * _prev, const void * _cur, const void * _next, const void * _next2, void * _dst, const int width, const int stride, const int stride2, const int stride3, const int stride4, const int step, const int peak) noexcept;
+template void filterLine_sse2<float>(const void * _prev2, const void * _prev, const void * _cur, const void * _next, const void * _next2, void * _dst, const int width, const int stride, const int stride2, const int stride3, const int stride4, const int step, const int peak) noexcept;
+#endif
diff --git a/Bwdif/VCL2/LICENSE b/Bwdif/VCL2/LICENSE
new file mode 100644
index 0000000..fd2deda
--- /dev/null
+++ b/Bwdif/VCL2/LICENSE
@@ -0,0 +1,191 @@
+ Apache License
+ Version 2.0, January 2004
+ http://www.apache.org/licenses/
+
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+ 1. Definitions.
+
+ "License" shall mean the terms and conditions for use, reproduction,
+ and distribution as defined by Sections 1 through 9 of this document.
+
+ "Licensor" shall mean the copyright owner or entity authorized by
+ the copyright owner that is granting the License.
+
+ "Legal Entity" shall mean the union of the acting entity and all
+ other entities that control, are controlled by, or are under common
+ control with that entity. For the purposes of this definition,
+ "control" means (i) the power, direct or indirect, to cause the
+ direction or management of such entity, whether by contract or
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
+ outstanding shares, or (iii) beneficial ownership of such entity.
+
+ "You" (or "Your") shall mean an individual or Legal Entity
+ exercising permissions granted by this License.
+
+ "Source" form shall mean the preferred form for making modifications,
+ including but not limited to software source code, documentation
+ source, and configuration files.
+
+ "Object" form shall mean any form resulting from mechanical
+ transformation or translation of a Source form, including but
+ not limited to compiled object code, generated documentation,
+ and conversions to other media types.
+
+ "Work" shall mean the work of authorship, whether in Source or
+ Object form, made available under the License, as indicated by a
+ copyright notice that is included in or attached to the work
+ (an example is provided in the Appendix below).
+
+ "Derivative Works" shall mean any work, whether in Source or Object
+ form, that is based on (or derived from) the Work and for which the
+ editorial revisions, annotations, elaborations, or other modifications
+ represent, as a whole, an original work of authorship. For the purposes
+ of this License, Derivative Works shall not include works that remain
+ separable from, or merely link (or bind by name) to the interfaces of,
+ the Work and Derivative Works thereof.
+
+ "Contribution" shall mean any work of authorship, including
+ the original version of the Work and any modifications or additions
+ to that Work or Derivative Works thereof, that is intentionally
+ submitted to Licensor for inclusion in the Work by the copyright owner
+ or by an individual or Legal Entity authorized to submit on behalf of
+ the copyright owner. For the purposes of this definition, "submitted"
+ means any form of electronic, verbal, or written communication sent
+ to the Licensor or its representatives, including but not limited to
+ communication on electronic mailing lists, source code control systems,
+ and issue tracking systems that are managed by, or on behalf of, the
+ Licensor for the purpose of discussing and improving the Work, but
+ excluding communication that is conspicuously marked or otherwise
+ designated in writing by the copyright owner as "Not a Contribution."
+
+ "Contributor" shall mean Licensor and any individual or Legal Entity
+ on behalf of whom a Contribution has been received by Licensor and
+ subsequently incorporated within the Work.
+
+ 2. Grant of Copyright License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ copyright license to reproduce, prepare Derivative Works of,
+ publicly display, publicly perform, sublicense, and distribute the
+ Work and such Derivative Works in Source or Object form.
+
+ 3. Grant of Patent License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ (except as stated in this section) patent license to make, have made,
+ use, offer to sell, sell, import, and otherwise transfer the Work,
+ where such license applies only to those patent claims licensable
+ by such Contributor that are necessarily infringed by their
+ Contribution(s) alone or by combination of their Contribution(s)
+ with the Work to which such Contribution(s) was submitted. If You
+ institute patent litigation against any entity (including a
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
+ or a Contribution incorporated within the Work constitutes direct
+ or contributory patent infringement, then any patent licenses
+ granted to You under this License for that Work shall terminate
+ as of the date such litigation is filed.
+
+ 4. Redistribution. You may reproduce and distribute copies of the
+ Work or Derivative Works thereof in any medium, with or without
+ modifications, and in Source or Object form, provided that You
+ meet the following conditions:
+
+ (a) You must give any other recipients of the Work or
+ Derivative Works a copy of this License; and
+
+ (b) You must cause any modified files to carry prominent notices
+ stating that You changed the files; and
+
+ (c) You must retain, in the Source form of any Derivative Works
+ that You distribute, all copyright, patent, trademark, and
+ attribution notices from the Source form of the Work,
+ excluding those notices that do not pertain to any part of
+ the Derivative Works; and
+
+ (d) If the Work includes a "NOTICE" text file as part of its
+ distribution, then any Derivative Works that You distribute must
+ include a readable copy of the attribution notices contained
+ within such NOTICE file, excluding those notices that do not
+ pertain to any part of the Derivative Works, in at least one
+ of the following places: within a NOTICE text file distributed
+ as part of the Derivative Works; within the Source form or
+ documentation, if provided along with the Derivative Works; or,
+ within a display generated by the Derivative Works, if and
+ wherever such third-party notices normally appear. The contents
+ of the NOTICE file are for informational purposes only and
+ do not modify the License. You may add Your own attribution
+ notices within Derivative Works that You distribute, alongside
+ or as an addendum to the NOTICE text from the Work, provided
+ that such additional attribution notices cannot be construed
+ as modifying the License.
+
+ You may add Your own copyright statement to Your modifications and
+ may provide additional or different license terms and conditions
+ for use, reproduction, or distribution of Your modifications, or
+ for any such Derivative Works as a whole, provided Your use,
+ reproduction, and distribution of the Work otherwise complies with
+ the conditions stated in this License.
+
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
+ any Contribution intentionally submitted for inclusion in the Work
+ by You to the Licensor shall be under the terms and conditions of
+ this License, without any additional terms or conditions.
+ Notwithstanding the above, nothing herein shall supersede or modify
+ the terms of any separate license agreement you may have executed
+ with Licensor regarding such Contributions.
+
+ 6. Trademarks. This License does not grant permission to use the trade
+ names, trademarks, service marks, or product names of the Licensor,
+ except as required for reasonable and customary use in describing the
+ origin of the Work and reproducing the content of the NOTICE file.
+
+ 7. Disclaimer of Warranty. Unless required by applicable law or
+ agreed to in writing, Licensor provides the Work (and each
+ Contributor provides its Contributions) on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ implied, including, without limitation, any warranties or conditions
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+ PARTICULAR PURPOSE. You are solely responsible for determining the
+ appropriateness of using or redistributing the Work and assume any
+ risks associated with Your exercise of permissions under this License.
+
+ 8. Limitation of Liability. In no event and under no legal theory,
+ whether in tort (including negligence), contract, or otherwise,
+ unless required by applicable law (such as deliberate and grossly
+ negligent acts) or agreed to in writing, shall any Contributor be
+ liable to You for damages, including any direct, indirect, special,
+ incidental, or consequential damages of any character arising as a
+ result of this License or out of the use or inability to use the
+ Work (including but not limited to damages for loss of goodwill,
+ work stoppage, computer failure or malfunction, or any and all
+ other commercial damages or losses), even if such Contributor
+ has been advised of the possibility of such damages.
+
+ 9. Accepting Warranty or Additional Liability. While redistributing
+ the Work or Derivative Works thereof, You may choose to offer,
+ and charge a fee for, acceptance of support, warranty, indemnity,
+ or other liability obligations and/or rights consistent with this
+ License. However, in accepting such obligations, You may act only
+ on Your own behalf and on Your sole responsibility, not on behalf
+ of any other Contributor, and only if You agree to indemnify,
+ defend, and hold each Contributor harmless for any liability
+ incurred by, or claims asserted against, such Contributor by reason
+ of your accepting any such warranty or additional liability.
+
+ END OF TERMS AND CONDITIONS
+
+
+ Copyright 2012-2019 Agner Fog.
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
diff --git a/Bwdif/VCL2/instrset.h b/Bwdif/VCL2/instrset.h
new file mode 100644
index 0000000..34e34b7
--- /dev/null
+++ b/Bwdif/VCL2/instrset.h
@@ -0,0 +1,1485 @@
+/**************************** instrset.h **********************************
+* Author: Agner Fog
+* Date created: 2012-05-30
+* Last modified: 2020-06-08
+* Version: 2.01.03
+* Project: vector class library
+* Description:
+* Header file for various compiler-specific tasks as well as common
+* macros and templates. This file contains:
+*
+* > Selection of the supported instruction set
+* > Defines compiler version macros
+* > Undefines certain macros that prevent function overloading
+* > Helper functions that depend on instruction set, compiler, or platform
+* > Common templates for permute, blend, etc.
+*
+* For instructions, see vcl_manual.pdf
+*
+* (c) Copyright 2012-2020 Agner Fog.
+* Apache License version 2.0 or later.
+******************************************************************************/
+
+#ifndef INSTRSET_H
+#define INSTRSET_H 20103
+
+
+// Allow the use of floating point permute instructions on integer vectors.
+// Some CPUs have an extra latency of 1 or 2 clock cycles for this, but
+// it may still be faster than alternative implementations:
+#define ALLOW_FP_PERMUTE true
+
+
+// Macro to indicate 64 bit mode
+#if (defined(_M_AMD64) || defined(_M_X64) || defined(__amd64) ) && ! defined(__x86_64__)
+#define __x86_64__ 1 // There are many different macros for this, decide on only one
+#endif
+
+// The following values of INSTRSET are currently defined:
+// 2: SSE2
+// 3: SSE3
+// 4: SSSE3
+// 5: SSE4.1
+// 6: SSE4.2
+// 7: AVX
+// 8: AVX2
+// 9: AVX512F
+// 10: AVX512BW/DQ/VL
+// In the future, INSTRSET = 11 may include AVX512VBMI and AVX512VBMI2, but this
+// decision cannot be made before the market situation for CPUs with these
+// instruction sets is known (these future instruction set extensions are already
+// used in some VCL functions and tested with an emulator)
+
+// Find instruction set from compiler macros if INSTRSET is not defined.
+// Note: Most of these macros are not defined in Microsoft compilers
+#ifndef INSTRSET
+#if defined ( __AVX512VL__ ) && defined ( __AVX512BW__ ) && defined ( __AVX512DQ__ )
+#define INSTRSET 10
+#elif defined ( __AVX512F__ ) || defined ( __AVX512__ )
+#define INSTRSET 9
+#elif defined ( __AVX2__ )
+#define INSTRSET 8
+#elif defined ( __AVX__ )
+#define INSTRSET 7
+#elif defined ( __SSE4_2__ )
+#define INSTRSET 6
+#elif defined ( __SSE4_1__ )
+#define INSTRSET 5
+#elif defined ( __SSSE3__ )
+#define INSTRSET 4
+#elif defined ( __SSE3__ )
+#define INSTRSET 3
+#elif defined ( __SSE2__ ) || defined ( __x86_64__ )
+#define INSTRSET 2
+#elif defined ( __SSE__ )
+#define INSTRSET 1
+#elif defined ( _M_IX86_FP ) // Defined in MS compiler. 1: SSE, 2: SSE2
+#define INSTRSET _M_IX86_FP
+#else
+#define INSTRSET 0
+#endif // instruction set defines
+#endif // INSTRSET
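+
+// Example: compiling with "g++ -mavx2" defines __AVX2__ and therefore selects INSTRSET 8, while
+// "-mavx512f -mavx512bw -mavx512dq -mavx512vl" selects INSTRSET 10. INSTRSET may also be set
+// explicitly on the command line, e.g. -DINSTRSET=2 to force the SSE2 code path.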
+
+// Include the appropriate header file for intrinsic functions
+#if INSTRSET > 7 // AVX2 and later
+#if defined (__GNUC__) && ! defined (__INTEL_COMPILER)
+#include <x86intrin.h> // x86intrin.h includes header files for whatever instruction
+ // sets are specified on the compiler command line, such as:
+ // xopintrin.h, fma4intrin.h
+#else
+#include <immintrin.h> // MS/Intel version of immintrin.h covers AVX and later
+#endif // __GNUC__
+#elif INSTRSET == 7
+#include <immintrin.h> // AVX
+#elif INSTRSET == 6
+#include <nmmintrin.h> // SSE4.2
+#elif INSTRSET == 5
+#include <smmintrin.h> // SSE4.1
+#elif INSTRSET == 4
+#include <tmmintrin.h> // SSSE3
+#elif INSTRSET == 3
+#include <pmmintrin.h> // SSE3
+#elif INSTRSET == 2
+#include <emmintrin.h> // SSE2
+#elif INSTRSET == 1
+#include <xmmintrin.h> // SSE
+#endif // INSTRSET
+
+#if INSTRSET >= 8 && !defined(__FMA__)
+// Assume that all processors that have AVX2 also have FMA3
+#if defined (__GNUC__) && ! defined (__INTEL_COMPILER)
+// Prevent error message in g++ and Clang when using FMA intrinsics with avx2:
+#if !defined(DISABLE_WARNING_AVX2_WITHOUT_FMA)
+#pragma message "It is recommended to specify also option -mfma when using -mavx2 or higher"
+#endif
+#elif ! defined (__clang__)
+#define __FMA__ 1
+#endif
+#endif
+
+// AMD instruction sets
+#if defined (__XOP__) || defined (__FMA4__)
+#ifdef __GNUC__
+#include <x86intrin.h> // AMD XOP (Gnu)
+#else
+#include <ammintrin.h> // AMD XOP (Microsoft)
+#endif // __GNUC__
+#elif defined (__SSE4A__) // AMD SSE4A
+#include <ammintrin.h>
+#endif // __XOP__
+
+// FMA3 instruction set
+#if defined (__FMA__) && (defined(__GNUC__) || defined(__clang__)) && ! defined (__INTEL_COMPILER)
+#include <fmaintrin.h>
+#endif // __FMA__
+
+// FMA4 instruction set
+#if defined (__FMA4__) && (defined(__GNUC__) || defined(__clang__))
+#include <fma4intrin.h> // must have both x86intrin.h and fma4intrin.h, don't know why
+#endif // __FMA4__
+
+
+#include <stdint.h> // Define integer types with known size
+#include <stdlib.h> // define abs(int)
+
+#ifdef _MSC_VER // Microsoft compiler or compatible Intel compiler
+#include <intrin.h> // define _BitScanReverse(int), __cpuid(int[4],int), _xgetbv(int)
+#endif // _MSC_VER
+
+
+// functions in instrset_detect.cpp:
+#ifdef VCL_NAMESPACE
+namespace VCL_NAMESPACE {
+#endif
+ int instrset_detect(void); // tells which instruction sets are supported
+ bool hasFMA3(void); // true if FMA3 instructions supported
+ bool hasFMA4(void); // true if FMA4 instructions supported
+ bool hasXOP(void); // true if XOP instructions supported
+ bool hasAVX512ER(void); // true if AVX512ER instructions supported
+ bool hasAVX512VBMI(void); // true if AVX512VBMI instructions supported
+ bool hasAVX512VBMI2(void); // true if AVX512VBMI2 instructions supported
+#ifdef VCL_NAMESPACE
+}
+#endif
+
+// functions in physical_processors.cpp:
+int physicalProcessors(int * logical_processors = 0);
+
+
+// GCC version
+#if defined(__GNUC__) && !defined (GCC_VERSION) && !defined (__clang__)
+#define GCC_VERSION ((__GNUC__) * 10000 + (__GNUC_MINOR__) * 100 + (__GNUC_PATCHLEVEL__))
+#endif
+
+// Clang version
+#if defined (__clang__)
+#define CLANG_VERSION ((__clang_major__) * 10000 + (__clang_minor__) * 100 + (__clang_patchlevel__))
+// Problem: The version number is not consistent across platforms
+// http://llvm.org/bugs/show_bug.cgi?id=12643
+// Apple bug 18746972
+#endif
+
+// Fix problem with non-overloadable macros named min and max in WinDef.h
+#ifdef _MSC_VER
+#if defined (_WINDEF_) && defined(min) && defined(max)
+#undef min
+#undef max
+#endif
+#ifndef NOMINMAX
+#define NOMINMAX
+#endif
+
+// warning for poor support for AVX512F in MS compiler
+#ifndef __INTEL_COMPILER
+#if INSTRSET == 9
+#pragma message("Warning: MS compiler cannot generate code for AVX512F without AVX512DQ")
+#endif
+#if _MSC_VER < 1920 && INSTRSET > 8
+#pragma message("Warning: Your compiler has poor support for AVX512. Code may be erroneous.\nPlease use a newer compiler version or a different compiler!")
+#endif
+#endif // __INTEL_COMPILER
+#endif // _MSC_VER
+
+/* Intel compiler problem:
+The Intel compiler currently cannot compile version 2.00 of VCL. It seems to have
+a problem with constexpr function returns not being constant enough.
+*/
+#if defined(__INTEL_COMPILER) && __INTEL_COMPILER < 9999
+#error The Intel compiler version 19.00 cannot compile VCL version 2. Use Version 1.xx of VCL instead
+#endif
+
+/* Clang problem:
+The Clang compiler treats the intrinsic vector types __m128, __m128i, and __m128d as identical.
+See the bug report at https://bugs.llvm.org/show_bug.cgi?id=17164
+Additional problem: The version number is not consistent across platforms. The Apple build has
+different version numbers. We have to rely on __apple_build_version__ on the Mac platform:
+http://llvm.org/bugs/show_bug.cgi?id=12643
+We have to make switches here when - hopefully - the error some day has been fixed.
+We need different version checks with and without __apple_build_version__
+*/
+#if (defined (__clang__) || defined(__apple_build_version__)) && !defined(__INTEL_COMPILER)
+#define FIX_CLANG_VECTOR_ALIAS_AMBIGUITY
+#endif
+
+#if defined (GCC_VERSION) && GCC_VERSION < 99999 && !defined(__clang__)
+#define ZEXT_MISSING // Gcc 7.4.0 does not have _mm256_zextsi128_si256 and similar functions
+#endif
+
+
+#ifdef VCL_NAMESPACE
+namespace VCL_NAMESPACE {
+#endif
+
+// Constant for indicating don't care in permute and blend functions.
+// V_DC is -256 in Vector class library version 1.xx
+// V_DC can be any value less than -1 in Vector class library version 2.00
+constexpr int V_DC = -256;
+
+
+/*****************************************************************************
+*
+* Helper functions that depend on instruction set, compiler, or platform
+*
+*****************************************************************************/
+
+// Define interface to cpuid instruction.
+// input: functionnumber = leaf (eax), ecxleaf = subleaf(ecx)
+// output: output[0] = eax, output[1] = ebx, output[2] = ecx, output[3] = edx
+static inline void cpuid(int output[4], int functionnumber, int ecxleaf = 0) {
+#if defined(__GNUC__) || defined(__clang__) // use inline assembly, Gnu/AT&T syntax
+ int a, b, c, d;
+ __asm("cpuid" : "=a"(a), "=b"(b), "=c"(c), "=d"(d) : "a"(functionnumber), "c"(ecxleaf) : );
+ output[0] = a;
+ output[1] = b;
+ output[2] = c;
+ output[3] = d;
+
+#elif defined (_MSC_VER) // Microsoft compiler, intrin.h included
+ __cpuidex(output, functionnumber, ecxleaf); // intrinsic function for CPUID
+
+#else // unknown platform. try inline assembly with masm/intel syntax
+ __asm {
+ mov eax, functionnumber
+ mov ecx, ecxleaf
+ cpuid;
+ mov esi, output
+ mov[esi], eax
+ mov[esi + 4], ebx
+ mov[esi + 8], ecx
+ mov[esi + 12], edx
+ }
+#endif
+}
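+
+// Usage example: leaf 0 returns the highest supported standard leaf in output[0] and the
+// 12-byte vendor string in output[1], output[3], output[2] (e.g. "GenuineIntel"):
+//     int abcd[4];
+//     cpuid(abcd, 0);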
+
+
+// Define popcount function. Gives sum of bits
+#if INSTRSET >= 6 // SSE4.2
+// popcnt instruction is not officially part of the SSE4.2 instruction set,
+// but available in all known processors with SSE4.2
+static inline uint32_t vml_popcnt(uint32_t a) {
+ return (uint32_t)_mm_popcnt_u32(a); // Intel intrinsic. Supported by gcc and clang
+}
+#ifdef __x86_64__
+static inline int64_t vml_popcnt(uint64_t a) {
+ return _mm_popcnt_u64(a); // Intel intrinsic.
+}
+#else // 32 bit mode
+static inline int64_t vml_popcnt(uint64_t a) {
+ return _mm_popcnt_u32(uint32_t(a >> 32)) + _mm_popcnt_u32(uint32_t(a));
+}
+#endif
+#else // no SSE4.2
+static inline uint32_t vml_popcnt(uint32_t a) {
+ // popcnt instruction not available
+ uint32_t b = a - ((a >> 1) & 0x55555555);
+ uint32_t c = (b & 0x33333333) + ((b >> 2) & 0x33333333);
+ uint32_t d = (c + (c >> 4)) & 0x0F0F0F0F;
+ uint32_t e = d * 0x01010101;
+ return e >> 24;
+}
+
+static inline int32_t vml_popcnt(uint64_t a) {
+ return vml_popcnt(uint32_t(a >> 32)) + vml_popcnt(uint32_t(a));
+}
+
+#endif
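+
+// Example: vml_popcnt(0xF0F0u) == 8, vml_popcnt(uint64_t(1) << 63) == 1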
+
+// Define bit-scan-forward function. Gives index to lowest set bit
+#if defined (__GNUC__) || defined(__clang__)
+ // gcc and Clang have no bit_scan_forward intrinsic
+#if defined(__clang__) // fix clang bug
+ // Clang uses a k register as parameter a when inlined from horizontal_find_first
+__attribute__((noinline))
+#endif
+static uint32_t bit_scan_forward(uint32_t a) {
+ uint32_t r;
+ __asm("bsfl %1, %0" : "=r"(r) : "r"(a) : );
+ return r;
+}
+static inline uint32_t bit_scan_forward(uint64_t a) {
+ uint32_t lo = uint32_t(a);
+ if (lo) return bit_scan_forward(lo);
+ uint32_t hi = uint32_t(a >> 32);
+ return bit_scan_forward(hi) + 32;
+}
+
+#else // other compilers
+static inline uint32_t bit_scan_forward(uint32_t a) {
+ unsigned long r;
+ _BitScanForward(&r, a); // defined in intrin.h for MS and Intel compilers
+ return r;
+}
+#ifdef __x86_64__
+static inline uint32_t bit_scan_forward(uint64_t a) {
+ unsigned long r;
+ _BitScanForward64(&r, a); // defined in intrin.h for MS and Intel compilers
+ return (uint32_t)r;
+}
+#else
+static inline uint32_t bit_scan_forward(uint64_t a) {
+ uint32_t lo = uint32_t(a);
+ if (lo) return bit_scan_forward(lo);
+ uint32_t hi = uint32_t(a >> 32);
+ return bit_scan_forward(hi) + 32;
+}
+#endif
+#endif
+
+
+// Define bit-scan-reverse function. Gives index to highest set bit = floor(log2(a))
+#if defined (__GNUC__) || defined(__clang__)
+static inline uint32_t bit_scan_reverse(uint32_t a) __attribute__((pure));
+static inline uint32_t bit_scan_reverse(uint32_t a) {
+ uint32_t r;
+ __asm("bsrl %1, %0" : "=r"(r) : "r"(a) : );
+ return r;
+}
+#ifdef __x86_64__
+static inline uint32_t bit_scan_reverse(uint64_t a) {
+ uint64_t r;
+ __asm("bsrq %1, %0" : "=r"(r) : "r"(a) : );
+ return r;
+}
+#else // 32 bit mode
+static inline uint32_t bit_scan_reverse(uint64_t a) {
+ uint64_t ahi = a >> 32;
+ if (ahi == 0) return bit_scan_reverse(uint32_t(a));
+ else return bit_scan_reverse(uint32_t(ahi)) + 32;
+}
+#endif
+#else
+static inline uint32_t bit_scan_reverse(uint32_t a) {
+ unsigned long r;
+ _BitScanReverse(&r, a); // defined in intrin.h for MS and Intel compilers
+ return r;
+}
+#ifdef __x86_64__
+static inline uint32_t bit_scan_reverse(uint64_t a) {
+ unsigned long r;
+ _BitScanReverse64(&r, a); // defined in intrin.h for MS and Intel compilers
+ return r;
+}
+#else // 32 bit mode
+static inline uint32_t bit_scan_reverse(uint64_t a) {
+ uint64_t ahi = a >> 32;
+ if (ahi == 0) return bit_scan_reverse(uint32_t(a));
+ else return bit_scan_reverse(uint32_t(ahi)) + 32;
+}
+#endif
+#endif
+
+// Same function, for compile-time constants
+constexpr int bit_scan_reverse_const(uint64_t const n) {
+ if (n == 0) return -1;
+ uint64_t a = n, b = 0, j = 64, k = 0;
+ do {
+ j >>= 1;
+ k = (uint64_t)1 << j;
+ if (a >= k) {
+ a >>= j;
+ b += j;
+ }
+ } while (j > 0);
+ return int(b);
+}
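+
+// Illustrative compile-time checks: the result is the index of the highest set bit,
+// i.e. floor(log2(n)), or -1 when n == 0
+static_assert(bit_scan_reverse_const(1) == 0 && bit_scan_reverse_const(0x40) == 6
+    && bit_scan_reverse_const(0) == -1, "bit_scan_reverse_const examples");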
+
+
+/*****************************************************************************
+*
+* Common templates
+*
+*****************************************************************************/
+
+// Template class to represent compile-time integer constant
+template <int32_t n> class Const_int_t {}; // represent compile-time signed integer constant
+template <uint32_t n> class Const_uint_t {}; // represent compile-time unsigned integer constant
+#define const_int(n) (Const_int_t<n>()) // n must be compile-time integer constant
+#define const_uint(n) (Const_uint_t<n>()) // n must be compile-time unsigned integer constant
+
+
+// template for producing quiet NAN
+template <typename VTYPE>
+static inline VTYPE nan_vec(uint32_t payload = 0x100) {
+ if constexpr ((VTYPE::elementtype() & 1) != 0) { // double
+ union {
+ uint64_t q;
+ double f;
+ } ud;
+ // n is left justified to avoid loss of NAN payload when converting to float
+ ud.q = 0x7FF8000000000000 | uint64_t(payload) << 29;
+ return VTYPE(ud.f);
+ }
+ // float will be converted to double if necessary
+ union {
+ uint32_t i;
+ float f;
+ } uf;
+ uf.i = 0x7FC00000 | (payload & 0x003FFFFF);
+ return VTYPE(uf.f);
+}
+
+
+// Test if a parameter is a compile-time constant
+/* Unfortunately, this works only for macro parameters, not for inline function parameters.
+ I hope that some solution will appear in the future, but for now it appears to be
+ impossible to check if a function parameter is a compile-time constant.
+ This would be useful in operator / and in function pow:
+ #if defined(__GNUC__) || defined (__clang__)
+ #define is_constant(a) __builtin_constant_p(a)
+ #else
+ #define is_constant(a) false
+ #endif
+*/
+
+
+/*****************************************************************************
+*
+* Helper functions for permute and blend functions
+*
+******************************************************************************
+Rules for constexpr functions:
+
+> All variable declarations must include initialization
+
+> Do not put variable declarations inside a for-clause, e.g. avoid: for (int i=0; ..
+ Instead, you have to declare the loop counter before the for-loop.
+
+> Do not make constexpr functions that return vector types. This requires type
+ punning with a union, which is not allowed in constexpr functions under C++17.
+ It may be possible under C++20
+
+*****************************************************************************/
+
+// Define type for Encapsulated array to use as return type:
+template <typename T, int N>
+struct EList {
+ T a[N];
+};
+
+
+// get_inttype: get an integer of a size that matches the element size
+// of vector class V with the value -1
+template <typename V>
+constexpr auto get_inttype() {
+ constexpr int elementsize = sizeof(V) / V::size(); // size of vector elements
+
+ if constexpr (elementsize >= 8) {
+ return -int64_t(1);
+ }
+ else if constexpr (elementsize >= 4) {
+ return int32_t(-1);
+ }
+ else if constexpr (elementsize >= 2) {
+ return int16_t(-1);
+ }
+ else {
+ return int8_t(-1);
+ }
+}
+
+
+// zero_mask: return a compact bit mask for zeroing using AVX512 mask.
+// Parameter a is a reference to a constexpr int array of permutation indexes
+template <int N>
+constexpr auto zero_mask(int const (&a)[N]) {
+ uint64_t mask = 0;
+ int i = 0;
+
+ for (i = 0; i < N; i++) {
+ if (a[i] >= 0) mask |= uint64_t(1) << i;
+ }
+ if constexpr (N <= 8 ) return uint8_t(mask);
+ else if constexpr (N <= 16) return uint16_t(mask);
+ else if constexpr (N <= 32) return uint32_t(mask);
+ else return mask;
+}
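+
+// Example: for the permutation {0, -1, 2, -1} (N = 4), elements 0 and 2 are kept and the rest
+// are zeroed, so zero_mask returns 0b0101 = 0x05 as a uint8_t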
+
+
+// zero_mask_broad: return a broad byte mask for zeroing.
+// Parameter a is a reference to a constexpr int array of permutation indexes
+template <typename V>
+constexpr auto zero_mask_broad(int const (&A)[V::size()]) {
+ constexpr int N = V::size(); // number of vector elements
+ typedef decltype(get_inttype<V>()) Etype; // element type
+ EList <Etype, N> u = {{0}}; // list for return
+ int i = 0;
+ for (i = 0; i < N; i++) {
+ u.a[i] = A[i] >= 0 ? get_inttype<V>() : 0;
+ }
+ return u; // return encapsulated array
+}
+
+
+// make_bit_mask: return a compact mask of bits from a list of N indexes:
+// B contains options indicating how to gather the mask
+// bit 0-7 in B indicates which bit in each index to collect
+// bit 8 = 0x100: set 1 in the lower half of the bit mask if the indicated bit is 1.
+// bit 8 = 0 : set 1 in the lower half of the bit mask if the indicated bit is 0.
+// bit 9 = 0x200: set 1 in the upper half of the bit mask if the indicated bit is 1.
+// bit 9 = 0 : set 1 in the upper half of the bit mask if the indicated bit is 0.
+// bit 10 = 0x400: set 1 in the bit mask if the corresponding index is -1 or V_DC
+// Parameter a is a reference to a constexpr int array of permutation indexes
+template <int N, int B>
+constexpr uint64_t make_bit_mask(int const (&a)[N]) {
+ uint64_t r = 0; // return value
+ uint8_t j = uint8_t(B & 0xFF); // index to selected bit
+ uint64_t s = 0; // bit number i in r
+ uint64_t f = 0; // 1 if bit not flipped
+ int i = 0;
+ for (i = 0; i < N; i++) {
+ int ix = a[i];
+ if (ix < 0) { // -1 or V_DC
+ s = (B >> 10) & 1;
+ }
+ else {
+ s = ((uint32_t)ix >> j) & 1; // extract selected bit
+ if (i < N/2) {
+ f = (B >> 8) & 1; // lower half
+ }
+ else {
+ f = (B >> 9) & 1; // upper half
+ }
+ s ^= f ^ 1; // flip bit if needed
+ }
+ r |= uint64_t(s) << i; // set bit in return value
+ }
+ return r;
+}
+
+
+// make_broad_mask: Convert a bit mask m to a broad mask
+// The return value will be a broad boolean mask with elementsize matching vector class V
+template <typename V>
+constexpr auto make_broad_mask(uint64_t const m) {
+ constexpr int N = V::size(); // number of vector elements
+ typedef decltype(get_inttype<V>()) Etype; // element type
+ EList <Etype, N> u = {{0}}; // list for returning
+ int i = 0;
+ for (i = 0; i < N; i++) {
+ u.a[i] = ((m >> i) & 1) != 0 ? get_inttype<V>() : 0;
+ }
+ return u; // return encapsulated array
+}
+
+
+// perm_mask_broad: return a mask for permutation by a vector register index.
+// Parameter A is a reference to a constexpr int array of permutation indexes
+template <typename V>
+constexpr auto perm_mask_broad(int const (&A)[V::size()]) {
+ constexpr int N = V::size(); // number of vector elements
+ typedef decltype(get_inttype<V>()) Etype; // vector element type
+ EList <Etype, N> u = {{0}}; // list for returning
+ int i = 0;
+ for (i = 0; i < N; i++) {
+ u.a[i] = Etype(A[i]);
+ }
+ return u; // return encapsulated array
+}
+
+
+// perm_flags: returns information about how a permute can be implemented.
+// The return value is composed of these flag bits:
+const int perm_zeroing = 1; // needs zeroing
+const int perm_perm = 2; // permutation needed
+const int perm_allzero = 4; // all is zero or don't care
+const int perm_largeblock = 8; // fits permute with a larger block size (e.g permute Vec2q instead of Vec4i)
+const int perm_addz = 0x10; // additional zeroing needed after permute with larger block size or shift
+const int perm_addz2 = 0x20; // additional zeroing needed after perm_zext, perm_compress, or perm_expand
+const int perm_cross_lane = 0x40; // permutation crossing 128-bit lanes
+const int perm_same_pattern = 0x80; // same permute pattern in all 128-bit lanes
+const int perm_punpckh = 0x100; // permutation pattern fits punpckh instruction
+const int perm_punpckl = 0x200; // permutation pattern fits punpckl instruction
+const int perm_rotate = 0x400; // permutation pattern fits rotation within lanes. 4 bit count returned in bit perm_rot_count
+const int perm_shright = 0x1000; // permutation pattern fits shift right within lanes. 4 bit count returned in bit perm_rot_count
+const int perm_shleft = 0x2000; // permutation pattern fits shift left within lanes. negative count returned in bit perm_rot_count
+const int perm_rotate_big = 0x4000; // permutation pattern fits rotation across lanes. 6 bit count returned in bit perm_rot_count
+const int perm_broadcast = 0x8000; // permutation pattern fits broadcast of a single element.
+const int perm_zext = 0x10000; // permutation pattern fits zero extension
+const int perm_compress = 0x20000; // permutation pattern fits vpcompress instruction
+const int perm_expand = 0x40000; // permutation pattern fits vpexpand instruction
+const int perm_outofrange = 0x10000000; // index out of range
+const int perm_rot_count = 32; // rotate or shift count is in bits perm_rot_count to perm_rot_count+3
+const int perm_ipattern = 40; // pattern for pshufd is in bit perm_ipattern to perm_ipattern + 7 if perm_same_pattern and elementsize >= 4
+
+template <typename V>
+constexpr uint64_t perm_flags(int const (&a)[V::size()]) {
+ // a is a reference to a constexpr array of permutation indexes
+ // V is a vector class
+ constexpr int N = V::size(); // number of elements
+ uint64_t r = perm_largeblock | perm_same_pattern | perm_allzero; // return value
+ uint32_t i = 0; // loop counter
+ int j = 0; // loop counter
+ int ix = 0; // index number i
+ const uint32_t nlanes = sizeof(V) / 16; // number of 128-bit lanes
+ const uint32_t lanesize = N / nlanes; // elements per lane
+ const uint32_t elementsize = sizeof(V) / N; // size of each vector element
+ uint32_t lane = 0; // current lane
+ uint32_t rot = 999; // rotate left count
+ int32_t broadc = 999; // index to broadcasted element
+ uint32_t patfail = 0; // remember certain patterns that do not fit
+ uint32_t addz2 = 0; // remember certain patterns need extra zeroing
+ int32_t compresslasti = -1; // last index in perm_compress fit
+ int32_t compresslastp = -1; // last position in perm_compress fit
+ int32_t expandlasti = -1; // last index in perm_expand fit
+ int32_t expandlastp = -1; // last position in perm_expand fit
+
+ int lanepattern[lanesize] = {0}; // pattern in each lane
+
+ for (i = 0; i < N; i++) { // loop through indexes
+ ix = a[i]; // current index
+ // meaning of ix: -1 = set to zero, V_DC = don't care, non-negative value = permute.
+ if (ix == -1) {
+ r |= perm_zeroing; // zeroing requested
+ }
+ else if (ix != V_DC && uint32_t(ix) >= N) {
+ r |= perm_outofrange; // index out of range
+ }
+ if (ix >= 0) {
+ r &= ~ perm_allzero; // not all zero
+ if (ix != (int)i) r |= perm_perm; // needs permutation
+ if (broadc == 999) broadc = ix; // remember broadcast index
+ else if (broadc != ix) broadc = 1000; // does not fit broadcast
+ }
+ // check if pattern fits a larger block size:
+ // even indexes must be even, odd indexes must fit the preceding even index + 1
+ if ((i & 1) == 0) { // even index
+ if (ix >= 0 && (ix & 1)) r &= ~perm_largeblock;// not even. does not fit larger block size
+ int iy = a[i + 1]; // next odd index
+ if (iy >= 0 && (iy & 1) == 0) r &= ~ perm_largeblock; // not odd. does not fit larger block size
+ if (ix >= 0 && iy >= 0 && iy != ix+1) r &= ~ perm_largeblock; // does not fit preceding index + 1
+ if (ix == -1 && iy >= 0) r |= perm_addz; // needs additional zeroing at current block size
+ if (iy == -1 && ix >= 0) r |= perm_addz; // needs additional zeroing at current block size
+ }
+ lane = i / lanesize; // current lane
+ if (lane == 0) { // first lane, or no pattern yet
+ lanepattern[i] = ix; // save pattern
+ }
+ // check if crossing lanes
+ if (ix >= 0) {
+ uint32_t lanei = (uint32_t)ix / lanesize; // source lane
+ if (lanei != lane) r |= perm_cross_lane; // crossing lane
+ }
+ // check if same pattern in all lanes
+ if (lane != 0 && ix >= 0) { // not first lane
+ int j1 = i - int(lane * lanesize); // index into lanepattern
+ int jx = ix - int(lane * lanesize); // pattern within lane
+ if (jx < 0 || jx >= (int)lanesize) r &= ~perm_same_pattern; // source is in another lane
+ if (lanepattern[j1] < 0) {
+ lanepattern[j1] = jx; // pattern not known from previous lane
+ }
+ else {
+ if (lanepattern[j1] != jx) r &= ~perm_same_pattern; // not same pattern
+ }
+ }
+ if (ix >= 0) {
+ // check if pattern fits zero extension (perm_zext)
+ if (uint32_t(ix*2) != i) {
+ patfail |= 1; // does not fit zero extension
+ }
+ // check if pattern fits compress (perm_compress)
+ if (ix > compresslasti && ix - compresslasti >= (int)i - compresslastp) {
+ if ((int)i - compresslastp > 1) addz2 |= 2;// perm_compress may need additional zeroing
+ compresslasti = ix; compresslastp = i;
+ }
+ else {
+ patfail |= 2; // does not fit perm_compress
+ }
+ // check if pattern fits expand (perm_expand)
+ if (ix > expandlasti && ix - expandlasti <= (int)i - expandlastp) {
+ if (ix - expandlasti > 1) addz2 |= 4; // perm_expand may need additional zeroing
+ expandlasti = ix; expandlastp = i;
+ }
+ else {
+ patfail |= 4; // does not fit perm_expand
+ }
+ }
+ else if (ix == -1) {
+ if ((i & 1) == 0) addz2 |= 1; // zero extension needs additional zeroing
+ }
+ }
+ if (!(r & perm_perm)) return r; // more checks are superfluous
+
+ if (!(r & perm_largeblock)) r &= ~ perm_addz; // remove irrelevant flag
+ if (r & perm_cross_lane) r &= ~ perm_same_pattern; // remove irrelevant flag
+ if ((patfail & 1) == 0) {
+ r |= perm_zext; // fits zero extension
+ if ((addz2 & 1) != 0) r |= perm_addz2;
+ }
+ else if ((patfail & 2) == 0) {
+ r |= perm_compress; // fits compression
+ if ((addz2 & 2) != 0) { // check if additional zeroing needed
+ for (j = 0; j < compresslastp; j++) {
+ if (a[j] == -1) r |= perm_addz2;
+ }
+ }
+ }
+ else if ((patfail & 4) == 0) {
+ r |= perm_expand; // fits expansion
+ if ((addz2 & 4) != 0) { // check if additional zeroing needed
+ for (j = 0; j < expandlastp; j++) {
+ if (a[j] == -1) r |= perm_addz2;
+ }
+ }
+ }
+
+ if (r & perm_same_pattern) {
+ // same pattern in all lanes. check if it fits specific patterns
+ bool fit = true;
+ // fit shift or rotate
+ for (i = 0; i < lanesize; i++) {
+ if (lanepattern[i] >= 0) {
+ uint32_t rot1 = uint32_t(lanepattern[i] + lanesize - i) % lanesize;
+ if (rot == 999) {
+ rot = rot1;
+ }
+ else { // check if fit
+ if (rot != rot1) fit = false;
+ }
+ }
+ }
+ rot &= lanesize-1; // prevent out of range values
+ if (fit) { // fits rotate, and possibly shift
+ uint64_t rot2 = (rot * elementsize) & 0xF; // rotate right count in bytes
+ r |= rot2 << perm_rot_count; // put shift/rotate count in output bit 16-19
+#if INSTRSET >= 4 // SSSE3
+ r |= perm_rotate; // allow palignr
+#endif
+ // fit shift left
+ fit = true;
+ for (i = 0; i < lanesize-rot; i++) { // check if first rot elements are zero or don't care
+ if (lanepattern[i] >= 0) fit = false;
+ }
+ if (fit) {
+ r |= perm_shleft;
+ for (; i < lanesize; i++) if (lanepattern[i] == -1) r |= perm_addz; // additional zeroing needed
+ }
+ // fit shift right
+ fit = true;
+ for (i = lanesize-(uint32_t)rot; i < lanesize; i++) { // check if last (lanesize-rot) elements are zero or don't care
+ if (lanepattern[i] >= 0) fit = false;
+ }
+ if (fit) {
+ r |= perm_shright;
+ for (i = 0; i < lanesize-rot; i++) {
+ if (lanepattern[i] == -1) r |= perm_addz; // additional zeroing needed
+ }
+ }
+ }
+ // fit punpckhi
+ fit = true;
+ uint32_t j2 = lanesize / 2;
+ for (i = 0; i < lanesize; i++) {
+ if (lanepattern[i] >= 0 && lanepattern[i] != (int)j2) fit = false;
+ if ((i & 1) != 0) j2++;
+ }
+ if (fit) r |= perm_punpckh;
+ // fit punpcklo
+ fit = true;
+ j2 = 0;
+ for (i = 0; i < lanesize; i++) {
+ if (lanepattern[i] >= 0 && lanepattern[i] != (int)j2) fit = false;
+ if ((i & 1) != 0) j2++;
+ }
+ if (fit) r |= perm_punpckl;
+ // fit pshufd
+ if (elementsize >= 4) {
+ uint64_t p = 0;
+ for (i = 0; i < lanesize; i++) {
+ if (lanesize == 4) {
+ p |= (lanepattern[i] & 3) << 2 * i;
+ }
+ else { // lanesize = 2
+ p |= ((lanepattern[i] & 1) * 10 + 4) << 4 * i;
+ }
+ }
+ r |= p << perm_ipattern;
+ }
+ }
+#if INSTRSET >= 7
+ else { // not same pattern in all lanes
+ if constexpr (nlanes > 1) { // Try if it fits big rotate
+ for (i = 0; i < N; i++) {
+ ix = a[i];
+ if (ix >= 0) {
+ uint32_t rot2 = (ix + N - i) % N; // rotate count
+ if (rot == 999) {
+ rot = rot2; // save rotate count
+ }
+ else if (rot != rot2) {
+ rot = 1000; break; // does not fit big rotate
+ }
+ }
+ }
+ if (rot < N) { // fits big rotate
+ r |= perm_rotate_big | (uint64_t)rot << perm_rot_count;
+ }
+ }
+ }
+#endif
+ if (broadc < 999 && (r & (perm_rotate|perm_shright|perm_shleft|perm_rotate_big)) == 0) {
+ r |= perm_broadcast | (uint64_t)broadc << perm_rot_count; // fits broadcast
+ }
+ return r;
+}
+
+
+// compress_mask: returns a bit mask to use for compression instruction.
+// It is presupposed that perm_flags indicates perm_compress.
+// Additional zeroing is needed if perm_flags indicates perm_addz2
+template <int N>
+constexpr uint64_t compress_mask(int const (&a)[N]) {
+ // a is a reference to a constexpr array of permutation indexes
+ int ix = 0, lasti = -1, lastp = -1;
+ uint64_t m = 0;
+ int i = 0; int j = 1; // loop counters
+ for (i = 0; i < N; i++) {
+ ix = a[i]; // permutation index
+ if (ix >= 0) {
+ m |= (uint64_t)1 << ix; // mask for compression source
+ for (j = 1; j < i - lastp; j++) {
+ m |= (uint64_t)1 << (lasti + j); // dummy filling source
+ }
+ lastp = i; lasti = ix;
+ }
+ }
+ return m;
+}
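+
+// Example: the permutation {1, 3, -1, -1} (N = 4) gathers source elements 1 and 3 into
+// positions 0 and 1, so compress_mask returns 0b1010 = 0x0A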
+
+// expand_mask: returns a bit mask to use for expansion instruction.
+// It is presupposed that perm_flags indicates perm_expand.
+// Additional zeroing is needed if perm_flags indicates perm_addz2
+template <int N>
+constexpr uint64_t expand_mask(int const (&a)[N]) {
+ // a is a reference to a constexpr array of permutation indexes
+ int ix = 0, lasti = -1, lastp = -1;
+ uint64_t m = 0;
+ int i = 0; int j = 1;
+ for (i = 0; i < N; i++) {
+ ix = a[i]; // permutation index
+ if (ix >= 0) {
+ m |= (uint64_t)1 << i; // mask for expansion destination
+ for (j = 1; j < ix - lasti; j++) {
+ m |= (uint64_t)1 << (lastp + j); // dummy filling destination
+ }
+ lastp = i; lasti = ix;
+ }
+ }
+ return m;
+}
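+
+// Example: the permutation {-1, 0, -1, 1} (N = 4) expands source elements 0 and 1 into
+// destination positions 1 and 3, so expand_mask returns 0b1010 = 0x0A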
+
+// perm16_flags: returns information about how to permute a vector of 16-bit integers
+// Note: It is presupposed that perm_flags reports perm_same_pattern
+// The return value is composed of these bits:
+// 1: data from low 64 bits to low 64 bits. pattern in bit 32-39
+// 2: data from high 64 bits to high 64 bits. pattern in bit 40-47
+// 4: data from high 64 bits to low 64 bits. pattern in bit 48-55
+// 8: data from low 64 bits to high 64 bits. pattern in bit 56-63
+template <typename V>
+constexpr uint64_t perm16_flags(int const (&a)[V::size()]) {
+ // a is a reference to a constexpr array of permutation indexes
+ // V is a vector class
+ constexpr int N = V::size(); // number of elements
+
+ uint64_t retval = 0; // return value
+ uint32_t pat[4] = {0,0,0,0}; // permute patterns
+ uint32_t i = 0; // loop counter
+ int ix = 0; // index number i
+ const uint32_t lanesize = 8; // elements per lane
+ uint32_t lane = 0; // current lane
+ int lanepattern[lanesize] = {0}; // pattern in each lane
+
+ for (i = 0; i < N; i++) {
+ ix = a[i];
+ lane = i / lanesize; // current lane
+ if (lane == 0) {
+ lanepattern[i] = ix; // save pattern
+ }
+ else if (ix >= 0) { // not first lane
+ uint32_t j = i - lane * lanesize; // index into lanepattern
+ int jx = ix - lane * lanesize; // pattern within lane
+ if (lanepattern[j] < 0) {
+ lanepattern[j] = jx; // pattern not known from previous lane
+ }
+ }
+ }
+ // four patterns: low2low, high2high, high2low, low2high
+ for (i = 0; i < 4; i++) {
+ // loop through low pattern
+ if (lanepattern[i] >= 0) {
+ if (lanepattern[i] < 4) { // low2low
+ retval |= 1;
+ pat[0] |= uint32_t(lanepattern[i] & 3) << (2 * i);
+ }
+ else { // high2low
+ retval |= 4;
+ pat[2] |= uint32_t(lanepattern[i] & 3) << (2 * i);
+ }
+ }
+ // loop through high pattern
+ if (lanepattern[i+4] >= 0) {
+ if (lanepattern[i+4] < 4) { // low2high
+ retval |= 8;
+ pat[3] |= uint32_t(lanepattern[i+4] & 3) << (2 * i);
+ }
+ else { // high2high
+ retval |= 2;
+ pat[1] |= uint32_t(lanepattern[i+4] & 3) << (2 * i);
+ }
+ }
+ }
+ // join return data
+ for (i = 0; i < 4; i++) {
+ retval |= (uint64_t)pat[i] << (32 + i*8);
+ }
+ return retval;
+}
+
+
+// pshufb_mask: return a broad byte mask for permutation within lanes
+// for use with the pshufb instruction (_mm..._shuffle_epi8).
+// The pshufb instruction provides fast permutation and zeroing,
+// allowing different patterns in each lane but no crossing of lane boundaries
+template <typename V, int oppos = 0>
+constexpr auto pshufb_mask(int const (&A)[V::size()]) {
+ // Parameter a is a reference to a constexpr array of permutation indexes
+ // V is a vector class
+ // oppos = 1 for data from the opposite 128-bit lane in 256-bit vectors
+ constexpr uint32_t N = V::size(); // number of vector elements
+ constexpr uint32_t elementsize = sizeof(V) / N; // size of each vector element
+ constexpr uint32_t nlanes = sizeof(V) / 16; // number of 128 bit lanes in vector
+ constexpr uint32_t elements_per_lane = N / nlanes; // number of vector elements per lane
+
+ EList <int8_t, sizeof(V)> u = {{0}}; // list for returning
+
+ uint32_t i = 0; // loop counters
+ uint32_t j = 0;
+ int m = 0;
+ int k = 0;
+ uint32_t lane = 0;
+
+ for (lane = 0; lane < nlanes; lane++) { // loop through lanes
+ for (i = 0; i < elements_per_lane; i++) { // loop through elements in lane
+ // permutation index for element within lane
+ int8_t p = -1;
+ int ix = A[m];
+ if (ix >= 0) {
+ ix ^= oppos * elements_per_lane; // flip bit if opposite lane
+ }
+ ix -= int(lane * elements_per_lane); // index relative to lane
+ if (ix >= 0 && ix < (int)elements_per_lane) { // index points to desired lane
+ p = ix * elementsize;
+ }
+ for (j = 0; j < elementsize; j++) { // loop through bytes in element
+ u.a[k++] = p < 0 ? -1 : p + j; // store byte permutation index
+ }
+ m++;
+ }
+ }
+ return u; // return encapsulated array
+}
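+
+// Example: for a Vec4i permutation {1, 0, 3, 2} (element size 4, one lane) the returned byte
+// indexes are {4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11}; negative element indexes produce -1
+// bytes, which pshufb turns into zero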
+
+
+// largeblock_perm: return indexes for replacing a permute or blend with
+// a certain block size by a permute or blend with the double block size.
+// Note: it is presupposed that perm_flags() indicates perm_largeblock
+// It is required that additional zeroing is added if perm_flags() indicates perm_addz
+template <int N>
+constexpr EList<int, N/2> largeblock_perm(int const (&a)[N]) {
+ // Parameter a is a reference to a constexpr array of permutation indexes
+ EList<int, N/2> list = {{0}}; // result indexes
+ int ix = 0; // even index
+ int iy = 0; // odd index
+ int iz = 0; // combined index
+ bool fit_addz = false; // additional zeroing needed at the lower block level
+ int i = 0; // loop counter
+
+ // check if additional zeroing is needed at current block size
+ for (i = 0; i < N; i += 2) {
+ ix = a[i]; // even index
+ iy = a[i+1]; // odd index
+ if ((ix == -1 && iy >= 0) || (iy == -1 && ix >= 0)) {
+ fit_addz = true;
+ }
+ }
+
+ // loop through indexes
+ for (i = 0; i < N; i += 2) {
+ ix = a[i]; // even index
+ iy = a[i+1]; // odd index
+ if (ix >= 0) {
+ iz = ix / 2; // half index
+ }
+ else if (iy >= 0) {
+ iz = iy / 2;
+ }
+ else {
+ iz = ix | iy; // -1 or V_DC. -1 takes precedence
+ if (fit_addz) iz = V_DC; // V_DC, because result will be zeroed later
+ }
+ list.a[i/2] = iz; // save to list
+ }
+ return list;
+}
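+
+// Example: with N = 4, the index list {0, 1, 4, 5} becomes {0, 2} at the doubled block size,
+// and {2, 3, -1, -1} becomes {1, -1}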
+
+
+// blend_flags: returns information about how a blend function can be implemented
+// The return value is composed of these flag bits:
+const int blend_zeroing = 1; // needs zeroing
+const int blend_allzero = 2; // all is zero or don't care
+const int blend_largeblock = 4; // fits blend with a larger block size (e.g permute Vec2q instead of Vec4i)
+const int blend_addz = 8; // additional zeroing needed after blend with larger block size or shift
+const int blend_a = 0x10; // has data from a
+const int blend_b = 0x20; // has data from b
+const int blend_perma = 0x40; // permutation of a needed
+const int blend_permb = 0x80; // permutation of b needed
+const int blend_cross_lane = 0x100; // permutation crossing 128-bit lanes
+const int blend_same_pattern = 0x200; // same permute/blend pattern in all 128-bit lanes
+const int blend_punpckhab = 0x1000; // pattern fits punpckh(a,b)
+const int blend_punpckhba = 0x2000; // pattern fits punpckh(b,a)
+const int blend_punpcklab = 0x4000; // pattern fits punpckl(a,b)
+const int blend_punpcklba = 0x8000; // pattern fits punpckl(b,a)
+const int blend_rotateab = 0x10000; // pattern fits palignr(a,b)
+const int blend_rotateba = 0x20000; // pattern fits palignr(b,a)
+const int blend_shufab = 0x40000; // pattern fits shufps/shufpd(a,b)
+const int blend_shufba = 0x80000; // pattern fits shufps/shufpd(b,a)
+const int blend_rotate_big = 0x100000; // pattern fits rotation across lanes. count returned in bits blend_rotpattern
+const int blend_outofrange= 0x10000000; // index out of range
+const int blend_shufpattern = 32; // pattern for shufps/shufpd is in bit blend_shufpattern to blend_shufpattern + 7
+const int blend_rotpattern = 40; // pattern for palignr is in bit blend_rotpattern to blend_rotpattern + 7
+
+template <typename V>
+constexpr uint64_t blend_flags(int const (&a)[V::size()]) {
+ // a is a reference to a constexpr array of permutation indexes
+ // V is a vector class
+ constexpr int N = V::size(); // number of elements
+ uint64_t r = blend_largeblock | blend_same_pattern | blend_allzero; // return value
+ uint32_t iu = 0; // loop counter
+ int32_t ii = 0; // loop counter
+ int ix = 0; // index number i
+ const uint32_t nlanes = sizeof(V) / 16; // number of 128-bit lanes
+ const uint32_t lanesize = N / nlanes; // elements per lane
+ uint32_t lane = 0; // current lane
+ uint32_t rot = 999; // rotate left count
+ int lanepattern[lanesize] = {0}; // pattern in each lane
+ if (lanesize == 2 && N <= 8) {
+ r |= blend_shufab | blend_shufba; // check if it fits shufpd
+ }
+
+ for (ii = 0; ii < N; ii++) { // loop through indexes
+ ix = a[ii]; // index
+ if (ix < 0) {
+ if (ix == -1) r |= blend_zeroing; // set to zero
+ else if (ix != V_DC) {
+ r = blend_outofrange; break; // illegal index
+ }
+ }
+ else { // ix >= 0
+ r &= ~ blend_allzero;
+ if (ix < N) {
+ r |= blend_a; // data from a
+ if (ix != ii) r |= blend_perma; // permutation of a
+ }
+ else if (ix < 2*N) {
+ r |= blend_b; // data from b
+ if (ix != ii + N) r |= blend_permb; // permutation of b
+ }
+ else {
+ r = blend_outofrange; break; // illegal index
+ }
+ }
+ // check if pattern fits a larger block size:
+ // even indexes must be even, odd indexes must fit the preceding even index + 1
+ if ((ii & 1) == 0) { // even index
+ if (ix >= 0 && (ix&1)) r &= ~blend_largeblock; // not even. does not fit larger block size
+ int iy = a[ii+1]; // next odd index
+ if (iy >= 0 && (iy & 1) == 0) r &= ~ blend_largeblock; // not odd. does not fit larger block size
+ if (ix >= 0 && iy >= 0 && iy != ix+1) r &= ~ blend_largeblock; // does not fit preceding index + 1
+ if (ix == -1 && iy >= 0) r |= blend_addz; // needs additional zeroing at current block size
+ if (iy == -1 && ix >= 0) r |= blend_addz; // needs additional zeroing at current block size
+ }
+ lane = (uint32_t)ii / lanesize; // current lane
+ if (lane == 0) { // first lane, or no pattern yet
+ lanepattern[ii] = ix; // save pattern
+ }
+ // check if crossing lanes
+ if (ix >= 0) {
+ uint32_t lanei = uint32_t(ix & ~N) / lanesize; // source lane
+ if (lanei != lane) {
+ r |= blend_cross_lane; // crossing lane
+ }
+ if (lanesize == 2) { // check if it fits shufpd
+ if (lanei != lane) r &= ~(blend_shufab | blend_shufba);
+ if ((((ix & N) != 0) ^ ii) & 1) r &= ~blend_shufab;
+ else r &= ~blend_shufba;
+ }
+ }
+ // check if same pattern in all lanes
+ if (lane != 0 && ix >= 0) { // not first lane
+ int j = ii - int(lane * lanesize); // index into lanepattern
+ int jx = ix - int(lane * lanesize); // pattern within lane
+ if (jx < 0 || (jx & ~N) >= (int)lanesize) r &= ~blend_same_pattern; // source is in another lane
+ if (lanepattern[j] < 0) {
+ lanepattern[j] = jx; // pattern not known from previous lane
+ }
+ else {
+ if (lanepattern[j] != jx) r &= ~blend_same_pattern; // not same pattern
+ }
+ }
+ }
+ if (!(r & blend_largeblock)) r &= ~ blend_addz; // remove irrelevant flag
+ if (r & blend_cross_lane) r &= ~ blend_same_pattern; // remove irrelevant flag
+ if (!(r & (blend_perma | blend_permb))) {
+ return r; // no permutation. more checks are superfluous
+ }
+ if (r & blend_same_pattern) {
+ // same pattern in all lanes. check if it fits unpack patterns
+ r |= blend_punpckhab | blend_punpckhba | blend_punpcklab | blend_punpcklba;
+ for (iu = 0; iu < lanesize; iu++) { // loop through lanepattern
+ ix = lanepattern[iu];
+ if (ix >= 0) {
+ if ((uint32_t)ix != iu / 2 + (iu & 1) * N) r &= ~ blend_punpcklab;
+ if ((uint32_t)ix != iu / 2 + ((iu & 1) ^ 1) * N) r &= ~ blend_punpcklba;
+ if ((uint32_t)ix != (iu + lanesize) / 2 + (iu & 1) * N) r &= ~ blend_punpckhab;
+ if ((uint32_t)ix != (iu + lanesize) / 2 + ((iu & 1) ^ 1) * N) r &= ~ blend_punpckhba;
+ }
+ }
+#if INSTRSET >= 4 // SSSE3. check if it fits palignr
+ for (iu = 0; iu < lanesize; iu++) {
+ ix = lanepattern[iu];
+ if (ix >= 0) {
+ uint32_t t = ix & ~N;
+ if (ix & N) t += lanesize;
+ uint32_t tb = (t + 2*lanesize - iu) % (lanesize * 2);
+ if (rot == 999) {
+ rot = tb;
+ }
+ else { // check if fit
+ if (rot != tb) rot = 1000;
+ }
+ }
+ }
+ if (rot < 999) { // fits palignr
+ if (rot < lanesize) {
+ r |= blend_rotateba;
+ }
+ else {
+ r |= blend_rotateab;
+ }
+ const uint32_t elementsize = sizeof(V) / N;
+ r |= uint64_t((rot & (lanesize - 1)) * elementsize) << blend_rotpattern;
+ }
+#endif
+ if (lanesize == 4) {
+ // check if it fits shufps
+ r |= blend_shufab | blend_shufba;
+ for (ii = 0; ii < 2; ii++) {
+ ix = lanepattern[ii];
+ if (ix >= 0) {
+ if (ix & N) r &= ~ blend_shufab;
+ else r &= ~ blend_shufba;
+ }
+ }
+ for (; ii < 4; ii++) {
+ ix = lanepattern[ii];
+ if (ix >= 0) {
+ if (ix & N) r &= ~ blend_shufba;
+ else r &= ~ blend_shufab;
+ }
+ }
+ if (r & (blend_shufab | blend_shufba)) { // fits shufps/shufpd
+ uint8_t shufpattern = 0; // get pattern
+ for (iu = 0; iu < lanesize; iu++) {
+ shufpattern |= (lanepattern[iu] & 3) << iu * 2;
+ }
+ r |= (uint64_t)shufpattern << blend_shufpattern; // return pattern
+ }
+ }
+ }
+ else if (nlanes > 1) { // not same pattern in all lanes
+ rot = 999; // check if it fits big rotate
+ for (ii = 0; ii < N; ii++) {
+ ix = a[ii];
+ if (ix >= 0) {
+ uint32_t rot2 = (ix + 2 * N - ii) % (2 * N);// rotate count
+ if (rot == 999) {
+ rot = rot2; // save rotate count
+ }
+ else if (rot != rot2) {
+ rot = 1000; break; // does not fit big rotate
+ }
+ }
+ }
+ if (rot < 2 * N) { // fits big rotate
+ r |= blend_rotate_big | (uint64_t)rot << blend_rotpattern;
+ }
+ }
+ if (lanesize == 2 && (r & (blend_shufab | blend_shufba))) { // fits shufpd. Get pattern
+ for (ii = 0; ii < N; ii++) {
+ r |= uint64_t(a[ii] & 1) << (blend_shufpattern + ii);
+ }
+ }
+ return r;
+}
+
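+// Illustrative use (the Vec4f class comes from vectorf128.h): with a four-element vector type
+// and the constexpr index pattern { 0, 5, 2, 7 }, blend_flags reports blend_a and blend_b but
+// neither blend_perma nor blend_permb, i.e. the pattern is a plain element-wise blend that needs
+// no permutation. Patterns that do fit shufps/shufpd additionally return the shuffle immediate
+// in the bits starting at blend_shufpattern.
+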
+// blend_perm_indexes: return an Indexlist for implementing a blend function as
+// two permutations. N = vector size.
+// dozero = 0: let unused elements be don't care. The two permutation results must be blended
+// dozero = 1: zero unused elements in each permutation. The two permutation results can be OR'ed
+// dozero = 2: indexes that are -1 or V_DC are preserved
+template <int N, int dozero>
+constexpr EList<int, 2*N> blend_perm_indexes(int const (&a)[N]) {
+ // a is a reference to a constexpr array of permutation indexes
+ EList<int, 2*N> list = {{0}}; // list to return
+ int u = dozero ? -1 : V_DC; // value to use for unused entries
+ int j = 0;
+
+ for (j = 0; j < N; j++) { // loop through indexes
+ int ix = a[j]; // current index
+ if (ix < 0) { // zero or don't care
+ if (dozero == 2) {
+ // list.a[j] = list.a[j + N] = ix; // fails in gcc in complicated cases
+ list.a[j] = ix;
+ list.a[j + N] = ix;
+ }
+ else {
+ // list.a[j] = list.a[j + N] = u;
+ list.a[j] = u;
+ list.a[j + N] = u;
+ }
+ }
+ else if (ix < N) { // value from a
+ list.a[j] = ix;
+ list.a[j+N] = u;
+ }
+ else {
+ list.a[j] = u; // value from b
+ list.a[j+N] = ix - N;
+ }
+ }
+ return list;
+}
+
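+// For example, applying blend_perm_indexes<4, 1> to the pattern { 0, 5, -1, 7 } yields
+// { 0, -1, -1, -1, -1, 1, -1, 3 }: the first four entries permute a (only element 0 is used),
+// the last four permute b (elements 1 and 3), and with dozero = 1 every unused position is -1,
+// so the two permutation results can simply be OR'ed together.
+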
+// largeblock_indexes: return indexes for replacing a permute or blend with a
+// certain block size by a permute or blend with the double block size.
+// Note: it is presupposed that perm_flags or blend_flags indicates _largeblock
+// It is required that additional zeroing is added if perm_flags or blend_flags
+// indicates _addz
+template <int N>
+constexpr EList<int, N/2> largeblock_indexes(int const (&a)[N]) {
+ // Parameter a is a reference to a constexpr array of N permutation indexes
+ EList<int, N/2> list = {{0}}; // list to return
+
+ bool fit_addz = false; // additional zeroing needed at the lower block level
+ int ix = 0; // even index
+ int iy = 0; // odd index
+ int iz = 0; // combined index
+ int i = 0; // loop counter
+
+ for (i = 0; i < N; i += 2) {
+ ix = a[i]; // even index
+ iy = a[i+1]; // odd index
+ if (ix >= 0) {
+ iz = ix / 2; // half index
+ }
+ else if (iy >= 0) {
+ iz = iy / 2; // half index
+ }
+ else iz = ix | iy; // -1 or V_DC. -1 takes precedence
+ list.a[i/2] = iz; // save to list
+ // check if additional zeroing is needed at current block size
+ if ((ix == -1 && iy >= 0) || (iy == -1 && ix >= 0)) {
+ fit_addz = true;
+ }
+ }
+ // replace -1 by V_DC if fit_addz
+ if (fit_addz) {
+ for (i = 0; i < N/2; i++) {
+ if (list.a[i] < 0) list.a[i] = V_DC;
+ }
+ }
+ return list;
+}
+
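+// Worked example: for the pattern { 2, 3, -1, 6 } the double-size indexes are { 1, 3 }.
+// Because element 2 is -1 while its partner element 3 is not, the caller must still zero
+// element 2 at the original block size afterwards (the _addz case mentioned above).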
+
+/****************************************************************************************
+*
+* Vector blend helper function templates
+*
+* These templates are for emulating a blend with a vector size that is not supported by
+* the instruction set, using multiple blends or permutations of half the vector size
+*
+****************************************************************************************/
+
+// Make dummy blend function templates to avoid error messages when the blend functions are not yet defined
+template <typename dummy> void blend2(){}
+template <typename dummy> void blend4(){}
+template <typename dummy> void blend8(){}
+template <typename dummy> void blend16(){}
+template <typename dummy> void blend32(){}
+
+// blend_half_indexes: return an Indexlist for emulating a blend function as
+// blends or permutations from multiple sources
+// dozero = 0: let unused elements be don't care. Multiple permutation results must be blended
+// dozero = 1: zero unused elements in each permutation. Multiple permutation results can be OR'ed
+// dozero = 2: indexes that are -1 or V_DC are preserved
+// src1, src2: sources to blend in a partial implementation
+template <int N, int dozero, int src1, int src2>
+constexpr EList<int, N> blend_half_indexes(int const (&a)[N]) {
+ // a is a reference to a constexpr array of permutation indexes
+ EList<int, N> list = {{0}}; // list to return
+ int u = dozero ? -1 : V_DC; // value to use for unused entries
+ int j = 0; // loop counter
+
+ for (j = 0; j < N; j++) { // loop through indexes
+ int ix = a[j]; // current index
+ if (ix < 0) { // zero or don't care
+ list.a[j] = (dozero == 2) ? ix : u;
+ }
+ else {
+ int src = ix / N; // source
+ if (src == src1) {
+ list.a[j] = ix & (N - 1);
+ }
+ else if (src == src2) {
+ list.a[j] = (ix & (N - 1)) + N;
+ }
+ else list.a[j] = u;
+ }
+ }
+ return list;
+}
+
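+// For example, with the pattern { 1, 6, 9, 12 } (N = 4, one index from each of the four
+// sources) and dozero = 1, blend_half_indexes<4, 1, 0, 1> returns { 1, 6, -1, -1 } and
+// blend_half_indexes<4, 1, 2, 3> returns { -1, -1, 1, 4 }; the two partial blends can then
+// be OR'ed because each zeroes the positions it does not cover.
+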
+// selectblend: select one of four sources for blending
+template <int s, typename W>
+static inline auto selectblend(W const a, W const b) {
+ if constexpr (s == 0) return a.get_low();
+ else if constexpr (s == 1) return a.get_high();
+ else if constexpr (s == 2) return b.get_low();
+ else return b.get_high();
+}
+
+// blend_half: Emulate a blend with a vector size that is not supported
+// by multiple blends with half the vector size.
+// blend_half is called twice, to give the low and high half of the result
+// Parameters: W: type of full-size vector
+// i0...: indexes for low or high half
+// a, b: full size input vectors
+// return value: half-size vector for lower or upper part
+template <typename W, int ... i0>
+auto blend_half(W const& a, W const& b) {
+ typedef decltype(a.get_low()) V; // type for half-size vector
+ constexpr int N = V::size(); // size of half-size vector
+ static_assert(sizeof...(i0) == N, "wrong number of indexes in blend_half");
+ constexpr int ind[N] = { i0... }; // array of indexes
+
+ // lambda to find which of the four possible sources are used
+ // return: EList containing a list of up to 4 sources. The last element is the number of sources used
+ auto listsources = [](int const n, int const (&ind)[N]) constexpr {
+ bool source_used[4] = { false,false,false,false }; // list of sources used
+ int i = 0;
+ for (i = 0; i < n; i++) {
+ int ix = ind[i]; // index
+ if (ix >= 0) {
+ int src = ix / n; // source used
+ source_used[src & 3] = true;
+ }
+ }
+ // return a list of sources used. The last element is the number of sources used
+ EList<int, 5> sources = {{0}};
+ int nsrc = 0; // number of sources
+ for (i = 0; i < 4; i++) {
+ if (source_used[i]) {
+ sources.a[nsrc++] = i;
+ }
+ }
+ sources.a[4] = nsrc;
+ return sources;
+ };
+ // list of sources used
+ constexpr EList<int, 5> sources = listsources(N, ind);
+ constexpr int nsrc = sources.a[4]; // number of sources used
+
+ if constexpr (nsrc == 0) { // no sources
+ return V(0);
+ }
+ // get indexes for the first one or two sources
+ constexpr int uindex = (nsrc > 2) ? 1 : 2; // unused elements set to zero if two blends are combined
+ constexpr EList<int, N> L = blend_half_indexes<N, uindex, sources.a[0], sources.a[1]>(ind);
+ V x0;
+ V src0 = selectblend<sources.a[0], W>(a, b); // first source
+ V src1 = selectblend<sources.a[1], W>(a, b); // second source
+ if constexpr (N == 2) {
+ x0 = blend2 <L.a[0], L.a[1]> (src0, src1);
+ }
+ else if constexpr (N == 4) {
+ x0 = blend4 <L.a[0], L.a[1], L.a[2], L.a[3]> (src0, src1);
+ }
+ else if constexpr (N == 8) {
+ x0 = blend8 <L.a[0], L.a[1], L.a[2], L.a[3], L.a[4], L.a[5], L.a[6], L.a[7]> (src0, src1);
+ }
+ else if constexpr (N == 16) {
+ x0 = blend16 <L.a[0], L.a[1], L.a[2], L.a[3], L.a[4], L.a[5], L.a[6], L.a[7], L.a[8], L.a[9], L.a[10], L.a[11], L.a[12], L.a[13], L.a[14], L.a[15]> (src0, src1);
+ }
+ else if constexpr (N == 32) {
+ x0 = blend32 <L.a[0], L.a[1], L.a[2], L.a[3], L.a[4], L.a[5], L.a[6], L.a[7], L.a[8], L.a[9], L.a[10], L.a[11], L.a[12], L.a[13], L.a[14], L.a[15], L.a[16], L.a[17], L.a[18], L.a[19], L.a[20], L.a[21], L.a[22], L.a[23], L.a[24], L.a[25], L.a[26], L.a[27], L.a[28], L.a[29], L.a[30], L.a[31]> (src0, src1);
+ }
+ if constexpr (nsrc > 2) { // get last one or two sources
+ constexpr EList<int, N> M = blend_half_indexes<N, uindex, sources.a[2], sources.a[3]>(ind);
+ V x1;
+ V src2 = selectblend<sources.a[2], W>(a, b); // third source
+ V src3 = selectblend<sources.a[3], W>(a, b); // fourth source
+ if constexpr (N == 2) {
+ x1 = blend2 <M.a[0], M.a[1]> (src2, src3);
+ }
+ else if constexpr (N == 4) {
+ x1 = blend4 <M.a[0], M.a[1], M.a[2], M.a[3]> (src2, src3);
+ }
+ else if constexpr (N == 8) {
+ x1 = blend8 <M.a[0], M.a[1], M.a[2], M.a[3], M.a[4], M.a[5], M.a[6], M.a[7]> (src2, src3);
+ }
+ else if constexpr (N == 16) {
+ x1 = blend16 <M.a[0], M.a[1], M.a[2], M.a[3], M.a[4], M.a[5], M.a[6], M.a[7], M.a[8], M.a[9], M.a[10], M.a[11], M.a[12], M.a[13], M.a[14], M.a[15]> (src2, src3);
+ }
+ else if constexpr (N == 32) {
+ x1 = blend32 <M.a[0], M.a[1], M.a[2], M.a[3], M.a[4], M.a[5], M.a[6], M.a[7], M.a[8], M.a[9], M.a[10], M.a[11], M.a[12], M.a[13], M.a[14], M.a[15], M.a[16], M.a[17], M.a[18], M.a[19], M.a[20], M.a[21], M.a[22], M.a[23], M.a[24], M.a[25], M.a[26], M.a[27], M.a[28], M.a[29], M.a[30], M.a[31]> (src2, src3);
+ }
+ x0 |= x1; // combine result of two blends. Unused elements are zero
+ }
+ return x0;
+}
+
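+// Typical use (sketch): an emulated 256-bit blend builds its result from two half-size calls,
+// roughly as
+// Vec8i blend8_emulated(Vec8i const a, Vec8i const b) {
+// return Vec8i(blend_half<Vec8i, i0, i1, i2, i3>(a, b), // low half of the result
+// blend_half<Vec8i, i4, i5, i6, i7>(a, b)); // high half of the result
+// }
+// where i0...i7 are the template indexes of the full-size blend. The function name above is
+// only illustrative of how the emulated vector classes call blend_half.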
+
+#ifdef VCL_NAMESPACE
+}
+#endif
+
+
+#endif // INSTRSET_H
diff --git a/Bwdif/VCL2/instrset_detect.cpp b/Bwdif/VCL2/instrset_detect.cpp
new file mode 100644
index 0000000..119661b
--- /dev/null
+++ b/Bwdif/VCL2/instrset_detect.cpp
@@ -0,0 +1,167 @@
+/************************** instrset_detect.cpp ****************************
+* Author: Agner Fog
+* Date created: 2012-05-30
+* Last modified: 2019-08-01
+* Version: 2.00.00
+* Project: vector class library
+* Description:
+* Functions for checking which instruction sets are supported.
+*
+* (c) Copyright 2012-2019 Agner Fog.
+* Apache License version 2.0 or later.
+******************************************************************************/
+
+#include "instrset.h"
+
+#ifdef VCL_NAMESPACE
+namespace VCL_NAMESPACE {
+#endif
+
+
+// Define interface to xgetbv instruction
+static inline uint64_t xgetbv (int ctr) {
+#if (defined (_MSC_FULL_VER) && _MSC_FULL_VER >= 160040000) || (defined (__INTEL_COMPILER) && __INTEL_COMPILER >= 1200)
+ // Microsoft or Intel compiler supporting _xgetbv intrinsic
+
+ return uint64_t(_xgetbv(ctr)); // intrinsic function for XGETBV
+
+#elif defined(__GNUC__) || defined (__clang__) // use inline assembly, Gnu/AT&T syntax
+
+ uint32_t a, d;
+ __asm("xgetbv" : "=a"(a),"=d"(d) : "c"(ctr) : );
+ return a | (uint64_t(d) << 32);
+
+#else // #elif defined (_WIN32) // other compiler. try inline assembly with masm/intel/MS syntax
+ uint32_t a, d;
+ __asm {
+ mov ecx, ctr
+ _emit 0x0f
+ _emit 0x01
+ _emit 0xd0 ; // xgetbv
+ mov a, eax
+ mov d, edx
+ }
+ return a | (uint64_t(d) << 32);
+
+#endif
+}
+
+/* find supported instruction set
+ return value:
+ 0 = 80386 instruction set
+ 1 or above = SSE (XMM) supported by CPU (not testing for OS support)
+ 2 or above = SSE2
+ 3 or above = SSE3
+ 4 or above = Supplementary SSE3 (SSSE3)
+ 5 or above = SSE4.1
+ 6 or above = SSE4.2
+ 7 or above = AVX supported by CPU and operating system
+ 8 or above = AVX2
+ 9 or above = AVX512F
+ 10 or above = AVX512VL, AVX512BW, AVX512DQ
+*/
+int instrset_detect(void) {
+
+ static int iset = -1; // remember value for next call
+ if (iset >= 0) {
+ return iset; // called before
+ }
+ iset = 0; // default value
+ int abcd[4] = {0,0,0,0}; // cpuid results
+ cpuid(abcd, 0); // call cpuid function 0
+ if (abcd[0] == 0) return iset; // no further cpuid function supported
+ cpuid(abcd, 1); // call cpuid function 1 for feature flags
+ if ((abcd[3] & (1 << 0)) == 0) return iset; // no floating point
+ if ((abcd[3] & (1 << 23)) == 0) return iset; // no MMX
+ if ((abcd[3] & (1 << 15)) == 0) return iset; // no conditional move
+ if ((abcd[3] & (1 << 24)) == 0) return iset; // no FXSAVE
+ if ((abcd[3] & (1 << 25)) == 0) return iset; // no SSE
+ iset = 1; // 1: SSE supported
+ if ((abcd[3] & (1 << 26)) == 0) return iset; // no SSE2
+ iset = 2; // 2: SSE2 supported
+ if ((abcd[2] & (1 << 0)) == 0) return iset; // no SSE3
+ iset = 3; // 3: SSE3 supported
+ if ((abcd[2] & (1 << 9)) == 0) return iset; // no SSSE3
+ iset = 4; // 4: SSSE3 supported
+ if ((abcd[2] & (1 << 19)) == 0) return iset; // no SSE4.1
+ iset = 5; // 5: SSE4.1 supported
+ if ((abcd[2] & (1 << 23)) == 0) return iset; // no POPCNT
+ if ((abcd[2] & (1 << 20)) == 0) return iset; // no SSE4.2
+ iset = 6; // 6: SSE4.2 supported
+ if ((abcd[2] & (1 << 27)) == 0) return iset; // no OSXSAVE
+ if ((xgetbv(0) & 6) != 6) return iset; // AVX not enabled in O.S.
+ if ((abcd[2] & (1 << 28)) == 0) return iset; // no AVX
+ iset = 7; // 7: AVX supported
+ cpuid(abcd, 7); // call cpuid leaf 7 for feature flags
+ if ((abcd[1] & (1 << 5)) == 0) return iset; // no AVX2
+ iset = 8;
+ if ((abcd[1] & (1 << 16)) == 0) return iset; // no AVX512
+ cpuid(abcd, 0xD); // call cpuid leaf 0xD for feature flags
+ if ((abcd[0] & 0x60) != 0x60) return iset; // no AVX512
+ iset = 9;
+ cpuid(abcd, 7); // call cpuid leaf 7 for feature flags
+ if ((abcd[1] & (1 << 31)) == 0) return iset; // no AVX512VL
+ if ((abcd[1] & 0x40020000) != 0x40020000) return iset; // no AVX512BW, AVX512DQ
+ iset = 10;
+ return iset;
+}
+
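+// Typical use (sketch): call instrset_detect() once and dispatch to the best available code
+// path. The init_* function names below are only illustrative:
+// const int level = instrset_detect();
+// if (level >= 10) init_avx512(); // AVX512F/VL/BW/DQ
+// else if (level >= 8) init_avx2();
+// else if (level >= 2) init_sse2();
+// else init_scalar();
+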
+// detect if CPU supports the FMA3 instruction set
+bool hasFMA3(void) {
+ if (instrset_detect() < 7) return false; // must have AVX
+ int abcd[4]; // cpuid results
+ cpuid(abcd, 1); // call cpuid function 1
+ return ((abcd[2] & (1 << 12)) != 0); // ecx bit 12 indicates FMA3
+}
+
+// detect if CPU supports the FMA4 instruction set
+bool hasFMA4(void) {
+ if (instrset_detect() < 7) return false; // must have AVX
+ int abcd[4]; // cpuid results
+ cpuid(abcd, 0x80000001); // call cpuid function 0x80000001
+ return ((abcd[2] & (1 << 16)) != 0); // ecx bit 16 indicates FMA4
+}
+
+// detect if CPU supports the XOP instruction set
+bool hasXOP(void) {
+ if (instrset_detect() < 7) return false; // must have AVX
+ int abcd[4]; // cpuid results
+ cpuid(abcd, 0x80000001); // call cpuid function 0x80000001
+ return ((abcd[2] & (1 << 11)) != 0); // ecx bit 11 indicates XOP
+}
+
+// detect if CPU supports the F16C instruction set
+bool hasF16C(void) {
+ if (instrset_detect() < 7) return false; // must have AVX
+ int abcd[4]; // cpuid results
+ cpuid(abcd, 1); // call cpuid function 1
+ return ((abcd[2] & (1 << 29)) != 0); // ecx bit 29 indicates F16C
+}
+
+// detect if CPU supports the AVX512ER instruction set
+bool hasAVX512ER(void) {
+ if (instrset_detect() < 9) return false; // must have AVX512F
+ int abcd[4]; // cpuid results
+ cpuid(abcd, 7); // call cpuid function 7
+ return ((abcd[1] & (1 << 27)) != 0); // ebx bit 27 indicates AVX512ER
+}
+
+// detect if CPU supports the AVX512VBMI instruction set
+bool hasAVX512VBMI(void) {
+ if (instrset_detect() < 10) return false; // must have AVX512BW
+ int abcd[4]; // cpuid results
+ cpuid(abcd, 7); // call cpuid function 7
+ return ((abcd[2] & (1 << 1)) != 0); // ecx bit 1 indicates AVX512VBMI
+}
+
+// detect if CPU supports the AVX512VBMI2 instruction set
+bool hasAVX512VBMI2(void) {
+ if (instrset_detect() < 10) return false; // must have AVX512BW
+ int abcd[4]; // cpuid results
+ cpuid(abcd, 7); // call cpuid function 7
+ return ((abcd[2] & (1 << 6)) != 0); // ecx bit 6 indicates AVX512VBMI2
+}
+
+#ifdef VCL_NAMESPACE
+}
+#endif
diff --git a/Bwdif/VCL2/vector_convert.h b/Bwdif/VCL2/vector_convert.h
new file mode 100644
index 0000000..a7aa2b6
--- /dev/null
+++ b/Bwdif/VCL2/vector_convert.h
@@ -0,0 +1,574 @@
+/************************** vector_convert.h *******************************
+* Author: Agner Fog
+* Date created: 2014-07-23
+* Last modified: 2019-11-17
+* Version: 2.01.00
+* Project: vector class library
+* Description:
+* Header file for conversion between different vector classes with different
+* sizes. Also includes various generic template functions.
+*
+* (c) Copyright 2012-2019 Agner Fog.
+* Apache License version 2.0 or later.
+*****************************************************************************/
+
+#ifndef VECTOR_CONVERT_H
+#define VECTOR_CONVERT_H
+
+#ifndef VECTORCLASS_H
+#include "vectorclass.h"
+#endif
+
+#if VECTORCLASS_H < 20100
+#error Incompatible versions of vector class library mixed
+#endif
+
+#ifdef VCL_NAMESPACE
+namespace VCL_NAMESPACE {
+#endif
+
+#if MAX_VECTOR_SIZE >= 256
+
+/*****************************************************************************
+*
+* Extend from 128 to 256 bit vectors
+*
+*****************************************************************************/
+
+#if INSTRSET >= 8 // AVX2. 256 bit integer vectors
+
+// sign extend
+static inline Vec16s extend (Vec16c const a) {
+ return _mm256_cvtepi8_epi16(a);
+}
+
+// zero extend
+static inline Vec16us extend (Vec16uc const a) {
+ return _mm256_cvtepu8_epi16(a);
+}
+
+// sign extend
+static inline Vec8i extend (Vec8s const a) {
+ return _mm256_cvtepi16_epi32(a);
+}
+
+// zero extend
+static inline Vec8ui extend (Vec8us const a) {
+ return _mm256_cvtepu16_epi32(a);
+}
+
+// sign extend
+static inline Vec4q extend (Vec4i const a) {
+ return _mm256_cvtepi32_epi64(a);
+}
+
+// zero extend
+static inline Vec4uq extend (Vec4ui const a) {
+ return _mm256_cvtepu32_epi64(a);
+}
+
+
+#else // no AVX2. 256 bit integer vectors are emulated
+
+// sign extend and zero extend functions:
+static inline Vec16s extend (Vec16c const a) {
+ return Vec16s(extend_low(a), extend_high(a));
+}
+
+static inline Vec16us extend (Vec16uc const a) {
+ return Vec16us(extend_low(a), extend_high(a));
+}
+
+static inline Vec8i extend (Vec8s const a) {
+ return Vec8i(extend_low(a), extend_high(a));
+}
+
+static inline Vec8ui extend (Vec8us const a) {
+ return Vec8ui(extend_low(a), extend_high(a));
+}
+
+static inline Vec4q extend (Vec4i const a) {
+ return Vec4q(extend_low(a), extend_high(a));
+}
+
+static inline Vec4uq extend (Vec4ui const a) {
+ return Vec4uq(extend_low(a), extend_high(a));
+}
+
+#endif // AVX2
+
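+// Note on the overloads above: the signed overloads sign extend while the unsigned overloads
+// zero extend, so the same bit pattern can give different results. For example, the byte 0xC8
+// extends to 0xFFC8 (-56) through Vec16c -> Vec16s, but to 0x00C8 (200) through
+// Vec16uc -> Vec16us; pick the overload matching how the bytes should be interpreted.
+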
+/*****************************************************************************
+*
+* Conversions between float and double
+*
+*****************************************************************************/
+#if INSTRSET >= 7 // AVX. 256 bit float vectors
+
+// float to double
+static inline Vec4d to_double (Vec4f const a) {
+ return _mm256_cvtps_pd(a);
+}
+
+// double to float
+static inline Vec4f to_float (Vec4d const a) {
+ return _mm256_cvtpd_ps(a);
+}
+
+#else // no AVX. 256 bit float vectors are emulated
+
+// float to double
+static inline Vec4d to_double (Vec4f const a) {
+ Vec2d lo = _mm_cvtps_pd(a);
+ Vec2d hi = _mm_cvtps_pd(_mm_movehl_ps(a, a));
+ return Vec4d(lo,hi);
+}
+
+// double to float
+static inline Vec4f to_float (Vec4d const a) {
+ Vec4f lo = _mm_cvtpd_ps(a.get_low());
+ Vec4f hi = _mm_cvtpd_ps(a.get_high());
+ return _mm_movelh_ps(lo, hi);
+}
+
+#endif
+
+/*****************************************************************************
+*
+* Reduce from 256 to 128 bit vectors
+*
+*****************************************************************************/
+#if INSTRSET >= 10 // AVX512VL
+
+// compress functions. overflow wraps around
+static inline Vec16c compress (Vec16s const a) {
+ return _mm256_cvtepi16_epi8(a);
+}
+
+static inline Vec16uc compress (Vec16us const a) {
+ return _mm256_cvtepi16_epi8(a);
+}
+
+static inline Vec8s compress (Vec8i const a) {
+ return _mm256_cvtepi32_epi16(a);
+}
+
+static inline Vec8us compress (Vec8ui const a) {
+ return _mm256_cvtepi32_epi16(a);
+}
+
+static inline Vec4i compress (Vec4q const a) {
+ return _mm256_cvtepi64_epi32(a);
+}
+
+static inline Vec4ui compress (Vec4uq const a) {
+ return _mm256_cvtepi64_epi32(a);
+}
+
+#else // no AVX512
+
+// compress functions. overflow wraps around
+static inline Vec16c compress (Vec16s const a) {
+ return compress(a.get_low(), a.get_high());
+}
+
+static inline Vec16uc compress (Vec16us const a) {
+ return compress(a.get_low(), a.get_high());
+}
+
+static inline Vec8s compress (Vec8i const a) {
+ return compress(a.get_low(), a.get_high());
+}
+
+static inline Vec8us compress (Vec8ui const a) {
+ return compress(a.get_low(), a.get_high());
+}
+
+static inline Vec4i compress (Vec4q const a) {
+ return compress(a.get_low(), a.get_high());
+}
+
+static inline Vec4ui compress (Vec4uq const a) {
+ return compress(a.get_low(), a.get_high());
+}
+
+#endif // AVX512
+
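+// Worked example: compress wraps around rather than saturating, so a Vec16s element with the
+// value 300 becomes 300 & 0xFF = 44 in the resulting Vec16c. Clamp with min/max first, or keep
+// intermediate values within range, when saturation semantics are needed.
+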
+#endif // MAX_VECTOR_SIZE >= 256
+
+
+#if MAX_VECTOR_SIZE >= 512
+
+/*****************************************************************************
+*
+* Extend from 256 to 512 bit vectors
+*
+*****************************************************************************/
+
+#if INSTRSET >= 9 // AVX512. 512 bit integer vectors
+
+// sign extend
+static inline Vec32s extend (Vec32c const a) {
+#if INSTRSET >= 10
+ return _mm512_cvtepi8_epi16(a);
+#else
+ return Vec32s(extend_low(a), extend_high(a));
+#endif
+}
+
+// zero extend
+static inline Vec32us extend (Vec32uc const a) {
+#if INSTRSET >= 10
+ return _mm512_cvtepu8_epi16(a);
+#else
+ return Vec32us(extend_low(a), extend_high(a));
+#endif
+}
+
+// sign extend
+static inline Vec16i extend (Vec16s const a) {
+ return _mm512_cvtepi16_epi32(a);
+}
+
+// zero extend
+static inline Vec16ui extend (Vec16us const a) {
+ return _mm512_cvtepu16_epi32(a);
+}
+
+// sign extend
+static inline Vec8q extend (Vec8i const a) {
+ return _mm512_cvtepi32_epi64(a);
+}
+
+// zero extend
+static inline Vec8uq extend (Vec8ui const a) {
+ return _mm512_cvtepu32_epi64(a);
+}
+
+#else // no AVX512. 512 bit vectors are emulated
+
+
+
+// sign extend
+static inline Vec32s extend (Vec32c const a) {
+ return Vec32s(extend_low(a), extend_high(a));
+}
+
+// zero extend
+static inline Vec32us extend (Vec32uc const a) {
+ return Vec32us(extend_low(a), extend_high(a));
+}
+
+// sign extend
+static inline Vec16i extend (Vec16s const a) {
+ return Vec16i(extend_low(a), extend_high(a));
+}
+
+// zero extend
+static inline Vec16ui extend (Vec16us const a) {
+ return Vec16ui(extend_low(a), extend_high(a));
+}
+
+// sign extend
+static inline Vec8q extend (Vec8i const a) {
+ return Vec8q(extend_low(a), extend_high(a));
+}
+
+// zero extend
+static inline Vec8uq extend (Vec8ui const a) {
+ return Vec8uq(extend_low(a), extend_high(a));
+}
+
+#endif // AVX512
+
+
+/*****************************************************************************
+*
+* Reduce from 512 to 256 bit vectors
+*
+*****************************************************************************/
+#if INSTRSET >= 9 // AVX512F
+
+// compress functions. overflow wraps around
+static inline Vec32c compress (Vec32s const a) {
+#if INSTRSET >= 10 // AVX512BW
+ return _mm512_cvtepi16_epi8(a);
+#else
+ return compress(a.get_low(), a.get_high());
+#endif
+}
+
+static inline Vec32uc compress (Vec32us const a) {
+ return Vec32uc(compress(Vec32s(a)));
+}
+
+static inline Vec16s compress (Vec16i const a) {
+ return _mm512_cvtepi32_epi16(a);
+}
+
+static inline Vec16us compress (Vec16ui const a) {
+ return _mm512_cvtepi32_epi16(a);
+}
+
+static inline Vec8i compress (Vec8q const a) {
+ return _mm512_cvtepi64_epi32(a);
+}
+
+static inline Vec8ui compress (Vec8uq const a) {
+ return _mm512_cvtepi64_epi32(a);
+}
+
+#else // no AVX512
+
+// compress functions. overflow wraps around
+static inline Vec32c compress (Vec32s const a) {
+ return compress(a.get_low(), a.get_high());
+}
+
+static inline Vec32uc compress (Vec32us const a) {
+ return compress(a.get_low(), a.get_high());
+}
+
+static inline Vec16s compress (Vec16i const a) {
+ return compress(a.get_low(), a.get_high());
+}
+
+static inline Vec16us compress (Vec16ui const a) {
+ return compress(a.get_low(), a.get_high());
+}
+
+static inline Vec8i compress (Vec8q const a) {
+ return compress(a.get_low(), a.get_high());
+}
+
+static inline Vec8ui compress (Vec8uq const a) {
+ return compress(a.get_low(), a.get_high());
+}
+
+#endif // AVX512
+
+/*****************************************************************************
+*
+* Conversions between float and double
+*
+*****************************************************************************/
+
+#if INSTRSET >= 9 // AVX512. 512 bit float vectors
+
+// float to double
+static inline Vec8d to_double (Vec8f const a) {
+ return _mm512_cvtps_pd(a);
+}
+
+// double to float
+static inline Vec8f to_float (Vec8d const a) {
+ return _mm512_cvtpd_ps(a);
+}
+
+#else // no AVX512. 512 bit float vectors are emulated
+
+// float to double
+static inline Vec8d to_double (Vec8f const a) {
+ Vec4d lo = to_double(a.get_low());
+ Vec4d hi = to_double(a.get_high());
+ return Vec8d(lo,hi);
+}
+
+// double to float
+static inline Vec8f to_float (Vec8d const a) {
+ Vec4f lo = to_float(a.get_low());
+ Vec4f hi = to_float(a.get_high());
+ return Vec8f(lo, hi);
+}
+
+#endif
+
+#endif // MAX_VECTOR_SIZE >= 512
+
+// double to float
+static inline Vec4f to_float (Vec2d const a) {
+ return _mm_cvtpd_ps(a);
+}
+
+
+/*****************************************************************************
+*
+* Generic template functions
+*
+* These templates define functions for multiple vector types in one template
+*
+*****************************************************************************/
+
+// horizontal min/max of vector elements
+// implemented with universal template, works for all vector types:
+
+template <typename T> auto horizontal_min(T const x) {
+ if constexpr ((T::elementtype() & 16) != 0) {
+ // T is a float or double vector
+ if (horizontal_or(is_nan(x))) {
+ // check for NAN because min does not guarantee NAN propagation
+ return x[horizontal_find_first(is_nan(x))];
+ }
+ }
+ return horizontal_min1(x);
+}
+
+template <typename T> auto horizontal_min1(T const x) {
+ if constexpr (T::elementtype() <= 3) { // boolean vector type
+ return horizontal_and(x);
+ }
+ else if constexpr (sizeof(T) >= 32) {
+ // split recursively into smaller vectors
+ return horizontal_min1(min(x.get_low(), x.get_high()));
+ }
+ else if constexpr (T::size() == 2) {
+ T a = permute2 <1, V_DC>(x); // high half
+ T b = min(a, x);
+ return b[0];
+ }
+ else if constexpr (T::size() == 4) {
+ T a = permute4<2, 3, V_DC, V_DC>(x); // high half
+ T b = min(a, x);
+ a = permute4<1, V_DC, V_DC, V_DC>(b);
+ b = min(a, b);
+ return b[0];
+ }
+ else if constexpr (T::size() == 8) {
+ T a = permute8<4, 5, 6, 7, V_DC, V_DC, V_DC, V_DC>(x); // high half
+ T b = min(a, x);
+ a = permute8<2, 3, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC>(b);
+ b = min(a, b);
+ a = permute8<1, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC>(b);
+ b = min(a, b);
+ return b[0];
+ }
+ else {
+ static_assert(T::size() == 16); // no other size is allowed
+ T a = permute16<8, 9, 10, 11, 12, 13, 14, 15, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC >(x); // high half
+ T b = min(a, x);
+ a = permute16<4, 5, 6, 7, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC>(b);
+ b = min(a, b);
+ a = permute16<2, 3, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC>(b);
+ b = min(a, b);
+ a = permute16<1, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC>(b);
+ b = min(a, b);
+ return b[0];
+ }
+}
+
+template <typename T> auto horizontal_max(T const x) {
+ if constexpr ((T::elementtype() & 16) != 0) {
+ // T is a float or double vector
+ if (horizontal_or(is_nan(x))) {
+ // check for NAN because max does not guarantee NAN propagation
+ return x[horizontal_find_first(is_nan(x))];
+ }
+ }
+ return horizontal_max1(x);
+}
+
+template <typename T> auto horizontal_max1(T const x) {
+ if constexpr (T::elementtype() <= 3) { // boolean vector type
+ return horizontal_or(x);
+ }
+ else if constexpr (sizeof(T) >= 32) {
+ // split recursively into smaller vectors
+ return horizontal_max1(max(x.get_low(), x.get_high()));
+ }
+ else if constexpr (T::size() == 2) {
+ T a = permute2 <1, V_DC>(x); // high half
+ T b = max(a, x);
+ return b[0];
+ }
+ else if constexpr (T::size() == 4) {
+ T a = permute4<2, 3, V_DC, V_DC>(x); // high half
+ T b = max(a, x);
+ a = permute4<1, V_DC, V_DC, V_DC>(b);
+ b = max(a, b);
+ return b[0];
+ }
+ else if constexpr (T::size() == 8) {
+ T a = permute8<4, 5, 6, 7, V_DC, V_DC, V_DC, V_DC>(x); // high half
+ T b = max(a, x);
+ a = permute8<2, 3, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC>(b);
+ b = max(a, b);
+ a = permute8<1, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC>(b);
+ b = max(a, b);
+ return b[0];
+ }
+ else {
+ static_assert(T::size() == 16); // no other size is allowed
+ T a = permute16<8, 9, 10, 11, 12, 13, 14, 15, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC >(x); // high half
+ T b = max(a, x);
+ a = permute16<4, 5, 6, 7, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC>(b);
+ b = max(a, b);
+ a = permute16<2, 3, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC>(b);
+ b = max(a, b);
+ a = permute16<1, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC>(b);
+ b = max(a, b);
+ return b[0];
+ }
+}
+
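+// For example, horizontal_min(Vec8i(7, 2, 9, 4, 1, 8, 3, 6)) returns 1 and horizontal_max
+// returns 9. For float and double vectors a NAN element is returned as-is (see the is_nan
+// check above) instead of being silently dropped by min/max.
+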
+// Find first element that is true in a boolean vector
+template <typename V>
+static inline int horizontal_find_first(V const x) {
+ static_assert(V::elementtype() == 2 || V::elementtype() == 3, "Boolean vector expected");
+ auto bits = to_bits(x); // convert to bits
+ if (bits == 0) return -1;
+ if constexpr (V::size() < 32) {
+ return bit_scan_forward((uint32_t)bits);
+ }
+ else {
+ return bit_scan_forward(bits);
+ }
+}
+
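+// For example, with v = Vec8i(7, 2, 9, 4, 1, 8, 3, 6), horizontal_find_first(v > 4) returns 0
+// because element 0 is the first element greater than 4; the function returns -1 when no
+// element is true.
+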
+// Count the number of elements that are true in a boolean vector
+template <typename V>