diff --git a/Bwdif/Bwdif.cpp b/Bwdif/Bwdif.cpp
index b83ceaa..c7b4114 100644
--- a/Bwdif/Bwdif.cpp
+++ b/Bwdif/Bwdif.cpp
@@ -26,53 +26,62 @@
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
+#include
#include
#include
#include
#include
-#include
#include
#include
-/*
- * Filter coefficients coef_lf and coef_hf taken from BBC PH-2071 (Weston 3 Field Deinterlacer).
- * Used when there is spatial and temporal interpolation.
- * Filter coefficients coef_sp are used when there is spatial interpolation only.
- * Adjusted for matching visual sharpness impression of spatial and temporal interpolation.
- */
-static constexpr uint16_t coef_lf[2] = { 4309, 213 };
-static constexpr uint16_t coef_hf[3] = { 5570, 3801, 1016 };
-static constexpr uint16_t coef_sp[2] = { 5077, 981 };
-static constexpr float coef_lf_f[2] = { 4309 / 8192.0f, 213 / 8192.0f };
-static constexpr float coef_hf_f[3] = { 5570 / 8192.0f, 3801 / 8192.0f, 1016 / 8192.0f };
-static constexpr float coef_sp_f[2] = { 5077 / 8192.0f, 981 / 8192.0f };
+#include "Bwdif.h"
+
+#ifdef BWDIF_X86
+template<typename pixel_t, bool spat> extern void filterEdge_sse2(const void * _prev2, const void * _prev, const void * _cur, const void * _next, const void * _next2, void * _dst, const int width, const int positiveStride, const int negativeStride, const int stride2, const int step, const int peak) noexcept;
+template<typename pixel_t, bool spat> extern void filterEdge_avx2(const void * _prev2, const void * _prev, const void * _cur, const void * _next, const void * _next2, void * _dst, const int width, const int positiveStride, const int negativeStride, const int stride2, const int step, const int peak) noexcept;
+template<typename pixel_t, bool spat> extern void filterEdge_avx512(const void * _prev2, const void * _prev, const void * _cur, const void * _next, const void * _next2, void * _dst, const int width, const int positiveStride, const int negativeStride, const int stride2, const int step, const int peak) noexcept;
+
+template<typename pixel_t> extern void filterLine_sse2(const void * _prev2, const void * _prev, const void * _cur, const void * _next, const void * _next2, void * _dst, const int width, const int stride, const int stride2, const int stride3, const int stride4, const int step, const int peak) noexcept;
+template<typename pixel_t> extern void filterLine_avx2(const void * _prev2, const void * _prev, const void * _cur, const void * _next, const void * _next2, void * _dst, const int width, const int stride, const int stride2, const int stride3, const int stride4, const int step, const int peak) noexcept;
+template<typename pixel_t> extern void filterLine_avx512(const void * _prev2, const void * _prev, const void * _cur, const void * _next, const void * _next2, void * _dst, const int width, const int stride, const int stride2, const int stride3, const int stride4, const int step, const int peak) noexcept;
+#endif
struct BwdifData final {
VSNodeRef * node;
VSVideoInfo vi;
const VSVideoInfo * viSaved;
int field;
- int peak;
+ int edgeStep, lineStep, peak;
+ void (*filterEdgeWithSpat)(const void * _prev2, const void * _prev, const void * _cur, const void * _next, const void * _next2, void * _dst, const int width, const int positiveStride, const int negativeStride, const int stride2, const int step, const int peak) noexcept;
+ void (*filterEdgeWithoutSpat)(const void * _prev2, const void * _prev, const void * _cur, const void * _next, const void * _next2, void * _dst, const int width, const int positiveStride, const int negativeStride, const int stride2, const int step, const int peak) noexcept;
+ void (*filterLine)(const void * _prev2, const void * _prev, const void * _cur, const void * _next, const void * _next2, void * _dst, const int width, const int stride, const int stride2, const int stride3, const int stride4, const int step, const int peak) noexcept;
};
-template<typename T, bool spat>
-static inline void filterEdge(const T * prev2, const T * prev, const T * cur, const T * next, const T * next2, T * VS_RESTRICT dst, const int width,
- const int positiveStride, const int negativeStride, const int stride2, const int peak) noexcept {
- const T * prev2Above2 = prev2 - stride2;
- const T * prev2Below2 = prev2 + stride2;
- const T * prevAbove = prev + negativeStride;
- const T * prevBelow = prev + positiveStride;
- const T * curAbove = cur + negativeStride;
- const T * curBelow = cur + positiveStride;
- const T * nextAbove = next + negativeStride;
- const T * nextBelow = next + positiveStride;
- const T * next2Above2 = next2 - stride2;
- const T * next2Below2 = next2 + stride2;
+template<typename pixel_t, bool spat>
+static inline void filterEdge_c(const void * _prev2, const void * _prev, const void * _cur, const void * _next, const void * _next2, void * _dst, const int width,
+ const int positiveStride, const int negativeStride, const int stride2, const int step, const int peak) noexcept {
+ const pixel_t * prev2 = reinterpret_cast<const pixel_t *>(_prev2);
+ const pixel_t * prev = reinterpret_cast<const pixel_t *>(_prev);
+ const pixel_t * cur = reinterpret_cast<const pixel_t *>(_cur);
+ const pixel_t * next = reinterpret_cast<const pixel_t *>(_next);
+ const pixel_t * next2 = reinterpret_cast<const pixel_t *>(_next2);
+ pixel_t * VS_RESTRICT dst = reinterpret_cast<pixel_t *>(_dst);
+
+ const pixel_t * prev2Above2 = prev2 - stride2;
+ const pixel_t * prev2Below2 = prev2 + stride2;
+ const pixel_t * prevAbove = prev + negativeStride;
+ const pixel_t * prevBelow = prev + positiveStride;
+ const pixel_t * curAbove = cur + negativeStride;
+ const pixel_t * curBelow = cur + positiveStride;
+ const pixel_t * nextAbove = next + negativeStride;
+ const pixel_t * nextBelow = next + positiveStride;
+ const pixel_t * next2Above2 = next2 - stride2;
+ const pixel_t * next2Below2 = next2 + stride2;
for (int x = 0; x < width; x++) {
- if constexpr (std::is_integral_v<T>) {
+ if constexpr (std::is_integral_v<pixel_t>) {
const int c = curAbove[x];
const int d = (prev2[x] + next2[x]) >> 1;
const int e = curBelow[x];
@@ -89,9 +98,9 @@ static inline void filterEdge(const T * prev2, const T * prev, const T * cur, co
const int f = ((prev2Below2[x] + next2Below2[x]) >> 1) - e;
const int dc = d - c;
const int de = d - e;
- const int max = std::max({ de, dc, std::min(b, f) });
- const int min = std::min({ de, dc, std::max(b, f) });
- diff = std::max({ diff, min, -max });
+ const int maximum = std::max({ de, dc, std::min(b, f) });
+ const int minimum = std::min({ de, dc, std::max(b, f) });
+ diff = std::max({ diff, minimum, -maximum });
}
const int interpol = std::clamp((c + e) >> 1, d - diff, d + diff);
@@ -114,9 +123,9 @@ static inline void filterEdge(const T * prev2, const T * prev, const T * cur, co
const float f = ((prev2Below2[x] + next2Below2[x]) * 0.5f) - e;
const float dc = d - c;
const float de = d - e;
- const float max = std::max({ de, dc, std::min(b, f) });
- const float min = std::min({ de, dc, std::max(b, f) });
- diff = std::max({ diff, min, -max });
+ const float maximum = std::max({ de, dc, std::min(b, f) });
+ const float minimum = std::min({ de, dc, std::max(b, f) });
+ diff = std::max({ diff, minimum, -maximum });
}
dst[x] = std::clamp((c + e) * 0.5f, d - diff, d + diff);
@@ -125,28 +134,35 @@ static inline void filterEdge(const T * prev2, const T * prev, const T * cur, co
}
}
-template<typename T>
-static inline void filterLine(const T * prev2, const T * prev, const T * cur, const T * next, const T * next2, T * VS_RESTRICT dst, const int width,
- const int stride, const int stride2, const int stride3, const int stride4, const int peak) noexcept {
- const T * prev2Above4 = prev2 - stride4;
- const T * prev2Above2 = prev2 - stride2;
- const T * prev2Below2 = prev2 + stride2;
- const T * prev2Below4 = prev2 + stride4;
- const T * prevAbove = prev - stride;
- const T * prevBelow = prev + stride;
- const T * curAbove3 = cur - stride3;
- const T * curAbove = cur - stride;
- const T * curBelow = cur + stride;
- const T * curBelow3 = cur + stride3;
- const T * nextAbove = next - stride;
- const T * nextBelow = next + stride;
- const T * next2Above4 = next2 - stride4;
- const T * next2Above2 = next2 - stride2;
- const T * next2Below2 = next2 + stride2;
- const T * next2Below4 = next2 + stride4;
+template<typename pixel_t>
+static inline void filterLine_c(const void * _prev2, const void * _prev, const void * _cur, const void * _next, const void * _next2, void * _dst, const int width,
+ const int stride, const int stride2, const int stride3, const int stride4, const int step, const int peak) noexcept {
+ const pixel_t * prev2 = reinterpret_cast<const pixel_t *>(_prev2);
+ const pixel_t * prev = reinterpret_cast<const pixel_t *>(_prev);
+ const pixel_t * cur = reinterpret_cast<const pixel_t *>(_cur);
+ const pixel_t * next = reinterpret_cast<const pixel_t *>(_next);
+ const pixel_t * next2 = reinterpret_cast<const pixel_t *>(_next2);
+ pixel_t * VS_RESTRICT dst = reinterpret_cast<pixel_t *>(_dst);
+
+ const pixel_t * prev2Above4 = prev2 - stride4;
+ const pixel_t * prev2Above2 = prev2 - stride2;
+ const pixel_t * prev2Below2 = prev2 + stride2;
+ const pixel_t * prev2Below4 = prev2 + stride4;
+ const pixel_t * prevAbove = prev - stride;
+ const pixel_t * prevBelow = prev + stride;
+ const pixel_t * curAbove3 = cur - stride3;
+ const pixel_t * curAbove = cur - stride;
+ const pixel_t * curBelow = cur + stride;
+ const pixel_t * curBelow3 = cur + stride3;
+ const pixel_t * nextAbove = next - stride;
+ const pixel_t * nextBelow = next + stride;
+ const pixel_t * next2Above4 = next2 - stride4;
+ const pixel_t * next2Above2 = next2 - stride2;
+ const pixel_t * next2Below2 = next2 + stride2;
+ const pixel_t * next2Below4 = next2 + stride4;
for (int x = 0; x < width; x++) {
- if constexpr (std::is_integral_v<T>) {
+ if constexpr (std::is_integral_v<pixel_t>) {
const int c = curAbove[x];
const int d = (prev2[x] + next2[x]) >> 1;
const int e = curBelow[x];
@@ -162,9 +178,9 @@ static inline void filterLine(const T * prev2, const T * prev, const T * cur, co
const int f = ((prev2Below2[x] + next2Below2[x]) >> 1) - e;
const int dc = d - c;
const int de = d - e;
- const int max = std::max({ de, dc, std::min(b, f) });
- const int min = std::min({ de, dc, std::max(b, f) });
- diff = std::max({ diff, min, -max });
+ const int maximum = std::max({ de, dc, std::min(b, f) });
+ const int minimum = std::min({ de, dc, std::max(b, f) });
+ diff = std::max({ diff, minimum, -maximum });
int interpol;
if (std::abs(c - e) > temporal_diff0)
@@ -194,9 +210,9 @@ static inline void filterLine(const T * prev2, const T * prev, const T * cur, co
const float f = ((prev2Below2[x] + next2Below2[x]) * 0.5f) - e;
const float dc = d - c;
const float de = d - e;
- const float max = std::max({ de, dc, std::min(b, f) });
- const float min = std::min({ de, dc, std::max(b, f) });
- diff = std::max({ diff, min, -max });
+ const float maximum = std::max({ de, dc, std::min(b, f) });
+ const float minimum = std::min({ de, dc, std::max(b, f) });
+ diff = std::max({ diff, minimum, -maximum });
float interpol;
if (std::abs(c - e) > temporal_diff0)
@@ -213,23 +229,23 @@ static inline void filterLine(const T * prev2, const T * prev, const T * cur, co
}
}
-template<typename T>
+template<typename pixel_t>
static void filter(const VSFrameRef * prevFrame, const VSFrameRef * curFrame, const VSFrameRef * nextFrame, VSFrameRef * dstFrame,
const int field, const BwdifData * const VS_RESTRICT d, const VSAPI * vsapi) noexcept {
for (int plane = 0; plane < d->vi.format->numPlanes; plane++) {
const int width = vsapi->getFrameWidth(curFrame, plane);
const int height = vsapi->getFrameHeight(curFrame, plane);
- const int stride = vsapi->getStride(curFrame, plane) / sizeof(T);
- const T * prev = reinterpret_cast<const T *>(vsapi->getReadPtr(prevFrame, plane));
- const T * cur = reinterpret_cast<const T *>(vsapi->getReadPtr(curFrame, plane));
- const T * next = reinterpret_cast<const T *>(vsapi->getReadPtr(nextFrame, plane));
- T * VS_RESTRICT dst = reinterpret_cast<T *>(vsapi->getWritePtr(dstFrame, plane));
+ const int stride = vsapi->getStride(curFrame, plane) / sizeof(pixel_t);
+ const pixel_t * prev = reinterpret_cast<const pixel_t *>(vsapi->getReadPtr(prevFrame, plane));
+ const pixel_t * cur = reinterpret_cast<const pixel_t *>(vsapi->getReadPtr(curFrame, plane));
+ const pixel_t * next = reinterpret_cast<const pixel_t *>(vsapi->getReadPtr(nextFrame, plane));
+ pixel_t * VS_RESTRICT dst = reinterpret_cast<pixel_t *>(vsapi->getWritePtr(dstFrame, plane));
vs_bitblt(dst + stride * (1 - field),
vsapi->getStride(dstFrame, plane) * 2,
cur + stride * (1 - field),
vsapi->getStride(curFrame, plane) * 2,
- width * sizeof(T),
+ width * sizeof(pixel_t),
height / 2);
prev += stride * field;
@@ -237,27 +253,27 @@ static void filter(const VSFrameRef * prevFrame, const VSFrameRef * curFrame, co
next += stride * field;
dst += stride * field;
- const T * prev2 = field ? prev : cur;
- const T * next2 = field ? cur : next;
+ const pixel_t * prev2 = field ? prev : cur;
+ const pixel_t * next2 = field ? cur : next;
for (int y = field; y < height; y += 2) {
if ((y < 4) || (y + 5 > height)) {
if ((y < 2) || (y + 3 > height))
- filterEdge<T, false>(prev2, prev, cur, next, next2, dst, width,
- y + 1 < height ? stride : -stride,
- y > 0 ? -stride : stride,
- stride * 2,
- d->peak);
+ d->filterEdgeWithoutSpat(prev2, prev, cur, next, next2, dst, width,
+ y + 1 < height ? stride : -stride,
+ y > 0 ? -stride : stride,
+ stride * 2,
+ d->edgeStep, d->peak);
else
- filterEdge<T, true>(prev2, prev, cur, next, next2, dst, width,
- y + 1 < height ? stride : -stride,
- y > 0 ? -stride : stride,
- stride * 2,
- d->peak);
+ d->filterEdgeWithSpat(prev2, prev, cur, next, next2, dst, width,
+ y + 1 < height ? stride : -stride,
+ y > 0 ? -stride : stride,
+ stride * 2,
+ d->edgeStep, d->peak);
} else {
- filterLine<T>(prev2, prev, cur, next, next2, dst, width,
- stride, stride * 2, stride * 3, stride * 4,
- d->peak);
+ d->filterLine(prev2, prev, cur, next, next2, dst, width,
+ stride, stride * 2, stride * 3, stride * 4,
+ d->lineStep, d->peak);
}
prev2 += stride * 2;
@@ -359,6 +375,7 @@ static void VS_CC bwdifCreate(const VSMap * in, VSMap * out, void * userData, VS
d->node = vsapi->propGetNode(in, "clip", 0, nullptr);
d->vi = *vsapi->getVideoInfo(d->node);
d->viSaved = vsapi->getVideoInfo(d->node);
+ int err;
if (!isConstantFormat(&d->vi) ||
(d->vi.format->sampleType == stInteger && d->vi.format->bitsPerSample > 16) ||
@@ -369,10 +386,89 @@ static void VS_CC bwdifCreate(const VSMap * in, VSMap * out, void * userData, VS
throw "height must be greater than or equal to 4"sv;
d->field = int64ToIntS(vsapi->propGetInt(in, "field", 0, nullptr));
+ const int opt = int64ToIntS(vsapi->propGetInt(in, "opt", 0, &err));
if (d->field < 0 || d->field > 3)
throw "field must be 0, 1, 2, or 3"sv;
+ if (opt < 0 || opt > 4)
+ throw "opt must be 0, 1, 2, 3, or 4"sv;
+
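+ // opt selects the implementation: 0 = auto-detect, 1 = C, 2 = SSE2, 3 = AVX2, 4 = AVX-512 (see the dispatch below).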
+ {
+ if (d->vi.format->bytesPerSample == 1) {
+ d->filterEdgeWithSpat = filterEdge_c<uint8_t, true>;
+ d->filterEdgeWithoutSpat = filterEdge_c<uint8_t, false>;
+ d->filterLine = filterLine_c<uint8_t>;
+ } else if (d->vi.format->bytesPerSample == 2) {
+ d->filterEdgeWithSpat = filterEdge_c<uint16_t, true>;
+ d->filterEdgeWithoutSpat = filterEdge_c<uint16_t, false>;
+ d->filterLine = filterLine_c<uint16_t>;
+ } else {
+ d->filterEdgeWithSpat = filterEdge_c<float, true>;
+ d->filterEdgeWithoutSpat = filterEdge_c<float, false>;
+ d->filterLine = filterLine_c<float>;
+ }
+
+#ifdef BWDIF_X86
+ const int iset = instrset_detect();
+ if ((opt == 0 && iset >= 10) || opt == 4) {
+ if (d->vi.format->bytesPerSample == 1) {
+ d->filterEdgeWithSpat = filterEdge_avx512<uint8_t, true>;
+ d->filterEdgeWithoutSpat = filterEdge_avx512<uint8_t, false>;
+ d->filterLine = filterLine_avx512<uint8_t>;
+ d->edgeStep = 32;
+ } else if (d->vi.format->bytesPerSample == 2) {
+ d->filterEdgeWithSpat = filterEdge_avx512<uint16_t, true>;
+ d->filterEdgeWithoutSpat = filterEdge_avx512<uint16_t, false>;
+ d->filterLine = filterLine_avx512<uint16_t>;
+ d->edgeStep = 16;
+ } else {
+ d->filterEdgeWithSpat = filterEdge_avx512<float, true>;
+ d->filterEdgeWithoutSpat = filterEdge_avx512<float, false>;
+ d->filterLine = filterLine_avx512<float>;
+ d->edgeStep = 16;
+ }
+ d->lineStep = 16;
+ } else if ((opt == 0 && iset >= 8) || opt == 3) {
+ if (d->vi.format->bytesPerSample == 1) {
+ d->filterEdgeWithSpat = filterEdge_avx2<uint8_t, true>;
+ d->filterEdgeWithoutSpat = filterEdge_avx2<uint8_t, false>;
+ d->filterLine = filterLine_avx2<uint8_t>;
+ d->edgeStep = 16;
+ } else if (d->vi.format->bytesPerSample == 2) {
+ d->filterEdgeWithSpat = filterEdge_avx2<uint16_t, true>;
+ d->filterEdgeWithoutSpat = filterEdge_avx2<uint16_t, false>;
+ d->filterLine = filterLine_avx2<uint16_t>;
+ d->edgeStep = 8;
+ } else {
+ d->filterEdgeWithSpat = filterEdge_avx2<float, true>;
+ d->filterEdgeWithoutSpat = filterEdge_avx2<float, false>;
+ d->filterLine = filterLine_avx2<float>;
+ d->edgeStep = 8;
+ }
+ d->lineStep = 8;
+ } else if ((opt == 0 && iset >= 2) || opt == 2) {
+ if (d->vi.format->bytesPerSample == 1) {
+ d->filterEdgeWithSpat = filterEdge_sse2<uint8_t, true>;
+ d->filterEdgeWithoutSpat = filterEdge_sse2<uint8_t, false>;
+ d->filterLine = filterLine_sse2<uint8_t>;
+ d->edgeStep = 8;
+ } else if (d->vi.format->bytesPerSample == 2) {
+ d->filterEdgeWithSpat = filterEdge_sse2<uint16_t, true>;
+ d->filterEdgeWithoutSpat = filterEdge_sse2<uint16_t, false>;
+ d->filterLine = filterLine_sse2<uint16_t>;
+ d->edgeStep = 4;
+ } else {
+ d->filterEdgeWithSpat = filterEdge_sse2<float, true>;
+ d->filterEdgeWithoutSpat = filterEdge_sse2<float, false>;
+ d->filterLine = filterLine_sse2<float>;
+ d->edgeStep = 4;
+ }
+ d->lineStep = 4;
+ }
+#endif
+ }
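+ // Note: edgeStep/lineStep are the number of pixels each SIMD routine processes per loop
+ // iteration; the C fallbacks ignore the step argument and advance one pixel at a time.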
+
if (d->field > 1) {
if (d->vi.numFrames > INT_MAX / 2)
throw "resulting clip is too long"sv;
@@ -398,6 +494,7 @@ VS_EXTERNAL_API(void) VapourSynthPluginInit(VSConfigPlugin configFunc, VSRegiste
configFunc("com.holywu.bwdif", "bwdif", "BobWeaver Deinterlacing Filter", VAPOURSYNTH_API_VERSION, 1, plugin);
registerFunc("Bwdif",
"clip:clip;"
- "field:int;",
+ "field:int;"
+ "opt:int:opt;",
bwdifCreate, nullptr, plugin);
}
diff --git a/Bwdif/Bwdif.h b/Bwdif/Bwdif.h
new file mode 100644
index 0000000..9d2e8b1
--- /dev/null
+++ b/Bwdif/Bwdif.h
@@ -0,0 +1,20 @@
+#pragma once
+
+#include
+
+#ifdef BWDIF_X86
+#include "VCL2/vectorclass.h"
+#endif
+
+/*
+ * Filter coefficients coef_lf and coef_hf taken from BBC PH-2071 (Weston 3 Field Deinterlacer).
+ * Used when there is spatial and temporal interpolation.
+ * Filter coefficients coef_sp are used when there is spatial interpolation only.
+ * Adjusted for matching visual sharpness impression of spatial and temporal interpolation.
+ */
+static constexpr uint16_t coef_lf[2] = { 4309, 213 };
+static constexpr uint16_t coef_hf[3] = { 5570, 3801, 1016 };
+static constexpr uint16_t coef_sp[2] = { 5077, 981 };
+static constexpr float coef_lf_f[2] = { 4309 / 8192.0f, 213 / 8192.0f };
+static constexpr float coef_hf_f[3] = { 5570 / 8192.0f, 3801 / 8192.0f, 1016 / 8192.0f };
+static constexpr float coef_sp_f[2] = { 5077 / 8192.0f, 981 / 8192.0f };
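+
+/*
+ * Sketch of how the fixed-point coefficients are applied (the scale is 8192 = 2^13, hence
+ * the ">> 13" in the integer code paths): for the spatial-only case, with c/e the lines
+ * directly above/below the one being interpolated and c3/e3 the lines three above/below
+ * (curAbove3/curBelow3 in the sources),
+ *   interpol = (coef_sp[0] * (c + e) - coef_sp[1] * (c3 + e3)) >> 13;
+ */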
diff --git a/Bwdif/Bwdif.vcxproj b/Bwdif/Bwdif.vcxproj
index 3b6a56b..17a75fd 100644
--- a/Bwdif/Bwdif.vcxproj
+++ b/Bwdif/Bwdif.vcxproj
@@ -35,8 +35,9 @@
- <PreprocessorDefinitions>NDEBUG;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+ <PreprocessorDefinitions>BWDIF_X86;NDEBUG;%(PreprocessorDefinitions)</PreprocessorDefinitions>
Level3
+ true
false
true
stdcpp17
@@ -49,6 +50,17 @@
+
+ AdvancedVectorExtensions2
+
+
+ AdvancedVectorExtensions512
+
+
+
+
+
+
diff --git a/Bwdif/Bwdif.vcxproj.filters b/Bwdif/Bwdif.vcxproj.filters
index 964a8c1..5aeb46a 100644
--- a/Bwdif/Bwdif.vcxproj.filters
+++ b/Bwdif/Bwdif.vcxproj.filters
@@ -18,5 +18,22 @@
Source Files
+
+ Source Files
+
+
+ Source Files
+
+
+ Source Files
+
+
+ Source Files
+
+
+
+
+ Header Files
+
\ No newline at end of file
diff --git a/Bwdif/Bwdif_AVX2.cpp b/Bwdif/Bwdif_AVX2.cpp
new file mode 100644
index 0000000..e500d7f
--- /dev/null
+++ b/Bwdif/Bwdif_AVX2.cpp
@@ -0,0 +1,224 @@
+#ifdef BWDIF_X86
+#include "Bwdif.h"
+
+template<typename pixel_t, bool spat>
+void filterEdge_avx2(const void * _prev2, const void * _prev, const void * _cur, const void * _next, const void * _next2, void * _dst, const int width,
+ const int positiveStride, const int negativeStride, const int stride2, const int step, const int peak) noexcept {
+ const pixel_t * prev2 = reinterpret_cast<const pixel_t *>(_prev2);
+ const pixel_t * prev = reinterpret_cast<const pixel_t *>(_prev);
+ const pixel_t * cur = reinterpret_cast<const pixel_t *>(_cur);
+ const pixel_t * next = reinterpret_cast<const pixel_t *>(_next);
+ const pixel_t * next2 = reinterpret_cast<const pixel_t *>(_next2);
+ pixel_t * dst = reinterpret_cast<pixel_t *>(_dst);
+
+ const pixel_t * prev2Above2 = prev2 - stride2;
+ const pixel_t * prev2Below2 = prev2 + stride2;
+ const pixel_t * prevAbove = prev + negativeStride;
+ const pixel_t * prevBelow = prev + positiveStride;
+ const pixel_t * curAbove = cur + negativeStride;
+ const pixel_t * curBelow = cur + positiveStride;
+ const pixel_t * nextAbove = next + negativeStride;
+ const pixel_t * nextBelow = next + positiveStride;
+ const pixel_t * next2Above2 = next2 - stride2;
+ const pixel_t * next2Below2 = next2 + stride2;
+
+ for (int x = 0; x < width; x += step) {
+ if constexpr (std::is_same_v<pixel_t, uint8_t>) {
+ const Vec16s c = Vec16s().load_16uc(curAbove + x);
+ const Vec16s d = (Vec16s().load_16uc(prev2 + x) + Vec16s().load_16uc(next2 + x)) >> 1;
+ const Vec16s e = Vec16s().load_16uc(curBelow + x);
+ const Vec16s temporal_diff0 = abs(Vec16s().load_16uc(prev2 + x) - Vec16s().load_16uc(next2 + x));
+ const Vec16s temporal_diff1 = (abs(Vec16s().load_16uc(prevAbove + x) - c) + abs(Vec16s().load_16uc(prevBelow + x) - e)) >> 1;
+ const Vec16s temporal_diff2 = (abs(Vec16s().load_16uc(nextAbove + x) - c) + abs(Vec16s().load_16uc(nextBelow + x) - e)) >> 1;
+ const Vec16s temporal_diff = max(max(temporal_diff0 >> 1, temporal_diff1), temporal_diff2);
+
+ Vec16s diff;
+ if constexpr (spat) {
+ const Vec16s b = ((Vec16s().load_16uc(prev2Above2 + x) + Vec16s().load_16uc(next2Above2 + x)) >> 1) - c;
+ const Vec16s f = ((Vec16s().load_16uc(prev2Below2 + x) + Vec16s().load_16uc(next2Below2 + x)) >> 1) - e;
+ const Vec16s dc = d - c;
+ const Vec16s de = d - e;
+ const Vec16s maximum = max(max(de, dc), min(b, f));
+ const Vec16s minimum = min(min(de, dc), max(b, f));
+ diff = max(max(temporal_diff, minimum), -maximum);
+ }
+
+ Vec16s interpol = min(max((c + e) >> 1, d - diff), d + diff);
+ interpol = select(temporal_diff == 0, d, interpol);
+ const auto result = compress_saturated_s2u(interpol, zero_si256()).get_low();
+ result.store_nt(dst + x);
+ } else if constexpr (std::is_same_v<pixel_t, uint16_t>) {
+ const Vec8i c = Vec8i().load_8us(curAbove + x);
+ const Vec8i d = (Vec8i().load_8us(prev2 + x) + Vec8i().load_8us(next2 + x)) >> 1;
+ const Vec8i e = Vec8i().load_8us(curBelow + x);
+ const Vec8i temporal_diff0 = abs(Vec8i().load_8us(prev2 + x) - Vec8i().load_8us(next2 + x));
+ const Vec8i temporal_diff1 = (abs(Vec8i().load_8us(prevAbove + x) - c) + abs(Vec8i().load_8us(prevBelow + x) - e)) >> 1;
+ const Vec8i temporal_diff2 = (abs(Vec8i().load_8us(nextAbove + x) - c) + abs(Vec8i().load_8us(nextBelow + x) - e)) >> 1;
+ const Vec8i temporal_diff = max(max(temporal_diff0 >> 1, temporal_diff1), temporal_diff2);
+
+ Vec8i diff;
+ if constexpr (spat) {
+ const Vec8i b = ((Vec8i().load_8us(prev2Above2 + x) + Vec8i().load_8us(next2Above2 + x)) >> 1) - c;
+ const Vec8i f = ((Vec8i().load_8us(prev2Below2 + x) + Vec8i().load_8us(next2Below2 + x)) >> 1) - e;
+ const Vec8i dc = d - c;
+ const Vec8i de = d - e;
+ const Vec8i maximum = max(max(de, dc), min(b, f));
+ const Vec8i minimum = min(min(de, dc), max(b, f));
+ diff = max(max(temporal_diff, minimum), -maximum);
+ }
+
+ Vec8i interpol = min(max((c + e) >> 1, d - diff), d + diff);
+ interpol = select(temporal_diff == 0, d, interpol);
+ const auto result = compress_saturated_s2u(interpol, zero_si256()).get_low();
+ min(result, peak).store_nt(dst + x);
+ } else {
+ const Vec8f c = Vec8f().load_a(curAbove + x);
+ const Vec8f d = (Vec8f().load_a(prev2 + x) + Vec8f().load_a(next2 + x)) * 0.5f;
+ const Vec8f e = Vec8f().load_a(curBelow + x);
+ const Vec8f temporal_diff0 = abs(Vec8f().load_a(prev2 + x) - Vec8f().load_a(next2 + x));
+ const Vec8f temporal_diff1 = (abs(Vec8f().load_a(prevAbove + x) - c) + abs(Vec8f().load_a(prevBelow + x) - e)) * 0.5f;
+ const Vec8f temporal_diff2 = (abs(Vec8f().load_a(nextAbove + x) - c) + abs(Vec8f().load_a(nextBelow + x) - e)) * 0.5f;
+ const Vec8f temporal_diff = max(max(temporal_diff0 * 0.5f, temporal_diff1), temporal_diff2);
+
+ Vec8f diff;
+ if constexpr (spat) {
+ const Vec8f b = ((Vec8f().load_a(prev2Above2 + x) + Vec8f().load_a(next2Above2 + x)) * 0.5f) - c;
+ const Vec8f f = ((Vec8f().load_a(prev2Below2 + x) + Vec8f().load_a(next2Below2 + x)) * 0.5f) - e;
+ const Vec8f dc = d - c;
+ const Vec8f de = d - e;
+ const Vec8f maximum = max(max(de, dc), min(b, f));
+ const Vec8f minimum = min(min(de, dc), max(b, f));
+ diff = max(max(temporal_diff, minimum), -maximum);
+ }
+
+ const Vec8f interpol = min(max((c + e) * 0.5f, d - diff), d + diff);
+ select(temporal_diff == 0, d, interpol).store_nt(dst + x);
+ }
+ }
+}
+
+template<typename pixel_t>
+void filterLine_avx2(const void * _prev2, const void * _prev, const void * _cur, const void * _next, const void * _next2, void * _dst, const int width,
+ const int stride, const int stride2, const int stride3, const int stride4, const int step, const int peak) noexcept {
+ const pixel_t * prev2 = reinterpret_cast<const pixel_t *>(_prev2);
+ const pixel_t * prev = reinterpret_cast<const pixel_t *>(_prev);
+ const pixel_t * cur = reinterpret_cast<const pixel_t *>(_cur);
+ const pixel_t * next = reinterpret_cast<const pixel_t *>(_next);
+ const pixel_t * next2 = reinterpret_cast<const pixel_t *>(_next2);
+ pixel_t * dst = reinterpret_cast<pixel_t *>(_dst);
+
+ const pixel_t * prev2Above4 = prev2 - stride4;
+ const pixel_t * prev2Above2 = prev2 - stride2;
+ const pixel_t * prev2Below2 = prev2 + stride2;
+ const pixel_t * prev2Below4 = prev2 + stride4;
+ const pixel_t * prevAbove = prev - stride;
+ const pixel_t * prevBelow = prev + stride;
+ const pixel_t * curAbove3 = cur - stride3;
+ const pixel_t * curAbove = cur - stride;
+ const pixel_t * curBelow = cur + stride;
+ const pixel_t * curBelow3 = cur + stride3;
+ const pixel_t * nextAbove = next - stride;
+ const pixel_t * nextBelow = next + stride;
+ const pixel_t * next2Above4 = next2 - stride4;
+ const pixel_t * next2Above2 = next2 - stride2;
+ const pixel_t * next2Below2 = next2 + stride2;
+ const pixel_t * next2Below4 = next2 + stride4;
+
+ for (int x = 0; x < width; x += step) {
+ if constexpr (std::is_same_v<pixel_t, uint8_t>) {
+ const Vec8i c = Vec8i().load_8uc(curAbove + x);
+ const Vec8i d = (Vec8i().load_8uc(prev2 + x) + Vec8i().load_8uc(next2 + x)) >> 1;
+ const Vec8i e = Vec8i().load_8uc(curBelow + x);
+ const Vec8i temporal_diff0 = abs(Vec8i().load_8uc(prev2 + x) - Vec8i().load_8uc(next2 + x));
+ const Vec8i temporal_diff1 = (abs(Vec8i().load_8uc(prevAbove + x) - c) + abs(Vec8i().load_8uc(prevBelow + x) - e)) >> 1;
+ const Vec8i temporal_diff2 = (abs(Vec8i().load_8uc(nextAbove + x) - c) + abs(Vec8i().load_8uc(nextBelow + x) - e)) >> 1;
+ const Vec8i temporal_diff = max(max(temporal_diff0 >> 1, temporal_diff1), temporal_diff2);
+
+ const Vec8i b = ((Vec8i().load_8uc(prev2Above2 + x) + Vec8i().load_8uc(next2Above2 + x)) >> 1) - c;
+ const Vec8i f = ((Vec8i().load_8uc(prev2Below2 + x) + Vec8i().load_8uc(next2Below2 + x)) >> 1) - e;
+ const Vec8i dc = d - c;
+ const Vec8i de = d - e;
+ const Vec8i maximum = max(max(de, dc), min(b, f));
+ const Vec8i minimum = min(min(de, dc), max(b, f));
+ const Vec8i diff = max(max(temporal_diff, minimum), -maximum);
+
+ const Vec8i interpol1 = (((coef_hf[0] * (Vec8i().load_8uc(prev2 + x) + Vec8i().load_8uc(next2 + x))
+ - coef_hf[1] * (Vec8i().load_8uc(prev2Above2 + x) + Vec8i().load_8uc(next2Above2 + x) + Vec8i().load_8uc(prev2Below2 + x) + Vec8i().load_8uc(next2Below2 + x))
+ + coef_hf[2] * (Vec8i().load_8uc(prev2Above4 + x) + Vec8i().load_8uc(next2Above4 + x) + Vec8i().load_8uc(prev2Below4 + x) + Vec8i().load_8uc(next2Below4 + x))) >> 2)
+ + coef_lf[0] * (c + e) - coef_lf[1] * (Vec8i().load_8uc(curAbove3 + x) + Vec8i().load_8uc(curBelow3 + x))) >> 13;
+ const Vec8i interpol2 = (coef_sp[0] * (c + e) - coef_sp[1] * (Vec8i().load_8uc(curAbove3 + x) + Vec8i().load_8uc(curBelow3 + x))) >> 13;
+
+ Vec8i interpol = select(abs(c - e) > temporal_diff0, interpol1, interpol2);
+ interpol = min(max(interpol, d - diff), d + diff);
+ interpol = select(temporal_diff == 0, d, interpol);
+ const auto result = compress_saturated_s2u(compress_saturated(interpol, zero_si256()), zero_si256()).get_low();
+ result.storel(dst + x);
+ } else if constexpr (std::is_same_v<pixel_t, uint16_t>) {
+ const Vec8i c = Vec8i().load_8us(curAbove + x);
+ const Vec8i d = (Vec8i().load_8us(prev2 + x) + Vec8i().load_8us(next2 + x)) >> 1;
+ const Vec8i e = Vec8i().load_8us(curBelow + x);
+ const Vec8i temporal_diff0 = abs(Vec8i().load_8us(prev2 + x) - Vec8i().load_8us(next2 + x));
+ const Vec8i temporal_diff1 = (abs(Vec8i().load_8us(prevAbove + x) - c) + abs(Vec8i().load_8us(prevBelow + x) - e)) >> 1;
+ const Vec8i temporal_diff2 = (abs(Vec8i().load_8us(nextAbove + x) - c) + abs(Vec8i().load_8us(nextBelow + x) - e)) >> 1;
+ const Vec8i temporal_diff = max(max(temporal_diff0 >> 1, temporal_diff1), temporal_diff2);
+
+ const Vec8i b = ((Vec8i().load_8us(prev2Above2 + x) + Vec8i().load_8us(next2Above2 + x)) >> 1) - c;
+ const Vec8i f = ((Vec8i().load_8us(prev2Below2 + x) + Vec8i().load_8us(next2Below2 + x)) >> 1) - e;
+ const Vec8i dc = d - c;
+ const Vec8i de = d - e;
+ const Vec8i maximum = max(max(de, dc), min(b, f));
+ const Vec8i minimum = min(min(de, dc), max(b, f));
+ const Vec8i diff = max(max(temporal_diff, minimum), -maximum);
+
+ const Vec8i interpol1 = (((coef_hf[0] * (Vec8i().load_8us(prev2 + x) + Vec8i().load_8us(next2 + x))
+ - coef_hf[1] * (Vec8i().load_8us(prev2Above2 + x) + Vec8i().load_8us(next2Above2 + x) + Vec8i().load_8us(prev2Below2 + x) + Vec8i().load_8us(next2Below2 + x))
+ + coef_hf[2] * (Vec8i().load_8us(prev2Above4 + x) + Vec8i().load_8us(next2Above4 + x) + Vec8i().load_8us(prev2Below4 + x) + Vec8i().load_8us(next2Below4 + x))) >> 2)
+ + coef_lf[0] * (c + e) - coef_lf[1] * (Vec8i().load_8us(curAbove3 + x) + Vec8i().load_8us(curBelow3 + x))) >> 13;
+ const Vec8i interpol2 = (coef_sp[0] * (c + e) - coef_sp[1] * (Vec8i().load_8us(curAbove3 + x) + Vec8i().load_8us(curBelow3 + x))) >> 13;
+
+ Vec8i interpol = select(abs(c - e) > temporal_diff0, interpol1, interpol2);
+ interpol = min(max(interpol, d - diff), d + diff);
+ interpol = select(temporal_diff == 0, d, interpol);
+ const auto result = compress_saturated_s2u(interpol, zero_si256()).get_low();
+ min(result, peak).store_nt(dst + x);
+ } else {
+ const Vec8f c = Vec8f().load_a(curAbove + x);
+ const Vec8f d = (Vec8f().load_a(prev2 + x) + Vec8f().load_a(next2 + x)) * 0.5f;
+ const Vec8f e = Vec8f().load_a(curBelow + x);
+ const Vec8f temporal_diff0 = abs(Vec8f().load_a(prev2 + x) - Vec8f().load_a(next2 + x));
+ const Vec8f temporal_diff1 = (abs(Vec8f().load_a(prevAbove + x) - c) + abs(Vec8f().load_a(prevBelow + x) - e)) * 0.5f;
+ const Vec8f temporal_diff2 = (abs(Vec8f().load_a(nextAbove + x) - c) + abs(Vec8f().load_a(nextBelow + x) - e)) * 0.5f;
+ const Vec8f temporal_diff = max(max(temporal_diff0 * 0.5f, temporal_diff1), temporal_diff2);
+
+ const Vec8f b = ((Vec8f().load_a(prev2Above2 + x) + Vec8f().load_a(next2Above2 + x)) * 0.5f) - c;
+ const Vec8f f = ((Vec8f().load_a(prev2Below2 + x) + Vec8f().load_a(next2Below2 + x)) * 0.5f) - e;
+ const Vec8f dc = d - c;
+ const Vec8f de = d - e;
+ const Vec8f maximum = max(max(de, dc), min(b, f));
+ const Vec8f minimum = min(min(de, dc), max(b, f));
+ const Vec8f diff = max(max(temporal_diff, minimum), -maximum);
+
+ const Vec8f interpol1 = ((coef_hf_f[0] * (Vec8f().load_a(prev2 + x) + Vec8f().load_a(next2 + x))
+ - coef_hf_f[1] * (Vec8f().load_a(prev2Above2 + x) + Vec8f().load_a(next2Above2 + x) + Vec8f().load_a(prev2Below2 + x) + Vec8f().load_a(next2Below2 + x))
+ + coef_hf_f[2] * (Vec8f().load_a(prev2Above4 + x) + Vec8f().load_a(next2Above4 + x) + Vec8f().load_a(prev2Below4 + x) + Vec8f().load_a(next2Below4 + x))) * 0.25f
+ + coef_lf_f[0] * (c + e) - coef_lf_f[1] * (Vec8f().load_a(curAbove3 + x) + Vec8f().load_a(curBelow3 + x)));
+ const Vec8f interpol2 = coef_sp_f[0] * (c + e) - coef_sp_f[1] * (Vec8f().load_a(curAbove3 + x) + Vec8f().load_a(curBelow3 + x));
+
+ Vec8f interpol = select(abs(c - e) > temporal_diff0, interpol1, interpol2);
+ interpol = min(max(interpol, d - diff), d + diff);
+ select(temporal_diff == 0, d, interpol).store_nt(dst + x);
+ }
+ }
+}
+
+template void filterEdge_avx2<uint8_t, false>(const void * _prev2, const void * _prev, const void * _cur, const void * _next, const void * _next2, void * _dst, const int width, const int positiveStride, const int negativeStride, const int stride2, const int step, const int peak) noexcept;
+template void filterEdge_avx2<uint8_t, true>(const void * _prev2, const void * _prev, const void * _cur, const void * _next, const void * _next2, void * _dst, const int width, const int positiveStride, const int negativeStride, const int stride2, const int step, const int peak) noexcept;
+template void filterEdge_avx2<uint16_t, false>(const void * _prev2, const void * _prev, const void * _cur, const void * _next, const void * _next2, void * _dst, const int width, const int positiveStride, const int negativeStride, const int stride2, const int step, const int peak) noexcept;
+template void filterEdge_avx2<uint16_t, true>(const void * _prev2, const void * _prev, const void * _cur, const void * _next, const void * _next2, void * _dst, const int width, const int positiveStride, const int negativeStride, const int stride2, const int step, const int peak) noexcept;
+template void filterEdge_avx2<float, false>(const void * _prev2, const void * _prev, const void * _cur, const void * _next, const void * _next2, void * _dst, const int width, const int positiveStride, const int negativeStride, const int stride2, const int step, const int peak) noexcept;
+template void filterEdge_avx2<float, true>(const void * _prev2, const void * _prev, const void * _cur, const void * _next, const void * _next2, void * _dst, const int width, const int positiveStride, const int negativeStride, const int stride2, const int step, const int peak) noexcept;
+
+template void filterLine_avx2<uint8_t>(const void * _prev2, const void * _prev, const void * _cur, const void * _next, const void * _next2, void * _dst, const int width, const int stride, const int stride2, const int stride3, const int stride4, const int step, const int peak) noexcept;
+template void filterLine_avx2<uint16_t>(const void * _prev2, const void * _prev, const void * _cur, const void * _next, const void * _next2, void * _dst, const int width, const int stride, const int stride2, const int stride3, const int stride4, const int step, const int peak) noexcept;
+template void filterLine_avx2<float>(const void * _prev2, const void * _prev, const void * _cur, const void * _next, const void * _next2, void * _dst, const int width, const int stride, const int stride2, const int stride3, const int stride4, const int step, const int peak) noexcept;
+#endif
diff --git a/Bwdif/Bwdif_AVX512.cpp b/Bwdif/Bwdif_AVX512.cpp
new file mode 100644
index 0000000..0390fc5
--- /dev/null
+++ b/Bwdif/Bwdif_AVX512.cpp
@@ -0,0 +1,224 @@
+#ifdef BWDIF_X86
+#include "Bwdif.h"
+
+template<typename pixel_t, bool spat>
+void filterEdge_avx512(const void * _prev2, const void * _prev, const void * _cur, const void * _next, const void * _next2, void * _dst, const int width,
+ const int positiveStride, const int negativeStride, const int stride2, const int step, const int peak) noexcept {
+ const pixel_t * prev2 = reinterpret_cast<const pixel_t *>(_prev2);
+ const pixel_t * prev = reinterpret_cast<const pixel_t *>(_prev);
+ const pixel_t * cur = reinterpret_cast<const pixel_t *>(_cur);
+ const pixel_t * next = reinterpret_cast<const pixel_t *>(_next);
+ const pixel_t * next2 = reinterpret_cast<const pixel_t *>(_next2);
+ pixel_t * dst = reinterpret_cast<pixel_t *>(_dst);
+
+ const pixel_t * prev2Above2 = prev2 - stride2;
+ const pixel_t * prev2Below2 = prev2 + stride2;
+ const pixel_t * prevAbove = prev + negativeStride;
+ const pixel_t * prevBelow = prev + positiveStride;
+ const pixel_t * curAbove = cur + negativeStride;
+ const pixel_t * curBelow = cur + positiveStride;
+ const pixel_t * nextAbove = next + negativeStride;
+ const pixel_t * nextBelow = next + positiveStride;
+ const pixel_t * next2Above2 = next2 - stride2;
+ const pixel_t * next2Below2 = next2 + stride2;
+
+ for (int x = 0; x < width; x += step) {
+ if constexpr (std::is_same_v<pixel_t, uint8_t>) {
+ const Vec32s c = Vec32s().load_32uc(curAbove + x);
+ const Vec32s d = (Vec32s().load_32uc(prev2 + x) + Vec32s().load_32uc(next2 + x)) >> 1;
+ const Vec32s e = Vec32s().load_32uc(curBelow + x);
+ const Vec32s temporal_diff0 = abs(Vec32s().load_32uc(prev2 + x) - Vec32s().load_32uc(next2 + x));
+ const Vec32s temporal_diff1 = (abs(Vec32s().load_32uc(prevAbove + x) - c) + abs(Vec32s().load_32uc(prevBelow + x) - e)) >> 1;
+ const Vec32s temporal_diff2 = (abs(Vec32s().load_32uc(nextAbove + x) - c) + abs(Vec32s().load_32uc(nextBelow + x) - e)) >> 1;
+ const Vec32s temporal_diff = max(max(temporal_diff0 >> 1, temporal_diff1), temporal_diff2);
+
+ Vec32s diff;
+ if constexpr (spat) {
+ const Vec32s b = ((Vec32s().load_32uc(prev2Above2 + x) + Vec32s().load_32uc(next2Above2 + x)) >> 1) - c;
+ const Vec32s f = ((Vec32s().load_32uc(prev2Below2 + x) + Vec32s().load_32uc(next2Below2 + x)) >> 1) - e;
+ const Vec32s dc = d - c;
+ const Vec32s de = d - e;
+ const Vec32s maximum = max(max(de, dc), min(b, f));
+ const Vec32s minimum = min(min(de, dc), max(b, f));
+ diff = max(max(temporal_diff, minimum), -maximum);
+ }
+
+ Vec32s interpol = min(max((c + e) >> 1, d - diff), d + diff);
+ interpol = select(temporal_diff == 0, d, interpol);
+ const auto result = compress_saturated_s2u(interpol, zero_si512()).get_low();
+ result.store_nt(dst + x);
+ } else if constexpr (std::is_same_v<pixel_t, uint16_t>) {
+ const Vec16i c = Vec16i().load_16us(curAbove + x);
+ const Vec16i d = (Vec16i().load_16us(prev2 + x) + Vec16i().load_16us(next2 + x)) >> 1;
+ const Vec16i e = Vec16i().load_16us(curBelow + x);
+ const Vec16i temporal_diff0 = abs(Vec16i().load_16us(prev2 + x) - Vec16i().load_16us(next2 + x));
+ const Vec16i temporal_diff1 = (abs(Vec16i().load_16us(prevAbove + x) - c) + abs(Vec16i().load_16us(prevBelow + x) - e)) >> 1;
+ const Vec16i temporal_diff2 = (abs(Vec16i().load_16us(nextAbove + x) - c) + abs(Vec16i().load_16us(nextBelow + x) - e)) >> 1;
+ const Vec16i temporal_diff = max(max(temporal_diff0 >> 1, temporal_diff1), temporal_diff2);
+
+ Vec16i diff;
+ if constexpr (spat) {
+ const Vec16i b = ((Vec16i().load_16us(prev2Above2 + x) + Vec16i().load_16us(next2Above2 + x)) >> 1) - c;
+ const Vec16i f = ((Vec16i().load_16us(prev2Below2 + x) + Vec16i().load_16us(next2Below2 + x)) >> 1) - e;
+ const Vec16i dc = d - c;
+ const Vec16i de = d - e;
+ const Vec16i maximum = max(max(de, dc), min(b, f));
+ const Vec16i minimum = min(min(de, dc), max(b, f));
+ diff = max(max(temporal_diff, minimum), -maximum);
+ }
+
+ Vec16i interpol = min(max((c + e) >> 1, d - diff), d + diff);
+ interpol = select(temporal_diff == 0, d, interpol);
+ const auto result = compress_saturated_s2u(interpol, zero_si512()).get_low();
+ min(result, peak).store_nt(dst + x);
+ } else {
+ const Vec16f c = Vec16f().load_a(curAbove + x);
+ const Vec16f d = (Vec16f().load_a(prev2 + x) + Vec16f().load_a(next2 + x)) * 0.5f;
+ const Vec16f e = Vec16f().load_a(curBelow + x);
+ const Vec16f temporal_diff0 = abs(Vec16f().load_a(prev2 + x) - Vec16f().load_a(next2 + x));
+ const Vec16f temporal_diff1 = (abs(Vec16f().load_a(prevAbove + x) - c) + abs(Vec16f().load_a(prevBelow + x) - e)) * 0.5f;
+ const Vec16f temporal_diff2 = (abs(Vec16f().load_a(nextAbove + x) - c) + abs(Vec16f().load_a(nextBelow + x) - e)) * 0.5f;
+ const Vec16f temporal_diff = max(max(temporal_diff0 * 0.5f, temporal_diff1), temporal_diff2);
+
+ Vec16f diff;
+ if constexpr (spat) {
+ const Vec16f b = ((Vec16f().load_a(prev2Above2 + x) + Vec16f().load_a(next2Above2 + x)) * 0.5f) - c;
+ const Vec16f f = ((Vec16f().load_a(prev2Below2 + x) + Vec16f().load_a(next2Below2 + x)) * 0.5f) - e;
+ const Vec16f dc = d - c;
+ const Vec16f de = d - e;
+ const Vec16f maximum = max(max(de, dc), min(b, f));
+ const Vec16f minimum = min(min(de, dc), max(b, f));
+ diff = max(max(temporal_diff, minimum), -maximum);
+ }
+
+ const Vec16f interpol = min(max((c + e) * 0.5f, d - diff), d + diff);
+ select(temporal_diff == 0, d, interpol).store_nt(dst + x);
+ }
+ }
+}
+
+template<typename pixel_t>
+void filterLine_avx512(const void * _prev2, const void * _prev, const void * _cur, const void * _next, const void * _next2, void * _dst, const int width,
+ const int stride, const int stride2, const int stride3, const int stride4, const int step, const int peak) noexcept {
+ const pixel_t * prev2 = reinterpret_cast<const pixel_t *>(_prev2);
+ const pixel_t * prev = reinterpret_cast<const pixel_t *>(_prev);
+ const pixel_t * cur = reinterpret_cast<const pixel_t *>(_cur);
+ const pixel_t * next = reinterpret_cast<const pixel_t *>(_next);
+ const pixel_t * next2 = reinterpret_cast<const pixel_t *>(_next2);
+ pixel_t * dst = reinterpret_cast<pixel_t *>(_dst);
+
+ const pixel_t * prev2Above4 = prev2 - stride4;
+ const pixel_t * prev2Above2 = prev2 - stride2;
+ const pixel_t * prev2Below2 = prev2 + stride2;
+ const pixel_t * prev2Below4 = prev2 + stride4;
+ const pixel_t * prevAbove = prev - stride;
+ const pixel_t * prevBelow = prev + stride;
+ const pixel_t * curAbove3 = cur - stride3;
+ const pixel_t * curAbove = cur - stride;
+ const pixel_t * curBelow = cur + stride;
+ const pixel_t * curBelow3 = cur + stride3;
+ const pixel_t * nextAbove = next - stride;
+ const pixel_t * nextBelow = next + stride;
+ const pixel_t * next2Above4 = next2 - stride4;
+ const pixel_t * next2Above2 = next2 - stride2;
+ const pixel_t * next2Below2 = next2 + stride2;
+ const pixel_t * next2Below4 = next2 + stride4;
+
+ for (int x = 0; x < width; x += step) {
+ if constexpr (std::is_same_v<pixel_t, uint8_t>) {
+ const Vec16i c = Vec16i().load_16uc(curAbove + x);
+ const Vec16i d = (Vec16i().load_16uc(prev2 + x) + Vec16i().load_16uc(next2 + x)) >> 1;
+ const Vec16i e = Vec16i().load_16uc(curBelow + x);
+ const Vec16i temporal_diff0 = abs(Vec16i().load_16uc(prev2 + x) - Vec16i().load_16uc(next2 + x));
+ const Vec16i temporal_diff1 = (abs(Vec16i().load_16uc(prevAbove + x) - c) + abs(Vec16i().load_16uc(prevBelow + x) - e)) >> 1;
+ const Vec16i temporal_diff2 = (abs(Vec16i().load_16uc(nextAbove + x) - c) + abs(Vec16i().load_16uc(nextBelow + x) - e)) >> 1;
+ const Vec16i temporal_diff = max(max(temporal_diff0 >> 1, temporal_diff1), temporal_diff2);
+
+ const Vec16i b = ((Vec16i().load_16uc(prev2Above2 + x) + Vec16i().load_16uc(next2Above2 + x)) >> 1) - c;
+ const Vec16i f = ((Vec16i().load_16uc(prev2Below2 + x) + Vec16i().load_16uc(next2Below2 + x)) >> 1) - e;
+ const Vec16i dc = d - c;
+ const Vec16i de = d - e;
+ const Vec16i maximum = max(max(de, dc), min(b, f));
+ const Vec16i minimum = min(min(de, dc), max(b, f));
+ const Vec16i diff = max(max(temporal_diff, minimum), -maximum);
+
+ const Vec16i interpol1 = (((coef_hf[0] * (Vec16i().load_16uc(prev2 + x) + Vec16i().load_16uc(next2 + x))
+ - coef_hf[1] * (Vec16i().load_16uc(prev2Above2 + x) + Vec16i().load_16uc(next2Above2 + x) + Vec16i().load_16uc(prev2Below2 + x) + Vec16i().load_16uc(next2Below2 + x))
+ + coef_hf[2] * (Vec16i().load_16uc(prev2Above4 + x) + Vec16i().load_16uc(next2Above4 + x) + Vec16i().load_16uc(prev2Below4 + x) + Vec16i().load_16uc(next2Below4 + x))) >> 2)
+ + coef_lf[0] * (c + e) - coef_lf[1] * (Vec16i().load_16uc(curAbove3 + x) + Vec16i().load_16uc(curBelow3 + x))) >> 13;
+ const Vec16i interpol2 = (coef_sp[0] * (c + e) - coef_sp[1] * (Vec16i().load_16uc(curAbove3 + x) + Vec16i().load_16uc(curBelow3 + x))) >> 13;
+
+ Vec16i interpol = select(abs(c - e) > temporal_diff0, interpol1, interpol2);
+ interpol = min(max(interpol, d - diff), d + diff);
+ interpol = select(temporal_diff == 0, d, interpol);
+ const auto result = compress_saturated_s2u(compress_saturated(interpol, zero_si512()), zero_si512()).get_low().get_low();
+ result.store_nt(dst + x);
+ } else if constexpr (std::is_same_v<pixel_t, uint16_t>) {
+ const Vec16i c = Vec16i().load_16us(curAbove + x);
+ const Vec16i d = (Vec16i().load_16us(prev2 + x) + Vec16i().load_16us(next2 + x)) >> 1;
+ const Vec16i e = Vec16i().load_16us(curBelow + x);
+ const Vec16i temporal_diff0 = abs(Vec16i().load_16us(prev2 + x) - Vec16i().load_16us(next2 + x));
+ const Vec16i temporal_diff1 = (abs(Vec16i().load_16us(prevAbove + x) - c) + abs(Vec16i().load_16us(prevBelow + x) - e)) >> 1;
+ const Vec16i temporal_diff2 = (abs(Vec16i().load_16us(nextAbove + x) - c) + abs(Vec16i().load_16us(nextBelow + x) - e)) >> 1;
+ const Vec16i temporal_diff = max(max(temporal_diff0 >> 1, temporal_diff1), temporal_diff2);
+
+ const Vec16i b = ((Vec16i().load_16us(prev2Above2 + x) + Vec16i().load_16us(next2Above2 + x)) >> 1) - c;
+ const Vec16i f = ((Vec16i().load_16us(prev2Below2 + x) + Vec16i().load_16us(next2Below2 + x)) >> 1) - e;
+ const Vec16i dc = d - c;
+ const Vec16i de = d - e;
+ const Vec16i maximum = max(max(de, dc), min(b, f));
+ const Vec16i minimum = min(min(de, dc), max(b, f));
+ const Vec16i diff = max(max(temporal_diff, minimum), -maximum);
+
+ const Vec16i interpol1 = (((coef_hf[0] * (Vec16i().load_16us(prev2 + x) + Vec16i().load_16us(next2 + x))
+ - coef_hf[1] * (Vec16i().load_16us(prev2Above2 + x) + Vec16i().load_16us(next2Above2 + x) + Vec16i().load_16us(prev2Below2 + x) + Vec16i().load_16us(next2Below2 + x))
+ + coef_hf[2] * (Vec16i().load_16us(prev2Above4 + x) + Vec16i().load_16us(next2Above4 + x) + Vec16i().load_16us(prev2Below4 + x) + Vec16i().load_16us(next2Below4 + x))) >> 2)
+ + coef_lf[0] * (c + e) - coef_lf[1] * (Vec16i().load_16us(curAbove3 + x) + Vec16i().load_16us(curBelow3 + x))) >> 13;
+ const Vec16i interpol2 = (coef_sp[0] * (c + e) - coef_sp[1] * (Vec16i().load_16us(curAbove3 + x) + Vec16i().load_16us(curBelow3 + x))) >> 13;
+
+ Vec16i interpol = select(abs(c - e) > temporal_diff0, interpol1, interpol2);
+ interpol = min(max(interpol, d - diff), d + diff);
+ interpol = select(temporal_diff == 0, d, interpol);
+ const auto result = compress_saturated_s2u(interpol, zero_si512()).get_low();
+ min(result, peak).store_nt(dst + x);
+ } else {
+ const Vec16f c = Vec16f().load_a(curAbove + x);
+ const Vec16f d = (Vec16f().load_a(prev2 + x) + Vec16f().load_a(next2 + x)) * 0.5f;
+ const Vec16f e = Vec16f().load_a(curBelow + x);
+ const Vec16f temporal_diff0 = abs(Vec16f().load_a(prev2 + x) - Vec16f().load_a(next2 + x));
+ const Vec16f temporal_diff1 = (abs(Vec16f().load_a(prevAbove + x) - c) + abs(Vec16f().load_a(prevBelow + x) - e)) * 0.5f;
+ const Vec16f temporal_diff2 = (abs(Vec16f().load_a(nextAbove + x) - c) + abs(Vec16f().load_a(nextBelow + x) - e)) * 0.5f;
+ const Vec16f temporal_diff = max(max(temporal_diff0 * 0.5f, temporal_diff1), temporal_diff2);
+
+ const Vec16f b = ((Vec16f().load_a(prev2Above2 + x) + Vec16f().load_a(next2Above2 + x)) * 0.5f) - c;
+ const Vec16f f = ((Vec16f().load_a(prev2Below2 + x) + Vec16f().load_a(next2Below2 + x)) * 0.5f) - e;
+ const Vec16f dc = d - c;
+ const Vec16f de = d - e;
+ const Vec16f maximum = max(max(de, dc), min(b, f));
+ const Vec16f minimum = min(min(de, dc), max(b, f));
+ const Vec16f diff = max(max(temporal_diff, minimum), -maximum);
+
+ const Vec16f interpol1 = ((coef_hf_f[0] * (Vec16f().load_a(prev2 + x) + Vec16f().load_a(next2 + x))
+ - coef_hf_f[1] * (Vec16f().load_a(prev2Above2 + x) + Vec16f().load_a(next2Above2 + x) + Vec16f().load_a(prev2Below2 + x) + Vec16f().load_a(next2Below2 + x))
+ + coef_hf_f[2] * (Vec16f().load_a(prev2Above4 + x) + Vec16f().load_a(next2Above4 + x) + Vec16f().load_a(prev2Below4 + x) + Vec16f().load_a(next2Below4 + x))) * 0.25f
+ + coef_lf_f[0] * (c + e) - coef_lf_f[1] * (Vec16f().load_a(curAbove3 + x) + Vec16f().load_a(curBelow3 + x)));
+ const Vec16f interpol2 = coef_sp_f[0] * (c + e) - coef_sp_f[1] * (Vec16f().load_a(curAbove3 + x) + Vec16f().load_a(curBelow3 + x));
+
+ Vec16f interpol = select(abs(c - e) > temporal_diff0, interpol1, interpol2);
+ interpol = min(max(interpol, d - diff), d + diff);
+ select(temporal_diff == 0, d, interpol).store_nt(dst + x);
+ }
+ }
+}
+
+template void filterEdge_avx512<uint8_t, false>(const void * _prev2, const void * _prev, const void * _cur, const void * _next, const void * _next2, void * _dst, const int width, const int positiveStride, const int negativeStride, const int stride2, const int step, const int peak) noexcept;
+template void filterEdge_avx512<uint8_t, true>(const void * _prev2, const void * _prev, const void * _cur, const void * _next, const void * _next2, void * _dst, const int width, const int positiveStride, const int negativeStride, const int stride2, const int step, const int peak) noexcept;
+template void filterEdge_avx512<uint16_t, false>(const void * _prev2, const void * _prev, const void * _cur, const void * _next, const void * _next2, void * _dst, const int width, const int positiveStride, const int negativeStride, const int stride2, const int step, const int peak) noexcept;
+template void filterEdge_avx512<uint16_t, true>(const void * _prev2, const void * _prev, const void * _cur, const void * _next, const void * _next2, void * _dst, const int width, const int positiveStride, const int negativeStride, const int stride2, const int step, const int peak) noexcept;
+template void filterEdge_avx512<float, false>(const void * _prev2, const void * _prev, const void * _cur, const void * _next, const void * _next2, void * _dst, const int width, const int positiveStride, const int negativeStride, const int stride2, const int step, const int peak) noexcept;
+template void filterEdge_avx512<float, true>(const void * _prev2, const void * _prev, const void * _cur, const void * _next, const void * _next2, void * _dst, const int width, const int positiveStride, const int negativeStride, const int stride2, const int step, const int peak) noexcept;
+
+template void filterLine_avx512<uint8_t>(const void * _prev2, const void * _prev, const void * _cur, const void * _next, const void * _next2, void * _dst, const int width, const int stride, const int stride2, const int stride3, const int stride4, const int step, const int peak) noexcept;
+template void filterLine_avx512<uint16_t>(const void * _prev2, const void * _prev, const void * _cur, const void * _next, const void * _next2, void * _dst, const int width, const int stride, const int stride2, const int stride3, const int stride4, const int step, const int peak) noexcept;
+template void filterLine_avx512<float>(const void * _prev2, const void * _prev, const void * _cur, const void * _next, const void * _next2, void * _dst, const int width, const int stride, const int stride2, const int stride3, const int stride4, const int step, const int peak) noexcept;
+#endif
diff --git a/Bwdif/Bwdif_SSE2.cpp b/Bwdif/Bwdif_SSE2.cpp
new file mode 100644
index 0000000..e54a6e7
--- /dev/null
+++ b/Bwdif/Bwdif_SSE2.cpp
@@ -0,0 +1,224 @@
+#ifdef BWDIF_X86
+#include "Bwdif.h"
+
+template<typename pixel_t, bool spat>
+void filterEdge_sse2(const void * _prev2, const void * _prev, const void * _cur, const void * _next, const void * _next2, void * _dst, const int width,
+ const int positiveStride, const int negativeStride, const int stride2, const int step, const int peak) noexcept {
+ const pixel_t * prev2 = reinterpret_cast<const pixel_t *>(_prev2);
+ const pixel_t * prev = reinterpret_cast<const pixel_t *>(_prev);
+ const pixel_t * cur = reinterpret_cast<const pixel_t *>(_cur);
+ const pixel_t * next = reinterpret_cast<const pixel_t *>(_next);
+ const pixel_t * next2 = reinterpret_cast<const pixel_t *>(_next2);
+ pixel_t * dst = reinterpret_cast<pixel_t *>(_dst);
+
+ const pixel_t * prev2Above2 = prev2 - stride2;
+ const pixel_t * prev2Below2 = prev2 + stride2;
+ const pixel_t * prevAbove = prev + negativeStride;
+ const pixel_t * prevBelow = prev + positiveStride;
+ const pixel_t * curAbove = cur + negativeStride;
+ const pixel_t * curBelow = cur + positiveStride;
+ const pixel_t * nextAbove = next + negativeStride;
+ const pixel_t * nextBelow = next + positiveStride;
+ const pixel_t * next2Above2 = next2 - stride2;
+ const pixel_t * next2Below2 = next2 + stride2;
+
+ for (int x = 0; x < width; x += step) {
+ if constexpr (std::is_same_v<pixel_t, uint8_t>) {
+ const Vec8s c = Vec8s().load_8uc(curAbove + x);
+ const Vec8s d = (Vec8s().load_8uc(prev2 + x) + Vec8s().load_8uc(next2 + x)) >> 1;
+ const Vec8s e = Vec8s().load_8uc(curBelow + x);
+ const Vec8s temporal_diff0 = abs(Vec8s().load_8uc(prev2 + x) - Vec8s().load_8uc(next2 + x));
+ const Vec8s temporal_diff1 = (abs(Vec8s().load_8uc(prevAbove + x) - c) + abs(Vec8s().load_8uc(prevBelow + x) - e)) >> 1;
+ const Vec8s temporal_diff2 = (abs(Vec8s().load_8uc(nextAbove + x) - c) + abs(Vec8s().load_8uc(nextBelow + x) - e)) >> 1;
+ const Vec8s temporal_diff = max(max(temporal_diff0 >> 1, temporal_diff1), temporal_diff2);
+
+ Vec8s diff;
+ if constexpr (spat) {
+ const Vec8s b = ((Vec8s().load_8uc(prev2Above2 + x) + Vec8s().load_8uc(next2Above2 + x)) >> 1) - c;
+ const Vec8s f = ((Vec8s().load_8uc(prev2Below2 + x) + Vec8s().load_8uc(next2Below2 + x)) >> 1) - e;
+ const Vec8s dc = d - c;
+ const Vec8s de = d - e;
+ const Vec8s maximum = max(max(de, dc), min(b, f));
+ const Vec8s minimum = min(min(de, dc), max(b, f));
+ diff = max(max(temporal_diff, minimum), -maximum);
+ }
+
+ Vec8s interpol = min(max((c + e) >> 1, d - diff), d + diff);
+ interpol = select(temporal_diff == 0, d, interpol);
+ const auto result = compress_saturated_s2u(interpol, zero_si128());
+ result.storel(dst + x);
+ } else if constexpr (std::is_same_v<pixel_t, uint16_t>) {
+ const Vec4i c = Vec4i().load_4us(curAbove + x);
+ const Vec4i d = (Vec4i().load_4us(prev2 + x) + Vec4i().load_4us(next2 + x)) >> 1;
+ const Vec4i e = Vec4i().load_4us(curBelow + x);
+ const Vec4i temporal_diff0 = abs(Vec4i().load_4us(prev2 + x) - Vec4i().load_4us(next2 + x));
+ const Vec4i temporal_diff1 = (abs(Vec4i().load_4us(prevAbove + x) - c) + abs(Vec4i().load_4us(prevBelow + x) - e)) >> 1;
+ const Vec4i temporal_diff2 = (abs(Vec4i().load_4us(nextAbove + x) - c) + abs(Vec4i().load_4us(nextBelow + x) - e)) >> 1;
+ const Vec4i temporal_diff = max(max(temporal_diff0 >> 1, temporal_diff1), temporal_diff2);
+
+ Vec4i diff;
+ if constexpr (spat) {
+ const Vec4i b = ((Vec4i().load_4us(prev2Above2 + x) + Vec4i().load_4us(next2Above2 + x)) >> 1) - c;
+ const Vec4i f = ((Vec4i().load_4us(prev2Below2 + x) + Vec4i().load_4us(next2Below2 + x)) >> 1) - e;
+ const Vec4i dc = d - c;
+ const Vec4i de = d - e;
+ const Vec4i maximum = max(max(de, dc), min(b, f));
+ const Vec4i minimum = min(min(de, dc), max(b, f));
+ diff = max(max(temporal_diff, minimum), -maximum);
+ }
+
+ Vec4i interpol = min(max((c + e) >> 1, d - diff), d + diff);
+ interpol = select(temporal_diff == 0, d, interpol);
+ const auto result = compress_saturated_s2u(interpol, zero_si128());
+ min(result, peak).storel(dst + x);
+ } else {
+ const Vec4f c = Vec4f().load_a(curAbove + x);
+ const Vec4f d = (Vec4f().load_a(prev2 + x) + Vec4f().load_a(next2 + x)) * 0.5f;
+ const Vec4f e = Vec4f().load_a(curBelow + x);
+ const Vec4f temporal_diff0 = abs(Vec4f().load_a(prev2 + x) - Vec4f().load_a(next2 + x));
+ const Vec4f temporal_diff1 = (abs(Vec4f().load_a(prevAbove + x) - c) + abs(Vec4f().load_a(prevBelow + x) - e)) * 0.5f;
+ const Vec4f temporal_diff2 = (abs(Vec4f().load_a(nextAbove + x) - c) + abs(Vec4f().load_a(nextBelow + x) - e)) * 0.5f;
+ const Vec4f temporal_diff = max(max(temporal_diff0 * 0.5f, temporal_diff1), temporal_diff2);
+
+ Vec4f diff;
+ if constexpr (spat) {
+ const Vec4f b = ((Vec4f().load_a(prev2Above2 + x) + Vec4f().load_a(next2Above2 + x)) * 0.5f) - c;
+ const Vec4f f = ((Vec4f().load_a(prev2Below2 + x) + Vec4f().load_a(next2Below2 + x)) * 0.5f) - e;
+ const Vec4f dc = d - c;
+ const Vec4f de = d - e;
+ const Vec4f maximum = max(max(de, dc), min(b, f));
+ const Vec4f minimum = min(min(de, dc), max(b, f));
+ diff = max(max(temporal_diff, minimum), -maximum);
+ }
+
+ const Vec4f interpol = min(max((c + e) * 0.5f, d - diff), d + diff);
+ select(temporal_diff == 0, d, interpol).store_nt(dst + x);
+ }
+ }
+}
+
+template<typename pixel_t>
+void filterLine_sse2(const void * _prev2, const void * _prev, const void * _cur, const void * _next, const void * _next2, void * _dst, const int width,
+ const int stride, const int stride2, const int stride3, const int stride4, const int step, const int peak) noexcept {
+ const pixel_t * prev2 = reinterpret_cast<const pixel_t *>(_prev2);
+ const pixel_t * prev = reinterpret_cast<const pixel_t *>(_prev);
+ const pixel_t * cur = reinterpret_cast<const pixel_t *>(_cur);
+ const pixel_t * next = reinterpret_cast<const pixel_t *>(_next);
+ const pixel_t * next2 = reinterpret_cast<const pixel_t *>(_next2);
+ pixel_t * dst = reinterpret_cast<pixel_t *>(_dst);
+
+ const pixel_t * prev2Above4 = prev2 - stride4;
+ const pixel_t * prev2Above2 = prev2 - stride2;
+ const pixel_t * prev2Below2 = prev2 + stride2;
+ const pixel_t * prev2Below4 = prev2 + stride4;
+ const pixel_t * prevAbove = prev - stride;
+ const pixel_t * prevBelow = prev + stride;
+ const pixel_t * curAbove3 = cur - stride3;
+ const pixel_t * curAbove = cur - stride;
+ const pixel_t * curBelow = cur + stride;
+ const pixel_t * curBelow3 = cur + stride3;
+ const pixel_t * nextAbove = next - stride;
+ const pixel_t * nextBelow = next + stride;
+ const pixel_t * next2Above4 = next2 - stride4;
+ const pixel_t * next2Above2 = next2 - stride2;
+ const pixel_t * next2Below2 = next2 + stride2;
+ const pixel_t * next2Below4 = next2 + stride4;
+
+ for (int x = 0; x < width; x += step) {
+ if constexpr (std::is_same_v<pixel_t, uint8_t>) {
+ const Vec4i c = Vec4i().load_4uc(curAbove + x);
+ const Vec4i d = (Vec4i().load_4uc(prev2 + x) + Vec4i().load_4uc(next2 + x)) >> 1;
+ const Vec4i e = Vec4i().load_4uc(curBelow + x);
+ const Vec4i temporal_diff0 = abs(Vec4i().load_4uc(prev2 + x) - Vec4i().load_4uc(next2 + x));
+ const Vec4i temporal_diff1 = (abs(Vec4i().load_4uc(prevAbove + x) - c) + abs(Vec4i().load_4uc(prevBelow + x) - e)) >> 1;
+ const Vec4i temporal_diff2 = (abs(Vec4i().load_4uc(nextAbove + x) - c) + abs(Vec4i().load_4uc(nextBelow + x) - e)) >> 1;
+ const Vec4i temporal_diff = max(max(temporal_diff0 >> 1, temporal_diff1), temporal_diff2);
+
+ const Vec4i b = ((Vec4i().load_4uc(prev2Above2 + x) + Vec4i().load_4uc(next2Above2 + x)) >> 1) - c;
+ const Vec4i f = ((Vec4i().load_4uc(prev2Below2 + x) + Vec4i().load_4uc(next2Below2 + x)) >> 1) - e;
+ const Vec4i dc = d - c;
+ const Vec4i de = d - e;
+ const Vec4i maximum = max(max(de, dc), min(b, f));
+ const Vec4i minimum = min(min(de, dc), max(b, f));
+ const Vec4i diff = max(max(temporal_diff, minimum), -maximum);
+
+ const Vec4i interpol1 = (((coef_hf[0] * (Vec4i().load_4uc(prev2 + x) + Vec4i().load_4uc(next2 + x))
+ - coef_hf[1] * (Vec4i().load_4uc(prev2Above2 + x) + Vec4i().load_4uc(next2Above2 + x) + Vec4i().load_4uc(prev2Below2 + x) + Vec4i().load_4uc(next2Below2 + x))
+ + coef_hf[2] * (Vec4i().load_4uc(prev2Above4 + x) + Vec4i().load_4uc(next2Above4 + x) + Vec4i().load_4uc(prev2Below4 + x) + Vec4i().load_4uc(next2Below4 + x))) >> 2)
+ + coef_lf[0] * (c + e) - coef_lf[1] * (Vec4i().load_4uc(curAbove3 + x) + Vec4i().load_4uc(curBelow3 + x))) >> 13;
+ const Vec4i interpol2 = (coef_sp[0] * (c + e) - coef_sp[1] * (Vec4i().load_4uc(curAbove3 + x) + Vec4i().load_4uc(curBelow3 + x))) >> 13;
+
+ Vec4i interpol = select(abs(c - e) > temporal_diff0, interpol1, interpol2);
+ interpol = min(max(interpol, d - diff), d + diff);
+ interpol = select(temporal_diff == 0, d, interpol);
+ const auto result = compress_saturated_s2u(compress_saturated(interpol, zero_si128()), zero_si128());
+ result.store_si32(dst + x);
+ } else if constexpr (std::is_same_v<pixel_t, uint16_t>) {
+ const Vec4i c = Vec4i().load_4us(curAbove + x);
+ const Vec4i d = (Vec4i().load_4us(prev2 + x) + Vec4i().load_4us(next2 + x)) >> 1;
+ const Vec4i e = Vec4i().load_4us(curBelow + x);
+ const Vec4i temporal_diff0 = abs(Vec4i().load_4us(prev2 + x) - Vec4i().load_4us(next2 + x));
+ const Vec4i temporal_diff1 = (abs(Vec4i().load_4us(prevAbove + x) - c) + abs(Vec4i().load_4us(prevBelow + x) - e)) >> 1;
+ const Vec4i temporal_diff2 = (abs(Vec4i().load_4us(nextAbove + x) - c) + abs(Vec4i().load_4us(nextBelow + x) - e)) >> 1;
+ const Vec4i temporal_diff = max(max(temporal_diff0 >> 1, temporal_diff1), temporal_diff2);
+
+ const Vec4i b = ((Vec4i().load_4us(prev2Above2 + x) + Vec4i().load_4us(next2Above2 + x)) >> 1) - c;
+ const Vec4i f = ((Vec4i().load_4us(prev2Below2 + x) + Vec4i().load_4us(next2Below2 + x)) >> 1) - e;
+ const Vec4i dc = d - c;
+ const Vec4i de = d - e;
+ const Vec4i maximum = max(max(de, dc), min(b, f));
+ const Vec4i minimum = min(min(de, dc), max(b, f));
+ const Vec4i diff = max(max(temporal_diff, minimum), -maximum);
+
+ const Vec4i interpol1 = (((coef_hf[0] * (Vec4i().load_4us(prev2 + x) + Vec4i().load_4us(next2 + x))
+ - coef_hf[1] * (Vec4i().load_4us(prev2Above2 + x) + Vec4i().load_4us(next2Above2 + x) + Vec4i().load_4us(prev2Below2 + x) + Vec4i().load_4us(next2Below2 + x))
+ + coef_hf[2] * (Vec4i().load_4us(prev2Above4 + x) + Vec4i().load_4us(next2Above4 + x) + Vec4i().load_4us(prev2Below4 + x) + Vec4i().load_4us(next2Below4 + x))) >> 2)
+ + coef_lf[0] * (c + e) - coef_lf[1] * (Vec4i().load_4us(curAbove3 + x) + Vec4i().load_4us(curBelow3 + x))) >> 13;
+ const Vec4i interpol2 = (coef_sp[0] * (c + e) - coef_sp[1] * (Vec4i().load_4us(curAbove3 + x) + Vec4i().load_4us(curBelow3 + x))) >> 13;
+
+ Vec4i interpol = select(abs(c - e) > temporal_diff0, interpol1, interpol2);
+ interpol = min(max(interpol, d - diff), d + diff);
+ interpol = select(temporal_diff == 0, d, interpol);
+ const auto result = compress_saturated_s2u(interpol, zero_si128());
+ min(result, peak).storel(dst + x);
+ } else {
+ const Vec4f c = Vec4f().load_a(curAbove + x);
+ const Vec4f d = (Vec4f().load_a(prev2 + x) + Vec4f().load_a(next2 + x)) * 0.5f;
+ const Vec4f e = Vec4f().load_a(curBelow + x);
+ const Vec4f temporal_diff0 = abs(Vec4f().load_a(prev2 + x) - Vec4f().load_a(next2 + x));
+ const Vec4f temporal_diff1 = (abs(Vec4f().load_a(prevAbove + x) - c) + abs(Vec4f().load_a(prevBelow + x) - e)) * 0.5f;
+ const Vec4f temporal_diff2 = (abs(Vec4f().load_a(nextAbove + x) - c) + abs(Vec4f().load_a(nextBelow + x) - e)) * 0.5f;
+ const Vec4f temporal_diff = max(max(temporal_diff0 * 0.5f, temporal_diff1), temporal_diff2);
+
+ const Vec4f b = ((Vec4f().load_a(prev2Above2 + x) + Vec4f().load_a(next2Above2 + x)) * 0.5f) - c;
+ const Vec4f f = ((Vec4f().load_a(prev2Below2 + x) + Vec4f().load_a(next2Below2 + x)) * 0.5f) - e;
+ const Vec4f dc = d - c;
+ const Vec4f de = d - e;
+ const Vec4f maximum = max(max(de, dc), min(b, f));
+ const Vec4f minimum = min(min(de, dc), max(b, f));
+ const Vec4f diff = max(max(temporal_diff, minimum), -maximum);
+
+ const Vec4f interpol1 = ((coef_hf_f[0] * (Vec4f().load_a(prev2 + x) + Vec4f().load_a(next2 + x))
+ - coef_hf_f[1] * (Vec4f().load_a(prev2Above2 + x) + Vec4f().load_a(next2Above2 + x) + Vec4f().load_a(prev2Below2 + x) + Vec4f().load_a(next2Below2 + x))
+ + coef_hf_f[2] * (Vec4f().load_a(prev2Above4 + x) + Vec4f().load_a(next2Above4 + x) + Vec4f().load_a(prev2Below4 + x) + Vec4f().load_a(next2Below4 + x))) * 0.25f
+ + coef_lf_f[0] * (c + e) - coef_lf_f[1] * (Vec4f().load_a(curAbove3 + x) + Vec4f().load_a(curBelow3 + x)));
+ const Vec4f interpol2 = coef_sp_f[0] * (c + e) - coef_sp_f[1] * (Vec4f().load_a(curAbove3 + x) + Vec4f().load_a(curBelow3 + x));
+
+ Vec4f interpol = select(abs(c - e) > temporal_diff0, interpol1, interpol2);
+ interpol = min(max(interpol, d - diff), d + diff);
+ select(temporal_diff == 0, d, interpol).store_nt(dst + x);
+ }
+ }
+}
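+
+/* Illustrative scalar summary of the per-pixel step implemented by filterLine above
+   (names match the SIMD variables; this is pseudocode, not compiled):
+     d             = (prev2 + next2) / 2;                    // temporal average for the missing line
+     temporal_diff = max(temporal_diff0 / 2, temporal_diff1, temporal_diff2);
+     diff          = max(temporal_diff, minimum, -maximum);  // tightened by the spatial check (b, f)
+     interpol      = |c - e| > temporal_diff0 ? interpol1    // sharp vertical filter (coef_lf/coef_hf)
+                                              : interpol2;   // spatial-only filter (coef_sp)
+     interpol      = clamp(interpol, d - diff, d + diff);
+     dst           = temporal_diff == 0 ? d : interpol;
+*/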
+
+template void filterEdge_sse2<uint8_t, true>(const void * _prev2, const void * _prev, const void * _cur, const void * _next, const void * _next2, void * _dst, const int width, const int positiveStride, const int negativeStride, const int stride2, const int step, const int peak) noexcept;
+template void filterEdge_sse2<uint8_t, false>(const void * _prev2, const void * _prev, const void * _cur, const void * _next, const void * _next2, void * _dst, const int width, const int positiveStride, const int negativeStride, const int stride2, const int step, const int peak) noexcept;
+template void filterEdge_sse2<uint16_t, true>(const void * _prev2, const void * _prev, const void * _cur, const void * _next, const void * _next2, void * _dst, const int width, const int positiveStride, const int negativeStride, const int stride2, const int step, const int peak) noexcept;
+template void filterEdge_sse2<uint16_t, false>(const void * _prev2, const void * _prev, const void * _cur, const void * _next, const void * _next2, void * _dst, const int width, const int positiveStride, const int negativeStride, const int stride2, const int step, const int peak) noexcept;
+template void filterEdge_sse2<float, true>(const void * _prev2, const void * _prev, const void * _cur, const void * _next, const void * _next2, void * _dst, const int width, const int positiveStride, const int negativeStride, const int stride2, const int step, const int peak) noexcept;
+template void filterEdge_sse2<float, false>(const void * _prev2, const void * _prev, const void * _cur, const void * _next, const void * _next2, void * _dst, const int width, const int positiveStride, const int negativeStride, const int stride2, const int step, const int peak) noexcept;
+
+template void filterLine_sse2<uint8_t>(const void * _prev2, const void * _prev, const void * _cur, const void * _next, const void * _next2, void * _dst, const int width, const int stride, const int stride2, const int stride3, const int stride4, const int step, const int peak) noexcept;
+template void filterLine_sse2<uint16_t>(const void * _prev2, const void * _prev, const void * _cur, const void * _next, const void * _next2, void * _dst, const int width, const int stride, const int stride2, const int stride3, const int stride4, const int step, const int peak) noexcept;
+template void filterLine_sse2<float>(const void * _prev2, const void * _prev, const void * _cur, const void * _next, const void * _next2, void * _dst, const int width, const int stride, const int stride2, const int stride3, const int stride4, const int step, const int peak) noexcept;
+#endif
diff --git a/Bwdif/VCL2/LICENSE b/Bwdif/VCL2/LICENSE
new file mode 100644
index 0000000..fd2deda
--- /dev/null
+++ b/Bwdif/VCL2/LICENSE
@@ -0,0 +1,191 @@
+ Apache License
+ Version 2.0, January 2004
+ http://www.apache.org/licenses/
+
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+ 1. Definitions.
+
+ "License" shall mean the terms and conditions for use, reproduction,
+ and distribution as defined by Sections 1 through 9 of this document.
+
+ "Licensor" shall mean the copyright owner or entity authorized by
+ the copyright owner that is granting the License.
+
+ "Legal Entity" shall mean the union of the acting entity and all
+ other entities that control, are controlled by, or are under common
+ control with that entity. For the purposes of this definition,
+ "control" means (i) the power, direct or indirect, to cause the
+ direction or management of such entity, whether by contract or
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
+ outstanding shares, or (iii) beneficial ownership of such entity.
+
+ "You" (or "Your") shall mean an individual or Legal Entity
+ exercising permissions granted by this License.
+
+ "Source" form shall mean the preferred form for making modifications,
+ including but not limited to software source code, documentation
+ source, and configuration files.
+
+ "Object" form shall mean any form resulting from mechanical
+ transformation or translation of a Source form, including but
+ not limited to compiled object code, generated documentation,
+ and conversions to other media types.
+
+ "Work" shall mean the work of authorship, whether in Source or
+ Object form, made available under the License, as indicated by a
+ copyright notice that is included in or attached to the work
+ (an example is provided in the Appendix below).
+
+ "Derivative Works" shall mean any work, whether in Source or Object
+ form, that is based on (or derived from) the Work and for which the
+ editorial revisions, annotations, elaborations, or other modifications
+ represent, as a whole, an original work of authorship. For the purposes
+ of this License, Derivative Works shall not include works that remain
+ separable from, or merely link (or bind by name) to the interfaces of,
+ the Work and Derivative Works thereof.
+
+ "Contribution" shall mean any work of authorship, including
+ the original version of the Work and any modifications or additions
+ to that Work or Derivative Works thereof, that is intentionally
+ submitted to Licensor for inclusion in the Work by the copyright owner
+ or by an individual or Legal Entity authorized to submit on behalf of
+ the copyright owner. For the purposes of this definition, "submitted"
+ means any form of electronic, verbal, or written communication sent
+ to the Licensor or its representatives, including but not limited to
+ communication on electronic mailing lists, source code control systems,
+ and issue tracking systems that are managed by, or on behalf of, the
+ Licensor for the purpose of discussing and improving the Work, but
+ excluding communication that is conspicuously marked or otherwise
+ designated in writing by the copyright owner as "Not a Contribution."
+
+ "Contributor" shall mean Licensor and any individual or Legal Entity
+ on behalf of whom a Contribution has been received by Licensor and
+ subsequently incorporated within the Work.
+
+ 2. Grant of Copyright License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ copyright license to reproduce, prepare Derivative Works of,
+ publicly display, publicly perform, sublicense, and distribute the
+ Work and such Derivative Works in Source or Object form.
+
+ 3. Grant of Patent License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ (except as stated in this section) patent license to make, have made,
+ use, offer to sell, sell, import, and otherwise transfer the Work,
+ where such license applies only to those patent claims licensable
+ by such Contributor that are necessarily infringed by their
+ Contribution(s) alone or by combination of their Contribution(s)
+ with the Work to which such Contribution(s) was submitted. If You
+ institute patent litigation against any entity (including a
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
+ or a Contribution incorporated within the Work constitutes direct
+ or contributory patent infringement, then any patent licenses
+ granted to You under this License for that Work shall terminate
+ as of the date such litigation is filed.
+
+ 4. Redistribution. You may reproduce and distribute copies of the
+ Work or Derivative Works thereof in any medium, with or without
+ modifications, and in Source or Object form, provided that You
+ meet the following conditions:
+
+ (a) You must give any other recipients of the Work or
+ Derivative Works a copy of this License; and
+
+ (b) You must cause any modified files to carry prominent notices
+ stating that You changed the files; and
+
+ (c) You must retain, in the Source form of any Derivative Works
+ that You distribute, all copyright, patent, trademark, and
+ attribution notices from the Source form of the Work,
+ excluding those notices that do not pertain to any part of
+ the Derivative Works; and
+
+ (d) If the Work includes a "NOTICE" text file as part of its
+ distribution, then any Derivative Works that You distribute must
+ include a readable copy of the attribution notices contained
+ within such NOTICE file, excluding those notices that do not
+ pertain to any part of the Derivative Works, in at least one
+ of the following places: within a NOTICE text file distributed
+ as part of the Derivative Works; within the Source form or
+ documentation, if provided along with the Derivative Works; or,
+ within a display generated by the Derivative Works, if and
+ wherever such third-party notices normally appear. The contents
+ of the NOTICE file are for informational purposes only and
+ do not modify the License. You may add Your own attribution
+ notices within Derivative Works that You distribute, alongside
+ or as an addendum to the NOTICE text from the Work, provided
+ that such additional attribution notices cannot be construed
+ as modifying the License.
+
+ You may add Your own copyright statement to Your modifications and
+ may provide additional or different license terms and conditions
+ for use, reproduction, or distribution of Your modifications, or
+ for any such Derivative Works as a whole, provided Your use,
+ reproduction, and distribution of the Work otherwise complies with
+ the conditions stated in this License.
+
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
+ any Contribution intentionally submitted for inclusion in the Work
+ by You to the Licensor shall be under the terms and conditions of
+ this License, without any additional terms or conditions.
+ Notwithstanding the above, nothing herein shall supersede or modify
+ the terms of any separate license agreement you may have executed
+ with Licensor regarding such Contributions.
+
+ 6. Trademarks. This License does not grant permission to use the trade
+ names, trademarks, service marks, or product names of the Licensor,
+ except as required for reasonable and customary use in describing the
+ origin of the Work and reproducing the content of the NOTICE file.
+
+ 7. Disclaimer of Warranty. Unless required by applicable law or
+ agreed to in writing, Licensor provides the Work (and each
+ Contributor provides its Contributions) on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ implied, including, without limitation, any warranties or conditions
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+ PARTICULAR PURPOSE. You are solely responsible for determining the
+ appropriateness of using or redistributing the Work and assume any
+ risks associated with Your exercise of permissions under this License.
+
+ 8. Limitation of Liability. In no event and under no legal theory,
+ whether in tort (including negligence), contract, or otherwise,
+ unless required by applicable law (such as deliberate and grossly
+ negligent acts) or agreed to in writing, shall any Contributor be
+ liable to You for damages, including any direct, indirect, special,
+ incidental, or consequential damages of any character arising as a
+ result of this License or out of the use or inability to use the
+ Work (including but not limited to damages for loss of goodwill,
+ work stoppage, computer failure or malfunction, or any and all
+ other commercial damages or losses), even if such Contributor
+ has been advised of the possibility of such damages.
+
+ 9. Accepting Warranty or Additional Liability. While redistributing
+ the Work or Derivative Works thereof, You may choose to offer,
+ and charge a fee for, acceptance of support, warranty, indemnity,
+ or other liability obligations and/or rights consistent with this
+ License. However, in accepting such obligations, You may act only
+ on Your own behalf and on Your sole responsibility, not on behalf
+ of any other Contributor, and only if You agree to indemnify,
+ defend, and hold each Contributor harmless for any liability
+ incurred by, or claims asserted against, such Contributor by reason
+ of your accepting any such warranty or additional liability.
+
+ END OF TERMS AND CONDITIONS
+
+
+ Copyright 2012-2019 Agner Fog.
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
diff --git a/Bwdif/VCL2/instrset.h b/Bwdif/VCL2/instrset.h
new file mode 100644
index 0000000..34e34b7
--- /dev/null
+++ b/Bwdif/VCL2/instrset.h
@@ -0,0 +1,1485 @@
+/**************************** instrset.h **********************************
+* Author: Agner Fog
+* Date created: 2012-05-30
+* Last modified: 2020-06-08
+* Version: 2.01.03
+* Project: vector class library
+* Description:
+* Header file for various compiler-specific tasks as well as common
+* macros and templates. This file contains:
+*
+* > Selection of the supported instruction set
+* > Defines compiler version macros
+* > Undefines certain macros that prevent function overloading
+* > Helper functions that depend on instruction set, compiler, or platform
+* > Common templates for permute, blend, etc.
+*
+* For instructions, see vcl_manual.pdf
+*
+* (c) Copyright 2012-2020 Agner Fog.
+* Apache License version 2.0 or later.
+******************************************************************************/
+
+#ifndef INSTRSET_H
+#define INSTRSET_H 20103
+
+
+// Allow the use of floating point permute instructions on integer vectors.
+// Some CPUs have an extra latency of 1 or 2 clock cycles for this, but
+// it may still be faster than alternative implementations:
+#define ALLOW_FP_PERMUTE true
+
+
+// Macro to indicate 64 bit mode
+#if (defined(_M_AMD64) || defined(_M_X64) || defined(__amd64) ) && ! defined(__x86_64__)
+#define __x86_64__ 1 // There are many different macros for this, decide on only one
+#endif
+
+// The following values of INSTRSET are currently defined:
+// 2: SSE2
+// 3: SSE3
+// 4: SSSE3
+// 5: SSE4.1
+// 6: SSE4.2
+// 7: AVX
+// 8: AVX2
+// 9: AVX512F
+// 10: AVX512BW/DQ/VL
+// In the future, INSTRSET = 11 may include AVX512VBMI and AVX512VBMI2, but this
+// decision cannot be made before the market situation for CPUs with these
+// instruction sets is known (these future instruction set extensions are already
+// used in some VCL functions and tested with an emulator)
+
+// Find instruction set from compiler macros if INSTRSET is not defined.
+// Note: Most of these macros are not defined in Microsoft compilers
+#ifndef INSTRSET
+#if defined ( __AVX512VL__ ) && defined ( __AVX512BW__ ) && defined ( __AVX512DQ__ )
+#define INSTRSET 10
+#elif defined ( __AVX512F__ ) || defined ( __AVX512__ )
+#define INSTRSET 9
+#elif defined ( __AVX2__ )
+#define INSTRSET 8
+#elif defined ( __AVX__ )
+#define INSTRSET 7
+#elif defined ( __SSE4_2__ )
+#define INSTRSET 6
+#elif defined ( __SSE4_1__ )
+#define INSTRSET 5
+#elif defined ( __SSSE3__ )
+#define INSTRSET 4
+#elif defined ( __SSE3__ )
+#define INSTRSET 3
+#elif defined ( __SSE2__ ) || defined ( __x86_64__ )
+#define INSTRSET 2
+#elif defined ( __SSE__ )
+#define INSTRSET 1
+#elif defined ( _M_IX86_FP ) // Defined in MS compiler. 1: SSE, 2: SSE2
+#define INSTRSET _M_IX86_FP
+#else
+#define INSTRSET 0
+#endif // instruction set defines
+#endif // INSTRSET
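+
+// Example: compiling with "g++ -mavx2" defines __AVX2__ and therefore selects INSTRSET 8, while
+// "-mavx512f -mavx512bw -mavx512dq -mavx512vl" selects INSTRSET 10. INSTRSET may also be set
+// explicitly on the command line, e.g. -DINSTRSET=2 to force the SSE2 code path.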
+
+// Include the appropriate header file for intrinsic functions
+#if INSTRSET > 7 // AVX2 and later
+#if defined (__GNUC__) && ! defined (__INTEL_COMPILER)
+#include <x86intrin.h> // x86intrin.h includes header files for whatever instruction
+ // sets are specified on the compiler command line, such as:
+ // xopintrin.h, fma4intrin.h
+#else
+#include <immintrin.h> // MS/Intel version of immintrin.h covers AVX and later
+#endif // __GNUC__
+#elif INSTRSET == 7
+#include <immintrin.h> // AVX
+#elif INSTRSET == 6
+#include <nmmintrin.h> // SSE4.2
+#elif INSTRSET == 5
+#include <smmintrin.h> // SSE4.1
+#elif INSTRSET == 4
+#include <tmmintrin.h> // SSSE3
+#elif INSTRSET == 3
+#include <pmmintrin.h> // SSE3
+#elif INSTRSET == 2
+#include <emmintrin.h> // SSE2
+#elif INSTRSET == 1
+#include <xmmintrin.h> // SSE
+#endif // INSTRSET
+
+#if INSTRSET >= 8 && !defined(__FMA__)
+// Assume that all processors that have AVX2 also have FMA3
+#if defined (__GNUC__) && ! defined (__INTEL_COMPILER)
+// Prevent error message in g++ and Clang when using FMA intrinsics with avx2:
+#if !defined(DISABLE_WARNING_AVX2_WITHOUT_FMA)
+#pragma message "It is recommended to specify also option -mfma when using -mavx2 or higher"
+#endif
+#elif ! defined (__clang__)
+#define __FMA__ 1
+#endif
+#endif
+
+// AMD instruction sets
+#if defined (__XOP__) || defined (__FMA4__)
+#ifdef __GNUC__
+#include <x86intrin.h> // AMD XOP (Gnu)
+#else
+#include <ammintrin.h> // AMD XOP (Microsoft)
+#endif // __GNUC__
+#elif defined (__SSE4A__) // AMD SSE4A
+#include <ammintrin.h>
+#endif // __XOP__
+
+// FMA3 instruction set
+#if defined (__FMA__) && (defined(__GNUC__) || defined(__clang__)) && ! defined (__INTEL_COMPILER)
+#include <fmaintrin.h>
+#endif // __FMA__
+
+// FMA4 instruction set
+#if defined (__FMA4__) && (defined(__GNUC__) || defined(__clang__))
+#include <fma4intrin.h> // must have both x86intrin.h and fma4intrin.h, don't know why
+#endif // __FMA4__
+
+
+#include <stdint.h> // Define integer types with known size
+#include <stdlib.h> // define abs(int)
+
+#ifdef _MSC_VER // Microsoft compiler or compatible Intel compiler
+#include <intrin.h> // define _BitScanReverse(int), __cpuid(int[4],int), _xgetbv(int)
+#endif // _MSC_VER
+
+
+// functions in instrset_detect.cpp:
+#ifdef VCL_NAMESPACE
+namespace VCL_NAMESPACE {
+#endif
+ int instrset_detect(void); // tells which instruction sets are supported
+ bool hasFMA3(void); // true if FMA3 instructions supported
+ bool hasFMA4(void); // true if FMA4 instructions supported
+ bool hasXOP(void); // true if XOP instructions supported
+ bool hasAVX512ER(void); // true if AVX512ER instructions supported
+ bool hasAVX512VBMI(void); // true if AVX512VBMI instructions supported
+ bool hasAVX512VBMI2(void); // true if AVX512VBMI2 instructions supported
+#ifdef VCL_NAMESPACE
+}
+#endif
+
+// functions in physical_processors.cpp:
+int physicalProcessors(int * logical_processors = 0);
+
+
+// GCC version
+#if defined(__GNUC__) && !defined (GCC_VERSION) && !defined (__clang__)
+#define GCC_VERSION ((__GNUC__) * 10000 + (__GNUC_MINOR__) * 100 + (__GNUC_PATCHLEVEL__))
+#endif
+
+// Clang version
+#if defined (__clang__)
+#define CLANG_VERSION ((__clang_major__) * 10000 + (__clang_minor__) * 100 + (__clang_patchlevel__))
+// Problem: The version number is not consistent across platforms
+// http://llvm.org/bugs/show_bug.cgi?id=12643
+// Apple bug 18746972
+#endif
+
+// Fix problem with non-overloadable macros named min and max in WinDef.h
+#ifdef _MSC_VER
+#if defined (_WINDEF_) && defined(min) && defined(max)
+#undef min
+#undef max
+#endif
+#ifndef NOMINMAX
+#define NOMINMAX
+#endif
+
+// warning for poor support for AVX512F in MS compiler
+#ifndef __INTEL_COMPILER
+#if INSTRSET == 9
+#pragma message("Warning: MS compiler cannot generate code for AVX512F without AVX512DQ")
+#endif
+#if _MSC_VER < 1920 && INSTRSET > 8
+#pragma message("Warning: Your compiler has poor support for AVX512. Code may be erroneous.\nPlease use a newer compiler version or a different compiler!")
+#endif
+#endif // __INTEL_COMPILER
+#endif // _MSC_VER
+
+/* Intel compiler problem:
+The Intel compiler currently cannot compile version 2.00 of VCL. It seems to have
+a problem with constexpr function returns not being constant enough.
+*/
+#if defined(__INTEL_COMPILER) && __INTEL_COMPILER < 9999
+#error The Intel compiler version 19.00 cannot compile VCL version 2. Use Version 1.xx of VCL instead
+#endif
+
+/* Clang problem:
+The Clang compiler treats the intrinsic vector types __m128, __m128i, and __m128d as identical.
+See the bug report at https://bugs.llvm.org/show_bug.cgi?id=17164
+Additional problem: The version number is not consistent across platforms. The Apple build has
+different version numbers. We have to rely on __apple_build_version__ on the Mac platform:
+http://llvm.org/bugs/show_bug.cgi?id=12643
+We have to make switches here when - hopefully - the error some day has been fixed.
+We need different version checks with and without __apple_build_version__
+*/
+#if (defined (__clang__) || defined(__apple_build_version__)) && !defined(__INTEL_COMPILER)
+#define FIX_CLANG_VECTOR_ALIAS_AMBIGUITY
+#endif
+
+#if defined (GCC_VERSION) && GCC_VERSION < 99999 && !defined(__clang__)
+#define ZEXT_MISSING // Gcc 7.4.0 does not have _mm256_zextsi128_si256 and similar functions
+#endif
+
+
+#ifdef VCL_NAMESPACE
+namespace VCL_NAMESPACE {
+#endif
+
+// Constant for indicating don't care in permute and blend functions.
+// V_DC is -256 in Vector class library version 1.xx
+// V_DC can be any value less than -1 in Vector class library version 2.00
+constexpr int V_DC = -256;
+
+
+/*****************************************************************************
+*
+* Helper functions that depend on instruction set, compiler, or platform
+*
+*****************************************************************************/
+
+// Define interface to cpuid instruction.
+// input: functionnumber = leaf (eax), ecxleaf = subleaf(ecx)
+// output: output[0] = eax, output[1] = ebx, output[2] = ecx, output[3] = edx
+static inline void cpuid(int output[4], int functionnumber, int ecxleaf = 0) {
+#if defined(__GNUC__) || defined(__clang__) // use inline assembly, Gnu/AT&T syntax
+ int a, b, c, d;
+ __asm("cpuid" : "=a"(a), "=b"(b), "=c"(c), "=d"(d) : "a"(functionnumber), "c"(ecxleaf) : );
+ output[0] = a;
+ output[1] = b;
+ output[2] = c;
+ output[3] = d;
+
+#elif defined (_MSC_VER) // Microsoft compiler, intrin.h included
+ __cpuidex(output, functionnumber, ecxleaf); // intrinsic function for CPUID
+
+#else // unknown platform. try inline assembly with masm/intel syntax
+ __asm {
+ mov eax, functionnumber
+ mov ecx, ecxleaf
+ cpuid;
+ mov esi, output
+ mov[esi], eax
+ mov[esi + 4], ebx
+ mov[esi + 8], ecx
+ mov[esi + 12], edx
+ }
+#endif
+}
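+
+// Usage example: leaf 0 returns the highest supported standard leaf in output[0] and the
+// 12-byte vendor string in output[1], output[3], output[2] (e.g. "GenuineIntel"):
+//     int abcd[4];
+//     cpuid(abcd, 0);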
+
+
+// Define popcount function. Gives sum of bits
+#if INSTRSET >= 6 // SSE4.2
+// popcnt instruction is not officially part of the SSE4.2 instruction set,
+// but available in all known processors with SSE4.2
+static inline uint32_t vml_popcnt(uint32_t a) {
+ return (uint32_t)_mm_popcnt_u32(a); // Intel intrinsic. Supported by gcc and clang
+}
+#ifdef __x86_64__
+static inline int64_t vml_popcnt(uint64_t a) {
+ return _mm_popcnt_u64(a); // Intel intrinsic.
+}
+#else // 32 bit mode
+static inline int64_t vml_popcnt(uint64_t a) {
+ return _mm_popcnt_u32(uint32_t(a >> 32)) + _mm_popcnt_u32(uint32_t(a));
+}
+#endif
+#else // no SSE4.2
+static inline uint32_t vml_popcnt(uint32_t a) {
+ // popcnt instruction not available
+ uint32_t b = a - ((a >> 1) & 0x55555555);
+ uint32_t c = (b & 0x33333333) + ((b >> 2) & 0x33333333);
+ uint32_t d = (c + (c >> 4)) & 0x0F0F0F0F;
+ uint32_t e = d * 0x01010101;
+ return e >> 24;
+}
+
+static inline int32_t vml_popcnt(uint64_t a) {
+ return vml_popcnt(uint32_t(a >> 32)) + vml_popcnt(uint32_t(a));
+}
+
+#endif
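+
+// Example: vml_popcnt(0xF0F0u) == 8, vml_popcnt(uint64_t(1) << 63) == 1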
+
+// Define bit-scan-forward function. Gives index to lowest set bit
+#if defined (__GNUC__) || defined(__clang__)
+ // gcc and Clang have no bit_scan_forward intrinsic
+#if defined(__clang__) // fix clang bug
+ // Clang uses a k register as parameter a when inlined from horizontal_find_first
+__attribute__((noinline))
+#endif
+static uint32_t bit_scan_forward(uint32_t a) {
+ uint32_t r;
+ __asm("bsfl %1, %0" : "=r"(r) : "r"(a) : );
+ return r;
+}
+static inline uint32_t bit_scan_forward(uint64_t a) {
+ uint32_t lo = uint32_t(a);
+ if (lo) return bit_scan_forward(lo);
+ uint32_t hi = uint32_t(a >> 32);
+ return bit_scan_forward(hi) + 32;
+}
+
+#else // other compilers
+static inline uint32_t bit_scan_forward(uint32_t a) {
+ unsigned long r;
+ _BitScanForward(&r, a); // defined in intrin.h for MS and Intel compilers
+ return r;
+}
+#ifdef __x86_64__
+static inline uint32_t bit_scan_forward(uint64_t a) {
+ unsigned long r;
+ _BitScanForward64(&r, a); // defined in intrin.h for MS and Intel compilers
+ return (uint32_t)r;
+}
+#else
+static inline uint32_t bit_scan_forward(uint64_t a) {
+ uint32_t lo = uint32_t(a);
+ if (lo) return bit_scan_forward(lo);
+ uint32_t hi = uint32_t(a >> 32);
+ return bit_scan_forward(hi) + 32;
+}
+#endif
+#endif
+
+
+// Define bit-scan-reverse function. Gives index to highest set bit = floor(log2(a))
+#if defined (__GNUC__) || defined(__clang__)
+static inline uint32_t bit_scan_reverse(uint32_t a) __attribute__((pure));
+static inline uint32_t bit_scan_reverse(uint32_t a) {
+ uint32_t r;
+ __asm("bsrl %1, %0" : "=r"(r) : "r"(a) : );
+ return r;
+}
+#ifdef __x86_64__
+static inline uint32_t bit_scan_reverse(uint64_t a) {
+ uint64_t r;
+ __asm("bsrq %1, %0" : "=r"(r) : "r"(a) : );
+ return r;
+}
+#else // 32 bit mode
+static inline uint32_t bit_scan_reverse(uint64_t a) {
+ uint64_t ahi = a >> 32;
+ if (ahi == 0) return bit_scan_reverse(uint32_t(a));
+ else return bit_scan_reverse(uint32_t(ahi)) + 32;
+}
+#endif
+#else
+static inline uint32_t bit_scan_reverse(uint32_t a) {
+ unsigned long r;
+ _BitScanReverse(&r, a); // defined in intrin.h for MS and Intel compilers
+ return r;
+}
+#ifdef __x86_64__
+static inline uint32_t bit_scan_reverse(uint64_t a) {
+ unsigned long r;
+ _BitScanReverse64(&r, a); // defined in intrin.h for MS and Intel compilers
+ return r;
+}
+#else // 32 bit mode
+static inline uint32_t bit_scan_reverse(uint64_t a) {
+ uint64_t ahi = a >> 32;
+ if (ahi == 0) return bit_scan_reverse(uint32_t(a));
+ else return bit_scan_reverse(uint32_t(ahi)) + 32;
+}
+#endif
+#endif
+
+// Same function, for compile-time constants
+constexpr int bit_scan_reverse_const(uint64_t const n) {
+ if (n == 0) return -1;
+ uint64_t a = n, b = 0, j = 64, k = 0;
+ do {
+ j >>= 1;
+ k = (uint64_t)1 << j;
+ if (a >= k) {
+ a >>= j;
+ b += j;
+ }
+ } while (j > 0);
+ return int(b);
+}
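+
+// Illustrative compile-time checks: the result is the index of the highest set bit,
+// i.e. floor(log2(n)), or -1 when n == 0
+static_assert(bit_scan_reverse_const(1) == 0 && bit_scan_reverse_const(0x40) == 6
+    && bit_scan_reverse_const(0) == -1, "bit_scan_reverse_const examples");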
+
+
+/*****************************************************************************
+*
+* Common templates
+*
+*****************************************************************************/
+
+// Template class to represent compile-time integer constant
+template <int32_t n> class Const_int_t {}; // represent compile-time signed integer constant
+template <uint32_t n> class Const_uint_t {}; // represent compile-time unsigned integer constant
+#define const_int(n) (Const_int_t<n>()) // n must be compile-time integer constant
+#define const_uint(n) (Const_uint_t<n>()) // n must be compile-time unsigned integer constant
+
+
+// template for producing quiet NAN
+template <typename VTYPE>
+static inline VTYPE nan_vec(uint32_t payload = 0x100) {
+ if constexpr ((VTYPE::elementtype() & 1) != 0) { // double
+ union {
+ uint64_t q;
+ double f;
+ } ud;
+ // n is left justified to avoid loss of NAN payload when converting to float
+ ud.q = 0x7FF8000000000000 | uint64_t(payload) << 29;
+ return VTYPE(ud.f);
+ }
+ // float will be converted to double if necessary
+ union {
+ uint32_t i;
+ float f;
+ } uf;
+ uf.i = 0x7FC00000 | (payload & 0x003FFFFF);
+ return VTYPE(uf.f);
+}
+
+
+// Test if a parameter is a compile-time constant
+/* Unfortunately, this works only for macro parameters, not for inline function parameters.
+ I hope that some solution will appear in the future, but for now it appears to be
+ impossible to check if a function parameter is a compile-time constant.
+ This would be useful in operator / and in function pow:
+ #if defined(__GNUC__) || defined (__clang__)
+ #define is_constant(a) __builtin_constant_p(a)
+ #else
+ #define is_constant(a) false
+ #endif
+*/
+
+
+/*****************************************************************************
+*
+* Helper functions for permute and blend functions
+*
+******************************************************************************
+Rules for constexpr functions:
+
+> All variable declarations must include initialization
+
+> Do not put variable declarations inside a for-clause, e.g. avoid: for (int i=0; ..
+ Instead, you have to declare the loop counter before the for-loop.
+
+> Do not make constexpr functions that return vector types. This requires type
+ punning with a union, which is not allowed in constexpr functions under C++17.
+ It may be possible under C++20
+
+*****************************************************************************/
+
+// Define type for Encapsulated array to use as return type:
+template <typename T, int N>
+struct EList {
+ T a[N];
+};
+
+
+// get_inttype: get an integer of a size that matches the element size
+// of vector class V with the value -1
+template <typename V>
+constexpr auto get_inttype() {
+ constexpr int elementsize = sizeof(V) / V::size(); // size of vector elements
+
+ if constexpr (elementsize >= 8) {
+ return -int64_t(1);
+ }
+ else if constexpr (elementsize >= 4) {
+ return int32_t(-1);
+ }
+ else if constexpr (elementsize >= 2) {
+ return int16_t(-1);
+ }
+ else {
+ return int8_t(-1);
+ }
+}
+
+
+// zero_mask: return a compact bit mask for zeroing using AVX512 mask.
+// Parameter a is a reference to a constexpr int array of permutation indexes
+template <int N>
+constexpr auto zero_mask(int const (&a)[N]) {
+ uint64_t mask = 0;
+ int i = 0;
+
+ for (i = 0; i < N; i++) {
+ if (a[i] >= 0) mask |= uint64_t(1) << i;
+ }
+ if constexpr (N <= 8 ) return uint8_t(mask);
+ else if constexpr (N <= 16) return uint16_t(mask);
+ else if constexpr (N <= 32) return uint32_t(mask);
+ else return mask;
+}
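+
+// Example: for the permutation {0, -1, 2, -1} (N = 4), elements 0 and 2 are kept and the rest
+// are zeroed, so zero_mask returns 0b0101 = 0x05 as a uint8_t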
+
+
+// zero_mask_broad: return a broad byte mask for zeroing.
+// Parameter a is a reference to a constexpr int array of permutation indexes
+template <typename V>
+constexpr auto zero_mask_broad(int const (&A)[V::size()]) {
+ constexpr int N = V::size(); // number of vector elements
+ typedef decltype(get_inttype<V>()) Etype; // element type
+ EList <Etype, N> u = {{0}}; // list for return
+ int i = 0;
+ for (i = 0; i < N; i++) {
+ u.a[i] = A[i] >= 0 ? get_inttype<V>() : 0;
+ }
+ return u; // return encapsulated array
+}
+
+
+// make_bit_mask: return a compact mask of bits from a list of N indexes:
+// B contains options indicating how to gather the mask
+// bit 0-7 in B indicates which bit in each index to collect
+// bit 8 = 0x100: set 1 in the lower half of the bit mask if the indicated bit is 1.
+// bit 8 = 0 : set 1 in the lower half of the bit mask if the indicated bit is 0.
+// bit 9 = 0x200: set 1 in the upper half of the bit mask if the indicated bit is 1.
+// bit 9 = 0 : set 1 in the upper half of the bit mask if the indicated bit is 0.
+// bit 10 = 0x400: set 1 in the bit mask if the corresponding index is -1 or V_DC
+// Parameter a is a reference to a constexpr int array of permutation indexes
+template <int N, int B>
+constexpr uint64_t make_bit_mask(int const (&a)[N]) {
+ uint64_t r = 0; // return value
+ uint8_t j = uint8_t(B & 0xFF); // index to selected bit
+ uint64_t s = 0; // bit number i in r
+ uint64_t f = 0; // 1 if bit not flipped
+ int i = 0;
+ for (i = 0; i < N; i++) {
+ int ix = a[i];
+ if (ix < 0) { // -1 or V_DC
+ s = (B >> 10) & 1;
+ }
+ else {
+ s = ((uint32_t)ix >> j) & 1; // extract selected bit
+ if (i < N/2) {
+ f = (B >> 8) & 1; // lower half
+ }
+ else {
+ f = (B >> 9) & 1; // upper half
+ }
+ s ^= f ^ 1; // flip bit if needed
+ }
+ r |= uint64_t(s) << i; // set bit in return value
+ }
+ return r;
+}
+
+
+// make_broad_mask: Convert a bit mask m to a broad mask
+// The return value will be a broad boolean mask with elementsize matching vector class V
+template <typename V>
+constexpr auto make_broad_mask(uint64_t const m) {
+ constexpr int N = V::size(); // number of vector elements
+ typedef decltype(get_inttype<V>()) Etype; // element type
+ EList <Etype, N> u = {{0}}; // list for returning
+ int i = 0;
+ for (i = 0; i < N; i++) {
+ u.a[i] = ((m >> i) & 1) != 0 ? get_inttype<V>() : 0;
+ }
+ return u; // return encapsulated array
+}
+
+
+// perm_mask_broad: return a mask for permutation by a vector register index.
+// Parameter A is a reference to a constexpr int array of permutation indexes
+template <typename V>
+constexpr auto perm_mask_broad(int const (&A)[V::size()]) {
+ constexpr int N = V::size(); // number of vector elements
+ typedef decltype(get_inttype<V>()) Etype; // vector element type
+ EList <Etype, N> u = {{0}}; // list for returning
+ int i = 0;
+ for (i = 0; i < N; i++) {
+ u.a[i] = Etype(A[i]);
+ }
+ return u; // return encapsulated array
+}
+
+
+// perm_flags: returns information about how a permute can be implemented.
+// The return value is composed of these flag bits:
+const int perm_zeroing = 1; // needs zeroing
+const int perm_perm = 2; // permutation needed
+const int perm_allzero = 4; // all is zero or don't care
+const int perm_largeblock = 8; // fits permute with a larger block size (e.g permute Vec2q instead of Vec4i)
+const int perm_addz = 0x10; // additional zeroing needed after permute with larger block size or shift
+const int perm_addz2 = 0x20; // additional zeroing needed after perm_zext, perm_compress, or perm_expand
+const int perm_cross_lane = 0x40; // permutation crossing 128-bit lanes
+const int perm_same_pattern = 0x80; // same permute pattern in all 128-bit lanes
+const int perm_punpckh = 0x100; // permutation pattern fits punpckh instruction
+const int perm_punpckl = 0x200; // permutation pattern fits punpckl instruction
+const int perm_rotate = 0x400; // permutation pattern fits rotation within lanes. 4 bit count returned in bit perm_rot_count
+const int perm_shright = 0x1000; // permutation pattern fits shift right within lanes. 4 bit count returned in bit perm_rot_count
+const int perm_shleft = 0x2000; // permutation pattern fits shift left within lanes. negative count returned in bit perm_rot_count
+const int perm_rotate_big = 0x4000; // permutation pattern fits rotation across lanes. 6 bit count returned in bit perm_rot_count
+const int perm_broadcast = 0x8000; // permutation pattern fits broadcast of a single element.
+const int perm_zext = 0x10000; // permutation pattern fits zero extension
+const int perm_compress = 0x20000; // permutation pattern fits vpcompress instruction
+const int perm_expand = 0x40000; // permutation pattern fits vpexpand instruction
+const int perm_outofrange = 0x10000000; // index out of range
+const int perm_rot_count = 32; // rotate or shift count is in bits perm_rot_count to perm_rot_count+3
+const int perm_ipattern = 40; // pattern for pshufd is in bit perm_ipattern to perm_ipattern + 7 if perm_same_pattern and elementsize >= 4
+
+template <typename V>
+constexpr uint64_t perm_flags(int const (&a)[V::size()]) {
+ // a is a reference to a constexpr array of permutation indexes
+ // V is a vector class
+ constexpr int N = V::size(); // number of elements
+ uint64_t r = perm_largeblock | perm_same_pattern | perm_allzero; // return value
+ uint32_t i = 0; // loop counter
+ int j = 0; // loop counter
+ int ix = 0; // index number i
+ const uint32_t nlanes = sizeof(V) / 16; // number of 128-bit lanes
+ const uint32_t lanesize = N / nlanes; // elements per lane
+ const uint32_t elementsize = sizeof(V) / N; // size of each vector element
+ uint32_t lane = 0; // current lane
+ uint32_t rot = 999; // rotate left count
+ int32_t broadc = 999; // index to broadcasted element
+ uint32_t patfail = 0; // remember certain patterns that do not fit
+ uint32_t addz2 = 0; // remember certain patterns need extra zeroing
+ int32_t compresslasti = -1; // last index in perm_compress fit
+ int32_t compresslastp = -1; // last position in perm_compress fit
+ int32_t expandlasti = -1; // last index in perm_expand fit
+ int32_t expandlastp = -1; // last position in perm_expand fit
+
+ int lanepattern[lanesize] = {0}; // pattern in each lane
+
+ for (i = 0; i < N; i++) { // loop through indexes
+ ix = a[i]; // current index
+ // meaning of ix: -1 = set to zero, V_DC = don't care, non-negative value = permute.
+ if (ix == -1) {
+ r |= perm_zeroing; // zeroing requested
+ }
+ else if (ix != V_DC && uint32_t(ix) >= N) {
+ r |= perm_outofrange; // index out of range
+ }
+ if (ix >= 0) {
+ r &= ~ perm_allzero; // not all zero
+ if (ix != (int)i) r |= perm_perm; // needs permutation
+ if (broadc == 999) broadc = ix; // remember broadcast index
+ else if (broadc != ix) broadc = 1000; // does not fit broadcast
+ }
+ // check if pattern fits a larger block size:
+ // even indexes must be even, odd indexes must fit the preceding even index + 1
+ if ((i & 1) == 0) { // even index
+ if (ix >= 0 && (ix & 1)) r &= ~perm_largeblock;// not even. does not fit larger block size
+ int iy = a[i + 1]; // next odd index
+ if (iy >= 0 && (iy & 1) == 0) r &= ~ perm_largeblock; // not odd. does not fit larger block size
+ if (ix >= 0 && iy >= 0 && iy != ix+1) r &= ~ perm_largeblock; // does not fit preceding index + 1
+ if (ix == -1 && iy >= 0) r |= perm_addz; // needs additional zeroing at current block size
+ if (iy == -1 && ix >= 0) r |= perm_addz; // needs additional zeroing at current block size
+ }
+ lane = i / lanesize; // current lane
+ if (lane == 0) { // first lane, or no pattern yet
+ lanepattern[i] = ix; // save pattern
+ }
+ // check if crossing lanes
+ if (ix >= 0) {
+ uint32_t lanei = (uint32_t)ix / lanesize; // source lane
+ if (lanei != lane) r |= perm_cross_lane; // crossing lane
+ }
+ // check if same pattern in all lanes
+ if (lane != 0 && ix >= 0) { // not first lane
+ int j1 = i - int(lane * lanesize); // index into lanepattern
+ int jx = ix - int(lane * lanesize); // pattern within lane
+ if (jx < 0 || jx >= (int)lanesize) r &= ~perm_same_pattern; // source is in another lane
+ if (lanepattern[j1] < 0) {
+ lanepattern[j1] = jx; // pattern not known from previous lane
+ }
+ else {
+ if (lanepattern[j1] != jx) r &= ~perm_same_pattern; // not same pattern
+ }
+ }
+ if (ix >= 0) {
+ // check if pattern fits zero extension (perm_zext)
+ if (uint32_t(ix*2) != i) {
+ patfail |= 1; // does not fit zero extension
+ }
+ // check if pattern fits compress (perm_compress)
+ if (ix > compresslasti && ix - compresslasti >= (int)i - compresslastp) {
+ if ((int)i - compresslastp > 1) addz2 |= 2;// perm_compress may need additional zeroing
+ compresslasti = ix; compresslastp = i;
+ }
+ else {
+ patfail |= 2; // does not fit perm_compress
+ }
+ // check if pattern fits expand (perm_expand)
+ if (ix > expandlasti && ix - expandlasti <= (int)i - expandlastp) {
+ if (ix - expandlasti > 1) addz2 |= 4; // perm_expand may need additional zeroing
+ expandlasti = ix; expandlastp = i;
+ }
+ else {
+ patfail |= 4; // does not fit perm_expand
+ }
+ }
+ else if (ix == -1) {
+ if ((i & 1) == 0) addz2 |= 1; // zero extension needs additional zeroing
+ }
+ }
+ if (!(r & perm_perm)) return r; // more checks are superfluous
+
+ if (!(r & perm_largeblock)) r &= ~ perm_addz; // remove irrelevant flag
+ if (r & perm_cross_lane) r &= ~ perm_same_pattern; // remove irrelevant flag
+ if ((patfail & 1) == 0) {
+ r |= perm_zext; // fits zero extension
+ if ((addz2 & 1) != 0) r |= perm_addz2;
+ }
+ else if ((patfail & 2) == 0) {
+ r |= perm_compress; // fits compression
+ if ((addz2 & 2) != 0) { // check if additional zeroing needed
+ for (j = 0; j < compresslastp; j++) {
+ if (a[j] == -1) r |= perm_addz2;
+ }
+ }
+ }
+ else if ((patfail & 4) == 0) {
+ r |= perm_expand; // fits expansion
+ if ((addz2 & 4) != 0) { // check if additional zeroing needed
+ for (j = 0; j < expandlastp; j++) {
+ if (a[j] == -1) r |= perm_addz2;
+ }
+ }
+ }
+
+ if (r & perm_same_pattern) {
+ // same pattern in all lanes. check if it fits specific patterns
+ bool fit = true;
+ // fit shift or rotate
+ for (i = 0; i < lanesize; i++) {
+ if (lanepattern[i] >= 0) {
+ uint32_t rot1 = uint32_t(lanepattern[i] + lanesize - i) % lanesize;
+ if (rot == 999) {
+ rot = rot1;
+ }
+ else { // check if fit
+ if (rot != rot1) fit = false;
+ }
+ }
+ }
+ rot &= lanesize-1; // prevent out of range values
+ if (fit) { // fits rotate, and possibly shift
+ uint64_t rot2 = (rot * elementsize) & 0xF; // rotate right count in bytes
+ r |= rot2 << perm_rot_count; // put shift/rotate count in output bit 16-19
+#if INSTRSET >= 4 // SSSE3
+ r |= perm_rotate; // allow palignr
+#endif
+ // fit shift left
+ fit = true;
+ for (i = 0; i < lanesize-rot; i++) { // check if first rot elements are zero or don't care
+ if (lanepattern[i] >= 0) fit = false;
+ }
+ if (fit) {
+ r |= perm_shleft;
+ for (; i < lanesize; i++) if (lanepattern[i] == -1) r |= perm_addz; // additional zeroing needed
+ }
+ // fit shift right
+ fit = true;
+ for (i = lanesize-(uint32_t)rot; i < lanesize; i++) { // check if last (lanesize-rot) elements are zero or don't care
+ if (lanepattern[i] >= 0) fit = false;
+ }
+ if (fit) {
+ r |= perm_shright;
+ for (i = 0; i < lanesize-rot; i++) {
+ if (lanepattern[i] == -1) r |= perm_addz; // additional zeroing needed
+ }
+ }
+ }
+ // fit punpckhi
+ fit = true;
+ uint32_t j2 = lanesize / 2;
+ for (i = 0; i < lanesize; i++) {
+ if (lanepattern[i] >= 0 && lanepattern[i] != (int)j2) fit = false;
+ if ((i & 1) != 0) j2++;
+ }
+ if (fit) r |= perm_punpckh;
+ // fit punpcklo
+ fit = true;
+ j2 = 0;
+ for (i = 0; i < lanesize; i++) {
+ if (lanepattern[i] >= 0 && lanepattern[i] != (int)j2) fit = false;
+ if ((i & 1) != 0) j2++;
+ }
+ if (fit) r |= perm_punpckl;
+ // fit pshufd
+ if (elementsize >= 4) {
+ uint64_t p = 0;
+ for (i = 0; i < lanesize; i++) {
+ if (lanesize == 4) {
+ p |= (lanepattern[i] & 3) << 2 * i;
+ }
+ else { // lanesize = 2
+ p |= ((lanepattern[i] & 1) * 10 + 4) << 4 * i;
+ }
+ }
+ r |= p << perm_ipattern;
+ }
+ }
+#if INSTRSET >= 7
+ else { // not same pattern in all lanes
+ if constexpr (nlanes > 1) { // Try if it fits big rotate
+ for (i = 0; i < N; i++) {
+ ix = a[i];
+ if (ix >= 0) {
+ uint32_t rot2 = (ix + N - i) % N; // rotate count
+ if (rot == 999) {
+ rot = rot2; // save rotate count
+ }
+ else if (rot != rot2) {
+ rot = 1000; break; // does not fit big rotate
+ }
+ }
+ }
+ if (rot < N) { // fits big rotate
+ r |= perm_rotate_big | (uint64_t)rot << perm_rot_count;
+ }
+ }
+ }
+#endif
+ if (broadc < 999 && (r & (perm_rotate|perm_shright|perm_shleft|perm_rotate_big)) == 0) {
+ r |= perm_broadcast | (uint64_t)broadc << perm_rot_count; // fits broadcast
+ }
+ return r;
+}
+
+
+// compress_mask: returns a bit mask to use for compression instruction.
+// It is presupposed that perm_flags indicates perm_compress.
+// Additional zeroing is needed if perm_flags indicates perm_addz2
+template <int N>
+constexpr uint64_t compress_mask(int const (&a)[N]) {
+ // a is a reference to a constexpr array of permutation indexes
+ int ix = 0, lasti = -1, lastp = -1;
+ uint64_t m = 0;
+ int i = 0; int j = 1; // loop counters
+ for (i = 0; i < N; i++) {
+ ix = a[i]; // permutation index
+ if (ix >= 0) {
+ m |= (uint64_t)1 << ix; // mask for compression source
+ for (j = 1; j < i - lastp; j++) {
+ m |= (uint64_t)1 << (lasti + j); // dummy filling source
+ }
+ lastp = i; lasti = ix;
+ }
+ }
+ return m;
+}
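+
+// Example: the permutation {1, 3, -1, -1} (N = 4) gathers source elements 1 and 3 into
+// positions 0 and 1, so compress_mask returns 0b1010 = 0x0A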
+
+// expand_mask: returns a bit mask to use for expansion instruction.
+// It is presupposed that perm_flags indicates perm_expand.
+// Additional zeroing is needed if perm_flags indicates perm_addz2
+template <int N>
+constexpr uint64_t expand_mask(int const (&a)[N]) {
+ // a is a reference to a constexpr array of permutation indexes
+ int ix = 0, lasti = -1, lastp = -1;
+ uint64_t m = 0;
+ int i = 0; int j = 1;
+ for (i = 0; i < N; i++) {
+ ix = a[i]; // permutation index
+ if (ix >= 0) {
+ m |= (uint64_t)1 << i; // mask for expansion destination
+ for (j = 1; j < ix - lasti; j++) {
+ m |= (uint64_t)1 << (lastp + j); // dummy filling destination
+ }
+ lastp = i; lasti = ix;
+ }
+ }
+ return m;
+}
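+
+// Example: the permutation {-1, 0, -1, 1} (N = 4) expands source elements 0 and 1 into
+// destination positions 1 and 3, so expand_mask returns 0b1010 = 0x0A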
+
+// perm16_flags: returns information about how to permute a vector of 16-bit integers
+// Note: It is presupposed that perm_flags reports perm_same_pattern
+// The return value is composed of these bits:
+// 1: data from low 64 bits to low 64 bits. pattern in bit 32-39
+// 2: data from high 64 bits to high 64 bits. pattern in bit 40-47
+// 4: data from high 64 bits to low 64 bits. pattern in bit 48-55
+// 8: data from low 64 bits to high 64 bits. pattern in bit 56-63
+template <typename V>
+constexpr uint64_t perm16_flags(int const (&a)[V::size()]) {
+ // a is a reference to a constexpr array of permutation indexes
+ // V is a vector class
+ constexpr int N = V::size(); // number of elements
+
+ uint64_t retval = 0; // return value
+ uint32_t pat[4] = {0,0,0,0}; // permute patterns
+ uint32_t i = 0; // loop counter
+ int ix = 0; // index number i
+ const uint32_t lanesize = 8; // elements per lane
+ uint32_t lane = 0; // current lane
+ int lanepattern[lanesize] = {0}; // pattern in each lane
+
+ for (i = 0; i < N; i++) {
+ ix = a[i];
+ lane = i / lanesize; // current lane
+ if (lane == 0) {
+ lanepattern[i] = ix; // save pattern
+ }
+ else if (ix >= 0) { // not first lane
+ uint32_t j = i - lane * lanesize; // index into lanepattern
+ int jx = ix - lane * lanesize; // pattern within lane
+ if (lanepattern[j] < 0) {
+ lanepattern[j] = jx; // pattern not known from previous lane
+ }
+ }
+ }
+ // four patterns: low2low, high2high, high2low, low2high
+ for (i = 0; i < 4; i++) {
+ // loop through low pattern
+ if (lanepattern[i] >= 0) {
+ if (lanepattern[i] < 4) { // low2low
+ retval |= 1;
+ pat[0] |= uint32_t(lanepattern[i] & 3) << (2 * i);
+ }
+ else { // high2low
+ retval |= 4;
+ pat[2] |= uint32_t(lanepattern[i] & 3) << (2 * i);
+ }
+ }
+ // loop through high pattern
+ if (lanepattern[i+4] >= 0) {
+ if (lanepattern[i+4] < 4) { // low2high
+ retval |= 8;
+ pat[3] |= uint32_t(lanepattern[i+4] & 3) << (2 * i);
+ }
+ else { // high2high
+ retval |= 2;
+ pat[1] |= uint32_t(lanepattern[i+4] & 3) << (2 * i);
+ }
+ }
+ }
+ // join return data
+ for (i = 0; i < 4; i++) {
+ retval |= (uint64_t)pat[i] << (32 + i*8);
+ }
+ return retval;
+}
+
+
+// pshufb_mask: return a broad byte mask for permutation within lanes
+// for use with the pshufb instruction (_mm..._shuffle_epi8).
+// The pshufb instruction provides fast permutation and zeroing,
+// allowing different patterns in each lane but no crossing of lane boundaries
+template <typename V, int oppos = 0>
+constexpr auto pshufb_mask(int const (&A)[V::size()]) {
+ // Parameter a is a reference to a constexpr array of permutation indexes
+ // V is a vector class
+ // oppos = 1 for data from the opposite 128-bit lane in 256-bit vectors
+ constexpr uint32_t N = V::size(); // number of vector elements
+ constexpr uint32_t elementsize = sizeof(V) / N; // size of each vector element
+ constexpr uint32_t nlanes = sizeof(V) / 16; // number of 128 bit lanes in vector
+ constexpr uint32_t elements_per_lane = N / nlanes; // number of vector elements per lane
+
+ EList <int8_t, sizeof(V)> u = {{0}}; // list for returning
+
+ uint32_t i = 0; // loop counters
+ uint32_t j = 0;
+ int m = 0;
+ int k = 0;
+ uint32_t lane = 0;
+
+ for (lane = 0; lane < nlanes; lane++) { // loop through lanes
+ for (i = 0; i < elements_per_lane; i++) { // loop through elements in lane
+ // permutation index for element within lane
+ int8_t p = -1;
+ int ix = A[m];
+ if (ix >= 0) {
+ ix ^= oppos * elements_per_lane; // flip bit if opposite lane
+ }
+ ix -= int(lane * elements_per_lane); // index relative to lane
+ if (ix >= 0 && ix < (int)elements_per_lane) { // index points to desired lane
+ p = ix * elementsize;
+ }
+ for (j = 0; j < elementsize; j++) { // loop through bytes in element
+ u.a[k++] = p < 0 ? -1 : p + j; // store byte permutation index
+ }
+ m++;
+ }
+ }
+ return u; // return encapsulated array
+}
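+
+// Example: for a Vec4i permutation {1, 0, 3, 2} (element size 4, one lane) the returned byte
+// indexes are {4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11}; negative element indexes produce -1
+// bytes, which pshufb turns into zero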
+
+
+// largeblock_perm: return indexes for replacing a permute or blend with
+// a certain block size by a permute or blend with the double block size.
+// Note: it is presupposed that perm_flags() indicates perm_largeblock
+// It is required that additional zeroing is added if perm_flags() indicates perm_addz
+template <int N>
+constexpr EList<int, N/2> largeblock_perm(int const (&a)[N]) {
+ // Parameter a is a reference to a constexpr array of permutation indexes
+ EList<int, N/2> list = {{0}}; // result indexes
+ int ix = 0; // even index
+ int iy = 0; // odd index
+ int iz = 0; // combined index
+ bool fit_addz = false; // additional zeroing needed at the lower block level
+ int i = 0; // loop counter
+
+ // check if additional zeroing is needed at current block size
+ for (i = 0; i < N; i += 2) {
+ ix = a[i]; // even index
+ iy = a[i+1]; // odd index
+ if ((ix == -1 && iy >= 0) || (iy == -1 && ix >= 0)) {
+ fit_addz = true;
+ }
+ }
+
+ // loop through indexes
+ for (i = 0; i < N; i += 2) {
+ ix = a[i]; // even index
+ iy = a[i+1]; // odd index
+ if (ix >= 0) {
+ iz = ix / 2; // half index
+ }
+ else if (iy >= 0) {
+ iz = iy / 2;
+ }
+ else {
+ iz = ix | iy; // -1 or V_DC. -1 takes precedence
+ if (fit_addz) iz = V_DC; // V_DC, because result will be zeroed later
+ }
+ list.a[i/2] = iz; // save to list
+ }
+ return list;
+}
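+
+// Example: with N = 4, the index list {0, 1, 4, 5} becomes {0, 2} at the doubled block size,
+// and {2, 3, -1, -1} becomes {1, -1}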
+
+
+// blend_flags: returns information about how a blend function can be implemented
+// The return value is composed of these flag bits:
+const int blend_zeroing = 1; // needs zeroing
+const int blend_allzero = 2; // all is zero or don't care
+const int blend_largeblock = 4; // fits blend with a larger block size (e.g permute Vec2q instead of Vec4i)
+const int blend_addz = 8; // additional zeroing needed after blend with larger block size or shift
+const int blend_a = 0x10; // has data from a
+const int blend_b = 0x20; // has data from b
+const int blend_perma = 0x40; // permutation of a needed
+const int blend_permb = 0x80; // permutation of b needed
+const int blend_cross_lane = 0x100; // permutation crossing 128-bit lanes
+const int blend_same_pattern = 0x200; // same permute/blend pattern in all 128-bit lanes
+const int blend_punpckhab = 0x1000; // pattern fits punpckh(a,b)
+const int blend_punpckhba = 0x2000; // pattern fits punpckh(b,a)
+const int blend_punpcklab = 0x4000; // pattern fits punpckl(a,b)
+const int blend_punpcklba = 0x8000; // pattern fits punpckl(b,a)
+const int blend_rotateab = 0x10000; // pattern fits palignr(a,b)
+const int blend_rotateba = 0x20000; // pattern fits palignr(b,a)
+const int blend_shufab = 0x40000; // pattern fits shufps/shufpd(a,b)
+const int blend_shufba = 0x80000; // pattern fits shufps/shufpd(b,a)
+const int blend_rotate_big = 0x100000; // pattern fits rotation across lanes. count returned in bits blend_rotpattern
+const int blend_outofrange= 0x10000000; // index out of range
+const int blend_shufpattern = 32; // pattern for shufps/shufpd is in bit blend_shufpattern to blend_shufpattern + 7
+const int blend_rotpattern = 40; // pattern for palignr is in bit blend_rotpattern to blend_rotpattern + 7
+
+template <typename V>
+constexpr uint64_t blend_flags(int const (&a)[V::size()]) {
+ // a is a reference to a constexpr array of permutation indexes
+ // V is a vector class
+ constexpr int N = V::size(); // number of elements
+ uint64_t r = blend_largeblock | blend_same_pattern | blend_allzero; // return value
+ uint32_t iu = 0; // loop counter
+ int32_t ii = 0; // loop counter
+ int ix = 0; // index number i
+ const uint32_t nlanes = sizeof(V) / 16; // number of 128-bit lanes
+ const uint32_t lanesize = N / nlanes; // elements per lane
+ uint32_t lane = 0; // current lane
+ uint32_t rot = 999; // rotate left count
+ int lanepattern[lanesize] = {0}; // pattern in each lane
+ if (lanesize == 2 && N <= 8) {
+ r |= blend_shufab | blend_shufba; // check if it fits shufpd
+ }
+
+ for (ii = 0; ii < N; ii++) { // loop through indexes
+ ix = a[ii]; // index
+ if (ix < 0) {
+ if (ix == -1) r |= blend_zeroing; // set to zero
+ else if (ix != V_DC) {
+ r = blend_outofrange; break; // illegal index
+ }
+ }
+ else { // ix >= 0
+ r &= ~ blend_allzero;
+ if (ix < N) {
+ r |= blend_a; // data from a
+ if (ix != ii) r |= blend_perma; // permutation of a
+ }
+ else if (ix < 2*N) {
+ r |= blend_b; // data from b
+ if (ix != ii + N) r |= blend_permb; // permutation of b
+ }
+ else {
+ r = blend_outofrange; break; // illegal index
+ }
+ }
+ // check if pattern fits a larger block size:
+ // even indexes must be even, odd indexes must fit the preceding even index + 1
+ if ((ii & 1) == 0) { // even index
+ if (ix >= 0 && (ix&1)) r &= ~blend_largeblock; // not even. does not fit larger block size
+ int iy = a[ii+1]; // next odd index
+ if (iy >= 0 && (iy & 1) == 0) r &= ~ blend_largeblock; // not odd. does not fit larger block size
+ if (ix >= 0 && iy >= 0 && iy != ix+1) r &= ~ blend_largeblock; // does not fit preceding index + 1
+ if (ix == -1 && iy >= 0) r |= blend_addz; // needs additional zeroing at current block size
+ if (iy == -1 && ix >= 0) r |= blend_addz; // needs additional zeroing at current block size
+ }
+ lane = (uint32_t)ii / lanesize; // current lane
+ if (lane == 0) { // first lane, or no pattern yet
+ lanepattern[ii] = ix; // save pattern
+ }
+ // check if crossing lanes
+ if (ix >= 0) {
+ uint32_t lanei = uint32_t(ix & ~N) / lanesize; // source lane
+ if (lanei != lane) {
+ r |= blend_cross_lane; // crossing lane
+ }
+ if (lanesize == 2) { // check if it fits shufpd
+ if (lanei != lane) r &= ~(blend_shufab | blend_shufba);
+ if ((((ix & N) != 0) ^ ii) & 1) r &= ~blend_shufab;
+ else r &= ~blend_shufba;
+ }
+ }
+ // check if same pattern in all lanes
+ if (lane != 0 && ix >= 0) { // not first lane
+ int j = ii - int(lane * lanesize); // index into lanepattern
+ int jx = ix - int(lane * lanesize); // pattern within lane
+ if (jx < 0 || (jx & ~N) >= (int)lanesize) r &= ~blend_same_pattern; // source is in another lane
+ if (lanepattern[j] < 0) {
+ lanepattern[j] = jx; // pattern not known from previous lane
+ }
+ else {
+ if (lanepattern[j] != jx) r &= ~blend_same_pattern; // not same pattern
+ }
+ }
+ }
+ if (!(r & blend_largeblock)) r &= ~ blend_addz; // remove irrelevant flag
+ if (r & blend_cross_lane) r &= ~ blend_same_pattern; // remove irrelevant flag
+ if (!(r & (blend_perma | blend_permb))) {
+ return r; // no permutation. more checks are superfluous
+ }
+ if (r & blend_same_pattern) {
+ // same pattern in all lanes. check if it fits unpack patterns
+ r |= blend_punpckhab | blend_punpckhba | blend_punpcklab | blend_punpcklba;
+ for (iu = 0; iu < lanesize; iu++) { // loop through lanepattern
+ ix = lanepattern[iu];
+ if (ix >= 0) {
+ if ((uint32_t)ix != iu / 2 + (iu & 1) * N) r &= ~ blend_punpcklab;
+ if ((uint32_t)ix != iu / 2 + ((iu & 1) ^ 1) * N) r &= ~ blend_punpcklba;
+ if ((uint32_t)ix != (iu + lanesize) / 2 + (iu & 1) * N) r &= ~ blend_punpckhab;
+ if ((uint32_t)ix != (iu + lanesize) / 2 + ((iu & 1) ^ 1) * N) r &= ~ blend_punpckhba;
+ }
+ }
+#if INSTRSET >= 4 // SSSE3. check if it fits palignr
+ for (iu = 0; iu < lanesize; iu++) {
+ ix = lanepattern[iu];
+ if (ix >= 0) {
+ uint32_t t = ix & ~N;
+ if (ix & N) t += lanesize;
+ uint32_t tb = (t + 2*lanesize - iu) % (lanesize * 2);
+ if (rot == 999) {
+ rot = tb;
+ }
+ else { // check if fit
+ if (rot != tb) rot = 1000;
+ }
+ }
+ }
+ if (rot < 999) { // fits palignr
+ if (rot < lanesize) {
+ r |= blend_rotateba;
+ }
+ else {
+ r |= blend_rotateab;
+ }
+ const uint32_t elementsize = sizeof(V) / N;
+ r |= uint64_t((rot & (lanesize - 1)) * elementsize) << blend_rotpattern;
+ }
+#endif
+ if (lanesize == 4) {
+ // check if it fits shufps
+ r |= blend_shufab | blend_shufba;
+ for (ii = 0; ii < 2; ii++) {
+ ix = lanepattern[ii];
+ if (ix >= 0) {
+ if (ix & N) r &= ~ blend_shufab;
+ else r &= ~ blend_shufba;
+ }
+ }
+ for (; ii < 4; ii++) {
+ ix = lanepattern[ii];
+ if (ix >= 0) {
+ if (ix & N) r &= ~ blend_shufba;
+ else r &= ~ blend_shufab;
+ }
+ }
+ if (r & (blend_shufab | blend_shufba)) { // fits shufps/shufpd
+ uint8_t shufpattern = 0; // get pattern
+ for (iu = 0; iu < lanesize; iu++) {
+ shufpattern |= (lanepattern[iu] & 3) << iu * 2;
+ }
+ r |= (uint64_t)shufpattern << blend_shufpattern; // return pattern
+ }
+ }
+ }
+ else if (nlanes > 1) { // not same pattern in all lanes
+ rot = 999; // check if it fits big rotate
+ for (ii = 0; ii < N; ii++) {
+ ix = a[ii];
+ if (ix >= 0) {
+ uint32_t rot2 = (ix + 2 * N - ii) % (2 * N);// rotate count
+ if (rot == 999) {
+ rot = rot2; // save rotate count
+ }
+ else if (rot != rot2) {
+ rot = 1000; break; // does not fit big rotate
+ }
+ }
+ }
+ if (rot < 2 * N) { // fits big rotate
+ r |= blend_rotate_big | (uint64_t)rot << blend_rotpattern;
+ }
+ }
+ if (lanesize == 2 && (r & (blend_shufab | blend_shufba))) { // fits shufpd. Get pattern
+ for (ii = 0; ii < N; ii++) {
+ r |= uint64_t(a[ii] & 1) << (blend_shufpattern + ii);
+ }
+ }
+ return r;
+}
+
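+// Illustrative use (the Vec4f class comes from vectorf128.h): with a four-element vector type
+// and the constexpr index pattern { 0, 5, 2, 7 }, blend_flags reports blend_a and blend_b but
+// neither blend_perma nor blend_permb, i.e. the pattern is a plain element-wise blend that needs
+// no permutation. Patterns that do fit shufps/shufpd additionally return the shuffle immediate
+// in the bits starting at blend_shufpattern.
+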
+// blend_perm_indexes: return an Indexlist for implementing a blend function as
+// two permutations. N = vector size.
+// dozero = 0: let unused elements be don't care. The two permutation results must be blended
+// dozero = 1: zero unused elements in each permutation. The two permutation results can be OR'ed
+// dozero = 2: indexes that are -1 or V_DC are preserved
+template <int N, int dozero>
+constexpr EList<int, 2*N> blend_perm_indexes(int const (&a)[N]) {
+ // a is a reference to a constexpr array of permutation indexes
+ EList<int, 2*N> list = {{0}}; // list to return
+ int u = dozero ? -1 : V_DC; // value to use for unused entries
+ int j = 0;
+
+ for (j = 0; j < N; j++) { // loop through indexes
+ int ix = a[j]; // current index
+ if (ix < 0) { // zero or don't care
+ if (dozero == 2) {
+ // list.a[j] = list.a[j + N] = ix; // fails in gcc in complicated cases
+ list.a[j] = ix;
+ list.a[j + N] = ix;
+ }
+ else {
+ // list.a[j] = list.a[j + N] = u;
+ list.a[j] = u;
+ list.a[j + N] = u;
+ }
+ }
+ else if (ix < N) { // value from a
+ list.a[j] = ix;
+ list.a[j+N] = u;
+ }
+ else {
+ list.a[j] = u; // value from b
+ list.a[j+N] = ix - N;
+ }
+ }
+ return list;
+}
+
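+// For example, applying blend_perm_indexes<4, 1> to the pattern { 0, 5, -1, 7 } yields
+// { 0, -1, -1, -1, -1, 1, -1, 3 }: the first four entries permute a (only element 0 is used),
+// the last four permute b (elements 1 and 3), and with dozero = 1 every unused position is -1,
+// so the two permutation results can simply be OR'ed together.
+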
+// largeblock_indexes: return indexes for replacing a permute or blend with a
+// certain block size by a permute or blend with the double block size.
+// Note: it is presupposed that perm_flags or blend_flags indicates _largeblock
+// It is required that additional zeroing is added if perm_flags or blend_flags
+// indicates _addz
+template <int N>
+constexpr EList<int, N/2> largeblock_indexes(int const (&a)[N]) {
+ // Parameter a is a reference to a constexpr array of N permutation indexes
+ EList<int, N/2> list = {{0}}; // list to return
+
+ bool fit_addz = false; // additional zeroing needed at the lower block level
+ int ix = 0; // even index
+ int iy = 0; // odd index
+ int iz = 0; // combined index
+ int i = 0; // loop counter
+
+ for (i = 0; i < N; i += 2) {
+ ix = a[i]; // even index
+ iy = a[i+1]; // odd index
+ if (ix >= 0) {
+ iz = ix / 2; // half index
+ }
+ else if (iy >= 0) {
+ iz = iy / 2; // half index
+ }
+ else iz = ix | iy; // -1 or V_DC. -1 takes precedence
+ list.a[i/2] = iz; // save to list
+ // check if additional zeroing is needed at current block size
+ if ((ix == -1 && iy >= 0) || (iy == -1 && ix >= 0)) {
+ fit_addz = true;
+ }
+ }
+ // replace -1 by V_DC if fit_addz
+ if (fit_addz) {
+ for (i = 0; i < N/2; i++) {
+ if (list.a[i] < 0) list.a[i] = V_DC;
+ }
+ }
+ return list;
+}
+
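+// Worked example: for the pattern { 2, 3, -1, 6 } the double-size indexes are { 1, 3 }.
+// Because element 2 is -1 while its partner element 3 is not, the caller must still zero
+// element 2 at the original block size afterwards (the _addz case mentioned above).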
+
+/****************************************************************************************
+*
+* Vector blend helper function templates
+*
+* These templates are for emulating a blend with a vector size that is not supported by
+* the instruction set, using multiple blends or permutations of half the vector size
+*
+****************************************************************************************/
+
+// Make dummy blend function templates to avoid error messages when the blend functions are not yet defined
+template <typename dummy> void blend2(){}
+template <typename dummy> void blend4(){}
+template <typename dummy> void blend8(){}
+template <typename dummy> void blend16(){}
+template <typename dummy> void blend32(){}
+
+// blend_half_indexes: return an Indexlist for emulating a blend function as
+// blends or permutations from multiple sources
+// dozero = 0: let unused elements be don't care. Multiple permutation results must be blended
+// dozero = 1: zero unused elements in each permutation. Multiple permutation results can be OR'ed
+// dozero = 2: indexes that are -1 or V_DC are preserved
+// src1, src2: sources to blend in a partial implementation
+template <int N, int dozero, int src1, int src2>
+constexpr EList<int, N> blend_half_indexes(int const (&a)[N]) {
+ // a is a reference to a constexpr array of permutation indexes
+ EList<int, N> list = {{0}}; // list to return
+ int u = dozero ? -1 : V_DC; // value to use for unused entries
+ int j = 0; // loop counter
+
+ for (j = 0; j < N; j++) { // loop through indexes
+ int ix = a[j]; // current index
+ if (ix < 0) { // zero or don't care
+ list.a[j] = (dozero == 2) ? ix : u;
+ }
+ else {
+ int src = ix / N; // source
+ if (src == src1) {
+ list.a[j] = ix & (N - 1);
+ }
+ else if (src == src2) {
+ list.a[j] = (ix & (N - 1)) + N;
+ }
+ else list.a[j] = u;
+ }
+ }
+ return list;
+}
+
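+// For example, with the pattern { 1, 6, 9, 12 } (N = 4, one index from each of the four
+// sources) and dozero = 1, blend_half_indexes<4, 1, 0, 1> returns { 1, 6, -1, -1 } and
+// blend_half_indexes<4, 1, 2, 3> returns { -1, -1, 1, 4 }; the two partial blends can then
+// be OR'ed because each zeroes the positions it does not cover.
+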
+// selectblend: select one of four sources for blending
+template <int s, typename W>
+static inline auto selectblend(W const a, W const b) {
+ if constexpr (s == 0) return a.get_low();
+ else if constexpr (s == 1) return a.get_high();
+ else if constexpr (s == 2) return b.get_low();
+ else return b.get_high();
+}
+
+// blend_half: Emulate a blend with a vector size that is not supported
+// by multiple blends with half the vector size.
+// blend_half is called twice, to give the low and high half of the result
+// Parameters: W: type of full-size vector
+// i0...: indexes for low or high half
+// a, b: full size input vectors
+// return value: half-size vector for lower or upper part
+template <typename W, int ... i0>
+auto blend_half(W const& a, W const& b) {
+ typedef decltype(a.get_low()) V; // type for half-size vector
+ constexpr int N = V::size(); // size of half-size vector
+ static_assert(sizeof...(i0) == N, "wrong number of indexes in blend_half");
+ constexpr int ind[N] = { i0... }; // array of indexes
+
+ // lambda to find which of the four possible sources are used
+ // return: EList containing a list of up to 4 sources. The last element is the number of sources used
+ auto listsources = [](int const n, int const (&ind)[N]) constexpr {
+ bool source_used[4] = { false,false,false,false }; // list of sources used
+ int i = 0;
+ for (i = 0; i < n; i++) {
+ int ix = ind[i]; // index
+ if (ix >= 0) {
+ int src = ix / n; // source used
+ source_used[src & 3] = true;
+ }
+ }
+ // return a list of sources used. The last element is the number of sources used
+ EList<int, 5> sources = {{0}};
+ int nsrc = 0; // number of sources
+ for (i = 0; i < 4; i++) {
+ if (source_used[i]) {
+ sources.a[nsrc++] = i;
+ }
+ }
+ sources.a[4] = nsrc;
+ return sources;
+ };
+ // list of sources used
+ constexpr EList<int, 5> sources = listsources(N, ind);
+ constexpr int nsrc = sources.a[4]; // number of sources used
+
+ if constexpr (nsrc == 0) { // no sources
+ return V(0);
+ }
+ // get indexes for the first one or two sources
+ constexpr int uindex = (nsrc > 2) ? 1 : 2; // unused elements set to zero if two blends are combined
+ constexpr EList<int, N> L = blend_half_indexes<N, uindex, sources.a[0], sources.a[1]>(ind);
+ V x0;
+ V src0 = selectblend<sources.a[0], W>(a, b); // first source
+ V src1 = selectblend<sources.a[1], W>(a, b); // second source
+ if constexpr (N == 2) {
+ x0 = blend2 <L.a[0], L.a[1]> (src0, src1);
+ }
+ else if constexpr (N == 4) {
+ x0 = blend4 <L.a[0], L.a[1], L.a[2], L.a[3]> (src0, src1);
+ }
+ else if constexpr (N == 8) {
+ x0 = blend8 <L.a[0], L.a[1], L.a[2], L.a[3], L.a[4], L.a[5], L.a[6], L.a[7]> (src0, src1);
+ }
+ else if constexpr (N == 16) {
+ x0 = blend16 <L.a[0], L.a[1], L.a[2], L.a[3], L.a[4], L.a[5], L.a[6], L.a[7], L.a[8], L.a[9], L.a[10], L.a[11], L.a[12], L.a[13], L.a[14], L.a[15]> (src0, src1);
+ }
+ else if constexpr (N == 32) {
+ x0 = blend32 <L.a[0], L.a[1], L.a[2], L.a[3], L.a[4], L.a[5], L.a[6], L.a[7], L.a[8], L.a[9], L.a[10], L.a[11], L.a[12], L.a[13], L.a[14], L.a[15], L.a[16], L.a[17], L.a[18], L.a[19], L.a[20], L.a[21], L.a[22], L.a[23], L.a[24], L.a[25], L.a[26], L.a[27], L.a[28], L.a[29], L.a[30], L.a[31]> (src0, src1);
+ }
+ if constexpr (nsrc > 2) { // get last one or two sources
+ constexpr EList<int, N> M = blend_half_indexes<N, uindex, sources.a[2], sources.a[3]>(ind);
+ V x1;
+ V src2 = selectblend<sources.a[2], W>(a, b); // third source
+ V src3 = selectblend<sources.a[3], W>(a, b); // fourth source
+ if constexpr (N == 2) {
+ x1 = blend2 <M.a[0], M.a[1]> (src2, src3);
+ }
+ else if constexpr (N == 4) {
+ x1 = blend4 <M.a[0], M.a[1], M.a[2], M.a[3]> (src2, src3);
+ }
+ else if constexpr (N == 8) {
+ x1 = blend8 <M.a[0], M.a[1], M.a[2], M.a[3], M.a[4], M.a[5], M.a[6], M.a[7]> (src2, src3);
+ }
+ else if constexpr (N == 16) {
+ x1 = blend16 <M.a[0], M.a[1], M.a[2], M.a[3], M.a[4], M.a[5], M.a[6], M.a[7], M.a[8], M.a[9], M.a[10], M.a[11], M.a[12], M.a[13], M.a[14], M.a[15]> (src2, src3);
+ }
+ else if constexpr (N == 32) {
+ x1 = blend32 <M.a[0], M.a[1], M.a[2], M.a[3], M.a[4], M.a[5], M.a[6], M.a[7], M.a[8], M.a[9], M.a[10], M.a[11], M.a[12], M.a[13], M.a[14], M.a[15], M.a[16], M.a[17], M.a[18], M.a[19], M.a[20], M.a[21], M.a[22], M.a[23], M.a[24], M.a[25], M.a[26], M.a[27], M.a[28], M.a[29], M.a[30], M.a[31]> (src2, src3);
+ }
+ x0 |= x1; // combine result of two blends. Unused elements are zero
+ }
+ return x0;
+}
+
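+// Typical use (sketch): an emulated 256-bit blend builds its result from two half-size calls,
+// roughly as
+// Vec8i blend8_emulated(Vec8i const a, Vec8i const b) {
+// return Vec8i(blend_half<Vec8i, i0, i1, i2, i3>(a, b), // low half of the result
+// blend_half<Vec8i, i4, i5, i6, i7>(a, b)); // high half of the result
+// }
+// where i0...i7 are the template indexes of the full-size blend. The function name above is
+// only illustrative of how the emulated vector classes call blend_half.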
+
+#ifdef VCL_NAMESPACE
+}
+#endif
+
+
+#endif // INSTRSET_H
diff --git a/Bwdif/VCL2/instrset_detect.cpp b/Bwdif/VCL2/instrset_detect.cpp
new file mode 100644
index 0000000..119661b
--- /dev/null
+++ b/Bwdif/VCL2/instrset_detect.cpp
@@ -0,0 +1,167 @@
+/************************** instrset_detect.cpp ****************************
+* Author: Agner Fog
+* Date created: 2012-05-30
+* Last modified: 2019-08-01
+* Version: 2.00.00
+* Project: vector class library
+* Description:
+* Functions for checking which instruction sets are supported.
+*
+* (c) Copyright 2012-2019 Agner Fog.
+* Apache License version 2.0 or later.
+******************************************************************************/
+
+#include "instrset.h"
+
+#ifdef VCL_NAMESPACE
+namespace VCL_NAMESPACE {
+#endif
+
+
+// Define interface to xgetbv instruction
+static inline uint64_t xgetbv (int ctr) {
+#if (defined (_MSC_FULL_VER) && _MSC_FULL_VER >= 160040000) || (defined (__INTEL_COMPILER) && __INTEL_COMPILER >= 1200)
+ // Microsoft or Intel compiler supporting _xgetbv intrinsic
+
+ return uint64_t(_xgetbv(ctr)); // intrinsic function for XGETBV
+
+#elif defined(__GNUC__) || defined (__clang__) // use inline assembly, Gnu/AT&T syntax
+
+ uint32_t a, d;
+ __asm("xgetbv" : "=a"(a),"=d"(d) : "c"(ctr) : );
+ return a | (uint64_t(d) << 32);
+
+#else // #elif defined (_WIN32) // other compiler. try inline assembly with masm/intel/MS syntax
+ uint32_t a, d;
+ __asm {
+ mov ecx, ctr
+ _emit 0x0f
+ _emit 0x01
+ _emit 0xd0 ; // xgetbv
+ mov a, eax
+ mov d, edx
+ }
+ return a | (uint64_t(d) << 32);
+
+#endif
+}
+
+/* find supported instruction set
+ return value:
+ 0 = 80386 instruction set
+ 1 or above = SSE (XMM) supported by CPU (not testing for OS support)
+ 2 or above = SSE2
+ 3 or above = SSE3
+ 4 or above = Supplementary SSE3 (SSSE3)
+ 5 or above = SSE4.1
+ 6 or above = SSE4.2
+ 7 or above = AVX supported by CPU and operating system
+ 8 or above = AVX2
+ 9 or above = AVX512F
+ 10 or above = AVX512VL, AVX512BW, AVX512DQ
+*/
+int instrset_detect(void) {
+
+ static int iset = -1; // remember value for next call
+ if (iset >= 0) {
+ return iset; // called before
+ }
+ iset = 0; // default value
+ int abcd[4] = {0,0,0,0}; // cpuid results
+ cpuid(abcd, 0); // call cpuid function 0
+ if (abcd[0] == 0) return iset; // no further cpuid function supported
+ cpuid(abcd, 1); // call cpuid function 1 for feature flags
+ if ((abcd[3] & (1 << 0)) == 0) return iset; // no floating point
+ if ((abcd[3] & (1 << 23)) == 0) return iset; // no MMX
+ if ((abcd[3] & (1 << 15)) == 0) return iset; // no conditional move
+ if ((abcd[3] & (1 << 24)) == 0) return iset; // no FXSAVE
+ if ((abcd[3] & (1 << 25)) == 0) return iset; // no SSE
+ iset = 1; // 1: SSE supported
+ if ((abcd[3] & (1 << 26)) == 0) return iset; // no SSE2
+ iset = 2; // 2: SSE2 supported
+ if ((abcd[2] & (1 << 0)) == 0) return iset; // no SSE3
+ iset = 3; // 3: SSE3 supported
+ if ((abcd[2] & (1 << 9)) == 0) return iset; // no SSSE3
+ iset = 4; // 4: SSSE3 supported
+ if ((abcd[2] & (1 << 19)) == 0) return iset; // no SSE4.1
+ iset = 5; // 5: SSE4.1 supported
+ if ((abcd[2] & (1 << 23)) == 0) return iset; // no POPCNT
+ if ((abcd[2] & (1 << 20)) == 0) return iset; // no SSE4.2
+ iset = 6; // 6: SSE4.2 supported
+ if ((abcd[2] & (1 << 27)) == 0) return iset; // no OSXSAVE
+ if ((xgetbv(0) & 6) != 6) return iset; // AVX not enabled in O.S.
+ if ((abcd[2] & (1 << 28)) == 0) return iset; // no AVX
+ iset = 7; // 7: AVX supported
+ cpuid(abcd, 7); // call cpuid leaf 7 for feature flags
+ if ((abcd[1] & (1 << 5)) == 0) return iset; // no AVX2
+ iset = 8;
+ if ((abcd[1] & (1 << 16)) == 0) return iset; // no AVX512
+ cpuid(abcd, 0xD); // call cpuid leaf 0xD for feature flags
+ if ((abcd[0] & 0x60) != 0x60) return iset; // no AVX512
+ iset = 9;
+ cpuid(abcd, 7); // call cpuid leaf 7 for feature flags
+ if ((abcd[1] & (1 << 31)) == 0) return iset; // no AVX512VL
+ if ((abcd[1] & 0x40020000) != 0x40020000) return iset; // no AVX512BW, AVX512DQ
+ iset = 10;
+ return iset;
+}
+
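+// Typical use (sketch): call instrset_detect() once and dispatch to the best available code
+// path. The init_* function names below are only illustrative:
+// const int level = instrset_detect();
+// if (level >= 10) init_avx512(); // AVX512F/VL/BW/DQ
+// else if (level >= 8) init_avx2();
+// else if (level >= 2) init_sse2();
+// else init_scalar();
+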
+// detect if CPU supports the FMA3 instruction set
+bool hasFMA3(void) {
+ if (instrset_detect() < 7) return false; // must have AVX
+ int abcd[4]; // cpuid results
+ cpuid(abcd, 1); // call cpuid function 1
+ return ((abcd[2] & (1 << 12)) != 0); // ecx bit 12 indicates FMA3
+}
+
+// detect if CPU supports the FMA4 instruction set
+bool hasFMA4(void) {
+ if (instrset_detect() < 7) return false; // must have AVX
+ int abcd[4]; // cpuid results
+ cpuid(abcd, 0x80000001); // call cpuid function 0x80000001
+ return ((abcd[2] & (1 << 16)) != 0); // ecx bit 16 indicates FMA4
+}
+
+// detect if CPU supports the XOP instruction set
+bool hasXOP(void) {
+ if (instrset_detect() < 7) return false; // must have AVX
+ int abcd[4]; // cpuid results
+ cpuid(abcd, 0x80000001); // call cpuid function 0x80000001
+ return ((abcd[2] & (1 << 11)) != 0); // ecx bit 11 indicates XOP
+}
+
+// detect if CPU supports the F16C instruction set
+bool hasF16C(void) {
+ if (instrset_detect() < 7) return false; // must have AVX
+ int abcd[4]; // cpuid results
+ cpuid(abcd, 1); // call cpuid function 1
+ return ((abcd[2] & (1 << 29)) != 0); // ecx bit 29 indicates F16C
+}
+
+// detect if CPU supports the AVX512ER instruction set
+bool hasAVX512ER(void) {
+ if (instrset_detect() < 9) return false; // must have AVX512F
+ int abcd[4]; // cpuid results
+ cpuid(abcd, 7); // call cpuid function 7
+ return ((abcd[1] & (1 << 27)) != 0); // ebx bit 27 indicates AVX512ER
+}
+
+// detect if CPU supports the AVX512VBMI instruction set
+bool hasAVX512VBMI(void) {
+ if (instrset_detect() < 10) return false; // must have AVX512BW
+ int abcd[4]; // cpuid results
+ cpuid(abcd, 7); // call cpuid function 7
+ return ((abcd[2] & (1 << 1)) != 0); // ecx bit 1 indicates AVX512VBMI
+}
+
+// detect if CPU supports the AVX512VBMI2 instruction set
+bool hasAVX512VBMI2(void) {
+ if (instrset_detect() < 10) return false; // must have AVX512BW
+ int abcd[4]; // cpuid results
+ cpuid(abcd, 7); // call cpuid function 7
+ return ((abcd[2] & (1 << 6)) != 0); // ecx bit 6 indicates AVX512VBMI2
+}
+
+#ifdef VCL_NAMESPACE
+}
+#endif
diff --git a/Bwdif/VCL2/vector_convert.h b/Bwdif/VCL2/vector_convert.h
new file mode 100644
index 0000000..a7aa2b6
--- /dev/null
+++ b/Bwdif/VCL2/vector_convert.h
@@ -0,0 +1,574 @@
+/************************** vector_convert.h *******************************
+* Author: Agner Fog
+* Date created: 2014-07-23
+* Last modified: 2019-11-17
+* Version: 2.01.00
+* Project: vector class library
+* Description:
+* Header file for conversion between different vector classes with different
+* sizes. Also includes various generic template functions.
+*
+* (c) Copyright 2012-2019 Agner Fog.
+* Apache License version 2.0 or later.
+*****************************************************************************/
+
+#ifndef VECTOR_CONVERT_H
+#define VECTOR_CONVERT_H
+
+#ifndef VECTORCLASS_H
+#include "vectorclass.h"
+#endif
+
+#if VECTORCLASS_H < 20100
+#error Incompatible versions of vector class library mixed
+#endif
+
+#ifdef VCL_NAMESPACE
+namespace VCL_NAMESPACE {
+#endif
+
+#if MAX_VECTOR_SIZE >= 256
+
+/*****************************************************************************
+*
+* Extend from 128 to 256 bit vectors
+*
+*****************************************************************************/
+
+#if INSTRSET >= 8 // AVX2. 256 bit integer vectors
+
+// sign extend
+static inline Vec16s extend (Vec16c const a) {
+ return _mm256_cvtepi8_epi16(a);
+}
+
+// zero extend
+static inline Vec16us extend (Vec16uc const a) {
+ return _mm256_cvtepu8_epi16(a);
+}
+
+// sign extend
+static inline Vec8i extend (Vec8s const a) {
+ return _mm256_cvtepi16_epi32(a);
+}
+
+// zero extend
+static inline Vec8ui extend (Vec8us const a) {
+ return _mm256_cvtepu16_epi32(a);
+}
+
+// sign extend
+static inline Vec4q extend (Vec4i const a) {
+ return _mm256_cvtepi32_epi64(a);
+}
+
+// zero extend
+static inline Vec4uq extend (Vec4ui const a) {
+ return _mm256_cvtepu32_epi64(a);
+}
+
+
+#else // no AVX2. 256 bit integer vectors are emulated
+
+// sign extend and zero extend functions:
+static inline Vec16s extend (Vec16c const a) {
+ return Vec16s(extend_low(a), extend_high(a));
+}
+
+static inline Vec16us extend (Vec16uc const a) {
+ return Vec16us(extend_low(a), extend_high(a));
+}
+
+static inline Vec8i extend (Vec8s const a) {
+ return Vec8i(extend_low(a), extend_high(a));
+}
+
+static inline Vec8ui extend (Vec8us const a) {
+ return Vec8ui(extend_low(a), extend_high(a));
+}
+
+static inline Vec4q extend (Vec4i const a) {
+ return Vec4q(extend_low(a), extend_high(a));
+}
+
+static inline Vec4uq extend (Vec4ui const a) {
+ return Vec4uq(extend_low(a), extend_high(a));
+}
+
+#endif // AVX2
+
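+// Note on the overloads above: the signed overloads sign extend while the unsigned overloads
+// zero extend, so the same bit pattern can give different results. For example, the byte 0xC8
+// extends to 0xFFC8 (-56) through Vec16c -> Vec16s, but to 0x00C8 (200) through
+// Vec16uc -> Vec16us; pick the overload matching how the bytes should be interpreted.
+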
+/*****************************************************************************
+*
+* Conversions between float and double
+*
+*****************************************************************************/
+#if INSTRSET >= 7 // AVX. 256 bit float vectors
+
+// float to double
+static inline Vec4d to_double (Vec4f const a) {
+ return _mm256_cvtps_pd(a);
+}
+
+// double to float
+static inline Vec4f to_float (Vec4d const a) {
+ return _mm256_cvtpd_ps(a);
+}
+
+#else // no AVX. 256 bit float vectors are emulated
+
+// float to double
+static inline Vec4d to_double (Vec4f const a) {
+ Vec2d lo = _mm_cvtps_pd(a);
+ Vec2d hi = _mm_cvtps_pd(_mm_movehl_ps(a, a));
+ return Vec4d(lo,hi);
+}
+
+// double to float
+static inline Vec4f to_float (Vec4d const a) {
+ Vec4f lo = _mm_cvtpd_ps(a.get_low());
+ Vec4f hi = _mm_cvtpd_ps(a.get_high());
+ return _mm_movelh_ps(lo, hi);
+}
+
+#endif
+
+/*****************************************************************************
+*
+* Reduce from 256 to 128 bit vectors
+*
+*****************************************************************************/
+#if INSTRSET >= 10 // AVX512VL
+
+// compress functions. overflow wraps around
+static inline Vec16c compress (Vec16s const a) {
+ return _mm256_cvtepi16_epi8(a);
+}
+
+static inline Vec16uc compress (Vec16us const a) {
+ return _mm256_cvtepi16_epi8(a);
+}
+
+static inline Vec8s compress (Vec8i const a) {
+ return _mm256_cvtepi32_epi16(a);
+}
+
+static inline Vec8us compress (Vec8ui const a) {
+ return _mm256_cvtepi32_epi16(a);
+}
+
+static inline Vec4i compress (Vec4q const a) {
+ return _mm256_cvtepi64_epi32(a);
+}
+
+static inline Vec4ui compress (Vec4uq const a) {
+ return _mm256_cvtepi64_epi32(a);
+}
+
+#else // no AVX512
+
+// compress functions. overflow wraps around
+static inline Vec16c compress (Vec16s const a) {
+ return compress(a.get_low(), a.get_high());
+}
+
+static inline Vec16uc compress (Vec16us const a) {
+ return compress(a.get_low(), a.get_high());
+}
+
+static inline Vec8s compress (Vec8i const a) {
+ return compress(a.get_low(), a.get_high());
+}
+
+static inline Vec8us compress (Vec8ui const a) {
+ return compress(a.get_low(), a.get_high());
+}
+
+static inline Vec4i compress (Vec4q const a) {
+ return compress(a.get_low(), a.get_high());
+}
+
+static inline Vec4ui compress (Vec4uq const a) {
+ return compress(a.get_low(), a.get_high());
+}
+
+#endif // AVX512
+
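+// Worked example: compress wraps around rather than saturating, so a Vec16s element with the
+// value 300 becomes 300 & 0xFF = 44 in the resulting Vec16c. Clamp with min/max first, or keep
+// intermediate values within range, when saturation semantics are needed.
+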
+#endif // MAX_VECTOR_SIZE >= 256
+
+
+#if MAX_VECTOR_SIZE >= 512
+
+/*****************************************************************************
+*
+* Extend from 256 to 512 bit vectors
+*
+*****************************************************************************/
+
+#if INSTRSET >= 9 // AVX512. 512 bit integer vectors
+
+// sign extend
+static inline Vec32s extend (Vec32c const a) {
+#if INSTRSET >= 10
+ return _mm512_cvtepi8_epi16(a);
+#else
+ return Vec32s(extend_low(a), extend_high(a));
+#endif
+}
+
+// zero extend
+static inline Vec32us extend (Vec32uc const a) {
+#if INSTRSET >= 10
+ return _mm512_cvtepu8_epi16(a);
+#else
+ return Vec32us(extend_low(a), extend_high(a));
+#endif
+}
+
+// sign extend
+static inline Vec16i extend (Vec16s const a) {
+ return _mm512_cvtepi16_epi32(a);
+}
+
+// zero extend
+static inline Vec16ui extend (Vec16us const a) {
+ return _mm512_cvtepu16_epi32(a);
+}
+
+// sign extend
+static inline Vec8q extend (Vec8i const a) {
+ return _mm512_cvtepi32_epi64(a);
+}
+
+// zero extend
+static inline Vec8uq extend (Vec8ui const a) {
+ return _mm512_cvtepu32_epi64(a);
+}
+
+#else // no AVX512. 512 bit vectors are emulated
+
+
+
+// sign extend
+static inline Vec32s extend (Vec32c const a) {
+ return Vec32s(extend_low(a), extend_high(a));
+}
+
+// zero extend
+static inline Vec32us extend (Vec32uc const a) {
+ return Vec32us(extend_low(a), extend_high(a));
+}
+
+// sign extend
+static inline Vec16i extend (Vec16s const a) {
+ return Vec16i(extend_low(a), extend_high(a));
+}
+
+// zero extend
+static inline Vec16ui extend (Vec16us const a) {
+ return Vec16ui(extend_low(a), extend_high(a));
+}
+
+// sign extend
+static inline Vec8q extend (Vec8i const a) {
+ return Vec8q(extend_low(a), extend_high(a));
+}
+
+// zero extend
+static inline Vec8uq extend (Vec8ui const a) {
+ return Vec8uq(extend_low(a), extend_high(a));
+}
+
+#endif // AVX512
+
+
+/*****************************************************************************
+*
+* Reduce from 512 to 256 bit vectors
+*
+*****************************************************************************/
+#if INSTRSET >= 9 // AVX512F
+
+// compress functions. overflow wraps around
+static inline Vec32c compress (Vec32s const a) {
+#if INSTRSET >= 10 // AVX512BW
+ return _mm512_cvtepi16_epi8(a);
+#else
+ return compress(a.get_low(), a.get_high());
+#endif
+}
+
+static inline Vec32uc compress (Vec32us const a) {
+ return Vec32uc(compress(Vec32s(a)));
+}
+
+static inline Vec16s compress (Vec16i const a) {
+ return _mm512_cvtepi32_epi16(a);
+}
+
+static inline Vec16us compress (Vec16ui const a) {
+ return _mm512_cvtepi32_epi16(a);
+}
+
+static inline Vec8i compress (Vec8q const a) {
+ return _mm512_cvtepi64_epi32(a);
+}
+
+static inline Vec8ui compress (Vec8uq const a) {
+ return _mm512_cvtepi64_epi32(a);
+}
+
+#else // no AVX512
+
+// compress functions. overflow wraps around
+static inline Vec32c compress (Vec32s const a) {
+ return compress(a.get_low(), a.get_high());
+}
+
+static inline Vec32uc compress (Vec32us const a) {
+ return compress(a.get_low(), a.get_high());
+}
+
+static inline Vec16s compress (Vec16i const a) {
+ return compress(a.get_low(), a.get_high());
+}
+
+static inline Vec16us compress (Vec16ui const a) {
+ return compress(a.get_low(), a.get_high());
+}
+
+static inline Vec8i compress (Vec8q const a) {
+ return compress(a.get_low(), a.get_high());
+}
+
+static inline Vec8ui compress (Vec8uq const a) {
+ return compress(a.get_low(), a.get_high());
+}
+
+#endif // AVX512
+
+/*****************************************************************************
+*
+* Conversions between float and double
+*
+*****************************************************************************/
+
+#if INSTRSET >= 9 // AVX512. 512 bit float vectors
+
+// float to double
+static inline Vec8d to_double (Vec8f const a) {
+ return _mm512_cvtps_pd(a);
+}
+
+// double to float
+static inline Vec8f to_float (Vec8d const a) {
+ return _mm512_cvtpd_ps(a);
+}
+
+#else // no AVX512. 512 bit float vectors are emulated
+
+// float to double
+static inline Vec8d to_double (Vec8f const a) {
+ Vec4d lo = to_double(a.get_low());
+ Vec4d hi = to_double(a.get_high());
+ return Vec8d(lo,hi);
+}
+
+// double to float
+static inline Vec8f to_float (Vec8d const a) {
+ Vec4f lo = to_float(a.get_low());
+ Vec4f hi = to_float(a.get_high());
+ return Vec8f(lo, hi);
+}
+
+#endif
+
+#endif // MAX_VECTOR_SIZE >= 512
+
+// double to float
+static inline Vec4f to_float (Vec2d const a) {
+ return _mm_cvtpd_ps(a);
+}
+
+
+/*****************************************************************************
+*
+* Generic template functions
+*
+* These templates define functions for multiple vector types in one template
+*
+*****************************************************************************/
+
+// horizontal min/max of vector elements
+// implemented with universal template, works for all vector types:
+
+template <typename T> auto horizontal_min(T const x) {
+ if constexpr ((T::elementtype() & 16) != 0) {
+ // T is a float or double vector
+ if (horizontal_or(is_nan(x))) {
+ // check for NAN because min does not guarantee NAN propagation
+ return x[horizontal_find_first(is_nan(x))];
+ }
+ }
+ return horizontal_min1(x);
+}
+
+template <typename T> auto horizontal_min1(T const x) {
+ if constexpr (T::elementtype() <= 3) { // boolean vector type
+ return horizontal_and(x);
+ }
+ else if constexpr (sizeof(T) >= 32) {
+ // split recursively into smaller vectors
+ return horizontal_min1(min(x.get_low(), x.get_high()));
+ }
+ else if constexpr (T::size() == 2) {
+ T a = permute2 <1, V_DC>(x); // high half
+ T b = min(a, x);
+ return b[0];
+ }
+ else if constexpr (T::size() == 4) {
+ T a = permute4<2, 3, V_DC, V_DC>(x); // high half
+ T b = min(a, x);
+ a = permute4<1, V_DC, V_DC, V_DC>(b);
+ b = min(a, b);
+ return b[0];
+ }
+ else if constexpr (T::size() == 8) {
+ T a = permute8<4, 5, 6, 7, V_DC, V_DC, V_DC, V_DC>(x); // high half
+ T b = min(a, x);
+ a = permute8<2, 3, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC>(b);
+ b = min(a, b);
+ a = permute8<1, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC>(b);
+ b = min(a, b);
+ return b[0];
+ }
+ else {
+ static_assert(T::size() == 16); // no other size is allowed
+ T a = permute16<8, 9, 10, 11, 12, 13, 14, 15, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC >(x); // high half
+ T b = min(a, x);
+ a = permute16<4, 5, 6, 7, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC>(b);
+ b = min(a, b);
+ a = permute16<2, 3, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC>(b);
+ b = min(a, b);
+ a = permute16<1, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC>(b);
+ b = min(a, b);
+ return b[0];
+ }
+}
+
+template <typename T> auto horizontal_max(T const x) {
+ if constexpr ((T::elementtype() & 16) != 0) {
+ // T is a float or double vector
+ if (horizontal_or(is_nan(x))) {
+ // check for NAN because max does not guarantee NAN propagation
+ return x[horizontal_find_first(is_nan(x))];
+ }
+ }
+ return horizontal_max1(x);
+}
+
+template <typename T> auto horizontal_max1(T const x) {
+ if constexpr (T::elementtype() <= 3) { // boolean vector type
+ return horizontal_or(x);
+ }
+ else if constexpr (sizeof(T) >= 32) {
+ // split recursively into smaller vectors
+ return horizontal_max1(max(x.get_low(), x.get_high()));
+ }
+ else if constexpr (T::size() == 2) {
+ T a = permute2 <1, V_DC>(x); // high half
+ T b = max(a, x);
+ return b[0];
+ }
+ else if constexpr (T::size() == 4) {
+ T a = permute4<2, 3, V_DC, V_DC>(x); // high half
+ T b = max(a, x);
+ a = permute4<1, V_DC, V_DC, V_DC>(b);
+ b = max(a, b);
+ return b[0];
+ }
+ else if constexpr (T::size() == 8) {
+ T a = permute8<4, 5, 6, 7, V_DC, V_DC, V_DC, V_DC>(x); // high half
+ T b = max(a, x);
+ a = permute8<2, 3, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC>(b);
+ b = max(a, b);
+ a = permute8<1, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC>(b);
+ b = max(a, b);
+ return b[0];
+ }
+ else {
+ static_assert(T::size() == 16); // no other size is allowed
+ T a = permute16<8, 9, 10, 11, 12, 13, 14, 15, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC >(x); // high half
+ T b = max(a, x);
+ a = permute16<4, 5, 6, 7, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC>(b);
+ b = max(a, b);
+ a = permute16<2, 3, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC>(b);
+ b = max(a, b);
+ a = permute16<1, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC>(b);
+ b = max(a, b);
+ return b[0];
+ }
+}
+
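+// For example, horizontal_min(Vec8i(7, 2, 9, 4, 1, 8, 3, 6)) returns 1 and horizontal_max
+// returns 9. For float and double vectors a NAN element is returned as-is (see the is_nan
+// check above) instead of being silently dropped by min/max.
+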
+// Find first element that is true in a boolean vector
+template <typename V>
+static inline int horizontal_find_first(V const x) {
+ static_assert(V::elementtype() == 2 || V::elementtype() == 3, "Boolean vector expected");
+ auto bits = to_bits(x); // convert to bits
+ if (bits == 0) return -1;
+ if constexpr (V::size() < 32) {
+ return bit_scan_forward((uint32_t)bits);
+ }
+ else {
+ return bit_scan_forward(bits);
+ }
+}
+
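+// For example, with v = Vec8i(7, 2, 9, 4, 1, 8, 3, 6), horizontal_find_first(v > 4) returns 0
+// because element 0 is the first element greater than 4; the function returns -1 when no
+// element is true.
+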
+// Count the number of elements that are true in a boolean vector
+template <typename V>