From af9a442ee281f12386ed580fa65cd2a505a1d59a Mon Sep 17 00:00:00 2001 From: Holy Wu Date: Tue, 11 Dec 2018 20:49:31 +0800 Subject: [PATCH] EEDI3: Add parameter mclip --- EEDI3/EEDI3.cpp | 351 ++++++++++++++++++++++++++++------------- EEDI3/EEDI3.hpp | 43 ++++- EEDI3/EEDI3CL.cpp | 128 +++++++-------- EEDI3/EEDI3CL.hpp | 6 +- EEDI3/EEDI3CL_SSE2.cpp | 16 +- EEDI3/EEDI3_AVX.cpp | 186 ++++++++++++++-------- EEDI3/EEDI3_AVX512.cpp | 275 +++++++++++++++++++------------- EEDI3/EEDI3_SSE2.cpp | 275 +++++++++++++++++++------------- EEDI3/EEDI3_SSE4.cpp | 275 +++++++++++++++++++------------- EEDI3/shared.hpp | 68 +++++--- README.md | 16 +- 11 files changed, 1006 insertions(+), 633 deletions(-) diff --git a/EEDI3/EEDI3.cpp b/EEDI3/EEDI3.cpp index 266146d..f1285f7 100644 --- a/EEDI3/EEDI3.cpp +++ b/EEDI3/EEDI3.cpp @@ -29,134 +29,139 @@ #include "EEDI3.hpp" #ifdef VS_TARGET_CPU_X86 -template extern void filter_sse2(const VSFrameRef *, const VSFrameRef *, VSFrameRef *, VSFrameRef **, const int, const EEDI3Data *, const VSAPI *) noexcept; -template extern void filter_sse4(const VSFrameRef *, const VSFrameRef *, VSFrameRef *, VSFrameRef **, const int, const EEDI3Data *, const VSAPI *) noexcept; -template extern void filter_avx(const VSFrameRef *, const VSFrameRef *, VSFrameRef *, VSFrameRef **, const int, const EEDI3Data *, const VSAPI *) noexcept; -template extern void filter_avx512(const VSFrameRef *, const VSFrameRef *, VSFrameRef *, VSFrameRef **, const int, const EEDI3Data *, const VSAPI *) noexcept; +template extern void filter_sse2(const VSFrameRef *, const VSFrameRef *, const VSFrameRef *, VSFrameRef *, VSFrameRef *, VSFrameRef **, const int, const EEDI3Data * const VS_RESTRICT, const VSAPI *) noexcept; +template extern void filter_sse4(const VSFrameRef *, const VSFrameRef *, const VSFrameRef *, VSFrameRef *, VSFrameRef *, VSFrameRef **, const int, const EEDI3Data * const VS_RESTRICT, const VSAPI *) noexcept; +template extern void filter_avx(const VSFrameRef *, const 
VSFrameRef *, const VSFrameRef *, VSFrameRef *, VSFrameRef *, VSFrameRef **, const int, const EEDI3Data * const VS_RESTRICT, const VSAPI *) noexcept; +template extern void filter_avx512(const VSFrameRef *, const VSFrameRef *, const VSFrameRef *, VSFrameRef *, VSFrameRef *, VSFrameRef **, const int, const EEDI3Data * const VS_RESTRICT, const VSAPI *) noexcept; #endif template -static inline void calculateConnectionCosts(const T * src3p, const T * src1p, const T * src1n, const T * src3n, float * VS_RESTRICT ccosts, - const int width, const EEDI3Data * d) noexcept { +static inline void calculateConnectionCosts(const T * src3p, const T * src1p, const T * src1n, const T * src3n, const bool * bmask, float * VS_RESTRICT ccosts, + const int width, const EEDI3Data * const VS_RESTRICT d) noexcept { if (d->cost3) { for (int x = 0; x < width; x++) { - const int umax = std::min({ x, width - 1 - x, d->mdis }); + if (!bmask || bmask[x]) { + const int umax = std::min({ x, width - 1 - x, d->mdis }); + for (int u = -umax; u <= umax; u++) { + const int u2 = u * 2; + int s0 = 0, s1 = -1, s2 = -1; - for (int u = -umax; u <= umax; u++) { - const int u2 = u * 2; - int s0 = 0, s1 = -1, s2 = -1; - - for (int k = -(d->nrad); k <= d->nrad; k++) - s0 += std::abs(src3p[x + u + k] - src1p[x - u + k]) + - std::abs(src1p[x + u + k] - src1n[x - u + k]) + - std::abs(src1n[x + u + k] - src3n[x - u + k]); - - if ((u >= 0 && x >= u2) || (u <= 0 && x < width + u2)) { - s1 = 0; for (int k = -(d->nrad); k <= d->nrad; k++) - s1 += std::abs(src3p[x + k] - src1p[x - u2 + k]) + - std::abs(src1p[x + k] - src1n[x - u2 + k]) + - std::abs(src1n[x + k] - src3n[x - u2 + k]); - } + s0 += std::abs(src3p[x + u + k] - src1p[x - u + k]) + + std::abs(src1p[x + u + k] - src1n[x - u + k]) + + std::abs(src1n[x + u + k] - src3n[x - u + k]); + + if ((u >= 0 && x >= u2) || (u <= 0 && x < width + u2)) { + s1 = 0; + for (int k = -(d->nrad); k <= d->nrad; k++) + s1 += std::abs(src3p[x + k] - src1p[x - u2 + k]) + + 
std::abs(src1p[x + k] - src1n[x - u2 + k]) + + std::abs(src1n[x + k] - src3n[x - u2 + k]); + } - if ((u <= 0 && x >= -u2) || (u >= 0 && x < width - u2)) { - s2 = 0; - for (int k = -(d->nrad); k <= d->nrad; k++) - s2 += std::abs(src3p[x + u2 + k] - src1p[x + k]) + - std::abs(src1p[x + u2 + k] - src1n[x + k]) + - std::abs(src1n[x + u2 + k] - src3n[x + k]); - } + if ((u <= 0 && x >= -u2) || (u >= 0 && x < width - u2)) { + s2 = 0; + for (int k = -(d->nrad); k <= d->nrad; k++) + s2 += std::abs(src3p[x + u2 + k] - src1p[x + k]) + + std::abs(src1p[x + u2 + k] - src1n[x + k]) + + std::abs(src1n[x + u2 + k] - src3n[x + k]); + } - s1 = (s1 >= 0) ? s1 : (s2 >= 0 ? s2 : s0); - s2 = (s2 >= 0) ? s2 : (s1 >= 0 ? s1 : s0); + s1 = (s1 >= 0) ? s1 : (s2 >= 0 ? s2 : s0); + s2 = (s2 >= 0) ? s2 : (s1 >= 0 ? s1 : s0); - const int ip = (src1p[x + u] + src1n[x - u] + 1) >> 1; // should use cubic if ucubic=true - const int v = std::abs(src1p[x] - ip) + std::abs(src1n[x] - ip); - ccosts[d->tpitch * x + u] = d->alpha * (s0 + s1 + s2) + d->beta * std::abs(u) + d->remainingWeight * v; + const int ip = (src1p[x + u] + src1n[x - u] + 1) >> 1; // should use cubic if ucubic=true + const int v = std::abs(src1p[x] - ip) + std::abs(src1n[x] - ip); + ccosts[d->tpitch * x + u] = d->alpha * (s0 + s1 + s2) + d->beta * std::abs(u) + d->remainingWeight * v; + } } } } else { for (int x = 0; x < width; x++) { - const int umax = std::min({ x, width - 1 - x, d->mdis }); - - for (int u = -umax; u <= umax; u++) { - int s = 0; + if (!bmask || bmask[x]) { + const int umax = std::min({ x, width - 1 - x, d->mdis }); + for (int u = -umax; u <= umax; u++) { + int s = 0; - for (int k = -(d->nrad); k <= d->nrad; k++) - s += std::abs(src3p[x + u + k] - src1p[x - u + k]) + - std::abs(src1p[x + u + k] - src1n[x - u + k]) + - std::abs(src1n[x + u + k] - src3n[x - u + k]); + for (int k = -(d->nrad); k <= d->nrad; k++) + s += std::abs(src3p[x + u + k] - src1p[x - u + k]) + + std::abs(src1p[x + u + k] - src1n[x - u + k]) + + 
std::abs(src1n[x + u + k] - src3n[x - u + k]); - const int ip = (src1p[x + u] + src1n[x - u] + 1) >> 1; // should use cubic if ucubic=true - const int v = std::abs(src1p[x] - ip) + std::abs(src1n[x] - ip); - ccosts[d->tpitch * x + u] = d->alpha * s + d->beta * std::abs(u) + d->remainingWeight * v; + const int ip = (src1p[x + u] + src1n[x - u] + 1) >> 1; // should use cubic if ucubic=true + const int v = std::abs(src1p[x] - ip) + std::abs(src1n[x] - ip); + ccosts[d->tpitch * x + u] = d->alpha * s + d->beta * std::abs(u) + d->remainingWeight * v; + } } } } } template<> -inline void calculateConnectionCosts(const float * src3p, const float * src1p, const float * src1n, const float * src3n, float * VS_RESTRICT ccosts, - const int width, const EEDI3Data * d) noexcept { +inline void calculateConnectionCosts(const float * src3p, const float * src1p, const float * src1n, const float * src3n, const bool * bmask, float * VS_RESTRICT ccosts, + const int width, const EEDI3Data * const VS_RESTRICT d) noexcept { if (d->cost3) { for (int x = 0; x < width; x++) { - const int umax = std::min({ x, width - 1 - x, d->mdis }); + if (!bmask || bmask[x]) { + const int umax = std::min({ x, width - 1 - x, d->mdis }); + for (int u = -umax; u <= umax; u++) { + const int u2 = u * 2; + float s0 = 0.f, s1 = -FLT_MAX, s2 = -FLT_MAX; - for (int u = -umax; u <= umax; u++) { - const int u2 = u * 2; - float s0 = 0.f, s1 = -FLT_MAX, s2 = -FLT_MAX; - - for (int k = -(d->nrad); k <= d->nrad; k++) - s0 += std::abs(src3p[x + u + k] - src1p[x - u + k]) + - std::abs(src1p[x + u + k] - src1n[x - u + k]) + - std::abs(src1n[x + u + k] - src3n[x - u + k]); - - if ((u >= 0 && x >= u2) || (u <= 0 && x < width + u2)) { - s1 = 0.f; for (int k = -(d->nrad); k <= d->nrad; k++) - s1 += std::abs(src3p[x + k] - src1p[x - u2 + k]) + - std::abs(src1p[x + k] - src1n[x - u2 + k]) + - std::abs(src1n[x + k] - src3n[x - u2 + k]); - } + s0 += std::abs(src3p[x + u + k] - src1p[x - u + k]) + + std::abs(src1p[x + u + k] - src1n[x 
- u + k]) + + std::abs(src1n[x + u + k] - src3n[x - u + k]); + + if ((u >= 0 && x >= u2) || (u <= 0 && x < width + u2)) { + s1 = 0.f; + for (int k = -(d->nrad); k <= d->nrad; k++) + s1 += std::abs(src3p[x + k] - src1p[x - u2 + k]) + + std::abs(src1p[x + k] - src1n[x - u2 + k]) + + std::abs(src1n[x + k] - src3n[x - u2 + k]); + } - if ((u <= 0 && x >= -u2) || (u >= 0 && x < width - u2)) { - s2 = 0.f; - for (int k = -(d->nrad); k <= d->nrad; k++) - s2 += std::abs(src3p[x + u2 + k] - src1p[x + k]) + - std::abs(src1p[x + u2 + k] - src1n[x + k]) + - std::abs(src1n[x + u2 + k] - src3n[x + k]); - } + if ((u <= 0 && x >= -u2) || (u >= 0 && x < width - u2)) { + s2 = 0.f; + for (int k = -(d->nrad); k <= d->nrad; k++) + s2 += std::abs(src3p[x + u2 + k] - src1p[x + k]) + + std::abs(src1p[x + u2 + k] - src1n[x + k]) + + std::abs(src1n[x + u2 + k] - src3n[x + k]); + } - s1 = (s1 > -FLT_MAX) ? s1 : (s2 > -FLT_MAX ? s2 : s0); - s2 = (s2 > -FLT_MAX) ? s2 : (s1 > -FLT_MAX ? s1 : s0); + s1 = (s1 > -FLT_MAX) ? s1 : (s2 > -FLT_MAX ? s2 : s0); + s2 = (s2 > -FLT_MAX) ? s2 : (s1 > -FLT_MAX ? 
s1 : s0); - const float ip = (src1p[x + u] + src1n[x - u]) / 2.f; // should use cubic if ucubic=true - const float v = std::abs(src1p[x] - ip) + std::abs(src1n[x] - ip); - ccosts[d->tpitch * x + u] = d->alpha * (s0 + s1 + s2) + d->beta * std::abs(u) + d->remainingWeight * v; + const float ip = (src1p[x + u] + src1n[x - u]) / 2.f; // should use cubic if ucubic=true + const float v = std::abs(src1p[x] - ip) + std::abs(src1n[x] - ip); + ccosts[d->tpitch * x + u] = d->alpha * (s0 + s1 + s2) + d->beta * std::abs(u) + d->remainingWeight * v; + } } } } else { for (int x = 0; x < width; x++) { - const int umax = std::min({ x, width - 1 - x, d->mdis }); - - for (int u = -umax; u <= umax; u++) { - float s = 0.f; + if (!bmask || bmask[x]) { + const int umax = std::min({ x, width - 1 - x, d->mdis }); + for (int u = -umax; u <= umax; u++) { + float s = 0.f; - for (int k = -(d->nrad); k <= d->nrad; k++) - s += std::abs(src3p[x + u + k] - src1p[x - u + k]) + - std::abs(src1p[x + u + k] - src1n[x - u + k]) + - std::abs(src1n[x + u + k] - src3n[x - u + k]); + for (int k = -(d->nrad); k <= d->nrad; k++) + s += std::abs(src3p[x + u + k] - src1p[x - u + k]) + + std::abs(src1p[x + u + k] - src1n[x - u + k]) + + std::abs(src1n[x + u + k] - src3n[x - u + k]); - const float ip = (src1p[x + u] + src1n[x - u]) / 2.f; // should use cubic if ucubic=true - const float v = std::abs(src1p[x] - ip) + std::abs(src1n[x] - ip); - ccosts[d->tpitch * x + u] = d->alpha * s + d->beta * std::abs(u) + d->remainingWeight * v; + const float ip = (src1p[x + u] + src1n[x - u]) / 2.f; // should use cubic if ucubic=true + const float v = std::abs(src1p[x] - ip) + std::abs(src1n[x] - ip); + ccosts[d->tpitch * x + u] = d->alpha * s + d->beta * std::abs(u) + d->remainingWeight * v; + } } } } } -template -static void filter_c(const VSFrameRef * src, const VSFrameRef * scp, VSFrameRef * dst, VSFrameRef ** pad, const int field_n, const EEDI3Data * d, const VSAPI * vsapi) noexcept { +template +static void 
filter_c(const VSFrameRef * src, const VSFrameRef * scp, const VSFrameRef * mclip, VSFrameRef * mcp, VSFrameRef * dst, VSFrameRef ** pad, + const int field_n, const EEDI3Data * const VS_RESTRICT d, const VSAPI * vsapi) noexcept { for (int plane = 0; plane < d->vi.format->numPlanes; plane++) { if (d->process[plane]) { copyPad(src, pad[plane], plane, 1 - field_n, d->dh, vsapi); @@ -170,13 +175,20 @@ static void filter_c(const VSFrameRef * src, const VSFrameRef * scp, VSFrameRef const T1 * _srcp = reinterpret_cast(vsapi->getReadPtr(pad[plane], 0)) + 12; T1 * VS_RESTRICT _dstp = reinterpret_cast(vsapi->getWritePtr(dst, plane)); + const uint8_t * _maskp = nullptr; + if (d->mclip) { + copyMask(mclip, mcp, plane, field_n, d->dh, vsapi); + _maskp = vsapi->getReadPtr(mcp, plane); + } + const auto threadId = std::this_thread::get_id(); + bool * bmask = d->bmask.at(threadId); float * ccosts = d->ccosts.at(threadId) + d->mdis; float * pcosts = d->pcosts.at(threadId) + d->mdis; int * pbackt = d->pbackt.at(threadId) + d->mdis; int * fpath = d->fpath.at(threadId); int * _dmap = d->dmap.at(threadId); - float * tline = d->tline.at(threadId); + int * tline = d->tline.at(threadId); vs_bitblt(_dstp + dstStride * (1 - field_n), vsapi->getStride(dst, plane) * 2, _srcp + srcStride * (4 + 1 - field_n), vsapi->getStride(pad[plane], 0) * 2, @@ -196,7 +208,30 @@ static void filter_c(const VSFrameRef * src, const VSFrameRef * scp, VSFrameRef const T1 * src1n = srcp + srcStride; const T1 * src3n = srcp + srcStride * 3; - calculateConnectionCosts(src3p, src1p, src1n, src3n, ccosts, dstWidth, d); + if (bmask) { + const uint8_t * maskp = _maskp + vsapi->getStride(mcp, plane) * off; + const int mdis = std::min(dstWidth, d->mdis); + int last = -666999; + + for (int x = 0; x < mdis; x++) { + if (maskp[x]) + last = x + mdis; + } + + for (int x = 0; x < dstWidth - mdis; x++) { + if (maskp[x + mdis]) + last = x + mdis * 2; + + bmask[x] = (x <= last); + } + + for (int x = dstWidth - mdis; x < dstWidth; 
x++) + bmask[x] = (x <= last); + + memset(ccosts - d->mdis, 0, dstWidth * d->tpitch * sizeof(float)); + } + + calculateConnectionCosts(src3p, src1p, src1n, src3n, bmask, ccosts, dstWidth, d); // calculate path costs *pcosts = *ccosts; @@ -206,25 +241,41 @@ static void filter_c(const VSFrameRef * src, const VSFrameRef * scp, VSFrameRef float * pT = pcosts + d->tpitch * x; int * piT = pbackt + d->tpitch * (x - 1); - const int umax = std::min({ x, dstWidth - 1 - x, d->mdis }); - const int umax2 = std::min({ x - 1, dstWidth - x, d->mdis }); - - for (int u = -umax; u <= umax; u++) { - int idx = 0; - float bval = FLT_MAX; - - for (int v = std::max(-umax2, u - 1); v <= std::min(umax2, u + 1); v++) { - const double z = ppT[v] + d->gamma * std::abs(u - v); - const float ccost = static_cast(std::min(z, FLT_MAX * 0.9)); - if (ccost < bval) { - bval = ccost; - idx = v; + if (bmask && !bmask[x]) { + if (x == 1) { + const int umax = std::min({ x, dstWidth - 1 - x, d->mdis }); + for (int u = -umax; u <= umax; u++) + pT[u] = tT[u]; + memset(piT - d->mdis, 0, d->tpitch * sizeof(int)); + } else { + memcpy(pT - d->mdis, ppT - d->mdis, d->tpitch * sizeof(float)); + memcpy(piT - d->mdis, piT - d->mdis - d->tpitch, d->tpitch * sizeof(int)); + const int pumax = std::min(x - 1, dstWidth - x); + if (pumax < d->mdis) { + piT[-pumax] = 1 - pumax; + piT[pumax] = pumax - 1; } } + } else { + const int umax = std::min({ x, dstWidth - 1 - x, d->mdis }); + const int umax2 = std::min({ x - 1, dstWidth - x, d->mdis }); + for (int u = -umax; u <= umax; u++) { + int idx = 0; + float bval = FLT_MAX; + + for (int v = std::max(-umax2, u - 1); v <= std::min(umax2, u + 1); v++) { + const double z = ppT[v] + d->gamma * std::abs(u - v); + const float ccost = static_cast(std::min(z, FLT_MAX * 0.9)); + if (ccost < bval) { + bval = ccost; + idx = v; + } + } - const double z = bval + tT[u]; - pT[u] = static_cast(std::min(z, FLT_MAX * 0.9)); - piT[u] = idx; + const double z = bval + tT[u]; + pT[u] = 
static_cast(std::min(z, FLT_MAX * 0.9)); + piT[u] = idx; + } } } @@ -233,7 +284,7 @@ static void filter_c(const VSFrameRef * src, const VSFrameRef * scp, VSFrameRef for (int x = dstWidth - 2; x >= 0; x--) fpath[x] = pbackt[d->tpitch * x + fpath[x + 1]]; - interpolate(src3p, src1p, src1n, src3n, fpath, dmap, dstp, dstWidth, d->ucubic, d->peak); + interpolate(src3p, src1p, src1n, src3n, bmask, fpath, dmap, dstp, dstWidth, d->ucubic, d->peak); } if (d->vcheck) { @@ -267,7 +318,7 @@ static void selectFunctions(const unsigned opt, EEDI3Data * d) noexcept { #endif if (d->vi.format->bytesPerSample == 1) { - d->filter = filter_c; + d->filter = filter_c; #ifdef VS_TARGET_CPU_X86 if ((opt == 0 && iset >= 9) || opt == 5) @@ -280,7 +331,7 @@ static void selectFunctions(const unsigned opt, EEDI3Data * d) noexcept { d->filter = filter_sse2; #endif } else if (d->vi.format->bytesPerSample == 2) { - d->filter = filter_c; + d->filter = filter_c; #ifdef VS_TARGET_CPU_X86 if ((opt == 0 && iset >= 9) || opt == 5) @@ -293,7 +344,7 @@ static void selectFunctions(const unsigned opt, EEDI3Data * d) noexcept { d->filter = filter_sse2; #endif } else { - d->filter = filter_c; + d->filter = filter_c; #ifdef VS_TARGET_CPU_X86 if ((opt == 0 && iset >= 9) || opt == 5) @@ -321,6 +372,9 @@ static const VSFrameRef *VS_CC eedi3GetFrame(int n, int activationReason, void * if (d->vcheck && d->sclip) vsapi->requestFrameFilter(n, d->sclip, frameCtx); + + if (d->mclip) + vsapi->requestFrameFilter(d->field > 1 ? 
n / 2 : n, d->mclip, frameCtx); } else if (activationReason == arAllFramesReady) { #ifdef VS_TARGET_CPU_X86 no_subnormals(); @@ -335,8 +389,27 @@ static const VSFrameRef *VS_CC eedi3GetFrame(int n, int activationReason, void * if (!srcVector) throw std::string{ "malloc failure (srcVector)" }; d->srcVector.emplace(threadId, srcVector); + + if (d->mclip) { + uint8_t * mskVector = vs_aligned_malloc(d->vi.width * d->vectorSize * sizeof(uint8_t), d->alignment); + if (!mskVector) + throw std::string{ "malloc failure (mskVector)" }; + d->mskVector.emplace(threadId, mskVector); + } else { + d->mskVector.emplace(threadId, nullptr); + } } else { d->srcVector.emplace(threadId, nullptr); + d->mskVector.emplace(threadId, nullptr); + } + + if (d->mclip) { + bool * bmask = new (std::nothrow) bool[d->vi.width]; + if (!bmask) + throw std::string{ "malloc failure (bmask)" }; + d->bmask.emplace(threadId, bmask); + } else { + d->bmask.emplace(threadId, nullptr); } float * ccosts = vs_aligned_malloc(d->vi.width * d->tpitchVector * sizeof(float), d->alignment); @@ -365,7 +438,7 @@ static const VSFrameRef *VS_CC eedi3GetFrame(int n, int activationReason, void * d->dmap.emplace(threadId, dmap); if (d->vcheck) { - float * tline = new (std::nothrow) float[d->vi.width]; + int * tline = new (std::nothrow) int[d->vi.width]; if (!tline) throw std::string{ "malloc failure (tline)" }; d->tline.emplace(threadId, tline); @@ -387,6 +460,13 @@ static const VSFrameRef *VS_CC eedi3GetFrame(int n, int activationReason, void * if (d->vcheck && d->sclip) scp = vsapi->getFrameFilter(n, d->sclip, frameCtx); + const VSFrameRef * mclip = nullptr; + VSFrameRef * mcp = nullptr; + if (d->mclip) { + mclip = vsapi->getFrameFilter(d->field > 1 ? 
n / 2 : n, d->mclip, frameCtx); + mcp = vsapi->newVideoFrame(vsapi->getVideoInfo(d->mclip)->format, d->vi.width, d->vi.height / 2, nullptr, core); + } + VSFrameRef * pad[3] = {}; for (int plane = 0; plane < d->vi.format->numPlanes; plane++) { if (d->process[plane]) @@ -417,7 +497,7 @@ static const VSFrameRef *VS_CC eedi3GetFrame(int n, int activationReason, void * field_n = field; } - d->filter(src, scp, dst, pad, field_n, d, vsapi); + d->filter(src, scp, mclip, mcp, dst, pad, field_n, d, vsapi); VSMap * props = vsapi->getFramePropsRW(dst); vsapi->propSetInt(props, "_FieldBased", 0, paReplace); @@ -435,6 +515,8 @@ static const VSFrameRef *VS_CC eedi3GetFrame(int n, int activationReason, void * vsapi->freeFrame(src); vsapi->freeFrame(scp); + vsapi->freeFrame(mclip); + vsapi->freeFrame(mcp); for (int plane = 0; plane < d->vi.format->numPlanes; plane++) vsapi->freeFrame(pad[plane]); @@ -449,10 +531,17 @@ static void VS_CC eedi3Free(void *instanceData, VSCore *core, const VSAPI *vsapi vsapi->freeNode(d->node); vsapi->freeNode(d->sclip); + vsapi->freeNode(d->mclip); for (auto & iter : d->srcVector) vs_aligned_free(iter.second); + for (auto & iter : d->mskVector) + vs_aligned_free(iter.second); + + for (auto & iter : d->bmask) + delete[] iter.second; + for (auto & iter : d->ccosts) vs_aligned_free(iter.second); @@ -480,6 +569,7 @@ void VS_CC eedi3Create(const VSMap *in, VSMap *out, void *userData, VSCore *core d->node = vsapi->propGetNode(in, "clip", 0, nullptr); d->sclip = vsapi->propGetNode(in, "sclip", 0, &err); + d->mclip = vsapi->propGetNode(in, "mclip", 0, &err); d->vi = *vsapi->getVideoInfo(d->node); try { @@ -587,6 +677,35 @@ void VS_CC eedi3Create(const VSMap *in, VSMap *out, void *userData, VSCore *core if (d->vcheck && (vthresh0 <= 0.f || vthresh1 <= 0.f || d->vthresh2 <= 0.f)) throw std::string{ "vthresh0, vthresh1, and vthresh2 must be greater than 0.0" }; + if (d->mclip) { + if (!isSameFormat(vsapi->getVideoInfo(d->mclip), &d->vi)) + throw std::string{ 
"mclip's format doesn't match" }; + + if (vsapi->getVideoInfo(d->mclip)->numFrames != d->vi.numFrames) + throw std::string{ "mclip's number of frames doesn't match" }; + + if (vsapi->getVideoInfo(d->mclip)->format->bitsPerSample != 8) { + VSMap * args = vsapi->createMap(); + vsapi->propSetNode(args, "clip", d->mclip, paReplace); + vsapi->freeNode(d->mclip); + vsapi->propSetInt(args, "format", vsapi->registerFormat(d->vi.format->colorFamily, stInteger, 8, d->vi.format->subSamplingW, d->vi.format->subSamplingH, core)->id, paReplace); + + VSMap * ret = vsapi->invoke(vsapi->getPluginById("com.vapoursynth.resize", core), "Point", args); + if (vsapi->getError(ret)) { + vsapi->setError(out, vsapi->getError(ret)); + vsapi->freeNode(d->node); + vsapi->freeNode(d->sclip); + vsapi->freeMap(args); + vsapi->freeMap(ret); + return; + } + + d->mclip = vsapi->propGetNode(ret, "clip", 0, nullptr); + vsapi->freeMap(args); + vsapi->freeMap(ret); + } + } + if (opt < 0 || opt > 5) throw std::string{ "opt must be 0, 1, 2, 3, 4, or 5" }; @@ -608,7 +727,7 @@ void VS_CC eedi3Create(const VSMap *in, VSMap *out, void *userData, VSCore *core if (d->vcheck && d->sclip) { if (!isSameFormat(vsapi->getVideoInfo(d->sclip), &d->vi)) - throw std::string{ "sclip must have the same dimensions as main clip and be the same format" }; + throw std::string{ "sclip's format doesn't match" }; if (vsapi->getVideoInfo(d->sclip)->numFrames != d->vi.numFrames) throw std::string{ "sclip's number of frames doesn't match" }; @@ -616,6 +735,8 @@ void VS_CC eedi3Create(const VSMap *in, VSMap *out, void *userData, VSCore *core const unsigned numThreads = vsapi->getCoreInfo(core)->numThreads; d->srcVector.reserve(numThreads); + d->mskVector.reserve(numThreads); + d->bmask.reserve(numThreads); d->ccosts.reserve(numThreads); d->pcosts.reserve(numThreads); d->pbackt.reserve(numThreads); @@ -640,8 +761,8 @@ void VS_CC eedi3Create(const VSMap *in, VSMap *out, void *userData, VSCore *core } d->tpitch = d->mdis * 2 + 1; - 
d->mdisVector = d->mdis * d->vectorSize; d->tpitchVector = d->tpitch * d->vectorSize; + d->mdisVector = d->mdis * d->vectorSize; d->rcpVthresh0 = 1.f / vthresh0; d->rcpVthresh1 = 1.f / vthresh1; @@ -650,6 +771,7 @@ void VS_CC eedi3Create(const VSMap *in, VSMap *out, void *userData, VSCore *core vsapi->setError(out, ("EEDI3: " + error).c_str()); vsapi->freeNode(d->node); vsapi->freeNode(d->sclip); + vsapi->freeNode(d->mclip); return; } @@ -684,6 +806,7 @@ VS_EXTERNAL_API(void) VapourSynthPluginInit(VSConfigPlugin configFunc, VSRegiste "vthresh1:float:opt;" "vthresh2:float:opt;" "sclip:clip:opt;" + "mclip:clip:opt;" "opt:int:opt;", eedi3Create, nullptr, plugin); diff --git a/EEDI3/EEDI3.hpp b/EEDI3/EEDI3.hpp index 68caeb4..1c5aa03 100644 --- a/EEDI3/EEDI3.hpp +++ b/EEDI3/EEDI3.hpp @@ -6,8 +6,17 @@ #define MAX_VECTOR_SIZE 512 #include "vectorclass/vectorclass.h" +static void copyMask(const VSFrameRef * src, VSFrameRef * dst, const int plane, const int field_n, const bool dh, const VSAPI * vsapi) noexcept { + const int off = dh ? 0 : field_n; + const int mul = dh ? 
1 : 2; + + vs_bitblt(vsapi->getWritePtr(dst, plane), vsapi->getStride(dst, plane), + vsapi->getReadPtr(src, plane) + vsapi->getStride(src, plane) * off, vsapi->getStride(src, plane) * mul, + vsapi->getFrameWidth(src, plane), vsapi->getFrameHeight(dst, plane)); +} + template -static inline void reorder(const T1 * srcp, T2 * _dstp, const int width, const int height, const int srcStride, const int dstStride, const int srcY, const int vectorSize) noexcept { +static inline void prepareLines(const T1 * srcp, T2 * _dstp, const int width, const int height, const int srcStride, const int dstStride, const int srcY, const int vectorSize) noexcept { for (int y = srcY - 2; y < srcY + 2; y++) { T2 * VS_RESTRICT dstp = _dstp; @@ -25,11 +34,11 @@ static inline void reorder(const T1 * srcp, T2 * _dstp, const int width, const i } for (int x = 0; x < width; x++) - dstp[(x + 12) * vectorSize] = line[x]; + dstp[(12 + x) * vectorSize] = line[x]; for (int x = 0; x < 12; x++) { const int srcX = std::max(width - 1 - x, 0); - dstp[(width + x + 12) * vectorSize] = line[srcX]; + dstp[(width + 12 + x) * vectorSize] = line[srcX]; } dstp++; @@ -38,17 +47,35 @@ static inline void reorder(const T1 * srcp, T2 * _dstp, const int width, const i _dstp += dstStride * vectorSize; } } + +static inline void prepareMask(const uint8_t * srcp, uint8_t * VS_RESTRICT dstp, const int width, const int height, const int stride, const int srcY, const int vectorSize) noexcept { + for (int y = srcY; y < srcY + vectorSize; y++) { + int realY = y; + if (realY >= height) + realY = height * 2 - 1 - realY; + realY = std::max(realY, 0); + + const uint8_t * line = srcp + stride * realY; + + for (int x = 0; x < width; x++) + dstp[x * vectorSize] = line[x]; + + dstp++; + } +} #endif struct EEDI3Data { - VSNodeRef * node, * sclip; + VSNodeRef * node, * sclip, * mclip; VSVideoInfo vi; int field, nrad, mdis, vcheck; bool dh, process[3], ucubic, cost3; float alpha, beta, gamma, vthresh2; - int peak, vectorSize, tpitch, 
mdisVector, tpitchVector, alignment; + int vectorSize, alignment, tpitch, tpitchVector, mdisVector, peak; float remainingWeight, rcpVthresh0, rcpVthresh1, rcpVthresh2; - std::unordered_map ccosts, pcosts, tline; - std::unordered_map srcVector, pbackt, fpath, dmap; - void (*filter)(const VSFrameRef *, const VSFrameRef *, VSFrameRef *, VSFrameRef **, const int, const EEDI3Data *, const VSAPI *); + std::unordered_map srcVector, pbackt, fpath, dmap, tline; + std::unordered_map mskVector; + std::unordered_map bmask; + std::unordered_map ccosts, pcosts; + void (*filter)(const VSFrameRef *, const VSFrameRef *, const VSFrameRef *, VSFrameRef *, VSFrameRef *, VSFrameRef **, const int, const EEDI3Data * const VS_RESTRICT, const VSAPI *); }; diff --git a/EEDI3/EEDI3CL.cpp b/EEDI3/EEDI3CL.cpp index d243d51..940e8a9 100644 --- a/EEDI3/EEDI3CL.cpp +++ b/EEDI3/EEDI3CL.cpp @@ -32,10 +32,11 @@ #include "EEDI3CL.hpp" #include "EEDI3CL.cl" -template extern void filterCL_sse2(const VSFrameRef *, const VSFrameRef *, VSFrameRef *, VSFrameRef **, const int, const EEDI3CLData *, const VSAPI *); +template extern void filterCL_sse2(const VSFrameRef *, const VSFrameRef *, VSFrameRef *, VSFrameRef **, const int, const EEDI3CLData * const VS_RESTRICT, const VSAPI *); template -static void filterCL_c(const VSFrameRef * src, const VSFrameRef * scp, VSFrameRef * dst, VSFrameRef ** pad, const int field_n, const EEDI3CLData * d, const VSAPI * vsapi) { +static void filterCL_c(const VSFrameRef * src, const VSFrameRef * scp, VSFrameRef * dst, VSFrameRef ** pad, + const int field_n, const EEDI3CLData * const VS_RESTRICT d, const VSAPI * vsapi) { for (int plane = 0; plane < d->vi.format->numPlanes; plane++) { if (d->process[plane]) { copyPad(src, pad[plane], plane, 1 - field_n, d->dh, vsapi); @@ -58,7 +59,7 @@ static void filterCL_c(const VSFrameRef * src, const VSFrameRef * scp, VSFrameRe int * pbackt = d->pbackt.at(threadId) + d->mdis; int * fpath = d->fpath.at(threadId); int * _dmap = 
d->dmap.at(threadId); - float * tline = d->tline.at(threadId); + int * tline = d->tline.at(threadId); const size_t globalWorkSize[] = { static_cast((dstWidth + 63) & -64), 1 }; constexpr size_t localWorkSize[] = { 64, 1 }; @@ -96,7 +97,6 @@ static void filterCL_c(const VSFrameRef * src, const VSFrameRef * scp, VSFrameRe const int umax = std::min({ x, dstWidth - 1 - x, d->mdis }); const int umax2 = std::min({ x - 1, dstWidth - x, d->mdis }); - for (int u = -umax; u <= umax; u++) { int idx = 0; float bval = FLT_MAX; @@ -121,7 +121,7 @@ static void filterCL_c(const VSFrameRef * src, const VSFrameRef * scp, VSFrameRe for (int x = dstWidth - 2; x >= 0; x--) fpath[x] = pbackt[d->tpitch * x + fpath[x + 1]]; - interpolate(src3p, src1p, src1n, src3n, fpath, dmap, dstp, dstWidth, d->ucubic, d->peak); + interpolate(src3p, src1p, src1n, src3n, nullptr, fpath, dmap, dstp, dstWidth, d->ucubic, d->peak); queue.enqueue_unmap_buffer(_ccosts, ccosts - d->mdis); } @@ -131,7 +131,7 @@ static void filterCL_c(const VSFrameRef * src, const VSFrameRef * scp, VSFrameRe const T * scpp = nullptr; if (d->sclip) scpp = reinterpret_cast(vsapi->getReadPtr(scp, plane)) + dstStride * field_n; - T * dstp = _dstp + dstStride * field_n;; + T * dstp = _dstp + dstStride * field_n; vCheck(srcp, scpp, dstp, _dmap, tline, field_n, dstWidth, srcHeight, srcStride, dstStride, d->vcheck, d->vthresh2, d->rcpVthresh0, d->rcpVthresh1, d->rcpVthresh2, d->peak); } @@ -217,7 +217,7 @@ static const VSFrameRef *VS_CC eedi3clGetFrame(int n, int activationReason, void d->dmap.emplace(threadId, dmap); if (d->vcheck) { - float * tline = new (std::nothrow) float[d->vi.width]; + int * tline = new (std::nothrow) int[d->vi.width]; if (!tline) throw std::string{ "malloc failure (tline)" }; d->tline.emplace(threadId, tline); @@ -486,6 +486,53 @@ void VS_CC eedi3clCreate(const VSMap *in, VSMap *out, void *userData, VSCore *co return; } + d->gpu = compute::system::default_device(); + if (device > -1) + d->gpu = 
compute::system::devices().at(device); + d->ctx = compute::context{ d->gpu }; + + if (!!vsapi->propGetInt(in, "info", 0, &err)) { + vsapi->freeNode(d->sclip); + + std::string text{ "=== Device Info ===\n" }; + text += "Name: " + d->gpu.get_info() + "\n"; + text += "Vendor: " + d->gpu.get_info() + "\n"; + text += "Profile: " + d->gpu.get_info() + "\n"; + text += "Version: " + d->gpu.get_info() + "\n"; + text += "Global Memory Size: " + std::to_string(d->gpu.get_info() / 1024 / 1024) + " MB\n"; + text += "Local Memory Size: " + std::to_string(d->gpu.get_info() / 1024) + " KB\n"; + text += "Local Memory Type: " + std::string{ d->gpu.get_info() == CL_LOCAL ? "CL_LOCAL" : "CL_GLOBAL" } +"\n"; + text += "Image Support: " + std::string{ d->gpu.get_info() ? "CL_TRUE" : "CL_FALSE" } +"\n"; + text += "1D Image Max Buffer Size: " + std::to_string(d->gpu.get_info(CL_DEVICE_IMAGE_MAX_BUFFER_SIZE)) + "\n"; + text += "2D Image Max Width: " + std::to_string(d->gpu.get_info()) + "\n"; + text += "2D Image Max Height: " + std::to_string(d->gpu.get_info()) + "\n"; + text += "Max Constant Arguments: " + std::to_string(d->gpu.get_info()) + "\n"; + text += "Max Constant Buffer Size: " + std::to_string(d->gpu.get_info() / 1024) + " KB\n"; + text += "Max Work-group Size: " + std::to_string(d->gpu.get_info()) + "\n"; + const auto MAX_WORK_ITEM_SIZES = d->gpu.get_info(); + text += "Max Work-item Sizes: (" + std::to_string(MAX_WORK_ITEM_SIZES[0]) + ", " + std::to_string(MAX_WORK_ITEM_SIZES[1]) + ", " + std::to_string(MAX_WORK_ITEM_SIZES[2]) + ")"; + + VSMap * args = vsapi->createMap(); + vsapi->propSetNode(args, "clip", d->node, paReplace); + vsapi->freeNode(d->node); + vsapi->propSetData(args, "text", text.c_str(), -1, paReplace); + + VSMap * ret = vsapi->invoke(vsapi->getPluginById("com.vapoursynth.text", core), "Text", args); + if (vsapi->getError(ret)) { + vsapi->setError(out, vsapi->getError(ret)); + vsapi->freeMap(args); + vsapi->freeMap(ret); + return; + } + + d->node = 
vsapi->propGetNode(ret, "clip", 0, nullptr); + vsapi->freeMap(args); + vsapi->freeMap(ret); + vsapi->propSetNode(out, "clip", d->node, paReplace); + vsapi->freeNode(d->node); + return; + } + if (d->field > 1) { if (d->vi.numFrames > INT_MAX / 2) throw std::string{ "resulting clip is too long" }; @@ -504,7 +551,7 @@ void VS_CC eedi3clCreate(const VSMap *in, VSMap *out, void *userData, VSCore *co if (d->vcheck && d->sclip) { if (!isSameFormat(vsapi->getVideoInfo(d->sclip), &d->vi)) - throw std::string{ "sclip must have the same dimensions as main clip and be the same format" }; + throw std::string{ "sclip's format doesn't match" }; if (vsapi->getVideoInfo(d->sclip)->numFrames != d->vi.numFrames) throw std::string{ "sclip's number of frames doesn't match" }; @@ -521,6 +568,8 @@ void VS_CC eedi3clCreate(const VSMap *in, VSMap *out, void *userData, VSCore *co d->dmap.reserve(numThreads); d->tline.reserve(numThreads); + selectFunctions(opt, d.get()); + if (d->vi.format->sampleType == stInteger) { d->peak = (1 << d->vi.format->bitsPerSample) - 1; const float scale = d->peak / 255.f; @@ -535,62 +584,20 @@ void VS_CC eedi3clCreate(const VSMap *in, VSMap *out, void *userData, VSCore *co vthresh1 /= 255.f; } - selectFunctions(opt, d.get()); - d->tpitch = d->mdis * 2 + 1; - d->mdisVector = d->mdis * d->vectorSize; d->tpitchVector = d->tpitch * d->vectorSize; + d->mdisVector = d->mdis * d->vectorSize; d->rcpVthresh0 = 1.f / vthresh0; d->rcpVthresh1 = 1.f / vthresh1; d->rcpVthresh2 = 1.f / d->vthresh2; - d->gpu = compute::system::default_device(); - if (device > -1) - d->gpu = compute::system::devices().at(device); - d->ctx = compute::context{ d->gpu }; - - if (!!vsapi->propGetInt(in, "info", 0, &err)) { - vsapi->freeNode(d->sclip); - - std::string text{ "=== Device Info ===\n" }; - text += "Name: " + d->gpu.get_info() + "\n"; - text += "Vendor: " + d->gpu.get_info() + "\n"; - text += "Profile: " + d->gpu.get_info() + "\n"; - text += "Version: " + d->gpu.get_info() + "\n"; - 
text += "Global Memory Size: " + std::to_string(d->gpu.get_info() / 1024 / 1024) + " MB\n"; - text += "Local Memory Size: " + std::to_string(d->gpu.get_info() / 1024) + " KB\n"; - text += "Local Memory Type: " + std::string{ d->gpu.get_info() == CL_LOCAL ? "CL_LOCAL" : "CL_GLOBAL" } +"\n"; - text += "Image Support: " + std::string{ d->gpu.get_info() ? "CL_TRUE" : "CL_FALSE" } +"\n"; - text += "1D Image Max Buffer Size: " + std::to_string(d->gpu.get_info(CL_DEVICE_IMAGE_MAX_BUFFER_SIZE)) + "\n"; - text += "2D Image Max Width: " + std::to_string(d->gpu.get_info()) + "\n"; - text += "2D Image Max Height: " + std::to_string(d->gpu.get_info()) + "\n"; - text += "Max Constant Arguments: " + std::to_string(d->gpu.get_info()) + "\n"; - text += "Max Constant Buffer Size: " + std::to_string(d->gpu.get_info() / 1024) + " KB\n"; - text += "Max Work-group Size: " + std::to_string(d->gpu.get_info()) + "\n"; - const auto MAX_WORK_ITEM_SIZES = d->gpu.get_info(); - text += "Max Work-item Sizes: (" + std::to_string(MAX_WORK_ITEM_SIZES[0]) + ", " + std::to_string(MAX_WORK_ITEM_SIZES[1]) + ", " + std::to_string(MAX_WORK_ITEM_SIZES[2]) + ")"; - - VSMap * args = vsapi->createMap(); - vsapi->propSetNode(args, "clip", d->node, paReplace); - vsapi->freeNode(d->node); - vsapi->propSetData(args, "text", text.c_str(), -1, paReplace); - - VSMap * ret = vsapi->invoke(vsapi->getPluginById("com.vapoursynth.text", core), "Text", args); - if (vsapi->getError(ret)) { - vsapi->setError(out, vsapi->getError(ret)); - vsapi->freeMap(args); - vsapi->freeMap(ret); - return; - } - - d->node = vsapi->propGetNode(ret, "clip", 0, nullptr); - vsapi->freeMap(args); - vsapi->freeMap(ret); - vsapi->propSetNode(out, "clip", d->node, paReplace); - vsapi->freeNode(d->node); - return; - } + if (d->vi.format->bytesPerSample == 1) + d->clImageFormat = { CL_R, CL_UNSIGNED_INT8 }; + else if (d->vi.format->bytesPerSample == 2) + d->clImageFormat = { CL_R, CL_UNSIGNED_INT16 }; + else + d->clImageFormat = { CL_R, CL_FLOAT 
}; try { std::setlocale(LC_ALL, "C"); @@ -616,13 +623,6 @@ void VS_CC eedi3clCreate(const VSMap *in, VSMap *out, void *userData, VSCore *co } catch (const compute::opencl_error & error) { throw error.error_string() + "\n" + d->program.build_log(); } - - if (d->vi.format->bytesPerSample == 1) - d->clImageFormat = { CL_R, CL_UNSIGNED_INT8 }; - else if (d->vi.format->bytesPerSample == 2) - d->clImageFormat = { CL_R, CL_UNSIGNED_INT16 }; - else - d->clImageFormat = { CL_R, CL_FLOAT }; } catch (const std::string & error) { vsapi->setError(out, ("EEDI3CL: " + error).c_str()); vsapi->freeNode(d->node); diff --git a/EEDI3/EEDI3CL.hpp b/EEDI3/EEDI3CL.hpp index 0e804a0..aef10bc 100644 --- a/EEDI3/EEDI3CL.hpp +++ b/EEDI3/EEDI3CL.hpp @@ -28,7 +28,7 @@ struct EEDI3CLData { std::unordered_map calculateConnectionCosts; std::unordered_map src; std::unordered_map ccosts; - std::unordered_map pcosts, tline; - std::unordered_map pbackt, fpath, dmap; - void (*filter)(const VSFrameRef *, const VSFrameRef *, VSFrameRef *, VSFrameRef **, const int, const EEDI3CLData *, const VSAPI *); + std::unordered_map pcosts; + std::unordered_map pbackt, fpath, dmap, tline; + void (*filter)(const VSFrameRef *, const VSFrameRef *, VSFrameRef *, VSFrameRef **, const int, const EEDI3CLData * const VS_RESTRICT, const VSAPI *); }; diff --git a/EEDI3/EEDI3CL_SSE2.cpp b/EEDI3/EEDI3CL_SSE2.cpp index ea211f4..23b1768 100644 --- a/EEDI3/EEDI3CL_SSE2.cpp +++ b/EEDI3/EEDI3CL_SSE2.cpp @@ -2,7 +2,8 @@ #include "EEDI3CL.hpp" template -void filterCL_sse2(const VSFrameRef * src, const VSFrameRef * scp, VSFrameRef * dst, VSFrameRef ** pad, const int field_n, const EEDI3CLData * d, const VSAPI * vsapi) { +void filterCL_sse2(const VSFrameRef * src, const VSFrameRef * scp, VSFrameRef * dst, VSFrameRef ** pad, + const int field_n, const EEDI3CLData * const VS_RESTRICT d, const VSAPI * vsapi) { for (int plane = 0; plane < d->vi.format->numPlanes; plane++) { if (d->process[plane]) { copyPad(src, pad[plane], plane, 1 - 
field_n, d->dh, vsapi); @@ -25,7 +26,7 @@ void filterCL_sse2(const VSFrameRef * src, const VSFrameRef * scp, VSFrameRef * int * _pbackt = d->pbackt.at(threadId) + d->mdisVector; int * fpath = d->fpath.at(threadId); int * _dmap = d->dmap.at(threadId); - float * tline = d->tline.at(threadId); + int * tline = d->tline.at(threadId); const size_t globalWorkSize[] = { static_cast((dstWidth + 15) & -16), static_cast(d->vectorSize) }; constexpr size_t localWorkSize[] = { 16, 4 }; @@ -55,7 +56,6 @@ void filterCL_sse2(const VSFrameRef * src, const VSFrameRef * scp, VSFrameRef * const int umax = std::min({ x, dstWidth - 1 - x, d->mdis }); const int umax2 = std::min({ x - 1, dstWidth - x, d->mdis }); - for (int u = -umax; u <= umax; u++) { Vec4i idx = zero_128b(); Vec4f bval = FLT_MAX; @@ -94,7 +94,7 @@ void filterCL_sse2(const VSFrameRef * src, const VSFrameRef * scp, VSFrameRef * for (int x = dstWidth - 2; x >= 0; x--) fpath[x] = pbackt[(d->tpitch * x + fpath[x + 1]) * d->vectorSize]; - interpolate(src3p, src1p, src1n, src3n, fpath, dmap, dstp, dstWidth, d->ucubic, d->peak); + interpolate(src3p, src1p, src1n, src3n, nullptr, fpath, dmap, dstp, dstWidth, d->ucubic, d->peak); } queue.enqueue_unmap_buffer(_ccosts, ccosts - d->mdisVector); @@ -105,7 +105,7 @@ void filterCL_sse2(const VSFrameRef * src, const VSFrameRef * scp, VSFrameRef * const T * scpp = nullptr; if (d->sclip) scpp = reinterpret_cast(vsapi->getReadPtr(scp, plane)) + dstStride * field_n; - T * dstp = _dstp + dstStride * field_n;; + T * dstp = _dstp + dstStride * field_n; vCheck(srcp, scpp, dstp, _dmap, tline, field_n, dstWidth, srcHeight, srcStride, dstStride, d->vcheck, d->vthresh2, d->rcpVthresh0, d->rcpVthresh1, d->rcpVthresh2, d->peak); } @@ -113,7 +113,7 @@ void filterCL_sse2(const VSFrameRef * src, const VSFrameRef * scp, VSFrameRef * } } -template void filterCL_sse2(const VSFrameRef *, const VSFrameRef *, VSFrameRef *, VSFrameRef **, const int, const EEDI3CLData *, const VSAPI *); -template void 
filterCL_sse2(const VSFrameRef *, const VSFrameRef *, VSFrameRef *, VSFrameRef **, const int, const EEDI3CLData *, const VSAPI *); -template void filterCL_sse2(const VSFrameRef *, const VSFrameRef *, VSFrameRef *, VSFrameRef **, const int, const EEDI3CLData *, const VSAPI *); +template void filterCL_sse2(const VSFrameRef *, const VSFrameRef *, VSFrameRef *, VSFrameRef **, const int, const EEDI3CLData * const VS_RESTRICT, const VSAPI *); +template void filterCL_sse2(const VSFrameRef *, const VSFrameRef *, VSFrameRef *, VSFrameRef **, const int, const EEDI3CLData * const VS_RESTRICT, const VSAPI *); +template void filterCL_sse2(const VSFrameRef *, const VSFrameRef *, VSFrameRef *, VSFrameRef **, const int, const EEDI3CLData * const VS_RESTRICT, const VSAPI *); #endif diff --git a/EEDI3/EEDI3_AVX.cpp b/EEDI3/EEDI3_AVX.cpp index 9f894a7..f14c687 100644 --- a/EEDI3/EEDI3_AVX.cpp +++ b/EEDI3/EEDI3_AVX.cpp @@ -5,7 +5,7 @@ #include "EEDI3.hpp" -static inline void calculateConnectionCosts(const void * srcp, float * ccosts, const int width, const int stride, const EEDI3Data * d) noexcept { +static inline void calculateConnectionCosts(const void * srcp, const bool * bmask, float * ccosts, const int width, const int stride, const EEDI3Data * const VS_RESTRICT d) noexcept { const __m256 * src3p = reinterpret_cast(srcp) + 12; const __m256 * src1p = src3p + stride; const __m256 * src1n = src1p + stride; @@ -13,66 +13,68 @@ static inline void calculateConnectionCosts(const void * srcp, float * ccosts, c if (d->cost3) { for (int x = 0; x < width; x++) { - const int umax = std::min({ x, width - 1 - x, d->mdis }); + if (!bmask || bmask[x]) { + const int umax = std::min({ x, width - 1 - x, d->mdis }); + for (int u = -umax; u <= umax; u++) { + const int u2 = u * 2; + const bool s1Flag = (u >= 0 && x >= u2) || (u <= 0 && x < width + u2); + const bool s2Flag = (u <= 0 && x >= -u2) || (u >= 0 && x < width - u2); + Vec8f s0 = zero_8f(), s1 = zero_8f(), s2 = zero_8f(); - for (int u = -umax; 
u <= umax; u++) { - const int u2 = u * 2; - const bool s1Flag = (u >= 0 && x >= u2) || (u <= 0 && x < width + u2); - const bool s2Flag = (u <= 0 && x >= -u2) || (u >= 0 && x < width - u2); - - Vec8f s0 = zero_8f(), s1 = zero_8f(), s2 = zero_8f(); - - for (int k = -(d->nrad); k <= d->nrad; k++) - s0 += abs(Vec8f().load_a(src3p + x + u + k) - Vec8f().load_a(src1p + x - u + k)) + - abs(Vec8f().load_a(src1p + x + u + k) - Vec8f().load_a(src1n + x - u + k)) + - abs(Vec8f().load_a(src1n + x + u + k) - Vec8f().load_a(src3n + x - u + k)); - - if (s1Flag) { for (int k = -(d->nrad); k <= d->nrad; k++) - s1 += abs(Vec8f().load_a(src3p + x + k) - Vec8f().load_a(src1p + x - u2 + k)) + - abs(Vec8f().load_a(src1p + x + k) - Vec8f().load_a(src1n + x - u2 + k)) + - abs(Vec8f().load_a(src1n + x + k) - Vec8f().load_a(src3n + x - u2 + k)); - } + s0 += abs(Vec8f().load_a(src3p + x + u + k) - Vec8f().load_a(src1p + x - u + k)) + + abs(Vec8f().load_a(src1p + x + u + k) - Vec8f().load_a(src1n + x - u + k)) + + abs(Vec8f().load_a(src1n + x + u + k) - Vec8f().load_a(src3n + x - u + k)); + + if (s1Flag) { + for (int k = -(d->nrad); k <= d->nrad; k++) + s1 += abs(Vec8f().load_a(src3p + x + k) - Vec8f().load_a(src1p + x - u2 + k)) + + abs(Vec8f().load_a(src1p + x + k) - Vec8f().load_a(src1n + x - u2 + k)) + + abs(Vec8f().load_a(src1n + x + k) - Vec8f().load_a(src3n + x - u2 + k)); + } - if (s2Flag) { - for (int k = -(d->nrad); k <= d->nrad; k++) - s2 += abs(Vec8f().load_a(src3p + x + u2 + k) - Vec8f().load_a(src1p + x + k)) + - abs(Vec8f().load_a(src1p + x + u2 + k) - Vec8f().load_a(src1n + x + k)) + - abs(Vec8f().load_a(src1n + x + u2 + k) - Vec8f().load_a(src3n + x + k)); - } + if (s2Flag) { + for (int k = -(d->nrad); k <= d->nrad; k++) + s2 += abs(Vec8f().load_a(src3p + x + u2 + k) - Vec8f().load_a(src1p + x + k)) + + abs(Vec8f().load_a(src1p + x + u2 + k) - Vec8f().load_a(src1n + x + k)) + + abs(Vec8f().load_a(src1n + x + u2 + k) - Vec8f().load_a(src3n + x + k)); + } - s1 = s1Flag ? 
s1 : (s2Flag ? s2 : s0); - s2 = s2Flag ? s2 : (s1Flag ? s1 : s0); + s1 = s1Flag ? s1 : (s2Flag ? s2 : s0); + s2 = s2Flag ? s2 : (s1Flag ? s1 : s0); - const Vec8f ip = (Vec8f().load_a(src1p + x + u) + Vec8f().load_a(src1n + x - u)) * 0.5f; // should use cubic if ucubic=true - const Vec8f v = abs(Vec8f().load_a(src1p + x) - ip) + abs(Vec8f().load_a(src1n + x) - ip); - const Vec8f result = mul_add(d->alpha, s0 + s1 + s2, d->beta * std::abs(u)) + d->remainingWeight * v; - result.stream(ccosts + (d->tpitch * x + u) * d->vectorSize); + const Vec8f ip = (Vec8f().load_a(src1p + x + u) + Vec8f().load_a(src1n + x - u)) * 0.5f; // should use cubic if ucubic=true + const Vec8f v = abs(Vec8f().load_a(src1p + x) - ip) + abs(Vec8f().load_a(src1n + x) - ip); + const Vec8f result = mul_add(d->alpha, s0 + s1 + s2, d->beta * std::abs(u)) + d->remainingWeight * v; + result.stream(ccosts + (d->tpitch * x + u) * d->vectorSize); + } } } } else { for (int x = 0; x < width; x++) { - const int umax = std::min({ x, width - 1 - x, d->mdis }); - - for (int u = -umax; u <= umax; u++) { - Vec8f s = zero_8f(); + if (!bmask || bmask[x]) { + const int umax = std::min({ x, width - 1 - x, d->mdis }); + for (int u = -umax; u <= umax; u++) { + Vec8f s = zero_8f(); - for (int k = -(d->nrad); k <= d->nrad; k++) - s += abs(Vec8f().load_a(src3p + x + u + k) - Vec8f().load_a(src1p + x - u + k)) + - abs(Vec8f().load_a(src1p + x + u + k) - Vec8f().load_a(src1n + x - u + k)) + - abs(Vec8f().load_a(src1n + x + u + k) - Vec8f().load_a(src3n + x - u + k)); - - const Vec8f ip = (Vec8f().load_a(src1p + x + u) + Vec8f().load_a(src1n + x - u)) * 0.5f; // should use cubic if ucubic=true - const Vec8f v = abs(Vec8f().load_a(src1p + x) - ip) + abs(Vec8f().load_a(src1n + x) - ip); - const Vec8f result = mul_add(d->alpha, s, d->beta * std::abs(u)) + d->remainingWeight * v; - result.stream(ccosts + (d->tpitch * x + u) * d->vectorSize); + for (int k = -(d->nrad); k <= d->nrad; k++) + s += abs(Vec8f().load_a(src3p + x + u + 
k) - Vec8f().load_a(src1p + x - u + k)) + + abs(Vec8f().load_a(src1p + x + u + k) - Vec8f().load_a(src1n + x - u + k)) + + abs(Vec8f().load_a(src1n + x + u + k) - Vec8f().load_a(src3n + x - u + k)); + + const Vec8f ip = (Vec8f().load_a(src1p + x + u) + Vec8f().load_a(src1n + x - u)) * 0.5f; // should use cubic if ucubic=true + const Vec8f v = abs(Vec8f().load_a(src1p + x) - ip) + abs(Vec8f().load_a(src1n + x) - ip); + const Vec8f result = mul_add(d->alpha, s, d->beta * std::abs(u)) + d->remainingWeight * v; + result.stream(ccosts + (d->tpitch * x + u) * d->vectorSize); + } } } } } template -void filter_avx(const VSFrameRef * src, const VSFrameRef * scp, VSFrameRef * dst, VSFrameRef ** pad, const int field_n, const EEDI3Data * d, const VSAPI * vsapi) noexcept { +void filter_avx(const VSFrameRef * src, const VSFrameRef * scp, const VSFrameRef * mclip, VSFrameRef * mcp, VSFrameRef * dst, VSFrameRef ** pad, + const int field_n, const EEDI3Data * const VS_RESTRICT d, const VSAPI * vsapi) noexcept { for (int plane = 0; plane < d->vi.format->numPlanes; plane++) { if (d->process[plane]) { copyPad(src, pad[plane], plane, 1 - field_n, d->dh, vsapi); @@ -86,14 +88,22 @@ void filter_avx(const VSFrameRef * src, const VSFrameRef * scp, VSFrameRef * dst const T1 * _srcp = reinterpret_cast(vsapi->getReadPtr(pad[plane], 0)) + 12; T1 * VS_RESTRICT _dstp = reinterpret_cast(vsapi->getWritePtr(dst, plane)); + const uint8_t * maskp = nullptr; + if (d->mclip) { + copyMask(mclip, mcp, plane, field_n, d->dh, vsapi); + maskp = vsapi->getReadPtr(mcp, plane); + } + const auto threadId = std::this_thread::get_id(); T2 * srcVector = reinterpret_cast(d->srcVector.at(threadId)); + uint8_t * _mskVector = d->mskVector.at(threadId); + bool * bmask = d->bmask.at(threadId); float * ccosts = d->ccosts.at(threadId) + d->mdisVector; float * pcosts = d->pcosts.at(threadId) + d->mdisVector; int * _pbackt = d->pbackt.at(threadId) + d->mdisVector; int * fpath = d->fpath.at(threadId); int * _dmap = 
d->dmap.at(threadId); - float * tline = d->tline.at(threadId); + int * tline = d->tline.at(threadId); vs_bitblt(_dstp + dstStride * (1 - field_n), vsapi->getStride(dst, plane) * 2, _srcp + srcStride * (4 + 1 - field_n), vsapi->getStride(pad[plane], 0) * 2, @@ -105,9 +115,34 @@ void filter_avx(const VSFrameRef * src, const VSFrameRef * scp, VSFrameRef * dst for (int y = field_n; y < dstHeight; y += 2 * d->vectorSize) { const int off = (y - field_n) >> 1; - reorder(_srcp + srcStride * (1 - field_n), srcVector, dstWidth, (dstHeight + field_n) >> 1, srcStride * 2, srcWidth, off + field_n, d->vectorSize); + prepareLines(_srcp + srcStride * (1 - field_n), srcVector, dstWidth, (dstHeight + field_n) >> 1, srcStride * 2, srcWidth, off + field_n, d->vectorSize); - calculateConnectionCosts(srcVector, ccosts, dstWidth, srcWidth, d); + if (bmask) { + prepareMask(maskp, _mskVector, dstWidth, (dstHeight + field_n) >> 1, vsapi->getStride(mcp, plane), off, d->vectorSize); + + const int64_t * mskVector = reinterpret_cast(_mskVector); + const int mdis = std::min(dstWidth, d->mdis); + int last = -666999; + + for (int x = 0; x < mdis; x++) { + if (mskVector[x]) + last = x + mdis; + } + + for (int x = 0; x < dstWidth - mdis; x++) { + if (mskVector[x + mdis]) + last = x + mdis * 2; + + bmask[x] = (x <= last); + } + + for (int x = dstWidth - mdis; x < dstWidth; x++) + bmask[x] = (x <= last); + + memset(ccosts - d->mdisVector, 0, dstWidth * d->tpitchVector * sizeof(float)); + } + + calculateConnectionCosts(srcVector, bmask, ccosts, dstWidth, srcWidth, d); // calculate path costs Vec8f().load_a(ccosts).store_a(pcosts); @@ -117,23 +152,38 @@ void filter_avx(const VSFrameRef * src, const VSFrameRef * scp, VSFrameRef * dst float * pT = pcosts + d->tpitchVector * x; int * piT = _pbackt + d->tpitchVector * (x - 1); - const int umax = std::min({ x, dstWidth - 1 - x, d->mdis }); - const int umax2 = std::min({ x - 1, dstWidth - x, d->mdis }); - - for (int u = -umax; u <= umax; u++) { - Vec8i idx = 
zero_256b(); - Vec8f bval = FLT_MAX; - - for (int v = std::max(-umax2, u - 1); v <= std::min(umax2, u + 1); v++) { - const Vec8f z = Vec8f().load_a(ppT + v * d->vectorSize) + d->gamma * std::abs(u - v); - const Vec8f ccost = min(z, FLT_MAX * 0.9f); - idx = select(Vec8ib(ccost < bval), v, idx); - bval = min(ccost, bval); + if (bmask && !bmask[x]) { + if (x == 1) { + const int umax = std::min({ x, dstWidth - 1 - x, d->mdis }); + memcpy(pT - umax * d->vectorSize, tT - umax * d->vectorSize, (umax * 2 + 1) * d->vectorSize * sizeof(float)); + memset(piT - d->mdisVector, 0, d->tpitchVector * sizeof(int)); + } else { + memcpy(pT - d->mdisVector, ppT - d->mdisVector, d->tpitchVector * sizeof(float)); + memcpy(piT - d->mdisVector, piT - d->mdisVector - d->tpitchVector, d->tpitchVector * sizeof(int)); + const int pumax = std::min(x - 1, dstWidth - x); + if (pumax < d->mdis) { + Vec8i(1 - pumax).stream(piT - pumax * d->vectorSize); + Vec8i(pumax - 1).stream(piT + pumax * d->vectorSize); + } + } + } else { + const int umax = std::min({ x, dstWidth - 1 - x, d->mdis }); + const int umax2 = std::min({ x - 1, dstWidth - x, d->mdis }); + for (int u = -umax; u <= umax; u++) { + Vec8i idx = zero_256b(); + Vec8f bval = FLT_MAX; + + for (int v = std::max(-umax2, u - 1); v <= std::min(umax2, u + 1); v++) { + const Vec8f z = Vec8f().load_a(ppT + v * d->vectorSize) + d->gamma * std::abs(u - v); + const Vec8f ccost = min(z, FLT_MAX * 0.9f); + idx = select(Vec8ib(ccost < bval), v, idx); + bval = min(ccost, bval); + } + + const Vec8f z = bval + Vec8f().load_a(tT + u * d->vectorSize); + min(z, FLT_MAX * 0.9f).store_a(pT + u * d->vectorSize); + idx.stream(piT + u * d->vectorSize); } - - const Vec8f z = bval + Vec8f().load_a(tT + u * d->vectorSize); - min(z, FLT_MAX * 0.9f).store_a(pT + u * d->vectorSize); - idx.stream(piT + u * d->vectorSize); } } @@ -158,7 +208,7 @@ void filter_avx(const VSFrameRef * src, const VSFrameRef * scp, VSFrameRef * dst for (int x = dstWidth - 2; x >= 0; x--) fpath[x] 
= pbackt[(d->tpitch * x + fpath[x + 1]) * d->vectorSize]; - interpolate(src3p, src1p, src1n, src3n, fpath, dmap, dstp, dstWidth, d->ucubic, d->peak); + interpolate(src3p, src1p, src1n, src3n, bmask, fpath, dmap, dstp, dstWidth, d->ucubic, d->peak); } } @@ -175,7 +225,7 @@ void filter_avx(const VSFrameRef * src, const VSFrameRef * scp, VSFrameRef * dst } } -template void filter_avx(const VSFrameRef *, const VSFrameRef *, VSFrameRef *, VSFrameRef **, const int, const EEDI3Data *, const VSAPI *) noexcept; -template void filter_avx(const VSFrameRef *, const VSFrameRef *, VSFrameRef *, VSFrameRef **, const int, const EEDI3Data *, const VSAPI *) noexcept; -template void filter_avx(const VSFrameRef *, const VSFrameRef *, VSFrameRef *, VSFrameRef **, const int, const EEDI3Data *, const VSAPI *) noexcept; +template void filter_avx(const VSFrameRef *, const VSFrameRef *, const VSFrameRef *, VSFrameRef *, VSFrameRef *, VSFrameRef **, const int, const EEDI3Data * const VS_RESTRICT, const VSAPI *) noexcept; +template void filter_avx(const VSFrameRef *, const VSFrameRef *, const VSFrameRef *, VSFrameRef *, VSFrameRef *, VSFrameRef **, const int, const EEDI3Data * const VS_RESTRICT, const VSAPI *) noexcept; +template void filter_avx(const VSFrameRef *, const VSFrameRef *, const VSFrameRef *, VSFrameRef *, VSFrameRef *, VSFrameRef **, const int, const EEDI3Data * const VS_RESTRICT, const VSAPI *) noexcept; #endif diff --git a/EEDI3/EEDI3_AVX512.cpp b/EEDI3/EEDI3_AVX512.cpp index 8eb3507..07b93f6 100644 --- a/EEDI3/EEDI3_AVX512.cpp +++ b/EEDI3/EEDI3_AVX512.cpp @@ -6,7 +6,7 @@ #include "EEDI3.hpp" template -static inline void calculateConnectionCosts(const void * srcp, float * ccosts, const int width, const int stride, const EEDI3Data * d) noexcept { +static inline void calculateConnectionCosts(const void * srcp, const bool * bmask, float * ccosts, const int width, const int stride, const EEDI3Data * const VS_RESTRICT d) noexcept { const __m512i * src3p = reinterpret_cast(srcp) + 
12; const __m512i * src1p = src3p + stride; const __m512i * src1n = src1p + stride; @@ -14,66 +14,67 @@ static inline void calculateConnectionCosts(const void * srcp, float * ccosts, c if (d->cost3) { for (int x = 0; x < width; x++) { - const int umax = std::min({ x, width - 1 - x, d->mdis }); + if (!bmask || bmask[x]) { + const int umax = std::min({ x, width - 1 - x, d->mdis }); + for (int u = -umax; u <= umax; u++) { + const int u2 = u * 2; + const bool s1Flag = (u >= 0 && x >= u2) || (u <= 0 && x < width + u2); + const bool s2Flag = (u <= 0 && x >= -u2) || (u >= 0 && x < width - u2); + Vec16i s0 = zero_512b(), s1 = zero_512b(), s2 = zero_512b(); - for (int u = -umax; u <= umax; u++) { - const int u2 = u * 2; - const bool s1Flag = (u >= 0 && x >= u2) || (u <= 0 && x < width + u2); - const bool s2Flag = (u <= 0 && x >= -u2) || (u >= 0 && x < width - u2); - - Vec16i s0 = zero_512b(), s1 = zero_512b(), s2 = zero_512b(); - - for (int k = -(d->nrad); k <= d->nrad; k++) - s0 += abs(Vec16i().load_a(src3p + x + u + k) - Vec16i().load_a(src1p + x - u + k)) + - abs(Vec16i().load_a(src1p + x + u + k) - Vec16i().load_a(src1n + x - u + k)) + - abs(Vec16i().load_a(src1n + x + u + k) - Vec16i().load_a(src3n + x - u + k)); - - if (s1Flag) { for (int k = -(d->nrad); k <= d->nrad; k++) - s1 += abs(Vec16i().load_a(src3p + x + k) - Vec16i().load_a(src1p + x - u2 + k)) + - abs(Vec16i().load_a(src1p + x + k) - Vec16i().load_a(src1n + x - u2 + k)) + - abs(Vec16i().load_a(src1n + x + k) - Vec16i().load_a(src3n + x - u2 + k)); - } + s0 += abs(Vec16i().load_a(src3p + x + u + k) - Vec16i().load_a(src1p + x - u + k)) + + abs(Vec16i().load_a(src1p + x + u + k) - Vec16i().load_a(src1n + x - u + k)) + + abs(Vec16i().load_a(src1n + x + u + k) - Vec16i().load_a(src3n + x - u + k)); + + if (s1Flag) { + for (int k = -(d->nrad); k <= d->nrad; k++) + s1 += abs(Vec16i().load_a(src3p + x + k) - Vec16i().load_a(src1p + x - u2 + k)) + + abs(Vec16i().load_a(src1p + x + k) - Vec16i().load_a(src1n + x - u2 
+ k)) + + abs(Vec16i().load_a(src1n + x + k) - Vec16i().load_a(src3n + x - u2 + k)); + } - if (s2Flag) { - for (int k = -(d->nrad); k <= d->nrad; k++) - s2 += abs(Vec16i().load_a(src3p + x + u2 + k) - Vec16i().load_a(src1p + x + k)) + - abs(Vec16i().load_a(src1p + x + u2 + k) - Vec16i().load_a(src1n + x + k)) + - abs(Vec16i().load_a(src1n + x + u2 + k) - Vec16i().load_a(src3n + x + k)); - } + if (s2Flag) { + for (int k = -(d->nrad); k <= d->nrad; k++) + s2 += abs(Vec16i().load_a(src3p + x + u2 + k) - Vec16i().load_a(src1p + x + k)) + + abs(Vec16i().load_a(src1p + x + u2 + k) - Vec16i().load_a(src1n + x + k)) + + abs(Vec16i().load_a(src1n + x + u2 + k) - Vec16i().load_a(src3n + x + k)); + } - s1 = s1Flag ? s1 : (s2Flag ? s2 : s0); - s2 = s2Flag ? s2 : (s1Flag ? s1 : s0); + s1 = s1Flag ? s1 : (s2Flag ? s2 : s0); + s2 = s2Flag ? s2 : (s1Flag ? s1 : s0); - const Vec16i ip = (Vec16i().load_a(src1p + x + u) + Vec16i().load_a(src1n + x - u) + 1) >> 1; // should use cubic if ucubic=true - const Vec16f v = to_float(abs(Vec16i().load_a(src1p + x) - ip) + abs(Vec16i().load_a(src1n + x) - ip)); - const Vec16f result = mul_add(d->alpha, to_float(s0 + s1 + s2), d->beta * std::abs(u)) + d->remainingWeight * v; - result.stream(ccosts + (d->tpitch * x + u) * d->vectorSize); + const Vec16i ip = (Vec16i().load_a(src1p + x + u) + Vec16i().load_a(src1n + x - u) + 1) >> 1; // should use cubic if ucubic=true + const Vec16f v = to_float(abs(Vec16i().load_a(src1p + x) - ip) + abs(Vec16i().load_a(src1n + x) - ip)); + const Vec16f result = mul_add(d->alpha, to_float(s0 + s1 + s2), d->beta * std::abs(u)) + d->remainingWeight * v; + result.stream(ccosts + (d->tpitch * x + u) * d->vectorSize); + } } } } else { for (int x = 0; x < width; x++) { - const int umax = std::min({ x, width - 1 - x, d->mdis }); - - for (int u = -umax; u <= umax; u++) { - Vec16i s = zero_512b(); - - for (int k = -(d->nrad); k <= d->nrad; k++) - s += abs(Vec16i().load_a(src3p + x + u + k) - Vec16i().load_a(src1p + x - u + 
k)) + - abs(Vec16i().load_a(src1p + x + u + k) - Vec16i().load_a(src1n + x - u + k)) + - abs(Vec16i().load_a(src1n + x + u + k) - Vec16i().load_a(src3n + x - u + k)); + if (!bmask || bmask[x]) { + const int umax = std::min({ x, width - 1 - x, d->mdis }); + for (int u = -umax; u <= umax; u++) { + Vec16i s = zero_512b(); - const Vec16i ip = (Vec16i().load_a(src1p + x + u) + Vec16i().load_a(src1n + x - u) + 1) >> 1; // should use cubic if ucubic=true - const Vec16f v = to_float(abs(Vec16i().load_a(src1p + x) - ip) + abs(Vec16i().load_a(src1n + x) - ip)); - const Vec16f result = mul_add(d->alpha, to_float(s), d->beta * std::abs(u)) + d->remainingWeight * v; - result.stream(ccosts + (d->tpitch * x + u) * d->vectorSize); + for (int k = -(d->nrad); k <= d->nrad; k++) + s += abs(Vec16i().load_a(src3p + x + u + k) - Vec16i().load_a(src1p + x - u + k)) + + abs(Vec16i().load_a(src1p + x + u + k) - Vec16i().load_a(src1n + x - u + k)) + + abs(Vec16i().load_a(src1n + x + u + k) - Vec16i().load_a(src3n + x - u + k)); + + const Vec16i ip = (Vec16i().load_a(src1p + x + u) + Vec16i().load_a(src1n + x - u) + 1) >> 1; // should use cubic if ucubic=true + const Vec16f v = to_float(abs(Vec16i().load_a(src1p + x) - ip) + abs(Vec16i().load_a(src1n + x) - ip)); + const Vec16f result = mul_add(d->alpha, to_float(s), d->beta * std::abs(u)) + d->remainingWeight * v; + result.stream(ccosts + (d->tpitch * x + u) * d->vectorSize); + } } } } } template<> -inline void calculateConnectionCosts(const void * srcp, float * ccosts, const int width, const int stride, const EEDI3Data * d) noexcept { +inline void calculateConnectionCosts(const void * srcp, const bool * bmask, float * ccosts, const int width, const int stride, const EEDI3Data * const VS_RESTRICT d) noexcept { const __m512 * src3p = reinterpret_cast(srcp) + 12; const __m512 * src1p = src3p + stride; const __m512 * src1n = src1p + stride; @@ -81,66 +82,68 @@ inline void calculateConnectionCosts(const void * srcp, float * ccosts, c if 
(d->cost3) { for (int x = 0; x < width; x++) { - const int umax = std::min({ x, width - 1 - x, d->mdis }); - - for (int u = -umax; u <= umax; u++) { - const int u2 = u * 2; - const bool s1Flag = (u >= 0 && x >= u2) || (u <= 0 && x < width + u2); - const bool s2Flag = (u <= 0 && x >= -u2) || (u >= 0 && x < width - u2); - - Vec16f s0 = zero_16f(), s1 = zero_16f(), s2 = zero_16f(); - - for (int k = -(d->nrad); k <= d->nrad; k++) - s0 += abs(Vec16f().load_a(src3p + x + u + k) - Vec16f().load_a(src1p + x - u + k)) + - abs(Vec16f().load_a(src1p + x + u + k) - Vec16f().load_a(src1n + x - u + k)) + - abs(Vec16f().load_a(src1n + x + u + k) - Vec16f().load_a(src3n + x - u + k)); + if (!bmask || bmask[x]) { + const int umax = std::min({ x, width - 1 - x, d->mdis }); + for (int u = -umax; u <= umax; u++) { + const int u2 = u * 2; + const bool s1Flag = (u >= 0 && x >= u2) || (u <= 0 && x < width + u2); + const bool s2Flag = (u <= 0 && x >= -u2) || (u >= 0 && x < width - u2); + Vec16f s0 = zero_16f(), s1 = zero_16f(), s2 = zero_16f(); - if (s1Flag) { for (int k = -(d->nrad); k <= d->nrad; k++) - s1 += abs(Vec16f().load_a(src3p + x + k) - Vec16f().load_a(src1p + x - u2 + k)) + - abs(Vec16f().load_a(src1p + x + k) - Vec16f().load_a(src1n + x - u2 + k)) + - abs(Vec16f().load_a(src1n + x + k) - Vec16f().load_a(src3n + x - u2 + k)); - } + s0 += abs(Vec16f().load_a(src3p + x + u + k) - Vec16f().load_a(src1p + x - u + k)) + + abs(Vec16f().load_a(src1p + x + u + k) - Vec16f().load_a(src1n + x - u + k)) + + abs(Vec16f().load_a(src1n + x + u + k) - Vec16f().load_a(src3n + x - u + k)); + + if (s1Flag) { + for (int k = -(d->nrad); k <= d->nrad; k++) + s1 += abs(Vec16f().load_a(src3p + x + k) - Vec16f().load_a(src1p + x - u2 + k)) + + abs(Vec16f().load_a(src1p + x + k) - Vec16f().load_a(src1n + x - u2 + k)) + + abs(Vec16f().load_a(src1n + x + k) - Vec16f().load_a(src3n + x - u2 + k)); + } - if (s2Flag) { - for (int k = -(d->nrad); k <= d->nrad; k++) - s2 += abs(Vec16f().load_a(src3p + x + u2 
+ k) - Vec16f().load_a(src1p + x + k)) + - abs(Vec16f().load_a(src1p + x + u2 + k) - Vec16f().load_a(src1n + x + k)) + - abs(Vec16f().load_a(src1n + x + u2 + k) - Vec16f().load_a(src3n + x + k)); - } + if (s2Flag) { + for (int k = -(d->nrad); k <= d->nrad; k++) + s2 += abs(Vec16f().load_a(src3p + x + u2 + k) - Vec16f().load_a(src1p + x + k)) + + abs(Vec16f().load_a(src1p + x + u2 + k) - Vec16f().load_a(src1n + x + k)) + + abs(Vec16f().load_a(src1n + x + u2 + k) - Vec16f().load_a(src3n + x + k)); + } - s1 = s1Flag ? s1 : (s2Flag ? s2 : s0); - s2 = s2Flag ? s2 : (s1Flag ? s1 : s0); + s1 = s1Flag ? s1 : (s2Flag ? s2 : s0); + s2 = s2Flag ? s2 : (s1Flag ? s1 : s0); - const Vec16f ip = (Vec16f().load_a(src1p + x + u) + Vec16f().load_a(src1n + x - u)) * 0.5f; // should use cubic if ucubic=true - const Vec16f v = abs(Vec16f().load_a(src1p + x) - ip) + abs(Vec16f().load_a(src1n + x) - ip); - const Vec16f result = mul_add(d->alpha, s0 + s1 + s2, d->beta * std::abs(u)) + d->remainingWeight * v; - result.stream(ccosts + (d->tpitch * x + u) * d->vectorSize); + const Vec16f ip = (Vec16f().load_a(src1p + x + u) + Vec16f().load_a(src1n + x - u)) * 0.5f; // should use cubic if ucubic=true + const Vec16f v = abs(Vec16f().load_a(src1p + x) - ip) + abs(Vec16f().load_a(src1n + x) - ip); + const Vec16f result = mul_add(d->alpha, s0 + s1 + s2, d->beta * std::abs(u)) + d->remainingWeight * v; + result.stream(ccosts + (d->tpitch * x + u) * d->vectorSize); + } } } } else { for (int x = 0; x < width; x++) { - const int umax = std::min({ x, width - 1 - x, d->mdis }); - - for (int u = -umax; u <= umax; u++) { - Vec16f s = zero_16f(); - - for (int k = -(d->nrad); k <= d->nrad; k++) - s += abs(Vec16f().load_a(src3p + x + u + k) - Vec16f().load_a(src1p + x - u + k)) + - abs(Vec16f().load_a(src1p + x + u + k) - Vec16f().load_a(src1n + x - u + k)) + - abs(Vec16f().load_a(src1n + x + u + k) - Vec16f().load_a(src3n + x - u + k)); + if (!bmask || bmask[x]) { + const int umax = std::min({ x, width - 1 
- x, d->mdis }); + for (int u = -umax; u <= umax; u++) { + Vec16f s = zero_16f(); - const Vec16f ip = (Vec16f().load_a(src1p + x + u) + Vec16f().load_a(src1n + x - u)) * 0.5f; // should use cubic if ucubic=true - const Vec16f v = abs(Vec16f().load_a(src1p + x) - ip) + abs(Vec16f().load_a(src1n + x) - ip); - const Vec16f result = mul_add(d->alpha, s, d->beta * std::abs(u)) + d->remainingWeight * v; - result.stream(ccosts + (d->tpitch * x + u) * d->vectorSize); + for (int k = -(d->nrad); k <= d->nrad; k++) + s += abs(Vec16f().load_a(src3p + x + u + k) - Vec16f().load_a(src1p + x - u + k)) + + abs(Vec16f().load_a(src1p + x + u + k) - Vec16f().load_a(src1n + x - u + k)) + + abs(Vec16f().load_a(src1n + x + u + k) - Vec16f().load_a(src3n + x - u + k)); + + const Vec16f ip = (Vec16f().load_a(src1p + x + u) + Vec16f().load_a(src1n + x - u)) * 0.5f; // should use cubic if ucubic=true + const Vec16f v = abs(Vec16f().load_a(src1p + x) - ip) + abs(Vec16f().load_a(src1n + x) - ip); + const Vec16f result = mul_add(d->alpha, s, d->beta * std::abs(u)) + d->remainingWeight * v; + result.stream(ccosts + (d->tpitch * x + u) * d->vectorSize); + } } } } } template -void filter_avx512(const VSFrameRef * src, const VSFrameRef * scp, VSFrameRef * dst, VSFrameRef ** pad, const int field_n, const EEDI3Data * d, const VSAPI * vsapi) noexcept { +void filter_avx512(const VSFrameRef * src, const VSFrameRef * scp, const VSFrameRef * mclip, VSFrameRef * mcp, VSFrameRef * dst, VSFrameRef ** pad, + const int field_n, const EEDI3Data * const VS_RESTRICT d, const VSAPI * vsapi) noexcept { for (int plane = 0; plane < d->vi.format->numPlanes; plane++) { if (d->process[plane]) { copyPad(src, pad[plane], plane, 1 - field_n, d->dh, vsapi); @@ -154,14 +157,22 @@ void filter_avx512(const VSFrameRef * src, const VSFrameRef * scp, VSFrameRef * const T1 * _srcp = reinterpret_cast(vsapi->getReadPtr(pad[plane], 0)) + 12; T1 * VS_RESTRICT _dstp = reinterpret_cast(vsapi->getWritePtr(dst, plane)); + const uint8_t * 
maskp = nullptr; + if (d->mclip) { + copyMask(mclip, mcp, plane, field_n, d->dh, vsapi); + maskp = vsapi->getReadPtr(mcp, plane); + } + const auto threadId = std::this_thread::get_id(); T2 * srcVector = reinterpret_cast(d->srcVector.at(threadId)); + uint8_t * _mskVector = d->mskVector.at(threadId); + bool * bmask = d->bmask.at(threadId); float * ccosts = d->ccosts.at(threadId) + d->mdisVector; float * pcosts = d->pcosts.at(threadId) + d->mdisVector; int * _pbackt = d->pbackt.at(threadId) + d->mdisVector; int * fpath = d->fpath.at(threadId); int * _dmap = d->dmap.at(threadId); - float * tline = d->tline.at(threadId); + int * tline = d->tline.at(threadId); vs_bitblt(_dstp + dstStride * (1 - field_n), vsapi->getStride(dst, plane) * 2, _srcp + srcStride * (4 + 1 - field_n), vsapi->getStride(pad[plane], 0) * 2, @@ -173,9 +184,34 @@ void filter_avx512(const VSFrameRef * src, const VSFrameRef * scp, VSFrameRef * for (int y = field_n; y < dstHeight; y += 2 * d->vectorSize) { const int off = (y - field_n) >> 1; - reorder(_srcp + srcStride * (1 - field_n), srcVector, dstWidth, (dstHeight + field_n) >> 1, srcStride * 2, srcWidth, off + field_n, d->vectorSize); + prepareLines(_srcp + srcStride * (1 - field_n), srcVector, dstWidth, (dstHeight + field_n) >> 1, srcStride * 2, srcWidth, off + field_n, d->vectorSize); + + if (bmask) { + prepareMask(maskp, _mskVector, dstWidth, (dstHeight + field_n) >> 1, vsapi->getStride(mcp, plane), off, d->vectorSize); + + const int64_t * mskVector = reinterpret_cast(_mskVector); + const int mdis = std::min(dstWidth, d->mdis); + int last = -666999; + + for (int x = 0; x < mdis; x++) { + if (mskVector[x * 2] || mskVector[x * 2 + 1]) + last = x + mdis; + } - calculateConnectionCosts(srcVector, ccosts, dstWidth, srcWidth, d); + for (int x = 0; x < dstWidth - mdis; x++) { + if (mskVector[(x + mdis) * 2] || mskVector[(x + mdis) * 2 + 1]) + last = x + mdis * 2; + + bmask[x] = (x <= last); + } + + for (int x = dstWidth - mdis; x < dstWidth; x++) + 
bmask[x] = (x <= last); + + memset(ccosts - d->mdisVector, 0, dstWidth * d->tpitchVector * sizeof(float)); + } + + calculateConnectionCosts(srcVector, bmask, ccosts, dstWidth, srcWidth, d); // calculate path costs Vec16f().load_a(ccosts).store_a(pcosts); @@ -185,23 +221,38 @@ void filter_avx512(const VSFrameRef * src, const VSFrameRef * scp, VSFrameRef * float * pT = pcosts + d->tpitchVector * x; int * piT = _pbackt + d->tpitchVector * (x - 1); - const int umax = std::min({ x, dstWidth - 1 - x, d->mdis }); - const int umax2 = std::min({ x - 1, dstWidth - x, d->mdis }); - - for (int u = -umax; u <= umax; u++) { - Vec16i idx = zero_512b(); - Vec16f bval = FLT_MAX; - - for (int v = std::max(-umax2, u - 1); v <= std::min(umax2, u + 1); v++) { - const Vec16f z = Vec16f().load_a(ppT + v * d->vectorSize) + d->gamma * std::abs(u - v); - const Vec16f ccost = min(z, FLT_MAX * 0.9f); - idx = select(Vec16ib(ccost < bval), v, idx); - bval = min(ccost, bval); + if (bmask && !bmask[x]) { + if (x == 1) { + const int umax = std::min({ x, dstWidth - 1 - x, d->mdis }); + memcpy(pT - umax * d->vectorSize, tT - umax * d->vectorSize, (umax * 2 + 1) * d->vectorSize * sizeof(float)); + memset(piT - d->mdisVector, 0, d->tpitchVector * sizeof(int)); + } else { + memcpy(pT - d->mdisVector, ppT - d->mdisVector, d->tpitchVector * sizeof(float)); + memcpy(piT - d->mdisVector, piT - d->mdisVector - d->tpitchVector, d->tpitchVector * sizeof(int)); + const int pumax = std::min(x - 1, dstWidth - x); + if (pumax < d->mdis) { + Vec16i(1 - pumax).stream(piT - pumax * d->vectorSize); + Vec16i(pumax - 1).stream(piT + pumax * d->vectorSize); + } + } + } else { + const int umax = std::min({ x, dstWidth - 1 - x, d->mdis }); + const int umax2 = std::min({ x - 1, dstWidth - x, d->mdis }); + for (int u = -umax; u <= umax; u++) { + Vec16i idx = zero_512b(); + Vec16f bval = FLT_MAX; + + for (int v = std::max(-umax2, u - 1); v <= std::min(umax2, u + 1); v++) { + const Vec16f z = Vec16f().load_a(ppT + v * 
d->vectorSize) + d->gamma * std::abs(u - v); + const Vec16f ccost = min(z, FLT_MAX * 0.9f); + idx = select(Vec16ib(ccost < bval), v, idx); + bval = min(ccost, bval); + } + + const Vec16f z = bval + Vec16f().load_a(tT + u * d->vectorSize); + min(z, FLT_MAX * 0.9f).store_a(pT + u * d->vectorSize); + idx.stream(piT + u * d->vectorSize); } - - const Vec16f z = bval + Vec16f().load_a(tT + u * d->vectorSize); - min(z, FLT_MAX * 0.9f).store_a(pT + u * d->vectorSize); - idx.stream(piT + u * d->vectorSize); } } @@ -226,7 +277,7 @@ void filter_avx512(const VSFrameRef * src, const VSFrameRef * scp, VSFrameRef * for (int x = dstWidth - 2; x >= 0; x--) fpath[x] = pbackt[(d->tpitch * x + fpath[x + 1]) * d->vectorSize]; - interpolate(src3p, src1p, src1n, src3n, fpath, dmap, dstp, dstWidth, d->ucubic, d->peak); + interpolate(src3p, src1p, src1n, src3n, bmask, fpath, dmap, dstp, dstWidth, d->ucubic, d->peak); } } @@ -243,7 +294,7 @@ void filter_avx512(const VSFrameRef * src, const VSFrameRef * scp, VSFrameRef * } } -template void filter_avx512(const VSFrameRef *, const VSFrameRef *, VSFrameRef *, VSFrameRef **, const int, const EEDI3Data *, const VSAPI *) noexcept; -template void filter_avx512(const VSFrameRef *, const VSFrameRef *, VSFrameRef *, VSFrameRef **, const int, const EEDI3Data *, const VSAPI *) noexcept; -template void filter_avx512(const VSFrameRef *, const VSFrameRef *, VSFrameRef *, VSFrameRef **, const int, const EEDI3Data *, const VSAPI *) noexcept; +template void filter_avx512(const VSFrameRef *, const VSFrameRef *, const VSFrameRef *, VSFrameRef *, VSFrameRef *, VSFrameRef **, const int, const EEDI3Data * const VS_RESTRICT, const VSAPI *) noexcept; +template void filter_avx512(const VSFrameRef *, const VSFrameRef *, const VSFrameRef *, VSFrameRef *, VSFrameRef *, VSFrameRef **, const int, const EEDI3Data * const VS_RESTRICT, const VSAPI *) noexcept; +template void filter_avx512(const VSFrameRef *, const VSFrameRef *, const VSFrameRef *, VSFrameRef *, VSFrameRef *, 
VSFrameRef **, const int, const EEDI3Data * const VS_RESTRICT, const VSAPI *) noexcept; #endif diff --git a/EEDI3/EEDI3_SSE2.cpp b/EEDI3/EEDI3_SSE2.cpp index 21d7047..83aebaa 100644 --- a/EEDI3/EEDI3_SSE2.cpp +++ b/EEDI3/EEDI3_SSE2.cpp @@ -2,7 +2,7 @@ #include "EEDI3.hpp" template -static inline void calculateConnectionCosts(const void * srcp, float * ccosts, const int width, const int stride, const EEDI3Data * d) noexcept { +static inline void calculateConnectionCosts(const void * srcp, const bool * bmask, float * ccosts, const int width, const int stride, const EEDI3Data * const VS_RESTRICT d) noexcept { const __m128i * src3p = reinterpret_cast(srcp) + 12; const __m128i * src1p = src3p + stride; const __m128i * src1n = src1p + stride; @@ -10,66 +10,67 @@ static inline void calculateConnectionCosts(const void * srcp, float * ccosts, c if (d->cost3) { for (int x = 0; x < width; x++) { - const int umax = std::min({ x, width - 1 - x, d->mdis }); + if (!bmask || bmask[x]) { + const int umax = std::min({ x, width - 1 - x, d->mdis }); + for (int u = -umax; u <= umax; u++) { + const int u2 = u * 2; + const bool s1Flag = (u >= 0 && x >= u2) || (u <= 0 && x < width + u2); + const bool s2Flag = (u <= 0 && x >= -u2) || (u >= 0 && x < width - u2); + Vec4i s0 = zero_128b(), s1 = zero_128b(), s2 = zero_128b(); - for (int u = -umax; u <= umax; u++) { - const int u2 = u * 2; - const bool s1Flag = (u >= 0 && x >= u2) || (u <= 0 && x < width + u2); - const bool s2Flag = (u <= 0 && x >= -u2) || (u >= 0 && x < width - u2); - - Vec4i s0 = zero_128b(), s1 = zero_128b(), s2 = zero_128b(); - - for (int k = -(d->nrad); k <= d->nrad; k++) - s0 += abs(Vec4i().load_a(src3p + x + u + k) - Vec4i().load_a(src1p + x - u + k)) + - abs(Vec4i().load_a(src1p + x + u + k) - Vec4i().load_a(src1n + x - u + k)) + - abs(Vec4i().load_a(src1n + x + u + k) - Vec4i().load_a(src3n + x - u + k)); - - if (s1Flag) { for (int k = -(d->nrad); k <= d->nrad; k++) - s1 += abs(Vec4i().load_a(src3p + x + k) - 
Vec4i().load_a(src1p + x - u2 + k)) + - abs(Vec4i().load_a(src1p + x + k) - Vec4i().load_a(src1n + x - u2 + k)) + - abs(Vec4i().load_a(src1n + x + k) - Vec4i().load_a(src3n + x - u2 + k)); - } + s0 += abs(Vec4i().load_a(src3p + x + u + k) - Vec4i().load_a(src1p + x - u + k)) + + abs(Vec4i().load_a(src1p + x + u + k) - Vec4i().load_a(src1n + x - u + k)) + + abs(Vec4i().load_a(src1n + x + u + k) - Vec4i().load_a(src3n + x - u + k)); + + if (s1Flag) { + for (int k = -(d->nrad); k <= d->nrad; k++) + s1 += abs(Vec4i().load_a(src3p + x + k) - Vec4i().load_a(src1p + x - u2 + k)) + + abs(Vec4i().load_a(src1p + x + k) - Vec4i().load_a(src1n + x - u2 + k)) + + abs(Vec4i().load_a(src1n + x + k) - Vec4i().load_a(src3n + x - u2 + k)); + } - if (s2Flag) { - for (int k = -(d->nrad); k <= d->nrad; k++) - s2 += abs(Vec4i().load_a(src3p + x + u2 + k) - Vec4i().load_a(src1p + x + k)) + - abs(Vec4i().load_a(src1p + x + u2 + k) - Vec4i().load_a(src1n + x + k)) + - abs(Vec4i().load_a(src1n + x + u2 + k) - Vec4i().load_a(src3n + x + k)); - } + if (s2Flag) { + for (int k = -(d->nrad); k <= d->nrad; k++) + s2 += abs(Vec4i().load_a(src3p + x + u2 + k) - Vec4i().load_a(src1p + x + k)) + + abs(Vec4i().load_a(src1p + x + u2 + k) - Vec4i().load_a(src1n + x + k)) + + abs(Vec4i().load_a(src1n + x + u2 + k) - Vec4i().load_a(src3n + x + k)); + } - s1 = s1Flag ? s1 : (s2Flag ? s2 : s0); - s2 = s2Flag ? s2 : (s1Flag ? s1 : s0); + s1 = s1Flag ? s1 : (s2Flag ? s2 : s0); + s2 = s2Flag ? s2 : (s1Flag ? 
s1 : s0); - const Vec4i ip = (Vec4i().load_a(src1p + x + u) + Vec4i().load_a(src1n + x - u) + 1) >> 1; // should use cubic if ucubic=true - const Vec4f v = to_float(abs(Vec4i().load_a(src1p + x) - ip) + abs(Vec4i().load_a(src1n + x) - ip)); - const Vec4f result = mul_add(d->alpha, to_float(s0 + s1 + s2), d->beta * std::abs(u)) + d->remainingWeight * v; - result.stream(ccosts + (d->tpitch * x + u) * d->vectorSize); + const Vec4i ip = (Vec4i().load_a(src1p + x + u) + Vec4i().load_a(src1n + x - u) + 1) >> 1; // should use cubic if ucubic=true + const Vec4f v = to_float(abs(Vec4i().load_a(src1p + x) - ip) + abs(Vec4i().load_a(src1n + x) - ip)); + const Vec4f result = mul_add(d->alpha, to_float(s0 + s1 + s2), d->beta * std::abs(u)) + d->remainingWeight * v; + result.stream(ccosts + (d->tpitch * x + u) * d->vectorSize); + } } } } else { for (int x = 0; x < width; x++) { - const int umax = std::min({ x, width - 1 - x, d->mdis }); - - for (int u = -umax; u <= umax; u++) { - Vec4i s = zero_128b(); - - for (int k = -(d->nrad); k <= d->nrad; k++) - s += abs(Vec4i().load_a(src3p + x + u + k) - Vec4i().load_a(src1p + x - u + k)) + - abs(Vec4i().load_a(src1p + x + u + k) - Vec4i().load_a(src1n + x - u + k)) + - abs(Vec4i().load_a(src1n + x + u + k) - Vec4i().load_a(src3n + x - u + k)); + if (!bmask || bmask[x]) { + const int umax = std::min({ x, width - 1 - x, d->mdis }); + for (int u = -umax; u <= umax; u++) { + Vec4i s = zero_128b(); - const Vec4i ip = (Vec4i().load_a(src1p + x + u) + Vec4i().load_a(src1n + x - u) + 1) >> 1; // should use cubic if ucubic=true - const Vec4f v = to_float(abs(Vec4i().load_a(src1p + x) - ip) + abs(Vec4i().load_a(src1n + x) - ip)); - const Vec4f result = mul_add(d->alpha, to_float(s), d->beta * std::abs(u)) + d->remainingWeight * v; - result.stream(ccosts + (d->tpitch * x + u) * d->vectorSize); + for (int k = -(d->nrad); k <= d->nrad; k++) + s += abs(Vec4i().load_a(src3p + x + u + k) - Vec4i().load_a(src1p + x - u + k)) + + abs(Vec4i().load_a(src1p 
+ x + u + k) - Vec4i().load_a(src1n + x - u + k)) + + abs(Vec4i().load_a(src1n + x + u + k) - Vec4i().load_a(src3n + x - u + k)); + + const Vec4i ip = (Vec4i().load_a(src1p + x + u) + Vec4i().load_a(src1n + x - u) + 1) >> 1; // should use cubic if ucubic=true + const Vec4f v = to_float(abs(Vec4i().load_a(src1p + x) - ip) + abs(Vec4i().load_a(src1n + x) - ip)); + const Vec4f result = mul_add(d->alpha, to_float(s), d->beta * std::abs(u)) + d->remainingWeight * v; + result.stream(ccosts + (d->tpitch * x + u) * d->vectorSize); + } } } } } template<> -inline void calculateConnectionCosts(const void * srcp, float * ccosts, const int width, const int stride, const EEDI3Data * d) noexcept { +inline void calculateConnectionCosts(const void * srcp, const bool * bmask, float * ccosts, const int width, const int stride, const EEDI3Data * const VS_RESTRICT d) noexcept { const __m128 * src3p = reinterpret_cast(srcp) + 12; const __m128 * src1p = src3p + stride; const __m128 * src1n = src1p + stride; @@ -77,66 +78,68 @@ inline void calculateConnectionCosts(const void * srcp, float * ccosts, c if (d->cost3) { for (int x = 0; x < width; x++) { - const int umax = std::min({ x, width - 1 - x, d->mdis }); - - for (int u = -umax; u <= umax; u++) { - const int u2 = u * 2; - const bool s1Flag = (u >= 0 && x >= u2) || (u <= 0 && x < width + u2); - const bool s2Flag = (u <= 0 && x >= -u2) || (u >= 0 && x < width - u2); - - Vec4f s0 = zero_4f(), s1 = zero_4f(), s2 = zero_4f(); - - for (int k = -(d->nrad); k <= d->nrad; k++) - s0 += abs(Vec4f().load_a(src3p + x + u + k) - Vec4f().load_a(src1p + x - u + k)) + - abs(Vec4f().load_a(src1p + x + u + k) - Vec4f().load_a(src1n + x - u + k)) + - abs(Vec4f().load_a(src1n + x + u + k) - Vec4f().load_a(src3n + x - u + k)); + if (!bmask || bmask[x]) { + const int umax = std::min({ x, width - 1 - x, d->mdis }); + for (int u = -umax; u <= umax; u++) { + const int u2 = u * 2; + const bool s1Flag = (u >= 0 && x >= u2) || (u <= 0 && x < width + u2); + const 
bool s2Flag = (u <= 0 && x >= -u2) || (u >= 0 && x < width - u2); + Vec4f s0 = zero_4f(), s1 = zero_4f(), s2 = zero_4f(); - if (s1Flag) { for (int k = -(d->nrad); k <= d->nrad; k++) - s1 += abs(Vec4f().load_a(src3p + x + k) - Vec4f().load_a(src1p + x - u2 + k)) + - abs(Vec4f().load_a(src1p + x + k) - Vec4f().load_a(src1n + x - u2 + k)) + - abs(Vec4f().load_a(src1n + x + k) - Vec4f().load_a(src3n + x - u2 + k)); - } + s0 += abs(Vec4f().load_a(src3p + x + u + k) - Vec4f().load_a(src1p + x - u + k)) + + abs(Vec4f().load_a(src1p + x + u + k) - Vec4f().load_a(src1n + x - u + k)) + + abs(Vec4f().load_a(src1n + x + u + k) - Vec4f().load_a(src3n + x - u + k)); + + if (s1Flag) { + for (int k = -(d->nrad); k <= d->nrad; k++) + s1 += abs(Vec4f().load_a(src3p + x + k) - Vec4f().load_a(src1p + x - u2 + k)) + + abs(Vec4f().load_a(src1p + x + k) - Vec4f().load_a(src1n + x - u2 + k)) + + abs(Vec4f().load_a(src1n + x + k) - Vec4f().load_a(src3n + x - u2 + k)); + } - if (s2Flag) { - for (int k = -(d->nrad); k <= d->nrad; k++) - s2 += abs(Vec4f().load_a(src3p + x + u2 + k) - Vec4f().load_a(src1p + x + k)) + - abs(Vec4f().load_a(src1p + x + u2 + k) - Vec4f().load_a(src1n + x + k)) + - abs(Vec4f().load_a(src1n + x + u2 + k) - Vec4f().load_a(src3n + x + k)); - } + if (s2Flag) { + for (int k = -(d->nrad); k <= d->nrad; k++) + s2 += abs(Vec4f().load_a(src3p + x + u2 + k) - Vec4f().load_a(src1p + x + k)) + + abs(Vec4f().load_a(src1p + x + u2 + k) - Vec4f().load_a(src1n + x + k)) + + abs(Vec4f().load_a(src1n + x + u2 + k) - Vec4f().load_a(src3n + x + k)); + } - s1 = s1Flag ? s1 : (s2Flag ? s2 : s0); - s2 = s2Flag ? s2 : (s1Flag ? s1 : s0); + s1 = s1Flag ? s1 : (s2Flag ? s2 : s0); + s2 = s2Flag ? s2 : (s1Flag ? 
s1 : s0); - const Vec4f ip = (Vec4f().load_a(src1p + x + u) + Vec4f().load_a(src1n + x - u)) * 0.5f; // should use cubic if ucubic=true - const Vec4f v = abs(Vec4f().load_a(src1p + x) - ip) + abs(Vec4f().load_a(src1n + x) - ip); - const Vec4f result = mul_add(d->alpha, s0 + s1 + s2, d->beta * std::abs(u)) + d->remainingWeight * v; - result.stream(ccosts + (d->tpitch * x + u) * d->vectorSize); + const Vec4f ip = (Vec4f().load_a(src1p + x + u) + Vec4f().load_a(src1n + x - u)) * 0.5f; // should use cubic if ucubic=true + const Vec4f v = abs(Vec4f().load_a(src1p + x) - ip) + abs(Vec4f().load_a(src1n + x) - ip); + const Vec4f result = mul_add(d->alpha, s0 + s1 + s2, d->beta * std::abs(u)) + d->remainingWeight * v; + result.stream(ccosts + (d->tpitch * x + u) * d->vectorSize); + } } } } else { for (int x = 0; x < width; x++) { - const int umax = std::min({ x, width - 1 - x, d->mdis }); - - for (int u = -umax; u <= umax; u++) { - Vec4f s = zero_4f(); - - for (int k = -(d->nrad); k <= d->nrad; k++) - s += abs(Vec4f().load_a(src3p + x + u + k) - Vec4f().load_a(src1p + x - u + k)) + - abs(Vec4f().load_a(src1p + x + u + k) - Vec4f().load_a(src1n + x - u + k)) + - abs(Vec4f().load_a(src1n + x + u + k) - Vec4f().load_a(src3n + x - u + k)); + if (!bmask || bmask[x]) { + const int umax = std::min({ x, width - 1 - x, d->mdis }); + for (int u = -umax; u <= umax; u++) { + Vec4f s = zero_4f(); - const Vec4f ip = (Vec4f().load_a(src1p + x + u) + Vec4f().load_a(src1n + x - u)) * 0.5f; // should use cubic if ucubic=true - const Vec4f v = abs(Vec4f().load_a(src1p + x) - ip) + abs(Vec4f().load_a(src1n + x) - ip); - const Vec4f result = mul_add(d->alpha, s, d->beta * std::abs(u)) + d->remainingWeight * v; - result.stream(ccosts + (d->tpitch * x + u) * d->vectorSize); + for (int k = -(d->nrad); k <= d->nrad; k++) + s += abs(Vec4f().load_a(src3p + x + u + k) - Vec4f().load_a(src1p + x - u + k)) + + abs(Vec4f().load_a(src1p + x + u + k) - Vec4f().load_a(src1n + x - u + k)) + + 
abs(Vec4f().load_a(src1n + x + u + k) - Vec4f().load_a(src3n + x - u + k)); + + const Vec4f ip = (Vec4f().load_a(src1p + x + u) + Vec4f().load_a(src1n + x - u)) * 0.5f; // should use cubic if ucubic=true + const Vec4f v = abs(Vec4f().load_a(src1p + x) - ip) + abs(Vec4f().load_a(src1n + x) - ip); + const Vec4f result = mul_add(d->alpha, s, d->beta * std::abs(u)) + d->remainingWeight * v; + result.stream(ccosts + (d->tpitch * x + u) * d->vectorSize); + } } } } } template -void filter_sse2(const VSFrameRef * src, const VSFrameRef * scp, VSFrameRef * dst, VSFrameRef ** pad, const int field_n, const EEDI3Data * d, const VSAPI * vsapi) noexcept { +void filter_sse2(const VSFrameRef * src, const VSFrameRef * scp, const VSFrameRef * mclip, VSFrameRef * mcp, VSFrameRef * dst, VSFrameRef ** pad, + const int field_n, const EEDI3Data * const VS_RESTRICT d, const VSAPI * vsapi) noexcept { for (int plane = 0; plane < d->vi.format->numPlanes; plane++) { if (d->process[plane]) { copyPad(src, pad[plane], plane, 1 - field_n, d->dh, vsapi); @@ -150,14 +153,22 @@ void filter_sse2(const VSFrameRef * src, const VSFrameRef * scp, VSFrameRef * ds const T1 * _srcp = reinterpret_cast(vsapi->getReadPtr(pad[plane], 0)) + 12; T1 * VS_RESTRICT _dstp = reinterpret_cast(vsapi->getWritePtr(dst, plane)); + const uint8_t * maskp = nullptr; + if (d->mclip) { + copyMask(mclip, mcp, plane, field_n, d->dh, vsapi); + maskp = vsapi->getReadPtr(mcp, plane); + } + const auto threadId = std::this_thread::get_id(); T2 * srcVector = reinterpret_cast(d->srcVector.at(threadId)); + uint8_t * _mskVector = d->mskVector.at(threadId); + bool * bmask = d->bmask.at(threadId); float * ccosts = d->ccosts.at(threadId) + d->mdisVector; float * pcosts = d->pcosts.at(threadId) + d->mdisVector; int * _pbackt = d->pbackt.at(threadId) + d->mdisVector; int * fpath = d->fpath.at(threadId); int * _dmap = d->dmap.at(threadId); - float * tline = d->tline.at(threadId); + int * tline = d->tline.at(threadId); vs_bitblt(_dstp + dstStride 
* (1 - field_n), vsapi->getStride(dst, plane) * 2, _srcp + srcStride * (4 + 1 - field_n), vsapi->getStride(pad[plane], 0) * 2, @@ -169,9 +180,34 @@ void filter_sse2(const VSFrameRef * src, const VSFrameRef * scp, VSFrameRef * ds for (int y = field_n; y < dstHeight; y += 2 * d->vectorSize) { const int off = (y - field_n) >> 1; - reorder(_srcp + srcStride * (1 - field_n), srcVector, dstWidth, (dstHeight + field_n) >> 1, srcStride * 2, srcWidth, off + field_n, d->vectorSize); + prepareLines(_srcp + srcStride * (1 - field_n), srcVector, dstWidth, (dstHeight + field_n) >> 1, srcStride * 2, srcWidth, off + field_n, d->vectorSize); + + if (bmask) { + prepareMask(maskp, _mskVector, dstWidth, (dstHeight + field_n) >> 1, vsapi->getStride(mcp, plane), off, d->vectorSize); + + const int32_t * mskVector = reinterpret_cast(_mskVector); + const int mdis = std::min(dstWidth, d->mdis); + int last = -666999; + + for (int x = 0; x < mdis; x++) { + if (mskVector[x]) + last = x + mdis; + } - calculateConnectionCosts(srcVector, ccosts, dstWidth, srcWidth, d); + for (int x = 0; x < dstWidth - mdis; x++) { + if (mskVector[x + mdis]) + last = x + mdis * 2; + + bmask[x] = (x <= last); + } + + for (int x = dstWidth - mdis; x < dstWidth; x++) + bmask[x] = (x <= last); + + memset(ccosts - d->mdisVector, 0, dstWidth * d->tpitchVector * sizeof(float)); + } + + calculateConnectionCosts(srcVector, bmask, ccosts, dstWidth, srcWidth, d); // calculate path costs Vec4f().load_a(ccosts).store_a(pcosts); @@ -181,23 +217,38 @@ void filter_sse2(const VSFrameRef * src, const VSFrameRef * scp, VSFrameRef * ds float * pT = pcosts + d->tpitchVector * x; int * piT = _pbackt + d->tpitchVector * (x - 1); - const int umax = std::min({ x, dstWidth - 1 - x, d->mdis }); - const int umax2 = std::min({ x - 1, dstWidth - x, d->mdis }); - - for (int u = -umax; u <= umax; u++) { - Vec4i idx = zero_128b(); - Vec4f bval = FLT_MAX; - - for (int v = std::max(-umax2, u - 1); v <= std::min(umax2, u + 1); v++) { - const Vec4f z 
= Vec4f().load_a(ppT + v * d->vectorSize) + d->gamma * std::abs(u - v); - const Vec4f ccost = min(z, FLT_MAX * 0.9f); - idx = select(Vec4ib(ccost < bval), v, idx); - bval = min(ccost, bval); + if (bmask && !bmask[x]) { + if (x == 1) { + const int umax = std::min({ x, dstWidth - 1 - x, d->mdis }); + memcpy(pT - umax * d->vectorSize, tT - umax * d->vectorSize, (umax * 2 + 1) * d->vectorSize * sizeof(float)); + memset(piT - d->mdisVector, 0, d->tpitchVector * sizeof(int)); + } else { + memcpy(pT - d->mdisVector, ppT - d->mdisVector, d->tpitchVector * sizeof(float)); + memcpy(piT - d->mdisVector, piT - d->mdisVector - d->tpitchVector, d->tpitchVector * sizeof(int)); + const int pumax = std::min(x - 1, dstWidth - x); + if (pumax < d->mdis) { + Vec4i(1 - pumax).stream(piT - pumax * d->vectorSize); + Vec4i(pumax - 1).stream(piT + pumax * d->vectorSize); + } + } + } else { + const int umax = std::min({ x, dstWidth - 1 - x, d->mdis }); + const int umax2 = std::min({ x - 1, dstWidth - x, d->mdis }); + for (int u = -umax; u <= umax; u++) { + Vec4i idx = zero_128b(); + Vec4f bval = FLT_MAX; + + for (int v = std::max(-umax2, u - 1); v <= std::min(umax2, u + 1); v++) { + const Vec4f z = Vec4f().load_a(ppT + v * d->vectorSize) + d->gamma * std::abs(u - v); + const Vec4f ccost = min(z, FLT_MAX * 0.9f); + idx = select(Vec4ib(ccost < bval), v, idx); + bval = min(ccost, bval); + } + + const Vec4f z = bval + Vec4f().load_a(tT + u * d->vectorSize); + min(z, FLT_MAX * 0.9f).store_a(pT + u * d->vectorSize); + idx.stream(piT + u * d->vectorSize); } - - const Vec4f z = bval + Vec4f().load_a(tT + u * d->vectorSize); - min(z, FLT_MAX * 0.9f).store_a(pT + u * d->vectorSize); - idx.stream(piT + u * d->vectorSize); } } @@ -222,7 +273,7 @@ void filter_sse2(const VSFrameRef * src, const VSFrameRef * scp, VSFrameRef * ds for (int x = dstWidth - 2; x >= 0; x--) fpath[x] = pbackt[(d->tpitch * x + fpath[x + 1]) * d->vectorSize]; - interpolate(src3p, src1p, src1n, src3n, fpath, dmap, dstp, dstWidth, 
d->ucubic, d->peak); + interpolate(src3p, src1p, src1n, src3n, bmask, fpath, dmap, dstp, dstWidth, d->ucubic, d->peak); } } @@ -239,7 +290,7 @@ void filter_sse2(const VSFrameRef * src, const VSFrameRef * scp, VSFrameRef * ds } } -template void filter_sse2(const VSFrameRef *, const VSFrameRef *, VSFrameRef *, VSFrameRef **, const int, const EEDI3Data *, const VSAPI *) noexcept; -template void filter_sse2(const VSFrameRef *, const VSFrameRef *, VSFrameRef *, VSFrameRef **, const int, const EEDI3Data *, const VSAPI *) noexcept; -template void filter_sse2(const VSFrameRef *, const VSFrameRef *, VSFrameRef *, VSFrameRef **, const int, const EEDI3Data *, const VSAPI *) noexcept; +template void filter_sse2(const VSFrameRef *, const VSFrameRef *, const VSFrameRef *, VSFrameRef *, VSFrameRef *, VSFrameRef **, const int, const EEDI3Data * const VS_RESTRICT, const VSAPI *) noexcept; +template void filter_sse2(const VSFrameRef *, const VSFrameRef *, const VSFrameRef *, VSFrameRef *, VSFrameRef *, VSFrameRef **, const int, const EEDI3Data * const VS_RESTRICT, const VSAPI *) noexcept; +template void filter_sse2(const VSFrameRef *, const VSFrameRef *, const VSFrameRef *, VSFrameRef *, VSFrameRef *, VSFrameRef **, const int, const EEDI3Data * const VS_RESTRICT, const VSAPI *) noexcept; #endif diff --git a/EEDI3/EEDI3_SSE4.cpp b/EEDI3/EEDI3_SSE4.cpp index 9b88bba..0a38012 100644 --- a/EEDI3/EEDI3_SSE4.cpp +++ b/EEDI3/EEDI3_SSE4.cpp @@ -6,7 +6,7 @@ #include "EEDI3.hpp" template -static inline void calculateConnectionCosts(const void * srcp, float * ccosts, const int width, const int stride, const EEDI3Data * d) noexcept { +static inline void calculateConnectionCosts(const void * srcp, const bool * bmask, float * ccosts, const int width, const int stride, const EEDI3Data * const VS_RESTRICT d) noexcept { const __m128i * src3p = reinterpret_cast(srcp) + 12; const __m128i * src1p = src3p + stride; const __m128i * src1n = src1p + stride; @@ -14,66 +14,67 @@ static inline void 
calculateConnectionCosts(const void * srcp, float * ccosts, c if (d->cost3) { for (int x = 0; x < width; x++) { - const int umax = std::min({ x, width - 1 - x, d->mdis }); + if (!bmask || bmask[x]) { + const int umax = std::min({ x, width - 1 - x, d->mdis }); + for (int u = -umax; u <= umax; u++) { + const int u2 = u * 2; + const bool s1Flag = (u >= 0 && x >= u2) || (u <= 0 && x < width + u2); + const bool s2Flag = (u <= 0 && x >= -u2) || (u >= 0 && x < width - u2); + Vec4i s0 = zero_128b(), s1 = zero_128b(), s2 = zero_128b(); - for (int u = -umax; u <= umax; u++) { - const int u2 = u * 2; - const bool s1Flag = (u >= 0 && x >= u2) || (u <= 0 && x < width + u2); - const bool s2Flag = (u <= 0 && x >= -u2) || (u >= 0 && x < width - u2); - - Vec4i s0 = zero_128b(), s1 = zero_128b(), s2 = zero_128b(); - - for (int k = -(d->nrad); k <= d->nrad; k++) - s0 += abs(Vec4i().load_a(src3p + x + u + k) - Vec4i().load_a(src1p + x - u + k)) + - abs(Vec4i().load_a(src1p + x + u + k) - Vec4i().load_a(src1n + x - u + k)) + - abs(Vec4i().load_a(src1n + x + u + k) - Vec4i().load_a(src3n + x - u + k)); - - if (s1Flag) { for (int k = -(d->nrad); k <= d->nrad; k++) - s1 += abs(Vec4i().load_a(src3p + x + k) - Vec4i().load_a(src1p + x - u2 + k)) + - abs(Vec4i().load_a(src1p + x + k) - Vec4i().load_a(src1n + x - u2 + k)) + - abs(Vec4i().load_a(src1n + x + k) - Vec4i().load_a(src3n + x - u2 + k)); - } + s0 += abs(Vec4i().load_a(src3p + x + u + k) - Vec4i().load_a(src1p + x - u + k)) + + abs(Vec4i().load_a(src1p + x + u + k) - Vec4i().load_a(src1n + x - u + k)) + + abs(Vec4i().load_a(src1n + x + u + k) - Vec4i().load_a(src3n + x - u + k)); + + if (s1Flag) { + for (int k = -(d->nrad); k <= d->nrad; k++) + s1 += abs(Vec4i().load_a(src3p + x + k) - Vec4i().load_a(src1p + x - u2 + k)) + + abs(Vec4i().load_a(src1p + x + k) - Vec4i().load_a(src1n + x - u2 + k)) + + abs(Vec4i().load_a(src1n + x + k) - Vec4i().load_a(src3n + x - u2 + k)); + } - if (s2Flag) { - for (int k = -(d->nrad); k <= d->nrad; 
k++) - s2 += abs(Vec4i().load_a(src3p + x + u2 + k) - Vec4i().load_a(src1p + x + k)) + - abs(Vec4i().load_a(src1p + x + u2 + k) - Vec4i().load_a(src1n + x + k)) + - abs(Vec4i().load_a(src1n + x + u2 + k) - Vec4i().load_a(src3n + x + k)); - } + if (s2Flag) { + for (int k = -(d->nrad); k <= d->nrad; k++) + s2 += abs(Vec4i().load_a(src3p + x + u2 + k) - Vec4i().load_a(src1p + x + k)) + + abs(Vec4i().load_a(src1p + x + u2 + k) - Vec4i().load_a(src1n + x + k)) + + abs(Vec4i().load_a(src1n + x + u2 + k) - Vec4i().load_a(src3n + x + k)); + } - s1 = s1Flag ? s1 : (s2Flag ? s2 : s0); - s2 = s2Flag ? s2 : (s1Flag ? s1 : s0); + s1 = s1Flag ? s1 : (s2Flag ? s2 : s0); + s2 = s2Flag ? s2 : (s1Flag ? s1 : s0); - const Vec4i ip = (Vec4i().load_a(src1p + x + u) + Vec4i().load_a(src1n + x - u) + 1) >> 1; // should use cubic if ucubic=true - const Vec4f v = to_float(abs(Vec4i().load_a(src1p + x) - ip) + abs(Vec4i().load_a(src1n + x) - ip)); - const Vec4f result = mul_add(d->alpha, to_float(s0 + s1 + s2), d->beta * std::abs(u)) + d->remainingWeight * v; - result.stream(ccosts + (d->tpitch * x + u) * d->vectorSize); + const Vec4i ip = (Vec4i().load_a(src1p + x + u) + Vec4i().load_a(src1n + x - u) + 1) >> 1; // should use cubic if ucubic=true + const Vec4f v = to_float(abs(Vec4i().load_a(src1p + x) - ip) + abs(Vec4i().load_a(src1n + x) - ip)); + const Vec4f result = mul_add(d->alpha, to_float(s0 + s1 + s2), d->beta * std::abs(u)) + d->remainingWeight * v; + result.stream(ccosts + (d->tpitch * x + u) * d->vectorSize); + } } } } else { for (int x = 0; x < width; x++) { - const int umax = std::min({ x, width - 1 - x, d->mdis }); - - for (int u = -umax; u <= umax; u++) { - Vec4i s = zero_128b(); - - for (int k = -(d->nrad); k <= d->nrad; k++) - s += abs(Vec4i().load_a(src3p + x + u + k) - Vec4i().load_a(src1p + x - u + k)) + - abs(Vec4i().load_a(src1p + x + u + k) - Vec4i().load_a(src1n + x - u + k)) + - abs(Vec4i().load_a(src1n + x + u + k) - Vec4i().load_a(src3n + x - u + k)); + if 
(!bmask || bmask[x]) { + const int umax = std::min({ x, width - 1 - x, d->mdis }); + for (int u = -umax; u <= umax; u++) { + Vec4i s = zero_128b(); - const Vec4i ip = (Vec4i().load_a(src1p + x + u) + Vec4i().load_a(src1n + x - u) + 1) >> 1; // should use cubic if ucubic=true - const Vec4f v = to_float(abs(Vec4i().load_a(src1p + x) - ip) + abs(Vec4i().load_a(src1n + x) - ip)); - const Vec4f result = mul_add(d->alpha, to_float(s), d->beta * std::abs(u)) + d->remainingWeight * v; - result.stream(ccosts + (d->tpitch * x + u) * d->vectorSize); + for (int k = -(d->nrad); k <= d->nrad; k++) + s += abs(Vec4i().load_a(src3p + x + u + k) - Vec4i().load_a(src1p + x - u + k)) + + abs(Vec4i().load_a(src1p + x + u + k) - Vec4i().load_a(src1n + x - u + k)) + + abs(Vec4i().load_a(src1n + x + u + k) - Vec4i().load_a(src3n + x - u + k)); + + const Vec4i ip = (Vec4i().load_a(src1p + x + u) + Vec4i().load_a(src1n + x - u) + 1) >> 1; // should use cubic if ucubic=true + const Vec4f v = to_float(abs(Vec4i().load_a(src1p + x) - ip) + abs(Vec4i().load_a(src1n + x) - ip)); + const Vec4f result = mul_add(d->alpha, to_float(s), d->beta * std::abs(u)) + d->remainingWeight * v; + result.stream(ccosts + (d->tpitch * x + u) * d->vectorSize); + } } } } } template<> -inline void calculateConnectionCosts(const void * srcp, float * ccosts, const int width, const int stride, const EEDI3Data * d) noexcept { +inline void calculateConnectionCosts(const void * srcp, const bool * bmask, float * ccosts, const int width, const int stride, const EEDI3Data * const VS_RESTRICT d) noexcept { const __m128 * src3p = reinterpret_cast(srcp) + 12; const __m128 * src1p = src3p + stride; const __m128 * src1n = src1p + stride; @@ -81,66 +82,68 @@ inline void calculateConnectionCosts(const void * srcp, float * ccosts, c if (d->cost3) { for (int x = 0; x < width; x++) { - const int umax = std::min({ x, width - 1 - x, d->mdis }); - - for (int u = -umax; u <= umax; u++) { - const int u2 = u * 2; - const bool s1Flag = (u >= 
0 && x >= u2) || (u <= 0 && x < width + u2); - const bool s2Flag = (u <= 0 && x >= -u2) || (u >= 0 && x < width - u2); - - Vec4f s0 = zero_4f(), s1 = zero_4f(), s2 = zero_4f(); - - for (int k = -(d->nrad); k <= d->nrad; k++) - s0 += abs(Vec4f().load_a(src3p + x + u + k) - Vec4f().load_a(src1p + x - u + k)) + - abs(Vec4f().load_a(src1p + x + u + k) - Vec4f().load_a(src1n + x - u + k)) + - abs(Vec4f().load_a(src1n + x + u + k) - Vec4f().load_a(src3n + x - u + k)); + if (!bmask || bmask[x]) { + const int umax = std::min({ x, width - 1 - x, d->mdis }); + for (int u = -umax; u <= umax; u++) { + const int u2 = u * 2; + const bool s1Flag = (u >= 0 && x >= u2) || (u <= 0 && x < width + u2); + const bool s2Flag = (u <= 0 && x >= -u2) || (u >= 0 && x < width - u2); + Vec4f s0 = zero_4f(), s1 = zero_4f(), s2 = zero_4f(); - if (s1Flag) { for (int k = -(d->nrad); k <= d->nrad; k++) - s1 += abs(Vec4f().load_a(src3p + x + k) - Vec4f().load_a(src1p + x - u2 + k)) + - abs(Vec4f().load_a(src1p + x + k) - Vec4f().load_a(src1n + x - u2 + k)) + - abs(Vec4f().load_a(src1n + x + k) - Vec4f().load_a(src3n + x - u2 + k)); - } + s0 += abs(Vec4f().load_a(src3p + x + u + k) - Vec4f().load_a(src1p + x - u + k)) + + abs(Vec4f().load_a(src1p + x + u + k) - Vec4f().load_a(src1n + x - u + k)) + + abs(Vec4f().load_a(src1n + x + u + k) - Vec4f().load_a(src3n + x - u + k)); + + if (s1Flag) { + for (int k = -(d->nrad); k <= d->nrad; k++) + s1 += abs(Vec4f().load_a(src3p + x + k) - Vec4f().load_a(src1p + x - u2 + k)) + + abs(Vec4f().load_a(src1p + x + k) - Vec4f().load_a(src1n + x - u2 + k)) + + abs(Vec4f().load_a(src1n + x + k) - Vec4f().load_a(src3n + x - u2 + k)); + } - if (s2Flag) { - for (int k = -(d->nrad); k <= d->nrad; k++) - s2 += abs(Vec4f().load_a(src3p + x + u2 + k) - Vec4f().load_a(src1p + x + k)) + - abs(Vec4f().load_a(src1p + x + u2 + k) - Vec4f().load_a(src1n + x + k)) + - abs(Vec4f().load_a(src1n + x + u2 + k) - Vec4f().load_a(src3n + x + k)); - } + if (s2Flag) { + for (int k = 
-(d->nrad); k <= d->nrad; k++) + s2 += abs(Vec4f().load_a(src3p + x + u2 + k) - Vec4f().load_a(src1p + x + k)) + + abs(Vec4f().load_a(src1p + x + u2 + k) - Vec4f().load_a(src1n + x + k)) + + abs(Vec4f().load_a(src1n + x + u2 + k) - Vec4f().load_a(src3n + x + k)); + } - s1 = s1Flag ? s1 : (s2Flag ? s2 : s0); - s2 = s2Flag ? s2 : (s1Flag ? s1 : s0); + s1 = s1Flag ? s1 : (s2Flag ? s2 : s0); + s2 = s2Flag ? s2 : (s1Flag ? s1 : s0); - const Vec4f ip = (Vec4f().load_a(src1p + x + u) + Vec4f().load_a(src1n + x - u)) * 0.5f; // should use cubic if ucubic=true - const Vec4f v = abs(Vec4f().load_a(src1p + x) - ip) + abs(Vec4f().load_a(src1n + x) - ip); - const Vec4f result = mul_add(d->alpha, s0 + s1 + s2, d->beta * std::abs(u)) + d->remainingWeight * v; - result.stream(ccosts + (d->tpitch * x + u) * d->vectorSize); + const Vec4f ip = (Vec4f().load_a(src1p + x + u) + Vec4f().load_a(src1n + x - u)) * 0.5f; // should use cubic if ucubic=true + const Vec4f v = abs(Vec4f().load_a(src1p + x) - ip) + abs(Vec4f().load_a(src1n + x) - ip); + const Vec4f result = mul_add(d->alpha, s0 + s1 + s2, d->beta * std::abs(u)) + d->remainingWeight * v; + result.stream(ccosts + (d->tpitch * x + u) * d->vectorSize); + } } } } else { for (int x = 0; x < width; x++) { - const int umax = std::min({ x, width - 1 - x, d->mdis }); - - for (int u = -umax; u <= umax; u++) { - Vec4f s = zero_4f(); - - for (int k = -(d->nrad); k <= d->nrad; k++) - s += abs(Vec4f().load_a(src3p + x + u + k) - Vec4f().load_a(src1p + x - u + k)) + - abs(Vec4f().load_a(src1p + x + u + k) - Vec4f().load_a(src1n + x - u + k)) + - abs(Vec4f().load_a(src1n + x + u + k) - Vec4f().load_a(src3n + x - u + k)); + if (!bmask || bmask[x]) { + const int umax = std::min({ x, width - 1 - x, d->mdis }); + for (int u = -umax; u <= umax; u++) { + Vec4f s = zero_4f(); - const Vec4f ip = (Vec4f().load_a(src1p + x + u) + Vec4f().load_a(src1n + x - u)) * 0.5f; // should use cubic if ucubic=true - const Vec4f v = abs(Vec4f().load_a(src1p + x) - ip) 
+ abs(Vec4f().load_a(src1n + x) - ip); - const Vec4f result = mul_add(d->alpha, s, d->beta * std::abs(u)) + d->remainingWeight * v; - result.stream(ccosts + (d->tpitch * x + u) * d->vectorSize); + for (int k = -(d->nrad); k <= d->nrad; k++) + s += abs(Vec4f().load_a(src3p + x + u + k) - Vec4f().load_a(src1p + x - u + k)) + + abs(Vec4f().load_a(src1p + x + u + k) - Vec4f().load_a(src1n + x - u + k)) + + abs(Vec4f().load_a(src1n + x + u + k) - Vec4f().load_a(src3n + x - u + k)); + + const Vec4f ip = (Vec4f().load_a(src1p + x + u) + Vec4f().load_a(src1n + x - u)) * 0.5f; // should use cubic if ucubic=true + const Vec4f v = abs(Vec4f().load_a(src1p + x) - ip) + abs(Vec4f().load_a(src1n + x) - ip); + const Vec4f result = mul_add(d->alpha, s, d->beta * std::abs(u)) + d->remainingWeight * v; + result.stream(ccosts + (d->tpitch * x + u) * d->vectorSize); + } } } } } template -void filter_sse4(const VSFrameRef * src, const VSFrameRef * scp, VSFrameRef * dst, VSFrameRef ** pad, const int field_n, const EEDI3Data * d, const VSAPI * vsapi) noexcept { +void filter_sse4(const VSFrameRef * src, const VSFrameRef * scp, const VSFrameRef * mclip, VSFrameRef * mcp, VSFrameRef * dst, VSFrameRef ** pad, + const int field_n, const EEDI3Data * const VS_RESTRICT d, const VSAPI * vsapi) noexcept { for (int plane = 0; plane < d->vi.format->numPlanes; plane++) { if (d->process[plane]) { copyPad(src, pad[plane], plane, 1 - field_n, d->dh, vsapi); @@ -154,14 +157,22 @@ void filter_sse4(const VSFrameRef * src, const VSFrameRef * scp, VSFrameRef * ds const T1 * _srcp = reinterpret_cast(vsapi->getReadPtr(pad[plane], 0)) + 12; T1 * VS_RESTRICT _dstp = reinterpret_cast(vsapi->getWritePtr(dst, plane)); + const uint8_t * maskp = nullptr; + if (d->mclip) { + copyMask(mclip, mcp, plane, field_n, d->dh, vsapi); + maskp = vsapi->getReadPtr(mcp, plane); + } + const auto threadId = std::this_thread::get_id(); T2 * srcVector = reinterpret_cast(d->srcVector.at(threadId)); + uint8_t * _mskVector = 
d->mskVector.at(threadId); + bool * bmask = d->bmask.at(threadId); float * ccosts = d->ccosts.at(threadId) + d->mdisVector; float * pcosts = d->pcosts.at(threadId) + d->mdisVector; int * _pbackt = d->pbackt.at(threadId) + d->mdisVector; int * fpath = d->fpath.at(threadId); int * _dmap = d->dmap.at(threadId); - float * tline = d->tline.at(threadId); + int * tline = d->tline.at(threadId); vs_bitblt(_dstp + dstStride * (1 - field_n), vsapi->getStride(dst, plane) * 2, _srcp + srcStride * (4 + 1 - field_n), vsapi->getStride(pad[plane], 0) * 2, @@ -173,9 +184,34 @@ void filter_sse4(const VSFrameRef * src, const VSFrameRef * scp, VSFrameRef * ds for (int y = field_n; y < dstHeight; y += 2 * d->vectorSize) { const int off = (y - field_n) >> 1; - reorder(_srcp + srcStride * (1 - field_n), srcVector, dstWidth, (dstHeight + field_n) >> 1, srcStride * 2, srcWidth, off + field_n, d->vectorSize); + prepareLines(_srcp + srcStride * (1 - field_n), srcVector, dstWidth, (dstHeight + field_n) >> 1, srcStride * 2, srcWidth, off + field_n, d->vectorSize); + + if (bmask) { + prepareMask(maskp, _mskVector, dstWidth, (dstHeight + field_n) >> 1, vsapi->getStride(mcp, plane), off, d->vectorSize); + + const int32_t * mskVector = reinterpret_cast(_mskVector); + const int mdis = std::min(dstWidth, d->mdis); + int last = -666999; + + for (int x = 0; x < mdis; x++) { + if (mskVector[x]) + last = x + mdis; + } - calculateConnectionCosts(srcVector, ccosts, dstWidth, srcWidth, d); + for (int x = 0; x < dstWidth - mdis; x++) { + if (mskVector[x + mdis]) + last = x + mdis * 2; + + bmask[x] = (x <= last); + } + + for (int x = dstWidth - mdis; x < dstWidth; x++) + bmask[x] = (x <= last); + + memset(ccosts - d->mdisVector, 0, dstWidth * d->tpitchVector * sizeof(float)); + } + + calculateConnectionCosts(srcVector, bmask, ccosts, dstWidth, srcWidth, d); // calculate path costs Vec4f().load_a(ccosts).store_a(pcosts); @@ -185,23 +221,38 @@ void filter_sse4(const VSFrameRef * src, const VSFrameRef * scp, 
VSFrameRef * ds float * pT = pcosts + d->tpitchVector * x; int * piT = _pbackt + d->tpitchVector * (x - 1); - const int umax = std::min({ x, dstWidth - 1 - x, d->mdis }); - const int umax2 = std::min({ x - 1, dstWidth - x, d->mdis }); - - for (int u = -umax; u <= umax; u++) { - Vec4i idx = zero_128b(); - Vec4f bval = FLT_MAX; - - for (int v = std::max(-umax2, u - 1); v <= std::min(umax2, u + 1); v++) { - const Vec4f z = Vec4f().load_a(ppT + v * d->vectorSize) + d->gamma * std::abs(u - v); - const Vec4f ccost = min(z, FLT_MAX * 0.9f); - idx = select(Vec4ib(ccost < bval), v, idx); - bval = min(ccost, bval); + if (bmask && !bmask[x]) { + if (x == 1) { + const int umax = std::min({ x, dstWidth - 1 - x, d->mdis }); + memcpy(pT - umax * d->vectorSize, tT - umax * d->vectorSize, (umax * 2 + 1) * d->vectorSize * sizeof(float)); + memset(piT - d->mdisVector, 0, d->tpitchVector * sizeof(int)); + } else { + memcpy(pT - d->mdisVector, ppT - d->mdisVector, d->tpitchVector * sizeof(float)); + memcpy(piT - d->mdisVector, piT - d->mdisVector - d->tpitchVector, d->tpitchVector * sizeof(int)); + const int pumax = std::min(x - 1, dstWidth - x); + if (pumax < d->mdis) { + Vec4i(1 - pumax).stream(piT - pumax * d->vectorSize); + Vec4i(pumax - 1).stream(piT + pumax * d->vectorSize); + } + } + } else { + const int umax = std::min({ x, dstWidth - 1 - x, d->mdis }); + const int umax2 = std::min({ x - 1, dstWidth - x, d->mdis }); + for (int u = -umax; u <= umax; u++) { + Vec4i idx = zero_128b(); + Vec4f bval = FLT_MAX; + + for (int v = std::max(-umax2, u - 1); v <= std::min(umax2, u + 1); v++) { + const Vec4f z = Vec4f().load_a(ppT + v * d->vectorSize) + d->gamma * std::abs(u - v); + const Vec4f ccost = min(z, FLT_MAX * 0.9f); + idx = select(Vec4ib(ccost < bval), v, idx); + bval = min(ccost, bval); + } + + const Vec4f z = bval + Vec4f().load_a(tT + u * d->vectorSize); + min(z, FLT_MAX * 0.9f).store_a(pT + u * d->vectorSize); + idx.stream(piT + u * d->vectorSize); } - - const Vec4f z = bval + 
Vec4f().load_a(tT + u * d->vectorSize); - min(z, FLT_MAX * 0.9f).store_a(pT + u * d->vectorSize); - idx.stream(piT + u * d->vectorSize); } } @@ -226,7 +277,7 @@ void filter_sse4(const VSFrameRef * src, const VSFrameRef * scp, VSFrameRef * ds for (int x = dstWidth - 2; x >= 0; x--) fpath[x] = pbackt[(d->tpitch * x + fpath[x + 1]) * d->vectorSize]; - interpolate(src3p, src1p, src1n, src3n, fpath, dmap, dstp, dstWidth, d->ucubic, d->peak); + interpolate(src3p, src1p, src1n, src3n, bmask, fpath, dmap, dstp, dstWidth, d->ucubic, d->peak); } } @@ -243,7 +294,7 @@ void filter_sse4(const VSFrameRef * src, const VSFrameRef * scp, VSFrameRef * ds } } -template void filter_sse4(const VSFrameRef *, const VSFrameRef *, VSFrameRef *, VSFrameRef **, const int, const EEDI3Data *, const VSAPI *) noexcept; -template void filter_sse4(const VSFrameRef *, const VSFrameRef *, VSFrameRef *, VSFrameRef **, const int, const EEDI3Data *, const VSAPI *) noexcept; -template void filter_sse4(const VSFrameRef *, const VSFrameRef *, VSFrameRef *, VSFrameRef **, const int, const EEDI3Data *, const VSAPI *) noexcept; +template void filter_sse4(const VSFrameRef *, const VSFrameRef *, const VSFrameRef *, VSFrameRef *, VSFrameRef *, VSFrameRef **, const int, const EEDI3Data * const VS_RESTRICT, const VSAPI *) noexcept; +template void filter_sse4(const VSFrameRef *, const VSFrameRef *, const VSFrameRef *, VSFrameRef *, VSFrameRef *, VSFrameRef **, const int, const EEDI3Data * const VS_RESTRICT, const VSAPI *) noexcept; +template void filter_sse4(const VSFrameRef *, const VSFrameRef *, const VSFrameRef *, VSFrameRef *, VSFrameRef *, VSFrameRef **, const int, const EEDI3Data * const VS_RESTRICT, const VSAPI *) noexcept; #endif diff --git a/EEDI3/shared.hpp b/EEDI3/shared.hpp index 42cde37..dd13b1a 100644 --- a/EEDI3/shared.hpp +++ b/EEDI3/shared.hpp @@ -52,36 +52,54 @@ static void copyPad(const VSFrameRef * src, VSFrameRef * dst, const int plane, c } template -static inline void interpolate(const T * 
src3p, const T * src1p, const T * src1n, const T * src3n, const int * fpath, int * VS_RESTRICT dmap, T * VS_RESTRICT dstp, - const int width, const bool ucubic, const int peak) noexcept { +static inline void interpolate(const T * src3p, const T * src1p, const T * src1n, const T * src3n, const bool * bmask, const int * fpath, + int * VS_RESTRICT dmap, T * VS_RESTRICT dstp, const int width, const bool ucubic, const int peak) noexcept { for (int x = 0; x < width; x++) { - const int dir = fpath[x]; - const int dir3 = dir * 3; - const int absDir3 = std::abs(dir3); - - dmap[x] = dir; - - if (ucubic && x >= absDir3 && x <= width - 1 - absDir3) - dstp[x] = std::min(std::max((36 * (src1p[x + dir] + src1n[x - dir]) - 4 * (src3p[x + dir3] + src3n[x - dir3]) + 32) >> 6, 0), peak); - else - dstp[x] = (src1p[x + dir] + src1n[x - dir] + 1) >> 1; + if (bmask && !bmask[x]) { + dmap[x] = 0; + + if (ucubic) + dstp[x] = std::min(std::max((9 * (src1p[x] + src1n[x]) - (src3p[x] + src3n[x]) + 8) >> 4, 0), peak); + else + dstp[x] = (src1p[x] + src1n[x] + 1) >> 1; + } else { + const int dir = fpath[x]; + const int dir3 = dir * 3; + const int absDir3 = std::abs(dir3); + + dmap[x] = dir; + + if (ucubic && x >= absDir3 && x <= width - 1 - absDir3) + dstp[x] = std::min(std::max((9 * (src1p[x + dir] + src1n[x - dir]) - (src3p[x + dir3] + src3n[x - dir3]) + 8) >> 4, 0), peak); + else + dstp[x] = (src1p[x + dir] + src1n[x - dir] + 1) >> 1; + } } } template<> -inline void interpolate(const float * src3p, const float * src1p, const float * src1n, const float * src3n, const int * fpath, int * VS_RESTRICT dmap, float * VS_RESTRICT dstp, - const int width, const bool ucubic, const int peak) noexcept { +inline void interpolate(const float * src3p, const float * src1p, const float * src1n, const float * src3n, const bool * bmask, const int * fpath, + int * VS_RESTRICT dmap, float * VS_RESTRICT dstp, const int width, const bool ucubic, const int peak) noexcept { for (int x = 0; x < width; x++) { - const 
int dir = fpath[x]; - const int dir3 = dir * 3; - const int absDir3 = std::abs(dir3); - - dmap[x] = dir; - - if (ucubic && x >= absDir3 && x <= width - 1 - absDir3) - dstp[x] = 0.5625f * (src1p[x + dir] + src1n[x - dir]) - 0.0625f * (src3p[x + dir3] + src3n[x - dir3]); - else - dstp[x] = (src1p[x + dir] + src1n[x - dir]) / 2.f; + if (bmask && !bmask[x]) { + dmap[x] = 0; + + if (ucubic) + dstp[x] = 0.5625f * (src1p[x] + src1n[x]) - 0.0625f * (src3p[x] + src3n[x]); + else + dstp[x] = (src1p[x] + src1n[x]) / 2.f; + } else { + const int dir = fpath[x]; + const int dir3 = dir * 3; + const int absDir3 = std::abs(dir3); + + dmap[x] = dir; + + if (ucubic && x >= absDir3 && x <= width - 1 - absDir3) + dstp[x] = 0.5625f * (src1p[x + dir] + src1n[x - dir]) - 0.0625f * (src3p[x + dir3] + src3n[x - dir3]); + else + dstp[x] = (src1p[x + dir] + src1n[x - dir]) / 2.f; + } } } @@ -101,7 +119,7 @@ static void vCheck(const T * srcp, const T * scpp, T * VS_RESTRICT dstp, const i for (int x = 0; x < dstWidth; x++) { const int dirc = dmap[x]; - const T cint = scpp ? scpp[x] : std::min(std::max((36 * (dst1p[x] + dst1n[x]) - 4 * (dst3p[x] + dst3n[x]) + 32) >> 6, 0), peak); + const T cint = scpp ? 
scpp[x] : std::min(std::max((9 * (dst1p[x] + dst1n[x]) - (dst3p[x] + dst3n[x]) + 8) >> 4, 0), peak); if (dirc == 0) { tline[x] = cint; diff --git a/README.md b/README.md index 34b8f59..34c8dd9 100644 --- a/README.md +++ b/README.md @@ -9,7 +9,7 @@ Ported from AviSynth plugin http://bengal.missouri.edu/~kes25c/ and http://ldeso Usage ===== - eedi3m.EEDI3(clip clip, int field[, bint dh=False, int[] planes, float alpha=0.2, float beta=0.25, float gamma=20.0, int nrad=2, int mdis=20, bint hp=False, bint ucubic=True, bint cost3=True, int vcheck=2, float vthresh0=32.0, float vthresh1=64.0, float vthresh2=4.0, clip sclip=None, int opt=0]) + eedi3m.EEDI3(clip clip, int field[, bint dh=False, int[] planes, float alpha=0.2, float beta=0.25, float gamma=20.0, int nrad=2, int mdis=20, bint hp=False, bint ucubic=True, bint cost3=True, int vcheck=2, float vthresh0=32.0, float vthresh1=64.0, float vthresh2=4.0, clip sclip=None, clip mclip=None, int opt=0]) * clip: Clip to process. Any planar format with either integer sample type of 8-16 bit depth or float sample type of 32 bit depth is supported. @@ -19,15 +19,15 @@ Usage * 2 = double rate (alternates each frame), starts with bottom * 3 = double rate (alternates each frame), starts with top -* dh: Doubles the height of the input. Each line of the input is copied to every other line of the output and the missing lines are interpolated. If field=0, the input is copied to the odd lines of the output. If field=1, the input is copied to the even lines of the output. field must be set to either 0 or 1 when using dh=True. +* dh: Doubles the height of the input. Each line of the input is copied to every other line of the output and the missing lines are interpolated. If `field=0`, the input is copied to the odd lines of the output. If `field=1`, the input is copied to the even lines of the output. `field` must be set to either 0 or 1 when using `dh=True`. * planes: A list of the planes to process. By default all planes are processed. 
-* alpha/beta/gamma: These trade off line/edge connection vs artifacts created. alpha and beta must be in the range [0,1], and the sum alpha+beta must be in the range [0,1]. alpha is the weight given to connecting similar neighborhoods. The larger alpha is the more lines/edges should be connected. beta is the weight given to vertical difference created by the interpolation. The larger beta is the less edges/lines will be connected (at 1.0 you get no edge directedness at all). The remaining weight (1.0-alpha-beta) is given to interpolation direction (large directions (away from vertical) cost more). So the more weight you have here the more shorter connections will be favored. Finally, gamma penalizes changes in interpolation direction, the larger gamma is the smoother the interpolation field between two lines (range is [0,inf]. If lines aren't getting connected then increase alpha and maybe decrease beta/gamma. Go the other way if you are getting unwanted artifacts. +* alpha/beta/gamma: These trade off line/edge connection vs artifacts created. `alpha` and `beta` must be in the range [0,1], and the sum `alpha`+`beta` must be in the range [0,1]. `alpha` is the weight given to connecting similar neighborhoods. The larger `alpha` is the more lines/edges should be connected. `beta` is the weight given to vertical difference created by the interpolation. The larger `beta` is the less edges/lines will be connected (at 1.0 you get no edge directedness at all). The remaining weight (1.0-`alpha`-`beta`) is given to interpolation direction (large directions (away from vertical) cost more). So the more weight you have here the more shorter connections will be favored. Finally, `gamma` penalizes changes in interpolation direction; the larger `gamma` is, the smoother the interpolation field between two lines (range is [0,inf]). If lines aren't getting connected then increase `alpha` and maybe decrease `beta`/`gamma`. Go the other way if you are getting unwanted artifacts. 
-* nrad/mdis: nrad sets the radius used for computing neighborhood similarity. Valid range is [0,3]. mdis sets the maximum connection radius. Valid range is [1,40]. If mdis=20, then when interpolating pixel (50,10) (x,y), the farthest connections allowed would be between (30,9)/(70,11) and (70,9)/(30,11). Larger mdis will allow connecting lines of smaller slope, but also increases the chance of artifacts. Larger mdis will be slower. Larger nrad will be slower. +* nrad/mdis: `nrad` sets the radius used for computing neighborhood similarity. Valid range is [0,3]. `mdis` sets the maximum connection radius. Valid range is [1,40]. If `mdis=20`, then when interpolating pixel (50,10) (x,y), the farthest connections allowed would be between (30,9)/(70,11) and (70,9)/(30,11). Larger `mdis` will allow connecting lines of smaller slope, but also increases the chance of artifacts. Larger `mdis` will be slower. Larger `nrad` will be slower. -* hp/ucubic/cost3: These are speed vs quality options. hp=True, use half pel steps, hp=False, use full pel steps. Currently only full pel is implemented and this parameter has no effect. ucubic=True, use cubic 4 point interpolation, ucubic=False, use 2 point linear interpolation. cost3=True, use 3 neighborhood cost function to define similarity, cost3=False, use 1 neighborhood cost function. +* hp/ucubic/cost3: These are speed vs quality options. `hp=True`, use half pel steps, `hp=False`, use full pel steps. Currently only full pel is implemented and this parameter has no effect. `ucubic=True`, use cubic 4 point interpolation, `ucubic=False`, use 2 point linear interpolation. `cost3=True`, use 3 neighborhood cost function to define similarity, `cost3=False`, use 1 neighborhood cost function. * vcheck/vthresh0/vthresh1/vthresh2/sclip: ``` @@ -77,6 +77,8 @@ Usage then vertical cubic interpolation is used to create it. ``` +* mclip: A mask to use edge-directed interpolation only on specified pixels. 
Pixels where the mask is 0 are generated using plain vertical interpolation instead: 4-point cubic if `ucubic=True`, or 2-point linear if `ucubic=False`. The main goal of the mask is to save calculations. This parameter does not exist in `EEDI3CL` because using a mask there actually makes it slower. Note that the code paths for SSE4.1 and above could be slower than SSE2 when `mclip` is used. + * opt: Sets which cpu optimizations to use. * 0 = auto detect * 1 = use c @@ -91,9 +93,9 @@ Usage * device: Sets target OpenCL device. Use `list_device` to get the index of the available devices. By default the default device is selected. -* list_device: Whether the devices list is drawn on the frame. +* list_device: Whether to draw the devices list on the frame. -* info: Whether the OpenCL-related info is drawn on the frame. +* info: Whether to draw the OpenCL-related info on the frame. * opt: Sets which cpu optimizations to use. * 0 = auto detect