From 34c0b6d55e34f9106f2086e077674d3d7293eeb1 Mon Sep 17 00:00:00 2001 From: Starbuck5 <46412508+Starbuck5@users.noreply.github.com> Date: Mon, 11 Dec 2023 23:23:37 -0800 Subject: [PATCH 1/4] Unify even/odd width blit path Benchmarks show a small decline on even surfaces, big (nearly 2x) improvement on odd surfaces. --- src_c/simd_blitters_sse2.c | 249 ++++++++++++++++--------------------- 1 file changed, 110 insertions(+), 139 deletions(-) diff --git a/src_c/simd_blitters_sse2.c b/src_c/simd_blitters_sse2.c index 0ed96a4847..04c3f964ac 100644 --- a/src_c/simd_blitters_sse2.c +++ b/src_c/simd_blitters_sse2.c @@ -479,164 +479,135 @@ alphablit_alpha_sse2_argb_no_surf_alpha_opaque_dst(SDL_BlitInfo *info) int srcskip = info->s_skip >> 2; int dstskip = info->d_skip >> 2; - Uint64 *srcp64 = (Uint64 *)info->s_pixels; - Uint64 *dstp64 = (Uint64 *)info->d_pixels; - - Uint64 rgb_mask64 = 0x00FFFFFF00FFFFFF; - Uint32 rgb_mask32 = 0x00FFFFFF; - Uint32 *srcp32 = (Uint32 *)info->s_pixels; Uint32 *dstp32 = (Uint32 *)info->d_pixels; - __m128i src1, dst1, sub_dst, mm_src_alpha, mm_zero, mm_rgb_mask; + int pre_2_width = width % 2; + int post_2_width = width / 2; - /* There are two paths through this blitter: - 1. Two pixels at once. - 2. One pixel at a time. - */ - if (((width % 2) == 0) && ((srcskip % 2) == 0) && ((dstskip % 2) == 0)) { - width = width / 2; - srcskip = srcskip / 2; - dstskip = dstskip / 2; + __m128i src1, dst1, sub_dst, mm_src_alpha; + __m128i mm_rgb_mask = _mm_set1_epi32(0x00FFFFFF); + __m128i mm_zero = _mm_setzero_si128(); - mm_zero = _mm_setzero_si128(); + while (height--) { + LOOP_UNROLLED4( + { + /* src(ARGB) -> src1 (00000000ARGBARGB) */ + LOAD_64_INTO_M128((Uint64 *)srcp32, &src1); + + /* isolate alpha channels + * 00000000A1000A2000 -> mm_src_alpha */ + mm_src_alpha = _mm_andnot_si128(mm_rgb_mask, src1); + + /* shift right to position alpha channels for manipulation + * 000000000A1000A200 -> mm_src_alpha*/ + mm_src_alpha = _mm_srli_si128(mm_src_alpha, 1); + + /* shuffle alpha channels to duplicate 16 bit pairs + * shuffle (3, 3, 1, 1) (backed 2 bit numbers) + * [00][00][00][00][0A1][00][0A2][00] -> mm_src_alpha + * [7 ][6 ][5 ][4 ][ 3 ][2 ][ 1 ][0 ] + * Therefore the previous contents of 16 bit number #1 + * Goes into 16 bit number #1 and #2, and the previous + * content of 16 bit number #3 goes into #2 and #3 */ + mm_src_alpha = _mm_shufflelo_epi16(mm_src_alpha, 0b11110101); + + /* finally move into final config + * spread out so they can be multiplied in 16 bit math + * against all RGBA of both pixels being blit + * 0A10A10A10A10A20A20A20A2 -> mm_src_alpha */ + mm_src_alpha = _mm_unpacklo_epi16(mm_src_alpha, mm_src_alpha); + + /* 0A0R0G0B0A0R0G0B -> src1 */ + src1 = _mm_unpacklo_epi8(src1, mm_zero); + + /* dst(ARGB) -> dst1 (00000000ARGBARGB) */ + LOAD_64_INTO_M128((Uint64 *)dstp32, &dst1); + /* 0A0R0G0B0A0R0G0B -> dst1 */ + dst1 = _mm_unpacklo_epi8(dst1, mm_zero); + + /* (srcRGB - dstRGB) */ + sub_dst = _mm_sub_epi16(src1, dst1); + + /* (srcRGB - dstRGB) * srcA */ + sub_dst = _mm_mullo_epi16(sub_dst, mm_src_alpha); + + /* (srcRGB - dstRGB) * srcA + srcRGB */ + sub_dst = _mm_add_epi16(sub_dst, src1); + + /* (dstRGB << 8) */ + dst1 = _mm_slli_epi16(dst1, 8); + + /* ((dstRGB << 8) + (srcRGB - dstRGB) * srcA + srcRGB) */ + sub_dst = _mm_add_epi16(sub_dst, dst1); + + /* (((dstRGB << 8) + (srcRGB - dstRGB) * srcA + srcRGB) >> + * 8)*/ + sub_dst = _mm_srli_epi16(sub_dst, 8); + + /* pack everything back into a pixel with zeroed out alpha + */ + sub_dst = _mm_packus_epi16(sub_dst, 
mm_zero); + sub_dst = _mm_and_si128(sub_dst, mm_rgb_mask); + STORE_M128_INTO_64(&sub_dst, (Uint64 *)dstp32); + + srcp32 += 2; + dstp32 += 2; + }, + n, post_2_width); - /* two pixels at a time */ - LOAD_64_INTO_M128(&rgb_mask64, &mm_rgb_mask); - while (height--) { - LOOP_UNROLLED4( - { - /* src(ARGB) -> src1 (00000000ARGBARGB) */ - LOAD_64_INTO_M128(srcp64, &src1); + for (int i = 0; i < pre_2_width; i++) { + /* Do the actual blend */ + /* src(ARGB) -> src1 (000000000000ARGB) */ + src1 = _mm_cvtsi32_si128(*srcp32); + /* src1 >> ashift -> mm_src_alpha(000000000000000A) */ + mm_src_alpha = _mm_srli_si128(src1, 3); - /* isolate alpha channels - * 00000000A1000A2000 -> mm_src_alpha */ - mm_src_alpha = _mm_andnot_si128(mm_rgb_mask, src1); + /* Then Calc RGB */ + /* 0000000000000A0A -> rgb_src_alpha */ + mm_src_alpha = _mm_unpacklo_epi16(mm_src_alpha, mm_src_alpha); + /* 000000000A0A0A0A -> rgb_src_alpha */ + mm_src_alpha = _mm_unpacklo_epi32(mm_src_alpha, mm_src_alpha); - /* shift right to position alpha channels for manipulation - * 000000000A1000A200 -> mm_src_alpha*/ - mm_src_alpha = _mm_srli_si128(mm_src_alpha, 1); + /* 000000000A0R0G0B -> src1 */ + src1 = _mm_unpacklo_epi8(src1, mm_zero); - /* shuffle alpha channels to duplicate 16 bit pairs - * shuffle (3, 3, 1, 1) (backed 2 bit numbers) - * [00][00][00][00][0A1][00][0A2][00] -> mm_src_alpha - * [7 ][6 ][5 ][4 ][ 3 ][2 ][ 1 ][0 ] - * Therefore the previous contents of 16 bit number #1 - * Goes into 16 bit number #1 and #2, and the previous - * content of 16 bit number #3 goes into #2 and #3 */ - mm_src_alpha = - _mm_shufflelo_epi16(mm_src_alpha, 0b11110101); + /* dst(ARGB) -> dst1 (000000000000ARGB) */ + dst1 = _mm_cvtsi32_si128(*dstp32); + /* 000000000A0R0G0B -> dst1 */ + dst1 = _mm_unpacklo_epi8(dst1, mm_zero); - /* finally move into final config - * spread out so they can be multiplied in 16 bit math - * against all RGBA of both pixels being blit - * 0A10A10A10A10A20A20A20A2 -> mm_src_alpha */ - mm_src_alpha = - _mm_unpacklo_epi16(mm_src_alpha, mm_src_alpha); + /* (srcRGB - dstRGB) */ + sub_dst = _mm_sub_epi16(src1, dst1); - /* 0A0R0G0B0A0R0G0B -> src1 */ - src1 = _mm_unpacklo_epi8(src1, mm_zero); + /* (srcRGB - dstRGB) * srcA */ + sub_dst = _mm_mullo_epi16(sub_dst, mm_src_alpha); - /* dst(ARGB) -> dst1 (00000000ARGBARGB) */ - LOAD_64_INTO_M128(dstp64, &dst1); - /* 0A0R0G0B0A0R0G0B -> dst1 */ - dst1 = _mm_unpacklo_epi8(dst1, mm_zero); + /* (srcRGB - dstRGB) * srcA + srcRGB */ + sub_dst = _mm_add_epi16(sub_dst, src1); - /* (srcRGB - dstRGB) */ - sub_dst = _mm_sub_epi16(src1, dst1); - - /* (srcRGB - dstRGB) * srcA */ - sub_dst = _mm_mullo_epi16(sub_dst, mm_src_alpha); - - /* (srcRGB - dstRGB) * srcA + srcRGB */ - sub_dst = _mm_add_epi16(sub_dst, src1); - - /* (dstRGB << 8) */ - dst1 = _mm_slli_epi16(dst1, 8); + /* (dstRGB << 8) */ + dst1 = _mm_slli_epi16(dst1, 8); - /* ((dstRGB << 8) + (srcRGB - dstRGB) * srcA + srcRGB) */ - sub_dst = _mm_add_epi16(sub_dst, dst1); + /* ((dstRGB << 8) + (srcRGB - dstRGB) * srcA + srcRGB) */ + sub_dst = _mm_add_epi16(sub_dst, dst1); - /* (((dstRGB << 8) + (srcRGB - dstRGB) * srcA + srcRGB) >> - * 8)*/ - sub_dst = _mm_srli_epi16(sub_dst, 8); + /* (((dstRGB << 8) + (srcRGB - dstRGB) * srcA + srcRGB) >> + * 8)*/ + sub_dst = _mm_srli_epi16(sub_dst, 8); - /* pack everything back into a pixel with zeroed out alpha - */ - sub_dst = _mm_packus_epi16(sub_dst, mm_zero); - sub_dst = _mm_and_si128(sub_dst, mm_rgb_mask); - STORE_M128_INTO_64(&sub_dst, dstp64); + /* pack everything back into a pixel */ + sub_dst = 
_mm_packus_epi16(sub_dst, mm_zero); + sub_dst = _mm_and_si128(sub_dst, mm_rgb_mask); + /* reset alpha to 0 */ + *dstp32 = _mm_cvtsi128_si32(sub_dst); - ++srcp64; - ++dstp64; - }, - n, width); - srcp64 += srcskip; - dstp64 += dstskip; + srcp32++; + dstp32++; } - } - else { - /* one pixel at a time */ - mm_zero = _mm_setzero_si128(); - mm_rgb_mask = _mm_cvtsi32_si128(rgb_mask32); - - while (height--) { - LOOP_UNROLLED4( - { - /* Do the actual blend */ - /* src(ARGB) -> src1 (000000000000ARGB) */ - src1 = _mm_cvtsi32_si128(*srcp32); - /* src1 >> ashift -> mm_src_alpha(000000000000000A) */ - mm_src_alpha = _mm_srli_si128(src1, 3); - - /* Then Calc RGB */ - /* 0000000000000A0A -> rgb_src_alpha */ - mm_src_alpha = - _mm_unpacklo_epi16(mm_src_alpha, mm_src_alpha); - /* 000000000A0A0A0A -> rgb_src_alpha */ - mm_src_alpha = - _mm_unpacklo_epi32(mm_src_alpha, mm_src_alpha); - - /* 000000000A0R0G0B -> src1 */ - src1 = _mm_unpacklo_epi8(src1, mm_zero); - - /* dst(ARGB) -> dst1 (000000000000ARGB) */ - dst1 = _mm_cvtsi32_si128(*dstp32); - /* 000000000A0R0G0B -> dst1 */ - dst1 = _mm_unpacklo_epi8(dst1, mm_zero); - - /* (srcRGB - dstRGB) */ - sub_dst = _mm_sub_epi16(src1, dst1); - - /* (srcRGB - dstRGB) * srcA */ - sub_dst = _mm_mullo_epi16(sub_dst, mm_src_alpha); - - /* (srcRGB - dstRGB) * srcA + srcRGB */ - sub_dst = _mm_add_epi16(sub_dst, src1); - - /* (dstRGB << 8) */ - dst1 = _mm_slli_epi16(dst1, 8); - - /* ((dstRGB << 8) + (srcRGB - dstRGB) * srcA + srcRGB) */ - sub_dst = _mm_add_epi16(sub_dst, dst1); - - /* (((dstRGB << 8) + (srcRGB - dstRGB) * srcA + srcRGB) >> - * 8)*/ - sub_dst = _mm_srli_epi16(sub_dst, 8); - /* pack everything back into a pixel */ - sub_dst = _mm_packus_epi16(sub_dst, mm_zero); - sub_dst = _mm_and_si128(sub_dst, mm_rgb_mask); - /* reset alpha to 0 */ - *dstp32 = _mm_cvtsi128_si32(sub_dst); - - ++srcp32; - ++dstp32; - }, - n, width); - srcp32 += srcskip; - dstp32 += dstskip; - } + srcp32 += srcskip; + dstp32 += dstskip; } } From c43140e72f139d6364360d292862f140d79fcddd Mon Sep 17 00:00:00 2001 From: Starbuck5 <46412508+Starbuck5@users.noreply.github.com> Date: Tue, 12 Dec 2023 22:18:15 -0800 Subject: [PATCH 2/4] Upgrade alpha blitter from 2px to 4px batches --- src_c/simd_blitters_sse2.c | 110 ++++++++++++++++++++++++++++--------- 1 file changed, 84 insertions(+), 26 deletions(-) diff --git a/src_c/simd_blitters_sse2.c b/src_c/simd_blitters_sse2.c index 04c3f964ac..e51d43171b 100644 --- a/src_c/simd_blitters_sse2.c +++ b/src_c/simd_blitters_sse2.c @@ -482,55 +482,105 @@ alphablit_alpha_sse2_argb_no_surf_alpha_opaque_dst(SDL_BlitInfo *info) Uint32 *srcp32 = (Uint32 *)info->s_pixels; Uint32 *dstp32 = (Uint32 *)info->d_pixels; - int pre_2_width = width % 2; - int post_2_width = width / 2; + int pre_4_width = width % 4; + int post_4_width = width / 4; __m128i src1, dst1, sub_dst, mm_src_alpha; + __m128i unpacked_alpha, pixels_src, pixels_dst, batch_a_dst; + __m128i *srcp128, *dstp128; __m128i mm_rgb_mask = _mm_set1_epi32(0x00FFFFFF); __m128i mm_zero = _mm_setzero_si128(); while (height--) { + srcp128 = (__m128i *)srcp32; + dstp128 = (__m128i *)dstp32; + LOOP_UNROLLED4( { - /* src(ARGB) -> src1 (00000000ARGBARGB) */ - LOAD_64_INTO_M128((Uint64 *)srcp32, &src1); + /* + * 4 pixel preparations + */ + + /* src(ARGB) -> pixels_src (ARGBARGBARGBARGB) */ + pixels_src = _mm_loadu_si128(srcp128); /* isolate alpha channels - * 00000000A1000A2000 -> mm_src_alpha */ - mm_src_alpha = _mm_andnot_si128(mm_rgb_mask, src1); + * A1000A2000A3000A4000 -> mm_src_alpha */ + mm_src_alpha = 
_mm_andnot_si128(mm_rgb_mask, pixels_src); /* shift right to position alpha channels for manipulation - * 000000000A1000A200 -> mm_src_alpha*/ + * 0A1000A2000A3000A400 -> mm_src_alpha*/ mm_src_alpha = _mm_srli_si128(mm_src_alpha, 1); + /* dst(ARGB) -> pixels_dst (ARGBARGBARGBARGB) */ + pixels_dst = _mm_loadu_si128(dstp128); + + /* + * BATCH A (the 2 low pixels) + */ + /* shuffle alpha channels to duplicate 16 bit pairs * shuffle (3, 3, 1, 1) (backed 2 bit numbers) - * [00][00][00][00][0A1][00][0A2][00] -> mm_src_alpha - * [7 ][6 ][5 ][4 ][ 3 ][2 ][ 1 ][0 ] - * Therefore the previous contents of 16 bit number #1 - * Goes into 16 bit number #1 and #2, and the previous - * content of 16 bit number #3 goes into #2 and #3 */ - mm_src_alpha = _mm_shufflelo_epi16(mm_src_alpha, 0b11110101); + * [00 ][00 ][00 ][00 ][0A3][0A3][0A4][0A4] -> mm_src_alpha + * [ 7 ][ 6 ][ 5 ][ 4 ][ 3 ][ 2 ][ 1 ][ 0 ] + * Therefore the previous contents of 16 bit lane 1 + * Goes into 16 bit lanes 0 and 1, and the previous + * content of 16 bit lane 3 goes into lanes 2 and 3*/ + unpacked_alpha = _mm_shufflelo_epi16(mm_src_alpha, 0b11110101); /* finally move into final config * spread out so they can be multiplied in 16 bit math * against all RGBA of both pixels being blit - * 0A10A10A10A10A20A20A20A2 -> mm_src_alpha */ - mm_src_alpha = _mm_unpacklo_epi16(mm_src_alpha, mm_src_alpha); + * [0A3][0A3][0A3][0A3][0A4][0A4][0A4][0A4] -> unpacked_alpha + */ + unpacked_alpha = + _mm_unpacklo_epi16(unpacked_alpha, unpacked_alpha); + + /* 0A0R0G0B0A0R0G0B -> src1 */ + src1 = _mm_unpacklo_epi8(pixels_src, mm_zero); + + /* 0A0R0G0B0A0R0G0B -> dst1 */ + dst1 = _mm_unpacklo_epi8(pixels_dst, mm_zero); + + /* (srcRGB - dstRGB) */ + sub_dst = _mm_sub_epi16(src1, dst1); + + /* (srcRGB - dstRGB) * srcA */ + sub_dst = _mm_mullo_epi16(sub_dst, unpacked_alpha); + + /* (srcRGB - dstRGB) * srcA + srcRGB */ + sub_dst = _mm_add_epi16(sub_dst, src1); + + /* (dstRGB << 8) */ + dst1 = _mm_slli_epi16(dst1, 8); + + /* ((dstRGB << 8) + (srcRGB - dstRGB) * srcA + srcRGB) */ + sub_dst = _mm_add_epi16(sub_dst, dst1); + + /* (((dstRGB << 8) + (srcRGB - dstRGB) * srcA + srcRGB) >> + * 8)*/ + batch_a_dst = _mm_srli_epi16(sub_dst, 8); + + /* + * BATCH B (the 2 high pixels) + */ + + unpacked_alpha = _mm_shufflehi_epi16(mm_src_alpha, 0b11110101); + + unpacked_alpha = + _mm_unpackhi_epi16(unpacked_alpha, unpacked_alpha); /* 0A0R0G0B0A0R0G0B -> src1 */ - src1 = _mm_unpacklo_epi8(src1, mm_zero); + src1 = _mm_unpackhi_epi8(pixels_src, mm_zero); - /* dst(ARGB) -> dst1 (00000000ARGBARGB) */ - LOAD_64_INTO_M128((Uint64 *)dstp32, &dst1); /* 0A0R0G0B0A0R0G0B -> dst1 */ - dst1 = _mm_unpacklo_epi8(dst1, mm_zero); + dst1 = _mm_unpackhi_epi8(pixels_dst, mm_zero); /* (srcRGB - dstRGB) */ sub_dst = _mm_sub_epi16(src1, dst1); /* (srcRGB - dstRGB) * srcA */ - sub_dst = _mm_mullo_epi16(sub_dst, mm_src_alpha); + sub_dst = _mm_mullo_epi16(sub_dst, unpacked_alpha); /* (srcRGB - dstRGB) * srcA + srcRGB */ sub_dst = _mm_add_epi16(sub_dst, src1); @@ -545,18 +595,26 @@ alphablit_alpha_sse2_argb_no_surf_alpha_opaque_dst(SDL_BlitInfo *info) * 8)*/ sub_dst = _mm_srli_epi16(sub_dst, 8); + /* + * Combine the batches and store + */ + /* pack everything back into a pixel with zeroed out alpha */ - sub_dst = _mm_packus_epi16(sub_dst, mm_zero); + sub_dst = _mm_packus_epi16(batch_a_dst, sub_dst); sub_dst = _mm_and_si128(sub_dst, mm_rgb_mask); - STORE_M128_INTO_64(&sub_dst, (Uint64 *)dstp32); - srcp32 += 2; - dstp32 += 2; + _mm_storeu_si128(dstp128, sub_dst); + + srcp128++; + dstp128++; }, - n, 
post_2_width); + n, post_4_width); + + srcp32 = (Uint32 *)srcp128; + dstp32 = (Uint32 *)dstp128; - for (int i = 0; i < pre_2_width; i++) { + for (int i = 0; i < pre_4_width; i++) { /* Do the actual blend */ /* src(ARGB) -> src1 (000000000000ARGB) */ src1 = _mm_cvtsi32_si128(*srcp32); From 8ee4f9b081535ab51f0cbef0063fc847ac7d05eb Mon Sep 17 00:00:00 2001 From: Starbuck5 <46412508+Starbuck5@users.noreply.github.com> Date: Tue, 12 Dec 2023 23:01:35 -0800 Subject: [PATCH 3/4] Move core alpha_opaque_dst op to macro --- src_c/simd_blitters_sse2.c | 85 ++++++++++++-------------------------- 1 file changed, 27 insertions(+), 58 deletions(-) diff --git a/src_c/simd_blitters_sse2.c b/src_c/simd_blitters_sse2.c index e51d43171b..fc618c3b58 100644 --- a/src_c/simd_blitters_sse2.c +++ b/src_c/simd_blitters_sse2.c @@ -470,6 +470,27 @@ alphablit_alpha_sse2_argb_no_surf_alpha(SDL_BlitInfo *info) } } +/* Defines the blit procedure at the core of + * alphablit_alpha_sse2_argb_no_surf_alpha_opaque_dst + * + * Input variables: src1, dst1, unpacked_alpha + * containing unpacked 16 bit lanes of src, dst, and src alpha + * Output variables: sub_dst + * */ +#define ARGB_NO_SURF_ALPHA_OPAQUE_DST_PROCEDURE \ + /* (srcRGB - dstRGB) */ \ + sub_dst = _mm_sub_epi16(src1, dst1); \ + /* (srcRGB - dstRGB) * srcA */ \ + sub_dst = _mm_mullo_epi16(sub_dst, unpacked_alpha); \ + /* (srcRGB - dstRGB) * srcA + srcRGB */ \ + sub_dst = _mm_add_epi16(sub_dst, src1); \ + /* (dstRGB << 8) */ \ + dst1 = _mm_slli_epi16(dst1, 8); \ + /* ((dstRGB << 8) + (srcRGB - dstRGB) * srcA + srcRGB) */ \ + sub_dst = _mm_add_epi16(sub_dst, dst1); \ + /* (((dstRGB << 8) + (srcRGB - dstRGB) * srcA + srcRGB) >> 8) */ \ + sub_dst = _mm_srli_epi16(sub_dst, 8); + void alphablit_alpha_sse2_argb_no_surf_alpha_opaque_dst(SDL_BlitInfo *info) { @@ -542,24 +563,9 @@ alphablit_alpha_sse2_argb_no_surf_alpha_opaque_dst(SDL_BlitInfo *info) /* 0A0R0G0B0A0R0G0B -> dst1 */ dst1 = _mm_unpacklo_epi8(pixels_dst, mm_zero); - /* (srcRGB - dstRGB) */ - sub_dst = _mm_sub_epi16(src1, dst1); - - /* (srcRGB - dstRGB) * srcA */ - sub_dst = _mm_mullo_epi16(sub_dst, unpacked_alpha); - - /* (srcRGB - dstRGB) * srcA + srcRGB */ - sub_dst = _mm_add_epi16(sub_dst, src1); - - /* (dstRGB << 8) */ - dst1 = _mm_slli_epi16(dst1, 8); - - /* ((dstRGB << 8) + (srcRGB - dstRGB) * srcA + srcRGB) */ - sub_dst = _mm_add_epi16(sub_dst, dst1); + ARGB_NO_SURF_ALPHA_OPAQUE_DST_PROCEDURE - /* (((dstRGB << 8) + (srcRGB - dstRGB) * srcA + srcRGB) >> - * 8)*/ - batch_a_dst = _mm_srli_epi16(sub_dst, 8); + batch_a_dst = sub_dst; /* * BATCH B (the 2 high pixels) @@ -576,34 +582,14 @@ alphablit_alpha_sse2_argb_no_surf_alpha_opaque_dst(SDL_BlitInfo *info) /* 0A0R0G0B0A0R0G0B -> dst1 */ dst1 = _mm_unpackhi_epi8(pixels_dst, mm_zero); - /* (srcRGB - dstRGB) */ - sub_dst = _mm_sub_epi16(src1, dst1); - - /* (srcRGB - dstRGB) * srcA */ - sub_dst = _mm_mullo_epi16(sub_dst, unpacked_alpha); - - /* (srcRGB - dstRGB) * srcA + srcRGB */ - sub_dst = _mm_add_epi16(sub_dst, src1); - - /* (dstRGB << 8) */ - dst1 = _mm_slli_epi16(dst1, 8); - - /* ((dstRGB << 8) + (srcRGB - dstRGB) * srcA + srcRGB) */ - sub_dst = _mm_add_epi16(sub_dst, dst1); - - /* (((dstRGB << 8) + (srcRGB - dstRGB) * srcA + srcRGB) >> - * 8)*/ - sub_dst = _mm_srli_epi16(sub_dst, 8); + ARGB_NO_SURF_ALPHA_OPAQUE_DST_PROCEDURE /* * Combine the batches and store - */ - - /* pack everything back into a pixel with zeroed out alpha + * pack everything back into a pixel with zeroed out alpha */ sub_dst = _mm_packus_epi16(batch_a_dst, sub_dst); sub_dst = 
_mm_and_si128(sub_dst, mm_rgb_mask); - _mm_storeu_si128(dstp128, sub_dst); srcp128++; @@ -625,7 +611,7 @@ alphablit_alpha_sse2_argb_no_surf_alpha_opaque_dst(SDL_BlitInfo *info) /* 0000000000000A0A -> rgb_src_alpha */ mm_src_alpha = _mm_unpacklo_epi16(mm_src_alpha, mm_src_alpha); /* 000000000A0A0A0A -> rgb_src_alpha */ - mm_src_alpha = _mm_unpacklo_epi32(mm_src_alpha, mm_src_alpha); + unpacked_alpha = _mm_unpacklo_epi32(mm_src_alpha, mm_src_alpha); /* 000000000A0R0G0B -> src1 */ src1 = _mm_unpacklo_epi8(src1, mm_zero); @@ -635,24 +621,7 @@ alphablit_alpha_sse2_argb_no_surf_alpha_opaque_dst(SDL_BlitInfo *info) /* 000000000A0R0G0B -> dst1 */ dst1 = _mm_unpacklo_epi8(dst1, mm_zero); - /* (srcRGB - dstRGB) */ - sub_dst = _mm_sub_epi16(src1, dst1); - - /* (srcRGB - dstRGB) * srcA */ - sub_dst = _mm_mullo_epi16(sub_dst, mm_src_alpha); - - /* (srcRGB - dstRGB) * srcA + srcRGB */ - sub_dst = _mm_add_epi16(sub_dst, src1); - - /* (dstRGB << 8) */ - dst1 = _mm_slli_epi16(dst1, 8); - - /* ((dstRGB << 8) + (srcRGB - dstRGB) * srcA + srcRGB) */ - sub_dst = _mm_add_epi16(sub_dst, dst1); - - /* (((dstRGB << 8) + (srcRGB - dstRGB) * srcA + srcRGB) >> - * 8)*/ - sub_dst = _mm_srli_epi16(sub_dst, 8); + ARGB_NO_SURF_ALPHA_OPAQUE_DST_PROCEDURE /* pack everything back into a pixel */ sub_dst = _mm_packus_epi16(sub_dst, mm_zero); From 1f1ebf0991c6895fb600d66b9064daf0b9b4c977 Mon Sep 17 00:00:00 2001 From: Starbuck5 <46412508+Starbuck5@users.noreply.github.com> Date: Tue, 12 Dec 2023 23:56:55 -0800 Subject: [PATCH 4/4] Comments pass for alpha_opaque_dst blitter --- src_c/simd_blitters_sse2.c | 74 ++++++++++++++++---------------------- 1 file changed, 31 insertions(+), 43 deletions(-) diff --git a/src_c/simd_blitters_sse2.c b/src_c/simd_blitters_sse2.c index fc618c3b58..3f329c80ab 100644 --- a/src_c/simd_blitters_sse2.c +++ b/src_c/simd_blitters_sse2.c @@ -503,8 +503,8 @@ alphablit_alpha_sse2_argb_no_surf_alpha_opaque_dst(SDL_BlitInfo *info) Uint32 *srcp32 = (Uint32 *)info->s_pixels; Uint32 *dstp32 = (Uint32 *)info->d_pixels; - int pre_4_width = width % 4; - int post_4_width = width / 4; + int pxl_excess = width % 4; + int n_iters_4 = width / 4; __m128i src1, dst1, sub_dst, mm_src_alpha; __m128i unpacked_alpha, pixels_src, pixels_dst, batch_a_dst; @@ -518,42 +518,30 @@ alphablit_alpha_sse2_argb_no_surf_alpha_opaque_dst(SDL_BlitInfo *info) LOOP_UNROLLED4( { - /* - * 4 pixel preparations - */ + /* ==== load 4 pixels into SSE registers ==== */ - /* src(ARGB) -> pixels_src (ARGBARGBARGBARGB) */ + /*[AR][GB][AR][GB][AR][GB][AR][GB] -> pixels_src*/ pixels_src = _mm_loadu_si128(srcp128); /* isolate alpha channels - * A1000A2000A3000A4000 -> mm_src_alpha */ + * [A10][00 ][A20][00 ][A30][00 ][A40][00 ] -> mm_src_alpha*/ mm_src_alpha = _mm_andnot_si128(mm_rgb_mask, pixels_src); /* shift right to position alpha channels for manipulation - * 0A1000A2000A3000A400 -> mm_src_alpha*/ + * [0A1][00 ][0A2][00 ][0A3][00 ][0A4][00 ] -> mm_src_alpha*/ mm_src_alpha = _mm_srli_si128(mm_src_alpha, 1); - /* dst(ARGB) -> pixels_dst (ARGBARGBARGBARGB) */ + /*[AR][GB][AR][GB][AR][GB][AR][GB] -> pixels_dst*/ pixels_dst = _mm_loadu_si128(dstp128); - /* - * BATCH A (the 2 low pixels) - */ + /* ==== BATCH A (the 2 low pixels) ==== */ /* shuffle alpha channels to duplicate 16 bit pairs - * shuffle (3, 3, 1, 1) (backed 2 bit numbers) - * [00 ][00 ][00 ][00 ][0A3][0A3][0A4][0A4] -> mm_src_alpha - * [ 7 ][ 6 ][ 5 ][ 4 ][ 3 ][ 2 ][ 1 ][ 0 ] - * Therefore the previous contents of 16 bit lane 1 - * Goes into 16 bit lanes 0 and 1, and the previous - * 
content of 16 bit lane 3 goes into lanes 2 and 3*/ + * [00 ][00 ][00 ][00 ][0A3][0A3][0A4][0A4] -> mm_src_alpha*/ unpacked_alpha = _mm_shufflelo_epi16(mm_src_alpha, 0b11110101); - /* finally move into final config - * spread out so they can be multiplied in 16 bit math - * against all RGBA of both pixels being blit - * [0A3][0A3][0A3][0A3][0A4][0A4][0A4][0A4] -> unpacked_alpha - */ + /* spread alpha into final config for 16 bit math + * [0A3][0A3][0A3][0A3][0A4][0A4][0A4][0A4] -> unpacked_alpha*/ unpacked_alpha = _mm_unpacklo_epi16(unpacked_alpha, unpacked_alpha); @@ -567,58 +555,58 @@ alphablit_alpha_sse2_argb_no_surf_alpha_opaque_dst(SDL_BlitInfo *info) batch_a_dst = sub_dst; - /* - * BATCH B (the 2 high pixels) - */ + /* ==== BATCH B (the 2 high pixels) ==== */ + /*[00 ][00 ][00 ][00 ][0A1][0A1][0A2][0A2] -> unpacked_alpha*/ unpacked_alpha = _mm_shufflehi_epi16(mm_src_alpha, 0b11110101); + /*[0A1][0A1][0A1][0A1][0A2][0A2][0A2][0A2] -> unpacked_alpha*/ unpacked_alpha = _mm_unpackhi_epi16(unpacked_alpha, unpacked_alpha); - /* 0A0R0G0B0A0R0G0B -> src1 */ + /*[0A][0R][0G][0B][0A][0R][0G][0B] -> src1*/ src1 = _mm_unpackhi_epi8(pixels_src, mm_zero); - /* 0A0R0G0B0A0R0G0B -> dst1 */ + /*[0A][0R][0G][0B][0A][0R][0G][0B] -> dst1*/ dst1 = _mm_unpackhi_epi8(pixels_dst, mm_zero); ARGB_NO_SURF_ALPHA_OPAQUE_DST_PROCEDURE - /* - * Combine the batches and store - * pack everything back into a pixel with zeroed out alpha - */ + /* ==== combine batches and store ==== */ + sub_dst = _mm_packus_epi16(batch_a_dst, sub_dst); + /* zero out alpha */ sub_dst = _mm_and_si128(sub_dst, mm_rgb_mask); _mm_storeu_si128(dstp128, sub_dst); srcp128++; dstp128++; }, - n, post_4_width); + n, n_iters_4); srcp32 = (Uint32 *)srcp128; dstp32 = (Uint32 *)dstp128; - for (int i = 0; i < pre_4_width; i++) { - /* Do the actual blend */ - /* src(ARGB) -> src1 (000000000000ARGB) */ + for (int i = 0; i < pxl_excess; i++) { + /*[00][00][00][00][00][00][AR][GB] -> src1*/ src1 = _mm_cvtsi32_si128(*srcp32); - /* src1 >> ashift -> mm_src_alpha(000000000000000A) */ + + /*[00][00][00][00][00][00][00][0A] -> mm_src_alpha*/ mm_src_alpha = _mm_srli_si128(src1, 3); - /* Then Calc RGB */ - /* 0000000000000A0A -> rgb_src_alpha */ + /*[00][00][00][00][00][00][0A][0A] -> mm_src_alpha*/ mm_src_alpha = _mm_unpacklo_epi16(mm_src_alpha, mm_src_alpha); - /* 000000000A0A0A0A -> rgb_src_alpha */ + + /*[00][00][00][00][0A][0A][0A][0A] -> mm_src_alpha*/ unpacked_alpha = _mm_unpacklo_epi32(mm_src_alpha, mm_src_alpha); - /* 000000000A0R0G0B -> src1 */ + /*[00][00][00][00][0A][0R][0G][0B] -> src1*/ src1 = _mm_unpacklo_epi8(src1, mm_zero); - /* dst(ARGB) -> dst1 (000000000000ARGB) */ + /*[00][00][00][00][00][00][AR][GB] -> dst1*/ dst1 = _mm_cvtsi32_si128(*dstp32); - /* 000000000A0R0G0B -> dst1 */ + + /*[00][00][00][00][0A][0R][0G][0B] -> dst1*/ dst1 = _mm_unpacklo_epi8(dst1, mm_zero); ARGB_NO_SURF_ALPHA_OPAQUE_DST_PROCEDURE
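For readers tracing the intrinsics, below is a minimal scalar sketch of what this blitter computes. It is not part of any patch above, and the function and variable names are illustrative only. It assumes 0xAARRGGBB pixels (as the ARGB comments indicate) and mirrors the per-channel formula in ARGB_NO_SURF_ALPHA_OPAQUE_DST_PROCEDURE plus the width/4 main loop and width%4 remainder structure that replaces the old even/odd split.

#include <stdint.h>

/* Per-channel blend matching the macro:
 * ((dstC << 8) + (srcC - dstC) * srcA + srcC) >> 8,
 * with the destination alpha byte forced to zero
 * (the _mm_and_si128 against 0x00FFFFFF). */
static uint32_t
blend_argb_opaque_dst_scalar(uint32_t src, uint32_t dst)
{
    int srcA = (int)(src >> 24);
    uint32_t out = 0;
    int shift;

    for (shift = 0; shift < 24; shift += 8) {
        int s = (int)((src >> shift) & 0xFF);
        int d = (int)((dst >> shift) & 0xFF);
        int c = ((d << 8) + (s - d) * srcA + s) >> 8;
        out |= (uint32_t)c << shift;
    }
    return out; /* alpha byte stays 0 */
}

/* Row loop shape after the patches: width/4 iterations that the real
 * code handles 4 pixels at a time with unaligned SSE2 loads/stores,
 * then the width%4 leftover pixels handled one at a time. */
static void
blit_row_scalar_sketch(uint32_t *srcp32, uint32_t *dstp32, int width)
{
    int n_iters_4 = width / 4;
    int pxl_excess = width % 4;
    int i, j;

    for (i = 0; i < n_iters_4; i++) {
        for (j = 0; j < 4; j++, srcp32++, dstp32++) {
            *dstp32 = blend_argb_opaque_dst_scalar(*srcp32, *dstp32);
        }
    }
    for (i = 0; i < pxl_excess; i++, srcp32++, dstp32++) {
        *dstp32 = blend_argb_opaque_dst_scalar(*srcp32, *dstp32);
    }
}

Handling the leftover pixels with a short scalar tail on each row is what lets even and odd widths share one SSE2 loop: the unaligned _mm_loadu_si128 / _mm_storeu_si128 accesses remove the width, srcskip, and dstskip parity checks that the old two-path version depended on.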