From 34c0b6d55e34f9106f2086e077674d3d7293eeb1 Mon Sep 17 00:00:00 2001 From: Starbuck5 <46412508+Starbuck5@users.noreply.github.com> Date: Mon, 11 Dec 2023 23:23:37 -0800 Subject: [PATCH 1/4] Unify even/odd width blit path Benchmarks show a small decline on even surfaces, big (nearly 2x) improvement on odd surfaces. --- src_c/simd_blitters_sse2.c | 249 ++++++++++++++++--------------------- 1 file changed, 110 insertions(+), 139 deletions(-) diff --git a/src_c/simd_blitters_sse2.c b/src_c/simd_blitters_sse2.c index 0ed96a4847..04c3f964ac 100644 --- a/src_c/simd_blitters_sse2.c +++ b/src_c/simd_blitters_sse2.c @@ -479,164 +479,135 @@ alphablit_alpha_sse2_argb_no_surf_alpha_opaque_dst(SDL_BlitInfo *info) int srcskip = info->s_skip >> 2; int dstskip = info->d_skip >> 2; - Uint64 *srcp64 = (Uint64 *)info->s_pixels; - Uint64 *dstp64 = (Uint64 *)info->d_pixels; - - Uint64 rgb_mask64 = 0x00FFFFFF00FFFFFF; - Uint32 rgb_mask32 = 0x00FFFFFF; - Uint32 *srcp32 = (Uint32 *)info->s_pixels; Uint32 *dstp32 = (Uint32 *)info->d_pixels; - __m128i src1, dst1, sub_dst, mm_src_alpha, mm_zero, mm_rgb_mask; + int pre_2_width = width % 2; + int post_2_width = width / 2; - /* There are two paths through this blitter: - 1. Two pixels at once. - 2. One pixel at a time. - */ - if (((width % 2) == 0) && ((srcskip % 2) == 0) && ((dstskip % 2) == 0)) { - width = width / 2; - srcskip = srcskip / 2; - dstskip = dstskip / 2; + __m128i src1, dst1, sub_dst, mm_src_alpha; + __m128i mm_rgb_mask = _mm_set1_epi32(0x00FFFFFF); + __m128i mm_zero = _mm_setzero_si128(); - mm_zero = _mm_setzero_si128(); + while (height--) { + LOOP_UNROLLED4( + { + /* src(ARGB) -> src1 (00000000ARGBARGB) */ + LOAD_64_INTO_M128((Uint64 *)srcp32, &src1); + + /* isolate alpha channels + * 00000000A1000A2000 -> mm_src_alpha */ + mm_src_alpha = _mm_andnot_si128(mm_rgb_mask, src1); + + /* shift right to position alpha channels for manipulation + * 000000000A1000A200 -> mm_src_alpha*/ + mm_src_alpha = _mm_srli_si128(mm_src_alpha, 1); + + /* shuffle alpha channels to duplicate 16 bit pairs + * shuffle (3, 3, 1, 1) (backed 2 bit numbers) + * [00][00][00][00][0A1][00][0A2][00] -> mm_src_alpha + * [7 ][6 ][5 ][4 ][ 3 ][2 ][ 1 ][0 ] + * Therefore the previous contents of 16 bit number #1 + * Goes into 16 bit number #1 and #2, and the previous + * content of 16 bit number #3 goes into #2 and #3 */ + mm_src_alpha = _mm_shufflelo_epi16(mm_src_alpha, 0b11110101); + + /* finally move into final config + * spread out so they can be multiplied in 16 bit math + * against all RGBA of both pixels being blit + * 0A10A10A10A10A20A20A20A2 -> mm_src_alpha */ + mm_src_alpha = _mm_unpacklo_epi16(mm_src_alpha, mm_src_alpha); + + /* 0A0R0G0B0A0R0G0B -> src1 */ + src1 = _mm_unpacklo_epi8(src1, mm_zero); + + /* dst(ARGB) -> dst1 (00000000ARGBARGB) */ + LOAD_64_INTO_M128((Uint64 *)dstp32, &dst1); + /* 0A0R0G0B0A0R0G0B -> dst1 */ + dst1 = _mm_unpacklo_epi8(dst1, mm_zero); + + /* (srcRGB - dstRGB) */ + sub_dst = _mm_sub_epi16(src1, dst1); + + /* (srcRGB - dstRGB) * srcA */ + sub_dst = _mm_mullo_epi16(sub_dst, mm_src_alpha); + + /* (srcRGB - dstRGB) * srcA + srcRGB */ + sub_dst = _mm_add_epi16(sub_dst, src1); + + /* (dstRGB << 8) */ + dst1 = _mm_slli_epi16(dst1, 8); + + /* ((dstRGB << 8) + (srcRGB - dstRGB) * srcA + srcRGB) */ + sub_dst = _mm_add_epi16(sub_dst, dst1); + + /* (((dstRGB << 8) + (srcRGB - dstRGB) * srcA + srcRGB) >> + * 8)*/ + sub_dst = _mm_srli_epi16(sub_dst, 8); + + /* pack everything back into a pixel with zeroed out alpha + */ + sub_dst = _mm_packus_epi16(sub_dst, 
mm_zero); + sub_dst = _mm_and_si128(sub_dst, mm_rgb_mask); + STORE_M128_INTO_64(&sub_dst, (Uint64 *)dstp32); + + srcp32 += 2; + dstp32 += 2; + }, + n, post_2_width); - /* two pixels at a time */ - LOAD_64_INTO_M128(&rgb_mask64, &mm_rgb_mask); - while (height--) { - LOOP_UNROLLED4( - { - /* src(ARGB) -> src1 (00000000ARGBARGB) */ - LOAD_64_INTO_M128(srcp64, &src1); + for (int i = 0; i < pre_2_width; i++) { + /* Do the actual blend */ + /* src(ARGB) -> src1 (000000000000ARGB) */ + src1 = _mm_cvtsi32_si128(*srcp32); + /* src1 >> ashift -> mm_src_alpha(000000000000000A) */ + mm_src_alpha = _mm_srli_si128(src1, 3); - /* isolate alpha channels - * 00000000A1000A2000 -> mm_src_alpha */ - mm_src_alpha = _mm_andnot_si128(mm_rgb_mask, src1); + /* Then Calc RGB */ + /* 0000000000000A0A -> rgb_src_alpha */ + mm_src_alpha = _mm_unpacklo_epi16(mm_src_alpha, mm_src_alpha); + /* 000000000A0A0A0A -> rgb_src_alpha */ + mm_src_alpha = _mm_unpacklo_epi32(mm_src_alpha, mm_src_alpha); - /* shift right to position alpha channels for manipulation - * 000000000A1000A200 -> mm_src_alpha*/ - mm_src_alpha = _mm_srli_si128(mm_src_alpha, 1); + /* 000000000A0R0G0B -> src1 */ + src1 = _mm_unpacklo_epi8(src1, mm_zero); - /* shuffle alpha channels to duplicate 16 bit pairs - * shuffle (3, 3, 1, 1) (backed 2 bit numbers) - * [00][00][00][00][0A1][00][0A2][00] -> mm_src_alpha - * [7 ][6 ][5 ][4 ][ 3 ][2 ][ 1 ][0 ] - * Therefore the previous contents of 16 bit number #1 - * Goes into 16 bit number #1 and #2, and the previous - * content of 16 bit number #3 goes into #2 and #3 */ - mm_src_alpha = - _mm_shufflelo_epi16(mm_src_alpha, 0b11110101); + /* dst(ARGB) -> dst1 (000000000000ARGB) */ + dst1 = _mm_cvtsi32_si128(*dstp32); + /* 000000000A0R0G0B -> dst1 */ + dst1 = _mm_unpacklo_epi8(dst1, mm_zero); - /* finally move into final config - * spread out so they can be multiplied in 16 bit math - * against all RGBA of both pixels being blit - * 0A10A10A10A10A20A20A20A2 -> mm_src_alpha */ - mm_src_alpha = - _mm_unpacklo_epi16(mm_src_alpha, mm_src_alpha); + /* (srcRGB - dstRGB) */ + sub_dst = _mm_sub_epi16(src1, dst1); - /* 0A0R0G0B0A0R0G0B -> src1 */ - src1 = _mm_unpacklo_epi8(src1, mm_zero); + /* (srcRGB - dstRGB) * srcA */ + sub_dst = _mm_mullo_epi16(sub_dst, mm_src_alpha); - /* dst(ARGB) -> dst1 (00000000ARGBARGB) */ - LOAD_64_INTO_M128(dstp64, &dst1); - /* 0A0R0G0B0A0R0G0B -> dst1 */ - dst1 = _mm_unpacklo_epi8(dst1, mm_zero); + /* (srcRGB - dstRGB) * srcA + srcRGB */ + sub_dst = _mm_add_epi16(sub_dst, src1); - /* (srcRGB - dstRGB) */ - sub_dst = _mm_sub_epi16(src1, dst1); - - /* (srcRGB - dstRGB) * srcA */ - sub_dst = _mm_mullo_epi16(sub_dst, mm_src_alpha); - - /* (srcRGB - dstRGB) * srcA + srcRGB */ - sub_dst = _mm_add_epi16(sub_dst, src1); - - /* (dstRGB << 8) */ - dst1 = _mm_slli_epi16(dst1, 8); + /* (dstRGB << 8) */ + dst1 = _mm_slli_epi16(dst1, 8); - /* ((dstRGB << 8) + (srcRGB - dstRGB) * srcA + srcRGB) */ - sub_dst = _mm_add_epi16(sub_dst, dst1); + /* ((dstRGB << 8) + (srcRGB - dstRGB) * srcA + srcRGB) */ + sub_dst = _mm_add_epi16(sub_dst, dst1); - /* (((dstRGB << 8) + (srcRGB - dstRGB) * srcA + srcRGB) >> - * 8)*/ - sub_dst = _mm_srli_epi16(sub_dst, 8); + /* (((dstRGB << 8) + (srcRGB - dstRGB) * srcA + srcRGB) >> + * 8)*/ + sub_dst = _mm_srli_epi16(sub_dst, 8); - /* pack everything back into a pixel with zeroed out alpha - */ - sub_dst = _mm_packus_epi16(sub_dst, mm_zero); - sub_dst = _mm_and_si128(sub_dst, mm_rgb_mask); - STORE_M128_INTO_64(&sub_dst, dstp64); + /* pack everything back into a pixel */ + sub_dst = 
_mm_packus_epi16(sub_dst, mm_zero); + sub_dst = _mm_and_si128(sub_dst, mm_rgb_mask); + /* reset alpha to 0 */ + *dstp32 = _mm_cvtsi128_si32(sub_dst); - ++srcp64; - ++dstp64; - }, - n, width); - srcp64 += srcskip; - dstp64 += dstskip; + srcp32++; + dstp32++; } - } - else { - /* one pixel at a time */ - mm_zero = _mm_setzero_si128(); - mm_rgb_mask = _mm_cvtsi32_si128(rgb_mask32); - - while (height--) { - LOOP_UNROLLED4( - { - /* Do the actual blend */ - /* src(ARGB) -> src1 (000000000000ARGB) */ - src1 = _mm_cvtsi32_si128(*srcp32); - /* src1 >> ashift -> mm_src_alpha(000000000000000A) */ - mm_src_alpha = _mm_srli_si128(src1, 3); - - /* Then Calc RGB */ - /* 0000000000000A0A -> rgb_src_alpha */ - mm_src_alpha = - _mm_unpacklo_epi16(mm_src_alpha, mm_src_alpha); - /* 000000000A0A0A0A -> rgb_src_alpha */ - mm_src_alpha = - _mm_unpacklo_epi32(mm_src_alpha, mm_src_alpha); - - /* 000000000A0R0G0B -> src1 */ - src1 = _mm_unpacklo_epi8(src1, mm_zero); - - /* dst(ARGB) -> dst1 (000000000000ARGB) */ - dst1 = _mm_cvtsi32_si128(*dstp32); - /* 000000000A0R0G0B -> dst1 */ - dst1 = _mm_unpacklo_epi8(dst1, mm_zero); - - /* (srcRGB - dstRGB) */ - sub_dst = _mm_sub_epi16(src1, dst1); - - /* (srcRGB - dstRGB) * srcA */ - sub_dst = _mm_mullo_epi16(sub_dst, mm_src_alpha); - - /* (srcRGB - dstRGB) * srcA + srcRGB */ - sub_dst = _mm_add_epi16(sub_dst, src1); - - /* (dstRGB << 8) */ - dst1 = _mm_slli_epi16(dst1, 8); - - /* ((dstRGB << 8) + (srcRGB - dstRGB) * srcA + srcRGB) */ - sub_dst = _mm_add_epi16(sub_dst, dst1); - - /* (((dstRGB << 8) + (srcRGB - dstRGB) * srcA + srcRGB) >> - * 8)*/ - sub_dst = _mm_srli_epi16(sub_dst, 8); - /* pack everything back into a pixel */ - sub_dst = _mm_packus_epi16(sub_dst, mm_zero); - sub_dst = _mm_and_si128(sub_dst, mm_rgb_mask); - /* reset alpha to 0 */ - *dstp32 = _mm_cvtsi128_si32(sub_dst); - - ++srcp32; - ++dstp32; - }, - n, width); - srcp32 += srcskip; - dstp32 += dstskip; - } + srcp32 += srcskip; + dstp32 += dstskip; } } From c43140e72f139d6364360d292862f140d79fcddd Mon Sep 17 00:00:00 2001 From: Starbuck5 <46412508+Starbuck5@users.noreply.github.com> Date: Tue, 12 Dec 2023 22:18:15 -0800 Subject: [PATCH 2/4] Upgrade alpha blitter from 2px to 4px batches --- src_c/simd_blitters_sse2.c | 110 ++++++++++++++++++++++++++++--------- 1 file changed, 84 insertions(+), 26 deletions(-) diff --git a/src_c/simd_blitters_sse2.c b/src_c/simd_blitters_sse2.c index 04c3f964ac..e51d43171b 100644 --- a/src_c/simd_blitters_sse2.c +++ b/src_c/simd_blitters_sse2.c @@ -482,55 +482,105 @@ alphablit_alpha_sse2_argb_no_surf_alpha_opaque_dst(SDL_BlitInfo *info) Uint32 *srcp32 = (Uint32 *)info->s_pixels; Uint32 *dstp32 = (Uint32 *)info->d_pixels; - int pre_2_width = width % 2; - int post_2_width = width / 2; + int pre_4_width = width % 4; + int post_4_width = width / 4; __m128i src1, dst1, sub_dst, mm_src_alpha; + __m128i unpacked_alpha, pixels_src, pixels_dst, batch_a_dst; + __m128i *srcp128, *dstp128; __m128i mm_rgb_mask = _mm_set1_epi32(0x00FFFFFF); __m128i mm_zero = _mm_setzero_si128(); while (height--) { + srcp128 = (__m128i *)srcp32; + dstp128 = (__m128i *)dstp32; + LOOP_UNROLLED4( { - /* src(ARGB) -> src1 (00000000ARGBARGB) */ - LOAD_64_INTO_M128((Uint64 *)srcp32, &src1); + /* + * 4 pixel preparations + */ + + /* src(ARGB) -> pixels_src (ARGBARGBARGBARGB) */ + pixels_src = _mm_loadu_si128(srcp128); /* isolate alpha channels - * 00000000A1000A2000 -> mm_src_alpha */ - mm_src_alpha = _mm_andnot_si128(mm_rgb_mask, src1); + * A1000A2000A3000A4000 -> mm_src_alpha */ + mm_src_alpha = 
_mm_andnot_si128(mm_rgb_mask, pixels_src); /* shift right to position alpha channels for manipulation - * 000000000A1000A200 -> mm_src_alpha*/ + * 0A1000A2000A3000A400 -> mm_src_alpha*/ mm_src_alpha = _mm_srli_si128(mm_src_alpha, 1); + /* dst(ARGB) -> pixels_dst (ARGBARGBARGBARGB) */ + pixels_dst = _mm_loadu_si128(dstp128); + + /* + * BATCH A (the 2 low pixels) + */ + /* shuffle alpha channels to duplicate 16 bit pairs * shuffle (3, 3, 1, 1) (backed 2 bit numbers) - * [00][00][00][00][0A1][00][0A2][00] -> mm_src_alpha - * [7 ][6 ][5 ][4 ][ 3 ][2 ][ 1 ][0 ] - * Therefore the previous contents of 16 bit number #1 - * Goes into 16 bit number #1 and #2, and the previous - * content of 16 bit number #3 goes into #2 and #3 */ - mm_src_alpha = _mm_shufflelo_epi16(mm_src_alpha, 0b11110101); + * [00 ][00 ][00 ][00 ][0A3][0A3][0A4][0A4] -> mm_src_alpha + * [ 7 ][ 6 ][ 5 ][ 4 ][ 3 ][ 2 ][ 1 ][ 0 ] + * Therefore the previous contents of 16 bit lane 1 + * Goes into 16 bit lanes 0 and 1, and the previous + * content of 16 bit lane 3 goes into lanes 2 and 3*/ + unpacked_alpha = _mm_shufflelo_epi16(mm_src_alpha, 0b11110101); /* finally move into final config * spread out so they can be multiplied in 16 bit math * against all RGBA of both pixels being blit - * 0A10A10A10A10A20A20A20A2 -> mm_src_alpha */ - mm_src_alpha = _mm_unpacklo_epi16(mm_src_alpha, mm_src_alpha); + * [0A3][0A3][0A3][0A3][0A4][0A4][0A4][0A4] -> unpacked_alpha + */ + unpacked_alpha = + _mm_unpacklo_epi16(unpacked_alpha, unpacked_alpha); + + /* 0A0R0G0B0A0R0G0B -> src1 */ + src1 = _mm_unpacklo_epi8(pixels_src, mm_zero); + + /* 0A0R0G0B0A0R0G0B -> dst1 */ + dst1 = _mm_unpacklo_epi8(pixels_dst, mm_zero); + + /* (srcRGB - dstRGB) */ + sub_dst = _mm_sub_epi16(src1, dst1); + + /* (srcRGB - dstRGB) * srcA */ + sub_dst = _mm_mullo_epi16(sub_dst, unpacked_alpha); + + /* (srcRGB - dstRGB) * srcA + srcRGB */ + sub_dst = _mm_add_epi16(sub_dst, src1); + + /* (dstRGB << 8) */ + dst1 = _mm_slli_epi16(dst1, 8); + + /* ((dstRGB << 8) + (srcRGB - dstRGB) * srcA + srcRGB) */ + sub_dst = _mm_add_epi16(sub_dst, dst1); + + /* (((dstRGB << 8) + (srcRGB - dstRGB) * srcA + srcRGB) >> + * 8)*/ + batch_a_dst = _mm_srli_epi16(sub_dst, 8); + + /* + * BATCH B (the 2 high pixels) + */ + + unpacked_alpha = _mm_shufflehi_epi16(mm_src_alpha, 0b11110101); + + unpacked_alpha = + _mm_unpackhi_epi16(unpacked_alpha, unpacked_alpha); /* 0A0R0G0B0A0R0G0B -> src1 */ - src1 = _mm_unpacklo_epi8(src1, mm_zero); + src1 = _mm_unpackhi_epi8(pixels_src, mm_zero); - /* dst(ARGB) -> dst1 (00000000ARGBARGB) */ - LOAD_64_INTO_M128((Uint64 *)dstp32, &dst1); /* 0A0R0G0B0A0R0G0B -> dst1 */ - dst1 = _mm_unpacklo_epi8(dst1, mm_zero); + dst1 = _mm_unpackhi_epi8(pixels_dst, mm_zero); /* (srcRGB - dstRGB) */ sub_dst = _mm_sub_epi16(src1, dst1); /* (srcRGB - dstRGB) * srcA */ - sub_dst = _mm_mullo_epi16(sub_dst, mm_src_alpha); + sub_dst = _mm_mullo_epi16(sub_dst, unpacked_alpha); /* (srcRGB - dstRGB) * srcA + srcRGB */ sub_dst = _mm_add_epi16(sub_dst, src1); @@ -545,18 +595,26 @@ alphablit_alpha_sse2_argb_no_surf_alpha_opaque_dst(SDL_BlitInfo *info) * 8)*/ sub_dst = _mm_srli_epi16(sub_dst, 8); + /* + * Combine the batches and store + */ + /* pack everything back into a pixel with zeroed out alpha */ - sub_dst = _mm_packus_epi16(sub_dst, mm_zero); + sub_dst = _mm_packus_epi16(batch_a_dst, sub_dst); sub_dst = _mm_and_si128(sub_dst, mm_rgb_mask); - STORE_M128_INTO_64(&sub_dst, (Uint64 *)dstp32); - srcp32 += 2; - dstp32 += 2; + _mm_storeu_si128(dstp128, sub_dst); + + srcp128++; + dstp128++; }, - n, 
post_2_width); + n, post_4_width); + + srcp32 = (Uint32 *)srcp128; + dstp32 = (Uint32 *)dstp128; - for (int i = 0; i < pre_2_width; i++) { + for (int i = 0; i < pre_4_width; i++) { /* Do the actual blend */ /* src(ARGB) -> src1 (000000000000ARGB) */ src1 = _mm_cvtsi32_si128(*srcp32); From 8ee4f9b081535ab51f0cbef0063fc847ac7d05eb Mon Sep 17 00:00:00 2001 From: Starbuck5 <46412508+Starbuck5@users.noreply.github.com> Date: Tue, 12 Dec 2023 23:01:35 -0800 Subject: [PATCH 3/4] Move core alpha_opaque_dst op to macro --- src_c/simd_blitters_sse2.c | 85 ++++++++++++-------------------------- 1 file changed, 27 insertions(+), 58 deletions(-) diff --git a/src_c/simd_blitters_sse2.c b/src_c/simd_blitters_sse2.c index e51d43171b..fc618c3b58 100644 --- a/src_c/simd_blitters_sse2.c +++ b/src_c/simd_blitters_sse2.c @@ -470,6 +470,27 @@ alphablit_alpha_sse2_argb_no_surf_alpha(SDL_BlitInfo *info) } } +/* Defines the blit procedure at the core of + * alphablit_alpha_sse2_argb_no_surf_alpha_opaque_dst + * + * Input variables: src1, dst1, unpacked_alpha + * containing unpacked 16 bit lanes of src, dst, and src alpha + * Output variables: sub_dst + * */ +#define ARGB_NO_SURF_ALPHA_OPAQUE_DST_PROCEDURE \ + /* (srcRGB - dstRGB) */ \ + sub_dst = _mm_sub_epi16(src1, dst1); \ + /* (srcRGB - dstRGB) * srcA */ \ + sub_dst = _mm_mullo_epi16(sub_dst, unpacked_alpha); \ + /* (srcRGB - dstRGB) * srcA + srcRGB */ \ + sub_dst = _mm_add_epi16(sub_dst, src1); \ + /* (dstRGB << 8) */ \ + dst1 = _mm_slli_epi16(dst1, 8); \ + /* ((dstRGB << 8) + (srcRGB - dstRGB) * srcA + srcRGB) */ \ + sub_dst = _mm_add_epi16(sub_dst, dst1); \ + /* (((dstRGB << 8) + (srcRGB - dstRGB) * srcA + srcRGB) >> 8) */ \ + sub_dst = _mm_srli_epi16(sub_dst, 8); + void alphablit_alpha_sse2_argb_no_surf_alpha_opaque_dst(SDL_BlitInfo *info) { @@ -542,24 +563,9 @@ alphablit_alpha_sse2_argb_no_surf_alpha_opaque_dst(SDL_BlitInfo *info) /* 0A0R0G0B0A0R0G0B -> dst1 */ dst1 = _mm_unpacklo_epi8(pixels_dst, mm_zero); - /* (srcRGB - dstRGB) */ - sub_dst = _mm_sub_epi16(src1, dst1); - - /* (srcRGB - dstRGB) * srcA */ - sub_dst = _mm_mullo_epi16(sub_dst, unpacked_alpha); - - /* (srcRGB - dstRGB) * srcA + srcRGB */ - sub_dst = _mm_add_epi16(sub_dst, src1); - - /* (dstRGB << 8) */ - dst1 = _mm_slli_epi16(dst1, 8); - - /* ((dstRGB << 8) + (srcRGB - dstRGB) * srcA + srcRGB) */ - sub_dst = _mm_add_epi16(sub_dst, dst1); + ARGB_NO_SURF_ALPHA_OPAQUE_DST_PROCEDURE - /* (((dstRGB << 8) + (srcRGB - dstRGB) * srcA + srcRGB) >> - * 8)*/ - batch_a_dst = _mm_srli_epi16(sub_dst, 8); + batch_a_dst = sub_dst; /* * BATCH B (the 2 high pixels) @@ -576,34 +582,14 @@ alphablit_alpha_sse2_argb_no_surf_alpha_opaque_dst(SDL_BlitInfo *info) /* 0A0R0G0B0A0R0G0B -> dst1 */ dst1 = _mm_unpackhi_epi8(pixels_dst, mm_zero); - /* (srcRGB - dstRGB) */ - sub_dst = _mm_sub_epi16(src1, dst1); - - /* (srcRGB - dstRGB) * srcA */ - sub_dst = _mm_mullo_epi16(sub_dst, unpacked_alpha); - - /* (srcRGB - dstRGB) * srcA + srcRGB */ - sub_dst = _mm_add_epi16(sub_dst, src1); - - /* (dstRGB << 8) */ - dst1 = _mm_slli_epi16(dst1, 8); - - /* ((dstRGB << 8) + (srcRGB - dstRGB) * srcA + srcRGB) */ - sub_dst = _mm_add_epi16(sub_dst, dst1); - - /* (((dstRGB << 8) + (srcRGB - dstRGB) * srcA + srcRGB) >> - * 8)*/ - sub_dst = _mm_srli_epi16(sub_dst, 8); + ARGB_NO_SURF_ALPHA_OPAQUE_DST_PROCEDURE /* * Combine the batches and store - */ - - /* pack everything back into a pixel with zeroed out alpha + * pack everything back into a pixel with zeroed out alpha */ sub_dst = _mm_packus_epi16(batch_a_dst, sub_dst); sub_dst = 
_mm_and_si128(sub_dst, mm_rgb_mask); - _mm_storeu_si128(dstp128, sub_dst); srcp128++; @@ -625,7 +611,7 @@ alphablit_alpha_sse2_argb_no_surf_alpha_opaque_dst(SDL_BlitInfo *info) /* 0000000000000A0A -> rgb_src_alpha */ mm_src_alpha = _mm_unpacklo_epi16(mm_src_alpha, mm_src_alpha); /* 000000000A0A0A0A -> rgb_src_alpha */ - mm_src_alpha = _mm_unpacklo_epi32(mm_src_alpha, mm_src_alpha); + unpacked_alpha = _mm_unpacklo_epi32(mm_src_alpha, mm_src_alpha); /* 000000000A0R0G0B -> src1 */ src1 = _mm_unpacklo_epi8(src1, mm_zero); @@ -635,24 +621,7 @@ alphablit_alpha_sse2_argb_no_surf_alpha_opaque_dst(SDL_BlitInfo *info) /* 000000000A0R0G0B -> dst1 */ dst1 = _mm_unpacklo_epi8(dst1, mm_zero); - /* (srcRGB - dstRGB) */ - sub_dst = _mm_sub_epi16(src1, dst1); - - /* (srcRGB - dstRGB) * srcA */ - sub_dst = _mm_mullo_epi16(sub_dst, mm_src_alpha); - - /* (srcRGB - dstRGB) * srcA + srcRGB */ - sub_dst = _mm_add_epi16(sub_dst, src1); - - /* (dstRGB << 8) */ - dst1 = _mm_slli_epi16(dst1, 8); - - /* ((dstRGB << 8) + (srcRGB - dstRGB) * srcA + srcRGB) */ - sub_dst = _mm_add_epi16(sub_dst, dst1); - - /* (((dstRGB << 8) + (srcRGB - dstRGB) * srcA + srcRGB) >> - * 8)*/ - sub_dst = _mm_srli_epi16(sub_dst, 8); + ARGB_NO_SURF_ALPHA_OPAQUE_DST_PROCEDURE /* pack everything back into a pixel */ sub_dst = _mm_packus_epi16(sub_dst, mm_zero); From 1f1ebf0991c6895fb600d66b9064daf0b9b4c977 Mon Sep 17 00:00:00 2001 From: Starbuck5 <46412508+Starbuck5@users.noreply.github.com> Date: Tue, 12 Dec 2023 23:56:55 -0800 Subject: [PATCH 4/4] Comments pass for alpha_opaque_dst blitter --- src_c/simd_blitters_sse2.c | 74 ++++++++++++++++---------------------- 1 file changed, 31 insertions(+), 43 deletions(-) diff --git a/src_c/simd_blitters_sse2.c b/src_c/simd_blitters_sse2.c index fc618c3b58..3f329c80ab 100644 --- a/src_c/simd_blitters_sse2.c +++ b/src_c/simd_blitters_sse2.c @@ -503,8 +503,8 @@ alphablit_alpha_sse2_argb_no_surf_alpha_opaque_dst(SDL_BlitInfo *info) Uint32 *srcp32 = (Uint32 *)info->s_pixels; Uint32 *dstp32 = (Uint32 *)info->d_pixels; - int pre_4_width = width % 4; - int post_4_width = width / 4; + int pxl_excess = width % 4; + int n_iters_4 = width / 4; __m128i src1, dst1, sub_dst, mm_src_alpha; __m128i unpacked_alpha, pixels_src, pixels_dst, batch_a_dst; @@ -518,42 +518,30 @@ alphablit_alpha_sse2_argb_no_surf_alpha_opaque_dst(SDL_BlitInfo *info) LOOP_UNROLLED4( { - /* - * 4 pixel preparations - */ + /* ==== load 4 pixels into SSE registers ==== */ - /* src(ARGB) -> pixels_src (ARGBARGBARGBARGB) */ + /*[AR][GB][AR][GB][AR][GB][AR][GB] -> pixels_src*/ pixels_src = _mm_loadu_si128(srcp128); /* isolate alpha channels - * A1000A2000A3000A4000 -> mm_src_alpha */ + * [A10][00 ][A20][00 ][A30][00 ][A40][00 ] -> mm_src_alpha*/ mm_src_alpha = _mm_andnot_si128(mm_rgb_mask, pixels_src); /* shift right to position alpha channels for manipulation - * 0A1000A2000A3000A400 -> mm_src_alpha*/ + * [0A1][00 ][0A2][00 ][0A3][00 ][0A4][00 ] -> mm_src_alpha*/ mm_src_alpha = _mm_srli_si128(mm_src_alpha, 1); - /* dst(ARGB) -> pixels_dst (ARGBARGBARGBARGB) */ + /*[AR][GB][AR][GB][AR][GB][AR][GB] -> pixels_dst*/ pixels_dst = _mm_loadu_si128(dstp128); - /* - * BATCH A (the 2 low pixels) - */ + /* ==== BATCH A (the 2 low pixels) ==== */ /* shuffle alpha channels to duplicate 16 bit pairs - * shuffle (3, 3, 1, 1) (backed 2 bit numbers) - * [00 ][00 ][00 ][00 ][0A3][0A3][0A4][0A4] -> mm_src_alpha - * [ 7 ][ 6 ][ 5 ][ 4 ][ 3 ][ 2 ][ 1 ][ 0 ] - * Therefore the previous contents of 16 bit lane 1 - * Goes into 16 bit lanes 0 and 1, and the previous - * 
content of 16 bit lane 3 goes into lanes 2 and 3*/ + * [00 ][00 ][00 ][00 ][0A3][0A3][0A4][0A4] -> mm_src_alpha*/ unpacked_alpha = _mm_shufflelo_epi16(mm_src_alpha, 0b11110101); - /* finally move into final config - * spread out so they can be multiplied in 16 bit math - * against all RGBA of both pixels being blit - * [0A3][0A3][0A3][0A3][0A4][0A4][0A4][0A4] -> unpacked_alpha - */ + /* spread alpha into final config for 16 bit math + * [0A3][0A3][0A3][0A3][0A4][0A4][0A4][0A4] -> unpacked_alpha*/ unpacked_alpha = _mm_unpacklo_epi16(unpacked_alpha, unpacked_alpha); @@ -567,58 +555,58 @@ alphablit_alpha_sse2_argb_no_surf_alpha_opaque_dst(SDL_BlitInfo *info) batch_a_dst = sub_dst; - /* - * BATCH B (the 2 high pixels) - */ + /* ==== BATCH B (the 2 high pixels) ==== */ + /*[00 ][00 ][00 ][00 ][0A1][0A1][0A2][0A2] -> unpacked_alpha*/ unpacked_alpha = _mm_shufflehi_epi16(mm_src_alpha, 0b11110101); + /*[0A1][0A1][0A1][0A1][0A2][0A2][0A2][0A2] -> unpacked_alpha*/ unpacked_alpha = _mm_unpackhi_epi16(unpacked_alpha, unpacked_alpha); - /* 0A0R0G0B0A0R0G0B -> src1 */ + /*[0A][0R][0G][0B][0A][0R][0G][0B] -> src1*/ src1 = _mm_unpackhi_epi8(pixels_src, mm_zero); - /* 0A0R0G0B0A0R0G0B -> dst1 */ + /*[0A][0R][0G][0B][0A][0R][0G][0B] -> dst1*/ dst1 = _mm_unpackhi_epi8(pixels_dst, mm_zero); ARGB_NO_SURF_ALPHA_OPAQUE_DST_PROCEDURE - /* - * Combine the batches and store - * pack everything back into a pixel with zeroed out alpha - */ + /* ==== combine batches and store ==== */ + sub_dst = _mm_packus_epi16(batch_a_dst, sub_dst); + /* zero out alpha */ sub_dst = _mm_and_si128(sub_dst, mm_rgb_mask); _mm_storeu_si128(dstp128, sub_dst); srcp128++; dstp128++; }, - n, post_4_width); + n, n_iters_4); srcp32 = (Uint32 *)srcp128; dstp32 = (Uint32 *)dstp128; - for (int i = 0; i < pre_4_width; i++) { - /* Do the actual blend */ - /* src(ARGB) -> src1 (000000000000ARGB) */ + for (int i = 0; i < pxl_excess; i++) { + /*[00][00][00][00][00][00][AR][GB] -> src1*/ src1 = _mm_cvtsi32_si128(*srcp32); - /* src1 >> ashift -> mm_src_alpha(000000000000000A) */ + + /*[00][00][00][00][00][00][00][0A] -> mm_src_alpha*/ mm_src_alpha = _mm_srli_si128(src1, 3); - /* Then Calc RGB */ - /* 0000000000000A0A -> rgb_src_alpha */ + /*[00][00][00][00][00][00][0A][0A] -> mm_src_alpha*/ mm_src_alpha = _mm_unpacklo_epi16(mm_src_alpha, mm_src_alpha); - /* 000000000A0A0A0A -> rgb_src_alpha */ + + /*[00][00][00][00][0A][0A][0A][0A] -> mm_src_alpha*/ unpacked_alpha = _mm_unpacklo_epi32(mm_src_alpha, mm_src_alpha); - /* 000000000A0R0G0B -> src1 */ + /*[00][00][00][00][0A][0R][0G][0B] -> src1*/ src1 = _mm_unpacklo_epi8(src1, mm_zero); - /* dst(ARGB) -> dst1 (000000000000ARGB) */ + /*[00][00][00][00][00][00][AR][GB] -> dst1*/ dst1 = _mm_cvtsi32_si128(*dstp32); - /* 000000000A0R0G0B -> dst1 */ + + /*[00][00][00][00][0A][0R][0G][0B] -> dst1*/ dst1 = _mm_unpacklo_epi8(dst1, mm_zero); ARGB_NO_SURF_ALPHA_OPAQUE_DST_PROCEDURE
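For readers tracing the intrinsics, below is a minimal scalar sketch of what this blitter computes. It is not part of any patch above, and the function and variable names are illustrative only. It assumes 0xAARRGGBB pixels (as the ARGB comments indicate) and mirrors the per-channel formula in ARGB_NO_SURF_ALPHA_OPAQUE_DST_PROCEDURE plus the width/4 main loop and width%4 remainder structure that replaces the old even/odd split.

#include <stdint.h>

/* Per-channel blend matching the macro:
 * ((dstC << 8) + (srcC - dstC) * srcA + srcC) >> 8,
 * with the destination alpha byte forced to zero
 * (the _mm_and_si128 against 0x00FFFFFF). */
static uint32_t
blend_argb_opaque_dst_scalar(uint32_t src, uint32_t dst)
{
    int srcA = (int)(src >> 24);
    uint32_t out = 0;
    int shift;

    for (shift = 0; shift < 24; shift += 8) {
        int s = (int)((src >> shift) & 0xFF);
        int d = (int)((dst >> shift) & 0xFF);
        int c = ((d << 8) + (s - d) * srcA + s) >> 8;
        out |= (uint32_t)c << shift;
    }
    return out; /* alpha byte stays 0 */
}

/* Row loop shape after the patches: width/4 iterations that the real
 * code handles 4 pixels at a time with unaligned SSE2 loads/stores,
 * then the width%4 leftover pixels handled one at a time. */
static void
blit_row_scalar_sketch(uint32_t *srcp32, uint32_t *dstp32, int width)
{
    int n_iters_4 = width / 4;
    int pxl_excess = width % 4;
    int i, j;

    for (i = 0; i < n_iters_4; i++) {
        for (j = 0; j < 4; j++, srcp32++, dstp32++) {
            *dstp32 = blend_argb_opaque_dst_scalar(*srcp32, *dstp32);
        }
    }
    for (i = 0; i < pxl_excess; i++, srcp32++, dstp32++) {
        *dstp32 = blend_argb_opaque_dst_scalar(*srcp32, *dstp32);
    }
}

Handling the leftover pixels with a short scalar tail on each row is what lets even and odd widths share one SSE2 loop: the unaligned _mm_loadu_si128 / _mm_storeu_si128 accesses remove the width, srcskip, and dstskip parity checks that the old two-path version depended on.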