Skip to content

Commit

Permalink
Merge pull request pygame-community#2601 from Starbuck5/improve-alpha…
Browse files Browse the repository at this point in the history
…blit_alpha_sse2_argb_no_surf_alpha_opaque_dst

Improve performance of SSE2 no_surf_alpha_opaque_dst blitter
  • Loading branch information
Starbuck5 authored Jan 4, 2024
2 parents ff166fe + 1f1ebf0 commit 717d032
Showing 1 changed file with 111 additions and 125 deletions.
236 changes: 111 additions & 125 deletions src_c/simd_blitters_sse2.c
Original file line number Diff line number Diff line change
Expand Up @@ -470,6 +470,27 @@ alphablit_alpha_sse2_argb_no_surf_alpha(SDL_BlitInfo *info)
}
}

/* Defines the blit procedure at the core of
 * alphablit_alpha_sse2_argb_no_surf_alpha_opaque_dst
 *
 * Input variables: src1, dst1, unpacked_alpha
 * containing unpacked 16 bit lanes of src, dst, and src alpha
 * Output variables: sub_dst
 *
 * Per 16 bit lane this computes
 *   sub_dst = ((dst << 8) + (src - dst) * alpha + src) >> 8
 * using wrapping 16 bit arithmetic.
 *
 * Side effect: dst1 is overwritten with (dst1 << 8), so it must be
 * reloaded before it is used as a destination value again.
 *
 * The statements are wrapped in a braced block so that the macro expands
 * to a single compound statement; without the braces, placing it under an
 * unbraced if/else/loop would guard only the first statement. It is meant
 * to be invoked on its own, without a trailing semicolon.
 * */
#define ARGB_NO_SURF_ALPHA_OPAQUE_DST_PROCEDURE \
{ \
    /* (srcRGB - dstRGB) */ \
    sub_dst = _mm_sub_epi16(src1, dst1); \
    /* (srcRGB - dstRGB) * srcA */ \
    sub_dst = _mm_mullo_epi16(sub_dst, unpacked_alpha); \
    /* (srcRGB - dstRGB) * srcA + srcRGB */ \
    sub_dst = _mm_add_epi16(sub_dst, src1); \
    /* (dstRGB << 8) */ \
    dst1 = _mm_slli_epi16(dst1, 8); \
    /* ((dstRGB << 8) + (srcRGB - dstRGB) * srcA + srcRGB) */ \
    sub_dst = _mm_add_epi16(sub_dst, dst1); \
    /* (((dstRGB << 8) + (srcRGB - dstRGB) * srcA + srcRGB) >> 8) */ \
    sub_dst = _mm_srli_epi16(sub_dst, 8); \
}

void
alphablit_alpha_sse2_argb_no_surf_alpha_opaque_dst(SDL_BlitInfo *info)
{
Expand All @@ -479,164 +500,129 @@ alphablit_alpha_sse2_argb_no_surf_alpha_opaque_dst(SDL_BlitInfo *info)
int srcskip = info->s_skip >> 2;
int dstskip = info->d_skip >> 2;

Uint64 *srcp64 = (Uint64 *)info->s_pixels;
Uint64 *dstp64 = (Uint64 *)info->d_pixels;

Uint64 rgb_mask64 = 0x00FFFFFF00FFFFFF;
Uint32 rgb_mask32 = 0x00FFFFFF;

Uint32 *srcp32 = (Uint32 *)info->s_pixels;
Uint32 *dstp32 = (Uint32 *)info->d_pixels;

__m128i src1, dst1, sub_dst, mm_src_alpha, mm_zero, mm_rgb_mask;
int pxl_excess = width % 4;
int n_iters_4 = width / 4;

/* There are two paths through this blitter:
1. Two pixels at once.
2. One pixel at a time.
*/
if (((width % 2) == 0) && ((srcskip % 2) == 0) && ((dstskip % 2) == 0)) {
width = width / 2;
srcskip = srcskip / 2;
dstskip = dstskip / 2;
__m128i src1, dst1, sub_dst, mm_src_alpha;
__m128i unpacked_alpha, pixels_src, pixels_dst, batch_a_dst;
__m128i *srcp128, *dstp128;
__m128i mm_rgb_mask = _mm_set1_epi32(0x00FFFFFF);
__m128i mm_zero = _mm_setzero_si128();

mm_zero = _mm_setzero_si128();
while (height--) {
srcp128 = (__m128i *)srcp32;
dstp128 = (__m128i *)dstp32;

/* two pixels at a time */
LOAD_64_INTO_M128(&rgb_mask64, &mm_rgb_mask);
while (height--) {
LOOP_UNROLLED4(
{
/* src(ARGB) -> src1 (00000000ARGBARGB) */
LOAD_64_INTO_M128(srcp64, &src1);
LOOP_UNROLLED4(
{
/* ==== load 4 pixels into SSE registers ==== */

/* isolate alpha channels
* 00000000A1000A2000 -> mm_src_alpha */
mm_src_alpha = _mm_andnot_si128(mm_rgb_mask, src1);
/*[AR][GB][AR][GB][AR][GB][AR][GB] -> pixels_src*/
pixels_src = _mm_loadu_si128(srcp128);

/* shift right to position alpha channels for manipulation
* 000000000A1000A200 -> mm_src_alpha*/
mm_src_alpha = _mm_srli_si128(mm_src_alpha, 1);
/* isolate alpha channels
* [A10][00 ][A20][00 ][A30][00 ][A40][00 ] -> mm_src_alpha*/
mm_src_alpha = _mm_andnot_si128(mm_rgb_mask, pixels_src);

/* shuffle alpha channels to duplicate 16 bit pairs
* shuffle (3, 3, 1, 1) (backed 2 bit numbers)
* [00][00][00][00][0A1][00][0A2][00] -> mm_src_alpha
* [7 ][6 ][5 ][4 ][ 3 ][2 ][ 1 ][0 ]
* Therefore the previous contents of 16 bit number #1
* Goes into 16 bit number #1 and #2, and the previous
* content of 16 bit number #3 goes into #2 and #3 */
mm_src_alpha =
_mm_shufflelo_epi16(mm_src_alpha, 0b11110101);
/* shift right to position alpha channels for manipulation
* [0A1][00 ][0A2][00 ][0A3][00 ][0A4][00 ] -> mm_src_alpha*/
mm_src_alpha = _mm_srli_si128(mm_src_alpha, 1);

/* finally move into final config
* spread out so they can be multiplied in 16 bit math
* against all RGBA of both pixels being blit
* 0A10A10A10A10A20A20A20A2 -> mm_src_alpha */
mm_src_alpha =
_mm_unpacklo_epi16(mm_src_alpha, mm_src_alpha);
/*[AR][GB][AR][GB][AR][GB][AR][GB] -> pixels_dst*/
pixels_dst = _mm_loadu_si128(dstp128);

/* 0A0R0G0B0A0R0G0B -> src1 */
src1 = _mm_unpacklo_epi8(src1, mm_zero);
/* ==== BATCH A (the 2 low pixels) ==== */

/* dst(ARGB) -> dst1 (00000000ARGBARGB) */
LOAD_64_INTO_M128(dstp64, &dst1);
/* 0A0R0G0B0A0R0G0B -> dst1 */
dst1 = _mm_unpacklo_epi8(dst1, mm_zero);
/* shuffle alpha channels to duplicate 16 bit pairs
* [00 ][00 ][00 ][00 ][0A3][0A3][0A4][0A4] -> mm_src_alpha*/
unpacked_alpha = _mm_shufflelo_epi16(mm_src_alpha, 0b11110101);

/* (srcRGB - dstRGB) */
sub_dst = _mm_sub_epi16(src1, dst1);
/* spread alpha into final config for 16 bit math
* [0A3][0A3][0A3][0A3][0A4][0A4][0A4][0A4] -> unpacked_alpha*/
unpacked_alpha =
_mm_unpacklo_epi16(unpacked_alpha, unpacked_alpha);

/* (srcRGB - dstRGB) * srcA */
sub_dst = _mm_mullo_epi16(sub_dst, mm_src_alpha);
/* 0A0R0G0B0A0R0G0B -> src1 */
src1 = _mm_unpacklo_epi8(pixels_src, mm_zero);

/* (srcRGB - dstRGB) * srcA + srcRGB */
sub_dst = _mm_add_epi16(sub_dst, src1);
/* 0A0R0G0B0A0R0G0B -> dst1 */
dst1 = _mm_unpacklo_epi8(pixels_dst, mm_zero);

/* (dstRGB << 8) */
dst1 = _mm_slli_epi16(dst1, 8);
ARGB_NO_SURF_ALPHA_OPAQUE_DST_PROCEDURE

/* ((dstRGB << 8) + (srcRGB - dstRGB) * srcA + srcRGB) */
sub_dst = _mm_add_epi16(sub_dst, dst1);
batch_a_dst = sub_dst;

/* (((dstRGB << 8) + (srcRGB - dstRGB) * srcA + srcRGB) >>
* 8)*/
sub_dst = _mm_srli_epi16(sub_dst, 8);
/* ==== BATCH B (the 2 high pixels) ==== */

/* pack everything back into a pixel with zeroed out alpha
*/
sub_dst = _mm_packus_epi16(sub_dst, mm_zero);
sub_dst = _mm_and_si128(sub_dst, mm_rgb_mask);
STORE_M128_INTO_64(&sub_dst, dstp64);
/*[00 ][00 ][00 ][00 ][0A1][0A1][0A2][0A2] -> unpacked_alpha*/
unpacked_alpha = _mm_shufflehi_epi16(mm_src_alpha, 0b11110101);

++srcp64;
++dstp64;
},
n, width);
srcp64 += srcskip;
dstp64 += dstskip;
}
}
else {
/* one pixel at a time */
mm_zero = _mm_setzero_si128();
mm_rgb_mask = _mm_cvtsi32_si128(rgb_mask32);
/*[0A1][0A1][0A1][0A1][0A2][0A2][0A2][0A2] -> unpacked_alpha*/
unpacked_alpha =
_mm_unpackhi_epi16(unpacked_alpha, unpacked_alpha);

while (height--) {
LOOP_UNROLLED4(
{
/* Do the actual blend */
/* src(ARGB) -> src1 (000000000000ARGB) */
src1 = _mm_cvtsi32_si128(*srcp32);
/* src1 >> ashift -> mm_src_alpha(000000000000000A) */
mm_src_alpha = _mm_srli_si128(src1, 3);
/*[0A][0R][0G][0B][0A][0R][0G][0B] -> src1*/
src1 = _mm_unpackhi_epi8(pixels_src, mm_zero);

/* Then Calc RGB */
/* 0000000000000A0A -> rgb_src_alpha */
mm_src_alpha =
_mm_unpacklo_epi16(mm_src_alpha, mm_src_alpha);
/* 000000000A0A0A0A -> rgb_src_alpha */
mm_src_alpha =
_mm_unpacklo_epi32(mm_src_alpha, mm_src_alpha);
/*[0A][0R][0G][0B][0A][0R][0G][0B] -> dst1*/
dst1 = _mm_unpackhi_epi8(pixels_dst, mm_zero);

/* 000000000A0R0G0B -> src1 */
src1 = _mm_unpacklo_epi8(src1, mm_zero);
ARGB_NO_SURF_ALPHA_OPAQUE_DST_PROCEDURE

/* dst(ARGB) -> dst1 (000000000000ARGB) */
dst1 = _mm_cvtsi32_si128(*dstp32);
/* 000000000A0R0G0B -> dst1 */
dst1 = _mm_unpacklo_epi8(dst1, mm_zero);
/* ==== combine batches and store ==== */

/* (srcRGB - dstRGB) */
sub_dst = _mm_sub_epi16(src1, dst1);
sub_dst = _mm_packus_epi16(batch_a_dst, sub_dst);
/* zero out alpha */
sub_dst = _mm_and_si128(sub_dst, mm_rgb_mask);
_mm_storeu_si128(dstp128, sub_dst);

/* (srcRGB - dstRGB) * srcA */
sub_dst = _mm_mullo_epi16(sub_dst, mm_src_alpha);
srcp128++;
dstp128++;
},
n, n_iters_4);

/* (srcRGB - dstRGB) * srcA + srcRGB */
sub_dst = _mm_add_epi16(sub_dst, src1);
srcp32 = (Uint32 *)srcp128;
dstp32 = (Uint32 *)dstp128;

/* (dstRGB << 8) */
dst1 = _mm_slli_epi16(dst1, 8);
for (int i = 0; i < pxl_excess; i++) {
/*[00][00][00][00][00][00][AR][GB] -> src1*/
src1 = _mm_cvtsi32_si128(*srcp32);

/* ((dstRGB << 8) + (srcRGB - dstRGB) * srcA + srcRGB) */
sub_dst = _mm_add_epi16(sub_dst, dst1);
/*[00][00][00][00][00][00][00][0A] -> mm_src_alpha*/
mm_src_alpha = _mm_srli_si128(src1, 3);

/* (((dstRGB << 8) + (srcRGB - dstRGB) * srcA + srcRGB) >>
* 8)*/
sub_dst = _mm_srli_epi16(sub_dst, 8);
/*[00][00][00][00][00][00][0A][0A] -> mm_src_alpha*/
mm_src_alpha = _mm_unpacklo_epi16(mm_src_alpha, mm_src_alpha);

/* pack everything back into a pixel */
sub_dst = _mm_packus_epi16(sub_dst, mm_zero);
sub_dst = _mm_and_si128(sub_dst, mm_rgb_mask);
/* reset alpha to 0 */
*dstp32 = _mm_cvtsi128_si32(sub_dst);
/*[00][00][00][00][0A][0A][0A][0A] -> mm_src_alpha*/
unpacked_alpha = _mm_unpacklo_epi32(mm_src_alpha, mm_src_alpha);

++srcp32;
++dstp32;
},
n, width);
srcp32 += srcskip;
dstp32 += dstskip;
/*[00][00][00][00][0A][0R][0G][0B] -> src1*/
src1 = _mm_unpacklo_epi8(src1, mm_zero);

/*[00][00][00][00][00][00][AR][GB] -> dst1*/
dst1 = _mm_cvtsi32_si128(*dstp32);

/*[00][00][00][00][0A][0R][0G][0B] -> dst1*/
dst1 = _mm_unpacklo_epi8(dst1, mm_zero);

ARGB_NO_SURF_ALPHA_OPAQUE_DST_PROCEDURE

/* pack everything back into a pixel */
sub_dst = _mm_packus_epi16(sub_dst, mm_zero);
sub_dst = _mm_and_si128(sub_dst, mm_rgb_mask);
/* reset alpha to 0 */
*dstp32 = _mm_cvtsi128_si32(sub_dst);

srcp32++;
dstp32++;
}

srcp32 += srcskip;
dstp32 += dstskip;
}
}

Expand Down

0 comments on commit 717d032

Please sign in to comment.