From f5ee88c2cfd60df3e418c576991be2002ec37caf Mon Sep 17 00:00:00 2001 From: Isaac Aronson Date: Tue, 12 Sep 2023 22:28:08 -0500 Subject: [PATCH] Implement mathematically correct scalar blitters --- src/video/SDL_blit_A.c | 135 ++++++---------------------------- src/video/SDL_blit_A_avx2.c | 10 ++- src/video/SDL_blit_A_sse4_1.c | 10 ++- test/testautomation_blit.c | 25 +++---- 4 files changed, 46 insertions(+), 134 deletions(-) diff --git a/src/video/SDL_blit_A.c b/src/video/SDL_blit_A.c index 1cdbaf359..ed3c941e0 100644 --- a/src/video/SDL_blit_A.c +++ b/src/video/SDL_blit_A.c @@ -546,112 +546,6 @@ static void BlitRGBtoRGBSurfaceAlpha(SDL_BlitInfo *info) } } -/* fast ARGB888->(A)RGB888 blending with pixel alpha */ -static void BlitRGBtoRGBPixelAlpha(SDL_BlitInfo *info) -{ - int width = info->dst_w; - int height = info->dst_h; - Uint32 *srcp = (Uint32 *)info->src; - int srcskip = info->src_skip >> 2; - Uint32 *dstp = (Uint32 *)info->dst; - int dstskip = info->dst_skip >> 2; - - while (height--) { - /* *INDENT-OFF* */ /* clang-format off */ - DUFFS_LOOP4({ - Uint32 dalpha; - Uint32 d; - Uint32 s1; - Uint32 d1; - Uint32 s = *srcp; - Uint32 alpha = s >> 24; - /* FIXME: Here we special-case opaque alpha since the - compositioning used (>>8 instead of /255) doesn't handle - it correctly. Also special-case alpha=0 for speed? - Benchmark this! */ - if (alpha) { - if (alpha == SDL_ALPHA_OPAQUE) { - *dstp = *srcp; - } else { - /* - * take out the middle component (green), and process - * the other two in parallel. One multiply less. - */ - d = *dstp; - dalpha = d >> 24; - s1 = s & 0xff00ff; - d1 = d & 0xff00ff; - d1 = (d1 + ((s1 - d1) * alpha >> 8)) & 0xff00ff; - s &= 0xff00; - d &= 0xff00; - d = (d + ((s - d) * alpha >> 8)) & 0xff00; - dalpha = alpha + (dalpha * (alpha ^ 0xFF) >> 8); - *dstp = d1 | d | (dalpha << 24); - } - } - ++srcp; - ++dstp; - }, width); - /* *INDENT-ON* */ /* clang-format on */ - srcp += srcskip; - dstp += dstskip; - } -} - -/* fast ARGB888->(A)BGR888 blending with pixel alpha */ -static void BlitRGBtoBGRPixelAlpha(SDL_BlitInfo *info) -{ - int width = info->dst_w; - int height = info->dst_h; - Uint32 *srcp = (Uint32 *)info->src; - int srcskip = info->src_skip >> 2; - Uint32 *dstp = (Uint32 *)info->dst; - int dstskip = info->dst_skip >> 2; - - while (height--) { - /* *INDENT-OFF* */ /* clang-format off */ - DUFFS_LOOP4({ - Uint32 dalpha; - Uint32 d; - Uint32 s1; - Uint32 d1; - Uint32 s = *srcp; - Uint32 alpha = s >> 24; - /* FIXME: Here we special-case opaque alpha since the - compositioning used (>>8 instead of /255) doesn't handle - it correctly. Also special-case alpha=0 for speed? - Benchmark this! */ - if (alpha) { - /* - * take out the middle component (green), and process - * the other two in parallel. One multiply less. - */ - s1 = s & 0xff00ff; - s1 = (s1 >> 16) | (s1 << 16); - s &= 0xff00; - - if (alpha == SDL_ALPHA_OPAQUE) { - *dstp = 0xff000000 | s | s1; - } else { - d = *dstp; - dalpha = d >> 24; - d1 = d & 0xff00ff; - d1 = (d1 + ((s1 - d1) * alpha >> 8)) & 0xff00ff; - d &= 0xff00; - d = (d + ((s - d) * alpha >> 8)) & 0xff00; - dalpha = alpha + (dalpha * (alpha ^ 0xFF) >> 8); - *dstp = d1 | d | (dalpha << 24); - } - } - ++srcp; - ++dstp; - }, width); - /* *INDENT-ON* */ /* clang-format on */ - srcp += srcskip; - dstp += dstskip; - } -} - /* 16bpp special case for per-surface alpha=50%: blend 2 pixels in parallel */ /* blend a single 16 bit pixel at 50% */ @@ -1285,6 +1179,15 @@ static void BlitNtoNSurfaceAlphaKey(SDL_BlitInfo *info) } } +/* Accurate alpha blending with no division */ +static Uint8 AlphaBlendChannel(Uint8 sC, Uint8 dC, Uint8 sA) +{ + Uint16 x = ((sC - dC) * sA) + ((dC << 8) - dC); + x += 0x1U; // Use 0x80 to round instead of floor + x += x >> 8; + return x >> 8; +} + /* General (slow) N->N blending with pixel alpha */ static void BlitNtoNPixelAlpha(SDL_BlitInfo *info) { @@ -1298,6 +1201,7 @@ static void BlitNtoNPixelAlpha(SDL_BlitInfo *info) SDL_PixelFormat *dstfmt = info->dst_fmt; int srcbpp; int dstbpp; + int freeFormat; Uint32 Pixel; unsigned sR, sG, sB, sA; unsigned dR, dG, dB, dA; @@ -1305,6 +1209,7 @@ static void BlitNtoNPixelAlpha(SDL_BlitInfo *info) /* Set up some basic variables */ srcbpp = srcfmt->bytes_per_pixel; dstbpp = dstfmt->bytes_per_pixel; + freeFormat = 0; #ifdef SDL_AVX2_INTRINSICS if (srcbpp == 4 && dstbpp == 4 && width >= 4 && SDL_HasAVX2()) { @@ -1319,6 +1224,11 @@ static void BlitNtoNPixelAlpha(SDL_BlitInfo *info) return; } #endif + /* Handle case where bad input sent */ + if (dstfmt->Ashift == 0 && dstfmt->Ashift == dstfmt->Bshift) { + dstfmt = SDL_CreatePixelFormat(SDL_PIXELFORMAT_ARGB8888); + freeFormat = 1; + } while (height--) { /* *INDENT-OFF* */ /* clang-format off */ @@ -1327,7 +1237,10 @@ static void BlitNtoNPixelAlpha(SDL_BlitInfo *info) DISEMBLE_RGBA(src, srcbpp, srcfmt, Pixel, sR, sG, sB, sA); if (sA) { DISEMBLE_RGBA(dst, dstbpp, dstfmt, Pixel, dR, dG, dB, dA); - ALPHA_BLEND_RGBA(sR, sG, sB, sA, dR, dG, dB, dA); + dR = AlphaBlendChannel(sR, dR, sA); + dG = AlphaBlendChannel(sG, dG, sA); + dB = AlphaBlendChannel(sB, dB, sA); + dA = AlphaBlendChannel(255, dA, sA); ASSEMBLE_RGBA(dst, dstbpp, dstfmt, dR, dG, dB, dA); } src += srcbpp; @@ -1338,6 +1251,9 @@ static void BlitNtoNPixelAlpha(SDL_BlitInfo *info) src += srcskip; dst += dstskip; } + if (freeFormat) { + SDL_DestroyPixelFormat(dstfmt); + } } SDL_BlitFunc SDL_CalculateBlitA(SDL_Surface *surface) @@ -1406,11 +1322,6 @@ SDL_BlitFunc SDL_CalculateBlitA(SDL_Surface *surface) return BlitRGBtoRGBPixelAlphaARMSIMD; } #endif - return BlitRGBtoRGBPixelAlpha; - } - } else if (sf->Rmask == df->Bmask && sf->Gmask == df->Gmask && sf->Bmask == df->Rmask && sf->bytes_per_pixel == 4) { - if (sf->Amask == 0xff000000) { - return BlitRGBtoBGRPixelAlpha; } } return BlitNtoNPixelAlpha; diff --git a/src/video/SDL_blit_A_avx2.c b/src/video/SDL_blit_A_avx2.c index ed2bfc1bf..8f4b3f356 100644 --- a/src/video/SDL_blit_A_avx2.c +++ b/src/video/SDL_blit_A_avx2.c @@ -89,9 +89,13 @@ __m256i SDL_TARGETING("avx2") MixRGBA_AVX2(__m256i src, __m256i dst, const __m25 dst_hi = _mm256_add_epi16(_mm256_mullo_epi16(_mm256_sub_epi16(src_hi, dst_hi), srca_hi), _mm256_sub_epi16(_mm256_slli_epi16(dst_hi, 8), dst_hi)); - // dst = (dst * 0x8081) >> 23 - dst_lo = _mm256_srli_epi16(_mm256_mulhi_epu16(dst_lo, _mm256_set1_epi16(-0x7F7F)), 7); - dst_hi = _mm256_srli_epi16(_mm256_mulhi_epu16(dst_hi, _mm256_set1_epi16(-0x7F7F)), 7); + // dst += 0x1U (use 0x80 to round instead of floor) + dst_lo = _mm256_add_epi16(dst_lo, _mm256_set1_epi16(1)); + dst_hi = _mm256_add_epi16(dst_hi, _mm256_set1_epi16(1)); + + // dst += dst >> 8 + dst_lo = _mm256_srli_epi16(_mm256_add_epi16(dst_lo, _mm256_srli_epi16(dst_lo, 8)), 8); + dst_hi = _mm256_srli_epi16(_mm256_add_epi16(dst_hi, _mm256_srli_epi16(dst_hi, 8)), 8); dst = _mm256_packus_epi16(dst_lo, dst_hi); return dst; diff --git a/src/video/SDL_blit_A_sse4_1.c b/src/video/SDL_blit_A_sse4_1.c index 34355e8c9..e243561d8 100644 --- a/src/video/SDL_blit_A_sse4_1.c +++ b/src/video/SDL_blit_A_sse4_1.c @@ -90,9 +90,13 @@ __m128i SDL_TARGETING("sse4.1") MixRGBA_SSE4_1(__m128i src, __m128i dst, dst_hi = _mm_add_epi16(_mm_mullo_epi16(_mm_sub_epi16(src_hi, dst_hi), srca_hi), _mm_sub_epi16(_mm_slli_epi16(dst_hi, 8), dst_hi)); - // dst = (dst * 0x8081) >> 23 - dst_lo = _mm_srli_epi16(_mm_mulhi_epu16(dst_lo, _mm_set1_epi16(-0x7F7F)), 7); - dst_hi = _mm_srli_epi16(_mm_mulhi_epu16(dst_hi, _mm_set1_epi16(-0x7F7F)), 7); + // dst += 0x1U (use 0x80 to round instead of floor) + dst_lo = _mm_add_epi16(dst_lo, _mm_set1_epi16(1)); + dst_hi = _mm_add_epi16(dst_hi, _mm_set1_epi16(1)); + + // dst += dst >> 8; + dst_lo = _mm_srli_epi16(_mm_add_epi16(dst_lo, _mm_srli_epi16(dst_lo, 8)), 8); + dst_hi = _mm_srli_epi16(_mm_add_epi16(dst_hi, _mm_srli_epi16(dst_hi, 8)), 8); dst = _mm_packus_epi16(dst_lo, dst_hi); return dst; diff --git a/test/testautomation_blit.c b/test/testautomation_blit.c index 9553bbeea..dc745a9d8 100644 --- a/test/testautomation_blit.c +++ b/test/testautomation_blit.c @@ -98,8 +98,7 @@ Uint32 hashSurfacePixels(SDL_Surface * surface) { int blit_testExampleApplicationRender(void *arg) { const int width = 32; const int height = 32; - const unsigned long scalar_hash = 0xf47a3f55; - const unsigned long x86_simd_hash = 0xe345d7a7; + const unsigned long correct_hash = 0xe345d7a7; SDL_Surface* dest_surface = SDL_CreateSurface(width, height, SDL_PIXELFORMAT_ARGB8888); SDL_Surface* rainbow_background = SDLTest_ImageBlendingBackground(); SDL_Surface* gearbrain_sprite = SDLTest_ImageBlendingSprite(); @@ -109,9 +108,8 @@ int blit_testExampleApplicationRender(void *arg) { SDL_BlitSurface(gearbrain_sprite, NULL, dest_surface, NULL); // Check result const unsigned long hash = hashSurfacePixels(dest_surface); - SDLTest_AssertCheck(hash == scalar_hash || hash == x86_simd_hash, - "Should render identically, expected 0x%lx (scalar) or 0x%lx (x86_simd), got 0x%lx", - scalar_hash, x86_simd_hash, hash); + SDLTest_AssertCheck(hash == correct_hash, + "Should render identically, expected hash 0x%lx, got 0x%lx", correct_hash, hash); // Clean up SDL_DestroySurface(rainbow_background); SDL_DestroySurface(gearbrain_sprite); @@ -126,8 +124,7 @@ int blit_testExampleApplicationRender(void *arg) { int blit_testRandomToRandomSVGA(void *arg) { const int width = 800; const int height = 600; - const unsigned long scalar_hash = 0x1f56efad; - const unsigned long x86_simd_hash = 0x42140c5f; + const unsigned long correct_hash = 0x42140c5f; // Allocate random buffers Uint32 *dest_pixels = getNextRandomBuffer(width, height); Uint32 *src_pixels = getNextRandomBuffer(width, height); @@ -138,9 +135,8 @@ int blit_testRandomToRandomSVGA(void *arg) { SDL_BlitSurface(src_surface, NULL, dest_surface, NULL); // Check result const unsigned long hash = hashSurfacePixels(dest_surface); - SDLTest_AssertCheck(hash == scalar_hash || hash == x86_simd_hash, - "Should render identically, expected 0x%lx (scalar) or 0x%lx (x86_simd), got 0x%lx", - scalar_hash, x86_simd_hash, hash); + SDLTest_AssertCheck(hash == correct_hash, + "Should render identically, expected hash 0x%lx, got 0x%lx", correct_hash, hash); // Clean up SDL_DestroySurface(dest_surface); SDL_DestroySurface(src_surface); @@ -157,8 +153,7 @@ int blit_testRandomToRandomSVGAMultipleIterations(void *arg) { const int width = 800; const int height = 600; int i; - const unsigned long x86_simd_hash = 0x2626be78; - const unsigned long scalar_hash = 0xfb2a8ee8; + const unsigned long correct_hash = 0x5d26be78; // Create blank source surface SDL_Surface* dest_surface = SDL_CreateSurface(width, height, SDL_PIXELFORMAT_ABGR8888); @@ -180,11 +175,9 @@ int blit_testRandomToRandomSVGAMultipleIterations(void *arg) { // Check result const unsigned long hash = hashSurfacePixels(dest_surface); // Clean up - SDL_SaveBMP(dest_surface, "250k_scalar.bmp"); SDL_DestroySurface(dest_surface); - SDLTest_AssertCheck(hash == scalar_hash || hash == x86_simd_hash, - "Should render identically, expected 0x%lx (scalar) or 0x%lx (x86_simd), got 0x%lx", - scalar_hash, x86_simd_hash, hash); + SDLTest_AssertCheck(hash == correct_hash, + "Should render identically, expected hash 0x%lx, got 0x%lx", correct_hash, hash); return TEST_COMPLETED; }