Skip to content

Commit

Permalink
Fix BlitNtoNPixelAlpha for formats with no dst alpha
Browse files Browse the repository at this point in the history
  • Loading branch information
0x1F9F1 committed Jul 3, 2024
1 parent 0f4ed0e commit ccfe484
Show file tree
Hide file tree
Showing 2 changed files with 120 additions and 106 deletions.
113 changes: 60 additions & 53 deletions src/video/SDL_blit_A_avx2.c
Original file line number Diff line number Diff line change
Expand Up @@ -6,48 +6,6 @@

#include "SDL_blit.h"

// Using the AVX2 instruction set, blit sixteen pixels into eight with alpha blending
SDL_FORCE_INLINE __m256i SDL_TARGETING("avx2") MixRGBA_AVX2(
__m256i src, __m256i dst,
const __m256i alpha_shuffle, const __m256i alpha_saturate)
{
// SIMD implementation of blend_mul2.
// dstRGB = (srcRGB * srcA) + (dstRGB * (1-srcA))
// dstA = srcA + (dstA * (1-srcA)) = (1 * srcA) + (dstA * (1-srcA))

// Splat the alpha into all channels for each pixel
__m256i srca = _mm256_shuffle_epi8(src, alpha_shuffle);

// Set the alpha channels of src to 255
src = _mm256_or_si256(src, alpha_saturate);

__m256i src_lo = _mm256_unpacklo_epi8(src, _mm256_setzero_si256());
__m256i src_hi = _mm256_unpackhi_epi8(src, _mm256_setzero_si256());

__m256i dst_lo = _mm256_unpacklo_epi8(dst, _mm256_setzero_si256());
__m256i dst_hi = _mm256_unpackhi_epi8(dst, _mm256_setzero_si256());

__m256i srca_lo = _mm256_unpacklo_epi8(srca, _mm256_setzero_si256());
__m256i srca_hi = _mm256_unpackhi_epi8(srca, _mm256_setzero_si256());

// dst = ((src - dst) * srcA) + ((dst << 8) - dst)
dst_lo = _mm256_add_epi16(_mm256_mullo_epi16(_mm256_sub_epi16(src_lo, dst_lo), srca_lo),
_mm256_sub_epi16(_mm256_slli_epi16(dst_lo, 8), dst_lo));
dst_hi = _mm256_add_epi16(_mm256_mullo_epi16(_mm256_sub_epi16(src_hi, dst_hi), srca_hi),
_mm256_sub_epi16(_mm256_slli_epi16(dst_hi, 8), dst_hi));

// dst += 0x1U (use 0x80 to round instead of floor)
dst_lo = _mm256_add_epi16(dst_lo, _mm256_set1_epi16(1));
dst_hi = _mm256_add_epi16(dst_hi, _mm256_set1_epi16(1));

// dst += dst >> 8
dst_lo = _mm256_srli_epi16(_mm256_add_epi16(dst_lo, _mm256_srli_epi16(dst_lo, 8)), 8);
dst_hi = _mm256_srli_epi16(_mm256_add_epi16(dst_hi, _mm256_srli_epi16(dst_hi, 8)), 8);

dst = _mm256_packus_epi16(dst_lo, dst_hi);
return dst;
}

void SDL_TARGETING("avx2") BlitNtoNPixelAlpha_AVX2(SDL_BlitInfo *info)
{
int width = info->dst_w;
Expand All @@ -59,32 +17,64 @@ void SDL_TARGETING("avx2") BlitNtoNPixelAlpha_AVX2(SDL_BlitInfo *info)
SDL_PixelFormat *srcfmt = info->src_fmt;
SDL_PixelFormat *dstfmt = info->dst_fmt;

// The byte offsets for the start of each pixel
const __m256i mask_offsets = _mm256_set_epi8(
28, 28, 28, 28, 24, 24, 24, 24, 20, 20, 20, 20, 16, 16, 16, 16, 12, 12, 12, 12, 8, 8, 8, 8, 4, 4, 4, 4, 0, 0, 0, 0);

const __m256i shift_mask = _mm256_add_epi32(
const __m256i convert_mask = _mm256_add_epi32(
_mm256_set1_epi32(
((srcfmt->Rshift >> 3) << dstfmt->Rshift) |
((srcfmt->Gshift >> 3) << dstfmt->Gshift) |
((srcfmt->Bshift >> 3) << dstfmt->Bshift) |
((srcfmt->Ashift >> 3) << dstfmt->Ashift)),
((srcfmt->Bshift >> 3) << dstfmt->Bshift)),
mask_offsets);

const __m256i splat_mask = _mm256_add_epi8(_mm256_set1_epi8(dstfmt->Ashift >> 3), mask_offsets);
const __m256i saturate_mask = _mm256_set1_epi32((int)dstfmt->Amask);
const __m256i alpha_splat_mask = _mm256_add_epi8(_mm256_set1_epi8(srcfmt->Ashift >> 3), mask_offsets);
const __m256i alpha_fill_mask = _mm256_set1_epi32((int)dstfmt->Amask);

while (height--) {
int i = 0;

for (; i + 8 <= width; i += 8) {
// Load 8 src pixels and shuffle into the dst format
__m256i c_src = _mm256_shuffle_epi8(_mm256_loadu_si256((__m256i *)src), shift_mask);
// Load 8 src pixels
__m256i src256 = _mm256_loadu_si256((__m256i *)src);

// Load 8 dst pixels
__m256i c_dst = _mm256_loadu_si256((__m256i *)dst);
__m256i dst256 = _mm256_loadu_si256((__m256i *)dst);

// Extract the alpha from each pixel and splat it into all the channels
__m256i srcA = _mm256_shuffle_epi8(src256, alpha_splat_mask);

// Convert to dst format
src256 = _mm256_shuffle_epi8(src256, convert_mask);

// Set the alpha channels of src to 255
src256 = _mm256_or_si256(src256, alpha_fill_mask);

__m256i src_lo = _mm256_unpacklo_epi8(src256, _mm256_setzero_si256());
__m256i src_hi = _mm256_unpackhi_epi8(src256, _mm256_setzero_si256());

__m256i dst_lo = _mm256_unpacklo_epi8(dst256, _mm256_setzero_si256());
__m256i dst_hi = _mm256_unpackhi_epi8(dst256, _mm256_setzero_si256());

__m256i srca_lo = _mm256_unpacklo_epi8(srcA, _mm256_setzero_si256());
__m256i srca_hi = _mm256_unpackhi_epi8(srcA, _mm256_setzero_si256());

// dst = ((src - dst) * srcA) + ((dst << 8) - dst)
dst_lo = _mm256_add_epi16(_mm256_mullo_epi16(_mm256_sub_epi16(src_lo, dst_lo), srca_lo),
_mm256_sub_epi16(_mm256_slli_epi16(dst_lo, 8), dst_lo));
dst_hi = _mm256_add_epi16(_mm256_mullo_epi16(_mm256_sub_epi16(src_hi, dst_hi), srca_hi),
_mm256_sub_epi16(_mm256_slli_epi16(dst_hi, 8), dst_hi));

// dst += 0x1U (use 0x80 to round instead of floor)
dst_lo = _mm256_add_epi16(dst_lo, _mm256_set1_epi16(1));
dst_hi = _mm256_add_epi16(dst_hi, _mm256_set1_epi16(1));

// dst += dst >> 8
dst_lo = _mm256_srli_epi16(_mm256_add_epi16(dst_lo, _mm256_srli_epi16(dst_lo, 8)), 8);
dst_hi = _mm256_srli_epi16(_mm256_add_epi16(dst_hi, _mm256_srli_epi16(dst_hi, 8)), 8);

// Blend the pixels together and save the result
_mm256_storeu_si256((__m256i *)dst, MixRGBA_AVX2(c_src, c_dst, splat_mask, saturate_mask));
_mm256_storeu_si256((__m256i *)dst, _mm256_packus_epi16(dst_lo, dst_hi));

src += 32;
dst += 32;
Expand All @@ -94,12 +84,29 @@ void SDL_TARGETING("avx2") BlitNtoNPixelAlpha_AVX2(SDL_BlitInfo *info)
Uint32 src32 = *(Uint32 *)src;
Uint32 dst32 = *(Uint32 *)dst;

Uint32 srcA = (src32 >> srcfmt->Ashift) & 0xFF;

src32 = (((src32 >> srcfmt->Rshift) & 0xFF) << dstfmt->Rshift) |
(((src32 >> srcfmt->Gshift) & 0xFF) << dstfmt->Gshift) |
(((src32 >> srcfmt->Bshift) & 0xFF) << dstfmt->Bshift) |
(((src32 >> srcfmt->Ashift) & 0xFF) << dstfmt->Ashift);
dstfmt->Amask;

Uint32 srcRB = src32 & 0x00FF00FF;
Uint32 dstRB = dst32 & 0x00FF00FF;

Uint32 srcGA = (src32 >> 8) & 0x00FF00FF;
Uint32 dstGA = (dst32 >> 8) & 0x00FF00FF;

Uint32 resRB = ((srcRB - dstRB) * srcA) + (dstRB << 8) - dstRB;
resRB += 0x00010001;
resRB += (resRB >> 8) & 0x00FF00FF;
resRB = (resRB >> 8) & 0x00FF00FF;

ALPHA_BLEND_RGBA_4(src32, dst32, dstfmt->Ashift);
Uint32 resGA = ((srcGA - dstGA) * srcA) + (dstGA << 8) - dstGA;
resGA += 0x00010001;
resGA += (resGA >> 8) & 0x00FF00FF;
resGA &= 0xFF00FF00;
dst32 = resRB | resGA;

*(Uint32 *)dst = dst32;

Expand Down
113 changes: 60 additions & 53 deletions src/video/SDL_blit_A_sse4_1.c
Original file line number Diff line number Diff line change
Expand Up @@ -6,48 +6,6 @@

#include "SDL_blit.h"

// Using the SSE4.1 instruction set, blit eight pixels into four with alpha blending
SDL_FORCE_INLINE __m128i SDL_TARGETING("sse4.1") MixRGBA_SSE4_1(
__m128i src, __m128i dst,
const __m128i alpha_shuffle, const __m128i alpha_saturate)
{
// SIMD implementation of blend_mul2.
// dstRGB = (srcRGB * srcA) + (dstRGB * (1-srcA))
// dstA = srcA + (dstA * (1-srcA)) = (1 * srcA) + (dstA * (1-srcA))

// Splat the alpha into all channels for each pixel
__m128i srca = _mm_shuffle_epi8(src, alpha_shuffle);

// Set the alpha channels of src to 255
src = _mm_or_si128(src, alpha_saturate);

__m128i src_lo = _mm_unpacklo_epi8(src, _mm_setzero_si128());
__m128i src_hi = _mm_unpackhi_epi8(src, _mm_setzero_si128());

__m128i dst_lo = _mm_unpacklo_epi8(dst, _mm_setzero_si128());
__m128i dst_hi = _mm_unpackhi_epi8(dst, _mm_setzero_si128());

__m128i srca_lo = _mm_unpacklo_epi8(srca, _mm_setzero_si128());
__m128i srca_hi = _mm_unpackhi_epi8(srca, _mm_setzero_si128());

// dst = ((src - dst) * srcA) + ((dst << 8) - dst)
dst_lo = _mm_add_epi16(_mm_mullo_epi16(_mm_sub_epi16(src_lo, dst_lo), srca_lo),
_mm_sub_epi16(_mm_slli_epi16(dst_lo, 8), dst_lo));
dst_hi = _mm_add_epi16(_mm_mullo_epi16(_mm_sub_epi16(src_hi, dst_hi), srca_hi),
_mm_sub_epi16(_mm_slli_epi16(dst_hi, 8), dst_hi));

// dst += 0x1U (use 0x80 to round instead of floor)
dst_lo = _mm_add_epi16(dst_lo, _mm_set1_epi16(1));
dst_hi = _mm_add_epi16(dst_hi, _mm_set1_epi16(1));

// dst += dst >> 8
dst_lo = _mm_srli_epi16(_mm_add_epi16(dst_lo, _mm_srli_epi16(dst_lo, 8)), 8);
dst_hi = _mm_srli_epi16(_mm_add_epi16(dst_hi, _mm_srli_epi16(dst_hi, 8)), 8);

dst = _mm_packus_epi16(dst_lo, dst_hi);
return dst;
}

void SDL_TARGETING("sse4.1") BlitNtoNPixelAlpha_SSE4_1(SDL_BlitInfo *info)
{
int width = info->dst_w;
Expand All @@ -59,32 +17,64 @@ void SDL_TARGETING("sse4.1") BlitNtoNPixelAlpha_SSE4_1(SDL_BlitInfo *info)
SDL_PixelFormat *srcfmt = info->src_fmt;
SDL_PixelFormat *dstfmt = info->dst_fmt;

// The byte offsets for the start of each pixel
const __m128i mask_offsets = _mm_set_epi8(
12, 12, 12, 12, 8, 8, 8, 8, 4, 4, 4, 4, 0, 0, 0, 0);

const __m128i shift_mask = _mm_add_epi32(
const __m128i convert_mask = _mm_add_epi32(
_mm_set1_epi32(
((srcfmt->Rshift >> 3) << dstfmt->Rshift) |
((srcfmt->Gshift >> 3) << dstfmt->Gshift) |
((srcfmt->Bshift >> 3) << dstfmt->Bshift) |
((srcfmt->Ashift >> 3) << dstfmt->Ashift)),
((srcfmt->Bshift >> 3) << dstfmt->Bshift)),
mask_offsets);

const __m128i splat_mask = _mm_add_epi8(_mm_set1_epi8(dstfmt->Ashift >> 3), mask_offsets);
const __m128i saturate_mask = _mm_set1_epi32((int)dstfmt->Amask);
const __m128i alpha_splat_mask = _mm_add_epi8(_mm_set1_epi8(srcfmt->Ashift >> 3), mask_offsets);
const __m128i alpha_fill_mask = _mm_set1_epi32((int)dstfmt->Amask);

while (height--) {
int i = 0;

for (; i + 4 <= width; i += 4) {
// Load 4 src pixels and shuffle into the dst format
__m128i c_src = _mm_shuffle_epi8(_mm_loadu_si128((__m128i *)src), shift_mask);
// Load 4 src pixels
__m128i src128 = _mm_loadu_si128((__m128i *)src);

// Load 4 dst pixels
__m128i c_dst = _mm_loadu_si128((__m128i *)dst);
__m128i dst128 = _mm_loadu_si128((__m128i *)dst);

// Extract the alpha from each pixel and splat it into all the channels
__m128i srcA = _mm_shuffle_epi8(src128, alpha_splat_mask);

// Convert to dst format
src128 = _mm_shuffle_epi8(src128, convert_mask);

// Set the alpha channels of src to 255
src128 = _mm_or_si128(src128, alpha_fill_mask);

__m128i src_lo = _mm_unpacklo_epi8(src128, _mm_setzero_si128());
__m128i src_hi = _mm_unpackhi_epi8(src128, _mm_setzero_si128());

__m128i dst_lo = _mm_unpacklo_epi8(dst128, _mm_setzero_si128());
__m128i dst_hi = _mm_unpackhi_epi8(dst128, _mm_setzero_si128());

__m128i srca_lo = _mm_unpacklo_epi8(srcA, _mm_setzero_si128());
__m128i srca_hi = _mm_unpackhi_epi8(srcA, _mm_setzero_si128());

// dst = ((src - dst) * srcA) + ((dst << 8) - dst)
dst_lo = _mm_add_epi16(_mm_mullo_epi16(_mm_sub_epi16(src_lo, dst_lo), srca_lo),
_mm_sub_epi16(_mm_slli_epi16(dst_lo, 8), dst_lo));
dst_hi = _mm_add_epi16(_mm_mullo_epi16(_mm_sub_epi16(src_hi, dst_hi), srca_hi),
_mm_sub_epi16(_mm_slli_epi16(dst_hi, 8), dst_hi));

// dst += 0x1U (use 0x80 to round instead of floor)
dst_lo = _mm_add_epi16(dst_lo, _mm_set1_epi16(1));
dst_hi = _mm_add_epi16(dst_hi, _mm_set1_epi16(1));

// dst += dst >> 8
dst_lo = _mm_srli_epi16(_mm_add_epi16(dst_lo, _mm_srli_epi16(dst_lo, 8)), 8);
dst_hi = _mm_srli_epi16(_mm_add_epi16(dst_hi, _mm_srli_epi16(dst_hi, 8)), 8);

// Blend the pixels together and save the result
_mm_storeu_si128((__m128i *)dst, MixRGBA_SSE4_1(c_src, c_dst, splat_mask, saturate_mask));
_mm_storeu_si128((__m128i *)dst, _mm_packus_epi16(dst_lo, dst_hi));

src += 16;
dst += 16;
Expand All @@ -94,12 +84,29 @@ void SDL_TARGETING("sse4.1") BlitNtoNPixelAlpha_SSE4_1(SDL_BlitInfo *info)
Uint32 src32 = *(Uint32 *)src;
Uint32 dst32 = *(Uint32 *)dst;

Uint32 srcA = (src32 >> srcfmt->Ashift) & 0xFF;

src32 = (((src32 >> srcfmt->Rshift) & 0xFF) << dstfmt->Rshift) |
(((src32 >> srcfmt->Gshift) & 0xFF) << dstfmt->Gshift) |
(((src32 >> srcfmt->Bshift) & 0xFF) << dstfmt->Bshift) |
(((src32 >> srcfmt->Ashift) & 0xFF) << dstfmt->Ashift);
dstfmt->Amask;

Uint32 srcRB = src32 & 0x00FF00FF;
Uint32 dstRB = dst32 & 0x00FF00FF;

Uint32 srcGA = (src32 >> 8) & 0x00FF00FF;
Uint32 dstGA = (dst32 >> 8) & 0x00FF00FF;

Uint32 resRB = ((srcRB - dstRB) * srcA) + (dstRB << 8) - dstRB;
resRB += 0x00010001;
resRB += (resRB >> 8) & 0x00FF00FF;
resRB = (resRB >> 8) & 0x00FF00FF;

ALPHA_BLEND_RGBA_4(src32, dst32, dstfmt->Ashift);
Uint32 resGA = ((srcGA - dstGA) * srcA) + (dstGA << 8) - dstGA;
resGA += 0x00010001;
resGA += (resGA >> 8) & 0x00FF00FF;
resGA &= 0xFF00FF00;
dst32 = resRB | resGA;

*(Uint32 *)dst = dst32;

Expand Down

0 comments on commit ccfe484

Please sign in to comment.