cmake: Allow build system to disable loongarch intrinsics

This commit is contained in:
Anonymous Maarten 2023-02-26 00:35:57 +01:00 committed by Anonymous Maarten
parent 99b8313eb1
commit 46de6241d7
6 changed files with 52 additions and 36 deletions

View File

@ -1020,6 +1020,14 @@ if(NOT HAVE_SSE3)
set(SDL_DISABLE_SSE3 1) set(SDL_DISABLE_SSE3 1)
endif() endif()
if(NOT HAVE_LSX)
set(SDL_DISABLE_LSX 1)
endif()
if(NOT HAVE_LASX)
set(SDL_DISABLE_LASX 1)
endif()
# TODO: Can't deactivate on FreeBSD? w/o LIBC, SDL_stdinc.h can't define # TODO: Can't deactivate on FreeBSD? w/o LIBC, SDL_stdinc.h can't define
# anything. # anything.
if(SDL_LIBC) if(SDL_LIBC)

View File

@ -93,13 +93,11 @@ _m_prefetch(void *__P)
#endif #endif
#endif /* compiler version */ #endif /* compiler version */
#if defined(__loongarch_sx) && !defined(SDL_DISABLE_LSX_H) #if defined(__loongarch_sx) && !defined(SDL_DISABLE_LSX)
#include <lsxintrin.h> #include <lsxintrin.h>
#define __LSX__
#endif #endif
#if defined(__loongarch_asx) && !defined(SDL_DISABLE_LASX_H) #if defined(__loongarch_asx) && !defined(SDL_DISABLE_LASX)
#include <lasxintrin.h> #include <lasxintrin.h>
#define __LASX__
#endif #endif
#if defined(__AVX__) && !defined(SDL_DISABLE_AVX) #if defined(__AVX__) && !defined(SDL_DISABLE_AVX)
#include <immintrin.h> #include <immintrin.h>

View File

@ -592,5 +592,7 @@ typedef unsigned int uintptr_t;
#cmakedefine SDL_DISABLE_SSE3 1 #cmakedefine SDL_DISABLE_SSE3 1
#cmakedefine SDL_DISABLE_AVX 1 #cmakedefine SDL_DISABLE_AVX 1
#cmakedefine SDL_DISABLE_MMX 1 #cmakedefine SDL_DISABLE_MMX 1
#cmakedefine SDL_DISABLE_LSX 1
#cmakedefine SDL_DISABLE_LASX 1
#endif /* SDL_build_config_h_ */ #endif /* SDL_build_config_h_ */

View File

@ -211,6 +211,14 @@
#define HAVE_AVX_INTRINSICS 1 #define HAVE_AVX_INTRINSICS 1
#endif #endif
#if defined(__loongarch_sx) && !defined(SDL_DISABLE_LSX)
#define HAVE_LSX_INTRINSICS 1
#endif
#if defined(__loongarch_asx) && !defined(SDL_DISABLE_LASX)
#define HAVE_LASX_INTRINSICS 1
#endif
#if defined __clang__ #if defined __clang__
#if (!__has_attribute(target)) #if (!__has_attribute(target))
#undef HAVE_AVX_INTRINSICS #undef HAVE_AVX_INTRINSICS

View File

@ -419,7 +419,7 @@ static SDL_bool yuv_rgb_lsx(
Uint8 *rgb, Uint32 rgb_stride, Uint8 *rgb, Uint32 rgb_stride,
YCbCrType yuv_type) YCbCrType yuv_type)
{ {
#ifdef __loongarch_sx #if HAVE_LSX_INTRINSICS
if (!SDL_HasLSX()) { if (!SDL_HasLSX()) {
return SDL_FALSE; return SDL_FALSE;
} }

View File

@ -73,7 +73,7 @@ static const RGB2YUVParam RGB2YUV[3] = {
// input must be in the [-128*PRECISION_FACTOR:384*PRECISION_FACTOR] range // input must be in the [-128*PRECISION_FACTOR:384*PRECISION_FACTOR] range
static uint8_t clampU8(int32_t v) static uint8_t clampU8(int32_t v)
{ {
static const uint8_t lut[512] = static const uint8_t lut[512] =
{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46, 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,
@ -183,52 +183,52 @@ static uint8_t clampU8(int32_t v)
#include "yuv_rgb_std_func.h" #include "yuv_rgb_std_func.h"
void rgb24_yuv420_std( void rgb24_yuv420_std(
uint32_t width, uint32_t height, uint32_t width, uint32_t height,
const uint8_t *RGB, uint32_t RGB_stride, const uint8_t *RGB, uint32_t RGB_stride,
uint8_t *Y, uint8_t *U, uint8_t *V, uint32_t Y_stride, uint32_t UV_stride, uint8_t *Y, uint8_t *U, uint8_t *V, uint32_t Y_stride, uint32_t UV_stride,
YCbCrType yuv_type) YCbCrType yuv_type)
{ {
const RGB2YUVParam *const param = &(RGB2YUV[yuv_type]); const RGB2YUVParam *const param = &(RGB2YUV[yuv_type]);
uint32_t x, y; uint32_t x, y;
for(y=0; y<(height-1); y+=2) for(y=0; y<(height-1); y+=2)
{ {
const uint8_t *rgb_ptr1=RGB+y*RGB_stride, const uint8_t *rgb_ptr1=RGB+y*RGB_stride,
*rgb_ptr2=RGB+(y+1)*RGB_stride; *rgb_ptr2=RGB+(y+1)*RGB_stride;
uint8_t *y_ptr1=Y+y*Y_stride, uint8_t *y_ptr1=Y+y*Y_stride,
*y_ptr2=Y+(y+1)*Y_stride, *y_ptr2=Y+(y+1)*Y_stride,
*u_ptr=U+(y/2)*UV_stride, *u_ptr=U+(y/2)*UV_stride,
*v_ptr=V+(y/2)*UV_stride; *v_ptr=V+(y/2)*UV_stride;
for(x=0; x<(width-1); x+=2) for(x=0; x<(width-1); x+=2)
{ {
// compute yuv for the four pixels, u and v values are summed // compute yuv for the four pixels, u and v values are summed
int32_t y_tmp, u_tmp, v_tmp; int32_t y_tmp, u_tmp, v_tmp;
y_tmp = param->matrix[0][0]*rgb_ptr1[0] + param->matrix[0][1]*rgb_ptr1[1] + param->matrix[0][2]*rgb_ptr1[2]; y_tmp = param->matrix[0][0]*rgb_ptr1[0] + param->matrix[0][1]*rgb_ptr1[1] + param->matrix[0][2]*rgb_ptr1[2];
u_tmp = param->matrix[1][0]*rgb_ptr1[0] + param->matrix[1][1]*rgb_ptr1[1] + param->matrix[1][2]*rgb_ptr1[2]; u_tmp = param->matrix[1][0]*rgb_ptr1[0] + param->matrix[1][1]*rgb_ptr1[1] + param->matrix[1][2]*rgb_ptr1[2];
v_tmp = param->matrix[2][0]*rgb_ptr1[0] + param->matrix[2][1]*rgb_ptr1[1] + param->matrix[2][2]*rgb_ptr1[2]; v_tmp = param->matrix[2][0]*rgb_ptr1[0] + param->matrix[2][1]*rgb_ptr1[1] + param->matrix[2][2]*rgb_ptr1[2];
y_ptr1[0]=clampU8(y_tmp+((param->y_shift)<<PRECISION)); y_ptr1[0]=clampU8(y_tmp+((param->y_shift)<<PRECISION));
y_tmp = param->matrix[0][0]*rgb_ptr1[3] + param->matrix[0][1]*rgb_ptr1[4] + param->matrix[0][2]*rgb_ptr1[5]; y_tmp = param->matrix[0][0]*rgb_ptr1[3] + param->matrix[0][1]*rgb_ptr1[4] + param->matrix[0][2]*rgb_ptr1[5];
u_tmp += param->matrix[1][0]*rgb_ptr1[3] + param->matrix[1][1]*rgb_ptr1[4] + param->matrix[1][2]*rgb_ptr1[5]; u_tmp += param->matrix[1][0]*rgb_ptr1[3] + param->matrix[1][1]*rgb_ptr1[4] + param->matrix[1][2]*rgb_ptr1[5];
v_tmp += param->matrix[2][0]*rgb_ptr1[3] + param->matrix[2][1]*rgb_ptr1[4] + param->matrix[2][2]*rgb_ptr1[5]; v_tmp += param->matrix[2][0]*rgb_ptr1[3] + param->matrix[2][1]*rgb_ptr1[4] + param->matrix[2][2]*rgb_ptr1[5];
y_ptr1[1]=clampU8(y_tmp+((param->y_shift)<<PRECISION)); y_ptr1[1]=clampU8(y_tmp+((param->y_shift)<<PRECISION));
y_tmp = param->matrix[0][0]*rgb_ptr2[0] + param->matrix[0][1]*rgb_ptr2[1] + param->matrix[0][2]*rgb_ptr2[2]; y_tmp = param->matrix[0][0]*rgb_ptr2[0] + param->matrix[0][1]*rgb_ptr2[1] + param->matrix[0][2]*rgb_ptr2[2];
u_tmp += param->matrix[1][0]*rgb_ptr2[0] + param->matrix[1][1]*rgb_ptr2[1] + param->matrix[1][2]*rgb_ptr2[2]; u_tmp += param->matrix[1][0]*rgb_ptr2[0] + param->matrix[1][1]*rgb_ptr2[1] + param->matrix[1][2]*rgb_ptr2[2];
v_tmp += param->matrix[2][0]*rgb_ptr2[0] + param->matrix[2][1]*rgb_ptr2[1] + param->matrix[2][2]*rgb_ptr2[2]; v_tmp += param->matrix[2][0]*rgb_ptr2[0] + param->matrix[2][1]*rgb_ptr2[1] + param->matrix[2][2]*rgb_ptr2[2];
y_ptr2[0]=clampU8(y_tmp+((param->y_shift)<<PRECISION)); y_ptr2[0]=clampU8(y_tmp+((param->y_shift)<<PRECISION));
y_tmp = param->matrix[0][0]*rgb_ptr2[3] + param->matrix[0][1]*rgb_ptr2[4] + param->matrix[0][2]*rgb_ptr2[5]; y_tmp = param->matrix[0][0]*rgb_ptr2[3] + param->matrix[0][1]*rgb_ptr2[4] + param->matrix[0][2]*rgb_ptr2[5];
u_tmp += param->matrix[1][0]*rgb_ptr2[3] + param->matrix[1][1]*rgb_ptr2[4] + param->matrix[1][2]*rgb_ptr2[5]; u_tmp += param->matrix[1][0]*rgb_ptr2[3] + param->matrix[1][1]*rgb_ptr2[4] + param->matrix[1][2]*rgb_ptr2[5];
v_tmp += param->matrix[2][0]*rgb_ptr2[3] + param->matrix[2][1]*rgb_ptr2[4] + param->matrix[2][2]*rgb_ptr2[5]; v_tmp += param->matrix[2][0]*rgb_ptr2[3] + param->matrix[2][1]*rgb_ptr2[4] + param->matrix[2][2]*rgb_ptr2[5];
y_ptr2[1]=clampU8(y_tmp+((param->y_shift)<<PRECISION)); y_ptr2[1]=clampU8(y_tmp+((param->y_shift)<<PRECISION));
u_ptr[0] = clampU8(u_tmp/4+(128<<PRECISION)); u_ptr[0] = clampU8(u_tmp/4+(128<<PRECISION));
v_ptr[0] = clampU8(v_tmp/4+(128<<PRECISION)); v_ptr[0] = clampU8(v_tmp/4+(128<<PRECISION));
rgb_ptr1 += 6; rgb_ptr1 += 6;
rgb_ptr2 += 6; rgb_ptr2 += 6;
y_ptr1 += 2; y_ptr1 += 2;
@ -609,35 +609,35 @@ V = _mm_srai_epi16(V, PRECISION);
SAVE_SI128((__m128i*)(u_ptr), u1); \ SAVE_SI128((__m128i*)(u_ptr), u1); \
SAVE_SI128((__m128i*)(v_ptr), v1); SAVE_SI128((__m128i*)(v_ptr), v1);
void rgb24_yuv420_sse(uint32_t width, uint32_t height, void rgb24_yuv420_sse(uint32_t width, uint32_t height,
const uint8_t *RGB, uint32_t RGB_stride, const uint8_t *RGB, uint32_t RGB_stride,
uint8_t *Y, uint8_t *U, uint8_t *V, uint32_t Y_stride, uint32_t UV_stride, uint8_t *Y, uint8_t *U, uint8_t *V, uint32_t Y_stride, uint32_t UV_stride,
YCbCrType yuv_type) YCbCrType yuv_type)
{ {
#define LOAD_SI128 _mm_load_si128 #define LOAD_SI128 _mm_load_si128
#define SAVE_SI128 _mm_stream_si128 #define SAVE_SI128 _mm_stream_si128
const RGB2YUVParam *const param = &(RGB2YUV[yuv_type]); const RGB2YUVParam *const param = &(RGB2YUV[yuv_type]);
uint32_t xpos, ypos; uint32_t xpos, ypos;
for(ypos=0; ypos<(height-1); ypos+=2) for(ypos=0; ypos<(height-1); ypos+=2)
{ {
const uint8_t *rgb_ptr1=RGB+ypos*RGB_stride, const uint8_t *rgb_ptr1=RGB+ypos*RGB_stride,
*rgb_ptr2=RGB+(ypos+1)*RGB_stride; *rgb_ptr2=RGB+(ypos+1)*RGB_stride;
uint8_t *y_ptr1=Y+ypos*Y_stride, uint8_t *y_ptr1=Y+ypos*Y_stride,
*y_ptr2=Y+(ypos+1)*Y_stride, *y_ptr2=Y+(ypos+1)*Y_stride,
*u_ptr=U+(ypos/2)*UV_stride, *u_ptr=U+(ypos/2)*UV_stride,
*v_ptr=V+(ypos/2)*UV_stride; *v_ptr=V+(ypos/2)*UV_stride;
for(xpos=0; xpos<(width-31); xpos+=32) for(xpos=0; xpos<(width-31); xpos+=32)
{ {
RGB2YUV_32 RGB2YUV_32
rgb_ptr1+=96; rgb_ptr1+=96;
rgb_ptr2+=96; rgb_ptr2+=96;
y_ptr1+=32; y_ptr1+=32;
y_ptr2+=32; y_ptr2+=32;
u_ptr+=16; u_ptr+=16;
v_ptr+=16; v_ptr+=16;
} }
} }
@ -645,35 +645,35 @@ void rgb24_yuv420_sse(uint32_t width, uint32_t height,
#undef SAVE_SI128 #undef SAVE_SI128
} }
void rgb24_yuv420_sseu(uint32_t width, uint32_t height, void rgb24_yuv420_sseu(uint32_t width, uint32_t height,
const uint8_t *RGB, uint32_t RGB_stride, const uint8_t *RGB, uint32_t RGB_stride,
uint8_t *Y, uint8_t *U, uint8_t *V, uint32_t Y_stride, uint32_t UV_stride, uint8_t *Y, uint8_t *U, uint8_t *V, uint32_t Y_stride, uint32_t UV_stride,
YCbCrType yuv_type) YCbCrType yuv_type)
{ {
#define LOAD_SI128 _mm_loadu_si128 #define LOAD_SI128 _mm_loadu_si128
#define SAVE_SI128 _mm_storeu_si128 #define SAVE_SI128 _mm_storeu_si128
const RGB2YUVParam *const param = &(RGB2YUV[yuv_type]); const RGB2YUVParam *const param = &(RGB2YUV[yuv_type]);
uint32_t xpos, ypos; uint32_t xpos, ypos;
for(ypos=0; ypos<(height-1); ypos+=2) for(ypos=0; ypos<(height-1); ypos+=2)
{ {
const uint8_t *rgb_ptr1=RGB+ypos*RGB_stride, const uint8_t *rgb_ptr1=RGB+ypos*RGB_stride,
*rgb_ptr2=RGB+(ypos+1)*RGB_stride; *rgb_ptr2=RGB+(ypos+1)*RGB_stride;
uint8_t *y_ptr1=Y+ypos*Y_stride, uint8_t *y_ptr1=Y+ypos*Y_stride,
*y_ptr2=Y+(ypos+1)*Y_stride, *y_ptr2=Y+(ypos+1)*Y_stride,
*u_ptr=U+(ypos/2)*UV_stride, *u_ptr=U+(ypos/2)*UV_stride,
*v_ptr=V+(ypos/2)*UV_stride; *v_ptr=V+(ypos/2)*UV_stride;
for(xpos=0; xpos<(width-31); xpos+=32) for(xpos=0; xpos<(width-31); xpos+=32)
{ {
RGB2YUV_32 RGB2YUV_32
rgb_ptr1+=96; rgb_ptr1+=96;
rgb_ptr2+=96; rgb_ptr2+=96;
y_ptr1+=32; y_ptr1+=32;
y_ptr2+=32; y_ptr2+=32;
u_ptr+=16; u_ptr+=16;
v_ptr+=16; v_ptr+=16;
} }
} }
@ -684,7 +684,7 @@ void rgb24_yuv420_sseu(uint32_t width, uint32_t height,
#endif //HAVE_SSE2_INTRINSICS #endif //HAVE_SSE2_INTRINSICS
#ifdef __loongarch_sx #if HAVE_LSX_INTRINSICS
#define LSX_FUNCTION_NAME yuv420_rgb24_lsx #define LSX_FUNCTION_NAME yuv420_rgb24_lsx
#define STD_FUNCTION_NAME yuv420_rgb24_std #define STD_FUNCTION_NAME yuv420_rgb24_std
@ -716,6 +716,6 @@ void rgb24_yuv420_sseu(uint32_t width, uint32_t height,
#define RGB_FORMAT RGB_FORMAT_ABGR #define RGB_FORMAT RGB_FORMAT_ABGR
#include "yuv_rgb_lsx_func.h" #include "yuv_rgb_lsx_func.h"
#endif //__loongarch_sx #endif //HAVE_LSX_INTRINSICS
#endif /* SDL_HAVE_YUV */ #endif /* SDL_HAVE_YUV */