diff --git a/libretro-common/gfx/scaler/scaler_int.c b/libretro-common/gfx/scaler/scaler_int.c index 5e72f81ec1..c82ad3a6b4 100644 --- a/libretro-common/gfx/scaler/scaler_int.c +++ b/libretro-common/gfx/scaler/scaler_int.c @@ -53,12 +53,11 @@ * The C version of scalers perform the exact same operations as the SIMD code for testing purposes. */ -#if defined(__SSE2__) void scaler_argb8888_vert(const struct scaler_ctx *ctx, void *output_, int stride) { int h, w, y; - const uint64_t *input = ctx->scaled.frame; - uint32_t *output = (uint32_t*)output_; + const uint64_t *input = ctx->scaled.frame; + uint32_t *output = (uint32_t*)output_; const int16_t *filter_vert = ctx->vert.filter; @@ -68,11 +67,11 @@ void scaler_argb8888_vert(const struct scaler_ctx *ctx, void *output_, int strid for (w = 0; w < ctx->out_width; w++) { + const uint64_t *input_base_y = input_base + w; +#if defined(__SSE2__) __m128i final; __m128i res = _mm_setzero_si128(); - const uint64_t *input_base_y = input_base + w; - for (y = 0; (y + 1) < ctx->vert.filter_len; y += 2, input_base_y += (ctx->scaled.stride >> 2)) { __m128i coeff = _mm_set_epi64x(filter_vert[y + 1] * 0x0001000100010001ll, filter_vert[y + 0] * 0x0001000100010001ll); @@ -95,30 +94,12 @@ void scaler_argb8888_vert(const struct scaler_ctx *ctx, void *output_, int strid final = _mm_packus_epi16(res, res); output[w] = _mm_cvtsi128_si32(final); - } - } -} #else -void scaler_argb8888_vert(const struct scaler_ctx *ctx, void *output_, int stride) -{ - int h, w, y; - const uint64_t *input = ctx->scaled.frame; - uint32_t *output = (uint32_t*)output_; - - const int16_t *filter_vert = ctx->vert.filter; - - for (h = 0; h < ctx->out_height; h++, filter_vert += ctx->vert.filter_stride, output += stride >> 2) - { - const uint64_t *input_base = input + ctx->vert.filter_pos[h] * (ctx->scaled.stride >> 3); - - for (w = 0; w < ctx->out_width; w++) - { int16_t res_a = 0; int16_t res_r = 0; int16_t res_g = 0; int16_t res_b = 0; - const uint64_t *input_base_y = input_base + w; for (y = 0; y < ctx->vert.filter_len; y++, input_base_y += (ctx->scaled.stride >> 3)) { uint64_t col = *input_base_y; @@ -141,17 +122,24 @@ void scaler_argb8888_vert(const struct scaler_ctx *ctx, void *output_, int strid res_g >>= (7 - 2 - 2); res_b >>= (7 - 2 - 2); - output[w] = (clamp_8bit(res_a) << 24) | (clamp_8bit(res_r) << 16) | (clamp_8bit(res_g) << 8) | (clamp_8bit(res_b) << 0); + output[w] = (clamp_8bit(res_a) << 24) | (clamp_8bit(res_r) << 16) | + (clamp_8bit(res_g) << 8) | (clamp_8bit(res_b) << 0); +#endif } } } + +#if !defined(__SSE2__) +static INLINE uint64_t build_argb64(uint16_t a, uint16_t r, uint16_t g, uint16_t b) +{ + return ((uint64_t)a << 48) | ((uint64_t)r << 32) | ((uint64_t)g << 16) | ((uint64_t)b << 0); +} #endif -#if defined(__SSE2__) void scaler_argb8888_horiz(const struct scaler_ctx *ctx, const void *input_, int stride) { int h, w, x; - const uint32_t *input = (const uint32_t*)input_; + const uint32_t *input = (uint32_t*)input_; uint64_t *output = ctx->scaled.frame; for (h = 0; h < ctx->scaled.height; h++, input += stride >> 2, output += ctx->scaled.stride >> 3) @@ -160,9 +148,9 @@ void scaler_argb8888_horiz(const struct scaler_ctx *ctx, const void *input_, int for (w = 0; w < ctx->scaled.width; w++, filter_horiz += ctx->horiz.filter_stride) { - __m128i res = _mm_setzero_si128(); - const uint32_t *input_base_x = input + ctx->horiz.filter_pos[w]; +#if defined(__SSE2__) + __m128i res = _mm_setzero_si128(); for (x = 0; (x + 1) < ctx->horiz.filter_len; x += 2) { @@ -194,33 +182,11 @@ void scaler_argb8888_horiz(const struct scaler_ctx *ctx, const void *input_, int uint32_t *u32; uint64_t *u64; } u; - u.u64 = output + w; + u.u64 = output + w; u.u32[0] = _mm_cvtsi128_si32(res); u.u32[1] = _mm_cvtsi128_si32(_mm_srli_si128(res, 4)); #endif - } - } -} #else -static INLINE uint64_t build_argb64(uint16_t a, uint16_t r, uint16_t g, uint16_t b) -{ - return ((uint64_t)a << 48) | ((uint64_t)r << 32) | ((uint64_t)g << 16) | ((uint64_t)b << 0); -} - -void scaler_argb8888_horiz(const struct scaler_ctx *ctx, const void *input_, int stride) -{ - int h, w, x; - const uint32_t *input = (uint32_t*)input_; - uint64_t *output = ctx->scaled.frame; - - for (h = 0; h < ctx->scaled.height; h++, input += stride >> 2, output += ctx->scaled.stride >> 3) - { - const int16_t *filter_horiz = ctx->horiz.filter; - - for (w = 0; w < ctx->scaled.width; w++, filter_horiz += ctx->horiz.filter_stride) - { - const uint32_t *input_base_x = input + ctx->horiz.filter_pos[w]; - int16_t res_a = 0; int16_t res_r = 0; int16_t res_g = 0; @@ -244,10 +210,10 @@ void scaler_argb8888_horiz(const struct scaler_ctx *ctx, const void *input_, int } output[w] = build_argb64(res_a, res_r, res_g, res_b); +#endif } } } -#endif void scaler_argb8888_point_special(const struct scaler_ctx *ctx, void *output_, const void *input_, @@ -256,26 +222,21 @@ void scaler_argb8888_point_special(const struct scaler_ctx *ctx, int out_stride, int in_stride) { int h, w; - const uint32_t *input = NULL; - uint32_t *output = NULL; - int x_pos = (1 << 15) * in_width / out_width - (1 << 15); - int x_step = (1 << 16) * in_width / out_width; - int y_pos = (1 << 15) * in_height / out_height - (1 << 15); - int y_step = (1 << 16) * in_height / out_height; - - (void)ctx; + int x_pos = (1 << 15) * in_width / out_width - (1 << 15); + int x_step = (1 << 16) * in_width / out_width; + int y_pos = (1 << 15) * in_height / out_height - (1 << 15); + int y_step = (1 << 16) * in_height / out_height; + const uint32_t *input = (const uint32_t*)input_; + uint32_t *output = (uint32_t*)output_; if (x_pos < 0) x_pos = 0; if (y_pos < 0) y_pos = 0; - input = (const uint32_t*)input_; - output = (uint32_t*)output_; - for (h = 0; h < out_height; h++, y_pos += y_step, output += out_stride >> 2) { - int x = x_pos; + int x = x_pos; const uint32_t *inp = input + (y_pos >> 16) * (in_stride >> 2); for (w = 0; w < out_width; w++, x += x_step)