mirror of https://github.com/snes9xgit/snes9x.git
164 lines
6.1 KiB
C++
164 lines
6.1 KiB
C++
/*****************************************************************************\
     Snes9x - Portable Super Nintendo Entertainment System (TM) emulator.
                This file is licensed under the Snes9x License.
   For further information, consult the LICENSE file in the root directory.
\*****************************************************************************/
|
||
|
||
#include <cstdint>
|
||
#include <cmath>
|
||
#include <algorithm>
|
||
|
||
// Limit x to the closed range [lo, hi]: yields lo when x < lo, hi when
// x > hi, and x otherwise. Despite the name the macro is type-agnostic.
// Caution: each argument may be evaluated more than once, so only pass
// expressions without side effects.
#define CLAMP_U8(x, lo, hi) \
    (((x) < (lo)) ? (lo) : (((x) > (hi)) ? (hi) : (x)))
|
||
|
||
// ---- Gamma tables (unchanged behavior) --------------------------------------
|
||
|
||
// Lookup tables filled in by init_gamma_tables().

// Indexed by a 5-bit channel value (used for red and, by reuse, blue).
static uint8_t gamma_r_encode[1 << 5];

// Indexed by a 6-bit channel value (green).
static uint8_t gamma_g_encode[1 << 6];

// Indexed by an 8-bit channel value.
static uint8_t gamma_decode[1 << 8];
|
||
|
||
static void init_gamma_tables()
|
||
{
|
||
constexpr float gamma = 1.6f;
|
||
constexpr float inv_gamma = 1.0f / gamma;
|
||
|
||
for (int i = 0; i < 32; ++i)
|
||
gamma_r_encode[i] = uint8_t(CLAMP_U8(int(std::pow((i << 3) / 255.0f, gamma) * 255.0f + 0.5f), 0, 255));
|
||
for (int i = 0; i < 64; ++i)
|
||
gamma_g_encode[i] = uint8_t(CLAMP_U8(int(std::pow((i << 2) / 255.0f, gamma) * 255.0f + 0.5f), 0, 255));
|
||
for (int i = 0; i < 256; ++i)
|
||
gamma_decode[i] = uint8_t(CLAMP_U8(int(std::pow(i / 255.0f, inv_gamma) * 255.0f + 0.5f), 0, 255));
|
||
}
|
||
|
||
// ---- RGB565 helpers ---------------------------------------------------------
|
||
|
||
// Pack 8-bit r, g, b channel values into one 16-bit RGB565 word.
// The top 5 bits of r go to bits 15..11, the top 6 bits of g to bits 10..5,
// and b >> 3 to the low bits. b is not masked, so values outside 0..255 would
// spill into the green field.
static inline uint16_t build_rgb565_fast(int r, int g, int b)
{
    const int red_bits = (r & 0xF8) << 8;
    const int green_bits = (g & 0xFC) << 3;
    const int blue_bits = b >> 3;

    return (uint16_t)(red_bits | green_bits | blue_bits);
}
|
||
|
||
static inline void unpack_rgb565_gamma(const uint8_t* src, int pitch, int x, int y, int& r, int& g, int& b)
|
||
{
|
||
const uint8_t* pixel = src + y * pitch + x * 2;
|
||
const uint16_t color = uint16_t(pixel[0]) | (uint16_t(pixel[1]) << 8);
|
||
|
||
const int r5 = (color >> 11) & 0x1F;
|
||
const int g6 = (color >> 5) & 0x3F;
|
||
const int b5 = color & 0x1F;
|
||
|
||
r = gamma_r_encode[r5];
|
||
g = gamma_g_encode[g6];
|
||
b = gamma_r_encode[b5]; // reuse red gamma table for blue
|
||
}
|
||
|
||
// ---- Fixed-point smoothstep weights for the four sub-pixel positions --------
// smoothstep(0,1,x) = 3x^2 - 2x^3 evaluated at x = 0, 1/4, 1/2, 3/4 gives
// 0, 5/32, 1/2, 27/32. The tables hold those values multiplied by 256, so a
// weighted sum of two samples is renormalized with a right shift by 8.
static constexpr uint16_t W[4]  = { 0u, 40u, 128u, 216u };                               // w
static constexpr uint16_t IW[4] = { 256u - 0u, 256u - 40u, 256u - 128u, 256u - 216u };  // 256 - w
|
||
|
||
void ApplySharpBilinear4x(uint8_t* __restrict dst, int dst_pitch,
|
||
const uint8_t* __restrict src,
|
||
int src_width, int src_height, int src_pitch)
|
||
{
|
||
const int dst_width = src_width << 2; // *4
|
||
const int dst_height = src_height << 2; // *4
|
||
|
||
static bool gamma_ready = false;
|
||
if (!gamma_ready)
|
||
{
|
||
init_gamma_tables();
|
||
gamma_ready = true;
|
||
}
|
||
|
||
// Iterate over source texels; each emits a 4×4 block in the destination.
|
||
for (int sy = 0; sy < src_height; ++sy)
|
||
{
|
||
// Clamp source rows to avoid reading past the bottom edge.
|
||
const int sy0 = (sy < src_height - 1) ? sy : (src_height - 2);
|
||
const int sy1 = sy0 + 1;
|
||
|
||
// Precompute destination row base once per source row.
|
||
const int dy_base = sy << 2; // sy * 4
|
||
|
||
for (int sx = 0; sx < src_width; ++sx)
|
||
{
|
||
// Clamp source cols to avoid reading past the right edge.
|
||
const int sx0 = (sx < src_width - 1) ? sx : (src_width - 2);
|
||
const int sx1 = sx0 + 1;
|
||
|
||
// Unpack the 2×2 neighborhood exactly once per 4×4 block.
|
||
int r00, g00, b00;
|
||
int r10, g10, b10;
|
||
int r01, g01, b01;
|
||
int r11, g11, b11;
|
||
|
||
unpack_rgb565_gamma(src, src_pitch, sx0, sy0, r00, g00, b00);
|
||
unpack_rgb565_gamma(src, src_pitch, sx1, sy0, r10, g10, b10);
|
||
unpack_rgb565_gamma(src, src_pitch, sx0, sy1, r01, g01, b01);
|
||
unpack_rgb565_gamma(src, src_pitch, sx1, sy1, r11, g11, b11);
|
||
|
||
// Emit the 4×4 destination block using separable bilinear in 8.8 fixed-point.
|
||
const int dx_base = sx << 2; // sx * 4
|
||
|
||
// For each of the 4 subcolumns (dx), do horizontal mixes top/bottom once,
|
||
// then vertical mix for each of the 4 subrows (dy).
|
||
int rtop[4], gtop[4], btop[4];
|
||
int rbot[4], gbot[4], bbot[4];
|
||
|
||
for (int dx = 0; dx < 4; ++dx)
|
||
{
|
||
const uint16_t wx = W[dx];
|
||
const uint16_t iwx = IW[dx];
|
||
|
||
// Top row horizontal blend
|
||
rtop[dx] = (r00 * iwx + r10 * wx + 128) >> 8;
|
||
gtop[dx] = (g00 * iwx + g10 * wx + 128) >> 8;
|
||
btop[dx] = (b00 * iwx + b10 * wx + 128) >> 8;
|
||
|
||
// Bottom row horizontal blend
|
||
rbot[dx] = (r01 * iwx + r11 * wx + 128) >> 8;
|
||
gbot[dx] = (g01 * iwx + g11 * wx + 128) >> 8;
|
||
bbot[dx] = (b01 * iwx + b11 * wx + 128) >> 8;
|
||
}
|
||
|
||
for (int dy = 0; dy < 4; ++dy)
|
||
{
|
||
const uint16_t wy = W[dy];
|
||
const uint16_t iwy = IW[dy];
|
||
|
||
// Destination row pointer for this subrow
|
||
const int y = dy_base + dy;
|
||
uint8_t* __restrict dst_row = dst + y * dst_pitch;
|
||
|
||
for (int dx = 0; dx < 4; ++dx)
|
||
{
|
||
const int x = dx_base + dx;
|
||
|
||
// Final vertical blend
|
||
int r = (rtop[dx] * iwy + rbot[dx] * wy + 128) >> 8;
|
||
int g = (gtop[dx] * iwy + gbot[dx] * wy + 128) >> 8;
|
||
int b = (btop[dx] * iwy + bbot[dx] * wy + 128) >> 8;
|
||
|
||
// Gamma decode back to display space and pack
|
||
const uint16_t out = build_rgb565_fast(
|
||
gamma_decode[CLAMP_U8(r, 0, 255)],
|
||
gamma_decode[CLAMP_U8(g, 0, 255)],
|
||
gamma_decode[CLAMP_U8(b, 0, 255)]
|
||
);
|
||
|
||
uint8_t* __restrict dst_px = dst_row + (x << 1); // x*2
|
||
dst_px[0] = uint8_t(out & 0xFF);
|
||
dst_px[1] = uint8_t((out >> 8) & 0xFF);
|
||
}
|
||
}
|
||
}
|
||
}
|
||
}
|
||
|
||
void sharpbilinear_4x(uint8_t* srcPtr, int srcPitch,
|
||
uint8_t* dstPtr, int dstPitch,
|
||
int width, int height)
|
||
{
|
||
ApplySharpBilinear4x(dstPtr, dstPitch, srcPtr, width, height, srcPitch);
|
||
}
|