diff --git a/src/core/gpu.h b/src/core/gpu.h index 390a506a2..bbf4f93c9 100644 --- a/src/core/gpu.h +++ b/src/core/gpu.h @@ -95,7 +95,8 @@ public: MAX_PRIMITIVE_HEIGHT = 512, DOT_TIMER_INDEX = 0, HBLANK_TIMER_INDEX = 1, - MAX_RESOLUTION_SCALE = 16 + MAX_RESOLUTION_SCALE = 16, + DITHER_MATRIX_SIZE = 4 }; enum : u16 @@ -107,10 +108,10 @@ public: }; // 4x4 dither matrix. - static constexpr s32 DITHER_MATRIX[4][4] = {{-4, +0, -3, +1}, // row 0 - {+2, -2, +3, -1}, // row 1 - {-3, +1, -4, +0}, // row 2 - {+4, -1, +2, -2}}; // row 3 + static constexpr s32 DITHER_MATRIX[DITHER_MATRIX_SIZE][DITHER_MATRIX_SIZE] = {{-4, +0, -3, +1}, // row 0 + {+2, -2, +3, -1}, // row 1 + {-3, +1, -4, +0}, // row 2 + {+4, -1, +2, -2}}; // row 3 // Base class constructor. GPU(); diff --git a/src/core/gpu_hw_shadergen.cpp b/src/core/gpu_hw_shadergen.cpp index 2a27289fb..1cc516c86 100644 --- a/src/core/gpu_hw_shadergen.cpp +++ b/src/core/gpu_hw_shadergen.cpp @@ -581,17 +581,20 @@ std::string GPU_HW_ShaderGen::GenerateBatchFragmentShader(GPU_HW::BatchRenderMod ss << "};\n"; ss << R"( -int3 ApplyDithering(uint2 coord, int3 icol) +uint3 ApplyDithering(uint2 coord, uint3 icol) { - uint2 fc = coord & uint2(3u, 3u); + #if DITHERING_SCALED + uint2 fc = coord & uint2(3u, 3u); + #else + uint2 fc = (coord / uint2(RESOLUTION_SCALE, RESOLUTION_SCALE)) & uint2(3u, 3u); + #endif int offset = s_dither_values[fc.y * 4u + fc.x]; - return icol + int3(offset, offset, offset); -} -int3 TruncateTo15Bit(int3 icol) -{ - icol = clamp(icol, int3(0, 0, 0), int3(255, 255, 255)); - return (icol & int3(~7, ~7, ~7)) | ((icol >> 3) & int3(7, 7, 7)); + #if !TRUE_COLOR + return uint3(clamp((int3(icol) + int3(offset, offset, offset)) >> 3, 0, 31)); + #else + return uint3(clamp(int3(icol) + int3(offset, offset, offset), 0, 255)); + #endif } #if TEXTURED @@ -654,10 +657,10 @@ float4 SampleFromVRAM(uint4 texpage, uint2 icoord) ss << R"( { - int3 vertcol = int3(v_col0.rgb * float3(255.0, 255.0, 255.0)); + uint3 vertcol = 
uint3(v_col0.rgb * float3(255.0, 255.0, 255.0)); bool semitransparent; - int3 icolor; + uint3 icolor; float ialpha; float oalpha; @@ -707,10 +710,27 @@ float4 SampleFromVRAM(uint4 texpage, uint2 icoord) ialpha = 1.0; #endif - #if RAW_TEXTURE - icolor = int3(texcol.rgb * float3(255.0, 255.0, 255.0)); + // If not using true color, truncate the framebuffer colors to 5-bit. + #if !TRUE_COLOR + icolor = uint3(texcol.rgb * float3(255.0, 255.0, 255.0)) >> 3; + #if !RAW_TEXTURE + icolor = (icolor * vertcol) >> 4; + #if DITHERING + icolor = ApplyDithering(uint2(v_pos.xy), icolor); + #else + icolor = min(icolor >> 3, uint3(31u, 31u, 31u)); + #endif + #endif #else - icolor = (vertcol * int3(texcol.rgb * float3(255.0, 255.0, 255.0))) >> 7; + icolor = uint3(texcol.rgb * float3(255.0, 255.0, 255.0)); + #if !RAW_TEXTURE + icolor = (icolor * vertcol) >> 7; + #if DITHERING + icolor = ApplyDithering(uint2(v_pos.xy), icolor); + #else + icolor = min(icolor, uint3(255u, 255u, 255u)); + #endif + #endif #endif // Compute output alpha (mask bit) @@ -721,17 +741,16 @@ float4 SampleFromVRAM(uint4 texpage, uint2 icoord) icolor = vertcol; ialpha = 1.0; - // However, the mask bit is cleared if set mask bit is false. - oalpha = float(u_set_mask_while_drawing); - #endif - - // Apply dithering - #if DITHERING - #if DITHERING_SCALED + #if DITHERING icolor = ApplyDithering(uint2(v_pos.xy), icolor); #else - icolor = ApplyDithering(uint2(v_pos.xy) / uint2(RESOLUTION_SCALE, RESOLUTION_SCALE), icolor); + #if !TRUE_COLOR + icolor >>= 3; + #endif #endif + + // However, the mask bit is cleared if set mask bit is false. + oalpha = float(u_set_mask_while_drawing); #endif // Premultiply alpha so we don't need to use a colour output for it. 
@@ -744,11 +763,10 @@ float4 SampleFromVRAM(uint4 texpage, uint2 icoord) #if !TRUE_COLOR // We want to apply the alpha before the truncation to 16-bit, otherwise we'll be passing a 32-bit precision color // into the blend unit, which can cause a small amount of error to accumulate. - icolor = int3(((float3(icolor) / float3(255.0, 255.0, 255.0)) * premultiply_alpha) * float3(255.0, 255.0, 255.0)); - color = (float3(icolor >> 3) / float3(31.0, 31.0, 31.0)); + color = floor(float3(icolor) * premultiply_alpha) / float3(31.0, 31.0, 31.0); #else // True color is actually simpler here since we want to preserve the precision. - color = (float3(icolor) / float3(255.0, 255.0, 255.0)) * premultiply_alpha; + color = (float3(icolor) * premultiply_alpha) / float3(255.0, 255.0, 255.0); #endif #if TRANSPARENCY diff --git a/src/core/gpu_sw.cpp b/src/core/gpu_sw.cpp index 65d4ed87e..762aeb1e3 100644 --- a/src/core/gpu_sw.cpp +++ b/src/core/gpu_sw.cpp @@ -551,6 +551,25 @@ void GPU_SW::DrawRectangle(s32 origin_x, s32 origin_y, u32 width, u32 height, u8 } } +constexpr GPU_SW::DitherLUT GPU_SW::ComputeDitherLUT() +{ + DitherLUT lut = {}; + for (u32 i = 0; i < DITHER_MATRIX_SIZE; i++) + { + for (u32 j = 0; j < DITHER_MATRIX_SIZE; j++) + { + for (s32 value = 0; value < DITHER_LUT_SIZE; value++) + { + const s32 dithered_value = (value + DITHER_MATRIX[i][j]) >> 3; + lut[i][j][value] = static_cast((dithered_value < 0) ? 0 : ((dithered_value > 31) ? 
31 : dithered_value)); + } + } + } + return lut; +} + +static constexpr GPU_SW::DitherLUT s_dither_lut = GPU_SW::ComputeDitherLUT(); + template void GPU_SW::ShadePixel(u32 x, u32 y, u8 color_r, u8 color_g, u8 color_b, u8 texcoord_x, u8 texcoord_y) { @@ -612,23 +631,25 @@ void GPU_SW::ShadePixel(u32 x, u32 y, u8 color_r, u8 color_g, u8 color_b, u8 tex } else { - const u8 r = Truncate8(std::min((ZeroExtend16(texture_color.GetR8()) * ZeroExtend16(color_r)) >> 7, 0xFF)); - const u8 g = Truncate8(std::min((ZeroExtend16(texture_color.GetG8()) * ZeroExtend16(color_g)) >> 7, 0xFF)); - const u8 b = Truncate8(std::min((ZeroExtend16(texture_color.GetB8()) * ZeroExtend16(color_b)) >> 7, 0xFF)); - if constexpr (dithering_enable) - color.SetRGB24Dithered(x, y, r, g, b, texture_color.c); - else - color.SetRGB24(r, g, b, texture_color.c); + const u32 dither_y = (dithering_enable) ? (y & 3u) : 2u; + const u32 dither_x = (dithering_enable) ? (x & 3u) : 3u; + + color.bits = (ZeroExtend16(s_dither_lut[dither_y][dither_x][(u16(texture_color.r) * u16(color_r)) >> 4]) << 0) | + (ZeroExtend16(s_dither_lut[dither_y][dither_x][(u16(texture_color.g) * u16(color_g)) >> 4]) << 5) | + (ZeroExtend16(s_dither_lut[dither_y][dither_x][(u16(texture_color.b) * u16(color_b)) >> 4]) << 10) | + (texture_color.bits & 0x8000u); } } else { transparent = true; - if constexpr (dithering_enable) - color.SetRGB24Dithered(x, y, color_r, color_g, color_b); - else - color.SetRGB24(color_r, color_g, color_b); + const u32 dither_y = (dithering_enable) ? (y & 3u) : 2u; + const u32 dither_x = (dithering_enable) ? 
(x & 3u) : 3u; + + color.bits = (ZeroExtend16(s_dither_lut[dither_y][dither_x][color_r]) << 0) | + (ZeroExtend16(s_dither_lut[dither_y][dither_x][color_g]) << 5) | + (ZeroExtend16(s_dither_lut[dither_y][dither_x][color_b]) << 10); } const VRAMPixel bg_color{GetPixel(static_cast(x), static_cast(y))}; diff --git a/src/core/gpu_sw.h b/src/core/gpu_sw.h index 13562a7e3..6db9e792b 100644 --- a/src/core/gpu_sw.h +++ b/src/core/gpu_sw.h @@ -23,6 +23,11 @@ public: u16* GetPixelPtr(u32 x, u32 y) { return &m_vram[VRAM_WIDTH * y + x]; } void SetPixel(u32 x, u32 y, u16 value) { m_vram[VRAM_WIDTH * y + x] = value; } + // this is actually ((31 * 255) >> 4) == 494, but to simplify addressing we use the next power of two (512) + static constexpr u32 DITHER_LUT_SIZE = 512; + using DitherLUT = std::array, DITHER_MATRIX_SIZE>, DITHER_MATRIX_SIZE>; + static constexpr DitherLUT ComputeDitherLUT(); + protected: struct SWVertex {