From 74ec7a37da96c3e7d7a301e66f6b33b8b0584ad7 Mon Sep 17 00:00:00 2001 From: Stenzek Date: Sat, 19 Oct 2024 19:11:56 +1000 Subject: [PATCH] GPU/HW: Make copy/write shaders resolution independent --- src/core/gpu_hw.cpp | 85 ++++++++++++++++++++++------------- src/core/gpu_hw_shadergen.cpp | 44 +++++++++--------- src/core/gpu_hw_shadergen.h | 5 +-- 3 files changed, 75 insertions(+), 59 deletions(-) diff --git a/src/core/gpu_hw.cpp b/src/core/gpu_hw.cpp index d7ccec1a8..19c5d7abb 100644 --- a/src/core/gpu_hw.cpp +++ b/src/core/gpu_hw.cpp @@ -1460,9 +1460,9 @@ bool GPU_HW::CompilePipelines(Error* error) // VRAM copy { - std::unique_ptr fs = g_gpu_device->CreateShader( - GPUShaderStage::Fragment, shadergen.GetLanguage(), - shadergen.GenerateVRAMCopyFragmentShader(m_resolution_scale, m_write_mask_as_depth), error); + std::unique_ptr fs = + g_gpu_device->CreateShader(GPUShaderStage::Fragment, shadergen.GetLanguage(), + shadergen.GenerateVRAMCopyFragmentShader(m_write_mask_as_depth), error); if (!fs) return false; @@ -1491,8 +1491,7 @@ bool GPU_HW::CompilePipelines(Error* error) const bool use_ssbo = features.texture_buffers_emulated_with_ssbo; std::unique_ptr fs = g_gpu_device->CreateShader( GPUShaderStage::Fragment, shadergen.GetLanguage(), - shadergen.GenerateVRAMWriteFragmentShader(m_resolution_scale, use_buffer, use_ssbo, m_write_mask_as_depth), - error); + shadergen.GenerateVRAMWriteFragmentShader(use_buffer, use_ssbo, m_write_mask_as_depth), error); if (!fs) return false; @@ -3376,19 +3375,31 @@ void GPU_HW::UpdateVRAMOnGPU(u32 x, u32 y, u32 width, u32 height, const void* da struct VRAMWriteUBOData { - u32 u_dst_x; - u32 u_dst_y; - u32 u_end_x; - u32 u_end_y; - u32 u_width; - u32 u_height; + float u_dst_x; + float u_dst_y; + float u_end_x; + float u_end_y; + float u_width; + float u_height; + float u_vram_width; + float u_vram_height; + float u_resolution_scale; u32 u_buffer_base_offset; u32 u_mask_or_bits; float u_depth_value; }; - const VRAMWriteUBOData uniforms = { - (x % VRAM_WIDTH), (y % VRAM_HEIGHT), ((x + width) % VRAM_WIDTH), ((y + height) % VRAM_HEIGHT), width, - height, map_index, (set_mask) ? 0x8000u : 0x00, GetCurrentNormalizedVertexDepth()}; + const VRAMWriteUBOData uniforms = {static_cast(x % VRAM_WIDTH), + static_cast(y % VRAM_HEIGHT), + static_cast((x + width) % VRAM_WIDTH), + static_cast((y + height) % VRAM_HEIGHT), + static_cast(width), + static_cast(height), + static_cast(m_vram_texture->GetWidth()), + static_cast(m_vram_texture->GetHeight()), + static_cast(m_resolution_scale), + map_index, + (set_mask) ? 0x8000u : 0x00, + GetCurrentNormalizedVertexDepth()}; // the viewport should already be set to the full vram, so just adjust the scissor const GSVector4i scaled_bounds = bounds.mul32l(GSVector4i(m_resolution_scale)); @@ -3458,25 +3469,27 @@ void GPU_HW::CopyVRAM(u32 src_x, u32 src_y, u32 dst_x, u32 dst_y, u32 width, u32 struct VRAMCopyUBOData { - u32 u_src_x; - u32 u_src_y; - u32 u_dst_x; - u32 u_dst_y; - u32 u_end_x; - u32 u_end_y; - u32 u_width; - u32 u_height; + float u_src_x; + float u_src_y; + float u_dst_x; + float u_dst_y; + float u_end_x; + float u_end_y; + float u_vram_width; + float u_vram_height; + float u_resolution_scale; u32 u_set_mask_bit; float u_depth_value; }; - const VRAMCopyUBOData uniforms = {(src_x % VRAM_WIDTH) * m_resolution_scale, - (src_y % VRAM_HEIGHT) * m_resolution_scale, - (dst_x % VRAM_WIDTH) * m_resolution_scale, - (dst_y % VRAM_HEIGHT) * m_resolution_scale, - ((dst_x + width) % VRAM_WIDTH) * m_resolution_scale, - ((dst_y + height) % VRAM_HEIGHT) * m_resolution_scale, - width * m_resolution_scale, - height * m_resolution_scale, + const VRAMCopyUBOData uniforms = {static_cast((src_x % VRAM_WIDTH) * m_resolution_scale), + static_cast((src_y % VRAM_HEIGHT) * m_resolution_scale), + static_cast((dst_x % VRAM_WIDTH) * m_resolution_scale), + static_cast((dst_y % VRAM_HEIGHT) * m_resolution_scale), + static_cast(((dst_x + width) % VRAM_WIDTH) * m_resolution_scale), + static_cast(((dst_y + height) % VRAM_HEIGHT) * m_resolution_scale), + static_cast(m_vram_texture->GetWidth()), + static_cast(m_vram_texture->GetHeight()), + static_cast(m_resolution_scale), m_GPUSTAT.set_mask_while_drawing ? 1u : 0u, GetCurrentNormalizedVertexDepth()}; @@ -3923,8 +3936,16 @@ void GPU_HW::UpdateDisplay() reinterpret_start_x + scaled_display_width, scaled_vram_offset_y + read_height, scaled_display_width, read_height); - const u32 uniforms[4] = {reinterpret_start_x, scaled_vram_offset_y, skip_x, line_skip}; - g_gpu_device->PushUniformBuffer(uniforms, sizeof(uniforms)); + struct ExtractUniforms + { + u32 vram_offset_x; + u32 vram_offset_y; + float skip_x; + float line_skip; + }; + const ExtractUniforms uniforms = {reinterpret_start_x, scaled_vram_offset_y, static_cast(skip_x), + static_cast(line_skip)}; + g_gpu_device->PushUniformBuffer(&uniforms, sizeof(uniforms)); g_gpu_device->SetViewportAndScissor(0, 0, scaled_display_width, read_height); g_gpu_device->Draw(3, 0); diff --git a/src/core/gpu_hw_shadergen.cpp b/src/core/gpu_hw_shadergen.cpp index c3409df60..443d0c36c 100644 --- a/src/core/gpu_hw_shadergen.cpp +++ b/src/core/gpu_hw_shadergen.cpp @@ -1194,7 +1194,7 @@ std::string GPU_HW_ShaderGen::GenerateVRAMExtractFragmentShader(u32 resolution_s ss << "CONSTANT uint2 VRAM_SIZE = uint2(" << VRAM_WIDTH << ", " << VRAM_HEIGHT << ") * RESOLUTION_SCALE;\n"; ss << "CONSTANT uint MULTISAMPLES = " << multisamples << "u;\n"; - DeclareUniformBuffer(ss, {"uint2 u_vram_offset", "uint u_skip_x", "uint u_line_skip"}, true); + DeclareUniformBuffer(ss, {"uint2 u_vram_offset", "float u_skip_x", "float u_line_skip"}, true); DeclareTexture(ss, "samp0", 0, msaa); if (depth_buffer) DeclareTexture(ss, "samp1", 1, msaa); @@ -1251,7 +1251,7 @@ float3 SampleVRAM24(uint2 icoords) DeclareFragmentEntryPoint(ss, 0, 1, {}, true, depth_buffer ? 2 : 1); ss << R"( { - uint2 icoords = uint2(uint(v_pos.x) + u_skip_x, uint(v_pos.y) << u_line_skip); + uint2 icoords = uint2(v_pos.x + u_skip_x, v_pos.y * u_line_skip); int2 wrapped_coords = int2((icoords + u_vram_offset) % VRAM_SIZE); #if COLOR_24BIT @@ -1422,7 +1422,7 @@ uint SampleVRAM(uint2 coords) return ss.str(); } -std::string GPU_HW_ShaderGen::GenerateVRAMWriteFragmentShader(u32 resolution_scale, bool use_buffer, bool use_ssbo, +std::string GPU_HW_ShaderGen::GenerateVRAMWriteFragmentShader(bool use_buffer, bool use_ssbo, bool write_mask_as_depth) const { std::stringstream ss; @@ -1432,12 +1432,10 @@ std::string GPU_HW_ShaderGen::GenerateVRAMWriteFragmentShader(u32 resolution_sca DefineMacro(ss, "WRITE_MASK_AS_DEPTH", write_mask_as_depth); DefineMacro(ss, "USE_BUFFER", use_buffer); - ss << "CONSTANT uint RESOLUTION_SCALE = " << resolution_scale << "u;\n"; - ss << "CONSTANT uint2 VRAM_SIZE = uint2(" << VRAM_WIDTH << ", " << VRAM_HEIGHT << ");\n"; - DeclareUniformBuffer(ss, - {"uint2 u_base_coords", "uint2 u_end_coords", "uint2 u_size", "uint u_buffer_base_offset", - "uint u_mask_or_bits", "float u_depth_value"}, + {"float2 u_base_coords", "float2 u_end_coords", "float2 u_size", "float2 u_vram_size", + "float u_resolution_scale", "uint u_buffer_base_offset", "uint u_mask_or_bits", + "float u_depth_value"}, true); if (!use_buffer) @@ -1469,7 +1467,7 @@ std::string GPU_HW_ShaderGen::GenerateVRAMWriteFragmentShader(u32 resolution_sca DeclareFragmentEntryPoint(ss, 0, 1, {}, true, 1, false, write_mask_as_depth); ss << R"( { - uint2 coords = uint2(v_pos.xy) / uint2(RESOLUTION_SCALE, RESOLUTION_SCALE); + float2 coords = floor(v_pos.xy / u_resolution_scale); // make sure it's not oversized and out of range if ((coords.x < u_base_coords.x && coords.x >= u_end_coords.x) || @@ -1479,14 +1477,14 @@ std::string GPU_HW_ShaderGen::GenerateVRAMWriteFragmentShader(u32 resolution_sca } // find offset from the start of the row/column - uint2 offset; - offset.x = (coords.x < u_base_coords.x) ? (VRAM_SIZE.x - u_base_coords.x + coords.x) : (coords.x - u_base_coords.x); - offset.y = (coords.y < u_base_coords.y) ? (VRAM_SIZE.y - u_base_coords.y + coords.y) : (coords.y - u_base_coords.y); + float2 offset; + offset.x = (coords.x < u_base_coords.x) ? (u_vram_size.x - u_base_coords.x + coords.x) : (coords.x - u_base_coords.x); + offset.y = (coords.y < u_base_coords.y) ? (u_vram_size.y - u_base_coords.y + coords.y) : (coords.y - u_base_coords.y); #if !USE_BUFFER uint value = LOAD_TEXTURE(samp0, int2(offset), 0).x; #else - uint buffer_offset = u_buffer_base_offset + (offset.y * u_size.x) + offset.x; + uint buffer_offset = u_buffer_base_offset + uint((offset.y * u_size.x) + offset.x); uint value = GET_VALUE(buffer_offset) | u_mask_or_bits; #endif @@ -1499,7 +1497,7 @@ std::string GPU_HW_ShaderGen::GenerateVRAMWriteFragmentShader(u32 resolution_sca return ss.str(); } -std::string GPU_HW_ShaderGen::GenerateVRAMCopyFragmentShader(u32 resolution_scale, bool write_mask_as_depth) const +std::string GPU_HW_ShaderGen::GenerateVRAMCopyFragmentShader(bool write_mask_as_depth) const { // TODO: This won't currently work because we can't bind the texture to both the shader and framebuffer. const bool msaa = false; @@ -1509,19 +1507,16 @@ std::string GPU_HW_ShaderGen::GenerateVRAMCopyFragmentShader(u32 resolution_scal DefineMacro(ss, "WRITE_MASK_AS_DEPTH", write_mask_as_depth); DefineMacro(ss, "MSAA_COPY", msaa); - ss << "CONSTANT uint RESOLUTION_SCALE = " << resolution_scale << "u;\n"; - ss << "CONSTANT uint2 VRAM_SIZE = uint2(" << VRAM_WIDTH << ", " << VRAM_HEIGHT << ") * RESOLUTION_SCALE;\n"; - DeclareUniformBuffer(ss, - {"uint2 u_src_coords", "uint2 u_dst_coords", "uint2 u_end_coords", "uint2 u_size", - "bool u_set_mask_bit", "float u_depth_value"}, + {"float2 u_src_coords", "float2 u_dst_coords", "float2 u_end_coords", "float2 u_vram_size", + "float u_resolution_scale", "bool u_set_mask_bit", "float u_depth_value"}, true); DeclareTexture(ss, "samp0", 0, msaa); DeclareFragmentEntryPoint(ss, 0, 1, {}, true, 1, false, write_mask_as_depth, false, false, msaa); ss << R"( { - uint2 dst_coords = uint2(v_pos.xy); + float2 dst_coords = floor(v_pos.xy); // make sure it's not oversized and out of range if ((dst_coords.x < u_dst_coords.x && dst_coords.x >= u_end_coords.x) || @@ -1531,12 +1526,13 @@ std::string GPU_HW_ShaderGen::GenerateVRAMCopyFragmentShader(u32 resolution_scal } // find offset from the start of the row/column - uint2 offset; - offset.x = (dst_coords.x < u_dst_coords.x) ? (VRAM_SIZE.x - u_dst_coords.x + dst_coords.x) : (dst_coords.x - u_dst_coords.x); - offset.y = (dst_coords.y < u_dst_coords.y) ? (VRAM_SIZE.y - u_dst_coords.y + dst_coords.y) : (dst_coords.y - u_dst_coords.y); + float2 offset; + offset.x = (dst_coords.x < u_dst_coords.x) ? (u_vram_size.x - u_dst_coords.x + dst_coords.x) : (dst_coords.x - u_dst_coords.x); + offset.y = (dst_coords.y < u_dst_coords.y) ? (u_vram_size.y - u_dst_coords.y + dst_coords.y) : (dst_coords.y - u_dst_coords.y); // find the source coordinates to copy from - uint2 src_coords = (u_src_coords + offset) % VRAM_SIZE; + float2 offset_coords = u_src_coords + offset; + float2 src_coords = offset_coords - (floor(offset_coords / u_vram_size) * u_vram_size); // sample and apply mask bit #if MSAA_COPY diff --git a/src/core/gpu_hw_shadergen.h b/src/core/gpu_hw_shadergen.h index 9c8f89021..b1d546172 100644 --- a/src/core/gpu_hw_shadergen.h +++ b/src/core/gpu_hw_shadergen.h @@ -26,9 +26,8 @@ public: std::string GenerateWireframeGeometryShader() const; std::string GenerateWireframeFragmentShader() const; std::string GenerateVRAMReadFragmentShader(u32 resolution_scale, u32 multisamples) const; - std::string GenerateVRAMWriteFragmentShader(u32 resolution_scale, bool use_buffer, bool use_ssbo, - bool write_mask_as_depth) const; - std::string GenerateVRAMCopyFragmentShader(u32 resolution_scale, bool write_mask_as_depth) const; + std::string GenerateVRAMWriteFragmentShader(bool use_buffer, bool use_ssbo, bool write_mask_as_depth) const; + std::string GenerateVRAMCopyFragmentShader(bool write_mask_as_depth) const; std::string GenerateVRAMFillFragmentShader(bool wrapped, bool interlaced, bool write_mask_as_depth) const; std::string GenerateVRAMUpdateDepthFragmentShader(u32 multisamples) const; std::string GenerateVRAMExtractFragmentShader(u32 resolution_scale, u32 multisamples, bool color_24bit,