diff --git a/src/core/gpu_hw.cpp b/src/core/gpu_hw.cpp
index faf31004e..cca5e1daf 100644
--- a/src/core/gpu_hw.cpp
+++ b/src/core/gpu_hw.cpp
@@ -433,7 +433,7 @@ void GPU_HW::CalcScissorRect(int* left, int* top, int* right, int* bottom)
   *bottom = std::max((m_drawing_area.bottom + 1) * m_resolution_scale, *top + 1);
 }
 
-Common::Rectangle<u32> GPU_HW::GetVRAMTransferBounds(u32 x, u32 y, u32 width, u32 height)
+Common::Rectangle<u32> GPU_HW::GetVRAMTransferBounds(u32 x, u32 y, u32 width, u32 height) const
 {
   Common::Rectangle<u32> out_rc = Common::Rectangle<u32>::FromExtents(x, y, width, height);
   if (out_rc.right > VRAM_WIDTH)
@@ -449,6 +449,15 @@ Common::Rectangle<u32> GPU_HW::GetVRAMTransferBounds(u32 x, u32 y, u32 width, u3
   return out_rc;
 }
 
+bool GPU_HW::UseVRAMCopyShader(u32 src_x, u32 src_y, u32 dst_x, u32 dst_y, u32 width, u32 height) const
+{
+  // masking enabled, oversized, or overlapping
+  return (m_GPUSTAT.IsMaskingEnabled() || (src_x + width) > VRAM_WIDTH || (src_y + height) > VRAM_HEIGHT ||
+          (dst_x + width) > VRAM_WIDTH || (dst_y + height) > VRAM_HEIGHT ||
+          Common::Rectangle<u32>::FromExtents(src_x, src_y, width, height)
+            .Intersects(Common::Rectangle<u32>::FromExtents(dst_x, dst_y, width, height)));
+}
+
 GPU_HW::BatchPrimitive GPU_HW::GetPrimitiveForCommand(RenderCommand rc)
 {
   if (rc.primitive == Primitive::Line)
diff --git a/src/core/gpu_hw.h b/src/core/gpu_hw.h
index 6cdc2cea1..b69639b0f 100644
--- a/src/core/gpu_hw.h
+++ b/src/core/gpu_hw.h
@@ -106,6 +106,19 @@ protected:
     u32 u_interlaced_displayed_field;
   };
 
+  /// Uniform data for the VRAM copy shader. Field order mirrors the uniform buffer declared in
+  /// GPU_HW_ShaderGen::GenerateVRAMCopyFragmentShader() (src x/y, dst x/y, width/height, mask flag).
+  struct VRAMCopyUBOData
+  {
+    u32 u_src_x;
+    u32 u_src_y;
+    u32 u_dst_x;
+    u32 u_dst_y;
+    u32 u_width;
+    u32 u_height;
+    u32 u_set_mask_bit;
+  };
+
   struct RendererStats
   {
     u32 num_batches;
@@ -157,7 +170,10 @@ protected:
   }
 
   /// Computes the area affected by a VRAM transfer, including wrap-around of X.
-  Common::Rectangle<u32> GetVRAMTransferBounds(u32 x, u32 y, u32 width, u32 height);
+  Common::Rectangle<u32> GetVRAMTransferBounds(u32 x, u32 y, u32 width, u32 height) const;
+
+  /// Returns true if the VRAM copy shader should be used (masking enabled, oversized, or overlapping copies).
+  bool UseVRAMCopyShader(u32 src_x, u32 src_y, u32 dst_x, u32 dst_y, u32 width, u32 height) const;
 
   /// Handles quads with flipped texture coordinate directions.
   static void HandleFlippedQuadTextureCoordinates(BatchVertex* vertices);
diff --git a/src/core/gpu_hw_d3d11.cpp b/src/core/gpu_hw_d3d11.cpp
index af8234ff4..9f09ec37f 100644
--- a/src/core/gpu_hw_d3d11.cpp
+++ b/src/core/gpu_hw_d3d11.cpp
@@ -404,6 +404,10 @@ bool GPU_HW_D3D11::CompileShaders()
   if (!m_vram_write_pixel_shader)
     return false;
 
+  m_vram_copy_pixel_shader = m_shader_cache.GetPixelShader(m_device.Get(), shadergen.GenerateVRAMCopyFragmentShader());
+  if (!m_vram_copy_pixel_shader)
+    return false;
+
   for (u8 depth_24bit = 0; depth_24bit < 2; depth_24bit++)
   {
     for (u8 interlacing = 0; interlacing < 2; interlacing++)
@@ -689,14 +693,30 @@ void GPU_HW_D3D11::UpdateVRAM(u32 x, u32 y, u32 width, u32 height, const void* d
 
 void GPU_HW_D3D11::CopyVRAM(u32 src_x, u32 src_y, u32 dst_x, u32 dst_y, u32 width, u32 height)
 {
-  if ((src_x + width) > VRAM_WIDTH || (src_y + height) > VRAM_HEIGHT || (dst_x + width) > VRAM_WIDTH ||
-      (dst_y + height) > VRAM_HEIGHT)
+  if (UseVRAMCopyShader(src_x, src_y, dst_x, dst_y, width, height))
   {
-    Log_WarningPrintf("Oversized VRAM copy (%u,%u, %u,%u, %u,%u), CPU round trip", src_x, src_y, dst_x, dst_y, width,
-                      height);
-    ReadVRAM(0, 0, VRAM_WIDTH, VRAM_HEIGHT);
-    GPU::CopyVRAM(src_x, src_y, dst_x, dst_y, width, height);
-    UpdateVRAM(0, 0, VRAM_WIDTH, VRAM_HEIGHT, m_vram_shadow.data());
+    const Common::Rectangle<u32> src_bounds = GetVRAMTransferBounds(src_x, src_y, width, height);
+    const Common::Rectangle<u32> dst_bounds = GetVRAMTransferBounds(dst_x, dst_y, width, height);
+    if (m_vram_dirty_rect.Intersects(src_bounds))
+      UpdateVRAMReadTexture();
+    IncludeVRAMDityRectangle(dst_bounds);
+
+    const VRAMCopyUBOData uniforms = {
+      src_x * m_resolution_scale,
+      src_y * m_resolution_scale,
+      dst_x * m_resolution_scale,
+      dst_y * m_resolution_scale,
+      width * m_resolution_scale,
+      height * m_resolution_scale,
+      m_GPUSTAT.set_mask_while_drawing ? 1u : 0u,
+    };
+
+    const Common::Rectangle<u32> dst_bounds_scaled(dst_bounds * m_resolution_scale);
+    SetViewportAndScissor(dst_bounds_scaled.left, dst_bounds_scaled.top, dst_bounds_scaled.GetWidth(),
+                          dst_bounds_scaled.GetHeight());
+    m_context->PSSetShaderResources(0, 1, m_vram_read_texture.GetD3DSRVArray());
+    DrawUtilityShader(m_vram_copy_pixel_shader.Get(), &uniforms, sizeof(uniforms));
+    RestoreGraphicsAPIState();
     return;
   }
 
diff --git a/src/core/gpu_hw_d3d11.h b/src/core/gpu_hw_d3d11.h
index 7e8c2ba54..dca10597a 100644
--- a/src/core/gpu_hw_d3d11.h
+++ b/src/core/gpu_hw_d3d11.h
@@ -113,5 +113,6 @@ private:
   ComPtr<ID3D11PixelShader> m_vram_interlaced_fill_pixel_shader;
   ComPtr<ID3D11PixelShader> m_vram_read_pixel_shader;
   ComPtr<ID3D11PixelShader> m_vram_write_pixel_shader;
+  ComPtr<ID3D11PixelShader> m_vram_copy_pixel_shader;
   std::array<std::array<ComPtr<ID3D11PixelShader>, 2>, 2> m_display_pixel_shaders; // [depth_24][interlaced]
 };
diff --git a/src/core/gpu_hw_opengl.cpp b/src/core/gpu_hw_opengl.cpp
index 32f08d560..7feadcbce 100644
--- a/src/core/gpu_hw_opengl.cpp
+++ b/src/core/gpu_hw_opengl.cpp
@@ -431,6 +431,19 @@ bool GPU_HW_OpenGL::CompilePrograms()
   prog->Uniform1i("samp0", 0);
   m_vram_read_program = std::move(*prog);
 
+  prog = m_shader_cache.GetProgram(shadergen.GenerateScreenQuadVertexShader(), {},
+                                   shadergen.GenerateVRAMCopyFragmentShader(), [this](GL::Program& prog) {
+                                     if (!m_is_gles)
+                                       prog.BindFragData(0, "o_col0");
+                                   });
+  if (!prog)
+    return false;
+
+  prog->BindUniformBlock("UBOBlock", 1);
+  prog->Bind();
+  prog->Uniform1i("samp0", 0);
+  m_vram_copy_program = std::move(*prog);
+
   if (m_supports_texture_buffer)
   {
     prog = m_shader_cache.GetProgram(shadergen.GenerateScreenQuadVertexShader(), {},
@@ -770,14 +783,39 @@ void GPU_HW_OpenGL::UpdateVRAM(u32 x, u32 y, u32 width, u32 height, const void*
 
 void GPU_HW_OpenGL::CopyVRAM(u32 src_x, u32 src_y, u32 dst_x, u32 dst_y, u32 width, u32 height)
 {
-  if ((src_x + width) > VRAM_WIDTH || (src_y + height) > VRAM_HEIGHT || (dst_x + width) > VRAM_WIDTH ||
-      (dst_y + height) > VRAM_HEIGHT)
+  if (UseVRAMCopyShader(src_x, src_y, dst_x, dst_y, width, height))
   {
-    Log_WarningPrintf("Oversized VRAM copy (%u,%u, %u,%u, %u,%u), CPU round trip", src_x, src_y, dst_x, dst_y, width,
-                      height);
-    ReadVRAM(0, 0, VRAM_WIDTH, VRAM_HEIGHT);
-    GPU::CopyVRAM(src_x, src_y, dst_x, dst_y, width, height);
-    UpdateVRAM(0, 0, VRAM_WIDTH, VRAM_HEIGHT, m_vram_shadow.data());
+    const Common::Rectangle<u32> src_bounds = GetVRAMTransferBounds(src_x, src_y, width, height);
+    const Common::Rectangle<u32> dst_bounds = GetVRAMTransferBounds(dst_x, dst_y, width, height);
+    if (m_vram_dirty_rect.Intersects(src_bounds))
+      UpdateVRAMReadTexture();
+    IncludeVRAMDityRectangle(dst_bounds);
+
+    VRAMCopyUBOData uniforms = {
+      src_x * m_resolution_scale,
+      src_y * m_resolution_scale,
+      dst_x * m_resolution_scale,
+      dst_y * m_resolution_scale,
+      width * m_resolution_scale,
+      height * m_resolution_scale,
+      m_GPUSTAT.set_mask_while_drawing ? 1u : 0u,
+    };
+    uniforms.u_src_y = m_vram_texture.GetHeight() - uniforms.u_src_y - uniforms.u_height;
+    uniforms.u_dst_y = m_vram_texture.GetHeight() - uniforms.u_dst_y - uniforms.u_height;
+    UploadUniformBuffer(&uniforms, sizeof(uniforms));
+
+    glDisable(GL_SCISSOR_TEST);
+    glDisable(GL_BLEND);
+
+    const Common::Rectangle<u32> dst_bounds_scaled(dst_bounds * m_resolution_scale);
+    glViewport(dst_bounds_scaled.left,
+               m_vram_texture.GetHeight() - dst_bounds_scaled.top - dst_bounds_scaled.GetHeight(),
+               dst_bounds_scaled.GetWidth(), dst_bounds_scaled.GetHeight());
+    m_vram_read_texture.Bind();
+    m_vram_copy_program.Bind();
+    glDrawArrays(GL_TRIANGLES, 0, 3);
+
+    RestoreGraphicsAPIState();
     return;
   }
 
diff --git a/src/core/gpu_hw_opengl.h b/src/core/gpu_hw_opengl.h
index bdbb5a650..cb50b1394 100644
--- a/src/core/gpu_hw_opengl.h
+++ b/src/core/gpu_hw_opengl.h
@@ -84,6 +84,7 @@ private:
   GL::Program m_vram_interlaced_fill_program;
   GL::Program m_vram_read_program;
   GL::Program m_vram_write_program;
+  GL::Program m_vram_copy_program;
 
   u32 m_uniform_buffer_alignment = 1;
   u32 m_max_texture_buffer_size = 0;
diff --git a/src/core/gpu_hw_shadergen.cpp b/src/core/gpu_hw_shadergen.cpp
index 0b3706c25..7addc05fc 100644
--- a/src/core/gpu_hw_shadergen.cpp
+++ b/src/core/gpu_hw_shadergen.cpp
@@ -967,3 +967,35 @@ std::string GPU_HW_ShaderGen::GenerateVRAMWriteFragmentShader()
 
   return ss.str();
 }
+
+/// Generates the fragment shader used for GPU-side VRAM-to-VRAM copies. Each fragment maps its destination
+/// position back to a source texel (wrapping at the VRAM edges) and optionally forces the mask bit.
+std::string GPU_HW_ShaderGen::GenerateVRAMCopyFragmentShader()
+{
+  std::stringstream ss;
+  WriteHeader(ss);
+  WriteCommonFunctions(ss);
+  DeclareUniformBuffer(ss, {"uint2 u_src_coords", "uint2 u_dst_coords", "uint2 u_size", "bool u_set_mask_bit"});
+
+  DeclareTexture(ss, "samp0", 0);
+  DeclareFragmentEntryPoint(ss, 0, 1, {}, true, false);
+  ss << R"(
+{
+  uint2 dst_coords = uint2(v_pos.xy);
+
+  // find offset from the start of the row/column. a position before the start means the copy wrapped around
+  // the edge of VRAM, so the offset continues from (VRAM_SIZE - start), matching the modulo applied below.
+  uint2 offset;
+  offset.x = (dst_coords.x < u_dst_coords.x) ? (uint(VRAM_SIZE.x) - u_dst_coords.x + dst_coords.x) : (dst_coords.x - u_dst_coords.x);
+  offset.y = (dst_coords.y < u_dst_coords.y) ? (uint(VRAM_SIZE.y) - u_dst_coords.y + dst_coords.y) : (dst_coords.y - u_dst_coords.y);
+
+  // find the source coordinates to copy from
+  uint2 src_coords = (u_src_coords + offset) % uint2(VRAM_SIZE);
+
+  // sample and apply mask bit
+  float4 color = LOAD_TEXTURE(samp0, int2(src_coords), 0);
+  o_col0 = float4(color.xyz, u_set_mask_bit ? 1.0 : color.a);
+})";
+
+  return ss.str();
+}
diff --git a/src/core/gpu_hw_shadergen.h b/src/core/gpu_hw_shadergen.h
index f9e49af76..8bc91d951 100644
--- a/src/core/gpu_hw_shadergen.h
+++ b/src/core/gpu_hw_shadergen.h
@@ -22,6 +22,7 @@ public:
   std::string GenerateDisplayFragmentShader(bool depth_24bit, bool interlaced);
   std::string GenerateVRAMReadFragmentShader();
   std::string GenerateVRAMWriteFragmentShader();
+  std::string GenerateVRAMCopyFragmentShader();
 
   HostDisplay::RenderAPI m_render_api;
   u32 m_resolution_scale;