GPU/HW: Implement oversized VRAM fills in hardware

Fixes downscaling in Bugs and Taz PAL.
2021-07-21 19:22:04 +10:00 · 2021-07-21 19:22:04 +10:00 · 70209db402
parent 7ea78ad2df
commit 70209db402
12 changed files with 146 additions and 140 deletions
--- a/src/core/gpu_hw.cpp
+++ b/src/core/gpu_hw.cpp
@ -976,6 +976,10 @@ GPU_HW::VRAMFillUBOData GPU_HW::GetVRAMFillUBOData(u32 x, u32 y, u32 width, u32
    color = VRAMRGBA5551ToRGBA8888(VRAMRGBA8888ToRGBA5551(color));

  VRAMFillUBOData uniforms;
+  uniforms.u_dst_x = (x % VRAM_WIDTH) * m_resolution_scale;
+  uniforms.u_dst_y = (y % VRAM_HEIGHT) * m_resolution_scale;
+  uniforms.u_end_x = ((x + width) % VRAM_WIDTH) * m_resolution_scale;
+  uniforms.u_end_y = ((y + height) % VRAM_HEIGHT) * m_resolution_scale;
  std::tie(uniforms.u_fill_color[0], uniforms.u_fill_color[1], uniforms.u_fill_color[2], uniforms.u_fill_color[3]) =
    RGBA8ToFloat(color);

--- a/src/core/gpu_hw.h
+++ b/src/core/gpu_hw.h
@ -125,6 +125,10 @@ protected:

  struct VRAMFillUBOData
  {
+    u32 u_dst_x; // fill start X: (x % VRAM_WIDTH) * resolution scale
+    u32 u_dst_y; // fill start Y: (y % VRAM_HEIGHT) * resolution scale
+    u32 u_end_x; // exclusive fill end X: ((x + width) % VRAM_WIDTH) * resolution scale
+    u32 u_end_y; // exclusive fill end Y: ((y + height) % VRAM_HEIGHT) * resolution scale
    float u_fill_color[4];
    u32 u_interlaced_displayed_field;
  };
@ -268,13 +272,19 @@ protected:

  /// We need two-pass rendering when using BG-FG blending and texturing, as the transparency can be enabled
  /// on a per-pixel basis, and the opaque pixels shouldn't be blended at all.
-  bool NeedsTwoPassRendering() const
+  ALWAYS_INLINE bool NeedsTwoPassRendering() const
  {
    return (m_batch.texture_mode != GPUTextureMode::Disabled &&
            (m_batch.transparency_mode == GPUTransparencyMode::BackgroundMinusForeground ||
             (!m_supports_dual_source_blend && m_batch.transparency_mode != GPUTransparencyMode::Disabled)));
  }

+  /// Returns true if the specified VRAM fill is oversized, i.e. it extends past the right or bottom edge of VRAM.
+  ALWAYS_INLINE static bool IsVRAMFillOversized(u32 x, u32 y, u32 width, u32 height)
+  {
+    return ((x + width) > VRAM_WIDTH || (y + height) > VRAM_HEIGHT);
+  }
+
  ALWAYS_INLINE bool IsUsingSoftwareRendererForReadbacks() { return static_cast<bool>(m_sw_renderer); }

  void FillBackendCommandParameters(GPUBackendCommand* cmd) const;
--- a/src/core/gpu_hw_d3d11.cpp
+++ b/src/core/gpu_hw_d3d11.cpp
@ -508,7 +508,8 @@ bool GPU_HW_D3D11::CompileShaders()
                             m_true_color, m_scaled_dithering, m_texture_filtering, m_using_uv_limits,
                             m_pgxp_depth_buffer, m_supports_dual_source_blend);

-  ShaderCompileProgressTracker progress("Compiling Shaders", 1 + 1 + 2 + (4 * 9 * 2 * 2) + 7 + (2 * 3) + 1);
+  ShaderCompileProgressTracker progress("Compiling Shaders",
+                                        1 + 1 + 2 + (4 * 9 * 2 * 2) + 1 + (2 * 2) + 4 + (2 * 3) + 1);

  // input layout
  {
@ -585,18 +586,19 @@ bool GPU_HW_D3D11::CompileShaders()

  progress.Increment();

-  m_vram_fill_pixel_shader = shader_cache.GetPixelShader(m_device.Get(), shadergen.GenerateFillFragmentShader());
-  if (!m_vram_fill_pixel_shader)
-    return false;
-
-  progress.Increment();
-
-  m_vram_interlaced_fill_pixel_shader =
-    shader_cache.GetPixelShader(m_device.Get(), shadergen.GenerateInterlacedFillFragmentShader());
-  if (!m_vram_interlaced_fill_pixel_shader)
+  for (u8 wrapped = 0; wrapped < 2; wrapped++)
+  {
+    for (u8 interlaced = 0; interlaced < 2; interlaced++)
+    {
+      const std::string ps =
+        shadergen.GenerateVRAMFillFragmentShader(ConvertToBoolUnchecked(wrapped), ConvertToBoolUnchecked(interlaced));
+      m_vram_fill_pixel_shaders[wrapped][interlaced] = shader_cache.GetPixelShader(m_device.Get(), ps);
+      if (!m_vram_fill_pixel_shaders[wrapped][interlaced])
        return false;

      progress.Increment();
+    }
+  }

  m_vram_read_pixel_shader = shader_cache.GetPixelShader(m_device.Get(), shadergen.GenerateVRAMReadFragmentShader());
  if (!m_vram_read_pixel_shader)
@ -682,8 +684,7 @@ void GPU_HW_D3D11::DestroyShaders()
  m_vram_copy_pixel_shader.Reset();
  m_vram_write_pixel_shader.Reset();
  m_vram_read_pixel_shader.Reset();
-  m_vram_interlaced_fill_pixel_shader.Reset();
-  m_vram_fill_pixel_shader.Reset();
+  m_vram_fill_pixel_shaders = {};
  m_copy_pixel_shader.Reset();
  m_uv_quad_vertex_shader.Reset();
  m_screen_quad_vertex_shader.Reset();
@ -976,26 +977,18 @@ void GPU_HW_D3D11::FillVRAM(u32 x, u32 y, u32 width, u32 height, u32 color)
  if (IsUsingSoftwareRendererForReadbacks())
    FillSoftwareRendererVRAM(x, y, width, height, color);

-  if ((x + width) > VRAM_WIDTH || (y + height) > VRAM_HEIGHT)
-  {
-    // CPU round trip if oversized for now.
-    Log_WarningPrintf("Oversized VRAM fill (%u-%u, %u-%u), CPU round trip", x, x + width, y, y + height);
-    ReadVRAM(0, 0, VRAM_WIDTH, VRAM_HEIGHT);
-    GPU::FillVRAM(x, y, width, height, color);
-    UpdateVRAM(0, 0, VRAM_WIDTH, VRAM_HEIGHT, m_vram_ptr, false, false);
-    return;
-  }
-
  GPU_HW::FillVRAM(x, y, width, height, color);

-  const VRAMFillUBOData uniforms = GetVRAMFillUBOData(x, y, width, height, color);
-
  m_context->OMSetDepthStencilState(m_depth_test_always_state.Get(), 0);

-  SetViewportAndScissor(x * m_resolution_scale, y * m_resolution_scale, width * m_resolution_scale,
-                        height * m_resolution_scale);
-  DrawUtilityShader(IsInterlacedRenderingEnabled() ? m_vram_interlaced_fill_pixel_shader.Get() :
-                                                     m_vram_fill_pixel_shader.Get(),
+  const Common::Rectangle<u32> bounds(GetVRAMTransferBounds(x, y, width, height));
+  SetViewportAndScissor(bounds.left * m_resolution_scale, bounds.top * m_resolution_scale,
+                        bounds.GetWidth() * m_resolution_scale, bounds.GetHeight() * m_resolution_scale);
+
+  const VRAMFillUBOData uniforms = GetVRAMFillUBOData(x, y, width, height, color);
+  DrawUtilityShader(m_vram_fill_pixel_shaders[BoolToUInt8(IsVRAMFillOversized(x, y, width, height))]
+                                             [BoolToUInt8(IsInterlacedRenderingEnabled())]
+                                               .Get(),
                    &uniforms, sizeof(uniforms));

  RestoreGraphicsAPIState();
--- a/src/core/gpu_hw_d3d11.h
+++ b/src/core/gpu_hw_d3d11.h
@ -123,8 +123,7 @@ private:
  ComPtr<ID3D11VertexShader> m_screen_quad_vertex_shader;
  ComPtr<ID3D11VertexShader> m_uv_quad_vertex_shader;
  ComPtr<ID3D11PixelShader> m_copy_pixel_shader;
-  ComPtr<ID3D11PixelShader> m_vram_fill_pixel_shader;
-  ComPtr<ID3D11PixelShader> m_vram_interlaced_fill_pixel_shader;
+  std::array<std::array<ComPtr<ID3D11PixelShader>, 2>, 2> m_vram_fill_pixel_shaders;  // [wrapped][interlaced]
  ComPtr<ID3D11PixelShader> m_vram_read_pixel_shader;
  ComPtr<ID3D11PixelShader> m_vram_write_pixel_shader;
  ComPtr<ID3D11PixelShader> m_vram_copy_pixel_shader;
--- a/src/core/gpu_hw_d3d12.cpp
+++ b/src/core/gpu_hw_d3d12.cpp
@ -420,8 +420,8 @@ bool GPU_HW_D3D12::CompilePipelines()
                             m_true_color, m_scaled_dithering, m_texture_filtering, m_using_uv_limits,
                             m_pgxp_depth_buffer, m_supports_dual_source_blend);

-  ShaderCompileProgressTracker progress("Compiling Pipelines", 2 + (4 * 9 * 2 * 2) + (2 * 4 * 5 * 9 * 2 * 2) + 1 + 2 +
-                                                                 2 + 2 + 1 + 1 + (2 * 3) + 1);
+  ShaderCompileProgressTracker progress("Compiling Pipelines", 2 + (4 * 9 * 2 * 2) + (2 * 4 * 5 * 9 * 2 * 2) + 1 +
+                                                                 (2 * 2) + 2 + 2 + 1 + 1 + (2 * 3) + 1);

  // vertex shaders - [textured]
  // fragment shaders - [render_mode][texture_mode][dithering][interlacing]
@ -561,23 +561,24 @@ bool GPU_HW_D3D12::CompilePipelines()
  gpbuilder.SetDepthStencilFormat(m_vram_depth_texture.GetFormat());

  // VRAM fill
+  for (u8 wrapped = 0; wrapped < 2; wrapped++)
  {
    for (u8 interlaced = 0; interlaced < 2; interlaced++)
    {
      ComPtr<ID3DBlob> fs = shader_cache.GetPixelShader(
-        (interlaced == 0) ? shadergen.GenerateFillFragmentShader() : shadergen.GenerateInterlacedFillFragmentShader());
+        shadergen.GenerateVRAMFillFragmentShader(ConvertToBoolUnchecked(wrapped), ConvertToBoolUnchecked(interlaced)));
      if (!fs)
        return false;

      gpbuilder.SetPixelShader(fs.Get());
      gpbuilder.SetDepthState(true, true, D3D12_COMPARISON_FUNC_ALWAYS);

-      m_vram_fill_pipelines[interlaced] = gpbuilder.Create(g_d3d12_context->GetDevice(), shader_cache, false);
-      if (!m_vram_fill_pipelines[interlaced])
+      m_vram_fill_pipelines[wrapped][interlaced] = gpbuilder.Create(g_d3d12_context->GetDevice(), shader_cache, false);
+      if (!m_vram_fill_pipelines[wrapped][interlaced])
        return false;

-      D3D12::SetObjectNameFormatted(m_vram_fill_pipelines[interlaced].Get(), "VRAM Fill Pipeline Interlacing=%u",
-                                    interlaced);
+      D3D12::SetObjectNameFormatted(m_vram_fill_pipelines[wrapped][interlaced].Get(),
+                                    "VRAM Fill Pipeline Wrapped=%u,Interlacing=%u", wrapped, interlaced);

      progress.Increment();
    }
@ -994,31 +995,22 @@ void GPU_HW_D3D12::FillVRAM(u32 x, u32 y, u32 width, u32 height, u32 color)
    FillSoftwareRendererVRAM(x, y, width, height, color);

  // TODO: Use fast clear
-  if ((x + width) > VRAM_WIDTH || (y + height) > VRAM_HEIGHT)
-  {
-    // CPU round trip if oversized for now.
-    Log_WarningPrintf("Oversized VRAM fill (%u-%u, %u-%u), CPU round trip", x, x + width, y, y + height);
-    ReadVRAM(0, 0, VRAM_WIDTH, VRAM_HEIGHT);
-    GPU::FillVRAM(x, y, width, height, color);
-    UpdateVRAM(0, 0, VRAM_WIDTH, VRAM_HEIGHT, m_vram_shadow.data(), false, false);
-    return;
-  }
-
  GPU_HW::FillVRAM(x, y, width, height, color);

-  x *= m_resolution_scale;
-  y *= m_resolution_scale;
-  width *= m_resolution_scale;
-  height *= m_resolution_scale;
-
  const VRAMFillUBOData uniforms = GetVRAMFillUBOData(x, y, width, height, color);

  ID3D12GraphicsCommandList* cmdlist = g_d3d12_context->GetCommandList();
  cmdlist->SetGraphicsRootSignature(m_single_sampler_root_signature.Get());
  cmdlist->SetGraphicsRoot32BitConstants(0, sizeof(uniforms) / sizeof(u32), &uniforms, 0);
  cmdlist->SetGraphicsRootDescriptorTable(1, g_d3d12_context->GetNullSRVDescriptor());
-  cmdlist->SetPipelineState(m_vram_fill_pipelines[BoolToUInt8(IsInterlacedRenderingEnabled())].Get());
-  D3D12::SetViewportAndScissor(cmdlist, x, y, width, height);
+  cmdlist->SetPipelineState(m_vram_fill_pipelines[BoolToUInt8(IsVRAMFillOversized(x, y, width, height))]
+                                                 [BoolToUInt8(IsInterlacedRenderingEnabled())]
+                                                   .Get());
+
+  const Common::Rectangle<u32> bounds(GetVRAMTransferBounds(x, y, width, height));
+  D3D12::SetViewportAndScissor(cmdlist, bounds.left * m_resolution_scale, bounds.top * m_resolution_scale,
+                               bounds.GetWidth() * m_resolution_scale, bounds.GetHeight() * m_resolution_scale);
+
  cmdlist->DrawInstanced(3, 1, 0, 0);

  RestoreGraphicsAPIState();
--- a/src/core/gpu_hw_d3d12.h
+++ b/src/core/gpu_hw_d3d12.h
@ -92,8 +92,8 @@ private:
  // [depth_test][render_mode][texture_mode][transparency_mode][dithering][interlacing]
  DimensionalArray<ComPtr<ID3D12PipelineState>, 2, 2, 5, 9, 4, 2> m_batch_pipelines;

-  // [interlaced]
-  std::array<ComPtr<ID3D12PipelineState>, 2> m_vram_fill_pipelines;
+  // [wrapped][interlaced]
+  DimensionalArray<ComPtr<ID3D12PipelineState>, 2, 2> m_vram_fill_pipelines;

  // [depth_test]
  std::array<ComPtr<ID3D12PipelineState>, 2> m_vram_write_pipelines;
--- a/src/core/gpu_hw_opengl.cpp
+++ b/src/core/gpu_hw_opengl.cpp
@ -517,7 +517,7 @@ bool GPU_HW_OpenGL::CompilePrograms()
                             m_true_color, m_scaled_dithering, m_texture_filtering, m_using_uv_limits,
                             m_pgxp_depth_buffer, m_supports_dual_source_blend);

-  ShaderCompileProgressTracker progress("Compiling Programs", (4 * 9 * 2 * 2) + (2 * 3) + 1 + 1 + 1 + 1 + 1 + 1);
+  ShaderCompileProgressTracker progress("Compiling Programs", (4 * 9 * 2 * 2) + (2 * 3) + (2 * 2) + 1 + 1 + 1 + 1 + 1);

  for (u32 render_mode = 0; render_mode < 4; render_mode++)
  {
@ -609,8 +609,13 @@ bool GPU_HW_OpenGL::CompilePrograms()
    }
  }

-  std::optional<GL::Program> prog = shader_cache.GetProgram(shadergen.GenerateScreenQuadVertexShader(), {},
-                                                            shadergen.GenerateInterlacedFillFragmentShader(),
+  for (u8 wrapped = 0; wrapped < 2; wrapped++)
+  {
+    for (u8 interlaced = 0; interlaced < 2; interlaced++)
+    {
+      std::optional<GL::Program> prog = shader_cache.GetProgram(
+        shadergen.GenerateScreenQuadVertexShader(), {},
+        shadergen.GenerateVRAMFillFragmentShader(ConvertToBoolUnchecked(wrapped), ConvertToBoolUnchecked(interlaced)),
        [this, use_binding_layout](GL::Program& prog) {
          if (!IsGLES() && !use_binding_layout)
            prog.BindFragData(0, "o_col0");
@ -621,10 +626,12 @@ bool GPU_HW_OpenGL::CompilePrograms()
      if (!use_binding_layout)
        prog->BindUniformBlock("UBOBlock", 1);

-  m_vram_interlaced_fill_program = std::move(*prog);
+      m_vram_fill_programs[wrapped][interlaced] = std::move(*prog);
      progress.Increment();
+    }
+  }

-  prog =
+  std::optional<GL::Program> prog =
    shader_cache.GetProgram(shadergen.GenerateScreenQuadVertexShader(), {}, shadergen.GenerateVRAMReadFragmentShader(),
                            [this, use_binding_layout](GL::Program& prog) {
                              if (!IsGLES() && !use_binding_layout)
@ -1014,28 +1021,17 @@ void GPU_HW_OpenGL::FillVRAM(u32 x, u32 y, u32 width, u32 height, u32 color)
  if (IsUsingSoftwareRendererForReadbacks())
    FillSoftwareRendererVRAM(x, y, width, height, color);

-  if ((x + width) > VRAM_WIDTH || (y + height) > VRAM_HEIGHT)
-  {
-    // CPU round trip if oversized for now.
-    Log_WarningPrintf("Oversized VRAM fill (%u-%u, %u-%u), CPU round trip", x, x + width, y, y + height);
-    ReadVRAM(0, 0, VRAM_WIDTH, VRAM_HEIGHT);
-    GPU::FillVRAM(x, y, width, height, color);
-    UpdateVRAM(0, 0, VRAM_WIDTH, VRAM_HEIGHT, m_vram_ptr, false, false);
-    return;
-  }
-
  GPU_HW::FillVRAM(x, y, width, height, color);

-  // scale coordinates
-  x *= m_resolution_scale;
-  y *= m_resolution_scale;
-  width *= m_resolution_scale;
-  height *= m_resolution_scale;
-
-  glScissor(x, m_vram_texture.GetHeight() - y - height, width, height);
+  const Common::Rectangle<u32> bounds(GetVRAMTransferBounds(x, y, width, height));
+  glScissor(bounds.left * m_resolution_scale, // extent must come from bounds too, as in the D3D11/D3D12/Vulkan paths
+            m_vram_texture.GetHeight() - (bounds.top * m_resolution_scale) - (bounds.GetHeight() * m_resolution_scale),
+            bounds.GetWidth() * m_resolution_scale, bounds.GetHeight() * m_resolution_scale);

  // fast path when not using interlaced rendering
-  if (!IsInterlacedRenderingEnabled())
+  const bool wrapped = IsVRAMFillOversized(x, y, width, height);
+  const bool interlaced = IsInterlacedRenderingEnabled();
+  if (!wrapped && !interlaced)
  {
    const auto [r, g, b, a] =
      RGBA8ToFloat(m_true_color ? color : VRAMRGBA5551ToRGBA8888(VRAMRGBA8888ToRGBA5551(color)));
@ -1048,7 +1044,7 @@ void GPU_HW_OpenGL::FillVRAM(u32 x, u32 y, u32 width, u32 height, u32 color)
  {
    const VRAMFillUBOData uniforms = GetVRAMFillUBOData(x, y, width, height, color);

-    m_vram_interlaced_fill_program.Bind();
+    m_vram_fill_programs[BoolToUInt8(wrapped)][BoolToUInt8(interlaced)].Bind();
    UploadUniformBuffer(&uniforms, sizeof(uniforms));
    glDisable(GL_BLEND);
    SetDepthFunc(GL_ALWAYS);
--- a/src/core/gpu_hw_opengl.h
+++ b/src/core/gpu_hw_opengl.h
@ -99,7 +99,7 @@ private:
  std::array<std::array<std::array<std::array<GL::Program, 2>, 2>, 9>, 4>
    m_render_programs;                                          // [render_mode][texture_mode][dithering][interlacing]
  std::array<std::array<GL::Program, 3>, 2> m_display_programs; // [depth_24][interlaced]
-  GL::Program m_vram_interlaced_fill_program;
+  std::array<std::array<GL::Program, 2>, 2> m_vram_fill_programs; // [wrapped][interlaced]
  GL::Program m_vram_read_program;
  GL::Program m_vram_write_program;
  GL::Program m_vram_copy_program;
--- a/src/core/gpu_hw_shadergen.cpp
+++ b/src/core/gpu_hw_shadergen.cpp
@ -997,27 +997,6 @@ float4 SampleFromVRAM(uint4 texpage, float2 coords)
  return ss.str();
 }

-std::string GPU_HW_ShaderGen::GenerateInterlacedFillFragmentShader()
-{
-  std::stringstream ss;
-  WriteHeader(ss);
-  WriteCommonFunctions(ss);
-  DeclareUniformBuffer(ss, {"float4 u_fill_color", "uint u_interlaced_displayed_field"}, true);
-  DeclareFragmentEntryPoint(ss, 0, 1, {}, true, 1, true);
-
-  ss << R"(
-{
-  if ((fixYCoord(uint(v_pos.y)) & 1u) == u_interlaced_displayed_field)
-    discard;
-
-  o_col0 = u_fill_color;
-  o_depth = u_fill_color.a;
-}
-)";
-
-  return ss.str();
-}
-
 std::string GPU_HW_ShaderGen::GenerateDisplayFragmentShader(bool depth_24bit,
                                                            GPU_HW::InterlacedRenderMode interlace_mode,
                                                            bool smooth_chroma)
@ -1324,6 +1303,50 @@ std::string GPU_HW_ShaderGen::GenerateVRAMCopyFragmentShader()
  return ss.str();
 }

+std::string GPU_HW_ShaderGen::GenerateVRAMFillFragmentShader(bool wrapped, bool interlaced)
+{
+  std::stringstream ss; // accumulates the generated shader source, returned as a string
+  WriteHeader(ss);
+  WriteCommonFunctions(ss);
+  DefineMacro(ss, "PGXP_DEPTH", m_pgxp_depth); // selects whether o_depth is the fill alpha or a constant 1.0
+  DefineMacro(ss, "WRAPPED", wrapped); // enables the discard of pixels between the wrapped end and start coords
+  DefineMacro(ss, "INTERLACED", interlaced); // enables the discard of rows matching u_interlaced_displayed_field
+
+  DeclareUniformBuffer( // NOTE(review): member order presumably has to mirror GPU_HW::VRAMFillUBOData - confirm
+    ss, {"uint2 u_dst_coords", "uint2 u_end_coords", "float4 u_fill_color", "uint u_interlaced_displayed_field"}, true);
+
+  DeclareFragmentEntryPoint(ss, 0, 1, {}, interlaced || wrapped, 1, true, false, false, false);
+  ss << R"(
+{
+#if INTERLACED || WRAPPED
+  uint2 dst_coords = uint2(uint(v_pos.x), fixYCoord(uint(v_pos.y)));
+#endif
+
+#if INTERLACED
+  if ((dst_coords.y & 1u) == u_interlaced_displayed_field)
+    discard;
+#endif
+
+#if WRAPPED
+  // make sure it's not oversized and out of range
+  if ((dst_coords.x < u_dst_coords.x && dst_coords.x >= u_end_coords.x) ||
+      (dst_coords.y < u_dst_coords.y && dst_coords.y >= u_end_coords.y))
+  {
+    discard;
+  }
+#endif
+
+  o_col0 = u_fill_color;
+#if !PGXP_DEPTH
+  o_depth = u_fill_color.a;
+#else
+  o_depth = 1.0f;
+#endif
+})";
+
+  return ss.str();
+}
+
 std::string GPU_HW_ShaderGen::GenerateVRAMUpdateDepthFragmentShader()
 {
  std::stringstream ss;
--- a/src/core/gpu_hw_shadergen.h
+++ b/src/core/gpu_hw_shadergen.h
@ -13,12 +13,12 @@ public:
  std::string GenerateBatchVertexShader(bool textured);
  std::string GenerateBatchFragmentShader(GPU_HW::BatchRenderMode transparency, GPUTextureMode texture_mode,
                                          bool dithering, bool interlacing);
-  std::string GenerateInterlacedFillFragmentShader();
  std::string GenerateDisplayFragmentShader(bool depth_24bit, GPU_HW::InterlacedRenderMode interlace_mode,
                                            bool smooth_chroma);
  std::string GenerateVRAMReadFragmentShader();
  std::string GenerateVRAMWriteFragmentShader(bool use_ssbo);
  std::string GenerateVRAMCopyFragmentShader();
+  std::string GenerateVRAMFillFragmentShader(bool wrapped, bool interlaced);
  std::string GenerateVRAMUpdateDepthFragmentShader();

  std::string GenerateAdaptiveDownsampleMipFragmentShader(bool first_pass);
--- a/src/core/gpu_hw_vulkan.cpp
+++ b/src/core/gpu_hw_vulkan.cpp
@ -840,7 +840,7 @@ bool GPU_HW_Vulkan::CompilePipelines()
                             m_pgxp_depth_buffer, m_supports_dual_source_blend);

  ShaderCompileProgressTracker progress("Compiling Pipelines", 2 + (4 * 9 * 2 * 2) + (3 * 4 * 5 * 9 * 2 * 2) + 1 + 2 +
-                                                                 2 + 2 + 1 + 1 + (2 * 3) + 1);
+                                                                 (2 * 2) + 2 + 1 + 1 + (2 * 3) + 1);

  // vertex shaders - [textured]
  // fragment shaders - [render_mode][texture_mode][dithering][interlacing]
@ -1010,11 +1010,12 @@ bool GPU_HW_Vulkan::CompilePipelines()
  gpbuilder.SetMultisamples(m_multisamples, false);

  // VRAM fill
+  for (u8 wrapped = 0; wrapped < 2; wrapped++)
  {
    for (u8 interlaced = 0; interlaced < 2; interlaced++)
    {
      VkShaderModule fs = g_vulkan_shader_cache->GetFragmentShader(
-        (interlaced == 0) ? shadergen.GenerateFillFragmentShader() : shadergen.GenerateInterlacedFillFragmentShader());
+        shadergen.GenerateVRAMFillFragmentShader(ConvertToBoolUnchecked(wrapped), ConvertToBoolUnchecked(interlaced)));
      if (fs == VK_NULL_HANDLE)
        return false;

@ -1022,9 +1023,9 @@ bool GPU_HW_Vulkan::CompilePipelines()
      gpbuilder.SetFragmentShader(fs);
      gpbuilder.SetDepthState(true, true, VK_COMPARE_OP_ALWAYS);

-      m_vram_fill_pipelines[interlaced] = gpbuilder.Create(device, pipeline_cache, false);
+      m_vram_fill_pipelines[wrapped][interlaced] = gpbuilder.Create(device, pipeline_cache, false);
      vkDestroyShaderModule(device, fs, nullptr);
-      if (m_vram_fill_pipelines[interlaced] == VK_NULL_HANDLE)
+      if (m_vram_fill_pipelines[wrapped][interlaced] == VK_NULL_HANDLE)
        return false;

      progress.Increment();
@ -1249,8 +1250,7 @@ void GPU_HW_Vulkan::DestroyPipelines()
 {
  m_batch_pipelines.enumerate(Vulkan::Util::SafeDestroyPipeline);

-  for (VkPipeline& p : m_vram_fill_pipelines)
-    Vulkan::Util::SafeDestroyPipeline(p);
+  m_vram_fill_pipelines.enumerate(Vulkan::Util::SafeDestroyPipeline);

  for (VkPipeline& p : m_vram_write_pipelines)
    Vulkan::Util::SafeDestroyPipeline(p);
@ -1482,23 +1482,8 @@ void GPU_HW_Vulkan::FillVRAM(u32 x, u32 y, u32 width, u32 height, u32 color)
  if (IsUsingSoftwareRendererForReadbacks())
    FillSoftwareRendererVRAM(x, y, width, height, color);

-  if ((x + width) > VRAM_WIDTH || (y + height) > VRAM_HEIGHT)
-  {
-    // CPU round trip if oversized for now.
-    Log_WarningPrintf("Oversized VRAM fill (%u-%u, %u-%u), CPU round trip", x, x + width, y, y + height);
-    ReadVRAM(0, 0, VRAM_WIDTH, VRAM_HEIGHT);
-    GPU::FillVRAM(x, y, width, height, color);
-    UpdateVRAM(0, 0, VRAM_WIDTH, VRAM_HEIGHT, m_vram_ptr, false, false);
-    return;
-  }
-
  GPU_HW::FillVRAM(x, y, width, height, color);

-  x *= m_resolution_scale;
-  y *= m_resolution_scale;
-  width *= m_resolution_scale;
-  height *= m_resolution_scale;
-
  BeginVRAMRenderPass();

  VkCommandBuffer cmdbuf = g_vulkan_context->GetCurrentCommandBuffer();
@ -1506,8 +1491,12 @@ void GPU_HW_Vulkan::FillVRAM(u32 x, u32 y, u32 width, u32 height, u32 color)
  vkCmdPushConstants(cmdbuf, m_no_samplers_pipeline_layout, VK_SHADER_STAGE_FRAGMENT_BIT, 0, sizeof(uniforms),
                     &uniforms);
  vkCmdBindPipeline(cmdbuf, VK_PIPELINE_BIND_POINT_GRAPHICS,
-                    m_vram_fill_pipelines[BoolToUInt8(IsInterlacedRenderingEnabled())]);
-  Vulkan::Util::SetViewportAndScissor(cmdbuf, x, y, width, height);
+                    m_vram_fill_pipelines[BoolToUInt8(IsVRAMFillOversized(x, y, width, height))]
+                                         [BoolToUInt8(IsInterlacedRenderingEnabled())]);
+
+  const Common::Rectangle<u32> bounds(GetVRAMTransferBounds(x, y, width, height));
+  Vulkan::Util::SetViewportAndScissor(cmdbuf, bounds.left * m_resolution_scale, bounds.top * m_resolution_scale,
+                                      bounds.GetWidth() * m_resolution_scale, bounds.GetHeight() * m_resolution_scale);
  vkCmdDraw(cmdbuf, 3, 1, 0, 0);

  RestoreGraphicsAPIState();
--- a/src/core/gpu_hw_vulkan.h
+++ b/src/core/gpu_hw_vulkan.h
@ -129,8 +129,8 @@ private:
  // [depth_test][render_mode][texture_mode][transparency_mode][dithering][interlacing]
  DimensionalArray<VkPipeline, 2, 2, 5, 9, 4, 3> m_batch_pipelines{};

-  // [interlaced]
-  std::array<VkPipeline, 2> m_vram_fill_pipelines{};
+  // [wrapped][interlaced]
+  DimensionalArray<VkPipeline, 2, 2> m_vram_fill_pipelines{};

  // [depth_test]
  std::array<VkPipeline, 2> m_vram_write_pipelines{};