GPU/HW: Implement oversized copies on GPU
Fixes the slowdown caused by oversized VRAM copies, which previously fell back to a CPU round trip.
This commit is contained in:
parent
045c4d1745
commit
5ad133a278
|
@ -433,7 +433,7 @@ void GPU_HW::CalcScissorRect(int* left, int* top, int* right, int* bottom)
|
||||||
*bottom = std::max<u32>((m_drawing_area.bottom + 1) * m_resolution_scale, *top + 1);
|
*bottom = std::max<u32>((m_drawing_area.bottom + 1) * m_resolution_scale, *top + 1);
|
||||||
}
|
}
|
||||||
|
|
||||||
Common::Rectangle<u32> GPU_HW::GetVRAMTransferBounds(u32 x, u32 y, u32 width, u32 height)
|
Common::Rectangle<u32> GPU_HW::GetVRAMTransferBounds(u32 x, u32 y, u32 width, u32 height) const
|
||||||
{
|
{
|
||||||
Common::Rectangle<u32> out_rc = Common::Rectangle<u32>::FromExtents(x, y, width, height);
|
Common::Rectangle<u32> out_rc = Common::Rectangle<u32>::FromExtents(x, y, width, height);
|
||||||
if (out_rc.right > VRAM_WIDTH)
|
if (out_rc.right > VRAM_WIDTH)
|
||||||
|
@ -449,6 +449,15 @@ Common::Rectangle<u32> GPU_HW::GetVRAMTransferBounds(u32 x, u32 y, u32 width, u3
|
||||||
return out_rc;
|
return out_rc;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Decides whether a VRAM-to-VRAM copy has to go through the copy shader.
/// Returns true when masking is enabled, when either rectangle extends past the VRAM
/// dimensions, or when the source and destination rectangles intersect.
bool GPU_HW::UseVRAMCopyShader(u32 src_x, u32 src_y, u32 dst_x, u32 dst_y, u32 width, u32 height) const
{
  // masking enabled
  if (m_GPUSTAT.IsMaskingEnabled())
    return true;

  // oversized: the source or destination rectangle runs off the right/bottom edge of VRAM
  const bool src_oversized = (src_x + width) > VRAM_WIDTH || (src_y + height) > VRAM_HEIGHT;
  const bool dst_oversized = (dst_x + width) > VRAM_WIDTH || (dst_y + height) > VRAM_HEIGHT;
  if (src_oversized || dst_oversized)
    return true;

  // overlapping: the source and destination rectangles share at least part of their area
  const Common::Rectangle<u32> src_rect = Common::Rectangle<u32>::FromExtents(src_x, src_y, width, height);
  const Common::Rectangle<u32> dst_rect = Common::Rectangle<u32>::FromExtents(dst_x, dst_y, width, height);
  return src_rect.Intersects(dst_rect);
}
|
||||||
|
|
||||||
GPU_HW::BatchPrimitive GPU_HW::GetPrimitiveForCommand(RenderCommand rc)
|
GPU_HW::BatchPrimitive GPU_HW::GetPrimitiveForCommand(RenderCommand rc)
|
||||||
{
|
{
|
||||||
if (rc.primitive == Primitive::Line)
|
if (rc.primitive == Primitive::Line)
|
||||||
|
|
|
@ -106,6 +106,17 @@ protected:
|
||||||
u32 u_interlaced_displayed_field;
|
u32 u_interlaced_displayed_field;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
/// Uniform data handed to the VRAM copy shader by the CopyVRAM() implementations.
/// Coordinates and sizes are filled in already multiplied by m_resolution_scale.
/// NOTE(review): the field order presumably has to line up with the shader's uniform block
/// (uint2 u_src_coords, uint2 u_dst_coords, uint2 u_size, bool u_set_mask_bit) - confirm before reordering.
struct VRAMCopyUBOData
|
||||||

{
|
||||||

u32 u_src_x; // source X of the copy, scaled by the resolution scale
|
||||||

u32 u_src_y; // source Y of the copy, scaled; the OpenGL path rewrites it relative to the VRAM texture height
|
||||||

u32 u_dst_x; // destination X of the copy, scaled by the resolution scale
|
||||||

u32 u_dst_y; // destination Y of the copy, scaled; the OpenGL path rewrites it relative to the VRAM texture height
|
||||||

u32 u_width; // copy width, scaled by the resolution scale
|
||||||

u32 u_height; // copy height, scaled by the resolution scale
|
||||||

u32 u_set_mask_bit; // 1 when m_GPUSTAT.set_mask_while_drawing is set, otherwise 0
|
||||||

};
|
||||||
|
|
||||||
struct RendererStats
|
struct RendererStats
|
||||||
{
|
{
|
||||||
u32 num_batches;
|
u32 num_batches;
|
||||||
|
@ -157,7 +168,10 @@ protected:
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Computes the area affected by a VRAM transfer, including wrap-around of X.
|
/// Computes the area affected by a VRAM transfer, including wrap-around of X.
|
||||||
Common::Rectangle<u32> GetVRAMTransferBounds(u32 x, u32 y, u32 width, u32 height);
|
Common::Rectangle<u32> GetVRAMTransferBounds(u32 x, u32 y, u32 width, u32 height) const;
|
||||||
|
|
||||||
|
/// Returns true if the VRAM copy shader should be used (masking enabled, oversized copies, or overlapping source/destination).
|
||||||
|
bool UseVRAMCopyShader(u32 src_x, u32 src_y, u32 dst_x, u32 dst_y, u32 width, u32 height) const;
|
||||||
|
|
||||||
/// Handles quads with flipped texture coordinate directions.
|
/// Handles quads with flipped texture coordinate directions.
|
||||||
static void HandleFlippedQuadTextureCoordinates(BatchVertex* vertices);
|
static void HandleFlippedQuadTextureCoordinates(BatchVertex* vertices);
|
||||||
|
|
|
@ -404,6 +404,10 @@ bool GPU_HW_D3D11::CompileShaders()
|
||||||
if (!m_vram_write_pixel_shader)
|
if (!m_vram_write_pixel_shader)
|
||||||
return false;
|
return false;
|
||||||
|
|
||||||
|
m_vram_copy_pixel_shader = m_shader_cache.GetPixelShader(m_device.Get(), shadergen.GenerateVRAMCopyFragmentShader());
|
||||||
|
if (!m_vram_copy_pixel_shader)
|
||||||
|
return false;
|
||||||
|
|
||||||
for (u8 depth_24bit = 0; depth_24bit < 2; depth_24bit++)
|
for (u8 depth_24bit = 0; depth_24bit < 2; depth_24bit++)
|
||||||
{
|
{
|
||||||
for (u8 interlacing = 0; interlacing < 2; interlacing++)
|
for (u8 interlacing = 0; interlacing < 2; interlacing++)
|
||||||
|
@ -689,14 +693,30 @@ void GPU_HW_D3D11::UpdateVRAM(u32 x, u32 y, u32 width, u32 height, const void* d
|
||||||
|
|
||||||
void GPU_HW_D3D11::CopyVRAM(u32 src_x, u32 src_y, u32 dst_x, u32 dst_y, u32 width, u32 height)
|
void GPU_HW_D3D11::CopyVRAM(u32 src_x, u32 src_y, u32 dst_x, u32 dst_y, u32 width, u32 height)
|
||||||
{
|
{
|
||||||
if ((src_x + width) > VRAM_WIDTH || (src_y + height) > VRAM_HEIGHT || (dst_x + width) > VRAM_WIDTH ||
|
if (UseVRAMCopyShader(src_x, src_y, dst_x, dst_y, width, height))
|
||||||
(dst_y + height) > VRAM_HEIGHT)
|
|
||||||
{
|
{
|
||||||
Log_WarningPrintf("Oversized VRAM copy (%u,%u, %u,%u, %u,%u), CPU round trip", src_x, src_y, dst_x, dst_y, width,
|
const Common::Rectangle<u32> src_bounds = GetVRAMTransferBounds(src_x, src_y, width, height);
|
||||||
height);
|
const Common::Rectangle<u32> dst_bounds = GetVRAMTransferBounds(dst_x, dst_y, width, height);
|
||||||
ReadVRAM(0, 0, VRAM_WIDTH, VRAM_HEIGHT);
|
if (m_vram_dirty_rect.Intersects(src_bounds))
|
||||||
GPU::CopyVRAM(src_x, src_y, dst_x, dst_y, width, height);
|
UpdateVRAMReadTexture();
|
||||||
UpdateVRAM(0, 0, VRAM_WIDTH, VRAM_HEIGHT, m_vram_shadow.data());
|
IncludeVRAMDityRectangle(dst_bounds);
|
||||||
|
|
||||||
|
const VRAMCopyUBOData uniforms = {
|
||||||
|
src_x * m_resolution_scale,
|
||||||
|
src_y * m_resolution_scale,
|
||||||
|
dst_x * m_resolution_scale,
|
||||||
|
dst_y * m_resolution_scale,
|
||||||
|
width * m_resolution_scale,
|
||||||
|
height * m_resolution_scale,
|
||||||
|
m_GPUSTAT.set_mask_while_drawing ? 1u : 0u,
|
||||||
|
};
|
||||||
|
|
||||||
|
const Common::Rectangle<u32> dst_bounds_scaled(dst_bounds * m_resolution_scale);
|
||||||
|
SetViewportAndScissor(dst_bounds_scaled.left, dst_bounds_scaled.top, dst_bounds_scaled.GetWidth(),
|
||||||
|
dst_bounds_scaled.GetHeight());
|
||||||
|
m_context->PSSetShaderResources(0, 1, m_vram_read_texture.GetD3DSRVArray());
|
||||||
|
DrawUtilityShader(m_vram_copy_pixel_shader.Get(), &uniforms, sizeof(uniforms));
|
||||||
|
RestoreGraphicsAPIState();
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -113,5 +113,6 @@ private:
|
||||||
ComPtr<ID3D11PixelShader> m_vram_interlaced_fill_pixel_shader;
|
ComPtr<ID3D11PixelShader> m_vram_interlaced_fill_pixel_shader;
|
||||||
ComPtr<ID3D11PixelShader> m_vram_read_pixel_shader;
|
ComPtr<ID3D11PixelShader> m_vram_read_pixel_shader;
|
||||||
ComPtr<ID3D11PixelShader> m_vram_write_pixel_shader;
|
ComPtr<ID3D11PixelShader> m_vram_write_pixel_shader;
|
||||||
|
ComPtr<ID3D11PixelShader> m_vram_copy_pixel_shader;
|
||||||
std::array<std::array<ComPtr<ID3D11PixelShader>, 2>, 2> m_display_pixel_shaders; // [depth_24][interlaced]
|
std::array<std::array<ComPtr<ID3D11PixelShader>, 2>, 2> m_display_pixel_shaders; // [depth_24][interlaced]
|
||||||
};
|
};
|
||||||
|
|
|
@ -431,6 +431,19 @@ bool GPU_HW_OpenGL::CompilePrograms()
|
||||||
prog->Uniform1i("samp0", 0);
|
prog->Uniform1i("samp0", 0);
|
||||||
m_vram_read_program = std::move(*prog);
|
m_vram_read_program = std::move(*prog);
|
||||||
|
|
||||||
|
prog = m_shader_cache.GetProgram(shadergen.GenerateScreenQuadVertexShader(), {},
|
||||||
|
shadergen.GenerateVRAMCopyFragmentShader(), [this](GL::Program& prog) {
|
||||||
|
if (!m_is_gles)
|
||||||
|
prog.BindFragData(0, "o_col0");
|
||||||
|
});
|
||||||
|
if (!prog)
|
||||||
|
return false;
|
||||||
|
|
||||||
|
prog->BindUniformBlock("UBOBlock", 1);
|
||||||
|
prog->Bind();
|
||||||
|
prog->Uniform1i("samp0", 0);
|
||||||
|
m_vram_copy_program = std::move(*prog);
|
||||||
|
|
||||||
if (m_supports_texture_buffer)
|
if (m_supports_texture_buffer)
|
||||||
{
|
{
|
||||||
prog = m_shader_cache.GetProgram(shadergen.GenerateScreenQuadVertexShader(), {},
|
prog = m_shader_cache.GetProgram(shadergen.GenerateScreenQuadVertexShader(), {},
|
||||||
|
@ -770,14 +783,39 @@ void GPU_HW_OpenGL::UpdateVRAM(u32 x, u32 y, u32 width, u32 height, const void*
|
||||||
|
|
||||||
void GPU_HW_OpenGL::CopyVRAM(u32 src_x, u32 src_y, u32 dst_x, u32 dst_y, u32 width, u32 height)
|
void GPU_HW_OpenGL::CopyVRAM(u32 src_x, u32 src_y, u32 dst_x, u32 dst_y, u32 width, u32 height)
|
||||||
{
|
{
|
||||||
if ((src_x + width) > VRAM_WIDTH || (src_y + height) > VRAM_HEIGHT || (dst_x + width) > VRAM_WIDTH ||
|
if (UseVRAMCopyShader(src_x, src_y, dst_x, dst_y, width, height))
|
||||||
(dst_y + height) > VRAM_HEIGHT)
|
|
||||||
{
|
{
|
||||||
Log_WarningPrintf("Oversized VRAM copy (%u,%u, %u,%u, %u,%u), CPU round trip", src_x, src_y, dst_x, dst_y, width,
|
const Common::Rectangle<u32> src_bounds = GetVRAMTransferBounds(src_x, src_y, width, height);
|
||||||
height);
|
const Common::Rectangle<u32> dst_bounds = GetVRAMTransferBounds(dst_x, dst_y, width, height);
|
||||||
ReadVRAM(0, 0, VRAM_WIDTH, VRAM_HEIGHT);
|
if (m_vram_dirty_rect.Intersects(src_bounds))
|
||||||
GPU::CopyVRAM(src_x, src_y, dst_x, dst_y, width, height);
|
UpdateVRAMReadTexture();
|
||||||
UpdateVRAM(0, 0, VRAM_WIDTH, VRAM_HEIGHT, m_vram_shadow.data());
|
IncludeVRAMDityRectangle(dst_bounds);
|
||||||
|
|
||||||
|
VRAMCopyUBOData uniforms = {
|
||||||
|
src_x * m_resolution_scale,
|
||||||
|
src_y * m_resolution_scale,
|
||||||
|
dst_x * m_resolution_scale,
|
||||||
|
dst_y * m_resolution_scale,
|
||||||
|
width * m_resolution_scale,
|
||||||
|
height * m_resolution_scale,
|
||||||
|
m_GPUSTAT.set_mask_while_drawing ? 1u : 0u,
|
||||||
|
};
|
||||||
|
uniforms.u_src_y = m_vram_texture.GetHeight() - uniforms.u_src_y - uniforms.u_height;
|
||||||
|
uniforms.u_dst_y = m_vram_texture.GetHeight() - uniforms.u_dst_y - uniforms.u_height;
|
||||||
|
UploadUniformBuffer(&uniforms, sizeof(uniforms));
|
||||||
|
|
||||||
|
glDisable(GL_SCISSOR_TEST);
|
||||||
|
glDisable(GL_BLEND);
|
||||||
|
|
||||||
|
const Common::Rectangle<u32> dst_bounds_scaled(dst_bounds * m_resolution_scale);
|
||||||
|
glViewport(dst_bounds_scaled.left,
|
||||||
|
m_vram_texture.GetHeight() - dst_bounds_scaled.top - dst_bounds_scaled.GetHeight(),
|
||||||
|
dst_bounds_scaled.GetWidth(), dst_bounds_scaled.GetHeight());
|
||||||
|
m_vram_read_texture.Bind();
|
||||||
|
m_vram_copy_program.Bind();
|
||||||
|
glDrawArrays(GL_TRIANGLES, 0, 3);
|
||||||
|
|
||||||
|
RestoreGraphicsAPIState();
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -84,6 +84,7 @@ private:
|
||||||
GL::Program m_vram_interlaced_fill_program;
|
GL::Program m_vram_interlaced_fill_program;
|
||||||
GL::Program m_vram_read_program;
|
GL::Program m_vram_read_program;
|
||||||
GL::Program m_vram_write_program;
|
GL::Program m_vram_write_program;
|
||||||
|
GL::Program m_vram_copy_program;
|
||||||
|
|
||||||
u32 m_uniform_buffer_alignment = 1;
|
u32 m_uniform_buffer_alignment = 1;
|
||||||
u32 m_max_texture_buffer_size = 0;
|
u32 m_max_texture_buffer_size = 0;
|
||||||
|
|
|
@ -967,3 +967,32 @@ std::string GPU_HW_ShaderGen::GenerateVRAMWriteFragmentShader()
|
||||||
|
|
||||||
return ss.str();
|
return ss.str();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
std::string GPU_HW_ShaderGen::GenerateVRAMCopyFragmentShader()
|
||||||
|
{
|
||||||
|
std::stringstream ss;
|
||||||
|
WriteHeader(ss);
|
||||||
|
WriteCommonFunctions(ss);
|
||||||
|
DeclareUniformBuffer(ss, {"uint2 u_src_coords", "uint2 u_dst_coords", "uint2 u_size", "bool u_set_mask_bit"});
|
||||||
|
|
||||||
|
DeclareTexture(ss, "samp0", 0);
|
||||||
|
DeclareFragmentEntryPoint(ss, 0, 1, {}, true, false);
|
||||||
|
ss << R"(
|
||||||
|
{
|
||||||
|
uint2 dst_coords = uint2(v_pos.xy);
|
||||||
|
|
||||||
|
// find offset from the start of the row/column
|
||||||
|
uint2 offset;
|
||||||
|
offset.x = (dst_coords.x < u_dst_coords.x) ? (uint(VRAM_SIZE.x - 1) - u_dst_coords.x + dst_coords.x) : (dst_coords.x - u_dst_coords.x);
|
||||||
|
offset.y = (dst_coords.y < u_dst_coords.y) ? (uint(VRAM_SIZE.y - 1) - u_dst_coords.y + dst_coords.y) : (dst_coords.y - u_dst_coords.y);
|
||||||
|
|
||||||
|
// find the source coordinates to copy from
|
||||||
|
uint2 src_coords = (u_src_coords + offset) % uint2(VRAM_SIZE);
|
||||||
|
|
||||||
|
// sample and apply mask bit
|
||||||
|
float4 color = LOAD_TEXTURE(samp0, int2(src_coords), 0);
|
||||||
|
o_col0 = float4(color.xyz, u_set_mask_bit ? 1.0 : color.a);
|
||||||
|
})";
|
||||||
|
|
||||||
|
return ss.str();
|
||||||
|
}
|
||||||
|
|
|
@ -22,6 +22,7 @@ public:
|
||||||
std::string GenerateDisplayFragmentShader(bool depth_24bit, bool interlaced);
|
std::string GenerateDisplayFragmentShader(bool depth_24bit, bool interlaced);
|
||||||
std::string GenerateVRAMReadFragmentShader();
|
std::string GenerateVRAMReadFragmentShader();
|
||||||
std::string GenerateVRAMWriteFragmentShader();
|
std::string GenerateVRAMWriteFragmentShader();
|
||||||
|
std::string GenerateVRAMCopyFragmentShader();
|
||||||
|
|
||||||
HostDisplay::RenderAPI m_render_api;
|
HostDisplay::RenderAPI m_render_api;
|
||||||
u32 m_resolution_scale;
|
u32 m_resolution_scale;
|
||||||
|
|
Loading…
Reference in New Issue