From b9186139d0ebc35e998dc8a878ace927f4cc30c4 Mon Sep 17 00:00:00 2001 From: Stenzek Date: Tue, 7 Jan 2025 17:03:26 +1000 Subject: [PATCH] GPU/HW: Use sized tristrips instead of fullscreen quads --- src/core/gpu_backend.cpp | 43 +++++++++++++++++++++++ src/core/gpu_backend.h | 19 +++++++++++ src/core/gpu_hw.cpp | 64 +++++++++++++++++++++-------------- src/core/gpu_hw.h | 2 ++ src/core/gpu_hw_shadergen.cpp | 21 ++++++++++++ src/core/gpu_hw_shadergen.h | 2 ++ 6 files changed, 125 insertions(+), 26 deletions(-) diff --git a/src/core/gpu_backend.cpp b/src/core/gpu_backend.cpp index 4f138e3d4..4e6c30707 100644 --- a/src/core/gpu_backend.cpp +++ b/src/core/gpu_backend.cpp @@ -1209,6 +1209,49 @@ bool GPUBackend::ApplyChromaSmoothing() return true; } +void GPUBackend::SetScreenQuadInputLayout(GPUPipeline::GraphicsConfig& config) +{ + static constexpr GPUPipeline::VertexAttribute screen_vertex_attributes[] = { + GPUPipeline::VertexAttribute::Make(0, GPUPipeline::VertexAttribute::Semantic::Position, 0, + GPUPipeline::VertexAttribute::Type::Float, 2, OFFSETOF(ScreenVertex, x)), + GPUPipeline::VertexAttribute::Make(1, GPUPipeline::VertexAttribute::Semantic::TexCoord, 0, + GPUPipeline::VertexAttribute::Type::Float, 2, OFFSETOF(ScreenVertex, u)), + }; + + // common state + config.input_layout.vertex_attributes = screen_vertex_attributes; + config.input_layout.vertex_stride = sizeof(ScreenVertex); + config.primitive = GPUPipeline::Primitive::TriangleStrips; +} + +GSVector4 GPUBackend::GetScreenQuadClipSpaceCoordinates(const GSVector4i bounds, const GSVector2i rt_size) +{ + const GSVector4 fboundsxxyy = GSVector4(bounds.xzyw()); + const GSVector2 fsize = GSVector2(rt_size); + const GSVector2 x = ((fboundsxxyy.xy() * GSVector2::cxpr(2.0f)) / fsize.xx()) - GSVector2::cxpr(1.0f); + const GSVector2 y = GSVector2::cxpr(1.0f) - (GSVector2::cxpr(2.0f) * (fboundsxxyy.zw() / fsize.yy())); + return GSVector4::xyxy(x, y).xzyw(); +} + +void GPUBackend::DrawScreenQuad(const GSVector4i bounds, 
const GSVector2i rt_size, + const GSVector4 uv_bounds /* = GSVector4::cxpr(0.0f, 0.0f, 1.0f, 1.0f) */) +{ + const GSVector4 xy = GetScreenQuadClipSpaceCoordinates(bounds, rt_size); + + ScreenVertex* vertices; + u32 space; + u32 base_vertex; + g_gpu_device->MapVertexBuffer(sizeof(ScreenVertex), 4, reinterpret_cast<void**>(&vertices), &space, &base_vertex); + + vertices[0].Set(xy.xy(), uv_bounds.xy()); + vertices[1].Set(xy.zyzw().xy(), uv_bounds.zyzw().xy()); + vertices[2].Set(xy.xwzw().xy(), uv_bounds.xwzw().xy()); + vertices[3].Set(xy.zw(), uv_bounds.zw()); + + g_gpu_device->UnmapVertexBuffer(sizeof(ScreenVertex), 4); + g_gpu_device->Draw(4, base_vertex); +} + void GPUBackend::CalculateDrawRect(s32 window_width, s32 window_height, bool apply_rotation, bool apply_aspect_ratio, GSVector4i* display_rect, GSVector4i* draw_rect) const { diff --git a/src/core/gpu_backend.h b/src/core/gpu_backend.h index 230c8297f..53f79fb8b 100644 --- a/src/core/gpu_backend.h +++ b/src/core/gpu_backend.h @@ -118,6 +118,19 @@ protected: DEINTERLACE_BUFFER_COUNT = 4, }; + struct ScreenVertex + { + float x; + float y; + float u; + float v; + + ALWAYS_INLINE void Set(const GSVector2& xy, const GSVector2& uv) + { + GSVector4::store(this, GSVector4::xyxy(xy, uv)); + } + }; + virtual void ReadVRAM(u32 x, u32 y, u32 width, u32 height) = 0; virtual void FillVRAM(u32 x, u32 y, u32 width, u32 height, u32 color, bool interlaced_rendering, u8 interlaced_display_field) = 0; @@ -143,6 +156,12 @@ protected: virtual bool AllocateMemorySaveState(System::MemorySaveState& mss, Error* error) = 0; virtual void DoMemoryState(StateWrapper& sw, System::MemorySaveState& mss) = 0; + static void SetScreenQuadInputLayout(GPUPipeline::GraphicsConfig& config); + static GSVector4 GetScreenQuadClipSpaceCoordinates(const GSVector4i bounds, const GSVector2i rt_size); + + void DrawScreenQuad(const GSVector4i bounds, const GSVector2i rt_size, + const GSVector4 uv_bounds = GSVector4::cxpr(0.0f, 0.0f, 1.0f, 1.0f)); + /// Helper 
function for computing the draw rectangle in a larger window. void CalculateDrawRect(s32 window_width, s32 window_height, bool apply_rotation, bool apply_aspect_ratio, GSVector4i* display_rect, GSVector4i* draw_rect) const; diff --git a/src/core/gpu_hw.cpp b/src/core/gpu_hw.cpp index 9e600cb10..4f60c7f3b 100644 --- a/src/core/gpu_hw.cpp +++ b/src/core/gpu_hw.cpp @@ -1057,6 +1057,15 @@ bool GPU_HW::CompileCommonShaders(Error* error) if (!m_fullscreen_quad_vertex_shader) return false; + GL_OBJECT_NAME(m_fullscreen_quad_vertex_shader, "Fullscreen Quad Vertex Shader"); + + m_screen_quad_vertex_shader = g_gpu_device->CreateShader(GPUShaderStage::Vertex, shadergen.GetLanguage(), + shadergen.GenerateScreenVertexShader(), error); + if (!m_screen_quad_vertex_shader) + return false; + + GL_OBJECT_NAME(m_screen_quad_vertex_shader, "Screen Quad Vertex Shader"); + return true; } @@ -1538,12 +1547,11 @@ bool GPU_HW::CompilePipelines(Error* error) batch_shader_guard.Run(); // common state - plconfig.input_layout.vertex_attributes = {}; - plconfig.input_layout.vertex_stride = 0; + SetScreenQuadInputLayout(plconfig); + plconfig.vertex_shader = m_screen_quad_vertex_shader.get(); plconfig.layout = GPUPipeline::Layout::SingleTextureAndPushConstants; plconfig.per_sample_shading = false; plconfig.blend = GPUPipeline::BlendState::GetNoBlendingState(); - plconfig.vertex_shader = m_fullscreen_quad_vertex_shader.get(); plconfig.color_formats[1] = needs_rov_depth ? 
VRAM_DS_COLOR_FORMAT : GPUTexture::Format::Unknown; // VRAM fill @@ -1631,8 +1639,6 @@ bool GPU_HW::CompilePipelines(Error* error) } } - plconfig.layout = GPUPipeline::Layout::SingleTextureAndPushConstants; - // VRAM write replacement { std::unique_ptr<GPUShader> fs = g_gpu_device->CreateShader( @@ -1641,6 +1647,7 @@ bool GPU_HW::CompilePipelines(Error* error) return false; plconfig.fragment_shader = fs.get(); + plconfig.layout = GPUPipeline::Layout::SingleTextureAndPushConstants; plconfig.depth = GPUPipeline::DepthState::GetNoTestsState(); if (!(m_vram_write_replacement_pipeline = g_gpu_device->CreatePipeline(plconfig, error))) return false; @@ -1649,6 +1656,11 @@ bool GPU_HW::CompilePipelines(Error* error) return false; } + plconfig.vertex_shader = m_fullscreen_quad_vertex_shader.get(); + plconfig.primitive = GPUPipeline::Primitive::Triangles; + plconfig.input_layout.vertex_attributes = {}; + plconfig.input_layout.vertex_stride = 0; + // VRAM update depth if (m_write_mask_as_depth) { @@ -1954,6 +1966,7 @@ void GPU_HW::UpdateVRAMReadTexture(bool drawn, bool written) void GPU_HW::UpdateDepthBufferFromMaskBit() { + GL_SCOPE_FMT("UpdateDepthBufferFromMaskBit()"); DebugAssert(!m_pgxp_depth_buffer && m_vram_depth_texture && m_write_mask_as_depth); // Viewport should already be set full, only need to fudge the scissor. 
@@ -2997,9 +3010,10 @@ bool GPU_HW::BlitVRAMReplacementTexture(GPUTexture* tex, u32 dst_x, u32 dst_y, u g_gpu_device->SetTextureSampler(0, tex, g_gpu_device->GetLinearSampler()); g_gpu_device->SetPipeline(m_vram_write_replacement_pipeline.get()); - g_gpu_device->SetViewportAndScissor(dst_x, dst_y, width, height); - g_gpu_device->Draw(3, 0); + const GSVector4i rect(dst_x, dst_y, dst_x + width, dst_y + height); + g_gpu_device->SetScissor(rect); + DrawScreenQuad(rect, m_vram_texture->GetSizeVec()); RestoreDeviceContext(); return true; } @@ -3225,9 +3239,6 @@ void GPU_HW::FillVRAM(u32 x, u32 y, u32 width, u32 height, u32 color, bool inter const bool is_oversized = (((x + width) > VRAM_WIDTH || (y + height) > VRAM_HEIGHT)); g_gpu_device->SetPipeline(m_vram_fill_pipelines[BoolToUInt8(is_oversized)][BoolToUInt8(interlaced_rendering)].get()); - const GSVector4i scaled_bounds = bounds.mul32l(GSVector4i(m_resolution_scale)); - g_gpu_device->SetViewportAndScissor(scaled_bounds); - struct VRAMFillUBOData { u32 u_dst_x; @@ -3247,7 +3258,10 @@ void GPU_HW::FillVRAM(u32 x, u32 y, u32 width, u32 height, u32 color, bool inter GPUDevice::RGBA8ToFloat(m_true_color ? 
color : VRAMRGBA5551ToRGBA8888(VRAMRGBA8888ToRGBA5551(color))); uniforms.u_interlaced_displayed_field = active_line_lsb; g_gpu_device->PushUniformBuffer(&uniforms, sizeof(uniforms)); - g_gpu_device->Draw(3, 0); + + const GSVector4i scaled_bounds = bounds.mul32l(GSVector4i(m_resolution_scale)); + g_gpu_device->SetScissor(scaled_bounds); + DrawScreenQuad(scaled_bounds, m_vram_texture->GetSizeVec()); RestoreDeviceContext(); } @@ -3357,14 +3371,15 @@ void GPU_HW::UpdateVRAMOnGPU(u32 x, u32 y, u32 width, u32 height, const void* da { DeactivateROV(); - std::unique_ptr<GPUTexture> upload_texture; + GPUDevice::AutoRecycleTexture upload_texture; u32 map_index; if (!g_gpu_device->GetFeatures().supports_texture_buffers) { map_index = 0; - upload_texture = g_gpu_device->FetchTexture(width, height, 1, 1, 1, GPUTexture::Type::Texture, - GPUTexture::Format::R16U, GPUTexture::Flags::None, data, data_pitch); + upload_texture = + g_gpu_device->FetchAutoRecycleTexture(width, height, 1, 1, 1, GPUTexture::Type::Texture, GPUTexture::Format::R16U, + GPUTexture::Flags::None, data, data_pitch); if (!upload_texture) { ERROR_LOG("Failed to get {}x{} upload texture. 
Things are gonna break.", width, height); @@ -3406,21 +3421,17 @@ void GPU_HW::UpdateVRAMOnGPU(u32 x, u32 y, u32 width, u32 height, const void* da GetCurrentNormalizedVertexDepth()}; // the viewport should already be set to the full vram, so just adjust the scissor - const GSVector4i scaled_bounds = bounds.mul32l(GSVector4i(m_resolution_scale)); - g_gpu_device->SetScissor(scaled_bounds.left, scaled_bounds.top, scaled_bounds.width(), scaled_bounds.height()); g_gpu_device->SetPipeline(m_vram_write_pipelines[BoolToUInt8(check_mask && m_write_mask_as_depth)].get()); g_gpu_device->PushUniformBuffer(&uniforms, sizeof(uniforms)); + if (upload_texture) - { g_gpu_device->SetTextureSampler(0, upload_texture.get(), g_gpu_device->GetNearestSampler()); - g_gpu_device->Draw(3, 0); - g_gpu_device->RecycleTexture(std::move(upload_texture)); - } else - { g_gpu_device->SetTextureBuffer(0, m_vram_upload_buffer.get()); - g_gpu_device->Draw(3, 0); - } + + const GSVector4i scaled_bounds = bounds.mul32l(GSVector4i(m_resolution_scale)); + g_gpu_device->SetScissor(scaled_bounds); + DrawScreenQuad(scaled_bounds, m_vram_texture->GetSizeVec()); RestoreDeviceContext(); } @@ -3492,12 +3503,13 @@ void GPU_HW::CopyVRAM(u32 src_x, u32 src_y, u32 dst_x, u32 dst_y, u32 width, u32 GetCurrentNormalizedVertexDepth()}; // VRAM read texture should already be bound. 
- const GSVector4i dst_bounds_scaled = dst_bounds.mul32l(GSVector4i(m_resolution_scale)); - g_gpu_device->SetViewportAndScissor(dst_bounds_scaled); g_gpu_device->SetPipeline(m_vram_copy_pipelines[BoolToUInt8(check_mask && m_write_mask_as_depth)].get()); g_gpu_device->SetTextureSampler(0, m_vram_read_texture.get(), g_gpu_device->GetNearestSampler()); g_gpu_device->PushUniformBuffer(&uniforms, sizeof(uniforms)); - g_gpu_device->Draw(3, 0); + + const GSVector4i dst_bounds_scaled = dst_bounds.mul32l(GSVector4i(m_resolution_scale)); + g_gpu_device->SetScissor(dst_bounds_scaled); + DrawScreenQuad(dst_bounds_scaled, m_vram_texture->GetSizeVec()); RestoreDeviceContext(); if (check_mask && !m_pgxp_depth_buffer) diff --git a/src/core/gpu_hw.h b/src/core/gpu_hw.h index 6f8827f9c..1cfbb0e62 100644 --- a/src/core/gpu_hw.h +++ b/src/core/gpu_hw.h @@ -223,6 +223,7 @@ private: bool ShouldCheckForTexPageOverlap() const; bool IsFlushed() const; + void EnsureVertexBufferSpace(u32 required_vertices, u32 required_indices); void EnsureVertexBufferSpaceForCommand(const GPUBackendDrawCommand* cmd); void PrepareDraw(const GPUBackendDrawCommand* cmd); @@ -380,4 +381,5 @@ private: // common shaders std::unique_ptr<GPUShader> m_fullscreen_quad_vertex_shader; + std::unique_ptr<GPUShader> m_screen_quad_vertex_shader; }; diff --git a/src/core/gpu_hw_shadergen.cpp b/src/core/gpu_hw_shadergen.cpp index 83fe2dd10..dc0a95735 100644 --- a/src/core/gpu_hw_shadergen.cpp +++ b/src/core/gpu_hw_shadergen.cpp @@ -50,6 +50,27 @@ void GPU_HW_ShaderGen::WriteBatchUniformBuffer(std::stringstream& ss) const false); } +std::string GPU_HW_ShaderGen::GenerateScreenVertexShader() const +{ + std::stringstream ss; + WriteHeader(ss); + DeclareVertexEntryPoint(ss, {"float2 a_pos", "float2 a_tex0"}, 0, 1, {}, false, "", false, false, false); + ss << R"( +{ + // Depth set to 1 for PGXP depth buffer. + v_pos = float4(a_pos, 1.0f, 1.0f); + v_tex0 = a_tex0; + + // NDC space Y flip in Vulkan. 
+ #if API_OPENGL || API_OPENGL_ES || API_VULKAN + v_pos.y = -v_pos.y; + #endif +} +)"; + + return ss.str(); +} + std::string GPU_HW_ShaderGen::GenerateBatchVertexShader(bool upscaled, bool msaa, bool per_sample_shading, bool textured, bool palette, bool page_texture, bool uv_limits, bool force_round_texcoords, bool pgxp_depth, diff --git a/src/core/gpu_hw_shadergen.h b/src/core/gpu_hw_shadergen.h index e68323564..05053a6b8 100644 --- a/src/core/gpu_hw_shadergen.h +++ b/src/core/gpu_hw_shadergen.h @@ -13,6 +13,8 @@ public: GPU_HW_ShaderGen(RenderAPI render_api, bool supports_dual_source_blend, bool supports_framebuffer_fetch); ~GPU_HW_ShaderGen(); + std::string GenerateScreenVertexShader() const; + std::string GenerateBatchVertexShader(bool upscaled, bool msaa, bool per_sample_shading, bool textured, bool palette, bool page_texture, bool uv_limits, bool force_round_texcoords, bool pgxp_depth, bool disable_color_perspective) const;