From 9446587e8fa7751b9977e7cdffe3a8f3b6d38d7c Mon Sep 17 00:00:00 2001 From: Connor McLaughlin Date: Sun, 3 May 2020 17:11:28 +1000 Subject: [PATCH] GPU/HW: Mask bit handling in hardware renderers Fixes: - Menu effect in Ghost in the Shell - Incorrect text colours in menu of Dragon Quest VII - Fade effect in TwinBee RPG - Fog in Silent Hill - Water in Duke Nukem - Land of the Babes - Shadows in Ultraman - Fighting Evolution and probably others. --- src/common/d3d11/texture.cpp | 14 ++-- src/common/d3d11/texture.h | 2 +- src/core/gpu_hw.cpp | 114 +++++++++++++++++++++++++-- src/core/gpu_hw.h | 43 +++++++++- src/core/gpu_hw_d3d11.cpp | 115 +++++++++++++++++++++------ src/core/gpu_hw_d3d11.h | 7 ++ src/core/gpu_hw_opengl.cpp | 143 +++++++++++++++++++++++----------- src/core/gpu_hw_opengl.h | 4 + src/core/gpu_hw_shadergen.cpp | 131 +++++++++++++++++++++---------- src/core/gpu_hw_shadergen.h | 3 +- 10 files changed, 448 insertions(+), 128 deletions(-) diff --git a/src/common/d3d11/texture.cpp b/src/common/d3d11/texture.cpp index ced98eda3..1662c822c 100644 --- a/src/common/d3d11/texture.cpp +++ b/src/common/d3d11/texture.cpp @@ -27,14 +27,10 @@ D3D11_TEXTURE2D_DESC Texture::GetDesc() const return desc; } -bool Texture::Create(ID3D11Device* device, u32 width, u32 height, DXGI_FORMAT format, bool shader_resource, - bool render_target, const void* initial_data, u32 initial_data_stride) +bool Texture::Create(ID3D11Device* device, u32 width, u32 height, DXGI_FORMAT format, u32 bind_flags, + const void* initial_data, u32 initial_data_stride) { - CD3D11_TEXTURE2D_DESC desc(format, width, height, 1, 1, 0, D3D11_USAGE_DEFAULT, 0, 1, 0, 0); - if (shader_resource) - desc.BindFlags |= D3D11_BIND_SHADER_RESOURCE; - if (render_target) - desc.BindFlags |= D3D11_BIND_RENDER_TARGET; + CD3D11_TEXTURE2D_DESC desc(format, width, height, 1, 1, bind_flags, D3D11_USAGE_DEFAULT, 0, 1, 0, 0); D3D11_SUBRESOURCE_DATA srd; srd.pSysMem = initial_data; @@ -50,7 +46,7 @@ bool Texture::Create(ID3D11Device* device, u32 width, u32 height, DXGI_FORMAT fo } ComPtr srv; - if (shader_resource) + if (bind_flags & D3D11_BIND_SHADER_RESOURCE) { const CD3D11_SHADER_RESOURCE_VIEW_DESC srv_desc(D3D11_SRV_DIMENSION_TEXTURE2D, desc.Format, 0, desc.MipLevels, 0, desc.ArraySize); @@ -63,7 +59,7 @@ bool Texture::Create(ID3D11Device* device, u32 width, u32 height, DXGI_FORMAT fo } ComPtr rtv; - if (render_target) + if (bind_flags & D3D11_BIND_RENDER_TARGET) { const CD3D11_RENDER_TARGET_VIEW_DESC rtv_desc(D3D11_RTV_DIMENSION_TEXTURE2D, desc.Format, 0, 0, desc.ArraySize); const HRESULT hr = device->CreateRenderTargetView(texture.Get(), &rtv_desc, rtv.GetAddressOf()); diff --git a/src/common/d3d11/texture.h b/src/common/d3d11/texture.h index e39b7bd3c..357d42e2b 100644 --- a/src/common/d3d11/texture.h +++ b/src/common/d3d11/texture.h @@ -31,7 +31,7 @@ public: ALWAYS_INLINE operator ID3D11RenderTargetView*() const { return m_rtv.Get(); } ALWAYS_INLINE operator bool() const { return static_cast(m_texture); } - bool Create(ID3D11Device* device, u32 width, u32 height, DXGI_FORMAT format, bool shader_resource, bool render_target, + bool Create(ID3D11Device* device, u32 width, u32 height, DXGI_FORMAT format, u32 bind_flags, const void* initial_data = nullptr, u32 initial_data_stride = 0); bool Adopt(ID3D11Device* device, ComPtr texture); diff --git a/src/core/gpu_hw.cpp b/src/core/gpu_hw.cpp index 12bc83e5d..ef3a90ccc 100644 --- a/src/core/gpu_hw.cpp +++ b/src/core/gpu_hw.cpp @@ -28,6 +28,7 @@ bool GPU_HW::Initialize(HostDisplay* host_display, System* system, DMA* dma, Int const Settings& settings = m_system->GetSettings(); m_resolution_scale = settings.gpu_resolution_scale; + m_render_api = host_display->GetRenderAPI(); m_true_color = settings.gpu_true_color; m_scaled_dithering = settings.gpu_scaled_dithering; m_texture_filtering = settings.gpu_texture_filtering; @@ -46,10 +47,15 @@ void GPU_HW::Reset() { GPU::Reset(); + m_batch_current_vertex_ptr = m_batch_start_vertex_ptr; + m_vram_shadow.fill(0); m_batch = {}; m_batch_ubo_data = {}; + m_batch_current_vertex_depth_id = 1; + m_batch_next_vertex_depth_id = 2; + SetBatchUBOVertexDepthID(m_batch_current_vertex_depth_id); m_batch_ubo_dirty = true; SetFullVRAMDirtyRectangle(); @@ -62,7 +68,11 @@ bool GPU_HW::DoState(StateWrapper& sw) // invalidate the whole VRAM read texture when loading state if (sw.IsReading()) + { + m_batch_current_vertex_ptr = m_batch_start_vertex_ptr; SetFullVRAMDirtyRectangle(); + ResetBatchVertexDepthID(); + } return true; } @@ -177,12 +187,11 @@ void GPU_HW::LoadVertices() const RenderCommand rc{m_render_command.bits}; const u32 texpage = ZeroExtend32(m_draw_mode.mode_reg.bits) | (ZeroExtend32(m_draw_mode.palette_reg) << 16); - // TODO: Move this to the GPU.. switch (rc.primitive) { case Primitive::Polygon: { - EnsureVertexBufferSpace(rc.quad_polygon ? 6 : 3); + DebugAssert(GetBatchVertexSpace() >= (rc.quad_polygon ? 6u : 3u)); const u32 first_color = rc.color_for_first_vertex; const bool shaded = rc.shading_enable; @@ -308,9 +317,7 @@ void GPU_HW::LoadVertices() } // we can split the rectangle up into potentially 8 quads - const u32 required_vertices = 6 * (((rectangle_width + (TEXTURE_PAGE_WIDTH - 1)) / TEXTURE_PAGE_WIDTH) + 1u) * - (((rectangle_height + (TEXTURE_PAGE_HEIGHT - 1)) / TEXTURE_PAGE_HEIGHT) + 1u); - EnsureVertexBufferSpace(required_vertices); + DebugAssert(GetBatchVertexSpace() >= MAX_VERTICES_FOR_RECTANGLE); // Split the rectangle into multiple quads if it's greater than 256x256, as the texture page should repeat. u16 tex_top = orig_tex_top; @@ -361,7 +368,7 @@ void GPU_HW::LoadVertices() { if (!rc.polyline) { - EnsureVertexBufferSpace(2); + DebugAssert(GetBatchVertexSpace() >= 2); u32 color0, color1; VertexPosition pos0, pos1; @@ -410,7 +417,7 @@ void GPU_HW::LoadVertices() { // Multiply by two because we don't use line strips. const u32 num_vertices = GetPolyLineVertexCount(); - EnsureVertexBufferSpace(num_vertices * 2); + DebugAssert(GetBatchVertexSpace() >= (num_vertices * 2)); const u32 first_color = rc.color_for_first_vertex; const bool shaded = rc.shading_enable; @@ -534,6 +541,73 @@ void GPU_HW::EnsureVertexBufferSpace(u32 required_vertices) MapBatchVertexPointer(required_vertices); } +void GPU_HW::EnsureVertexBufferSpaceForCurrentCommand() +{ + u32 required_vertices; + switch (m_render_command.primitive) + { + case Primitive::Polygon: + required_vertices = m_render_command.quad_polygon ? 6 : 3; + break; + case Primitive::Rectangle: + required_vertices = MAX_VERTICES_FOR_RECTANGLE; + break; + case Primitive::Line: + default: + required_vertices = m_render_command.polyline ? (GetPolyLineVertexCount() * 2u) : 2u; + break; + } + + // can we fit these vertices in the current depth buffer range? + if (BatchVertexDepthIDNeedsUpdate() && + (m_batch_next_vertex_depth_id + GetBatchVertexCount() + required_vertices) > MAX_BATCH_VERTEX_COUNTER_IDS) + { + // implies FlushRender() + ResetBatchVertexDepthID(); + } + else if (m_batch_current_vertex_ptr) + { + if (GetBatchVertexSpace() >= required_vertices) + return; + + FlushRender(); + } + + MapBatchVertexPointer(required_vertices); +} + +void GPU_HW::ResetBatchVertexDepthID() +{ + Log_PerfPrint("Resetting batch vertex depth ID"); + FlushRender(); + UpdateDepthBufferFromMaskBit(); + + m_batch_current_vertex_depth_id = 1; + m_batch_next_vertex_depth_id = 2; + SetBatchUBOVertexDepthID(m_batch_current_vertex_depth_id); +} + +void GPU_HW::IncrementBatchVertexID(u32 count) +{ + DebugAssert((m_batch_next_vertex_depth_id + count) <= MAX_BATCH_VERTEX_COUNTER_IDS); + m_batch_next_vertex_depth_id += count; +} + +void GPU_HW::SetBatchUBOVertexDepthID(u32 value) +{ + u32 ubo_value; + + // In OpenGL, gl_VertexID is inclusive of the base vertex, whereas SV_VertexID in D3D isn't. + // We rely on unsigned overflow to compute the correct value based on the base vertex. + if (m_render_api != HostDisplay::RenderAPI::D3D11) + ubo_value = m_batch_base_vertex - value; + else + ubo_value = value; + + m_batch_ubo_dirty |= (m_batch_ubo_data.u_vertex_depth_id != ubo_value); + m_batch_ubo_data.u_vertex_depth_id = ubo_value; +} + void GPU_HW::FillVRAM(u32 x, u32 y, u32 width, u32 height, u32 color) { IncludeVRAMDityRectangle( @@ -544,12 +618,26 @@ void GPU_HW::UpdateVRAM(u32 x, u32 y, u32 width, u32 height, const void* data) { DebugAssert((x + width) <= VRAM_WIDTH && (y + height) <= VRAM_HEIGHT); IncludeVRAMDityRectangle(Common::Rectangle::FromExtents(x, y, width, height)); + + if (m_GPUSTAT.check_mask_before_draw) + { + // set new vertex counter since we want this to take into consideration previous masked pixels + m_batch_current_vertex_depth_id = m_batch_next_vertex_depth_id++; + SetBatchUBOVertexDepthID(m_batch_current_vertex_depth_id); + } } void GPU_HW::CopyVRAM(u32 src_x, u32 src_y, u32 dst_x, u32 dst_y, u32 width, u32 height) { IncludeVRAMDityRectangle( Common::Rectangle::FromExtents(dst_x, dst_y, width, height).Clamped(0, 0, VRAM_WIDTH, VRAM_HEIGHT)); + + if (m_GPUSTAT.check_mask_before_draw) + { + // set new vertex counter since we want this to take into consideration previous masked pixels + m_batch_current_vertex_depth_id = m_batch_next_vertex_depth_id++; + SetBatchUBOVertexDepthID(m_batch_current_vertex_depth_id); + } } void GPU_HW::DispatchRenderCommand() @@ -600,6 +688,8 @@ void GPU_HW::DispatchRenderCommand() FlushRender(); } + EnsureVertexBufferSpaceForCurrentCommand(); + // transparency mode change if (m_batch.transparency_mode != transparency_mode && transparency_mode != TransparencyMode::Disabled) { @@ -614,7 +704,8 @@ void GPU_HW::DispatchRenderCommand() { m_batch.check_mask_before_draw = m_GPUSTAT.check_mask_before_draw; m_batch.set_mask_while_drawing = m_GPUSTAT.set_mask_while_drawing; - m_batch_ubo_data.u_set_mask_while_drawing = BoolToUInt32(m_GPUSTAT.set_mask_while_drawing); + m_batch_ubo_data.u_check_mask_before_draw = BoolToUInt32(m_batch.check_mask_before_draw); + m_batch_ubo_data.u_set_mask_while_drawing = BoolToUInt32(m_batch.set_mask_while_drawing); m_batch_ubo_dirty = true; } @@ -657,6 +748,10 @@ void GPU_HW::FlushRender() if (vertex_count == 0) return; + const bool update_depth_id = BatchVertexDepthIDNeedsUpdate(); + if (update_depth_id) + SetBatchUBOVertexDepthID(m_batch_next_vertex_depth_id); + if (m_drawing_area_changed) { m_drawing_area_changed = false; @@ -680,6 +775,9 @@ void GPU_HW::FlushRender() m_renderer_stats.num_batches++; DrawBatchVertices(m_batch.GetRenderMode(), m_batch_base_vertex, vertex_count); } + + if (update_depth_id) + IncrementBatchVertexID(vertex_count); } void GPU_HW::DrawRendererStats(bool is_idle_frame) diff --git a/src/core/gpu_hw.h b/src/core/gpu_hw.h index b69639b0f..62fa1bb1b 100644 --- a/src/core/gpu_hw.h +++ b/src/core/gpu_hw.h @@ -1,6 +1,7 @@ #pragma once #include "common/heap_array.h" #include "gpu.h" +#include "host_display.h" #include #include #include @@ -42,7 +43,10 @@ protected: { VRAM_UPDATE_TEXTURE_BUFFER_SIZE = VRAM_WIDTH * VRAM_HEIGHT * sizeof(u32), VERTEX_BUFFER_SIZE = 1 * 1024 * 1024, - UNIFORM_BUFFER_SIZE = 512 * 1024 + UNIFORM_BUFFER_SIZE = 512 * 1024, + MAX_BATCH_VERTEX_COUNTER_IDS = 65536 - 2, + MAX_VERTICES_FOR_RECTANGLE = 6 * (((MAX_PRIMITIVE_WIDTH + (TEXTURE_PAGE_WIDTH - 1)) / TEXTURE_PAGE_WIDTH) + 1u) * + (((MAX_PRIMITIVE_HEIGHT + (TEXTURE_PAGE_HEIGHT - 1)) / TEXTURE_PAGE_HEIGHT) + 1u) }; struct BatchVertex @@ -102,8 +106,19 @@ protected: u32 u_texture_window_offset[2]; float u_src_alpha_factor; float u_dst_alpha_factor; - u32 u_set_mask_while_drawing; u32 u_interlaced_displayed_field; + u32 u_vertex_depth_id; + u32 u_check_mask_before_draw; + u32 u_set_mask_while_drawing; + }; + + struct VRAMWriteUBOData + { + u32 u_base_coords[2]; + u32 u_size[2]; + u32 u_buffer_base_offset; + u32 u_mask_or_bits; + float u_depth_value; }; struct VRAMCopyUBOData @@ -115,6 +130,7 @@ protected: u32 u_width; u32 u_height; u32 u_set_mask_bit; + float u_depth_value; }; struct RendererStats @@ -133,6 +149,7 @@ protected: } virtual void UpdateVRAMReadTexture() = 0; + virtual void UpdateDepthBufferFromMaskBit() = 0; virtual void SetScissorFromDrawingArea() = 0; virtual void MapBatchVertexPointer(u32 required_vertices) = 0; virtual void UnmapBatchVertexPointer(u32 used_vertices) = 0; @@ -147,11 +164,28 @@ protected: void ClearVRAMDirtyRectangle() { m_vram_dirty_rect.SetInvalid(); } void IncludeVRAMDityRectangle(const Common::Rectangle& rect); + bool IsFlushed() const { return m_batch_current_vertex_ptr == m_batch_start_vertex_ptr; } + u32 GetBatchVertexSpace() const { return static_cast(m_batch_end_vertex_ptr - m_batch_current_vertex_ptr); } u32 GetBatchVertexCount() const { return static_cast(m_batch_current_vertex_ptr - m_batch_start_vertex_ptr); } void EnsureVertexBufferSpace(u32 required_vertices); + void EnsureVertexBufferSpaceForCurrentCommand(); + void ResetBatchVertexDepthID(); + void IncrementBatchVertexID(u32 count); + void SetBatchUBOVertexDepthID(u32 value); - bool IsFlushed() const { return m_batch_current_vertex_ptr == m_batch_start_vertex_ptr; } + /// Returns the value to be written to the depth buffer for the current operation for mask bit emulation. + ALWAYS_INLINE float GetCurrentNormalizedBatchVertexDepthID() const + { + return 1.0f - (static_cast(m_batch_next_vertex_depth_id) / 65535.0f); + } + + /// Returns true if the batch vertex depth ID needs to be updated. + ALWAYS_INLINE bool BatchVertexDepthIDNeedsUpdate() const + { + // because GL uses base vertex we're incrementing the depth id every draw whether we like it or not + return m_batch.check_mask_before_draw || m_render_api != HostDisplay::RenderAPI::D3D11; + } void FillVRAM(u32 x, u32 y, u32 width, u32 height, u32 color) override; void UpdateVRAM(u32 x, u32 y, u32 width, u32 height, const void* data) override; @@ -182,9 +216,12 @@ protected: BatchVertex* m_batch_end_vertex_ptr = nullptr; BatchVertex* m_batch_current_vertex_ptr = nullptr; u32 m_batch_base_vertex = 0; + u32 m_batch_current_vertex_depth_id = 0; + u32 m_batch_next_vertex_depth_id = 0; u32 m_resolution_scale = 1; u32 m_max_resolution_scale = 1; + HostDisplay::RenderAPI m_render_api = HostDisplay::RenderAPI::None; bool m_true_color = true; bool m_scaled_dithering = false; bool m_texture_filtering = false; diff --git a/src/core/gpu_hw_d3d11.cpp b/src/core/gpu_hw_d3d11.cpp index ee65bc35c..e4cd3f2c1 100644 --- a/src/core/gpu_hw_d3d11.cpp +++ b/src/core/gpu_hw_d3d11.cpp @@ -111,8 +111,7 @@ void GPU_HW_D3D11::RestoreGraphicsAPIState() m_context->IASetVertexBuffers(0, 1, m_vertex_stream_buffer.GetD3DBufferArray(), &stride, &offset); m_context->IASetInputLayout(m_batch_input_layout.Get()); m_context->PSSetShaderResources(0, 1, m_vram_read_texture.GetD3DSRVArray()); - m_context->OMSetDepthStencilState(m_depth_disabled_state.Get(), 0); - m_context->OMSetRenderTargets(1, m_vram_texture.GetD3DRTVArray(), nullptr); + m_context->OMSetRenderTargets(1, m_vram_texture.GetD3DRTVArray(), m_vram_depth_view.Get()); m_context->RSSetState(m_cull_none_rasterizer_state.Get()); SetViewport(0, 0, m_vram_texture.GetWidth(), m_vram_texture.GetHeight()); SetScissorFromDrawingArea(); @@ -171,16 +170,29 @@ bool GPU_HW_D3D11::CreateFramebuffer() const u32 texture_width = VRAM_WIDTH * m_resolution_scale; const u32 texture_height = VRAM_HEIGHT * m_resolution_scale; const DXGI_FORMAT texture_format = DXGI_FORMAT_R8G8B8A8_UNORM; + const DXGI_FORMAT depth_format = DXGI_FORMAT_D16_UNORM; - if (!m_vram_texture.Create(m_device.Get(), texture_width, texture_height, texture_format, true, true) || - !m_vram_read_texture.Create(m_device.Get(), texture_width, texture_height, texture_format, true, false) || - !m_display_texture.Create(m_device.Get(), texture_width, texture_height, texture_format, true, true) || - !m_vram_encoding_texture.Create(m_device.Get(), VRAM_WIDTH, VRAM_HEIGHT, texture_format, true, true) || + if (!m_vram_texture.Create(m_device.Get(), texture_width, texture_height, texture_format, + D3D11_BIND_SHADER_RESOURCE | D3D11_BIND_RENDER_TARGET) || + !m_vram_depth_texture.Create(m_device.Get(), texture_width, texture_height, depth_format, + D3D11_BIND_DEPTH_STENCIL) || + !m_vram_read_texture.Create(m_device.Get(), texture_width, texture_height, texture_format, + D3D11_BIND_SHADER_RESOURCE) || + !m_display_texture.Create(m_device.Get(), texture_width, texture_height, texture_format, + D3D11_BIND_SHADER_RESOURCE | D3D11_BIND_RENDER_TARGET) || + !m_vram_encoding_texture.Create(m_device.Get(), VRAM_WIDTH, VRAM_HEIGHT, texture_format, + D3D11_BIND_SHADER_RESOURCE | D3D11_BIND_RENDER_TARGET) || !m_vram_readback_texture.Create(m_device.Get(), VRAM_WIDTH, VRAM_HEIGHT, texture_format, false)) { return false; } + const CD3D11_DEPTH_STENCIL_VIEW_DESC depth_view_desc(D3D11_DSV_DIMENSION_TEXTURE2D, depth_format); + HRESULT hr = + m_device->CreateDepthStencilView(m_vram_depth_texture, &depth_view_desc, m_vram_depth_view.GetAddressOf()); + if (FAILED(hr)) + return false; + // do we need to restore the framebuffer after a size change? if (old_vram_texture) { @@ -192,10 +204,12 @@ bool GPU_HW_D3D11::CreateFramebuffer() BlitTexture(m_vram_texture.GetD3DRTV(), 0, 0, m_vram_texture.GetWidth(), m_vram_texture.GetHeight(), old_vram_texture.GetD3DSRV(), 0, 0, old_vram_texture.GetWidth(), old_vram_texture.GetHeight(), old_vram_texture.GetWidth(), old_vram_texture.GetHeight(), linear_filter); + UpdateDepthBufferFromMaskBit(); } m_context->OMSetRenderTargets(1, m_vram_texture.GetD3DRTVArray(), nullptr); SetFullVRAMDirtyRectangle(); + RestoreGraphicsAPIState(); return true; } @@ -203,12 +217,16 @@ void GPU_HW_D3D11::ClearFramebuffer() { static constexpr std::array color = {}; m_context->ClearRenderTargetView(m_vram_texture.GetD3DRTV(), color.data()); + m_context->ClearDepthStencilView(m_vram_depth_view.Get(), D3D11_CLEAR_DEPTH, 0.0f, 0); + m_context->ClearRenderTargetView(m_display_texture, color.data()); SetFullVRAMDirtyRectangle(); } void GPU_HW_D3D11::DestroyFramebuffer() { m_vram_read_texture.Destroy(); + m_vram_depth_view.Reset(); + m_vram_depth_texture.Destroy(); m_vram_texture.Destroy(); m_vram_encoding_texture.Destroy(); m_display_texture.Destroy(); @@ -289,11 +307,28 @@ bool GPU_HW_D3D11::CreateStateObjects() if (FAILED(hr)) return false; + ds_desc.DepthEnable = TRUE; + ds_desc.DepthWriteMask = D3D11_DEPTH_WRITE_MASK_ALL; + ds_desc.DepthFunc = D3D11_COMPARISON_ALWAYS; + hr = m_device->CreateDepthStencilState(&ds_desc, m_depth_test_always_state.ReleaseAndGetAddressOf()); + if (FAILED(hr)) + return false; + + ds_desc.DepthFunc = D3D11_COMPARISON_GREATER_EQUAL; + hr = m_device->CreateDepthStencilState(&ds_desc, m_depth_test_less_state.ReleaseAndGetAddressOf()); + if (FAILED(hr)) + return false; + CD3D11_BLEND_DESC bl_desc = CD3D11_BLEND_DESC(CD3D11_DEFAULT()); hr = m_device->CreateBlendState(&bl_desc, m_blend_disabled_state.ReleaseAndGetAddressOf()); if (FAILED(hr)) return false; + bl_desc.RenderTarget[0].RenderTargetWriteMask = 0; + hr = m_device->CreateBlendState(&bl_desc, m_blend_no_color_writes_state.ReleaseAndGetAddressOf()); + if (FAILED(hr)) + return false; + CD3D11_SAMPLER_DESC sampler_desc = CD3D11_SAMPLER_DESC(CD3D11_DEFAULT()); sampler_desc.Filter = D3D11_FILTER_MIN_MAG_MIP_POINT; hr = m_device->CreateSamplerState(&sampler_desc, m_point_sampler_state.ReleaseAndGetAddressOf()); @@ -307,11 +342,8 @@ bool GPU_HW_D3D11::CreateStateObjects() for (u8 transparency_mode = 0; transparency_mode < 5; transparency_mode++) { - if (transparency_mode == static_cast(TransparencyMode::Disabled) && !m_texture_filtering) - { - bl_desc = CD3D11_BLEND_DESC(CD3D11_DEFAULT()); - } - else + bl_desc = CD3D11_BLEND_DESC(CD3D11_DEFAULT()); + if (transparency_mode != static_cast(TransparencyMode::Disabled) || m_texture_filtering) { bl_desc.RenderTarget[0].BlendEnable = TRUE; bl_desc.RenderTarget[0].SrcBlend = D3D11_BLEND_ONE; @@ -409,6 +441,11 @@ bool GPU_HW_D3D11::CompileShaders() if (!m_vram_copy_pixel_shader) return false; + m_vram_update_depth_pixel_shader = + m_shader_cache.GetPixelShader(m_device.Get(), shadergen.GenerateVRAMUpdateDepthFragmentShader()); + if (!m_vram_update_depth_pixel_shader) + return false; + for (u8 depth_24bit = 0; depth_24bit < 2; depth_24bit++) { for (u8 interlacing = 0; interlacing < 2; interlacing++) @@ -467,6 +504,7 @@ void GPU_HW_D3D11::BlitTexture(ID3D11RenderTargetView* dst, u32 dst_x, u32 dst_y static_cast(src_height) / static_cast(src_texture_height)}; m_context->OMSetRenderTargets(1, &dst, nullptr); + m_context->OMSetDepthStencilState(m_depth_disabled_state.Get(), 0); m_context->PSSetShaderResources(0, 1, &src); m_context->PSSetSamplers( 0, 1, linear_filter ? m_linear_sampler_state.GetAddressOf() : m_point_sampler_state.GetAddressOf()); @@ -516,6 +554,8 @@ void GPU_HW_D3D11::DrawBatchVertices(BatchRenderMode render_mode, u32 base_verte const TransparencyMode transparency_mode = (render_mode == BatchRenderMode::OnlyOpaque) ? TransparencyMode::Disabled : m_batch.transparency_mode; m_context->OMSetBlendState(m_batch_blend_states[static_cast(transparency_mode)].Get(), nullptr, 0xFFFFFFFFu); + m_context->OMSetDepthStencilState( + m_batch.check_mask_before_draw ? m_depth_test_less_state.Get() : m_depth_test_always_state.Get(), 0); m_context->Draw(num_vertices, base_vertex); } @@ -567,6 +607,7 @@ void GPU_HW_D3D11::UpdateDisplay() else { m_context->OMSetRenderTargets(1, m_display_texture.GetD3DRTVArray(), nullptr); + m_context->OMSetDepthStencilState(m_depth_disabled_state.Get(), 0); m_context->PSSetShaderResources(0, 1, m_vram_texture.GetD3DSRVArray()); const u32 reinterpret_field_offset = GetInterlacedField(); @@ -604,6 +645,7 @@ void GPU_HW_D3D11::ReadVRAM(u32 x, u32 y, u32 width, u32 height) // Encode the 24-bit texture as 16-bit. const u32 uniforms[4] = {copy_rect.left, copy_rect.top, copy_rect.GetWidth(), copy_rect.GetHeight()}; m_context->OMSetRenderTargets(1, m_vram_encoding_texture.GetD3DRTVArray(), nullptr); + m_context->OMSetDepthStencilState(m_depth_disabled_state.Get(), 0); m_context->PSSetShaderResources(0, 1, m_vram_texture.GetD3DSRVArray()); SetViewportAndScissor(0, 0, encoded_width, encoded_height); DrawUtilityShader(m_vram_read_pixel_shader.Get(), uniforms, sizeof(uniforms)); @@ -654,6 +696,8 @@ void GPU_HW_D3D11::FillVRAM(u32 x, u32 y, u32 width, u32 height, u32 color) RGBA8ToFloat(color); uniforms.u_interlaced_displayed_field = GetInterlacedField(); + m_context->OMSetDepthStencilState(m_depth_test_always_state.Get(), 0); + SetViewportAndScissor(x * m_resolution_scale, y * m_resolution_scale, width * m_resolution_scale, height * m_resolution_scale); DrawUtilityShader(IsInterlacedRenderingEnabled() ? m_vram_interlaced_fill_pixel_shader.Get() : @@ -682,13 +726,21 @@ void GPU_HW_D3D11::UpdateVRAM(u32 x, u32 y, u32 width, u32 height, const void* d std::memcpy(map_result.pointer, data, num_pixels * sizeof(u16)); m_texture_stream_buffer.Unmap(m_context.Get(), num_pixels * sizeof(u16)); - const u32 uniforms[5] = {x, y, width, height, map_result.index_aligned}; + const VRAMWriteUBOData uniforms = {x, + y, + width, + height, + map_result.index_aligned, + m_GPUSTAT.set_mask_while_drawing ? 0xFFu : 0x00, + GetCurrentNormalizedBatchVertexDepthID()}; + m_context->OMSetDepthStencilState( + m_GPUSTAT.check_mask_before_draw ? m_depth_test_less_state.Get() : m_depth_test_always_state.Get(), 0); m_context->PSSetShaderResources(0, 1, m_texture_stream_buffer_srv_r16ui.GetAddressOf()); // the viewport should already be set to the full vram, so just adjust the scissor SetScissor(x * m_resolution_scale, y * m_resolution_scale, width * m_resolution_scale, height * m_resolution_scale); - DrawUtilityShader(m_vram_write_pixel_shader.Get(), uniforms, sizeof(uniforms)); + DrawUtilityShader(m_vram_write_pixel_shader.Get(), &uniforms, sizeof(uniforms)); RestoreGraphicsAPIState(); } @@ -703,19 +755,20 @@ void GPU_HW_D3D11::CopyVRAM(u32 src_x, u32 src_y, u32 dst_x, u32 dst_y, u32 widt UpdateVRAMReadTexture(); IncludeVRAMDityRectangle(dst_bounds); - const VRAMCopyUBOData uniforms = { - src_x * m_resolution_scale, - src_y * m_resolution_scale, - dst_x * m_resolution_scale, - dst_y * m_resolution_scale, - width * m_resolution_scale, - height * m_resolution_scale, - m_GPUSTAT.set_mask_while_drawing ? 1u : 0u, - }; + const VRAMCopyUBOData uniforms = {src_x * m_resolution_scale, + src_y * m_resolution_scale, + dst_x * m_resolution_scale, + dst_y * m_resolution_scale, + width * m_resolution_scale, + height * m_resolution_scale, + m_GPUSTAT.set_mask_while_drawing ? 1u : 0u, + GetCurrentNormalizedBatchVertexDepthID()}; const Common::Rectangle dst_bounds_scaled(dst_bounds * m_resolution_scale); SetViewportAndScissor(dst_bounds_scaled.left, dst_bounds_scaled.top, dst_bounds_scaled.GetWidth(), dst_bounds_scaled.GetHeight()); + m_context->OMSetDepthStencilState( + m_GPUSTAT.check_mask_before_draw ? m_depth_test_less_state.Get() : m_depth_test_always_state.Get(), 0); m_context->PSSetShaderResources(0, 1, m_vram_read_texture.GetD3DSRVArray()); DrawUtilityShader(m_vram_copy_pixel_shader.Get(), &uniforms, sizeof(uniforms)); RestoreGraphicsAPIState(); @@ -728,6 +781,9 @@ void GPU_HW_D3D11::CopyVRAM(u32 src_x, u32 src_y, u32 dst_x, u32 dst_y, u32 widt if (m_vram_dirty_rect.Intersects(Common::Rectangle::FromExtents(src_x, src_y, width, height))) UpdateVRAMReadTexture(); + if (m_GPUSTAT.IsMaskingEnabled()) + Log_WarningPrintf("Masking enabled on VRAM copy - not implemented"); + GPU_HW::CopyVRAM(src_x, src_y, dst_x, dst_y, width, height); src_x *= m_resolution_scale; @@ -749,6 +805,21 @@ void GPU_HW_D3D11::UpdateVRAMReadTexture() &src_box); } +void GPU_HW_D3D11::UpdateDepthBufferFromMaskBit() +{ + SetViewportAndScissor(0, 0, m_vram_texture.GetWidth(), m_vram_texture.GetHeight()); + + m_context->OMSetRenderTargets(0, nullptr, m_vram_depth_view.Get()); + m_context->OMSetDepthStencilState(m_depth_test_always_state.Get(), 0); + m_context->OMSetBlendState(m_blend_no_color_writes_state.Get(), nullptr, 0xFFFFFFFFu); + + m_context->PSSetShaderResources(0, 1, m_vram_texture.GetD3DSRVArray()); + DrawUtilityShader(m_vram_update_depth_pixel_shader.Get(), nullptr, 0); + + m_context->PSSetShaderResources(0, 1, m_vram_read_texture.GetD3DSRVArray()); + RestoreGraphicsAPIState(); +} + std::unique_ptr GPU::CreateHardwareD3D11Renderer() { return std::make_unique(); diff --git a/src/core/gpu_hw_d3d11.h b/src/core/gpu_hw_d3d11.h index dca10597a..38f9f1a98 100644 --- a/src/core/gpu_hw_d3d11.h +++ b/src/core/gpu_hw_d3d11.h @@ -34,6 +34,7 @@ protected: void UpdateVRAM(u32 x, u32 y, u32 width, u32 height, const void* data) override; void CopyVRAM(u32 src_x, u32 src_y, u32 dst_x, u32 dst_y, u32 width, u32 height) override; void UpdateVRAMReadTexture() override; + void UpdateDepthBufferFromMaskBit() override; void SetScissorFromDrawingArea() override; void MapBatchVertexPointer(u32 required_vertices) override; void UnmapBatchVertexPointer(u32 used_vertices) override; @@ -77,6 +78,8 @@ private: // downsample texture - used for readbacks at >1xIR. D3D11::Texture m_vram_texture; + D3D11::Texture m_vram_depth_texture; + ComPtr m_vram_depth_view; D3D11::Texture m_vram_read_texture; D3D11::Texture m_vram_encoding_texture; D3D11::Texture m_display_texture; @@ -94,8 +97,11 @@ private: ComPtr m_cull_none_rasterizer_state; ComPtr m_depth_disabled_state; + ComPtr m_depth_test_always_state; + ComPtr m_depth_test_less_state; ComPtr m_blend_disabled_state; + ComPtr m_blend_no_color_writes_state; ComPtr m_point_sampler_state; ComPtr m_linear_sampler_state; @@ -114,5 +120,6 @@ private: ComPtr m_vram_read_pixel_shader; ComPtr m_vram_write_pixel_shader; ComPtr m_vram_copy_pixel_shader; + ComPtr m_vram_update_depth_pixel_shader; std::array, 2>, 2> m_display_pixel_shaders; // [depth_24][interlaced] }; diff --git a/src/core/gpu_hw_opengl.cpp b/src/core/gpu_hw_opengl.cpp index d7494e655..9cb8bb247 100644 --- a/src/core/gpu_hw_opengl.cpp +++ b/src/core/gpu_hw_opengl.cpp @@ -11,6 +11,8 @@ GPU_HW_OpenGL::GPU_HW_OpenGL() : GPU_HW() {} GPU_HW_OpenGL::~GPU_HW_OpenGL() { // Destroy objects which don't have destructors to clean them up + if (m_vram_fbo_id != 0) + glDeleteFramebuffers(1, &m_vram_fbo_id); if (m_vao_id != 0) glDeleteVertexArrays(1, &m_vao_id); if (m_attributeless_vao_id != 0) @@ -90,7 +92,6 @@ void GPU_HW_OpenGL::ResetGraphicsAPIState() glEnable(GL_CULL_FACE); glDisable(GL_SCISSOR_TEST); glDisable(GL_BLEND); - glDepthMask(GL_TRUE); if (m_resolution_scale > 1 && !m_supports_geometry_shaders) glLineWidth(1.0f); glBindVertexArray(0); @@ -98,13 +99,14 @@ void GPU_HW_OpenGL::ResetGraphicsAPIState() void GPU_HW_OpenGL::RestoreGraphicsAPIState() { - m_vram_texture.BindFramebuffer(GL_DRAW_FRAMEBUFFER); + glBindFramebuffer(GL_DRAW_FRAMEBUFFER, m_vram_fbo_id); glViewport(0, 0, m_vram_texture.GetWidth(), m_vram_texture.GetHeight()); glDisable(GL_CULL_FACE); - glDisable(GL_DEPTH_TEST); + glEnable(GL_DEPTH_TEST); glEnable(GL_SCISSOR_TEST); - glDepthMask(GL_FALSE); + glDepthMask(GL_TRUE); + glDepthFunc(GL_ALWAYS); if (m_resolution_scale > 1 && !m_supports_geometry_shaders) glLineWidth(static_cast(m_resolution_scale)); glBindVertexArray(m_vao_id); @@ -211,34 +213,16 @@ bool GPU_HW_OpenGL::CreateFramebuffer() { // save old vram texture/fbo, in case we're changing scale GL::Texture old_vram_texture = std::move(m_vram_texture); + GLuint old_vram_fbo = m_vram_fbo_id; // scale vram size to internal resolution const u32 texture_width = VRAM_WIDTH * m_resolution_scale; const u32 texture_height = VRAM_HEIGHT * m_resolution_scale; if (!m_vram_texture.Create(texture_width, texture_height, GL_RGBA8, GL_RGBA, GL_UNSIGNED_BYTE, nullptr, false) || - !m_vram_texture.CreateFramebuffer()) - { - return false; - } - - // do we need to restore the framebuffer after a size change? - if (old_vram_texture.IsValid()) - { - const bool linear_filter = old_vram_texture.GetWidth() > m_vram_texture.GetWidth(); - Log_DevPrintf("Scaling %ux%u VRAM texture to %ux%u using %s filter", old_vram_texture.GetWidth(), - old_vram_texture.GetHeight(), m_vram_texture.GetWidth(), m_vram_texture.GetHeight(), - linear_filter ? "linear" : "nearest"); - glDisable(GL_SCISSOR_TEST); - old_vram_texture.BindFramebuffer(GL_READ_FRAMEBUFFER); - glBlitFramebuffer(0, 0, old_vram_texture.GetWidth(), old_vram_texture.GetHeight(), 0, 0, m_vram_texture.GetWidth(), - m_vram_texture.GetHeight(), GL_COLOR_BUFFER_BIT, linear_filter ? GL_LINEAR : GL_NEAREST); - - glEnable(GL_SCISSOR_TEST); - old_vram_texture.Destroy(); - } - - if (!m_vram_read_texture.Create(texture_width, texture_height, GL_RGBA8, GL_RGBA, GL_UNSIGNED_BYTE, nullptr, false) || + !m_vram_depth_texture.Create(texture_width, texture_height, GL_DEPTH_COMPONENT16, GL_DEPTH_COMPONENT, + GL_UNSIGNED_SHORT, nullptr, false) || + !m_vram_read_texture.Create(texture_width, texture_height, GL_RGBA8, GL_RGBA, GL_UNSIGNED_BYTE, nullptr, false) || !m_vram_read_texture.CreateFramebuffer() || !m_vram_encoding_texture.Create(VRAM_WIDTH, VRAM_HEIGHT, GL_RGBA8, GL_RGBA, GL_UNSIGNED_BYTE, nullptr, false) || !m_vram_encoding_texture.CreateFramebuffer() || @@ -248,7 +232,32 @@ bool GPU_HW_OpenGL::CreateFramebuffer() return false; } - m_vram_texture.BindFramebuffer(GL_DRAW_FRAMEBUFFER); + glGenFramebuffers(1, &m_vram_fbo_id); + glBindFramebuffer(GL_DRAW_FRAMEBUFFER, m_vram_fbo_id); + glFramebufferTexture2D(GL_DRAW_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, GL_TEXTURE_2D, m_vram_texture.GetGLId(), 0); + glFramebufferTexture2D(GL_DRAW_FRAMEBUFFER, GL_DEPTH_ATTACHMENT, GL_TEXTURE_2D, m_vram_depth_texture.GetGLId(), 0); + Assert(glCheckFramebufferStatus(GL_DRAW_FRAMEBUFFER) == GL_FRAMEBUFFER_COMPLETE); + + // do we need to restore the framebuffer after a size change? + if (old_vram_fbo != 0) + { + const bool linear_filter = old_vram_texture.GetWidth() > m_vram_texture.GetWidth(); + Log_DevPrintf("Scaling %ux%u VRAM texture to %ux%u using %s filter", old_vram_texture.GetWidth(), + old_vram_texture.GetHeight(), m_vram_texture.GetWidth(), m_vram_texture.GetHeight(), + linear_filter ? "linear" : "nearest"); + glDisable(GL_SCISSOR_TEST); + + glBindFramebuffer(GL_READ_FRAMEBUFFER, old_vram_fbo); + glBlitFramebuffer(0, 0, old_vram_texture.GetWidth(), old_vram_texture.GetHeight(), 0, 0, m_vram_texture.GetWidth(), + m_vram_texture.GetHeight(), GL_COLOR_BUFFER_BIT, linear_filter ? GL_LINEAR : GL_NEAREST); + + glEnable(GL_SCISSOR_TEST); + old_vram_texture.Destroy(); + glDeleteFramebuffers(1, &old_vram_fbo); + + UpdateDepthBufferFromMaskBit(); + } + SetFullVRAMDirtyRectangle(); return true; } @@ -257,7 +266,8 @@ void GPU_HW_OpenGL::ClearFramebuffer() { glDisable(GL_SCISSOR_TEST); glClearColor(0.0f, 0.0f, 0.0f, 0.0f); - glClear(GL_COLOR_BUFFER_BIT); + glClearDepth(0.0f); + glClear(GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT); glEnable(GL_SCISSOR_TEST); SetFullVRAMDirtyRectangle(); } @@ -470,6 +480,15 @@ bool GPU_HW_OpenGL::CompilePrograms() } m_vram_copy_program = std::move(*prog); + prog = m_shader_cache.GetProgram(shadergen.GenerateScreenQuadVertexShader(), {}, + shadergen.GenerateVRAMUpdateDepthFragmentShader()); + if (!prog) + return false; + + prog->Bind(); + prog->Uniform1i("samp0", 0); + m_vram_update_depth_program = std::move(*prog); + if (m_supports_texture_buffer) { prog = m_shader_cache.GetProgram(shadergen.GenerateScreenQuadVertexShader(), {}, @@ -519,6 +538,8 @@ void GPU_HW_OpenGL::DrawBatchVertices(BatchRenderMode render_mode, u32 base_vert glBlendFuncSeparate(GL_ONE, m_supports_dual_source_blend ? GL_SRC1_ALPHA : GL_SRC_ALPHA, GL_ONE, GL_ZERO); } + glDepthFunc(m_GPUSTAT.check_mask_before_draw ? GL_GEQUAL : GL_ALWAYS); + static constexpr std::array gl_primitives = {{GL_LINES, GL_LINE_STRIP, GL_TRIANGLES, GL_TRIANGLE_STRIP}}; glDrawArrays(gl_primitives[static_cast(m_batch.primitive)], m_batch_base_vertex, num_vertices); } @@ -590,6 +611,7 @@ void GPU_HW_OpenGL::UpdateDisplay() { glDisable(GL_BLEND); glDisable(GL_SCISSOR_TEST); + glDisable(GL_DEPTH_TEST); m_display_programs[BoolToUInt8(m_GPUSTAT.display_area_color_depth_24)][BoolToUInt8(interlaced)].Bind(); m_display_texture.BindFramebuffer(GL_DRAW_FRAMEBUFFER); @@ -607,6 +629,7 @@ void GPU_HW_OpenGL::UpdateDisplay() m_batch_ubo_dirty = true; glViewport(0, reinterpret_field_offset, reinterpret_width, scaled_display_height); + glBindVertexArray(m_attributeless_vao_id); glDrawArrays(GL_TRIANGLES, 0, 3); m_host_display->SetDisplayTexture(reinterpret_cast(static_cast(m_display_texture.GetGLId())), @@ -615,8 +638,10 @@ void GPU_HW_OpenGL::UpdateDisplay() scaled_display_width, -static_cast(scaled_display_height)); // restore state - m_vram_texture.BindFramebuffer(GL_DRAW_FRAMEBUFFER); + glBindFramebuffer(GL_DRAW_FRAMEBUFFER, m_vram_fbo_id); + glBindVertexArray(m_vao_id); glViewport(0, 0, m_vram_texture.GetWidth(), m_vram_texture.GetHeight()); + glEnable(GL_DEPTH_TEST); glEnable(GL_SCISSOR_TEST); } @@ -644,6 +669,7 @@ void GPU_HW_OpenGL::ReadVRAM(u32 x, u32 y, u32 width, u32 height) glDisable(GL_BLEND); glDisable(GL_SCISSOR_TEST); glViewport(0, 0, encoded_width, encoded_height); + glBindVertexArray(m_attributeless_vao_id); glDrawArrays(GL_TRIANGLES, 0, 3); // Readback encoded texture. @@ -688,7 +714,8 @@ void GPU_HW_OpenGL::FillVRAM(u32 x, u32 y, u32 width, u32 height, u32 color) { const auto [r, g, b, a] = RGBA8ToFloat(color); glClearColor(r, g, b, a); - glClear(GL_COLOR_BUFFER_BIT); + glClearDepth(a); + glClear(GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT); SetScissorFromDrawingArea(); } else @@ -705,6 +732,9 @@ void GPU_HW_OpenGL::FillVRAM(u32 x, u32 y, u32 width, u32 height, u32 color) m_vram_interlaced_fill_program.Bind(); UploadUniformBuffer(&uniforms, sizeof(uniforms)); + glDisable(GL_BLEND); + glDepthFunc(GL_ALWAYS); + glBindVertexArray(m_attributeless_vao_id); glDrawArrays(GL_TRIANGLES, 0, 3); RestoreGraphicsAPIState(); @@ -743,13 +773,21 @@ void GPU_HW_OpenGL::UpdateVRAM(u32 x, u32 y, u32 width, u32 height, const void* glViewport(scaled_x, scaled_flipped_y, scaled_width, scaled_height); glDisable(GL_BLEND); glDisable(GL_SCISSOR_TEST); + glDepthFunc(m_GPUSTAT.check_mask_before_draw ? GL_GEQUAL : GL_ALWAYS); m_vram_write_program.Bind(); glBindTexture(GL_TEXTURE_BUFFER, m_texture_buffer_r16ui_texture); - const u32 uniforms[5] = {x, flipped_y, width, height, map_result.index_aligned}; - UploadUniformBuffer(uniforms, sizeof(uniforms)); + const VRAMWriteUBOData uniforms = {x, + flipped_y, + width, + height, + map_result.index_aligned, + m_GPUSTAT.set_mask_while_drawing ? 0xFFu : 0x00, + GetCurrentNormalizedBatchVertexDepthID()}; + UploadUniformBuffer(&uniforms, sizeof(uniforms)); + glBindVertexArray(m_attributeless_vao_id); glDrawArrays(GL_TRIANGLES, 0, 3); RestoreGraphicsAPIState(); @@ -822,21 +860,21 @@ void GPU_HW_OpenGL::CopyVRAM(u32 src_x, u32 src_y, u32 dst_x, u32 dst_y, u32 wid UpdateVRAMReadTexture(); IncludeVRAMDityRectangle(dst_bounds); - VRAMCopyUBOData uniforms = { - src_x * m_resolution_scale, - src_y * m_resolution_scale, - dst_x * m_resolution_scale, - dst_y * m_resolution_scale, - width * m_resolution_scale, - height * m_resolution_scale, - m_GPUSTAT.set_mask_while_drawing ? 1u : 0u, - }; + VRAMCopyUBOData uniforms = {src_x * m_resolution_scale, + src_y * m_resolution_scale, + dst_x * m_resolution_scale, + dst_y * m_resolution_scale, + width * m_resolution_scale, + height * m_resolution_scale, + m_GPUSTAT.set_mask_while_drawing ? 1u : 0u, + GetCurrentNormalizedBatchVertexDepthID()}; uniforms.u_src_y = m_vram_texture.GetHeight() - uniforms.u_src_y - uniforms.u_height; uniforms.u_dst_y = m_vram_texture.GetHeight() - uniforms.u_dst_y - uniforms.u_height; UploadUniformBuffer(&uniforms, sizeof(uniforms)); glDisable(GL_SCISSOR_TEST); glDisable(GL_BLEND); + glDepthFunc(m_GPUSTAT.check_mask_before_draw ? GL_GEQUAL : GL_ALWAYS); const Common::Rectangle dst_bounds_scaled(dst_bounds * m_resolution_scale); glViewport(dst_bounds_scaled.left, @@ -876,7 +914,7 @@ void GPU_HW_OpenGL::CopyVRAM(u32 src_x, u32 src_y, u32 dst_x, u32 dst_y, u32 wid else { glDisable(GL_SCISSOR_TEST); - m_vram_texture.BindFramebuffer(GL_READ_FRAMEBUFFER); + glBindFramebuffer(GL_READ_FRAMEBUFFER, m_vram_fbo_id); glBlitFramebuffer(src_x, src_y, src_x + width, src_y + height, dst_x, dst_y, dst_x + width, dst_y + height, GL_COLOR_BUFFER_BIT, GL_NEAREST); glEnable(GL_SCISSOR_TEST); @@ -904,14 +942,31 @@ void GPU_HW_OpenGL::UpdateVRAMReadTexture() else { m_vram_read_texture.BindFramebuffer(GL_DRAW_FRAMEBUFFER); - m_vram_texture.BindFramebuffer(GL_READ_FRAMEBUFFER); + glBindFramebuffer(GL_READ_FRAMEBUFFER, m_vram_fbo_id); glDisable(GL_SCISSOR_TEST); glBlitFramebuffer(x, y, x + width, y + height, x, y, x + width, y + height, GL_COLOR_BUFFER_BIT, GL_NEAREST); glEnable(GL_SCISSOR_TEST); - m_vram_texture.BindFramebuffer(GL_FRAMEBUFFER); + glBindFramebuffer(GL_DRAW_FRAMEBUFFER, m_vram_fbo_id); } } +void GPU_HW_OpenGL::UpdateDepthBufferFromMaskBit() +{ + glDisable(GL_SCISSOR_TEST); + glDisable(GL_BLEND); + glColorMask(GL_FALSE, GL_FALSE, GL_FALSE, GL_FALSE); + glDepthFunc(GL_ALWAYS); + + m_vram_texture.Bind(); + m_vram_update_depth_program.Bind(); + glBindVertexArray(m_attributeless_vao_id); + glDrawArrays(GL_TRIANGLES, 0, 3); + + glBindVertexArray(m_vao_id); + glColorMask(GL_TRUE, GL_TRUE, GL_TRUE, GL_TRUE); + glEnable(GL_SCISSOR_TEST); +} + std::unique_ptr GPU::CreateHardwareOpenGLRenderer() { return std::make_unique(); diff --git a/src/core/gpu_hw_opengl.h b/src/core/gpu_hw_opengl.h index cb50b1394..71918c5c3 100644 --- a/src/core/gpu_hw_opengl.h +++ b/src/core/gpu_hw_opengl.h @@ -30,6 +30,7 @@ protected: void UpdateVRAM(u32 x, u32 y, u32 width, u32 height, const void* data) override; void CopyVRAM(u32 src_x, u32 src_y, u32 dst_x, u32 dst_y, u32 width, u32 height) override; void UpdateVRAMReadTexture() override; + void UpdateDepthBufferFromMaskBit() override; void SetScissorFromDrawingArea() override; void MapBatchVertexPointer(u32 required_vertices) override; void UnmapBatchVertexPointer(u32 used_vertices) override; @@ -63,11 +64,13 @@ private: // downsample texture - used for readbacks at >1xIR. GL::Texture m_vram_texture; + GL::Texture m_vram_depth_texture; GL::Texture m_vram_read_texture; GL::Texture m_vram_encoding_texture; GL::Texture m_display_texture; std::unique_ptr m_vertex_stream_buffer; + GLuint m_vram_fbo_id = 0; GLuint m_vao_id = 0; GLuint m_attributeless_vao_id = 0; @@ -85,6 +88,7 @@ private: GL::Program m_vram_read_program; GL::Program m_vram_write_program; GL::Program m_vram_copy_program; + GL::Program m_vram_update_depth_program; u32 m_uniform_buffer_alignment = 1; u32 m_max_texture_buffer_size = 0; diff --git a/src/core/gpu_hw_shadergen.cpp b/src/core/gpu_hw_shadergen.cpp index 35fe5b731..c75e517bf 100644 --- a/src/core/gpu_hw_shadergen.cpp +++ b/src/core/gpu_hw_shadergen.cpp @@ -319,6 +319,9 @@ void GPU_HW_ShaderGen::DeclareVertexEntryPoint( { ss << "void main(\n"; + if (declare_vertex_id) + ss << " in uint v_id : SV_VertexID,\n"; + u32 attribute_counter = 0; for (const char* attribute : attributes) { @@ -326,9 +329,6 @@ void GPU_HW_ShaderGen::DeclareVertexEntryPoint( attribute_counter++; } - if (declare_vertex_id) - ss << " in uint v_id : SV_VertexID,\n"; - for (u32 i = 0; i < num_color_outputs; i++) ss << " out float4 v_col" << i << " : COLOR" << i << ",\n"; @@ -349,7 +349,7 @@ void GPU_HW_ShaderGen::DeclareVertexEntryPoint( void GPU_HW_ShaderGen::DeclareFragmentEntryPoint( std::stringstream& ss, u32 num_color_inputs, u32 num_texcoord_inputs, const std::initializer_list>& additional_inputs, - bool declare_fragcoord /* = false */, bool dual_color_output /* = false */) + bool declare_fragcoord /* = false */, u32 num_color_outputs /* = 1 */, bool depth_output /* = false */) { if (m_glsl) { @@ -381,23 +381,18 @@ void GPU_HW_ShaderGen::DeclareFragmentEntryPoint( if (declare_fragcoord) ss << "#define v_pos gl_FragCoord\n"; + if (depth_output) + ss << "#define o_depth gl_FragDepth\n"; + if (m_use_glsl_binding_layout) { - if (dual_color_output) - { - ss << "layout(location = 0, index = 0) out float4 o_col0;\n"; - ss << "layout(location = 0, index = 1) out float4 o_col1;\n"; - } - else - { - ss << "layout(location = 0) out float4 o_col0;\n"; - } + for (u32 i = 0; i < num_color_outputs; i++) + ss << "layout(location = 0, index = " << i << ") out float4 o_col" << i << ";\n"; } else { - ss << "out float4 o_col0;\n"; - if (dual_color_output) - ss << "out float4 o_col1;\n"; + for (u32 i = 0; i < num_color_outputs; i++) + ss << "out float4 o_col" << i << ";\n"; } ss << "\n"; @@ -425,14 +420,23 @@ void GPU_HW_ShaderGen::DeclareFragmentEntryPoint( if (declare_fragcoord) ss << " in float4 v_pos : SV_Position,\n"; - if (dual_color_output) + if (depth_output) { - ss << " out float4 o_col0 : SV_Target0,\n"; - ss << " out float4 o_col1 : SV_Target1)\n"; + ss << " out float o_depth : SV_Depth"; + if (num_color_outputs > 0) + ss << ",\n"; + else + ss << ")\n"; } - else + + for (u32 i = 0; i < num_color_outputs; i++) { - ss << " out float4 o_col0 : SV_Target)"; + ss << " out float4 o_col" << i << " : SV_Target" << i; + + if (i == (num_color_outputs - 1)) + ss << ")\n"; + else + ss << ",\n"; } } } @@ -440,9 +444,10 @@ void GPU_HW_ShaderGen::DeclareFragmentEntryPoint( void GPU_HW_ShaderGen::WriteBatchUniformBuffer(std::stringstream& ss) { - DeclareUniformBuffer(ss, {"uint2 u_texture_window_mask", "uint2 u_texture_window_offset", "float u_src_alpha_factor", - "float u_dst_alpha_factor", "bool u_set_mask_while_drawing", - "uint u_interlaced_displayed_field"}); + DeclareUniformBuffer(ss, + {"uint2 u_texture_window_mask", "uint2 u_texture_window_offset", "float u_src_alpha_factor", + "float u_dst_alpha_factor", "uint u_interlaced_displayed_field", "uint u_base_vertex_depth_id", + "bool u_check_mask_before_draw", "bool u_set_mask_while_drawing"}); } std::string GPU_HW_ShaderGen::GenerateBatchVertexShader(bool textured) @@ -459,11 +464,11 @@ std::string GPU_HW_ShaderGen::GenerateBatchVertexShader(bool textured) if (textured) { DeclareVertexEntryPoint(ss, {"int2 a_pos", "float4 a_col0", "uint a_texcoord", "uint a_texpage"}, 1, 1, - {{"nointerpolation", "uint4 v_texpage"}}); + {{"nointerpolation", "uint4 v_texpage"}, {"nointerpolation", "float v_depth"}}, true); } else { - DeclareVertexEntryPoint(ss, {"int2 a_pos", "float4 a_col0"}, 1, 0, {}); + DeclareVertexEntryPoint(ss, {"int2 a_pos", "float4 a_col0"}, 1, 0, {{"nointerpolation", "float v_depth"}}, true); } ss << R"( @@ -484,6 +489,12 @@ std::string GPU_HW_ShaderGen::GenerateBatchVertexShader(bool textured) #endif v_pos = float4(pos_x, pos_y, 0.0, 1.0); +#if API_D3D11 + v_depth = 1.0 - (float(u_base_vertex_depth_id + (u_check_mask_before_draw ? 0u : v_id)) / 65535.0); +#else + v_depth = 1.0 - (float(v_id - u_base_vertex_depth_id) / 65535.0); +#endif + v_col0 = a_col0; #if TEXTURED // Fudge the texture coordinates by half a pixel in screen-space. @@ -616,11 +627,12 @@ float4 SampleFromVRAM(uint4 texpage, uint2 icoord) if (textured) { - DeclareFragmentEntryPoint(ss, 1, 1, {{"nointerpolation", "uint4 v_texpage"}}, true, use_dual_source); + DeclareFragmentEntryPoint(ss, 1, 1, {{"nointerpolation", "uint4 v_texpage"}, {"nointerpolation", "float v_depth"}}, + true, use_dual_source ? 2 : 1, true); } else { - DeclareFragmentEntryPoint(ss, 1, 0, {}, true, use_dual_source); + DeclareFragmentEntryPoint(ss, 1, 0, {{"nointerpolation", "float v_depth"}}, true, use_dual_source ? 2 : 1, true); } ss << R"( @@ -736,6 +748,8 @@ float4 SampleFromVRAM(uint4 texpage, uint2 icoord) #else o_col0 = float4(color, u_dst_alpha_factor / ialpha); #endif + + o_depth = oalpha * v_depth; } else { @@ -752,6 +766,8 @@ float4 SampleFromVRAM(uint4 texpage, uint2 icoord) #else o_col0 = float4(color, 1.0 - ialpha); #endif + + o_depth = oalpha * v_depth; } #else // Non-transparency won't enable blending so we can write the mask here regardless. @@ -760,6 +776,8 @@ float4 SampleFromVRAM(uint4 texpage, uint2 icoord) #if USE_DUAL_SOURCE o_col1 = float4(0.0, 0.0, 0.0, 1.0 - ialpha); #endif + + o_depth = oalpha * v_depth; #endif } )"; @@ -783,10 +801,12 @@ CONSTANT float2 WIDTH = (1.0 / float2(VRAM_SIZE)) * float2(RESOLUTION_SCALE, RES ss << R"( in VertexData { float4 v_col0; + nointerpolation float v_depth; } in_data[]; out VertexData { float4 v_col0; + nointerpolation float v_depth; } out_data; layout(lines) in; @@ -799,21 +819,25 @@ void main() { // top-left out_data.v_col0 = in_data[0].v_col0; + out_data.v_depth = in_data[0].v_depth; gl_Position = gl_in[0].gl_Position - offset; EmitVertex(); // top-right out_data.v_col0 = in_data[0].v_col0; + out_data.v_depth = in_data[0].v_depth; gl_Position = gl_in[0].gl_Position + offset; EmitVertex(); // bottom-left out_data.v_col0 = in_data[1].v_col0; + out_data.v_depth = in_data[1].v_depth; gl_Position = gl_in[1].gl_Position - offset; EmitVertex(); // bottom-right out_data.v_col0 = in_data[1].v_col0; + out_data.v_depth = in_data[1].v_depth; gl_Position = gl_in[1].gl_Position + offset; EmitVertex(); @@ -827,6 +851,7 @@ void main() { struct Vertex { float4 col0 : COLOR0; + float depth : TEXCOORD0; float4 pos : SV_Position; }; @@ -841,21 +866,25 @@ void main(line Vertex input[2], inout TriangleStream output) // top-left v.col0 = input[0].col0; + v.depth = input[0].depth; v.pos = input[0].pos - offset; output.Append(v); // top-right v.col0 = input[0].col0; + v.depth = input[0].depth; v.pos = input[0].pos + offset; output.Append(v); // bottom-left v.col0 = input[1].col0; + v.depth = input[1].depth; v.pos = input[1].pos - offset; output.Append(v); // bottom-right v.col0 = input[1].col0; + v.depth = input[1].depth; v.pos = input[1].pos + offset; output.Append(v); @@ -890,11 +919,12 @@ std::string GPU_HW_ShaderGen::GenerateFillFragmentShader() std::stringstream ss; WriteHeader(ss); DeclareUniformBuffer(ss, {"float4 u_fill_color"}); - DeclareFragmentEntryPoint(ss, 0, 1, {}, false, false); + DeclareFragmentEntryPoint(ss, 0, 1, {}, false, 1, true); ss << R"( { o_col0 = u_fill_color; + o_depth = u_fill_color.a; } )"; @@ -907,7 +937,7 @@ std::string GPU_HW_ShaderGen::GenerateInterlacedFillFragmentShader() WriteHeader(ss); WriteCommonFunctions(ss); DeclareUniformBuffer(ss, {"float4 u_fill_color", "uint u_interlaced_displayed_field"}); - DeclareFragmentEntryPoint(ss, 0, 1, {}, true, false); + DeclareFragmentEntryPoint(ss, 0, 1, {}, true, 1, true); ss << R"( { @@ -915,6 +945,7 @@ std::string GPU_HW_ShaderGen::GenerateInterlacedFillFragmentShader() discard; o_col0 = u_fill_color; + o_depth = u_fill_color.a; } )"; @@ -927,12 +958,12 @@ std::string GPU_HW_ShaderGen::GenerateCopyFragmentShader() WriteHeader(ss); DeclareUniformBuffer(ss, {"float4 u_src_rect"}); DeclareTexture(ss, "samp0", 0); - DeclareFragmentEntryPoint(ss, 0, 1, {}, false, false); + DeclareFragmentEntryPoint(ss, 0, 1, {}, false, 1); ss << R"( { - float2 coords = u_src_rect.xy + v_tex0 * u_src_rect.zw; - o_col0 = SAMPLE_TEXTURE(samp0, coords); + float2 coords = u_src_rect.xy + v_tex0 * u_src_rect.zw; + o_col0 = SAMPLE_TEXTURE(samp0, coords); } )"; @@ -950,7 +981,7 @@ std::string GPU_HW_ShaderGen::GenerateDisplayFragmentShader(bool depth_24bit, bo DeclareUniformBuffer(ss, {"uint2 u_vram_offset", "uint u_field_offset"}); DeclareTexture(ss, "samp0", 0); - DeclareFragmentEntryPoint(ss, 0, 1, {}, true, false); + DeclareFragmentEntryPoint(ss, 0, 1, {}, true, 1); ss << R"( { uint2 icoords = uint2(v_pos.xy) + u_vram_offset; @@ -1013,7 +1044,7 @@ uint SampleVRAM(uint2 coords) } )"; - DeclareFragmentEntryPoint(ss, 0, 1, {}, true, false); + DeclareFragmentEntryPoint(ss, 0, 1, {}, true, 1); ss << R"( { uint2 sample_coords = uint2(uint(v_pos.x) * 2u, uint(v_pos.y)); @@ -1043,10 +1074,11 @@ std::string GPU_HW_ShaderGen::GenerateVRAMWriteFragmentShader() std::stringstream ss; WriteHeader(ss); WriteCommonFunctions(ss); - DeclareUniformBuffer(ss, {"uint2 u_base_coords", "uint2 u_size", "uint u_buffer_base_offset"}); + DeclareUniformBuffer(ss, {"uint2 u_base_coords", "uint2 u_size", "uint u_buffer_base_offset", "uint u_mask_or_bits", + "float u_depth_value"}); DeclareTextureBuffer(ss, "samp0", 0, true, true); - DeclareFragmentEntryPoint(ss, 0, 1, {}, true, false); + DeclareFragmentEntryPoint(ss, 0, 1, {}, true, 1, true); ss << R"( { uint2 coords = uint2(v_pos.xy) / uint2(RESOLUTION_SCALE, RESOLUTION_SCALE); @@ -1058,9 +1090,10 @@ std::string GPU_HW_ShaderGen::GenerateVRAMWriteFragmentShader() #endif uint buffer_offset = u_buffer_base_offset + (offset.y * u_size.x) + offset.x; - uint value = LOAD_TEXTURE_BUFFER(samp0, int(buffer_offset)).r; + uint value = LOAD_TEXTURE_BUFFER(samp0, int(buffer_offset)).r | u_mask_or_bits; o_col0 = RGBA5551ToRGBA8(value); + o_depth = (o_col0.a == 1.0) ? u_depth_value : 0.0; })"; return ss.str(); @@ -1071,10 +1104,11 @@ std::string GPU_HW_ShaderGen::GenerateVRAMCopyFragmentShader() std::stringstream ss; WriteHeader(ss); WriteCommonFunctions(ss); - DeclareUniformBuffer(ss, {"uint2 u_src_coords", "uint2 u_dst_coords", "uint2 u_size", "bool u_set_mask_bit"}); + DeclareUniformBuffer( + ss, {"uint2 u_src_coords", "uint2 u_dst_coords", "uint2 u_size", "bool u_set_mask_bit", "float u_depth_value"}); DeclareTexture(ss, "samp0", 0); - DeclareFragmentEntryPoint(ss, 0, 1, {}, true, false); + DeclareFragmentEntryPoint(ss, 0, 1, {}, true, 1, true); ss << R"( { uint2 dst_coords = uint2(v_pos.xy); @@ -1090,7 +1124,24 @@ std::string GPU_HW_ShaderGen::GenerateVRAMCopyFragmentShader() // sample and apply mask bit float4 color = LOAD_TEXTURE(samp0, int2(src_coords), 0); o_col0 = float4(color.xyz, u_set_mask_bit ? 1.0 : color.a); + o_depth = (u_set_mask_bit ? 1.0f : ((o_col0.a == 1.0) ? u_depth_value : 0.0)); })"; return ss.str(); } + +std::string GPU_HW_ShaderGen::GenerateVRAMUpdateDepthFragmentShader() +{ + std::stringstream ss; + WriteHeader(ss); + DeclareTexture(ss, "samp0", 0); + DeclareFragmentEntryPoint(ss, 0, 1, {}, true, 0, true); + + ss << R"( +{ + o_depth = LOAD_TEXTURE(samp0, int2(v_pos.xy), 0).a; +} +)"; + + return ss.str(); +} diff --git a/src/core/gpu_hw_shadergen.h b/src/core/gpu_hw_shadergen.h index c69eea47a..bf62c0447 100644 --- a/src/core/gpu_hw_shadergen.h +++ b/src/core/gpu_hw_shadergen.h @@ -25,6 +25,7 @@ public: std::string GenerateVRAMReadFragmentShader(); std::string GenerateVRAMWriteFragmentShader(); std::string GenerateVRAMCopyFragmentShader(); + std::string GenerateVRAMUpdateDepthFragmentShader(); private: void SetGLSLVersionString(); @@ -38,7 +39,7 @@ private: bool declare_vertex_id = false); void DeclareFragmentEntryPoint(std::stringstream& ss, u32 num_color_inputs, u32 num_texcoord_inputs, const std::initializer_list>& additional_inputs, - bool declare_fragcoord = false, bool dual_color_output = false); + bool declare_fragcoord = false, u32 num_color_outputs = 1, bool depth_output = false); void WriteCommonFunctions(std::stringstream& ss); void WriteBatchUniformBuffer(std::stringstream& ss);