From d30e076dbda802092390bbc5ba7695fd43090d4c Mon Sep 17 00:00:00 2001 From: Stenzek Date: Mon, 2 Jan 2023 23:14:10 +1000 Subject: [PATCH] GS: Add GPU Target CLUT --- bin/resources/shaders/dx11/convert.fx | 38 ++++- bin/resources/shaders/opengl/convert.glsl | 35 +++++ bin/resources/shaders/vulkan/convert.glsl | 43 ++++++ pcsx2-qt/Settings/GraphicsSettingsWidget.cpp | 1 + pcsx2-qt/Settings/GraphicsSettingsWidget.ui | 32 ++++- pcsx2/Config.h | 8 ++ pcsx2/GS/GS.cpp | 3 +- pcsx2/GS/GSClut.cpp | 59 +++++++- pcsx2/GS/GSClut.h | 7 + pcsx2/GS/Renderers/Common/GSDevice.cpp | 2 + pcsx2/GS/Renderers/Common/GSDevice.h | 5 + pcsx2/GS/Renderers/Common/GSRenderer.cpp | 5 + pcsx2/GS/Renderers/Common/GSRenderer.h | 2 + pcsx2/GS/Renderers/DX11/GSDevice11.cpp | 18 +++ pcsx2/GS/Renderers/DX11/GSDevice11.h | 1 + pcsx2/GS/Renderers/DX12/GSDevice12.cpp | 18 +++ pcsx2/GS/Renderers/DX12/GSDevice12.h | 1 + pcsx2/GS/Renderers/HW/GSRendererHW.cpp | 94 +++++++++---- pcsx2/GS/Renderers/HW/GSRendererHW.h | 12 +- pcsx2/GS/Renderers/HW/GSTextureCache.cpp | 141 +++++++++++++++---- pcsx2/GS/Renderers/HW/GSTextureCache.h | 7 +- pcsx2/GS/Renderers/OpenGL/GSDeviceOGL.cpp | 31 ++++ pcsx2/GS/Renderers/OpenGL/GSDeviceOGL.h | 1 + pcsx2/GS/Renderers/Vulkan/GSDeviceVK.cpp | 17 +++ pcsx2/GS/Renderers/Vulkan/GSDeviceVK.h | 2 + pcsx2/Pcsx2Config.cpp | 3 + 26 files changed, 518 insertions(+), 68 deletions(-) diff --git a/bin/resources/shaders/dx11/convert.fx b/bin/resources/shaders/dx11/convert.fx index 0b8577912b..5228468e52 100644 --- a/bin/resources/shaders/dx11/convert.fx +++ b/bin/resources/shaders/dx11/convert.fx @@ -23,7 +23,8 @@ cbuffer cb0 : register(b0) float4 BGColor; int EMODA; int EMODC; - int cb0_pad[2]; + int DOFFSET; + int cb0_pad; }; static const float3x3 rgb2yuv = @@ -291,6 +292,41 @@ PS_OUTPUT ps_convert_rgba_8i(PS_INPUT input) return output; } +PS_OUTPUT ps_convert_clut_4(PS_INPUT input) +{ + // Borrowing the YUV constant buffer. 
+ float2 scale = BGColor.xy; + uint2 offset = uint2(uint(EMODA), uint(EMODC)); + + // CLUT4 is easy, just two rows of 8 entries (8x2). + uint index = uint(input.p.x) + uint(DOFFSET); + uint2 pos = uint2(index % 8u, index / 8u); + + int2 final = int2(floor(float2(offset + pos) * scale)); + PS_OUTPUT output; + output.c = Texture.Load(int3(final, 0), 0); + return output; +} + +PS_OUTPUT ps_convert_clut_8(PS_INPUT input) +{ + float2 scale = BGColor.xy; + uint2 offset = uint2(uint(EMODA), uint(EMODC)); + uint index = min(uint(input.p.x) + uint(DOFFSET), 255u); + + // CLUT is arranged into 8 groups of 16x2, with the top-right and bottom-left quadrants swapped. + // This can probably be done better.. + uint subgroup = (index / 8u) % 4u; + uint2 pos; + pos.x = (index % 8u) + ((subgroup >= 2u) ? 8u : 0u); + pos.y = ((index / 32u) * 2u) + (subgroup % 2u); + + int2 final = int2(floor(float2(offset + pos) * scale)); + PS_OUTPUT output; + output.c = Texture.Load(int3(final, 0), 0); + return output; +} + PS_OUTPUT ps_yuv(PS_INPUT input) { PS_OUTPUT output; diff --git a/bin/resources/shaders/opengl/convert.glsl b/bin/resources/shaders/opengl/convert.glsl index 9d4cb9d856..32d5ae8e45 100644 --- a/bin/resources/shaders/opengl/convert.glsl +++ b/bin/resources/shaders/opengl/convert.glsl @@ -314,6 +314,41 @@ void ps_hdr_resolve() } #endif +#ifdef ps_convert_clut_4 +uniform uvec3 offset; +uniform vec2 scale; + +void ps_convert_clut_4() +{ + // CLUT4 is easy, just two rows of 8 entries (8x2). + uint index = uint(gl_FragCoord.x) + offset.z; + uvec2 pos = uvec2(index % 8u, index / 8u); + + ivec2 final = ivec2(floor(vec2(offset.xy + pos) * scale)); + SV_Target0 = texelFetch(TextureSampler, final, 0); +} +#endif + +#ifdef ps_convert_clut_8 +uniform uvec3 offset; +uniform vec2 scale; + +void ps_convert_clut_8() +{ + uint index = min(uint(gl_FragCoord.x) + offset.z, 255u); + + // CLUT is arranged into 8 groups of 16x2, with the top-right and bottom-left quadrants swapped. + // This can probably be done better.. 
+ uint subgroup = (index / 8u) % 4u; + uvec2 pos; + pos.x = (index % 8u) + ((subgroup >= 2u) ? 8u : 0u); + pos.y = ((index / 32u) * 2u) + (subgroup % 2u); + + ivec2 final = ivec2(floor(vec2(offset.xy + pos) * scale)); + SV_Target0 = texelFetch(TextureSampler, final, 0); +} +#endif + #ifdef ps_yuv uniform ivec2 EMOD; diff --git a/bin/resources/shaders/vulkan/convert.glsl b/bin/resources/shaders/vulkan/convert.glsl index b93298c08f..13f9a242b8 100644 --- a/bin/resources/shaders/vulkan/convert.glsl +++ b/bin/resources/shaders/vulkan/convert.glsl @@ -274,6 +274,49 @@ void ps_convert_rgba_8i() } #endif +#ifdef ps_convert_clut_4 +layout(push_constant) uniform cb10 +{ + vec2 scale; + uvec2 offset; + uint doffset; +}; + +void ps_convert_clut_4() +{ + // CLUT4 is easy, just two rows of 8 entries (8x2). + uint index = uint(gl_FragCoord.x) + doffset; + uvec2 pos = uvec2(index % 8u, index / 8u); + + ivec2 final = ivec2(floor(vec2(offset + pos) * scale)); + o_col0 = texelFetch(samp0, final, 0); +} +#endif + +#ifdef ps_convert_clut_8 +layout(push_constant) uniform cb10 +{ + vec2 scale; + uvec2 offset; + uint doffset; +}; + +void ps_convert_clut_8() +{ + uint index = min(uint(gl_FragCoord.x) + doffset, 255u); + + // CLUT is arranged into 8 groups of 16x2, with the top-right and bottom-left quadrants swapped. + // This can probably be done better.. + uint subgroup = (index / 8u) % 4u; + uvec2 pos; + pos.x = (index % 8u) + ((subgroup >= 2u) ? 
8u : 0u); + pos.y = ((index / 32u) * 2u) + (subgroup % 2u); + + ivec2 final = ivec2(floor(vec2(offset + pos) * scale)); + o_col0 = texelFetch(samp0, final, 0); +} +#endif + #ifdef ps_yuv layout(push_constant) uniform cb10 { diff --git a/pcsx2-qt/Settings/GraphicsSettingsWidget.cpp b/pcsx2-qt/Settings/GraphicsSettingsWidget.cpp index c81d4205d1..f47ab8cc0e 100644 --- a/pcsx2-qt/Settings/GraphicsSettingsWidget.cpp +++ b/pcsx2-qt/Settings/GraphicsSettingsWidget.cpp @@ -205,6 +205,7 @@ GraphicsSettingsWidget::GraphicsSettingsWidget(SettingsDialog* dialog, QWidget* SettingWidgetBinder::BindWidgetToIntSetting(sif, m_ui.halfScreenFix, "EmuCore/GS", "UserHacks_Half_Bottom_Override", -1, -1); SettingWidgetBinder::BindWidgetToIntSetting(sif, m_ui.cpuSpriteRenderBW, "EmuCore/GS", "UserHacks_CPUSpriteRenderBW", 0); SettingWidgetBinder::BindWidgetToIntSetting(sif, m_ui.cpuCLUTRender, "EmuCore/GS", "UserHacks_CPUCLUTRender", 0); + SettingWidgetBinder::BindWidgetToIntSetting(sif, m_ui.gpuTargetCLUTMode, "EmuCore/GS", "UserHacks_GPUTargetCLUTMode", 0); SettingWidgetBinder::BindWidgetToIntSetting(sif, m_ui.skipDrawStart, "EmuCore/GS", "UserHacks_SkipDraw_Start", 0); SettingWidgetBinder::BindWidgetToIntSetting(sif, m_ui.skipDrawEnd, "EmuCore/GS", "UserHacks_SkipDraw_End", 0); SettingWidgetBinder::BindWidgetToBoolSetting(sif, m_ui.hwAutoFlush, "EmuCore/GS", "UserHacks_AutoFlush", false); diff --git a/pcsx2-qt/Settings/GraphicsSettingsWidget.ui b/pcsx2-qt/Settings/GraphicsSettingsWidget.ui index 7b317c88ea..3872dc5c28 100644 --- a/pcsx2-qt/Settings/GraphicsSettingsWidget.ui +++ b/pcsx2-qt/Settings/GraphicsSettingsWidget.ui @@ -920,14 +920,14 @@ - + Skipdraw Range: - + @@ -945,7 +945,7 @@ - + @@ -1030,6 +1030,32 @@ + + + + GPU Target CLUT: + + + + + + + + Disabled (Default) + + + + + Enabled (Exact Match) + + + + + Enabled (Check Inside Target) + + + + diff --git a/pcsx2/Config.h b/pcsx2/Config.h index 059f76aebe..669e6dd6fd 100644 --- a/pcsx2/Config.h +++ b/pcsx2/Config.h @@ -350,6 
+350,13 @@ enum class GSCASMode : u8 SharpenAndResize, }; +enum class GSGPUTargetCLUTMode : u8 +{ + Disabled, + Enabled, + InsideTarget, +}; + // Template function for casting enumerations to their underlying type template typename std::underlying_type::type enum_cast(Enumeration E) @@ -727,6 +734,7 @@ struct Pcsx2Config int UserHacks_TCOffsetY{0}; int UserHacks_CPUSpriteRenderBW{0}; int UserHacks_CPUCLUTRender{ 0 }; + GSGPUTargetCLUTMode UserHacks_GPUTargetCLUTMode{GSGPUTargetCLUTMode::Disabled}; TriFiltering TriFilter{TriFiltering::Automatic}; int OverrideTextureBarriers{-1}; int OverrideGeometryShaders{-1}; diff --git a/pcsx2/GS/GS.cpp b/pcsx2/GS/GS.cpp index cbfc044c27..b572c2e25c 100644 --- a/pcsx2/GS/GS.cpp +++ b/pcsx2/GS/GS.cpp @@ -728,7 +728,8 @@ void GSUpdateConfig(const Pcsx2Config::GSOptions& new_config) GSConfig.UserHacks_DisablePartialInvalidation != old_config.UserHacks_DisablePartialInvalidation || GSConfig.UserHacks_TextureInsideRt != old_config.UserHacks_TextureInsideRt || GSConfig.UserHacks_CPUSpriteRenderBW != old_config.UserHacks_CPUSpriteRenderBW || - GSConfig.UserHacks_CPUCLUTRender != old_config.UserHacks_CPUCLUTRender) + GSConfig.UserHacks_CPUCLUTRender != old_config.UserHacks_CPUCLUTRender || + GSConfig.UserHacks_GPUTargetCLUTMode != old_config.UserHacks_GPUTargetCLUTMode) { g_gs_renderer->PurgeTextureCache(); g_gs_renderer->PurgePool(); diff --git a/pcsx2/GS/GSClut.cpp b/pcsx2/GS/GSClut.cpp index 2b5cef5488..d37b5865df 100644 --- a/pcsx2/GS/GSClut.cpp +++ b/pcsx2/GS/GSClut.cpp @@ -14,9 +14,11 @@ */ #include "PrecompiledHeader.h" -#include "GSClut.h" -#include "GSLocalMemory.h" -#include "GSGL.h" +#include "GS/GSClut.h" +#include "GS/GSLocalMemory.h" +#include "GS/GSGL.h" +#include "GS/Renderers/Common/GSDevice.h" +#include "GS/Renderers/Common/GSRenderer.h" #include "common/AlignedMalloc.h" GSClut::GSClut(GSLocalMemory* mem) @@ -103,6 +105,11 @@ GSClut::GSClut(GSLocalMemory* mem) GSClut::~GSClut() { + if (m_gpu_clut4) + delete m_gpu_clut4; 
+ if (m_gpu_clut8) + delete m_gpu_clut8; + _aligned_free(m_clut); } @@ -381,6 +388,52 @@ void GSClut::Read32(const GIFRegTEX0& TEX0, const GIFRegTEXA& TEXA) break; } } + + m_current_gpu_clut = nullptr; + if (GSConfig.UserHacks_GPUTargetCLUTMode != GSGPUTargetCLUTMode::Disabled) + { + const bool is_4bit = (TEX0.PSM == PSM_PSMT4 || TEX0.PSM == PSM_PSMT4HL || TEX0.PSM == PSM_PSMT4HH); + + u32 CBW; + GSVector2i offset; + GSVector2i size; + if (!TEX0.CSM) + { + CBW = 0; // don't care + offset = {}; + size.x = is_4bit ? 8 : 16; + size.y = is_4bit ? 2 : 16; + } + else + { + CBW = m_write.TEXCLUT.CBW; + offset.x = m_write.TEXCLUT.COU; + offset.y = m_write.TEXCLUT.COV; + size.x = is_4bit ? 16 : 256; + size.y = 1; + } + + GSTexture* src = g_gs_renderer->LookupPaletteSource(TEX0.CBP, TEX0.CPSM, CBW, offset, size); + if (src) + { + GSTexture* dst = is_4bit ? m_gpu_clut4 : m_gpu_clut8; + u32 dst_size = is_4bit ? 16 : 256; + const u32 dOffset = (TEX0.CSA & ((TEX0.CPSM == PSM_PSMCT16 || TEX0.CPSM == PSM_PSMCT16S) ? 15u : 31u)) << 4; + if (!dst) + { + // allocate texture lazily + dst = g_gs_device->CreateRenderTarget(dst_size, 1, GSTexture::Format::Color, false); + is_4bit ? 
(m_gpu_clut4 = dst) : (m_gpu_clut8 = dst); + } + if (dst) + { + GL_PUSH("Update GPU CLUT [CBP=%04X, CPSM=%s, CBW=%u, CSA=%u, Offset=(%d,%d)]", + TEX0.CBP, psm_str(TEX0.CPSM), CBW, TEX0.CSA, offset.x, offset.y); + g_gs_device->UpdateCLUTTexture(src, offset.x, offset.y, dst, dOffset, dst_size); + m_current_gpu_clut = dst; + } + } + } } } diff --git a/pcsx2/GS/GSClut.h b/pcsx2/GS/GSClut.h index ecb8f8d89b..f616ee86d2 100644 --- a/pcsx2/GS/GSClut.h +++ b/pcsx2/GS/GSClut.h @@ -21,6 +21,7 @@ #include "GSAlignedClass.h" class GSLocalMemory; +class GSTexture; class alignas(32) GSClut final : public GSAlignedClass<32> { @@ -55,6 +56,10 @@ class alignas(32) GSClut final : public GSAlignedClass<32> bool IsDirty(const GIFRegTEX0& TEX0, const GIFRegTEXA& TEXA); } m_read; + GSTexture* m_gpu_clut4 = nullptr; + GSTexture* m_gpu_clut8 = nullptr; + GSTexture* m_current_gpu_clut = nullptr; + typedef void (GSClut::*writeCLUT)(const GIFRegTEX0& TEX0, const GIFRegTEXCLUT& TEXCLUT); writeCLUT m_wc[2][16][64]; @@ -101,6 +106,8 @@ public: GSClut(GSLocalMemory* mem); ~GSClut(); + __fi GSTexture* GetGPUTexture() const { return m_current_gpu_clut; } + bool InvalidateRange(u32 start_block, u32 end_block, bool is_draw = false); u8 IsInvalid(); void ClearDrawInvalidity(); diff --git a/pcsx2/GS/Renderers/Common/GSDevice.cpp b/pcsx2/GS/Renderers/Common/GSDevice.cpp index a65f3419d4..fee6ef54f2 100644 --- a/pcsx2/GS/Renderers/Common/GSDevice.cpp +++ b/pcsx2/GS/Renderers/Common/GSDevice.cpp @@ -46,6 +46,8 @@ const char* shaderName(ShaderConvert value) case ShaderConvert::RGB5A1_TO_FLOAT16_BILN: return "ps_convert_rgb5a1_float16_biln"; case ShaderConvert::DEPTH_COPY: return "ps_depth_copy"; case ShaderConvert::RGBA_TO_8I: return "ps_convert_rgba_8i"; + case ShaderConvert::CLUT_4: return "ps_convert_clut_4"; + case ShaderConvert::CLUT_8: return "ps_convert_clut_8"; case ShaderConvert::YUV: return "ps_yuv"; // clang-format on default: diff --git a/pcsx2/GS/Renderers/Common/GSDevice.h 
b/pcsx2/GS/Renderers/Common/GSDevice.h index 4483b068db..41b33ed7c0 100644 --- a/pcsx2/GS/Renderers/Common/GSDevice.h +++ b/pcsx2/GS/Renderers/Common/GSDevice.h @@ -49,6 +49,8 @@ enum class ShaderConvert RGB5A1_TO_FLOAT16_BILN, DEPTH_COPY, RGBA_TO_8I, + CLUT_4, + CLUT_8, YUV, Count }; @@ -834,6 +836,9 @@ public: /// Performs a screen blit for display. If dTex is null, it assumes you are writing to the system framebuffer/swap chain. virtual void PresentRect(GSTexture* sTex, const GSVector4& sRect, GSTexture* dTex, const GSVector4& dRect, PresentShader shader, float shaderTime, bool linear) {} + /// Updates a GPU CLUT texture from a source texture. + virtual void UpdateCLUTTexture(GSTexture* sTex, u32 offsetX, u32 offsetY, GSTexture* dTex, u32 dOffset, u32 dSize) {} + virtual void RenderHW(GSHWDrawConfig& config) {} __fi FeatureSupport Features() const { return m_features; } diff --git a/pcsx2/GS/Renderers/Common/GSRenderer.cpp b/pcsx2/GS/Renderers/Common/GSRenderer.cpp index 70ba6be14a..a93177b819 100644 --- a/pcsx2/GS/Renderers/Common/GSRenderer.cpp +++ b/pcsx2/GS/Renderers/Common/GSRenderer.cpp @@ -954,6 +954,11 @@ void GSRenderer::PurgeTextureCache() { } +GSTexture* GSRenderer::LookupPaletteSource(u32 CBP, u32 CPSM, u32 CBW, GSVector2i& offset, const GSVector2i& size) +{ + return nullptr; +} + bool GSRenderer::SaveSnapshotToMemory(u32 window_width, u32 window_height, bool apply_aspect, bool crop_borders, u32* width, u32* height, std::vector* pixels) { diff --git a/pcsx2/GS/Renderers/Common/GSRenderer.h b/pcsx2/GS/Renderers/Common/GSRenderer.h index 1c45f32f48..553dc3657f 100644 --- a/pcsx2/GS/Renderers/Common/GSRenderer.h +++ b/pcsx2/GS/Renderers/Common/GSRenderer.h @@ -54,6 +54,8 @@ public: virtual void PurgePool() override; virtual void PurgeTextureCache(); + virtual GSTexture* LookupPaletteSource(u32 CBP, u32 CPSM, u32 CBW, GSVector2i& offset, const GSVector2i& size); + bool SaveSnapshotToMemory(u32 window_width, u32 window_height, bool apply_aspect, bool 
crop_borders, u32* width, u32* height, std::vector* pixels); diff --git a/pcsx2/GS/Renderers/DX11/GSDevice11.cpp b/pcsx2/GS/Renderers/DX11/GSDevice11.cpp index fa674ddc0f..1f5941816e 100644 --- a/pcsx2/GS/Renderers/DX11/GSDevice11.cpp +++ b/pcsx2/GS/Renderers/DX11/GSDevice11.cpp @@ -773,6 +773,24 @@ void GSDevice11::PresentRect(GSTexture* sTex, const GSVector4& sRect, GSTexture* PSSetShaderResources(nullptr, nullptr); } +void GSDevice11::UpdateCLUTTexture(GSTexture* sTex, u32 offsetX, u32 offsetY, GSTexture* dTex, u32 dOffset, u32 dSize) +{ + // match merge cb + struct Uniforms + { + float scaleX, scaleY; + float pad1[2]; + u32 offsetX, offsetY, dOffset; + u32 pad2; + }; + const Uniforms cb = {sTex->GetScale().x, sTex->GetScale().y, 0.0f, 0.0f, offsetX, offsetY, dOffset}; + m_ctx->UpdateSubresource(m_merge.cb.get(), 0, nullptr, &cb, 0, 0); + + const GSVector4 dRect(0, 0, dSize, 1); + const ShaderConvert shader = (dSize == 16) ? ShaderConvert::CLUT_4 : ShaderConvert::CLUT_8; + StretchRect(sTex, GSVector4::zero(), dTex, dRect, m_convert.ps[static_cast(shader)].get(), m_merge.cb.get(), nullptr, false); +} + void GSDevice11::DoMerge(GSTexture* sTex[3], GSVector4* sRect, GSTexture* dTex, GSVector4* dRect, const GSRegPMODE& PMODE, const GSRegEXTBUF& EXTBUF, const GSVector4& c) { const GSVector4 full_r(0.0f, 0.0f, 1.0f, 1.0f); diff --git a/pcsx2/GS/Renderers/DX11/GSDevice11.h b/pcsx2/GS/Renderers/DX11/GSDevice11.h index 6a620b9456..5781bbd6f7 100644 --- a/pcsx2/GS/Renderers/DX11/GSDevice11.h +++ b/pcsx2/GS/Renderers/DX11/GSDevice11.h @@ -267,6 +267,7 @@ public: void StretchRect(GSTexture* sTex, const GSVector4& sRect, GSTexture* dTex, const GSVector4& dRect, bool red, bool green, bool blue, bool alpha) override; void StretchRect(GSTexture* sTex, const GSVector4& sRect, GSTexture* dTex, const GSVector4& dRect, ID3D11PixelShader* ps, ID3D11Buffer* ps_cb, ID3D11BlendState* bs, bool linear = true); void PresentRect(GSTexture* sTex, const GSVector4& sRect, GSTexture* dTex, 
const GSVector4& dRect, PresentShader shader, float shaderTime, bool linear) override; + void UpdateCLUTTexture(GSTexture* sTex, u32 offsetX, u32 offsetY, GSTexture* dTex, u32 dOffset, u32 dSize) override; void SetupDATE(GSTexture* rt, GSTexture* ds, const GSVertexPT1* vertices, bool datm); diff --git a/pcsx2/GS/Renderers/DX12/GSDevice12.cpp b/pcsx2/GS/Renderers/DX12/GSDevice12.cpp index 4942a479fc..d61c568687 100644 --- a/pcsx2/GS/Renderers/DX12/GSDevice12.cpp +++ b/pcsx2/GS/Renderers/DX12/GSDevice12.cpp @@ -521,6 +521,24 @@ void GSDevice12::PresentRect(GSTexture* sTex, const GSVector4& sRect, GSTexture* m_present[static_cast(shader)].get(), linear); } +void GSDevice12::UpdateCLUTTexture(GSTexture* sTex, u32 offsetX, u32 offsetY, GSTexture* dTex, u32 dOffset, u32 dSize) +{ + struct Uniforms + { + float scaleX, scaleY; + float pad1[2]; + u32 offsetX, offsetY, dOffset; + u32 pad2; + }; + const Uniforms cb = {sTex->GetScale().x, sTex->GetScale().y, 0.0f, 0.0f, offsetX, offsetY, dOffset}; + SetUtilityPushConstants(&cb, sizeof(cb)); + + const GSVector4 dRect(0, 0, dSize, 1); + const ShaderConvert shader = (dSize == 16) ? 
ShaderConvert::CLUT_4 : ShaderConvert::CLUT_8; + DoStretchRect(static_cast(sTex), GSVector4::zero(), static_cast(dTex), dRect, + m_convert[static_cast(shader)].get(), false); +} + void GSDevice12::BeginRenderPassForStretchRect(GSTexture12* dTex, const GSVector4i& dtex_rc, const GSVector4i& dst_rc) { const bool is_whole_target = dst_rc.eq(dtex_rc); diff --git a/pcsx2/GS/Renderers/DX12/GSDevice12.h b/pcsx2/GS/Renderers/DX12/GSDevice12.h index c2d22ac4d7..58ab04c716 100644 --- a/pcsx2/GS/Renderers/DX12/GSDevice12.h +++ b/pcsx2/GS/Renderers/DX12/GSDevice12.h @@ -264,6 +264,7 @@ public: bool green, bool blue, bool alpha) override; void PresentRect(GSTexture* sTex, const GSVector4& sRect, GSTexture* dTex, const GSVector4& dRect, PresentShader shader, float shaderTime, bool linear) override; + void UpdateCLUTTexture(GSTexture* sTex, u32 offsetX, u32 offsetY, GSTexture* dTex, u32 dOffset, u32 dSize) override; void BeginRenderPassForStretchRect(GSTexture12* dTex, const GSVector4i& dtex_rc, const GSVector4i& dst_rc); void DoStretchRect(GSTexture12* sTex, const GSVector4& sRect, GSTexture12* dTex, const GSVector4& dRect, diff --git a/pcsx2/GS/Renderers/HW/GSRendererHW.cpp b/pcsx2/GS/Renderers/HW/GSRendererHW.cpp index 56a646291b..fb9f9113a1 100644 --- a/pcsx2/GS/Renderers/HW/GSRendererHW.cpp +++ b/pcsx2/GS/Renderers/HW/GSRendererHW.cpp @@ -120,6 +120,11 @@ void GSRendererHW::PurgeTextureCache() m_tc->RemoveAll(); } +GSTexture* GSRendererHW::LookupPaletteSource(u32 CBP, u32 CPSM, u32 CBW, GSVector2i& offset, const GSVector2i& size) +{ + return m_tc->LookupPaletteSource(CBP, CPSM, CBW, offset, size); +} + bool GSRendererHW::UpdateTexIsFB(GSTextureCache::Target* dst, const GIFRegTEX0& TEX0) { if (GSConfig.AccurateBlendingUnit == AccBlendLevel::Minimum || !g_gs_device->Features().texture_barrier) @@ -1406,11 +1411,12 @@ void GSRendererHW::Draw() } // SW CLUT Render enable. 
- if (GSConfig.UserHacks_CPUCLUTRender > 0) + bool preload = GSConfig.PreloadFrameWithGSData; + if (GSConfig.UserHacks_CPUCLUTRender > 0 || GSConfig.UserHacks_GPUTargetCLUTMode != GSGPUTargetCLUTMode::Disabled) { - bool result = (GSConfig.UserHacks_CPUCLUTRender == 1) ? PossibleCLUTDraw() : PossibleCLUTDrawAggressive(); + const CLUTDrawTestResult result = (GSConfig.UserHacks_CPUCLUTRender == 2) ? PossibleCLUTDrawAggressive() : PossibleCLUTDraw(); m_mem.m_clut.ClearDrawInvalidity(); - if (result) + if (result == CLUTDrawTestResult::CLUTDrawOnCPU && GSConfig.UserHacks_CPUCLUTRender > 0) { if (SwPrimRender(*this, true)) { @@ -1418,6 +1424,17 @@ void GSRendererHW::Draw() return; } } + else if (result != CLUTDrawTestResult::NotCLUTDraw) + { + // Force enable preloading if any of the existing data is needed. + // e.g. NFSMW only writes the alpha channel, and needs the RGB preloaded. + if (((fm & fm_mask) != fm_mask) || // Some channels masked + !IsOpaque()) // Blending enabled + { + GL_INS("Forcing preload due to partial/blended CLUT draw"); + preload = true; + } + } } if (m_channel_shuffle) @@ -1743,7 +1760,7 @@ void GSRendererHW::Draw() GSTextureCache::Target* rt = nullptr; if (!no_rt) - rt = m_tc->LookupTarget(TEX0, t_size, GSTextureCache::RenderTarget, true, fm); + rt = m_tc->LookupTarget(TEX0, t_size, GSTextureCache::RenderTarget, true, fm, false, 0, 0, preload); TEX0.TBP0 = context->ZBUF.Block(); TEX0.TBW = context->FRAME.FBW; @@ -1751,7 +1768,7 @@ void GSRendererHW::Draw() GSTextureCache::Target* ds = nullptr; if (!no_ds) - ds = m_tc->LookupTarget(TEX0, t_size, GSTextureCache::DepthStencil, context->DepthWrite()); + ds = m_tc->LookupTarget(TEX0, t_size, GSTextureCache::DepthStencil, context->DepthWrite(), 0, false, 0, 0, preload); if (rt) { @@ -3964,46 +3981,46 @@ void GSRendererHW::DrawPrims(GSTexture* rt, GSTexture* ds, GSTextureCache::Sourc g_gs_device->RenderHW(m_conf); } -bool GSRendererHW::PossibleCLUTDraw() +GSRendererHW::CLUTDrawTestResult 
GSRendererHW::PossibleCLUTDraw() { // No shuffles. if (m_channel_shuffle || m_texture_shuffle) - return false; + return CLUTDrawTestResult::NotCLUTDraw; // Keep the draws simple, no alpha testing, blending, mipmapping, Z writes, and make sure it's flat. const bool fb_only = m_context->TEST.ATE && m_context->TEST.AFAIL == 1 && m_context->TEST.ATST == ATST_NEVER; // No Z writes, unless it's points, then it's quite likely to be a palette and they left it on. if (!m_context->ZBUF.ZMSK && !fb_only && !(m_vt.m_primclass == GS_POINT_CLASS)) - return false; + return CLUTDrawTestResult::NotCLUTDraw; // Make sure it's flat. if (m_vt.m_eq.z != 0x1) - return false; + return CLUTDrawTestResult::NotCLUTDraw; // No mipmapping, please never be any mipmapping... if (m_context->TEX1.MXL) - return false; + return CLUTDrawTestResult::NotCLUTDraw; // Writing to the framebuffer for output. We're not interested. - Note: This stops NFS HP2 Busted screens working, but they're glitchy anyway // what NFS HP2 really needs is a kind of shuffle with mask, 32bit target is interpreted as 16bit and masked. if ((m_regs->DISP[0].DISPFB.Block() == m_context->FRAME.Block()) || (m_regs->DISP[1].DISPFB.Block() == m_context->FRAME.Block()) || (PRIM->TME && ((m_regs->DISP[0].DISPFB.Block() == m_context->TEX0.TBP0) || (m_regs->DISP[1].DISPFB.Block() == m_context->TEX0.TBP0)) && !(m_mem.m_clut.IsInvalid() & 2))) - return false; + return CLUTDrawTestResult::NotCLUTDraw; // Ignore recursive/shuffle effects, but possible it will recursively draw, but make sure it's staying in page width if (PRIM->TME && m_context->TEX0.TBP0 == m_context->FRAME.Block() && (m_context->FRAME.FBW != 1 && m_context->TEX0.TBW == m_context->FRAME.FBW)) - return false; + return CLUTDrawTestResult::NotCLUTDraw; // Hopefully no games draw a CLUT with a CLUT, that would be evil, most likely a channel shuffle. 
if (PRIM->TME && GSLocalMemory::m_psm[m_context->TEX0.PSM].pal > 0) - return false; + return CLUTDrawTestResult::NotCLUTDraw; const GSLocalMemory::psm_t& psm = GSLocalMemory::m_psm[m_context->FRAME.PSM]; // Make sure the CLUT formats are matching. if (GSLocalMemory::m_psm[m_mem.m_clut.GetCLUTCPSM()].bpp != psm.bpp) - return false; + return CLUTDrawTestResult::NotCLUTDraw; // Max size for a CLUT/Current page size. constexpr float min_clut_width = 7.0f; @@ -4013,7 +4030,7 @@ bool GSRendererHW::PossibleCLUTDraw() // If the coordinates aren't starting within the page, it's likely not a CLUT draw. if (floor(m_vt.m_min.p.x) < 0 || floor(m_vt.m_min.p.y) < 0 || floor(m_vt.m_min.p.x) > page_width || floor(m_vt.m_min.p.y) > page_height) - return false; + return CLUTDrawTestResult::NotCLUTDraw; // Make sure it's a division of 8 in width to avoid bad draws. Points will go from 0-7 inclusive, but sprites etc will do 0-16 exclusive. int draw_divder_match = false; @@ -4035,13 +4052,36 @@ bool GSRendererHW::PossibleCLUTDraw() // Make sure the draw hits the next CLUT and it's marked as invalid (kind of a sanity check). // We can also allow draws which are of a sensible size within the page, as they could also be CLUT draws (or gradients for the CLUT). if (!valid_size) - return false; + return CLUTDrawTestResult::NotCLUTDraw; if (PRIM->TME) { // If we're using a texture to draw our CLUT/whatever, we need the GPU to write back dirty data we need. const GSVector4i r = GetTextureMinMax(m_context->TEX0, m_context->CLAMP, m_vt.IsLinear()).coverage; + // If we have GPU CLUT enabled, don't do a CPU draw when it would result in a download. 
+ if (GSConfig.UserHacks_GPUTargetCLUTMode != GSGPUTargetCLUTMode::Disabled) + { + GSTextureCache::Target* tgt = m_tc->GetExactTarget(m_context->TEX0.TBP0, m_context->TEX0.TBW, m_context->TEX0.PSM); + if (tgt) + { + bool is_dirty = false; + for (const GSDirtyRect& rc : tgt->m_dirty) + { + if (!rc.GetDirtyRect(m_context->TEX0).rintersect(r).rempty()) + { + is_dirty = true; + break; + } + } + if (!is_dirty) + { + GL_INS("GPU clut is enabled and this draw would readback, leaving on GPU"); + return CLUTDrawTestResult::CLUTDrawOnGPU; + } + } + } + GIFRegBITBLTBUF BITBLTBUF; BITBLTBUF.SBP = m_context->TEX0.TBP0; BITBLTBUF.SBW = m_context->TEX0.TBW; @@ -4054,41 +4094,41 @@ bool GSRendererHW::PossibleCLUTDraw() //const u32 endbp = psm.info.bn(m_vt.m_max.p.x, m_vt.m_max.p.y, m_context->FRAME.Block(), m_context->FRAME.FBW); //DevCon.Warning("Draw width %f height %f page width %f height %f TPSM %x TBP0 %x FPSM %x FBP %x CBP %x valid size %d Invalid %d DISPFB0 %x DISPFB1 %x start %x end %x draw %d", draw_width, draw_height, page_width, page_height, m_context->TEX0.PSM, m_context->TEX0.TBP0, m_context->FRAME.PSM, m_context->FRAME.Block(), m_mem.m_clut.GetCLUTCBP(), valid_size, m_mem.m_clut.IsInvalid(), m_regs->DISP[0].DISPFB.Block(), m_regs->DISP[1].DISPFB.Block(), startbp, endbp, s_n); - return true; + return CLUTDrawTestResult::CLUTDrawOnCPU; } // Slight more aggressive version that kinda YOLO's it if the draw is anywhere near the CLUT or is point/line (providing it's not too wide of a draw and a few other parameters. // This is pretty much tuned for the Sega Model 2 games, which draw a huge gradient, then pick lines out of it to make up CLUT's for about 4000 draws... -bool GSRendererHW::PossibleCLUTDrawAggressive() +GSRendererHW::CLUTDrawTestResult GSRendererHW::PossibleCLUTDrawAggressive() { // Avoid any shuffles. 
if (m_channel_shuffle || m_texture_shuffle) - return false; + return CLUTDrawTestResult::NotCLUTDraw; // Keep the draws simple, no alpha testing, blending, mipmapping, Z writes, and make sure it's flat. if (m_context->TEST.ATE) - return false; + return CLUTDrawTestResult::NotCLUTDraw; if (PRIM->ABE) - return false; + return CLUTDrawTestResult::NotCLUTDraw; if (m_context->TEX1.MXL) - return false; + return CLUTDrawTestResult::NotCLUTDraw; if (m_context->FRAME.FBW != 1) - return false; + return CLUTDrawTestResult::NotCLUTDraw; if (!m_context->ZBUF.ZMSK) - return false; + return CLUTDrawTestResult::NotCLUTDraw; if (m_vt.m_eq.z != 0x1) - return false; + return CLUTDrawTestResult::NotCLUTDraw; if (!((m_vt.m_primclass == GS_POINT_CLASS || m_vt.m_primclass == GS_LINE_CLASS) || ((m_mem.m_clut.GetCLUTCBP() >> 5) >= m_context->FRAME.FBP && (m_context->FRAME.FBP + 1U) >= (m_mem.m_clut.GetCLUTCBP() >> 5) && m_vt.m_primclass == GS_SPRITE_CLASS))) - return false; + return CLUTDrawTestResult::NotCLUTDraw; // Avoid invalidating anything here, we just want to avoid the thing being drawn on the GPU. 
- return true; + return CLUTDrawTestResult::CLUTDrawOnCPU; } bool GSRendererHW::CanUseSwPrimRender(bool no_rt, bool no_ds, bool draw_sprite_tex) diff --git a/pcsx2/GS/Renderers/HW/GSRendererHW.h b/pcsx2/GS/Renderers/HW/GSRendererHW.h index 730832f506..13a26fb782 100644 --- a/pcsx2/GS/Renderers/HW/GSRendererHW.h +++ b/pcsx2/GS/Renderers/HW/GSRendererHW.h @@ -66,8 +66,15 @@ private: void SwSpriteRender(); bool CanUseSwSpriteRender(); - bool PossibleCLUTDraw(); - bool PossibleCLUTDrawAggressive(); + enum class CLUTDrawTestResult + { + NotCLUTDraw, + CLUTDrawOnCPU, + CLUTDrawOnGPU, + }; + + CLUTDrawTestResult PossibleCLUTDraw(); + CLUTDrawTestResult PossibleCLUTDrawAggressive(); bool CanUseSwPrimRender(bool no_rt, bool no_ds, bool draw_sprite_tex); bool (*SwPrimRender)(GSRendererHW&, bool invalidate_tc); @@ -153,6 +160,7 @@ public: void Draw() override; void PurgeTextureCache() override; + GSTexture* LookupPaletteSource(u32 CBP, u32 CPSM, u32 CBW, GSVector2i& offset, const GSVector2i& size) override; // Called by the texture cache to know if current texture is useful bool UpdateTexIsFB(GSTextureCache::Target* src, const GIFRegTEX0& TEX0); diff --git a/pcsx2/GS/Renderers/HW/GSTextureCache.cpp b/pcsx2/GS/Renderers/HW/GSTextureCache.cpp index d408e91e32..6b4cc0b757 100644 --- a/pcsx2/GS/Renderers/HW/GSTextureCache.cpp +++ b/pcsx2/GS/Renderers/HW/GSTextureCache.cpp @@ -224,12 +224,13 @@ GSTextureCache::Source* GSTextureCache::LookupDepthSource(const GIFRegTEX0& TEX0 GSTextureCache::Source* GSTextureCache::LookupSource(const GIFRegTEX0& TEX0, const GIFRegTEXA& TEXA, const GSVector4i& r, const GSVector2i* lod) { - GL_CACHE("TC: Lookup Source <%d,%d => %d,%d> (0x%x, %s, BW: %u)", r.x, r.y, r.z, r.w, TEX0.TBP0, psm_str(TEX0.PSM), TEX0.TBW); + GL_CACHE("TC: Lookup Source <%d,%d => %d,%d> (0x%x, %s, BW: %u, CBP: 0x%x)", r.x, r.y, r.z, r.w, TEX0.TBP0, psm_str(TEX0.PSM), TEX0.TBW, TEX0.CBP); const GSLocalMemory::psm_t& psm_s = GSLocalMemory::m_psm[TEX0.PSM]; //const 
GSLocalMemory::psm_t& cpsm = psm.pal > 0 ? GSLocalMemory::m_psm[TEX0.CPSM] : psm; - const u32* clut = g_gs_renderer->m_mem.m_clut; + const u32* const clut = g_gs_renderer->m_mem.m_clut; + GSTexture* const gpu_clut = (psm_s.pal > 0) ? g_gs_renderer->m_mem.m_clut.GetGPUTexture() : nullptr; Source* src = NULL; @@ -246,16 +247,25 @@ GSTextureCache::Source* GSTextureCache::LookupSource(const GIFRegTEX0& TEX0, con // Target are converted (AEM & palette) on the fly by the GPU. They don't need extra check if (!s->m_target) { - // We request a palette texture (psm_s.pal). If the texture was - // converted by the CPU (!s->m_palette), we need to ensure - // palette content is the same. - if (psm_s.pal > 0 && !s->m_palette && !s->ClutMatch({clut, psm_s.pal})) - continue; + if (psm_s.pal > 0) + { + // If we're doing GPU CLUT, we don't want to use the CPU-converted version. + if (gpu_clut && !s->m_palette) + continue; - // We request a 24/16 bit RGBA texture. Alpha expansion was done by - // the CPU. We need to check that TEXA is identical - if (psm_s.pal == 0 && psm_s.fmt > 0 && s->m_TEXA.U64 != TEXA.U64) - continue; + // We request a palette texture (psm_s.pal). If the texture was + // converted by the CPU (!s->m_palette), we need to ensure + // palette content is the same. + if (!s->m_palette && !s->ClutMatch({ clut, psm_s.pal })) + continue; + } + else + { + // We request a 24/16 bit RGBA texture. Alpha expansion was done by + // the CPU. We need to check that TEXA is identical + if (psm_s.fmt > 0 && s->m_TEXA.U64 != TEXA.U64) + continue; + } // Same base mip texture, but we need to check that MXL was the same as well. // When mipmapping is off, this will be 0,0 vs 0,0. 
@@ -404,9 +414,7 @@ GSTextureCache::Source* GSTextureCache::LookupSource(const GIFRegTEX0& TEX0, con } } - bool new_source = false; - - if (src == NULL) + if (!src) { #ifdef ENABLE_OGL_DEBUG if (dst) @@ -425,8 +433,7 @@ GSTextureCache::Source* GSTextureCache::LookupSource(const GIFRegTEX0& TEX0, con GL_CACHE("TC: src miss (0x%x, 0x%x, %s)", TEX0.TBP0, psm_s.pal > 0 ? TEX0.CBP : 0, psm_str(TEX0.PSM)); } #endif - src = CreateSource(TEX0, TEXA, dst, half_right, x_offset, y_offset, lod, &r); - new_source = true; + src = CreateSource(TEX0, TEXA, dst, half_right, x_offset, y_offset, lod, &r, gpu_clut); } else { @@ -434,11 +441,11 @@ GSTextureCache::Source* GSTextureCache::LookupSource(const GIFRegTEX0& TEX0, con src->m_texture ? src->m_texture->GetID() : 0, TEX0.TBP0, psm_s.pal > 0 ? TEX0.CBP : 0, psm_str(TEX0.PSM)); - } - if (src->m_palette && !new_source && !src->ClutMatch({clut, psm_s.pal})) - { - AttachPaletteToSource(src, psm_s.pal, true); + if (gpu_clut) + AttachPaletteToSource(src, gpu_clut); + else if (src->m_palette && (!src->m_palette_obj || !src->ClutMatch({clut, psm_s.pal}))) + AttachPaletteToSource(src, psm_s.pal, true); } src->Update(r); @@ -448,7 +455,7 @@ GSTextureCache::Source* GSTextureCache::LookupSource(const GIFRegTEX0& TEX0, con return src; } -GSTextureCache::Target* GSTextureCache::LookupTarget(const GIFRegTEX0& TEX0, const GSVector2i& size, int type, bool used, u32 fbmask, const bool is_frame, const int real_w, const int real_h) +GSTextureCache::Target* GSTextureCache::LookupTarget(const GIFRegTEX0& TEX0, const GSVector2i& size, int type, bool used, u32 fbmask, const bool is_frame, const int real_w, const int real_h, bool preload) { const GSLocalMemory::psm_t& psm_s = GSLocalMemory::m_psm[TEX0.PSM]; const GSVector2& new_s = static_cast(g_gs_renderer.get())->GetTextureScaleFactor(); @@ -656,7 +663,7 @@ GSTextureCache::Target* GSTextureCache::LookupTarget(const GIFRegTEX0& TEX0, con // From a performance point of view, it might cost a little on big 
upscaling // but normally few RT are miss so it must remain reasonable. bool supported_fmt = !GSConfig.UserHacks_DisableDepthSupport || psm_s.depth == 0; - if (GSConfig.PreloadFrameWithGSData && TEX0.TBW > 0 && supported_fmt) + if (preload && TEX0.TBW > 0 && supported_fmt) { GL_INS("Preloading the RT DATA"); // RT doesn't have height but if we use a too big value, we will read outside of the GS memory. @@ -1726,7 +1733,7 @@ void GSTextureCache::IncAge() } //Fixme: Several issues in here. Not handling depth stencil, pitch conversion doesnt work. -GSTextureCache::Source* GSTextureCache::CreateSource(const GIFRegTEX0& TEX0, const GIFRegTEXA& TEXA, Target* dst, bool half_right, int x_offset, int y_offset, const GSVector2i* lod, const GSVector4i* src_range) +GSTextureCache::Source* GSTextureCache::CreateSource(const GIFRegTEX0& TEX0, const GIFRegTEXA& TEXA, Target* dst, bool half_right, int x_offset, int y_offset, const GSVector2i* lod, const GSVector4i* src_range, GSTexture* gpu_clut) { const GSLocalMemory::psm_t& psm = GSLocalMemory::m_psm[TEX0.PSM]; Source* src = new Source(TEX0, TEXA, false); @@ -2042,28 +2049,33 @@ GSTextureCache::Source* GSTextureCache::CreateSource(const GIFRegTEX0& TEX0, con else { // maintain the clut even when paltex is on for the dump/replacement texture lookup - bool paltex = (GSConfig.GPUPaletteConversion && psm.pal > 0); + bool paltex = (GSConfig.GPUPaletteConversion && psm.pal > 0) || gpu_clut; const u32* clut = (psm.pal > 0) ? 
static_cast(g_gs_renderer->m_mem.m_clut) : nullptr; // try the hash cache if ((src->m_from_hash_cache = LookupHashCache(TEX0, TEXA, paltex, clut, lod)) != nullptr) { src->m_texture = src->m_from_hash_cache->texture; - if (psm.pal > 0) + if (gpu_clut) + AttachPaletteToSource(src, gpu_clut); + else if (psm.pal > 0) AttachPaletteToSource(src, psm.pal, paltex); } else if (paltex) { src->m_texture = g_gs_device->CreateTexture(tw, th, tlevels, GSTexture::Format::UNorm8); - AttachPaletteToSource(src, psm.pal, true); + if (gpu_clut) + AttachPaletteToSource(src, gpu_clut); + else + AttachPaletteToSource(src, psm.pal, true); } else { src->m_texture = g_gs_device->CreateTexture(tw, th, tlevels, GSTexture::Format::Color); - if (psm.pal > 0) - { + if (gpu_clut) + AttachPaletteToSource(src, gpu_clut); + else if (psm.pal > 0) AttachPaletteToSource(src, psm.pal, false); - } } } @@ -2243,6 +2255,71 @@ GSTextureCache::Target* GSTextureCache::CreateTarget(const GIFRegTEX0& TEX0, int return t; } +GSTexture* GSTextureCache::LookupPaletteSource(u32 CBP, u32 CPSM, u32 CBW, GSVector2i& offset, const GSVector2i& size) +{ + for (auto t : m_dst[RenderTarget]) + { + if (!t->m_used) + continue; + + GSVector2i this_offset; + if (t->m_TEX0.TBP0 == CBP) + { + // Exact match, this one's likely fine, unless the format is different. + if (t->m_TEX0.PSM != CPSM || (CBW != 0 && t->m_TEX0.TBW != CBW)) + continue; + + GL_INS("Exact match on BP 0x%04x BW %u", t->m_TEX0.CBP, t->m_TEX0.TBW); + this_offset.x = 0; + this_offset.y = 0; + } + else if (GSConfig.UserHacks_GPUTargetCLUTMode == GSGPUTargetCLUTMode::InsideTarget && + t->m_TEX0.TBP0 < CBP && t->m_end_block >= CBP) + { + // Somewhere within this target, can we find it? 
+ const GSVector4i rc(0, 0, size.x, size.y); + SurfaceOffset so = ComputeSurfaceOffset(CBP, std::max(CBW, 0), CPSM, rc, t); + if (!so.is_valid) + continue; + + GL_INS("Match inside RT at BP 0x%04X-0x%04X BW %u", t->m_TEX0.TBP0, t->m_end_block, t->m_TEX0.TBW); + this_offset.x = so.b2a_offset.left; + this_offset.y = so.b2a_offset.top; + } + else + { + // Not inside this target, skip. + continue; + } + + // Make sure the clut isn't in an area of the target where the EE has overwritten it. + // Otherwise, we'll be using stale data on the CPU. + if (!t->m_dirty.empty()) + { + GL_INS("Candidate is dirty, checking"); + + const GSVector4i clut_rc(this_offset.x, this_offset.y, this_offset.x + size.x, this_offset.y + size.y); + bool is_dirty = false; + for (const GSDirtyRect& dirty : t->m_dirty) + { + if (!dirty.GetDirtyRect(t->m_TEX0).rintersect(clut_rc).rempty()) + { + GL_INS("Dirty rectangle overlaps CLUT rectangle, skipping"); + is_dirty = true; + break; + } + } + if (is_dirty) + continue; + } + + offset = this_offset; + return t->m_texture; + } + + return nullptr; +} + void GSTextureCache::Read(Target* t, const GSVector4i& r) { if (!t->m_dirty.empty() || r.width() == 0 || r.height() == 0) @@ -2980,6 +3057,12 @@ void GSTextureCache::AttachPaletteToSource(Source* s, u16 pal, bool need_gs_text s->m_palette = need_gs_texture ? s->m_palette_obj->GetPaletteGSTexture() : nullptr; } +void GSTextureCache::AttachPaletteToSource(Source* s, GSTexture* gpu_clut) +{ + s->m_palette_obj = nullptr; + s->m_palette = gpu_clut; +} + GSTextureCache::SurfaceOffset GSTextureCache::ComputeSurfaceOffset(const GSOffset& off, const GSVector4i& r, const Target* t) { // Computes offset from Target to offset+rectangle in Target coords. 
diff --git a/pcsx2/GS/Renderers/HW/GSTextureCache.h b/pcsx2/GS/Renderers/HW/GSTextureCache.h index c1f252790d..393d8babcd 100644 --- a/pcsx2/GS/Renderers/HW/GSTextureCache.h +++ b/pcsx2/GS/Renderers/HW/GSTextureCache.h @@ -308,7 +308,7 @@ protected: std::unordered_map m_surface_offset_cache; Source* m_temporary_source = nullptr; // invalidated after the draw - Source* CreateSource(const GIFRegTEX0& TEX0, const GIFRegTEXA& TEXA, Target* t = NULL, bool half_right = false, int x_offset = 0, int y_offset = 0, const GSVector2i* lod = nullptr, const GSVector4i* src_range = nullptr); + Source* CreateSource(const GIFRegTEX0& TEX0, const GIFRegTEXA& TEXA, Target* t, bool half_right, int x_offset, int y_offset, const GSVector2i* lod, const GSVector4i* src_range, GSTexture* gpu_clut); Target* CreateTarget(const GIFRegTEX0& TEX0, int w, int h, int type, const bool clear); /// Expands a target when the block pointer for a display framebuffer is within another target, but the read offset @@ -337,10 +337,12 @@ public: void RemovePartial(); void AddDirtyRectTarget(Target* target, GSVector4i rect, u32 psm, u32 bw); + GSTexture* LookupPaletteSource(u32 CBP, u32 CPSM, u32 CBW, GSVector2i& offset, const GSVector2i& size); + Source* LookupSource(const GIFRegTEX0& TEX0, const GIFRegTEXA& TEXA, const GSVector4i& r, const GSVector2i* lod); Source* LookupDepthSource(const GIFRegTEX0& TEX0, const GIFRegTEXA& TEXA, const GSVector4i& r, bool palette = false); - Target* LookupTarget(const GIFRegTEX0& TEX0, const GSVector2i& size, int type, bool used, u32 fbmask = 0, const bool is_frame = false, const int real_w = 0, const int real_h = 0); + Target* LookupTarget(const GIFRegTEX0& TEX0, const GSVector2i& size, int type, bool used, u32 fbmask = 0, const bool is_frame = false, const int real_w = 0, const int real_h = 0, bool preload = GSConfig.PreloadFrameWithGSData); Target* LookupDisplayTarget(const GIFRegTEX0& TEX0, const GSVector2i& size, const int real_w, const int real_h); /// Looks up a 
target in the cache, and only returns it if the BP/BW/PSM match exactly. @@ -367,6 +369,7 @@ public: void PrintMemoryUsage(); void AttachPaletteToSource(Source* s, u16 pal, bool need_gs_texture); + void AttachPaletteToSource(Source* s, GSTexture* gpu_clut); SurfaceOffset ComputeSurfaceOffset(const GSOffset& off, const GSVector4i& r, const Target* t); SurfaceOffset ComputeSurfaceOffset(const uint32_t bp, const uint32_t bw, const uint32_t psm, const GSVector4i& r, const Target* t); SurfaceOffset ComputeSurfaceOffset(const SurfaceOffsetKey& sok); diff --git a/pcsx2/GS/Renderers/OpenGL/GSDeviceOGL.cpp b/pcsx2/GS/Renderers/OpenGL/GSDeviceOGL.cpp index a94782df5b..efa5df6f2c 100644 --- a/pcsx2/GS/Renderers/OpenGL/GSDeviceOGL.cpp +++ b/pcsx2/GS/Renderers/OpenGL/GSDeviceOGL.cpp @@ -295,7 +295,14 @@ bool GSDeviceOGL::Create() m_convert.ps[i].SetFormattedName("Convert pipe %s", name); if (static_cast(i) == ShaderConvert::YUV) + { m_convert.ps[i].RegisterUniform("EMOD"); + } + else if (static_cast(i) == ShaderConvert::CLUT_4 || static_cast(i) == ShaderConvert::CLUT_8) + { + m_convert.ps[i].RegisterUniform("offset"); + m_convert.ps[i].RegisterUniform("scale"); + } } const PSSamplerSelector point; @@ -1278,6 +1285,30 @@ void GSDeviceOGL::PresentRect(GSTexture* sTex, const GSVector4& sRect, GSTexture EndScene(); } +void GSDeviceOGL::UpdateCLUTTexture(GSTexture* sTex, u32 offsetX, u32 offsetY, GSTexture* dTex, u32 dOffset, u32 dSize) +{ + BeginScene(); + + const ShaderConvert shader = (dSize == 16) ? 
ShaderConvert::CLUT_4 : ShaderConvert::CLUT_8; + GL::Program& prog = m_convert.ps[static_cast(shader)]; + prog.Bind(); + prog.Uniform3ui(0, offsetX, offsetY, dOffset); + prog.Uniform2f(1, sTex->GetScale().x, sTex->GetScale().y); + + OMSetDepthStencilState(m_convert.dss); + OMSetBlendState(false); + OMSetColorMaskState(); + OMSetRenderTargets(dTex, nullptr); + + PSSetShaderResource(0, sTex); + PSSetSamplerState(m_convert.pt); + + const GSVector4 dRect(0, 0, dSize, 1); + DrawStretchRect(GSVector4::zero(), dRect, dTex->GetSize()); + + EndScene(); +} + void GSDeviceOGL::DrawStretchRect(const GSVector4& sRect, const GSVector4& dRect, const GSVector2i& ds) { // Original code from DX diff --git a/pcsx2/GS/Renderers/OpenGL/GSDeviceOGL.h b/pcsx2/GS/Renderers/OpenGL/GSDeviceOGL.h index 56bec7eba2..21801cf6b5 100644 --- a/pcsx2/GS/Renderers/OpenGL/GSDeviceOGL.h +++ b/pcsx2/GS/Renderers/OpenGL/GSDeviceOGL.h @@ -334,6 +334,7 @@ public: void StretchRect(GSTexture* sTex, const GSVector4& sRect, GSTexture* dTex, const GSVector4& dRect, bool red, bool green, bool blue, bool alpha) final; void StretchRect(GSTexture* sTex, const GSVector4& sRect, GSTexture* dTex, const GSVector4& dRect, const GL::Program& ps, bool alpha_blend, OMColorMaskSelector cms, bool linear = true); void PresentRect(GSTexture* sTex, const GSVector4& sRect, GSTexture* dTex, const GSVector4& dRect, PresentShader shader, float shaderTime, bool linear) final; + void UpdateCLUTTexture(GSTexture* sTex, u32 offsetX, u32 offsetY, GSTexture* dTex, u32 dOffset, u32 dSize) final; void RenderHW(GSHWDrawConfig& config) final; void SendHWDraw(const GSHWDrawConfig& config, bool needs_barrier); diff --git a/pcsx2/GS/Renderers/Vulkan/GSDeviceVK.cpp b/pcsx2/GS/Renderers/Vulkan/GSDeviceVK.cpp index 98ea92f3b2..55e69aaedf 100644 --- a/pcsx2/GS/Renderers/Vulkan/GSDeviceVK.cpp +++ b/pcsx2/GS/Renderers/Vulkan/GSDeviceVK.cpp @@ -763,6 +763,23 @@ void GSDeviceVK::BlitRect(GSTexture* sTex, const GSVector4i& sRect, u32 sLevel, &ib, 
linear ? VK_FILTER_LINEAR : VK_FILTER_NEAREST); } +void GSDeviceVK::UpdateCLUTTexture(GSTexture* sTex, u32 offsetX, u32 offsetY, GSTexture* dTex, u32 dOffset, u32 dSize) +{ + struct Uniforms + { + float scaleX, scaleY; + u32 offsetX, offsetY, dOffset; + }; + + const Uniforms uniforms = {sTex->GetScale().x, sTex->GetScale().y, offsetX, offsetY, dOffset}; + SetUtilityPushConstants(&uniforms, sizeof(uniforms)); + + const GSVector4 dRect(0, 0, dSize, 1); + const ShaderConvert shader = (dSize == 16) ? ShaderConvert::CLUT_4 : ShaderConvert::CLUT_8; + DoStretchRect(static_cast(sTex), GSVector4::zero(), static_cast(dTex), dRect, + m_convert[static_cast(shader)], false); +} + void GSDeviceVK::DoMerge(GSTexture* sTex[3], GSVector4* sRect, GSTexture* dTex, GSVector4* dRect, const GSRegPMODE& PMODE, const GSRegEXTBUF& EXTBUF, const GSVector4& c) { diff --git a/pcsx2/GS/Renderers/Vulkan/GSDeviceVK.h b/pcsx2/GS/Renderers/Vulkan/GSDeviceVK.h index 00f840db53..598245e08b 100644 --- a/pcsx2/GS/Renderers/Vulkan/GSDeviceVK.h +++ b/pcsx2/GS/Renderers/Vulkan/GSDeviceVK.h @@ -251,6 +251,8 @@ public: void BlitRect(GSTexture* sTex, const GSVector4i& sRect, u32 sLevel, GSTexture* dTex, const GSVector4i& dRect, u32 dLevel, bool linear); + void UpdateCLUTTexture(GSTexture* sTex, u32 offsetX, u32 offsetY, GSTexture* dTex, u32 dOffset, u32 dSize) override; + void SetupDATE(GSTexture* rt, GSTexture* ds, bool datm, const GSVector4i& bbox); GSTextureVK* SetupPrimitiveTrackingDATE(GSHWDrawConfig& config); diff --git a/pcsx2/Pcsx2Config.cpp b/pcsx2/Pcsx2Config.cpp index 31ad0bd9ec..f2c733a004 100644 --- a/pcsx2/Pcsx2Config.cpp +++ b/pcsx2/Pcsx2Config.cpp @@ -505,6 +505,7 @@ bool Pcsx2Config::GSOptions::OptionsAreEqual(const GSOptions& right) const OpEqu(UserHacks_TCOffsetY) && OpEqu(UserHacks_CPUSpriteRenderBW) && OpEqu(UserHacks_CPUCLUTRender) && + OpEqu(UserHacks_GPUTargetCLUTMode) && OpEqu(OverrideTextureBarriers) && OpEqu(OverrideGeometryShaders) && @@ -681,6 +682,7 @@ void 
Pcsx2Config::GSOptions::LoadSave(SettingsWrapper& wrap) GSSettingIntEx(UserHacks_TCOffsetY, "UserHacks_TCOffsetY"); GSSettingIntEx(UserHacks_CPUSpriteRenderBW, "UserHacks_CPUSpriteRenderBW"); GSSettingIntEx(UserHacks_CPUCLUTRender, "UserHacks_CPUCLUTRender"); + GSSettingIntEnumEx(UserHacks_GPUTargetCLUTMode, "UserHacks_GPUTargetCLUTMode"); GSSettingIntEnumEx(TriFilter, "TriFilter"); GSSettingIntEx(OverrideTextureBarriers, "OverrideTextureBarriers"); GSSettingIntEx(OverrideGeometryShaders, "OverrideGeometryShaders"); @@ -746,6 +748,7 @@ void Pcsx2Config::GSOptions::MaskUserHacks() UserHacks_TCOffsetY = 0; UserHacks_CPUSpriteRenderBW = 0; UserHacks_CPUCLUTRender = 0; + UserHacks_GPUTargetCLUTMode = GSGPUTargetCLUTMode::Disabled; SkipDrawStart = 0; SkipDrawEnd = 0; }