diff --git a/pcsx2/CMakeLists.txt b/pcsx2/CMakeLists.txt index 337f0cf11f..0ab0afa8e6 100644 --- a/pcsx2/CMakeLists.txt +++ b/pcsx2/CMakeLists.txt @@ -614,6 +614,7 @@ set(pcsx2GSSources GS/Renderers/Null/GSTextureNull.cpp GS/Renderers/HW/GSHwHack.cpp GS/Renderers/HW/GSRendererHW.cpp + GS/Renderers/HW/GSRendererNew.cpp GS/Renderers/HW/GSTextureCache.cpp GS/Renderers/SW/GSDrawScanline.cpp GS/Renderers/SW/GSDrawScanlineCodeGenerator.cpp @@ -681,6 +682,7 @@ set(pcsx2GSHeaders GS/Renderers/Null/GSRendererNull.h GS/Renderers/Null/GSTextureNull.h GS/Renderers/HW/GSRendererHW.h + GS/Renderers/HW/GSRendererNew.h GS/Renderers/HW/GSTextureCache.h GS/Renderers/HW/GSVertexHW.h GS/Renderers/SW/GSDrawScanlineCodeGenerator.h diff --git a/pcsx2/GS/GS.cpp b/pcsx2/GS/GS.cpp index 63b6b2957e..6539e9859f 100644 --- a/pcsx2/GS/GS.cpp +++ b/pcsx2/GS/GS.cpp @@ -23,6 +23,7 @@ #include "Renderers/Null/GSDeviceNull.h" #include "Renderers/OpenGL/GSDeviceOGL.h" #include "Renderers/OpenGL/GSRendererOGL.h" +#include "Renderers/HW/GSRendererNew.h" #include "GSLzma.h" #include "common/pxStreams.h" @@ -218,13 +219,9 @@ int _GSopen(const WindowInfo& wi, const char* title, GSRendererType renderer, in switch (renderer) { default: -#ifdef _WIN32 case GSRendererType::DX1011_HW: - s_gs = (GSRenderer*)new GSRendererDX11(); - break; -#endif case GSRendererType::OGL_HW: - s_gs = (GSRenderer*)new GSRendererOGL(); + s_gs = (GSRenderer*)new GSRendererNew(); break; case GSRendererType::OGL_SW: s_gs = new GSRendererSW(threads); diff --git a/pcsx2/GS/Renderers/Common/GSDevice.h b/pcsx2/GS/Renderers/Common/GSDevice.h index eb8e7c432b..7af1c279d9 100644 --- a/pcsx2/GS/Renderers/Common/GSDevice.h +++ b/pcsx2/GS/Renderers/Common/GSDevice.h @@ -134,8 +134,360 @@ struct HWBlend u16 flags, op, src, dst; }; +struct GSHWDrawConfig +{ + enum class Topology: u8 + { + Point, + Line, + Triangle, + }; + enum class GSTopology: u8 + { + Point, + Line, + Triangle, + Sprite, + }; + struct GSSelector + { + union + { + struct + { + 
GSTopology topology : 2; + bool expand : 1; + bool iip : 1; + }; + u8 key; + }; + GSSelector(): key(0) {} + GSSelector(u8 k): key(k) {} + }; + struct VSSelector + { + union + { + struct + { + u8 fst : 1; + u8 tme : 1; + u8 _free : 6; + }; + u8 key; + }; + VSSelector(): key(0) {} + VSSelector(u8 k): key(k) {} + }; + struct PSSelector + { + // Performance note: there are too many shader combinations + // It might hurt the performance due to frequent toggling worse it could consume + // a lots of memory. + union + { + struct + { + // *** Word 1 + // Format + u32 aem_fmt : 2; + u32 pal_fmt : 2; + u32 dfmt : 2; // 0 → 32-bit, 1 → 24-bit, 2 → 16-bit + u32 depth_fmt : 2; // 0 → None, 1 → 32-bit, 2 → 16-bit, 3 → RGBA + // Alpha extension/Correction + u32 aem : 1; + u32 fba : 1; + // Fog + u32 fog : 1; + // Flat/goround shading + u32 iip : 1; + // Pixel test + u32 date : 3; + u32 atst : 3; + // Color sampling + u32 fst : 1; // Investigate to do it on the VS + u32 tfx : 3; + u32 tcc : 1; + u32 wms : 2; + u32 wmt : 2; + u32 ltf : 1; + // Shuffle and fbmask effect + u32 shuffle : 1; + u32 read_ba : 1; + u32 write_rg : 1; + u32 fbmask : 1; + + //u32 _free1:0; + + // *** Word 2 + // Blend and Colclip + u32 blend_a : 2; + u32 blend_b : 2; + u32 blend_c : 2; + u32 blend_d : 2; + u32 clr1 : 1; // useful? 
+ u32 hdr : 1; + u32 colclip : 1; + u32 pabe : 1; + + // Others ways to fetch the texture + u32 channel : 3; + + // Dithering + u32 dither : 2; + + // Depth clamp + u32 zclamp : 1; + + // Hack + u32 tcoffsethack : 1; + u32 urban_chaos_hle : 1; + u32 tales_of_abyss_hle : 1; + u32 tex_is_fb : 1; // Jak Shadows + u32 automatic_lod : 1; + u32 manual_lod : 1; + u32 point_sampler : 1; + u32 invalid_tex0 : 1; // Lupin the 3rd + + u32 _free2 : 6; + }; + + u64 key; + }; + PSSelector(): key(0) {} + }; + struct SamplerSelector + { + union + { + struct + { + u8 tau : 1; + u8 tav : 1; + u8 biln : 1; + u8 triln : 3; + u8 aniso : 1; + + u8 _free : 1; + }; + u8 key; + }; + SamplerSelector(): key(0) {} + SamplerSelector(u32 k): key(k) {} + static SamplerSelector Point() { return SamplerSelector(); } + static SamplerSelector Linear() + { + SamplerSelector out; + out.biln = 1; + return out; + } + }; + struct DepthStencilSelector + { + union + { + struct + { + u8 ztst : 2; + u8 zwe : 1; + u8 date : 1; + u8 date_one : 1; + + u8 _free : 3; + }; + u8 key; + }; + DepthStencilSelector(): key(0) {} + DepthStencilSelector(u32 k): key(k) {} + static DepthStencilSelector NoDepth() + { + DepthStencilSelector out; + out.ztst = ZTST_ALWAYS; + return out; + } + }; + struct ColorMaskSelector + { + union + { + struct + { + u8 wr : 1; + u8 wg : 1; + u8 wb : 1; + u8 wa : 1; + + u8 _free : 4; + }; + struct + { + u8 wrgba : 4; + }; + u8 key; + }; + ColorMaskSelector(): key(0xF) {} + ColorMaskSelector(u32 c): key(0) { wrgba = c; } + }; + struct VSConstantBuffer + { + GSVector2 vertex_scale; + GSVector2 vertex_offset; + GSVector2 texture_scale; + GSVector2 texture_offset; + GSVector2 point_size; + GSVector2i max_depth; + }; + struct PSConstantBuffer + { + union + { + struct + { + u8 fog_color[3]; + u8 aref; + }; + u32 fog_color_aref; + }; + union + { + struct + { + u8 r, g, b, a; + } fbmask; + u32 fbmask_int; + }; + u32 max_depth; + union + { + struct + { + u8 ta0; + u8 ta1; + u8 _pad; + u8 alpha_fix; + 
}; + u32 ta_af; + }; + union + { + struct + { + u8 blue_mask; + u8 blue_shift; + u8 green_mask; + u8 green_shift; + } channel_shuffle; + u32 channel_shuffle_int; + }; + union + { + struct + { + u16 umsk; + u16 vmsk; + u16 ufix; + u16 vfix; + }; + u64 uv_msk_fix; + }; + GIFRegDIMX dither_matrix; + GSVector2 tc_offset; + GSVector4 texture_size; // xy → PS2 size, wz → emulator size + + GSVector4 half_texel; + GSVector4 uv_min_max; + }; + struct BlendState + { + union + { + struct + { + u8 index; + u8 factor; + bool is_constant : 1; + bool is_accumulation : 1; + bool is_mixed_hw_sw : 1; + }; + u32 key; + }; + BlendState(): key(0) {} + BlendState(u8 index, u8 factor, bool is_constant, bool is_accumulation, bool is_mixed_hw_sw) + : key(0) + { + this->index = index; + this->factor = factor; + this->is_constant = is_constant; + this->is_accumulation = is_accumulation; + this->is_mixed_hw_sw = is_mixed_hw_sw; + } + }; + enum class DestinationAlphaMode : u8 + { + Off, ///< No destination alpha test + Stencil, ///< Emulate using read-only stencil + StencilOne, ///< Emulate using read-write stencil (first write wins) + PrimIDTracking, ///< Emulate by tracking the primitive ID of the last pixel allowed through + Full, ///< Full emulation (using barriers / ROV) + }; + + GSTexture* rt; ///< Render target + GSTexture* ds; ///< Depth stencil + GSTexture* tex; ///< Source texture + GSTexture* pal; ///< Palette texture + GSTexture* raw_tex; ///< Used by channel shuffles + GSVertex* verts; ///< Vertices to draw + u32* indices; ///< Indices to draw + u32 nverts; ///< Number of vertices + u32 nindices; ///< Number of indices + u32 indices_per_prim; ///< Number of indices that make up one primitive + const std::vector* drawlist; ///< For reducing barriers on sprites + GSVector4i scissor; ///< Scissor rect + Topology topology; ///< Draw topology + + GSSelector gs; + VSSelector vs; + PSSelector ps; + + BlendState blend; + SamplerSelector sampler; + ColorMaskSelector colormask; + 
DepthStencilSelector depth; + + bool require_one_barrier; ///< Require texture barrier before draw (also used to requst an rt copy if texture barrier isn't supported) + bool require_full_barrier; ///< Require texture barrier between all prims + + DestinationAlphaMode destination_alpha; + bool datm; + + VSConstantBuffer cb_vs; + PSConstantBuffer cb_ps; + + struct AlphaSecondPass + { + bool enable; + PSSelector ps; + PSConstantBuffer cb_ps; + ColorMaskSelector colormask; + DepthStencilSelector depth; + } alpha_second_pass; +}; + class GSDevice : public GSAlignedClass<32> { +public: + struct FeatureSupport + { + bool broken_point_sampler : 1; ///< Issue with AMD cards, see tfx shader for details + bool geometry_shader : 1; ///< Supports geometry shader + bool image_load_store : 1; ///< Supports atomic min and max on images (for use with prim tracking destination alpha algorithm) + bool texture_barrier : 1; ///< Supports sampling rt and hopefully texture barrier + FeatureSupport() + { + memset(this, 0, sizeof(*this)); + } + }; + private: FastList m_pool; static std::array m_blendMap; @@ -174,6 +526,7 @@ protected: } m_index; unsigned int m_frame; // for ageing the pool bool m_linear_present; + FeatureSupport m_features; virtual GSTexture* CreateSurface(GSTexture::Type type, int w, int h, GSTexture::Format format) = 0; virtual GSTexture* FetchSurface(GSTexture::Type type, int w, int h, GSTexture::Format format); @@ -245,6 +598,9 @@ public: void StretchRect(GSTexture* sTex, GSTexture* dTex, const GSVector4& dRect, ShaderConvert shader = ShaderConvert::COPY, bool linear = true); + virtual void RenderHW(GSHWDrawConfig& config) {} + + FeatureSupport Features() { return m_features; } GSTexture* GetCurrent(); void Merge(GSTexture* sTex[3], GSVector4* sRect, GSVector4* dRect, const GSVector2i& fs, const GSRegPMODE& PMODE, const GSRegEXTBUF& EXTBUF, const GSVector4& c); diff --git a/pcsx2/GS/Renderers/DX11/GSDevice11.cpp b/pcsx2/GS/Renderers/DX11/GSDevice11.cpp index 
03d84ae07f..bc5d73dd21 100644 --- a/pcsx2/GS/Renderers/DX11/GSDevice11.cpp +++ b/pcsx2/GS/Renderers/DX11/GSDevice11.cpp @@ -41,6 +41,11 @@ GSDevice11::GSDevice11() m_aniso_filter = aniso_level; else m_aniso_filter = 0; + + m_features.broken_point_sampler = true; // Not technically the case but the most common reason to use DX11 is because you're on AMD + m_features.geometry_shader = true; + m_features.image_load_store = false; + m_features.texture_barrier = false; } bool GSDevice11::SetFeatureLevel(D3D_FEATURE_LEVEL level, bool compat_mode) @@ -1472,6 +1477,255 @@ void GSDevice11::CompileShader(const std::string& source, const char* fn, ID3DIn throw GSRecoverableError(); } +static GSDevice11::VSConstantBuffer convertCB(const GSHWDrawConfig::VSConstantBuffer& cb) +{ + GSDevice11::VSConstantBuffer out; + out.VertexScale = GSVector4(cb.vertex_scale.x, -cb.vertex_scale.y, ldexpf(1, -32), 0.0f); + out.VertexOffset = GSVector4(cb.vertex_offset.x, -cb.vertex_offset.y, 0.0f, -1.0f); + out.Texture_Scale_Offset = GSVector4::loadl(&cb.texture_scale).upld(GSVector4::loadl(&cb.texture_offset)); + out.MaxDepth = cb.max_depth; + return out; +} + +static GSDevice11::GSConstantBuffer convertCBGS(const GSHWDrawConfig::VSConstantBuffer& cb) +{ + GSDevice11::GSConstantBuffer out; + out.PointSize = cb.point_size; + return out; +} + +static GSDevice11::PSConstantBuffer convertCB(const GSHWDrawConfig::PSConstantBuffer& cb, int atst) +{ + GSDevice11::PSConstantBuffer out; + out.FogColor_AREF = GSVector4(GSVector4i::load(cb.fog_color_aref).u8to32()); + if (atst == 1 || atst == 2) // Greater / Less alpha + out.FogColor_AREF.w -= 0.1f; + out.HalfTexel = cb.half_texel; + out.WH = cb.texture_size; + out.MinMax = cb.uv_min_max; + const GSVector4 ta_af(GSVector4i::load(cb.ta_af).u8to32()); + out.MskFix = GSVector4i::loadl(&cb.uv_msk_fix).u16to32(); // Must be loaded before MinF_TA, which is derived from out.MskFix + out.MinF_TA = (GSVector4(out.MskFix) + 0.5f).xyxy(ta_af) / out.WH.xyxy(GSVector4(255, 255)); + out.ChannelShuffle = 
GSVector4i::load(cb.channel_shuffle_int).u8to32(); + out.FbMask = GSVector4i::load(cb.fbmask_int).u8to32(); + out.TC_OffsetHack = GSVector4(cb.tc_offset.x, cb.tc_offset.y).xyxy(); + out.Af_MaxDepth = GSVector4(ta_af.a / 128.f, cb.max_depth * ldexpf(1, -32)); + + GSVector4i dither = GSVector4i::loadl(&cb.dither_matrix).u8to16(); + const GSVector4i ditherLow = dither.sll16(13).sra16(13); + const GSVector4i ditherHi = dither.sll16(9).sra16(5); + dither = ditherLow.blend8(ditherHi, GSVector4i(0xFF00FF00)); + + out.DitherMatrix[0] = GSVector4(dither.xxxx().i8to32()); + out.DitherMatrix[1] = GSVector4(dither.yyyy().i8to32()); + out.DitherMatrix[2] = GSVector4(dither.zzzz().i8to32()); + out.DitherMatrix[3] = GSVector4(dither.wwww().i8to32()); + + return out; +} + +static GSDevice11::OMDepthStencilSelector convertSel(GSHWDrawConfig::DepthStencilSelector sel) +{ + GSDevice11::OMDepthStencilSelector out; + out.zwe = sel.zwe; + out.ztst = sel.ztst; + out.date = sel.date; + out.date_one = sel.date_one; + out.fba = 0; // No longer seems to be in use? 
+ return out; +} + +static GSDevice11::OMBlendSelector convertSel(GSHWDrawConfig::ColorMaskSelector cm, GSHWDrawConfig::BlendState blend) +{ + GSDevice11::OMBlendSelector out; + out.wrgba = cm.wrgba; + out.abe = blend.index != 0; + out.blend_index = blend.index; + out.accu_blend = blend.is_accumulation; + out.blend_mix = blend.is_mixed_hw_sw; + return out; +} + +static GSDevice11::VSSelector convertSel(GSHWDrawConfig::VSSelector sel) +{ + GSDevice11::VSSelector out; + out.tme = sel.tme; + out.fst = sel.fst; + return out; +} + +static GSDevice11::PSSelector convertSel(GSHWDrawConfig::PSSelector sel) +{ + GSDevice11::PSSelector out; + out.fmt = sel.pal_fmt << 2 | sel.aem_fmt; + out.dfmt = sel.dfmt; + out.depth_fmt = sel.depth_fmt; + out.aem = sel.aem; + out.fba = sel.fba; + out.fog = sel.fog; + out.atst = sel.atst; + out.fst = sel.fst; + out.tfx = sel.tfx; + out.tcc = sel.tcc; + out.wms = sel.wms; + out.wmt = sel.wmt; + out.ltf = sel.ltf; + out.shuffle = sel.shuffle; + out.read_ba = sel.read_ba; + out.fbmask = sel.fbmask; + out.hdr = sel.hdr; + out.blend_a = sel.blend_a; + out.blend_b = sel.blend_b; + out.blend_c = sel.blend_c; + out.blend_d = sel.blend_d; + out.clr1 = sel.clr1; + out.colclip = sel.colclip; + out.pabe = sel.pabe; + out.channel = sel.channel; + out.dither = sel.dither; + out.zclamp = sel.zclamp; + out.tcoffsethack = sel.tcoffsethack; + out.urban_chaos_hle = sel.urban_chaos_hle; + out.tales_of_abyss_hle = sel.tales_of_abyss_hle; + out.point_sampler = sel.point_sampler; + out.invalid_tex0 = sel.invalid_tex0; + return out; +} + +static GSDevice11::GSSelector convertSel(GSHWDrawConfig::GSSelector sel) +{ + GSDevice11::GSSelector out; + out.iip = sel.iip; + switch (sel.topology) + { + case GSHWDrawConfig::GSTopology::Point: + out.point = sel.expand; + out.prim = GS_POINT_CLASS; + break; + case GSHWDrawConfig::GSTopology::Line: + out.line = sel.expand; + out.prim = GS_LINE_CLASS; + break; + case GSHWDrawConfig::GSTopology::Triangle: + out.prim = 
GS_TRIANGLE_CLASS; + break; + case GSHWDrawConfig::GSTopology::Sprite: + out.cpu_sprite = !sel.expand; + out.prim = GS_SPRITE_CLASS; + break; + } + return out; +} + +static GSDevice11::PSSamplerSelector convertSel(GSHWDrawConfig::SamplerSelector sel) +{ + GSDevice11::PSSamplerSelector out; + out.tau = sel.tau; + out.tav = sel.tav; + out.ltf = sel.biln; + return out; +} + +void GSDevice11::RenderHW(GSHWDrawConfig& config) +{ + ASSERT(!config.require_full_barrier); // We always specify no support so it shouldn't request this + + if (config.destination_alpha != GSHWDrawConfig::DestinationAlphaMode::Off) + { + const GSVector4 src = GSVector4(config.scissor) / GSVector4(config.ds->GetSize()).xyxy(); + const GSVector4 dst = src * 2.0f - 1.0f; + + GSVertexPT1 vertices[] = + { + {GSVector4(dst.x, -dst.y, 0.5f, 1.0f), GSVector2(src.x, src.y)}, + {GSVector4(dst.z, -dst.y, 0.5f, 1.0f), GSVector2(src.z, src.y)}, + {GSVector4(dst.x, -dst.w, 0.5f, 1.0f), GSVector2(src.x, src.w)}, + {GSVector4(dst.z, -dst.w, 0.5f, 1.0f), GSVector2(src.z, src.w)}, + }; + + SetupDATE(config.rt, config.ds, vertices, config.datm); + } + + GSTexture* hdr_rt = nullptr; + if (config.ps.hdr) + { + const GSVector2i size = config.rt->GetSize(); + const GSVector4 dRect(config.scissor); + const GSVector4 sRect = dRect / GSVector4(size.x, size.y).xyxy(); + hdr_rt = CreateRenderTarget(size.x, size.y, GSTexture::Format::FloatColor); + hdr_rt->CommitRegion(GSVector2i(config.scissor.z, config.scissor.w)); + // Warning: StretchRect must be called before BeginScene otherwise + // vertices will be overwritten. Trust me you don't want to do that. 
+ StretchRect(config.rt, sRect, hdr_rt, dRect, ShaderConvert::COPY, false); + } + + BeginScene(); + + void* ptr = nullptr; + if (IAMapVertexBuffer(&ptr, sizeof(*config.verts), config.nverts)) + { + GSVector4i::storent(ptr, config.verts, config.nverts * sizeof(*config.verts)); + IAUnmapVertexBuffer(); + } + IASetIndexBuffer(config.indices, config.nindices); + D3D11_PRIMITIVE_TOPOLOGY topology; + switch (config.topology) + { + case GSHWDrawConfig::Topology::Point: topology = D3D11_PRIMITIVE_TOPOLOGY_POINTLIST; break; + case GSHWDrawConfig::Topology::Line: topology = D3D11_PRIMITIVE_TOPOLOGY_LINELIST; break; + case GSHWDrawConfig::Topology::Triangle: topology = D3D11_PRIMITIVE_TOPOLOGY_TRIANGLELIST; break; + } + IASetPrimitiveTopology(topology); + + PSSetShaderResources(config.tex, config.pal); + PSSetShaderResource(4, config.raw_tex); + + if (config.require_one_barrier) // Used as "bind rt" flag when texture barrier is unsupported + { + // Bind the RT.This way special effect can use it. + // Do not always bind the rt when it's not needed, + // only bind it when effects use it such as fbmask emulation currently + // because we copy the frame buffer and it is quite slow. + PSSetShaderResource(3, config.rt); + } + + const VSConstantBuffer cb_vs = convertCB(config.cb_vs); + const GSConstantBuffer cb_gs = convertCBGS(config.cb_vs); + PSConstantBuffer cb_ps = convertCB(config.cb_ps, config.ps.atst); + + SetupOM(convertSel(config.depth), convertSel(config.colormask, config.blend), config.blend.factor); + SetupVS(convertSel(config.vs), &cb_vs); + SetupGS(convertSel(config.gs), &cb_gs); + SetupPS(convertSel(config.ps), &cb_ps, convertSel(config.sampler)); + + OMSetRenderTargets(hdr_rt ? 
hdr_rt : config.rt, config.ds, &config.scissor); + + DrawIndexedPrimitive(); + + if (config.alpha_second_pass.enable) + { + if (0 != memcmp(&config.cb_ps, &config.alpha_second_pass.cb_ps, sizeof(config.cb_ps))) + { + cb_ps = convertCB(config.alpha_second_pass.cb_ps, config.alpha_second_pass.ps.atst); + } + SetupPS(convertSel(config.alpha_second_pass.ps), &cb_ps, convertSel(config.sampler)); + SetupOM(convertSel(config.alpha_second_pass.depth), convertSel(config.alpha_second_pass.colormask, config.blend), config.blend.factor); + + DrawIndexedPrimitive(); + } + + EndScene(); + + if (hdr_rt) + { + const GSVector2i size = config.rt->GetSize(); + const GSVector4 dRect(config.scissor); + const GSVector4 sRect = dRect / GSVector4(size.x, size.y).xyxy(); + StretchRect(hdr_rt, sRect, config.rt, dRect, ShaderConvert::MOD_256, false); + Recycle(hdr_rt); + } +} + u16 GSDevice11::ConvertBlendEnum(u16 generic) { switch (generic) diff --git a/pcsx2/GS/Renderers/DX11/GSDevice11.h b/pcsx2/GS/Renderers/DX11/GSDevice11.h index c1d55e2958..1081eaa70a 100644 --- a/pcsx2/GS/Renderers/DX11/GSDevice11.h +++ b/pcsx2/GS/Renderers/DX11/GSDevice11.h @@ -587,6 +587,8 @@ public: void SetupPS(PSSelector sel, const PSConstantBuffer* cb, PSSamplerSelector ssel); void SetupOM(OMDepthStencilSelector dssel, OMBlendSelector bsel, u8 afix); + void RenderHW(GSHWDrawConfig& config) final; + ID3D11Device* operator->() { return m_dev.get(); } operator ID3D11Device*() { return m_dev.get(); } operator ID3D11DeviceContext*() { return m_ctx.get(); } diff --git a/pcsx2/GS/Renderers/HW/GSRendererNew.cpp b/pcsx2/GS/Renderers/HW/GSRendererNew.cpp new file mode 100644 index 0000000000..08e958a69e --- /dev/null +++ b/pcsx2/GS/Renderers/HW/GSRendererNew.cpp @@ -0,0 +1,1604 @@ +/* PCSX2 - PS2 Emulator for PCs + * Copyright (C) 2002-2021 PCSX2 Dev Team + * + * PCSX2 is free software: you can redistribute it and/or modify it under the terms + * of the GNU Lesser General Public License as published by the Free Software 
Found- + * ation, either version 3 of the License, or (at your option) any later version. + * + * PCSX2 is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; + * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR + * PURPOSE. See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along with PCSX2. + * If not, see . + */ + +#include "PrecompiledHeader.h" +#include "GSRendererNew.h" +#include "GS/GSGL.h" + +GSRendererNew::GSRendererNew() +{ + if (theApp.GetConfigB("UserHacks")) + UserHacks_tri_filter = static_cast(theApp.GetConfigI("UserHacks_TriFilter")); + else + UserHacks_tri_filter = TriFiltering::None; + + // Hope nothing requires too many draw calls. + m_drawlist.reserve(2048); + + memset(&m_conf, 0, sizeof(m_conf)); + + m_prim_overlap = PRIM_OVERLAP_UNKNOW; + ResetStates(); +} + +bool GSRendererNew::CreateDevice(GSDevice* dev, const WindowInfo& wi) +{ + if (!GSRendererHW::CreateDevice(dev, wi)) + return false; + + if (dev->Features().texture_barrier) + m_sw_blending = static_cast(theApp.GetConfigI("accurate_blending_unit")); + else + m_sw_blending = static_cast(theApp.GetConfigI("accurate_blending_unit_d3d11")); + + return true; +} + +void GSRendererNew::SetupIA(const float& sx, const float& sy) +{ + GL_PUSH("IA"); + + if (m_userhacks_wildhack && !m_isPackedUV_HackFlag && PRIM->TME && PRIM->FST) + { + for (unsigned int i = 0; i < m_vertex.next; i++) + m_vertex.buff[i].UV &= 0x3FEF3FEF; + } + const bool unscale_pt_ln = m_userHacks_enabled_unscale_ptln && (GetUpscaleMultiplier() != 1) && m_dev->Features().geometry_shader; + + switch (m_vt.m_primclass) + { + case GS_POINT_CLASS: + if (unscale_pt_ln) + { + m_conf.gs.expand = true; + m_conf.cb_vs.point_size = GSVector2(16.0f * sx, 16.0f * sy); + } + + m_conf.gs.topology = GSHWDrawConfig::GSTopology::Point; + m_conf.topology = GSHWDrawConfig::Topology::Point; + m_conf.indices_per_prim = 1; + 
break; + + case GS_LINE_CLASS: + if (unscale_pt_ln) + { + m_conf.gs.expand = true; + m_conf.cb_vs.point_size = GSVector2(16.0f * sx, 16.0f * sy); + } + + m_conf.gs.topology = GSHWDrawConfig::GSTopology::Line; + m_conf.topology = GSHWDrawConfig::Topology::Line; + m_conf.indices_per_prim = 2; + break; + + case GS_SPRITE_CLASS: + // Heuristics: trade-off + // Lines: GPU conversion => ofc, more GPU. And also more CPU due to extra shader validation stage. + // Triangles: CPU conversion => ofc, more CPU ;) more bandwidth (72 bytes / sprite) + // + // Note: severals openGL operation does draw call under the wood like texture upload. So even if + // you do 10 consecutive draw with the geometry shader, you will still pay extra validation if new + // texture are uploaded. (game Shadow Hearts) + // + // Note2: Due to MultiThreaded driver, Nvidia suffers less of the previous issue. Still it isn't free + // Shadow Heart is 90 fps (gs) vs 113 fps (no gs) + // + // Note3: Some GPUs (Happens on GT 750m, not on Intel 5200) don't properly divide by large floats (e.g. FLT_MAX/FLT_MAX == 0) + // Lines2Sprites predivides by Q, avoiding this issue, so always use it if m_vt.m_accurate_stq + + // If the draw calls contains few primitives. Geometry Shader gain with be rather small versus + // the extra validation cost of the extra stage. + // + // Note: keep Geometry Shader in the replayer to ease debug. 
+ if (m_dev->Features().geometry_shader && !m_vt.m_accurate_stq && (m_vertex.next > 32 || GLLoader::in_replayer)) // <=> 16 sprites (based on Shadow Hearts) + { + m_conf.gs.expand = true; + + m_conf.topology = GSHWDrawConfig::Topology::Line; + m_conf.indices_per_prim = 2; + } + else + { + Lines2Sprites(); + + m_conf.topology = GSHWDrawConfig::Topology::Triangle; + m_conf.indices_per_prim = 6; + } + m_conf.gs.topology = GSHWDrawConfig::GSTopology::Sprite; + break; + + case GS_TRIANGLE_CLASS: + m_conf.gs.topology = GSHWDrawConfig::GSTopology::Triangle; + m_conf.topology = GSHWDrawConfig::Topology::Triangle; + m_conf.indices_per_prim = 3; + break; + + default: + __assume(0); + } + + m_conf.verts = m_vertex.buff; + m_conf.nverts = m_vertex.next; + m_conf.indices = m_index.buff; + m_conf.nindices = m_index.tail; +} + +void GSRendererNew::EmulateZbuffer() +{ + if (m_context->TEST.ZTE) + { + m_conf.depth.ztst = m_context->TEST.ZTST; + // AA1: Z is not written on lines since coverage is always less than 0x80. + m_conf.depth.zwe = (m_context->ZBUF.ZMSK || (PRIM->AA1 && m_vt.m_primclass == GS_LINE_CLASS)) ? 0 : 1; + } + else + { + m_conf.depth.ztst = ZTST_ALWAYS; + } + + // On the real GS we appear to do clamping on the max z value the format allows. + // Clamping is done after rasterization. 
+ const u32 max_z = 0xFFFFFFFF >> (GSLocalMemory::m_psm[m_context->ZBUF.PSM].fmt * 8); + const bool clamp_z = (u32)(GSVector4i(m_vt.m_max.p).z) > max_z; + + m_conf.cb_vs.max_depth = GSVector2i(0xFFFFFFFF); + //ps_cb.MaxDepth = GSVector4(0.0f, 0.0f, 0.0f, 1.0f); + m_conf.ps.zclamp = 0; + + if (clamp_z) + { + if (m_vt.m_primclass == GS_SPRITE_CLASS || m_vt.m_primclass == GS_POINT_CLASS) + { + m_conf.cb_vs.max_depth = GSVector2i(max_z); + } + else if (!m_context->ZBUF.ZMSK) + { + m_conf.cb_ps.max_depth = max_z; + m_conf.ps.zclamp = 1; + } + } + + const GSVertex* v = &m_vertex.buff[0]; + // Minor optimization of a corner case (it allow to better emulate some alpha test effects) + if (m_conf.depth.ztst == ZTST_GEQUAL && m_vt.m_eq.z && v[0].XYZ.Z == max_z) + { + GL_DBG("Optimize Z test GEQUAL to ALWAYS (%s)", psm_str(m_context->ZBUF.PSM)); + m_conf.depth.ztst = ZTST_ALWAYS; + } +} + +void GSRendererNew::EmulateTextureShuffleAndFbmask() +{ + // Uncomment to disable texture shuffle emulation. + // m_texture_shuffle = false; + + bool enable_fbmask_emulation = false; + if (m_dev->Features().texture_barrier) + { + enable_fbmask_emulation = m_sw_blending != AccBlendLevel::None; + } + else + { + // FBmask blend level selection. + // We do this becaue: + // 1. D3D sucks. + // 2. FB copy is slow, especially on triangle primitives which is unplayable with some games. + // 3. SW blending isn't implemented yet. + switch (m_sw_blending) + { + case AccBlendLevel::Ultra: + case AccBlendLevel::Full: + case AccBlendLevel::High: + // Fully enable Fbmask emulation like on opengl, note misses sw blending to work as opengl on some games (Genji). + // Debug + enable_fbmask_emulation = true; + break; + case AccBlendLevel::Medium: + // Enable Fbmask emulation excluding triangle class because it is quite slow. + // Exclude 0x80000000 because Genji needs sw blending, otherwise it breaks some effects. 
+ enable_fbmask_emulation = ((m_vt.m_primclass != GS_TRIANGLE_CLASS) && (m_context->FRAME.FBMSK != 0x80000000)); + break; + case AccBlendLevel::Basic: + // Enable Fbmask emulation excluding triangle class because it is quite slow. + // Exclude 0x80000000 because Genji needs sw blending, otherwise it breaks some effects. + // Also exclude fbmask emulation on texture shuffle just in case, it is probably safe tho. + enable_fbmask_emulation = (!m_texture_shuffle && (m_vt.m_primclass != GS_TRIANGLE_CLASS) && (m_context->FRAME.FBMSK != 0x80000000)); + break; + case AccBlendLevel::None: + break; + } + } + + if (m_texture_shuffle) + { + m_conf.ps.shuffle = 1; + m_conf.ps.dfmt = 0; + + bool write_ba; + bool read_ba; + + ConvertSpriteTextureShuffle(write_ba, read_ba); + + // If date is enabled you need to test the green channel instead of the + // alpha channel. Only enable this code in DATE mode to reduce the number + // of shader. + m_conf.ps.write_rg = !write_ba && m_dev->Features().texture_barrier && m_context->TEST.DATE; + + m_conf.ps.read_ba = read_ba; + + // Please bang my head against the wall! + // 1/ Reduce the frame mask to a 16 bit format + const u32& m = m_context->FRAME.FBMSK; + const u32 fbmask = ((m >> 3) & 0x1F) | ((m >> 6) & 0x3E0) | ((m >> 9) & 0x7C00) | ((m >> 16) & 0x8000); + // FIXME GSVector will be nice here + const u8 rg_mask = fbmask & 0xFF; + const u8 ba_mask = (fbmask >> 8) & 0xFF; + m_conf.colormask.wrgba = 0; + + // 2 Select the new mask (Please someone put SSE here) + if (rg_mask != 0xFF) + { + if (write_ba) + { + GL_INS("Color shuffle %s => B", read_ba ? "B" : "R"); + m_conf.colormask.wb = 1; + } + else + { + GL_INS("Color shuffle %s => R", read_ba ? "B" : "R"); + m_conf.colormask.wr = 1; + } + if (rg_mask) + m_conf.ps.fbmask = 1; + } + + if (ba_mask != 0xFF) + { + if (write_ba) + { + GL_INS("Color shuffle %s => A", read_ba ? "A" : "G"); + m_conf.colormask.wa = 1; + } + else + { + GL_INS("Color shuffle %s => G", read_ba ? 
"A" : "G"); + m_conf.colormask.wg = 1; + } + if (ba_mask) + m_conf.ps.fbmask = 1; + } + + if (m_conf.ps.fbmask && enable_fbmask_emulation) + { + m_conf.cb_ps.fbmask.r = rg_mask; + m_conf.cb_ps.fbmask.g = rg_mask; + m_conf.cb_ps.fbmask.b = ba_mask; + m_conf.cb_ps.fbmask.a = ba_mask; + + // No blending so hit unsafe path. + if (!PRIM->ABE || !m_dev->Features().texture_barrier) + { + GL_INS("FBMASK Unsafe SW emulated fb_mask:%x on tex shuffle", fbmask); + m_conf.require_one_barrier = true; + } + else + { + GL_INS("FBMASK SW emulated fb_mask:%x on tex shuffle", fbmask); + m_conf.require_full_barrier = true; + } + } + else + { + m_conf.ps.fbmask = 0; + } + } + else + { + m_conf.ps.dfmt = GSLocalMemory::m_psm[m_context->FRAME.PSM].fmt; + + const GSVector4i fbmask_v = GSVector4i::load((int)m_context->FRAME.FBMSK); + const int ff_fbmask = fbmask_v.eq8(GSVector4i::xffffffff()).mask(); + const int zero_fbmask = fbmask_v.eq8(GSVector4i::zero()).mask(); + + m_conf.colormask.wrgba = ~ff_fbmask; // Enable channel if at least 1 bit is 0 + + m_conf.ps.fbmask = enable_fbmask_emulation && (~ff_fbmask & ~zero_fbmask & 0xF); + + if (m_conf.ps.fbmask) + { + m_conf.cb_ps.fbmask_int = m_context->FRAME.FBMSK; + // Only alpha is special here, I think we can take a very unsafe shortcut + // Alpha isn't blended on the GS but directly copyied into the RT. + // + // Behavior is clearly undefined however there is a high probability that + // it will work. Masked bit will be constant and normally the same everywhere + // RT/FS output/Cached value. + // + // Just to be sure let's add a new safe hack for unsafe access :) + // + // Here the GL spec quote to emphasize the unexpected behavior. 
			/*
			- If a texel has been written, then in order to safely read the result
			  a texel fetch must be in a subsequent Draw separated by the command

			  void TextureBarrier(void);

			  TextureBarrier() will guarantee that writes have completed and caches
			  have been invalidated before subsequent Draws are executed.
			*/
			// No blending so hit unsafe path.
			if (!PRIM->ABE || !(~ff_fbmask & ~zero_fbmask & 0x7) || !m_dev->Features().texture_barrier)
			{
				GL_INS("FBMASK Unsafe SW emulated fb_mask:%x on %d bits format", m_context->FRAME.FBMSK,
					(GSLocalMemory::m_psm[m_context->FRAME.PSM].fmt == 2) ? 16 : 32);
				m_conf.require_one_barrier = true;
			}
			else
			{
				// The safe and accurate path (but slow)
				GL_INS("FBMASK SW emulated fb_mask:%x on %d bits format", m_context->FRAME.FBMSK,
					(GSLocalMemory::m_psm[m_context->FRAME.PSM].fmt == 2) ? 16 : 32);
				m_conf.require_full_barrier = true;
			}
		}
	}
}

// HLE of the "channel shuffle" effect (a game reading a single colour channel of a target).
//
// When m_channel_shuffle is set, the draw signature (game CRC, texture type, CLAMP/FRAME
// registers, first vertex) is matched against the known variants below. A match selects the
// channel to fetch in the pixel shader (m_conf.ps.channel or one of the *_hle flags); a
// non-match clears m_channel_shuffle. If the flag is still set afterwards, the texture's
// source target becomes m_conf.raw_tex and the vertex/index lists are replaced by one
// two-vertex sprite.
//
// rt:  in/out render target pointer; redirected to tex->m_from_target for the
//      Gran Turismo and Tekken 5 variants.
// tex: source of the draw. It is dereferenced without a null check whenever
//      m_channel_shuffle is set.
//      NOTE(review): presumably the flag is only ever set for textured draws - confirm.
//
// Side effects: may rewrite m_context->TEX0.TFX, m_context->FRAME.FBMSK and m_skip.
void GSRendererNew::EmulateChannelShuffle(GSTexture** rt, const GSTextureCache::Source* tex)
{
	// Uncomment to disable HLE emulation (allow to trace the draw call)
	// m_channel_shuffle = false;

	// First let's check we really have a channel shuffle effect
	if (m_channel_shuffle)
	{
		if (m_game.title == CRC::GT4 || m_game.title == CRC::GT3 || m_game.title == CRC::GTConcept || m_game.title == CRC::TouristTrophy)
		{
			GL_INS("Gran Turismo RGB Channel");
			m_conf.ps.channel = ChannelFetch_RGB;
			m_context->TEX0.TFX = TFX_DECAL;
			*rt = tex->m_from_target;
		}
		else if (m_game.title == CRC::Tekken5)
		{
			if (m_context->FRAME.FBW == 1)
			{
				// Used in stages: Secret Garden, Acid Rain, Moonlit Wilderness
				GL_INS("Tekken5 RGB Channel");
				m_conf.ps.channel = ChannelFetch_RGB;
				m_context->FRAME.FBMSK = 0xFF000000;
				// 12 pages: 2 calls by channel, 3 channels, 1 blit
				// Minus current draw call
				m_skip = 12 * (3 + 3 + 1) - 1;
				*rt = tex->m_from_target;
			}
			else
			{
				// Could skip model drawing if wrongly detected
				m_channel_shuffle = false;
			}
		}
		else if ((tex->m_texture->GetType() == GSTexture::Type::DepthStencil) && !(tex->m_32_bits_fmt))
		{
			// So far 2 games hit this code path. Urban Chaos and Tales of Abyss
			// UC: will copy depth to green channel
			// ToA: will copy depth to alpha channel
			if ((m_context->FRAME.FBMSK & 0xFF0000) == 0xFF0000)
			{
				// Green channel is masked
				// NOTE(review): the test above checks FBMSK bits 16-23; confirm that
				// byte really is the green channel for this frame format.
				GL_INS("Tales Of Abyss Crazyness (MSB 16b depth to Alpha)");
				m_conf.ps.tales_of_abyss_hle = 1;
			}
			else
			{
				GL_INS("Urban Chaos Crazyness (Green extraction)");
				m_conf.ps.urban_chaos_hle = 1;
			}
		}
		else if (m_index.tail <= 64 && m_context->CLAMP.WMT == 3)
		{
			// Blood will tell. I think it is channel effect too but again
			// implemented in a different way. I don't want to add more CRC stuff. So
			// let's disable channel when the signature is different
			//
			// Note: Tales Of Abyss and Tekken5 could hit this path too. Those games are
			// handled above.
			GL_INS("Maybe not a channel!");
			m_channel_shuffle = false;
		}
		else if (m_context->CLAMP.WMS == 3 && ((m_context->CLAMP.MAXU & 0x8) == 8))
		{
			// Read either blue or Alpha. Let's go for Blue ;)
			// MGS3/Kill Zone
			GL_INS("Blue channel");
			m_conf.ps.channel = ChannelFetch_BLUE;
		}
		else if (m_context->CLAMP.WMS == 3 && ((m_context->CLAMP.MINU & 0x8) == 0))
		{
			// Read either Red or Green. Let's check the V coordinate. 0-1 is likely top so
			// red. 2-3 is likely bottom so green (actually depends on texture base pointer offset)
			const bool green = PRIM->FST && (m_vertex.buff[0].V & 32);
			if (green && (m_context->FRAME.FBMSK & 0x00FFFFFF) == 0x00FFFFFF)
			{
				// Typically used in Terminator 3
				// Only the top byte of FBMSK can be writable here; it is treated as a
				// mask whose number of cleared low bits gives the blue shift.
				const int blue_mask = m_context->FRAME.FBMSK >> 24;
				int blue_shift = -1;

				// Note: potentially we could also check the value of the clut
				switch (blue_mask)
				{
					case 0xFF: ASSERT(0); break;
					case 0xFE: blue_shift = 1; break;
					case 0xFC: blue_shift = 2; break;
					case 0xF8: blue_shift = 3; break;
					case 0xF0: blue_shift = 4; break;
					case 0xE0: blue_shift = 5; break;
					case 0xC0: blue_shift = 6; break;
					case 0x80: blue_shift = 7; break;
					default: break;
				}

				if (blue_shift >= 0)
				{
					// Green takes the bits the blue mask leaves free.
					const int green_mask = ~blue_mask & 0xFF;
					const int green_shift = 8 - blue_shift;

					GL_INS("Green/Blue channel (%d, %d)", blue_shift, green_shift);
					m_conf.cb_ps.channel_shuffle.blue_mask = blue_mask;
					m_conf.cb_ps.channel_shuffle.blue_shift = blue_shift;
					m_conf.cb_ps.channel_shuffle.green_mask = green_mask;
					m_conf.cb_ps.channel_shuffle.green_shift = green_shift;
					m_conf.ps.channel = ChannelFetch_GXBY;
					// The partial mask is now handled through the constants above, so
					// the top byte is made fully writable.
					m_context->FRAME.FBMSK = 0x00FFFFFF;
				}
				else
				{
					GL_INS("Green channel (wrong mask) (fbmask %x)", blue_mask);
					m_conf.ps.channel = ChannelFetch_GREEN;
				}
			}
			else if (green)
			{
				GL_INS("Green channel");
				m_conf.ps.channel = ChannelFetch_GREEN;
			}
			else
			{
				// Pop
				GL_INS("Red channel");
				m_conf.ps.channel = ChannelFetch_RED;
			}
		}
		else
		{
			GL_INS("Channel not supported");
			m_channel_shuffle = false;
		}
	}

	// Effect is really a channel shuffle effect so let's cheat a little
	if (m_channel_shuffle)
	{
		m_conf.raw_tex = tex->m_from_target;
		if (m_dev->Features().texture_barrier)
			m_conf.require_one_barrier = true;

		// Replace current draw with a fullscreen sprite
		//
		// Performance GPU note: it could be wise to reduce the size to
		// the rendered size of the framebuffer

		// Two corners: the XY offset, and the offset + 16384.
		// NOTE(review): 16384 is presumably 1024 pixels in 1/16-pixel units - confirm.
		GSVertex* s = &m_vertex.buff[0];
		s[0].XYZ.X = (u16)(m_context->XYOFFSET.OFX + 0);
		s[1].XYZ.X = (u16)(m_context->XYOFFSET.OFX + 16384);
		s[0].XYZ.Y = (u16)(m_context->XYOFFSET.OFY + 0);
		s[1].XYZ.Y = (u16)(m_context->XYOFFSET.OFY + 16384);

		// Truncate the vertex and index lists to that single sprite.
		m_vertex.head = m_vertex.tail = m_vertex.next = 2;
		m_index.tail = 2;
	}
	else
	{
		m_conf.raw_tex = nullptr;
	}
}

// Chooses how the GS blend equation of the current draw is executed: by the hardware
// blend unit, in the pixel shader ("sw blending"), or by a mix of both.
//
// DATE_GL42 / DATE_GL45: in/out destination-alpha-test algorithm flags chosen by the
// caller. When sw blending is selected while DATE_GL42 is set, DATE_GL42 is cleared,
// DATE_GL45 is set and a full barrier is requested.
//
// Writes m_conf.blend and, depending on the path taken, the m_conf.ps blend/colclip/hdr/pabe
// selectors, m_conf.cb_ps.alpha_fix and m_conf.require_full_barrier.
void GSRendererNew::EmulateBlending(bool& DATE_GL42, bool& DATE_GL45)
{
	// AA1: Don't enable blending on AA1, not yet implemented on hardware mode,
	// it requires coverage sample so it's safer to turn it off instead.
	const bool aa1 = PRIM->AA1 && (m_vt.m_primclass == GS_LINE_CLASS);

	// No blending or coverage anti-aliasing so early exit
	if (!(PRIM->ABE || m_env.PABE.PABE || aa1))
	{
		m_conf.blend = {};
		return;
	}

	// Compute the blending equation to detect special case
	// (A, B, C, D are packed as base-3 digits into one index.)
	const GIFRegALPHA& ALPHA = m_context->ALPHA;
	u8 blend_index = u8(((ALPHA.A * 3 + ALPHA.B) * 3 + ALPHA.C) * 3 + ALPHA.D);
	const int blend_flag = m_dev->GetBlendFlags(blend_index);

	// Do the multiplication in shader for blending accumulation: Cs*As + Cd or Cs*Af + Cd
	bool accumulation_blend = !!(blend_flag & BLEND_ACCU);

	// Blending doesn't require barrier, or sampling of the rt
	const bool blend_non_recursive = !!(blend_flag & BLEND_NO_REC);

	// BLEND MIX selection, use a mix of hw/sw blending
	// (the alpha range is needed below when C selects the source alpha)
	if (!m_vt.m_alpha.valid && (ALPHA.C == 0))
		GetAlphaMinMax();
	const bool blend_mix1 = !!(blend_flag & BLEND_MIX1);
	const bool blend_mix2 = !!(blend_flag & BLEND_MIX2);
	const bool blend_mix3 = !!(blend_flag & BLEND_MIX3);
	bool blend_mix = (blend_mix1 || blend_mix2 || blend_mix3)
		// Do not enable if As > 128 or F > 128, hw blend clamps to 1
		&& !((ALPHA.C == 0 && m_vt.m_alpha.max > 128) || (ALPHA.C == 2 && ALPHA.FIX > 128u));

	// SW Blend is (nearly) free. Let's use it.
+ const bool impossible_or_free_blend = (blend_flag & BLEND_A_MAX) // Impossible blending + || blend_non_recursive // Free sw blending, doesn't require barriers or reading fb + || accumulation_blend // Mix of hw/sw blending + || (m_prim_overlap == PRIM_OVERLAP_NO) // Blend can be done in a single draw + || (m_conf.require_full_barrier); // Another effect (for example fbmask) already requires a full barrier + + // Warning no break on purpose + // Note: the [[fallthrough]] attribute tell compilers not to complain about not having breaks. + bool sw_blending = false; + if (m_dev->Features().texture_barrier) + { + switch (m_sw_blending) + { + case AccBlendLevel::Ultra: + sw_blending |= true; + [[fallthrough]]; + case AccBlendLevel::Full: + sw_blending |= (ALPHA.A != ALPHA.B) && ((ALPHA.C == 0 && m_vt.m_alpha.max > 128) || (ALPHA.C == 2 && ALPHA.FIX > 128u)); + [[fallthrough]]; + case AccBlendLevel::High: + sw_blending |= (ALPHA.C == 1); + [[fallthrough]]; + case AccBlendLevel::Medium: + // Initial idea was to enable accurate blending for sprite rendering to handle + // correctly post-processing effect. Some games (ZoE) use tons of sprites as particles. + // In order to keep it fast, let's limit it to smaller draw call. 
+ sw_blending |= m_vt.m_primclass == GS_SPRITE_CLASS && m_drawlist.size() < 100; + [[fallthrough]]; + case AccBlendLevel::Basic: + sw_blending |= impossible_or_free_blend; + [[fallthrough]]; + case AccBlendLevel::None: + /*sw_blending |= accumulation_blend*/; + } + } + else + { + if (static_cast(m_sw_blending) >= static_cast(AccBlendLevel::Basic)) + sw_blending |= accumulation_blend || blend_non_recursive; + } + + // Do not run BLEND MIX if sw blending is already present, it's less accurate + if (m_sw_blending != AccBlendLevel::None) + { + blend_mix &= !sw_blending; + sw_blending |= blend_mix; + } + + // Color clip + if (m_env.COLCLAMP.CLAMP == 0) + { + // Safe FBMASK, avoid hitting accumulation mode on 16bit, + // fixes shadows in Superman shadows of Apokolips. + const bool sw_fbmask_colclip = !m_conf.require_one_barrier && m_conf.ps.fbmask; + bool free_colclip; + if (m_dev->Features().texture_barrier) + free_colclip = m_prim_overlap == PRIM_OVERLAP_NO || blend_non_recursive || sw_fbmask_colclip; + else + free_colclip = blend_non_recursive; + GL_DBG("COLCLIP Info (Blending: %d/%d/%d/%d, SW FBMASK: %d, OVERLAP: %d)", + ALPHA.A, ALPHA.B, ALPHA.C, ALPHA.D, sw_fbmask_colclip, m_prim_overlap); + if (free_colclip) + { + // The fastest algo that requires a single pass + GL_INS("COLCLIP Free mode ENABLED"); + m_conf.ps.colclip = 1; + sw_blending = true; + accumulation_blend = false; // disable the HDR algo + blend_mix = false; + } + else if (accumulation_blend || blend_mix) + { + // A fast algo that requires 2 passes + GL_INS("COLCLIP Fast HDR mode ENABLED"); + m_conf.ps.hdr = 1; + sw_blending = true; // Enable sw blending for the HDR algo + } + else if (sw_blending && m_dev->Features().texture_barrier) + { + // A slow algo that could requires several passes (barely used) + GL_INS("COLCLIP SW mode ENABLED"); + m_conf.ps.colclip = 1; + } + else + { + GL_INS("COLCLIP HDR mode ENABLED"); + m_conf.ps.hdr = 1; + } + } + + // Per pixel alpha blending + if (m_env.PABE.PABE) + { 
+ // Breath of Fire Dragon Quarter, Strawberry Shortcake, Super Robot Wars, Cartoon Network Racing. + + if (sw_blending) + { + GL_INS("PABE mode ENABLED"); + m_conf.ps.pabe = 1; + accumulation_blend = false; + blend_mix = false; + } + else if (ALPHA.A == 0 && ALPHA.B == 1 && ALPHA.C == 0 && ALPHA.D == 1) + { + // this works because with PABE alpha blending is on when alpha >= 0x80, but since the pixel shader + // cannot output anything over 0x80 (== 1.0) blending with 0x80 or turning it off gives the same result + blend_index = 0; + } + } + + // GL42 interact very badly with sw blending. GL42 uses the primitiveID to find the primitive + // that write the bad alpha value. Sw blending will force the draw to run primitive by primitive + // (therefore primitiveID will be constant to 1). + // Switch DATE_GL42 with DATE_GL45 in such cases to ensure accuracy. + // No mix of COLCLIP + sw blend + DATE_GL42, neither sw fbmask + DATE_GL42. + // Note: Do the swap after colclip to avoid adding extra conditions. + if (sw_blending && DATE_GL42) + { + GL_PERF("DATE: Swap DATE_GL42 with DATE_GL45"); + m_conf.require_full_barrier = true; + DATE_GL42 = false; + DATE_GL45 = true; + } + + // For stat to optimize accurate option +#if 0 + GL_INS("BLEND_INFO: %d/%d/%d/%d. Clamp:%d. Prim:%d number %d (drawlist %d) (sw %d)", + ALPHA.A, ALPHA.B, ALPHA.C, ALPHA.D, m_env.COLCLAMP.CLAMP, m_vt.m_primclass, m_vertex.next, m_drawlist.size(), sw_blending); +#endif + if (sw_blending) + { + m_conf.ps.blend_a = ALPHA.A; + m_conf.ps.blend_b = ALPHA.B; + m_conf.ps.blend_c = ALPHA.C; + m_conf.ps.blend_d = ALPHA.D; + + if (accumulation_blend) + { + // Keep HW blending to do the addition/subtraction + m_conf.blend = {blend_index, 0, false, true, false}; + if (ALPHA.A == 2) + { + // The blend unit does a reverse subtraction so it means + // the shader must output a positive value. 
+ // Replace 0 - Cs by Cs - 0 + m_conf.ps.blend_a = ALPHA.B; + m_conf.ps.blend_b = 2; + } + // Remove the addition/substraction from the SW blending + m_conf.ps.blend_d = 2; + + // Note accumulation_blend doesn't require a barrier + } + else if (blend_mix) + { + m_conf.blend = {blend_index, ALPHA.FIX, ALPHA.C == 2, false, true}; + + if (blend_mix1) + { + m_conf.ps.blend_a = 0; + m_conf.ps.blend_b = 2; + m_conf.ps.blend_d = 2; + } + else if (blend_mix2) + { + m_conf.ps.blend_a = 0; + m_conf.ps.blend_b = 2; + m_conf.ps.blend_d = 0; + } + else if (blend_mix3) + { + m_conf.ps.blend_a = 2; + m_conf.ps.blend_b = 0; + m_conf.ps.blend_d = 0; + } + } + else + { + // Disable HW blending + m_conf.blend = {}; + + m_conf.require_full_barrier |= !blend_non_recursive; + + // Only BLEND_NO_REC should hit this code path for now + if (!m_dev->Features().texture_barrier) + ASSERT(blend_non_recursive); + } + + // Require the fix alpha vlaue + if (ALPHA.C == 2) + m_conf.cb_ps.alpha_fix = ALPHA.FIX; + } + else + { + m_conf.ps.clr1 = !!(blend_flag & BLEND_C_CLR); + if (m_conf.ps.dfmt == 1 && ALPHA.C == 1) + { + // 24 bits doesn't have an alpha channel so use 1.0f fix factor as equivalent + const u8 hacked_blend_index = blend_index + 3; // +3 <=> +1 on C + m_conf.blend = {hacked_blend_index, 128, true, false, false}; + } + else + { + m_conf.blend = {blend_index, ALPHA.FIX, ALPHA.C == 2, false, false}; + } + } +} + +void GSRendererNew::EmulateTextureSampler(const GSTextureCache::Source* tex) +{ + // Warning fetch the texture PSM format rather than the context format. The latter could have been corrected in the texture cache for depth. + //const GSLocalMemory::psm_t &psm = GSLocalMemory::m_psm[m_context->TEX0.PSM]; + const GSLocalMemory::psm_t& psm = GSLocalMemory::m_psm[tex->m_TEX0.PSM]; + const GSLocalMemory::psm_t& cpsm = psm.pal > 0 ? 
GSLocalMemory::m_psm[m_context->TEX0.CPSM] : psm; + + const u8 wms = m_context->CLAMP.WMS; + const u8 wmt = m_context->CLAMP.WMT; + const bool complex_wms_wmt = !!((wms | wmt) & 2); + + const bool need_mipmap = IsMipMapDraw(); + const bool shader_emulated_sampler = tex->m_palette || cpsm.fmt != 0 || complex_wms_wmt || psm.depth; + const bool trilinear_manual = need_mipmap && m_mipmap == 2; + + bool bilinear = m_vt.IsLinear(); + int trilinear = 0; + bool trilinear_auto = false; + switch (UserHacks_tri_filter) + { + case TriFiltering::Forced: + trilinear = static_cast(GS_MIN_FILTER::Linear_Mipmap_Linear); + trilinear_auto = m_mipmap != 2; + break; + + case TriFiltering::PS2: + if (need_mipmap && m_mipmap != 2) + { + trilinear = m_context->TEX1.MMIN; + trilinear_auto = true; + } + break; + + case TriFiltering::None: + default: + break; + } + + // 1 and 0 are equivalent + m_conf.ps.wms = (wms & 2) ? wms : 0; + m_conf.ps.wmt = (wmt & 2) ? wmt : 0; + + // Depth + bilinear filtering isn't done yet (And I'm not sure we need it anyway but a game will prove me wrong) + // So of course, GTA set the linear mode, but sampling is done at texel center so it is equivalent to nearest sampling + ASSERT(!(psm.depth && m_vt.IsLinear())); + + // Performance note: + // 1/ Don't set 0 as it is the default value + // 2/ Only keep aem when it is useful (avoid useless shader permutation) + if (m_conf.ps.shuffle) + { + // Force a 32 bits access (normally shuffle is done on 16 bits) + // m_ps_sel.tex_fmt = 0; // removed as an optimization + m_conf.ps.aem = m_env.TEXA.AEM; + ASSERT(tex->m_target); + + // Require a float conversion if the texure is a depth otherwise uses Integral scaling + if (psm.depth) + { + m_conf.ps.depth_fmt = (tex->m_texture->GetType() != GSTexture::Type::DepthStencil) ? 
3 : 1; + } + + // Shuffle is a 16 bits format, so aem is always required + m_conf.cb_ps.ta0 = m_env.TEXA.TA0; + m_conf.cb_ps.ta1 = m_env.TEXA.TA1; + + // The purpose of texture shuffle is to move color channel. Extra interpolation is likely a bad idea. + bilinear &= m_vt.IsLinear(); + + GSVector4 half_pixel = RealignTargetTextureCoordinate(tex); + m_conf.cb_vs.texture_offset = GSVector2(half_pixel.x, half_pixel.y); + } + else if (tex->m_target) + { + // Use an old target. AEM and index aren't resolved it must be done + // on the GPU + + // Select the 32/24/16 bits color (AEM) + m_conf.ps.aem_fmt = cpsm.fmt; + m_conf.ps.aem = m_env.TEXA.AEM; + + // Don't upload AEM if format is 32 bits + if (cpsm.fmt) + { + m_conf.cb_ps.ta0 = m_env.TEXA.TA0; + m_conf.cb_ps.ta1 = m_env.TEXA.TA1; + } + + // Select the index format + if (tex->m_palette) + { + // FIXME Potentially improve fmt field in GSLocalMemory + if (m_context->TEX0.PSM == PSM_PSMT4HL) + m_conf.ps.pal_fmt = 1; + else if (m_context->TEX0.PSM == PSM_PSMT4HH) + m_conf.ps.pal_fmt = 2; + else + m_conf.ps.pal_fmt = 3; + + // Alpha channel of the RT is reinterpreted as an index. Star + // Ocean 3 uses it to emulate a stencil buffer. It is a very + // bad idea to force bilinear filtering on it. + bilinear &= m_vt.IsLinear(); + } + + // Depth format + if (tex->m_texture->GetType() == GSTexture::Type::DepthStencil) + { + // Require a float conversion if the texure is a depth format + m_conf.ps.depth_fmt = (psm.bpp == 16) ? 2 : 1; + + // Don't force interpolation on depth format + bilinear &= m_vt.IsLinear(); + } + else if (psm.depth) + { + // Use Integral scaling + m_conf.ps.depth_fmt = 3; + + // Don't force interpolation on depth format + bilinear &= m_vt.IsLinear(); + } + + GSVector4 half_pixel = RealignTargetTextureCoordinate(tex); + m_conf.cb_vs.texture_offset = GSVector2(half_pixel.x, half_pixel.y); + } + else if (tex->m_palette) + { + // Use a standard 8 bits texture. 
AEM is already done on the CLUT + // Therefore you only need to set the index + // m_conf.ps.aem = 0; // removed as an optimization + + // Note 4 bits indexes are converted to 8 bits + m_conf.ps.pal_fmt = 3; + } + else + { + // Standard texture. Both index and AEM expansion were already done by the CPU. + // m_conf.ps.tex_fmt = 0; // removed as an optimization + // m_conf.ps.aem = 0; // removed as an optimization + } + + if (m_context->TEX0.TFX == TFX_MODULATE && m_vt.m_eq.rgba == 0xFFFF && m_vt.m_min.c.eq(GSVector4i(128))) + { + // Micro optimization that reduces GPU load (removes 5 instructions on the FS program) + m_conf.ps.tfx = TFX_DECAL; + } + else + { + m_conf.ps.tfx = m_context->TEX0.TFX; + } + + m_conf.ps.tcc = m_context->TEX0.TCC; + + m_conf.ps.ltf = bilinear && shader_emulated_sampler; + m_conf.ps.point_sampler = m_dev->Features().broken_point_sampler && (!bilinear || shader_emulated_sampler); + + const int w = tex->m_texture->GetWidth(); + const int h = tex->m_texture->GetHeight(); + + const int tw = (int)(1 << m_context->TEX0.TW); + const int th = (int)(1 << m_context->TEX0.TH); + + const GSVector4 WH(tw, th, w, h); + + m_conf.ps.fst = !!PRIM->FST; + + m_conf.cb_ps.texture_size = WH; + m_conf.cb_ps.half_texel = GSVector4(-0.5f, 0.5f).xxyy() / WH.zwzw(); + if (complex_wms_wmt) + { + m_conf.cb_ps.umsk = m_context->CLAMP.MINU; + m_conf.cb_ps.vmsk = m_context->CLAMP.MINV; + m_conf.cb_ps.ufix = m_context->CLAMP.MAXU; + m_conf.cb_ps.vfix = m_context->CLAMP.MAXV; + m_conf.cb_ps.uv_min_max = GSVector4(GSVector4i::loadl(&m_conf.cb_ps.uv_msk_fix).u16to32()) / WH.xyxy(); + } + else if (trilinear_manual) + { + // Reuse uv_min_max for mipmap parameter to avoid an extension of the UBO + m_conf.cb_ps.uv_min_max.x = (float)m_context->TEX1.K / 16.0f; + m_conf.cb_ps.uv_min_max.y = float(1 << m_context->TEX1.L); + m_conf.cb_ps.uv_min_max.z = float(m_lod.x); // Offset because first layer is m_lod, dunno if we can do better + m_conf.cb_ps.uv_min_max.w = float(m_lod.y); + } 
+ else if (trilinear_auto) + { + tex->m_texture->GenerateMipmap(); + } + + // TC Offset Hack + m_conf.ps.tcoffsethack = m_userhacks_tcoffset; + GSVector4 tc_oh_ts = GSVector4(1 / 16.0f, 1 / 16.0f, m_userhacks_tcoffset_x, m_userhacks_tcoffset_y) / WH.xyxy(); + m_conf.cb_vs.texture_scale = GSVector2(tc_oh_ts.x, tc_oh_ts.y); + m_conf.cb_ps.tc_offset = GSVector2(tc_oh_ts.z, tc_oh_ts.y); + + // Must be done after all coordinates math + if (m_context->HasFixedTEX0() && !PRIM->FST) + { + m_conf.ps.invalid_tex0 = 1; + // Use invalid size to denormalize ST coordinate + m_conf.cb_ps.texture_size.x = (float)(1 << m_context->stack.TEX0.TW); + m_conf.cb_ps.texture_size.y = (float)(1 << m_context->stack.TEX0.TH); + + // We can't handle m_target with invalid_tex0 atm due to upscaling + ASSERT(!tex->m_target); + } + + // Only enable clamping in CLAMP mode. REGION_CLAMP will be done manually in the shader + m_conf.sampler.tau = (wms != CLAMP_CLAMP); + m_conf.sampler.tav = (wmt != CLAMP_CLAMP); + if (shader_emulated_sampler) + { + m_conf.sampler.biln = 0; + m_conf.sampler.aniso = 0; + m_conf.sampler.triln = 0; + } + else + { + m_conf.sampler.biln = bilinear; + // Aniso filtering doesn't work with textureLod so use texture (automatic_lod) instead. + // Enable aniso only for triangles. Sprites are flat so aniso is likely useless (it would save perf for others primitives). 
+ const bool anisotropic = m_vt.m_primclass == GS_TRIANGLE_CLASS && !trilinear_manual; + m_conf.sampler.aniso = anisotropic; + m_conf.sampler.triln = trilinear; + if (trilinear_manual) + { + m_conf.ps.manual_lod = 1; + } + else if (trilinear_auto || anisotropic) + { + m_conf.ps.automatic_lod = 1; + } + } + + m_conf.tex = tex->m_texture; + m_conf.pal = tex->m_palette; +} + +GSRendererNew::PRIM_OVERLAP GSRendererNew::PrimitiveOverlap() +{ + // Either 1 triangle or 1 line or 3 POINTs + // It is bad for the POINTs but low probability that they overlap + if (m_vertex.next < 4) + return PRIM_OVERLAP_NO; + + if (m_vt.m_primclass != GS_SPRITE_CLASS) + return PRIM_OVERLAP_UNKNOW; // maybe, maybe not + + // Check intersection of sprite primitive only + const size_t count = m_vertex.next; + PRIM_OVERLAP overlap = PRIM_OVERLAP_NO; + const GSVertex* v = m_vertex.buff; + + m_drawlist.clear(); + size_t i = 0; + while (i < count) + { + // In order to speed up comparison a bounding-box is accumulated. It removes a + // loop so code is much faster (check game virtua fighter). Besides it allow to check + // properly the Y order. + + // .x = min(v[i].XYZ.X, v[i+1].XYZ.X) + // .y = min(v[i].XYZ.Y, v[i+1].XYZ.Y) + // .z = max(v[i].XYZ.X, v[i+1].XYZ.X) + // .w = max(v[i].XYZ.Y, v[i+1].XYZ.Y) + GSVector4i all = GSVector4i(v[i].m[1]).upl16(GSVector4i(v[i + 1].m[1])).upl16().xzyw(); + all = all.xyxy().blend(all.zwzw(), all > all.zwxy()); + + size_t j = i + 2; + while (j < count) + { + GSVector4i sprite = GSVector4i(v[j].m[1]).upl16(GSVector4i(v[j + 1].m[1])).upl16().xzyw(); + sprite = sprite.xyxy().blend(sprite.zwzw(), sprite > sprite.zwxy()); + + // Be sure to get vertex in good order, otherwise .r* function doesn't + // work as expected. 
			ASSERT(sprite.x <= sprite.z);
			ASSERT(sprite.y <= sprite.w);
			ASSERT(all.x <= all.z);
			ASSERT(all.y <= all.w);

			if (all.rintersect(sprite).rempty())
			{
				// No intersection: grow the accumulated box and keep extending the run.
				all = all.runion_ordered(sprite);
			}
			else
			{
				// This sprite touches the accumulated box: the run ends here and a new
				// one starts at j.
				overlap = PRIM_OVERLAP_YES;
				break;
			}
			j += 2;
		}
		m_drawlist.push_back((j - i) >> 1); // Sprite count
		i = j;
	}

#if 0
	// Old algo: less constraint but O(n^2) instead of O(n) as above

	// You have no guarantee on the sprite order, first vertex can be either top-left or bottom-left
	// There is a high probability that the draw call will uses same ordering for all vertices.
	// In order to keep a small performance impact only the first sprite will be checked
	//
	// Some safe-guard will be added in the outer-loop to avoid corruption with a limited perf impact
	if (v[1].XYZ.Y < v[0].XYZ.Y) {
		// First vertex is Top-Left
		for(size_t i = 0; i < count; i += 2) {
			if (v[i+1].XYZ.Y > v[i].XYZ.Y) {
				return PRIM_OVERLAP_UNKNOW;
			}
			GSVector4i vi(v[i].XYZ.X, v[i+1].XYZ.Y, v[i+1].XYZ.X, v[i].XYZ.Y);
			for (size_t j = i+2; j < count; j += 2) {
				GSVector4i vj(v[j].XYZ.X, v[j+1].XYZ.Y, v[j+1].XYZ.X, v[j].XYZ.Y);
				GSVector4i inter = vi.rintersect(vj);
				if (!inter.rempty()) {
					return PRIM_OVERLAP_YES;
				}
			}
		}
	} else {
		// First vertex is Bottom-Left
		for(size_t i = 0; i < count; i += 2) {
			if (v[i+1].XYZ.Y < v[i].XYZ.Y) {
				return PRIM_OVERLAP_UNKNOW;
			}
			GSVector4i vi(v[i].XYZ.X, v[i].XYZ.Y, v[i+1].XYZ.X, v[i+1].XYZ.Y);
			for (size_t j = i+2; j < count; j += 2) {
				GSVector4i vj(v[j].XYZ.X, v[j].XYZ.Y, v[j+1].XYZ.X, v[j+1].XYZ.Y);
				GSVector4i inter = vi.rintersect(vj);
				if (!inter.rempty()) {
					return PRIM_OVERLAP_YES;
				}
			}
		}
	}
#endif

	// fprintf(stderr, "%d: Yes, code can be optimized (draw of %d vertices)\n", s_n, count);
	return overlap;
}

// Configures the alpha test of one pass from the GS TEST register.
//
// cb:     pixel-shader constants; receives the reference value (aref).
// ps:     pixel-shader selector; receives the comparison mode (atst).
// pass_2: when true the inverse comparison is configured, i.e. the pass selects the
//         pixels that failed the GS alpha test.
//
// Does nothing (leaves cb and ps untouched) when TEST.ATE is 0.
// The modes written are 0 (no test), 1 for LESS/LEQUAL, 2 for GEQUAL/GREATER,
// 3 for EQUAL and 4 for NOTEQUAL; LEQUAL and GREATER reuse modes 1 and 2 with AREF + 1.
// NOTE(review): presumably mode 1 is "alpha < aref" and mode 2 "alpha >= aref" in the
// shader - confirm against the shader source.
void GSRendererNew::EmulateATST(GSHWDrawConfig::PSConstantBuffer& cb, GSHWDrawConfig::PSSelector& ps, bool pass_2)
{
	// Indexed by the ATST value; yields the complementary comparison (the initialiser
	// implies the order NEVER, ALWAYS, LESS, LEQUAL, EQUAL, GEQUAL, GREATER, NOTEQUAL).
	static const u32 inverted_atst[] = {ATST_ALWAYS, ATST_NEVER, ATST_GEQUAL, ATST_GREATER, ATST_NOTEQUAL, ATST_LESS, ATST_LEQUAL, ATST_EQUAL};

	if (!m_context->TEST.ATE)
		return;

	// Check for pass 2, otherwise do pass 1.
	const int atst = pass_2 ? inverted_atst[m_context->TEST.ATST] : m_context->TEST.ATST;


	switch (atst)
	{
		case ATST_LESS:
			cb.aref = m_context->TEST.AREF;
			ps.atst = 1;
			break;
		case ATST_LEQUAL:
			cb.aref = m_context->TEST.AREF + 1;
			ps.atst = 1;
			break;
		case ATST_GEQUAL:
			cb.aref = m_context->TEST.AREF;
			ps.atst = 2;
			break;
		case ATST_GREATER:
			cb.aref = m_context->TEST.AREF + 1;
			ps.atst = 2;
			break;
		case ATST_EQUAL:
			cb.aref = m_context->TEST.AREF;
			ps.atst = 3;
			break;
		case ATST_NOTEQUAL:
			cb.aref = m_context->TEST.AREF;
			ps.atst = 4;
			break;
		case ATST_NEVER: // Draw won't be done so no need to implement it in shader
		case ATST_ALWAYS:
		default:
			ps.atst = 0;
			break;
	}
}

// Zeroes the whole draw configuration while keeping the current contents of the
// vertex- and pixel-shader constant buffers.
void GSRendererNew::ResetStates()
{
	// Save the two constant buffers, wipe everything, then put them back.
	GSHWDrawConfig::VSConstantBuffer vs_tmp = m_conf.cb_vs;
	GSHWDrawConfig::PSConstantBuffer ps_tmp = m_conf.cb_ps;
	memset(&m_conf, 0, sizeof(m_conf));
	m_conf.cb_vs = vs_tmp;
	m_conf.cb_ps = ps_tmp;
}

void GSRendererNew::DrawPrims(GSTexture* rt, GSTexture* ds, GSTextureCache::Source* tex)
{
#ifdef ENABLE_OGL_DEBUG
	GSVector4i area_out = GSVector4i(m_vt.m_min.p.xyxy(m_vt.m_max.p)).rintersect(GSVector4i(m_context->scissor.in));
	GSVector4i area_in = GSVector4i(m_vt.m_min.t.xyxy(m_vt.m_max.t));

	GL_PUSH("GL Draw from %d (area %d,%d => %d,%d) in %d (Depth %d) (area %d,%d => %d,%d)",
		tex && tex->m_texture ? tex->m_texture->GetID() : -1,
		area_in.x, area_in.y, area_in.z, area_in.w,
		rt ? rt->GetID() : -1, ds ? ds->GetID() : -1,
		area_out.x, area_out.y, area_out.z, area_out.w);
#endif

	const GSVector2i& rtsize = ds ? ds->GetSize() : rt->GetSize();
	const GSVector2& rtscale = ds ?
ds->GetScale() : rt->GetScale(); + + const bool DATE = m_context->TEST.DATE && m_context->FRAME.PSM != PSM_PSMCT24; + bool DATE_GL42 = false; + bool DATE_GL45 = false; + bool DATE_one = false; + + const bool ate_first_pass = m_context->TEST.DoFirstPass(); + const bool ate_second_pass = m_context->TEST.DoSecondPass(); + + ResetStates(); + m_conf.cb_vs.texture_offset = GSVector2(0, 0); + + ASSERT(m_dev != NULL); + + // HLE implementation of the channel selection effect + // + // Warning it must be done at the begining because it will change the + // vertex list (it will interact with PrimitiveOverlap and accurate + // blending) + EmulateChannelShuffle(&rt, tex); + + // Upscaling hack to avoid various line/grid issues + MergeSprite(tex); + + // Always check if primitive overlap as it is used in plenty of effects. + if (m_dev->Features().texture_barrier) + m_prim_overlap = PrimitiveOverlap(); + else + m_prim_overlap = PRIM_OVERLAP_UNKNOW; // Prim overlap check is useless without texture barrier + + // Detect framebuffer read that will need special handling + if (m_dev->Features().texture_barrier && (m_context->FRAME.Block() == m_context->TEX0.TBP0) && PRIM->TME && m_sw_blending != AccBlendLevel::None) + { + if ((m_context->FRAME.FBMSK == 0x00FFFFFF) && (m_vt.m_primclass == GS_TRIANGLE_CLASS)) + { + // This pattern is used by several games to emulate a stencil (shadow) + // Ratchet & Clank, Jak do alpha integer multiplication (tfx) which is mostly equivalent to +1/-1 + // Tri-Ace (Star Ocean 3/RadiataStories/VP2) uses a palette to handle the +1/-1 + GL_DBG("Source and Target are the same! Let's sample the framebuffer"); + m_conf.ps.tex_is_fb = 1; + m_conf.require_full_barrier = true; + } + else if (m_prim_overlap != PRIM_OVERLAP_NO) + { + // Note: It is fine if the texture fits in a single GS page. First access will cache + // the page in the GS texture buffer. 
+ GL_INS("ERROR: Source and Target are the same!"); + } + } + + EmulateTextureShuffleAndFbmask(); + + // DATE: selection of the algorithm. Must be done before blending because GL42 is not compatible with blending + if (DATE) + { + if (m_prim_overlap == PRIM_OVERLAP_NO || m_texture_shuffle) + { + // It is way too complex to emulate texture shuffle with DATE. So just use + // the slow but accurate algo + GL_PERF("DATE: With %s", m_texture_shuffle ? "texture shuffle" : "no prim overlap"); + if (m_dev->Features().texture_barrier) + { + m_conf.require_full_barrier = true; + DATE_GL45 = true; + } + } + else if (m_conf.colormask.wa && !m_context->TEST.ATE) + { + // Performance note: check alpha range with GetAlphaMinMax() + // Note: all my dump are already above 120fps, but it seems to reduce GPU load + // with big upscaling + GetAlphaMinMax(); + if (m_context->TEST.DATM && m_vt.m_alpha.max < 128) + { + // Only first pixel (write 0) will pass (alpha is 1) + GL_PERF("DATE: Fast with alpha %d-%d", m_vt.m_alpha.min, m_vt.m_alpha.max); + DATE_one = true; + } + else if (!m_context->TEST.DATM && m_vt.m_alpha.min >= 128) + { + // Only first pixel (write 1) will pass (alpha is 0) + GL_PERF("DATE: Fast with alpha %d-%d", m_vt.m_alpha.min, m_vt.m_alpha.max); + DATE_one = true; + } + else if ((m_vt.m_primclass == GS_SPRITE_CLASS && m_drawlist.size() < 50) || (m_index.tail < 100)) + { + // texture barrier will split the draw call into n draw call. It is very efficient for + // few primitive draws. Otherwise it sucks. + GL_PERF("DATE: Slow with alpha %d-%d", m_vt.m_alpha.min, m_vt.m_alpha.max); + if (m_dev->Features().texture_barrier) + { + m_conf.require_full_barrier = true; + DATE_GL45 = true; + } + } + else if (m_accurate_date) + { + // Note: Fast level (DATE_one) was removed as it's less accurate. 
+ GL_PERF("DATE: Full AD with alpha %d-%d", m_vt.m_alpha.min, m_vt.m_alpha.max); + if (m_dev->Features().image_load_store) + { + DATE_GL42 = true; + } + else if (m_dev->Features().texture_barrier) + { + m_conf.require_full_barrier = true; + DATE_GL45 = true; + } + else + { + DATE_one = true; + } + } + } + else if (!m_conf.colormask.wa && !m_context->TEST.ATE) + { + // TODO: is it legal ? Likely but it need to be tested carefully + // DATE_GL45 = true; + // m_conf.require_one_barrier = true; << replace it with a cheap barrier + } + + // Will save my life ! + ASSERT(!(DATE_GL45 && DATE_one)); + ASSERT(!(DATE_GL42 && DATE_one)); + ASSERT(!(DATE_GL42 && DATE_GL45)); + } + + // Blend + + if (!IsOpaque() && rt) + { + EmulateBlending(DATE_GL42, DATE_GL45); + } + else + { + m_conf.blend = {}; // No blending please + } + + if (m_conf.ps.dfmt == 1) + { + // Disable writing of the alpha channel + m_conf.colormask.wa = 0; + } + + // DATE setup, no DATE_GL45 please + + if (!DATE) + m_conf.destination_alpha = GSHWDrawConfig::DestinationAlphaMode::Off; + else if (DATE_one) + m_conf.destination_alpha = GSHWDrawConfig::DestinationAlphaMode::StencilOne; + else if (DATE_GL42) + m_conf.destination_alpha = GSHWDrawConfig::DestinationAlphaMode::PrimIDTracking; + else if (DATE_GL45) + m_conf.destination_alpha = GSHWDrawConfig::DestinationAlphaMode::Full; + else + m_conf.destination_alpha = GSHWDrawConfig::DestinationAlphaMode::Stencil; + + m_conf.datm = m_context->TEST.DATM; + + // om + + EmulateZbuffer(); // will update VS depth mask + + // vs + + m_conf.vs.tme = PRIM->TME; + m_conf.vs.fst = PRIM->FST; + + // FIXME D3D11 and GL support half pixel center. Code could be easier!!! 
+ const float sx = 2.0f * rtscale.x / (rtsize.x << 4); + const float sy = 2.0f * rtscale.y / (rtsize.y << 4); + const float ox = (float)(int)m_context->XYOFFSET.OFX; + const float oy = (float)(int)m_context->XYOFFSET.OFY; + float ox2 = -1.0f / rtsize.x; + float oy2 = -1.0f / rtsize.y; + + //This hack subtracts around half a pixel from OFX and OFY. + // + //The resulting shifted output aligns better with common blending / corona / blurring effects, + //but introduces a few bad pixels on the edges. + + if (rt && rt->LikelyOffset && m_userHacks_HPO == 1) + { + ox2 *= rt->OffsetHack_modx; + oy2 *= rt->OffsetHack_mody; + } + + m_conf.cb_vs.vertex_scale = GSVector2(sx, sy); + m_conf.cb_vs.vertex_offset = GSVector2(ox * sx + ox2 + 1, oy * sy + oy2 + 1); + // END of FIXME + + // GS_SPRITE_CLASS are already flat (either by CPU or the GS) + m_conf.ps.iip = (m_vt.m_primclass == GS_SPRITE_CLASS) ? 1 : PRIM->IIP; + m_conf.gs.iip = m_conf.ps.iip; + + if (DATE_GL45) + { + m_conf.ps.date = 5 + m_context->TEST.DATM; + } + else if (DATE_one) + { + if (m_dev->Features().texture_barrier) + { + m_conf.require_one_barrier = true; + m_conf.ps.date = 5 + m_context->TEST.DATM; + } + m_conf.depth.date = 1; + m_conf.depth.date_one = 1; + } + else if (DATE) + { + if (DATE_GL42) + m_conf.ps.date = 1 + m_context->TEST.DATM; + else + m_conf.depth.date = 1; + } + + m_conf.ps.fba = m_context->FBA.FBA; + m_conf.ps.dither = m_dithering > 0 && m_conf.ps.dfmt == 2 && m_env.DTHE.DTHE; + + if (m_conf.ps.dither) + { + GL_DBG("DITHERING mode ENABLED (%d)", m_dithering); + + m_conf.ps.dither = m_dithering; + m_conf.cb_ps.dither_matrix.U64 = m_env.DIMX.U64 & 0x7777777777777777ull; + } + + if (PRIM->FGE) + { + m_conf.ps.fog = 1; + + m_conf.cb_ps.fog_color[0] = m_env.FOGCOL.FCR; + m_conf.cb_ps.fog_color[1] = m_env.FOGCOL.FCG; + m_conf.cb_ps.fog_color[2] = m_env.FOGCOL.FCB; + } + + // Warning must be done after EmulateZbuffer + // Depth test is always true so it can be executed in 2 passes (no order required) 
unlike color. + // The idea is to compute first the color which is independent of the alpha test. And then do a 2nd + // pass to handle the depth based on the alpha test. + bool ate_RGBA_then_Z = false; + bool ate_RGB_then_ZA = false; + if (ate_first_pass & ate_second_pass) + { + GL_DBG("Complex Alpha Test"); + const bool commutative_depth = (m_conf.depth.ztst == ZTST_GEQUAL && m_vt.m_eq.z) || (m_conf.depth.ztst == ZTST_ALWAYS); + const bool commutative_alpha = (m_context->ALPHA.C != 1); // when either Alpha Src or a constant + + ate_RGBA_then_Z = (m_context->TEST.AFAIL == AFAIL_FB_ONLY) & commutative_depth; + ate_RGB_then_ZA = (m_context->TEST.AFAIL == AFAIL_RGB_ONLY) & commutative_depth & commutative_alpha; + } + + if (ate_RGBA_then_Z) + { + GL_DBG("Alternate ATE handling: ate_RGBA_then_Z"); + // Render all color but don't update depth + // ATE is disabled here + m_conf.depth.zwe = false; + } + else if (ate_RGB_then_ZA) + { + GL_DBG("Alternate ATE handling: ate_RGB_then_ZA"); + // Render RGB color but don't update depth/alpha + // ATE is disabled here + m_conf.depth.zwe = false; + m_conf.colormask.wa = false; + } + else + { + EmulateATST(m_conf.cb_ps, m_conf.ps, false); + } + + if (tex) + { + EmulateTextureSampler(tex); + } + else + { + m_conf.ps.tfx = 4; + } + + if (m_game.title == CRC::ICO) + { + const GSVertex* v = &m_vertex.buff[0]; + const GSVideoMode mode = GetVideoMode(); + if (tex && m_vt.m_primclass == GS_SPRITE_CLASS && m_vertex.next == 2 && PRIM->ABE && // Blend texture + ((v[1].U == 8200 && v[1].V == 7176 && mode == GSVideoMode::NTSC) || // at display resolution 512x448 + (v[1].U == 8200 && v[1].V == 8200 && mode == GSVideoMode::PAL)) && // at display resolution 512x512 + tex->m_TEX0.PSM == PSM_PSMT8H) // i.e. read the alpha channel of a 32 bits texture + { + // Note potentially we can limit to TBP0:0x2800 + + // Depth buffer was moved so GS will invalide it which means a + // downscale. ICO uses the MSB depth bits as the texture alpha + // channel. 
However this depth of field effect requires + // texel:pixel mapping accuracy. + // + // Use an HLE shader to sample depth directly as the alpha channel + GL_INS("ICO sample depth as alpha"); + m_conf.require_full_barrier = true; + // Extract the depth as palette index + m_conf.ps.depth_fmt = 1; + m_conf.ps.channel = ChannelFetch_BLUE; + m_conf.raw_tex = ds; + + // We need the palette to convert the depth to the correct alpha value. + if (!tex->m_palette) + { + const u16 pal = GSLocalMemory::m_psm[tex->m_TEX0.PSM].pal; + m_tc->AttachPaletteToSource(tex, pal, true); + m_conf.pal = tex->m_palette; + } + } + } + + // rs + const GSVector4& hacked_scissor = m_channel_shuffle ? GSVector4(0, 0, 1024, 1024) : m_context->scissor.in; + const GSVector4i scissor = GSVector4i(GSVector4(rtscale).xyxy() * hacked_scissor).rintersect(GSVector4i(rtsize).zwxy()); + + const GSVector4i commitRect = ComputeBoundingBox(rtscale, rtsize); + m_conf.scissor = (DATE && !DATE_GL45) ? scissor.rintersect(commitRect) : scissor; + + SetupIA(sx, sy); + + if (rt) + rt->CommitRegion(GSVector2i(commitRect.z, commitRect.w)); + + if (ds) + ds->CommitRegion(GSVector2i(commitRect.z, commitRect.w)); + + m_conf.alpha_second_pass.enable = ate_second_pass; + + if (ate_second_pass) + { + ASSERT(!m_env.PABE.PABE); + memcpy(&m_conf.alpha_second_pass.cb_ps, &m_conf.cb_ps, sizeof(m_conf.cb_ps)); + memcpy(&m_conf.alpha_second_pass.ps, &m_conf.ps, sizeof(m_conf.ps)); + memcpy(&m_conf.alpha_second_pass.colormask, &m_conf.colormask, sizeof(m_conf.colormask)); + memcpy(&m_conf.alpha_second_pass.depth, &m_conf.depth, sizeof(m_conf.depth)); + + if (ate_RGBA_then_Z | ate_RGB_then_ZA) + { + // Enable ATE as first pass to update the depth + // of pixels that passed the alpha test + EmulateATST(m_conf.alpha_second_pass.cb_ps, m_conf.alpha_second_pass.ps, false); + } + else + { + // second pass will process the pixels that failed + // the alpha test + EmulateATST(m_conf.alpha_second_pass.cb_ps, m_conf.alpha_second_pass.ps, 
true); + } + + + bool z = m_conf.depth.zwe; + bool r = m_conf.colormask.wr; + bool g = m_conf.colormask.wg; + bool b = m_conf.colormask.wb; + bool a = m_conf.colormask.wa; + + switch (m_context->TEST.AFAIL) + { + case AFAIL_KEEP: z = r = g = b = a = false; break; // none + case AFAIL_FB_ONLY: z = false; break; // rgba + case AFAIL_ZB_ONLY: r = g = b = a = false; break; // z + case AFAIL_RGB_ONLY: z = a = false; break; // rgb + default: __assume(0); + } + + // Depth test should be disabled when depth writes are masked and similarly, Alpha test must be disabled + // when writes to all of the alpha bits in the Framebuffer are masked. + if (ate_RGBA_then_Z) + { + z = !m_context->ZBUF.ZMSK; + r = g = b = a = false; + } + else if (ate_RGB_then_ZA) + { + z = !m_context->ZBUF.ZMSK; + a = (m_context->FRAME.FBMSK & 0xFF000000) != 0xFF000000; + r = g = b = false; + } + + if (z || r || g || b || a) + { + m_conf.alpha_second_pass.depth.zwe = z; + m_conf.alpha_second_pass.colormask.wr = r; + m_conf.alpha_second_pass.colormask.wg = g; + m_conf.alpha_second_pass.colormask.wb = b; + m_conf.alpha_second_pass.colormask.wa = a; + } + else + { + m_conf.alpha_second_pass.enable = false; + } + } + + if (!ate_first_pass) + { + if (!m_conf.alpha_second_pass.enable) + return; + + // RenderHW always renders first pass, replace first pass with second + memcpy(&m_conf.cb_ps, &m_conf.alpha_second_pass.cb_ps, sizeof(m_conf.cb_ps)); + memcpy(&m_conf.ps, &m_conf.alpha_second_pass.ps, sizeof(m_conf.ps)); + memcpy(&m_conf.colormask, &m_conf.alpha_second_pass.colormask, sizeof(m_conf.colormask)); + memcpy(&m_conf.depth, &m_conf.alpha_second_pass.depth, sizeof(m_conf.depth)); + m_conf.alpha_second_pass.enable = false; + } + + if (m_conf.require_full_barrier && m_prim_overlap == PRIM_OVERLAP_NO) + { + m_conf.require_full_barrier = false; + m_conf.require_one_barrier = true; + } + + if (m_conf.require_full_barrier && m_vt.m_primclass == GS_SPRITE_CLASS) + { + m_conf.drawlist = &m_drawlist; + } + + 
m_conf.rt = rt; + m_conf.ds = ds; + m_dev->RenderHW(m_conf); +} + +bool GSRendererNew::IsDummyTexture() const +{ + // Texture is actually the frame buffer. Stencil emulation to compute shadow (Jak series/tri-ace game) + // Will hit the "m_ps_sel.tex_is_fb = 1" path in the draw + return m_dev->Features().texture_barrier && (m_context->FRAME.Block() == m_context->TEX0.TBP0) && PRIM->TME && m_sw_blending != AccBlendLevel::None && m_vt.m_primclass == GS_TRIANGLE_CLASS && (m_context->FRAME.FBMSK == 0x00FFFFFF); +} diff --git a/pcsx2/GS/Renderers/HW/GSRendererNew.h b/pcsx2/GS/Renderers/HW/GSRendererNew.h new file mode 100644 index 0000000000..32bded65cc --- /dev/null +++ b/pcsx2/GS/Renderers/HW/GSRendererNew.h @@ -0,0 +1,58 @@ +/* PCSX2 - PS2 Emulator for PCs + * Copyright (C) 2002-2021 PCSX2 Dev Team + * + * PCSX2 is free software: you can redistribute it and/or modify it under the terms + * of the GNU Lesser General Public License as published by the Free Software Found- + * ation, either version 3 of the License, or (at your option) any later version. + * + * PCSX2 is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; + * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR + * PURPOSE. See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along with PCSX2. + * If not, see . 
+ */ + +#pragma once + +#include "GS/Renderers/HW/GSRendererHW.h" +#include "GS/Renderers/HW/GSVertexHW.h" + +class GSRendererNew final : public GSRendererHW +{ + enum PRIM_OVERLAP + { + PRIM_OVERLAP_UNKNOW, + PRIM_OVERLAP_YES, + PRIM_OVERLAP_NO + }; + +private: + PRIM_OVERLAP m_prim_overlap; + std::vector m_drawlist; + + TriFiltering UserHacks_tri_filter; + + GSHWDrawConfig m_conf; + +private: + inline void ResetStates(); + inline void SetupIA(const float& sx, const float& sy); + inline void EmulateTextureShuffleAndFbmask(); + inline void EmulateChannelShuffle(GSTexture** rt, const GSTextureCache::Source* tex); + inline void EmulateBlending(bool& DATE_GL42, bool& DATE_GL45); + inline void EmulateTextureSampler(const GSTextureCache::Source* tex); + inline void EmulateZbuffer(); + inline void EmulateATST(GSHWDrawConfig::PSConstantBuffer& cb, GSHWDrawConfig::PSSelector& ps, bool pass_2); + +public: + GSRendererNew(); + ~GSRendererNew() override {} + + bool CreateDevice(GSDevice* dev, const WindowInfo& wi) override; + void DrawPrims(GSTexture* rt, GSTexture* ds, GSTextureCache::Source* tex) override; + + PRIM_OVERLAP PrimitiveOverlap(); + + bool IsDummyTexture() const override; +}; diff --git a/pcsx2/GS/Renderers/OpenGL/GSDeviceOGL.cpp b/pcsx2/GS/Renderers/OpenGL/GSDeviceOGL.cpp index 1d3708c3e0..fd74a4e918 100644 --- a/pcsx2/GS/Renderers/OpenGL/GSDeviceOGL.cpp +++ b/pcsx2/GS/Renderers/OpenGL/GSDeviceOGL.cpp @@ -83,6 +83,11 @@ GSDeviceOGL::GSDeviceOGL() m_debug_gl_call = theApp.GetConfigB("debug_opengl"); m_disable_hw_gl_draw = theApp.GetConfigB("disable_hw_gl_draw"); + + m_features.broken_point_sampler = GLLoader::vendor_id_amd; + m_features.geometry_shader = GLLoader::found_geometry_shader; + m_features.image_load_store = GLLoader::found_GL_ARB_shader_image_load_store && GLLoader::found_GL_ARB_clear_texture; + m_features.texture_barrier = true; } GSDeviceOGL::~GSDeviceOGL() @@ -2012,6 +2017,270 @@ void GSDeviceOGL::SetupOM(OMDepthStencilSelector dssel) 
OMSetDepthStencilState(m_om_dss[dssel]); } +static GSDeviceOGL::VSConstantBuffer convertCB(const GSHWDrawConfig::VSConstantBuffer& cb) +{ + GSDeviceOGL::VSConstantBuffer out; + out.Vertex_Scale_Offset = GSVector4::loadl(&cb.vertex_scale).upld(GSVector4::loadl(&cb.vertex_offset)); + out.Texture_Scale_Offset = GSVector4::loadl(&cb.texture_scale).upld(GSVector4::loadl(&cb.texture_offset)); + out.PointSize = cb.point_size; + out.MaxDepth = cb.max_depth; + return out; +} + +static GSDeviceOGL::PSConstantBuffer convertCB(const GSHWDrawConfig::PSConstantBuffer& cb, int atst) +{ + GSDeviceOGL::PSConstantBuffer out; + out.FogColor_AREF = GSVector4(GSVector4i::load(cb.fog_color_aref).u8to32()); + if (atst == 1 || atst == 2) // Greater / Less alpha + out.FogColor_AREF.w -= 0.1f; + out.WH = cb.texture_size; + out.TA_MaxDepth_Af = GSVector4(GSVector4i::load(cb.ta_af).u8to32()) / GSVector4(255.f, 255.f, 1.f, 128.f); + out.TA_MaxDepth_Af.z = cb.max_depth * ldexpf(1, -32); + out.MskFix = GSVector4i::loadl(&cb.uv_msk_fix).u16to32(); + out.FbMask = GSVector4i::load(cb.fbmask_int).u8to32(); + out.HalfTexel = cb.half_texel; + out.MinMax = cb.uv_min_max; + out.TC_OH = GSVector4::zero().upld(GSVector4(cb.tc_offset)); + + GSVector4i dither = GSVector4i::loadl(&cb.dither_matrix).u8to16(); + const GSVector4i ditherLow = dither.sll16(13).sra16(13); + const GSVector4i ditherHi = dither.sll16( 9).sra16( 5); + dither = ditherLow.blend8(ditherHi, GSVector4i(0xFF00FF00)); + + out.DitherMatrix[0] = GSVector4(dither.xxxx().i8to32()); + out.DitherMatrix[1] = GSVector4(dither.yyyy().i8to32()); + out.DitherMatrix[2] = GSVector4(dither.zzzz().i8to32()); + out.DitherMatrix[3] = GSVector4(dither.wwww().i8to32()); + + return out; +} + +static GSDeviceOGL::VSSelector convertSel(const GSHWDrawConfig::VSSelector sel) +{ + GSDeviceOGL::VSSelector out; + out.int_fst = !sel.fst; + return out; +} + +void GSDeviceOGL::RenderHW(GSHWDrawConfig& config) +{ + glScissor(config.scissor.x, config.scissor.y, 
config.scissor.width(), config.scissor.height()); + GLState::scissor = config.scissor; + + // Destination Alpha Setup + switch (config.destination_alpha) + { + case GSHWDrawConfig::DestinationAlphaMode::Off: + case GSHWDrawConfig::DestinationAlphaMode::Full: + break; // No setup + case GSHWDrawConfig::DestinationAlphaMode::PrimIDTracking: + InitPrimDateTexture(config.rt, config.scissor); + break; + case GSHWDrawConfig::DestinationAlphaMode::StencilOne: + ClearStencil(config.ds, 1); + break; + case GSHWDrawConfig::DestinationAlphaMode::Stencil: + { + const GSVector4 src = GSVector4(config.scissor) / GSVector4(config.ds->GetSize()).xyxy(); + const GSVector4 dst = src * 2.f - 1.f; + GSVertexPT1 vertices[] = + { + {GSVector4(dst.x, dst.y, 0.0f, 0.0f), GSVector2(src.x, src.y)}, + {GSVector4(dst.z, dst.y, 0.0f, 0.0f), GSVector2(src.z, src.y)}, + {GSVector4(dst.x, dst.w, 0.0f, 0.0f), GSVector2(src.x, src.w)}, + {GSVector4(dst.z, dst.w, 0.0f, 0.0f), GSVector2(src.z, src.w)}, + }; + SetupDATE(config.rt, config.ds, vertices, config.datm); + } + } + + GSTexture* hdr_rt = nullptr; + if (config.ps.hdr) + { + GSVector2i size = config.rt->GetSize(); + hdr_rt = CreateRenderTarget(size.x, size.y, GSTexture::Format::FloatColor); + hdr_rt->CommitRegion(GSVector2i(config.scissor.z, config.scissor.w)); + OMSetRenderTargets(hdr_rt, config.ds, &config.scissor); + + // save blend state, since BlitRect destroys it + const bool old_blend = GLState::blend; + BlitRect(config.rt, config.scissor, config.rt->GetSize(), false, false); + if (old_blend) + { + GLState::blend = old_blend; + glEnable(GL_BLEND); + } + } + + BeginScene(); + + IASetVertexBuffer(config.verts, config.nverts); + IASetIndexBuffer(config.indices, config.nindices); + GLenum topology = 0; + switch (config.topology) + { + case GSHWDrawConfig::Topology::Point: topology = GL_POINTS; break; + case GSHWDrawConfig::Topology::Line: topology = GL_LINES; break; + case GSHWDrawConfig::Topology::Triangle: topology = GL_TRIANGLES; break; + 
} + IASetPrimitiveTopology(topology); + + PSSetShaderResources(config.tex, config.pal); + PSSetShaderResource(4, config.raw_tex); + // Always bind the RT. This way special effect can use it. + PSSetShaderResource(3, config.rt); + + SetupSampler(PSSamplerSelector(config.sampler.key)); + OMSetBlendState(config.blend.index, config.blend.factor, config.blend.is_constant, config.blend.is_accumulation, config.blend.is_mixed_hw_sw); + OMSetColorMaskState(OMColorMaskSelector(config.colormask.key)); + SetupOM(OMDepthStencilSelector(config.depth.key)); + + VSConstantBuffer cb_vs = convertCB(config.cb_vs); + PSConstantBuffer cb_ps = convertCB(config.cb_ps, config.ps.atst); + SetupCB(&cb_vs, &cb_ps); + + if (config.cb_ps.channel_shuffle_int) + { + SetupCBMisc(GSVector4i::load(config.cb_ps.channel_shuffle_int).u8to32()); + } + + GSSelector gssel; + if (config.gs.expand) + { + switch (config.gs.topology) + { + case GSHWDrawConfig::GSTopology::Point: gssel.point = 1; break; + case GSHWDrawConfig::GSTopology::Line: gssel.line = 1; break; + case GSHWDrawConfig::GSTopology::Sprite: gssel.sprite = 1; break; + case GSHWDrawConfig::GSTopology::Triangle: ASSERT(0); break; + } + } + + PSSelector pssel; + pssel.key = config.ps.key; + const VSSelector vssel = convertSel(config.vs); + SetupPipeline(vssel, gssel, pssel); + + if (config.destination_alpha == GSHWDrawConfig::DestinationAlphaMode::PrimIDTracking) + { + GL_PUSH("Date GL42"); + // It could be good idea to use stencil in the same time. + // Early stencil test will reduce the number of atomic-load operation + + // Create an r32i image that will contain primitive ID + // Note: do it at the beginning because the clean will dirty the FBO state + //dev->InitPrimDateTexture(rtsize.x, rtsize.y); + + // I don't know how much is it legal to mount rt as Texture/RT. No write is done. + // In doubt let's detach RT. 
+ OMSetRenderTargets(NULL, config.ds, &config.scissor); + + // Don't write anything on the color buffer + // Neither in the depth buffer + glDepthMask(false); + // Compute primitiveID max that pass the date test (Draw without barrier) + DrawIndexedPrimitive(); + + // Ask PS to discard shader above the primitiveID max + glDepthMask(GLState::depth_mask); + + pssel.date = 3; + config.ps.date = 3; + config.alpha_second_pass.ps.date = 3; + SetupPipeline(vssel, gssel, pssel); + + // Be sure that first pass is finished ! + Barrier(GL_SHADER_IMAGE_ACCESS_BARRIER_BIT); + } + + OMSetRenderTargets(hdr_rt ? hdr_rt : config.rt, config.ds, &config.scissor); + + SendHWDraw(config); + + if (config.alpha_second_pass.enable) + { + if (0 != memcmp(&config.cb_ps, &config.alpha_second_pass.cb_ps, sizeof(config.cb_ps))) + { + cb_ps = convertCB(config.alpha_second_pass.cb_ps, config.alpha_second_pass.ps.atst); + SetupCB(&cb_vs, &cb_ps); + } + pssel.key = config.alpha_second_pass.ps.key; + SetupPipeline(vssel, gssel, pssel); + OMSetColorMaskState(OMColorMaskSelector(config.alpha_second_pass.colormask.key)); + SetupOM(OMDepthStencilSelector(config.alpha_second_pass.depth.key)); + + SendHWDraw(config); + } + + if (config.destination_alpha == GSHWDrawConfig::DestinationAlphaMode::PrimIDTracking) + RecycleDateTexture(); + + EndScene(); + + // Warning: EndScene must be called before StretchRect otherwise + // vertices will be overwritten. Trust me you don't want to do that. + if (hdr_rt) + { + GSVector2i size = config.rt->GetSize(); + GSVector4 dRect(config.scissor); + const GSVector4 sRect = dRect / GSVector4(size.x, size.y).xyxy(); + StretchRect(hdr_rt, sRect, config.rt, dRect, ShaderConvert::MOD_256, false); + + Recycle(hdr_rt); + } +} + +void GSDeviceOGL::SendHWDraw(const GSHWDrawConfig& config) +{ + if (config.drawlist) + { + GL_PUSH("Split the draw (SPRITE)"); +#if defined(_DEBUG) + // Check how draw call is split. 
+ std::map frequency; + for (const auto& it : *config.drawlist) + ++frequency[it]; + + std::string message; + for (const auto& it : frequency) + message += " " + std::to_string(it.first) + "(" + std::to_string(it.second) + ")"; + + GL_PERF("Split single draw (%d sprites) into %zu draws: consecutive draws(frequency):%s", + config.nindices / config.indices_per_prim, config.drawlist->size(), message.c_str()); +#endif + + for (size_t count = 0, p = 0, n = 0; n < config.drawlist->size(); p += count, ++n) + { + count = (*config.drawlist)[n] * config.indices_per_prim; + glTextureBarrier(); + DrawIndexedPrimitive(p, count); + } + } + else if (config.require_full_barrier) + { + GL_PUSH("Split the draw"); + + GL_PERF("Split single draw in %d draw", config.nindices / config.indices_per_prim); + + for (size_t p = 0; p < config.nindices; p += config.indices_per_prim) + { + glTextureBarrier(); + DrawIndexedPrimitive(p, config.indices_per_prim); + } + } + else if (config.require_one_barrier) + { + // One barrier needed + glTextureBarrier(); + DrawIndexedPrimitive(); + } + else + { + // No barriers needed + DrawIndexedPrimitive(); + } +} + // Note: used as a callback of DebugMessageCallback. 
Don't change the signature void GSDeviceOGL::DebugOutputToFile(GLenum gl_source, GLenum gl_type, GLuint id, GLenum gl_severity, GLsizei gl_length, const GLchar* gl_message, const void* userParam) { diff --git a/pcsx2/GS/Renderers/OpenGL/GSDeviceOGL.h b/pcsx2/GS/Renderers/OpenGL/GSDeviceOGL.h index 6ac410d059..983cea76ee 100644 --- a/pcsx2/GS/Renderers/OpenGL/GSDeviceOGL.h +++ b/pcsx2/GS/Renderers/OpenGL/GSDeviceOGL.h @@ -614,6 +614,9 @@ public: void StretchRect(GSTexture* sTex, const GSVector4& sRect, GSTexture* dTex, const GSVector4& dRect, bool red, bool green, bool blue, bool alpha) final; void StretchRect(GSTexture* sTex, const GSVector4& sRect, GSTexture* dTex, const GSVector4& dRect, GLuint ps, int bs, OMColorMaskSelector cms, bool linear = true); + void RenderHW(GSHWDrawConfig& config) final; + void SendHWDraw(const GSHWDrawConfig& config); + void SetupDATE(GSTexture* rt, GSTexture* ds, const GSVertexPT1* vertices, bool datm); void IASetPrimitiveTopology(GLenum topology); diff --git a/pcsx2/pcsx2.vcxproj b/pcsx2/pcsx2.vcxproj index 15ac94b0bf..b17996d370 100644 --- a/pcsx2/pcsx2.vcxproj +++ b/pcsx2/pcsx2.vcxproj @@ -488,6 +488,7 @@ + @@ -849,6 +850,7 @@ + diff --git a/pcsx2/pcsx2.vcxproj.filters b/pcsx2/pcsx2.vcxproj.filters index b243c92bce..b6b173592f 100644 --- a/pcsx2/pcsx2.vcxproj.filters +++ b/pcsx2/pcsx2.vcxproj.filters @@ -1553,6 +1553,9 @@ System\Ps2\GS\Renderers\Hardware + + System\Ps2\GS\Renderers\Hardware + System\Ps2\GS\Renderers\Hardware @@ -2654,6 +2657,9 @@ System\Ps2\GS\Renderers\Hardware + + System\Ps2\GS\Renderers\Hardware + System\Ps2\GS\Renderers\Hardware