// Patch summary (review notes only; every line below this header is unchanged).
//  - Build files: adds GS/Renderers/Common/GSVertexTraceFMM.cpp to pcsx2GSSources in
//    CMakeLists.txt and touches pcsx2.vcxproj, pcsx2.vcxproj.filters, pcsx2core.vcxproj and
//    pcsx2core.vcxproj.filters.
//  - GSVertexTrace.cpp: removes the s_minmax definition, the InitUpdate/InitUpdate2/InitUpdate3
//    macros from the constructor, and the GetFMM and FindMinMax member templates. The
//    constructor now calls MULTI_ISA_SELECT(GSVertexTracePopulateFunctions)(*this,
//    provoking_vertex_first), and Update() invokes the selected m_fmm entry as a plain
//    function pointer with *this as its first argument instead of through this->*.
//  - GSVertexTrace.h: includes GS/MultiISA.h, declares GSVertexTraceFMM and
//    GSVertexTracePopulateFunctions through MULTI_ISA_DEF, adds MULTI_ISA_FRIEND(GSVertexTraceFMM),
//    and changes FindMinMaxPtr from a pointer-to-member into a free-function pointer whose
//    first parameter is GSVertexTrace&. The s_minmax, FindMinMax and GetFMM declarations go away.
//  - GSVertexTraceFMM.cpp (new file): class CURRENT_ISA::GSVertexTraceFMM owns s_minmax and the
//    static FindMinMax, GetFMM and Populate functions. GSVertexTracePopulateFunctions forwards
//    to Populate, which fills vt.m_fmm for every COLOR/FST/TME/IIP combination of the four
//    primitive classes. FindMinMax appears to carry over the removed member body with member
//    accesses rewritten through the vt parameter (vt.m_state, vt.m_min, vt.m_max).
// NOTE(review): MULTI_ISA_SELECT, MULTI_ISA_DEF, MULTI_ISA_FRIEND, MULTI_ISA_UNSHARED_IMPL and
//    CURRENT_ISA are not defined in this patch (presumably they come from GS/MultiISA.h);
//    their expansion is not visible here, so confirm it before relying on these notes.
// NOTE(review): this copy of the patch looks lossy. Line breaks are collapsed and
//    angle-bracketed text seems to be missing: template is followed directly by a declaration,
//    #include has no header name, the vcxproj hunks carry no added entries, and the licence
//    header ends with a bare "see .". Presumably stripped during extraction - recover the
//    original patch before attempting to apply or compile it.
diff --git a/pcsx2/CMakeLists.txt b/pcsx2/CMakeLists.txt index c8490635e8..afa8a25a3d 100644 --- a/pcsx2/CMakeLists.txt +++ b/pcsx2/CMakeLists.txt @@ -681,6 +681,7 @@ set(pcsx2GSSources GS/Renderers/Common/GSRenderer.cpp GS/Renderers/Common/GSTexture.cpp GS/Renderers/Common/GSVertexTrace.cpp + GS/Renderers/Common/GSVertexTraceFMM.cpp GS/Renderers/Null/GSDeviceNull.cpp GS/Renderers/Null/GSRendererNull.cpp GS/Renderers/Null/GSTextureNull.cpp diff --git a/pcsx2/GS/Renderers/Common/GSVertexTrace.cpp b/pcsx2/GS/Renderers/Common/GSVertexTrace.cpp index 46cd1262a8..8062dd0a3a 100644 --- a/pcsx2/GS/Renderers/Common/GSVertexTrace.cpp +++ b/pcsx2/GS/Renderers/Common/GSVertexTrace.cpp @@ -17,34 +17,13 @@ #include "GSVertexTrace.h" #include "GS/GSUtil.h" #include "GS/GSState.h" -#include - -CONSTINIT const GSVector4 GSVertexTrace::s_minmax = GSVector4::cxpr(FLT_MAX, -FLT_MAX, 0.f, 0.f); GSVertexTrace::GSVertexTrace(const GSState* state, bool provoking_vertex_first) : m_accurate_stq(false), m_state(state), m_primclass(GS_INVALID_CLASS) { memset(&m_alpha, 0, sizeof(m_alpha)); - #define InitUpdate3(P, IIP, TME, FST, COLOR) \ - m_fmm[COLOR][FST][TME][IIP][P] = GetFMM(provoking_vertex_first); - - #define InitUpdate2(P, IIP, TME) \ - InitUpdate3(P, IIP, TME, 0, 0) \ - InitUpdate3(P, IIP, TME, 0, 1) \ - InitUpdate3(P, IIP, TME, 1, 0) \ - InitUpdate3(P, IIP, TME, 1, 1) \ - - #define InitUpdate(P) \ - InitUpdate2(P, 0, 0) \ - InitUpdate2(P, 0, 1) \ - InitUpdate2(P, 1, 0) \ - InitUpdate2(P, 1, 1) \ - - InitUpdate(GS_POINT_CLASS); - InitUpdate(GS_LINE_CLASS); - InitUpdate(GS_TRIANGLE_CLASS); - InitUpdate(GS_SPRITE_CLASS); + MULTI_ISA_SELECT(GSVertexTracePopulateFunctions)(*this, provoking_vertex_first); } void GSVertexTrace::Update(const void* vertex, const u32* index, int v_count, int i_count, GS_PRIM_CLASS primclass) @@ -59,7 +38,7 @@ void GSVertexTrace::Update(const void* vertex, const u32* index, int v_count, in u32 fst = m_state->PRIM->FST; u32 color = !(m_state->PRIM->TME && 
m_state->m_context->TEX0.TFX == TFX_DECAL && m_state->m_context->TEX0.TCC); - (this->*m_fmm[color][fst][tme][iip][primclass])(vertex, index, i_count); + m_fmm[color][fst][tme][iip][primclass](*this, vertex, index, i_count); // Potential float overflow detected. Better uses the slower division instead // Note: If Q is too big, 1/Q will end up as 0. 1e30 is a random number @@ -151,222 +130,6 @@ void GSVertexTrace::Update(const void* vertex, const u32* index, int v_count, in } } -template -GSVertexTrace::FindMinMaxPtr GSVertexTrace::GetFMM(bool provoking_vertex_first) -{ - constexpr bool real_iip = primclass == GS_SPRITE_CLASS ? false : iip; - constexpr bool real_fst = tme ? fst : false; - constexpr bool provoking_vertex_first_class = primclass == GS_LINE_CLASS || primclass == GS_TRIANGLE_CLASS; - const bool swap = provoking_vertex_first_class && !iip && provoking_vertex_first; - - if (swap) - return &GSVertexTrace::FindMinMax; - else - return &GSVertexTrace::FindMinMax; -} - -template -void GSVertexTrace::FindMinMax(const void* vertex, const u32* index, int count) -{ - const GSDrawingContext* context = m_state->m_context; - - int n = 1; - - switch (primclass) - { - case GS_POINT_CLASS: - n = 1; - break; - case GS_LINE_CLASS: - case GS_SPRITE_CLASS: - n = 2; - break; - case GS_TRIANGLE_CLASS: - n = 3; - break; - } - - GSVector4 tmin = s_minmax.xxxx(); - GSVector4 tmax = s_minmax.yyyy(); - GSVector4i cmin = GSVector4i::xffffffff(); - GSVector4i cmax = GSVector4i::zero(); - - GSVector4i pmin = GSVector4i::xffffffff(); - GSVector4i pmax = GSVector4i::zero(); - - const GSVertex* RESTRICT v = (GSVertex*)vertex; - - // Process 2 vertices at a time for increased efficiency - auto processVertices = [&](const GSVertex& v0, const GSVertex& v1, bool finalVertex) - { - if (color) - { - GSVector4i c0 = GSVector4i::load(v0.RGBAQ.U32[0]); - GSVector4i c1 = GSVector4i::load(v1.RGBAQ.U32[0]); - if (iip || finalVertex) - { - cmin = cmin.min_u8(c0.min_u8(c1)); - cmax = 
cmax.max_u8(c0.max_u8(c1)); - } - else if (n == 2) - { - // For even n, we process v1 and v2 of the same prim - // (For odd n, we process one vertex from each of two prims) - GSVector4i c = flat_swapped ? c0 : c1; - cmin = cmin.min_u8(c); - cmax = cmax.max_u8(c); - } - } - - if (tme) - { - if (!fst) - { - GSVector4 stq0 = GSVector4::cast(GSVector4i(v0.m[0])); - GSVector4 stq1 = GSVector4::cast(GSVector4i(v1.m[0])); - - GSVector4 q; - // Sprites always have indices == vertices, so we don't have to look at the index table here - if (primclass == GS_SPRITE_CLASS) - q = stq1.wwww(); - else - q = stq0.wwww(stq1); - - // Note: If in the future this is changed in a way that causes parts of calculations to go unused, - // make sure to remove the z (rgba) field as it's often denormal. - // Then, use GSVector4::noopt() to prevent clang from optimizing out your "useless" shuffle - // e.g. stq = (stq.xyww() / stq.wwww()).noopt().xyww(stq); - GSVector4 st = stq0.xyxy(stq1) / q; - - stq0 = st.xyww(primclass == GS_SPRITE_CLASS ? stq1 : stq0); - stq1 = st.zwww(stq1); - - tmin = tmin.min(stq0.min(stq1)); - tmax = tmax.max(stq0.max(stq1)); - } - else - { - GSVector4i uv0(v0.m[1]); - GSVector4i uv1(v1.m[1]); - - GSVector4 st0 = GSVector4(uv0.uph16()).xyxy(); - GSVector4 st1 = GSVector4(uv1.uph16()).xyxy(); - - tmin = tmin.min(st0.min(st1)); - tmax = tmax.max(st0.max(st1)); - } - } - - GSVector4i xyzf0(v0.m[1]); - GSVector4i xyzf1(v1.m[1]); - - GSVector4i xy0 = xyzf0.upl16(); - GSVector4i zf0 = xyzf0.ywyw(); - GSVector4i xy1 = xyzf1.upl16(); - GSVector4i zf1 = xyzf1.ywyw(); - - GSVector4i p0 = xy0.blend32<0xc>(primclass == GS_SPRITE_CLASS ? 
zf1 : zf0); - GSVector4i p1 = xy1.blend32<0xc>(zf1); - - pmin = pmin.min_u32(p0.min_u32(p1)); - pmax = pmax.max_u32(p0.max_u32(p1)); - }; - - if (n == 2) - { - for (int i = 0; i < count; i += 2) - { - processVertices(v[index[i + 0]], v[index[i + 1]], false); - } - } - else if (iip || n == 1) // iip means final and non-final vertexes are treated the same - { - int i = 0; - for (; i < (count - 1); i += 2) // 2x loop unroll - { - processVertices(v[index[i + 0]], v[index[i + 1]], true); - } - if (count & 1) - { - // Compiler optimizations go! - // (And if they don't, it's only one vertex out of many) - processVertices(v[index[i]], v[index[i]], true); - } - } - else if (n == 3) - { - int i = 0; - for (; i < (count - 3); i += 6) - { - processVertices(v[index[i + 0]], v[index[i + 3]], flat_swapped); - processVertices(v[index[i + 1]], v[index[i + 4]], false); - processVertices(v[index[i + 2]], v[index[i + 5]], !flat_swapped); - } - if (count & 1) - { - if (flat_swapped) - { - processVertices(v[index[i + 1]], v[index[i + 2]], false); - // Compiler optimizations go! - // (And if they don't, it's only one vertex out of many) - processVertices(v[index[i + 0]], v[index[i + 0]], true); - } - else - { - processVertices(v[index[i + 0]], v[index[i + 1]], false); - // Compiler optimizations go! 
- // (And if they don't, it's only one vertex out of many) - processVertices(v[index[i + 2]], v[index[i + 2]], true); - } - } - } - else - { - pxAssertRel(0, "Bad n value"); - } - - GSVector4 o(context->XYOFFSET); - GSVector4 s(1.0f / 16, 1.0f / 16, 2.0f, 1.0f); - - m_min.p = (GSVector4(pmin) - o) * s; - m_max.p = (GSVector4(pmax) - o) * s; - - // Fix signed int conversion - m_min.p = m_min.p.insert32<0, 2>(GSVector4::load((float)(u32)pmin.extract32<2>())); - m_max.p = m_max.p.insert32<0, 2>(GSVector4::load((float)(u32)pmax.extract32<2>())); - - if (tme) - { - if (fst) - { - s = GSVector4(1.0f / 16, 1.0f).xxyy(); - } - else - { - s = GSVector4(1 << context->TEX0.TW, 1 << context->TEX0.TH, 1, 1); - } - - m_min.t = tmin * s; - m_max.t = tmax * s; - } - else - { - m_min.t = GSVector4::zero(); - m_max.t = GSVector4::zero(); - } - - if (color) - { - m_min.c = cmin.u8to32(); - m_max.c = cmax.u8to32(); - } - else - { - m_min.c = GSVector4i::zero(); - m_max.c = GSVector4i::zero(); - } -} - void GSVertexTrace::CorrectDepthTrace(const void* vertex, int count) { if (m_eq.z == 0) diff --git a/pcsx2/GS/Renderers/Common/GSVertexTrace.h b/pcsx2/GS/Renderers/Common/GSVertexTrace.h index 07c0aa1314..a9c62c773c 100644 --- a/pcsx2/GS/Renderers/Common/GSVertexTrace.h +++ b/pcsx2/GS/Renderers/Common/GSVertexTrace.h @@ -17,15 +17,22 @@ #include "GS/GS.h" #include "GS/GSDrawingContext.h" +#include "GS/MultiISA.h" #include "GSVertex.h" #include "GS/Renderers/SW/GSVertexSW.h" #include "GS/Renderers/HW/GSVertexHW.h" #include "GSFunctionMap.h" class GSState; +class GSVertexTrace; + +MULTI_ISA_DEF(class GSVertexTraceFMM;) +MULTI_ISA_DEF(void GSVertexTracePopulateFunctions(GSVertexTrace& vt, bool provoking_vertex_first);) class alignas(32) GSVertexTrace : public GSAlignedClass<32> { + MULTI_ISA_FRIEND(GSVertexTraceFMM) + public: struct Vertex { @@ -42,18 +49,10 @@ public: protected: const GSState* m_state; - static const GSVector4 s_minmax; - - typedef void 
(GSVertexTrace::*FindMinMaxPtr)(const void* vertex, const u32* index, int count); + typedef void (*FindMinMaxPtr)(GSVertexTrace& vt, const void* vertex, const u32* index, int count); FindMinMaxPtr m_fmm[2][2][2][2][4]; - template - void FindMinMax(const void* vertex, const u32* index, int count); - - template - FindMinMaxPtr GetFMM(bool provoking_vertex_first); - public: GS_PRIM_CLASS m_primclass; diff --git a/pcsx2/GS/Renderers/Common/GSVertexTraceFMM.cpp b/pcsx2/GS/Renderers/Common/GSVertexTraceFMM.cpp new file mode 100644 index 0000000000..a719e52b0c --- /dev/null +++ b/pcsx2/GS/Renderers/Common/GSVertexTraceFMM.cpp @@ -0,0 +1,278 @@ +/* PCSX2 - PS2 Emulator for PCs + * Copyright (C) 2002-2021 PCSX2 Dev Team + * + * PCSX2 is free software: you can redistribute it and/or modify it under the terms + * of the GNU Lesser General Public License as published by the Free Software Found- + * ation, either version 3 of the License, or (at your option) any later version. + * + * PCSX2 is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; + * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR + * PURPOSE. See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along with PCSX2. + * If not, see . 
+ */ + +#include "GSVertexTrace.h" +#include "GS/GSState.h" +#include + +class CURRENT_ISA::GSVertexTraceFMM +{ + static constexpr GSVector4 s_minmax = GSVector4::cxpr(FLT_MAX, -FLT_MAX, 0.f, 0.f); + + template + static void FindMinMax(GSVertexTrace& vt, const void* vertex, const u32* index, int count); + + template + static constexpr GSVertexTrace::FindMinMaxPtr GetFMM(bool provoking_vertex_first); + +public: + static void Populate(GSVertexTrace& vt, bool provoking_vertex_first); +}; + +MULTI_ISA_UNSHARED_IMPL; + +void CURRENT_ISA::GSVertexTracePopulateFunctions(GSVertexTrace& vt, bool provoking_vertex_first) +{ + GSVertexTraceFMM::Populate(vt, provoking_vertex_first); +} + +template +constexpr GSVertexTrace::FindMinMaxPtr GSVertexTraceFMM::GetFMM(bool provoking_vertex_first) +{ + constexpr bool real_iip = primclass == GS_SPRITE_CLASS ? false : iip; + constexpr bool real_fst = tme ? fst : false; + constexpr bool provoking_vertex_first_class = primclass == GS_LINE_CLASS || primclass == GS_TRIANGLE_CLASS; + const bool swap = provoking_vertex_first_class && !iip && provoking_vertex_first; + + if (swap) + return FindMinMax; + else + return FindMinMax; +} + +void GSVertexTraceFMM::Populate(GSVertexTrace& vt, bool provoking_vertex_first) +{ + #define InitUpdate3(P, IIP, TME, FST, COLOR) \ + vt.m_fmm[COLOR][FST][TME][IIP][P] = GetFMM(provoking_vertex_first); + + #define InitUpdate2(P, IIP, TME) \ + InitUpdate3(P, IIP, TME, 0, 0) \ + InitUpdate3(P, IIP, TME, 0, 1) \ + InitUpdate3(P, IIP, TME, 1, 0) \ + InitUpdate3(P, IIP, TME, 1, 1) \ + + #define InitUpdate(P) \ + InitUpdate2(P, 0, 0) \ + InitUpdate2(P, 0, 1) \ + InitUpdate2(P, 1, 0) \ + InitUpdate2(P, 1, 1) \ + + InitUpdate(GS_POINT_CLASS); + InitUpdate(GS_LINE_CLASS); + InitUpdate(GS_TRIANGLE_CLASS); + InitUpdate(GS_SPRITE_CLASS); +} + +template +void GSVertexTraceFMM::FindMinMax(GSVertexTrace& vt, const void* vertex, const u32* index, int count) +{ + const GSDrawingContext* context = vt.m_state->m_context; + + int n = 
1; + + switch (primclass) + { + case GS_POINT_CLASS: + n = 1; + break; + case GS_LINE_CLASS: + case GS_SPRITE_CLASS: + n = 2; + break; + case GS_TRIANGLE_CLASS: + n = 3; + break; + } + + GSVector4 tmin = s_minmax.xxxx(); + GSVector4 tmax = s_minmax.yyyy(); + GSVector4i cmin = GSVector4i::xffffffff(); + GSVector4i cmax = GSVector4i::zero(); + + GSVector4i pmin = GSVector4i::xffffffff(); + GSVector4i pmax = GSVector4i::zero(); + + const GSVertex* RESTRICT v = (GSVertex*)vertex; + + // Process 2 vertices at a time for increased efficiency + auto processVertices = [&](const GSVertex& v0, const GSVertex& v1, bool finalVertex) + { + if (color) + { + GSVector4i c0 = GSVector4i::load(v0.RGBAQ.U32[0]); + GSVector4i c1 = GSVector4i::load(v1.RGBAQ.U32[0]); + if (iip || finalVertex) + { + cmin = cmin.min_u8(c0.min_u8(c1)); + cmax = cmax.max_u8(c0.max_u8(c1)); + } + else if (n == 2) + { + // For even n, we process v1 and v2 of the same prim + // (For odd n, we process one vertex from each of two prims) + GSVector4i c = flat_swapped ? c0 : c1; + cmin = cmin.min_u8(c); + cmax = cmax.max_u8(c); + } + } + + if (tme) + { + if (!fst) + { + GSVector4 stq0 = GSVector4::cast(GSVector4i(v0.m[0])); + GSVector4 stq1 = GSVector4::cast(GSVector4i(v1.m[0])); + + GSVector4 q; + // Sprites always have indices == vertices, so we don't have to look at the index table here + if (primclass == GS_SPRITE_CLASS) + q = stq1.wwww(); + else + q = stq0.wwww(stq1); + + // Note: If in the future this is changed in a way that causes parts of calculations to go unused, + // make sure to remove the z (rgba) field as it's often denormal. + // Then, use GSVector4::noopt() to prevent clang from optimizing out your "useless" shuffle + // e.g. stq = (stq.xyww() / stq.wwww()).noopt().xyww(stq); + GSVector4 st = stq0.xyxy(stq1) / q; + + stq0 = st.xyww(primclass == GS_SPRITE_CLASS ? 
stq1 : stq0); + stq1 = st.zwww(stq1); + + tmin = tmin.min(stq0.min(stq1)); + tmax = tmax.max(stq0.max(stq1)); + } + else + { + GSVector4i uv0(v0.m[1]); + GSVector4i uv1(v1.m[1]); + + GSVector4 st0 = GSVector4(uv0.uph16()).xyxy(); + GSVector4 st1 = GSVector4(uv1.uph16()).xyxy(); + + tmin = tmin.min(st0.min(st1)); + tmax = tmax.max(st0.max(st1)); + } + } + + GSVector4i xyzf0(v0.m[1]); + GSVector4i xyzf1(v1.m[1]); + + GSVector4i xy0 = xyzf0.upl16(); + GSVector4i zf0 = xyzf0.ywyw(); + GSVector4i xy1 = xyzf1.upl16(); + GSVector4i zf1 = xyzf1.ywyw(); + + GSVector4i p0 = xy0.blend32<0xc>(primclass == GS_SPRITE_CLASS ? zf1 : zf0); + GSVector4i p1 = xy1.blend32<0xc>(zf1); + + pmin = pmin.min_u32(p0.min_u32(p1)); + pmax = pmax.max_u32(p0.max_u32(p1)); + }; + + if (n == 2) + { + for (int i = 0; i < count; i += 2) + { + processVertices(v[index[i + 0]], v[index[i + 1]], false); + } + } + else if (iip || n == 1) // iip means final and non-final vertexes are treated the same + { + int i = 0; + for (; i < (count - 1); i += 2) // 2x loop unroll + { + processVertices(v[index[i + 0]], v[index[i + 1]], true); + } + if (count & 1) + { + // Compiler optimizations go! + // (And if they don't, it's only one vertex out of many) + processVertices(v[index[i]], v[index[i]], true); + } + } + else if (n == 3) + { + int i = 0; + for (; i < (count - 3); i += 6) + { + processVertices(v[index[i + 0]], v[index[i + 3]], flat_swapped); + processVertices(v[index[i + 1]], v[index[i + 4]], false); + processVertices(v[index[i + 2]], v[index[i + 5]], !flat_swapped); + } + if (count & 1) + { + if (flat_swapped) + { + processVertices(v[index[i + 1]], v[index[i + 2]], false); + // Compiler optimizations go! + // (And if they don't, it's only one vertex out of many) + processVertices(v[index[i + 0]], v[index[i + 0]], true); + } + else + { + processVertices(v[index[i + 0]], v[index[i + 1]], false); + // Compiler optimizations go! 
+ // (And if they don't, it's only one vertex out of many) + processVertices(v[index[i + 2]], v[index[i + 2]], true); + } + } + } + else + { + pxAssertRel(0, "Bad n value"); + } + + GSVector4 o(context->XYOFFSET); + GSVector4 s(1.0f / 16, 1.0f / 16, 2.0f, 1.0f); + + vt.m_min.p = (GSVector4(pmin) - o) * s; + vt.m_max.p = (GSVector4(pmax) - o) * s; + + // Fix signed int conversion + vt.m_min.p = vt.m_min.p.insert32<0, 2>(GSVector4::load((float)(u32)pmin.extract32<2>())); + vt.m_max.p = vt.m_max.p.insert32<0, 2>(GSVector4::load((float)(u32)pmax.extract32<2>())); + + if (tme) + { + if (fst) + { + s = GSVector4(1.0f / 16, 1.0f).xxyy(); + } + else + { + s = GSVector4(1 << context->TEX0.TW, 1 << context->TEX0.TH, 1, 1); + } + + vt.m_min.t = tmin * s; + vt.m_max.t = tmax * s; + } + else + { + vt.m_min.t = GSVector4::zero(); + vt.m_max.t = GSVector4::zero(); + } + + if (color) + { + vt.m_min.c = cmin.u8to32(); + vt.m_max.c = cmax.u8to32(); + } + else + { + vt.m_min.c = GSVector4i::zero(); + vt.m_max.c = GSVector4i::zero(); + } +} diff --git a/pcsx2/pcsx2.vcxproj b/pcsx2/pcsx2.vcxproj index 559d53437d..66e8ff908c 100644 --- a/pcsx2/pcsx2.vcxproj +++ b/pcsx2/pcsx2.vcxproj @@ -476,6 +476,7 @@ + diff --git a/pcsx2/pcsx2.vcxproj.filters b/pcsx2/pcsx2.vcxproj.filters index f14430c2fd..3c36422c72 100644 --- a/pcsx2/pcsx2.vcxproj.filters +++ b/pcsx2/pcsx2.vcxproj.filters @@ -1628,6 +1628,9 @@ System\Ps2\GS\Renderers\Common + + System\Ps2\GS\Renderers\Common + System\Ps2\GS\Renderers\Common diff --git a/pcsx2/pcsx2core.vcxproj b/pcsx2/pcsx2core.vcxproj index c8388da711..aae45c7b42 100644 --- a/pcsx2/pcsx2core.vcxproj +++ b/pcsx2/pcsx2core.vcxproj @@ -329,6 +329,7 @@ + diff --git a/pcsx2/pcsx2core.vcxproj.filters b/pcsx2/pcsx2core.vcxproj.filters index c8547c58f5..dbfb50d0da 100644 --- a/pcsx2/pcsx2core.vcxproj.filters +++ b/pcsx2/pcsx2core.vcxproj.filters @@ -1121,6 +1121,9 @@ System\Ps2\GS\Renderers\Common + + System\Ps2\GS\Renderers\Common + System\Ps2\GS\Renderers\Common