From eb0b341e61cd3cd06db74e9ebdf0f32346d4c197 Mon Sep 17 00:00:00 2001 From: TellowKrinkle Date: Mon, 7 Oct 2024 00:18:33 -0500 Subject: [PATCH] GS:SW: Use unaligned loads to reduce constant size on AVX2 Allows more instructions to use 1-byte offsets --- pcsx2/GS/Renderers/SW/GSDrawScanline.cpp | 30 +++---- .../SW/GSDrawScanlineCodeGenerator.all.cpp | 80 +++++++++++++------ pcsx2/GS/Renderers/SW/GSNewCodeGenerator.h | 4 + pcsx2/GS/Renderers/SW/GSScanlineEnvironment.h | 43 +++------- .../SW/GSSetupPrimCodeGenerator.all.cpp | 21 +++-- 5 files changed, 100 insertions(+), 78 deletions(-) diff --git a/pcsx2/GS/Renderers/SW/GSDrawScanline.cpp b/pcsx2/GS/Renderers/SW/GSDrawScanline.cpp index 57f2fc8574..7c6d716d8c 100644 --- a/pcsx2/GS/Renderers/SW/GSDrawScanline.cpp +++ b/pcsx2/GS/Renderers/SW/GSDrawScanline.cpp @@ -207,10 +207,11 @@ void GSDrawScanline::CSetupPrim(const GSVertexSW* vertex, const u16* index, cons constexpr int vlen = sizeof(VectorF) / sizeof(float); #if _M_SSE >= 0x501 - const GSVector8* shift = (GSVector8*)g_const_256b.m_shift; - const GSVector4 step_shift = GSVector4::broadcast32(&shift[0]); + auto load_shift = [](int i) { return GSVector8::load(&g_const_256b.m_shift[8 - i]); }; + const GSVector4 step_shift = GSVector4::broadcast32(&g_const_256b.m_shift[0]); #else - const GSVector4* shift = (GSVector4*)g_const_128b.m_shift; + static const GSVector4* shift = reinterpret_cast(g_const_128b.m_shift); + auto load_shift = [](int i) { return shift[1 + i]; }; const GSVector4 step_shift = shift[0]; #endif @@ -234,22 +235,23 @@ void GSDrawScanline::CSetupPrim(const GSVertexSW* vertex, const u16* index, cons for (int i = 0; i < vlen; i++) { - local.d[i].f = VectorI(df * shift[1 + i]).xxzzlh(); + local.d[i].f = VectorI(df * load_shift(i)).xxzzlh(); } } if (has_z && !sel.zequal) { - const GSVector4 dz = GSVector4::broadcast64(&dscan.p.z); const VectorF dzf(static_cast(dscan.p.F64[1])); #if _M_SSE >= 0x501 - GSVector4::storel(&local.d8.p.z, dz.mul64(GSVector4::f32to64(shift))); + double dz = dscan.p.F64[1] * g_const_256b.m_shift[0]; + memcpy(&local.d8.p.z, &dz, sizeof(dz)); #else + const GSVector4 dz = GSVector4::broadcast64(&dscan.p.z); local.d4.z = dz.mul64(GSVector4::f32to64(shift)); #endif for (int i = 0; i < vlen; i++) { - local.d[i].z = dzf * shift[i + 1]; + local.d[i].z = dzf * load_shift(i); } } } @@ -297,7 +299,7 @@ void GSDrawScanline::CSetupPrim(const GSVertexSW* vertex, const u16* index, cons for (int i = 0; i < vlen; i++) { - VectorF v = dstq * shift[1 + i]; + VectorF v = dstq * load_shift(i); if (sel.fst) { @@ -336,8 +338,8 @@ void GSDrawScanline::CSetupPrim(const GSVertexSW* vertex, const u16* index, cons for (int i = 0; i < vlen; i++) { - VectorI r = VectorI(dr * shift[1 + i]).ps32(); - VectorI b = VectorI(db * shift[1 + i]).ps32(); + VectorI r = VectorI(dr * load_shift(i)).ps32(); + VectorI b = VectorI(db * load_shift(i)).ps32(); local.d[i].rb = r.upl16(b); } @@ -347,8 +349,8 @@ void GSDrawScanline::CSetupPrim(const GSVertexSW* vertex, const u16* index, cons for (int i = 0; i < vlen; i++) { - VectorI g = VectorI(dg * shift[1 + i]).ps32(); - VectorI a = VectorI(da * shift[1 + i]).ps32(); + VectorI g = VectorI(dg * load_shift(i)).ps32(); + VectorI a = VectorI(da * load_shift(i)).ps32(); local.d[i].ga = g.upl16(a); } @@ -515,7 +517,7 @@ __ri void GSDrawScanline::CDrawScanline(int pixels, int left, int top, const GSV steps = pixels + skip - vlen; left -= skip; #if _M_SSE >= 0x501 - test = GSVector8i::i8to32(g_const_256b.m_test[skip]) | GSVector8i::i8to32(g_const_256b.m_test[15 + (steps & (steps >> 31))]); + test = GSVector8i::i8to32(&g_const_256b.m_test[16 - skip]) | GSVector8i::i8to32(&g_const_256b.m_test[0 - (steps & (steps >> 31))]); #else test = const_test[skip] | const_test[7 + (steps & (steps >> 31))]; #endif @@ -1756,7 +1758,7 @@ __ri void GSDrawScanline::CDrawScanline(int pixels, int left, int top, const GSV if (!sel.notest) { #if _M_SSE >= 0x501 - test = GSVector8i::i8to32(g_const_256b.m_test[15 + (steps & (steps >> 31))]); + test = GSVector8i::i8to32(&g_const_256b.m_test[0 - (steps & (steps >> 31))]); #else test = const_test[7 + (steps & (steps >> 31))]; #endif diff --git a/pcsx2/GS/Renderers/SW/GSDrawScanlineCodeGenerator.all.cpp b/pcsx2/GS/Renderers/SW/GSDrawScanlineCodeGenerator.all.cpp index b299558639..dc26ac61bc 100644 --- a/pcsx2/GS/Renderers/SW/GSDrawScanlineCodeGenerator.all.cpp +++ b/pcsx2/GS/Renderers/SW/GSDrawScanlineCodeGenerator.all.cpp @@ -661,25 +661,29 @@ void GSDrawScanlineCodeGenerator::Init() lea(a0.cvt32(), ptr[a0 + a1 - vecints]); - // GSVector4i test = m_test[skip] | m_test[7 + (steps & (steps >> 31))]; - - mov(eax, a0.cvt32()); - sar(eax, 31); // GH: 31 to extract the sign of the register - and_(eax, a0.cvt32()); - if (isXmm) - shl(eax, 4); // * sizeof(m_test[0]) - cdqe(); - if (isXmm) { + // GSVector4i test = m_test[skip] | m_test[7 + (steps & (steps >> 31))]; + mov(eax, a0.cvt32()); + sar(eax, 31); // GH: 31 to extract the sign of the register + and_(eax, a0.cvt32()); + shl(eax, 4); // * sizeof(m_test[0]) + cdqe(); shl(a1.cvt32(), 4); // * sizeof(m_test[0]) movdqa(_test, ptr[a1 + _m_const + offsetof(GSScanlineConstantData128B, m_test[0])]); por(_test, ptr[rax + _m_const + offsetof(GSScanlineConstantData128B, m_test[7])]); } else { - pmovsxbd(_test, ptr[a1 * 8 + _m_const + offsetof(GSScanlineConstantData256B, m_test[0])]); - pmovsxbd(xym0, ptr[rax * 8 + _m_const + offsetof(GSScanlineConstantData256B, m_test[15])]); + // GSVector8i test = loadu(&m_test[16 - skip]) | loadu(&m_test[steps >= 0 ? 0 : -steps]); + mov(eax, a1.cvt32()); + neg(rax); // rax = -skip + pmovsxbd(_test, ptr[rax + _m_const + offsetof(GSScanlineConstantData256B, m_test[16])]); + xor_(t0.cvt32(), t0.cvt32()); + mov(eax, a0.cvt32()); + neg(eax); // eax = -steps + cmovs(eax, t0.cvt32()); // if (eax < 0) eax = 0 + pmovsxbd(xym0, ptr[rax + _m_const + offsetof(GSScanlineConstantData256B, m_test[0])]); por(_test, xym0); shl(a1.cvt32(), 5); // * sizeof(m_test[0]) } @@ -922,7 +926,7 @@ void GSDrawScanlineCodeGenerator::Init() /// Inputs: a0=steps, t0=fza_offset /// Outputs[x86]: xym0=z xym2=s, xym3=t, xym4=q, xym5=rb, xym6=ga, xym7=test /// Destroys[x86]: all -/// Destroys[x64]: xym0, xym1, xym2, xym3 +/// Destroys[x64]: xym0, xym1, xym2, xym3, t2 void GSDrawScanlineCodeGenerator::Step() { // steps -= 4; @@ -1048,19 +1052,22 @@ void GSDrawScanlineCodeGenerator::Step() if (!m_sel.notest) { +#if USING_XMM // test = m_test[7 + (steps & (steps >> 31))]; mov(eax, a0.cvt32()); sar(eax, 31); // GH: 31 to extract the sign of the register and_(eax, a0.cvt32()); - if (isXmm) - shl(eax, 4); + shl(eax, 4); cdqe(); - -#if USING_XMM movdqa(_test, ptr[rax + _m_const + offsetof(GSScanlineConstantData128B, m_test[7])]); #else - pmovsxbd(_test, ptr[rax * 8 + _m_const + offsetof(GSScanlineConstantData256B, m_test[15])]); + // test = loadu(&m_test[steps >= 0 ? 0 : -steps]); + xor_(t2.cvt32(), t2.cvt32()); + mov(eax, a0.cvt32()); + neg(eax); // eax = -steps + cmovs(eax, t2.cvt32()); // if (eax < 0) eax = 0; + pmovsxbd(_test, ptr[rax + _m_const + offsetof(GSScanlineConstantData256B, m_test[0])]); #endif } } @@ -1655,29 +1662,54 @@ void GSDrawScanlineCodeGenerator::SampleTextureLOD() pslld(xym4, 9); psrld(xym4, 9); - auto log2_coeff = [this](int i) -> Address +#if USING_YMM + auto load_log2_coeff = [this](const XYm& reg, int i) { - ptr[_m_const + log2_coeff_offset(i)]; + vbroadcastss(reg, ptr[_m_const + log2_coeff_offset(i)]); }; + auto log2_coeff = [this, &load_log2_coeff](int i) + { + load_log2_coeff(xym6, i); + return xym6; + }; +#else + auto log2_coeff = [this](int i) -> Operand + { + return ptr[_m_const + log2_coeff_offset(i)]; + }; + auto load_log2_coeff = [this, &log2_coeff](const XYm& reg, int i) + { + movaps(reg, log2_coeff(i)); + }; +#endif - orps(xym4, log2_coeff(3)); + load_log2_coeff(xym1, 3); + orps(xym4, xym1); // xym4 = mant(q) | 1.0f if (hasFMA) { - movaps(xym5, log2_coeff(0)); // c0 + load_log2_coeff(xym5, 0); // c0 vfmadd213ps(xym5, xym4, log2_coeff(1)); // c0 * xym4 + c1 vfmadd213ps(xym5, xym4, log2_coeff(2)); // (c0 * xym4 + c1) * xym4 + c2 - subps(xym4, log2_coeff(3)); // xym4 - 1.0f + subps(xym4, xym1); // xym4 - 1.0f vfmadd213ps(xym4, xym5, xym0); // ((c0 * xym4 + c1) * xym4 + c2) * (xym4 - 1.0f) + xym0 } else { - THREEARG(mulps, xym5, xym4, log2_coeff(0)); + if (hasAVX) + { + vmulps(xym5, xym4, log2_coeff(0)); + } + else + { + load_log2_coeff(xym5, 0); + mulps(xym5, xym4); + } addps(xym5, log2_coeff(1)); mulps(xym5, xym4); - subps(xym4, log2_coeff(3)); + subps(xym4, xym1); addps(xym5, log2_coeff(2)); mulps(xym4, xym5); addps(xym4, xym0); diff --git a/pcsx2/GS/Renderers/SW/GSNewCodeGenerator.h b/pcsx2/GS/Renderers/SW/GSNewCodeGenerator.h index d23c713a28..953e84cc67 100644 --- a/pcsx2/GS/Renderers/SW/GSNewCodeGenerator.h +++ b/pcsx2/GS/Renderers/SW/GSNewCodeGenerator.h @@ -204,10 +204,12 @@ public: FORWARD_OO_OI(or_) FORWARD_OO_OI(sub) FORWARD_OO_OI(xor_) + FORWARD(2, BASE, cmovs, const Reg&, const Operand&) FORWARD(2, BASE, lea, const Reg&, const Address&) FORWARD(2, BASE, mov, const Operand&, size_t) FORWARD(2, BASE, mov, ARGS_OO) FORWARD(2, BASE, movzx, const Reg&, const Operand&) + FORWARD(1, BASE, neg, const Operand&) FORWARD(1, BASE, not_, const Operand&) FORWARD(1, BASE, pop, const Operand&) FORWARD(1, BASE, push, const Operand&) @@ -243,6 +245,8 @@ public: AFORWARD(2, minps, ARGS_XO) SFORWARD(2, movaps, ARGS_XO) SFORWARD(2, movaps, const Address&, const Xmm&) + SFORWARD(2, movups, ARGS_XO) + SFORWARD(2, movups, const Address&, const Xmm&) SFORWARD(2, movd, const Address&, const Xmm&) SFORWARD(2, movd, const Reg32&, const Xmm&) SFORWARD(2, movd, const Xmm&, const Address&) diff --git a/pcsx2/GS/Renderers/SW/GSScanlineEnvironment.h b/pcsx2/GS/Renderers/SW/GSScanlineEnvironment.h index 49e638faec..dbeef06a04 100644 --- a/pcsx2/GS/Renderers/SW/GSScanlineEnvironment.h +++ b/pcsx2/GS/Renderers/SW/GSScanlineEnvironment.h @@ -256,46 +256,25 @@ namespace GSScanlineConstantData // Constant shared by all threads (to reduce cache miss) struct alignas(64) GSScanlineConstantData256B { - alignas(32) u8 m_test[16][8] = { - {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}, - {0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}, - {0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}, - {0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00}, - {0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00}, - {0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00}, - {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00}, - {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00}, - {0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}, - {0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}, - {0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff}, - {0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff}, - {0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff}, - {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff}, - {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff}, - {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}, + // All AVX processors support unaligned access with little to no penalty as long as you don't cross a cache line. + // Take advantage of that to store single vectors that we index with single-element alignment + alignas(32) u8 m_test[24] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, }; - alignas(32) float m_shift[9][8] = { - { 8.0f , 8.0f , 8.0f , 8.0f , 8.0f , 8.0f , 8.0f , 8.0f}, - { 0.0f , 1.0f , 2.0f , 3.0f , 4.0f , 5.0f , 6.0f , 7.0f}, - { -1.0f , 0.0f , 1.0f , 2.0f , 3.0f , 4.0f , 5.0f , 6.0f}, - { -2.0f , -1.0f , 0.0f , 1.0f , 2.0f , 3.0f , 4.0f , 5.0f}, - { -3.0f , -2.0f , -1.0f , 0.0f , 1.0f , 2.0f , 3.0f , 4.0f}, - { -4.0f , -3.0f , -2.0f , -1.0f , 0.0f , 1.0f , 2.0f , 3.0f}, - { -5.0f , -4.0f , -3.0f , -2.0f , -1.0f , 0.0f , 1.0f , 2.0f}, - { -6.0f , -5.0f , -4.0f , -3.0f , -2.0f , -1.0f , 0.0f , 1.0f}, - { -7.0f , -6.0f , -5.0f , -4.0f , -3.0f , -2.0f , -1.0f , 0.0f}, + float m_log2_coef[4] = {}; + alignas(64) float m_shift[16] = { + 8.0f, -7.0f, -6.0f, -5.0f, -4.0f, -3.0f, -2.0f, -1.0f, + 0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, }; - alignas(32) float m_log2_coef[4][8] = {}; constexpr GSScanlineConstantData256B() { using namespace GSScanlineConstantData; for (size_t n = 0; n < std::size(log2_coef); ++n) { - for (size_t i = 0; i < 8; ++i) - { - m_log2_coef[n][i] = log2_coef[n]; - } + m_log2_coef[n] = log2_coef[n]; } } }; diff --git a/pcsx2/GS/Renderers/SW/GSSetupPrimCodeGenerator.all.cpp b/pcsx2/GS/Renderers/SW/GSSetupPrimCodeGenerator.all.cpp index af62bed11c..e9ae51b3bf 100644 --- a/pcsx2/GS/Renderers/SW/GSSetupPrimCodeGenerator.all.cpp +++ b/pcsx2/GS/Renderers/SW/GSSetupPrimCodeGenerator.all.cpp @@ -110,7 +110,12 @@ void GSSetupPrimCodeGenerator::Generate() for (int i = 0; i < (m_sel.notest ? 2 : many_regs ? 9 : 5); i++) { - movaps(XYm(3 + i), ptr[rax + i * vecsize]); + if (isXmm) + movaps(XYm(3 + i), ptr[rax + i * vecsize]); + else if (i == 0) + vbroadcastss(xym3, ptr[rax]); + else + movups(XYm(3 + i), ptr[rax + (9 - i) * sizeof(float)]); } } @@ -253,7 +258,7 @@ void GSSetupPrimCodeGenerator::Depth_YMM() if (i < 4 || many_regs) vmulps(ymm0, Ymm(4 + i), ymm1); else - vmulps(ymm0, ymm1, ptr[g_const_256b.m_shift[i + 1]]); + vmulps(ymm0, ymm1, ptr[&g_const_256b.m_shift[8 - i]]); cvttps2dq(ymm0, ymm0); pshuflw(ymm0, ymm0, _MM_SHUFFLE(2, 2, 0, 0)); pshufhw(ymm0, ymm0, _MM_SHUFFLE(2, 2, 0, 0)); @@ -281,7 +286,7 @@ void GSSetupPrimCodeGenerator::Depth_YMM() if (i < 4 || many_regs) vmulps(ymm1, Ymm(4 + i), ymm0); else - vmulps(ymm1, ymm0, ptr[g_const_256b.m_shift[i + 1]]); + vmulps(ymm1, ymm0, ptr[&g_const_256b.m_shift[8 - i]]); movaps(_rip_local_di(i, z), ymm1); } } @@ -356,7 +361,7 @@ void GSSetupPrimCodeGenerator::Texture() if (i < 4 || many_regs) THREEARG(mulps, xym2, XYm(4 + i), xym1); else - vmulps(ymm2, ymm1, ptr[g_const_256b.m_shift[i + 1]]); + vmulps(ymm2, ymm1, ptr[&g_const_256b.m_shift[8 - i]]); if (m_sel.fst) { @@ -424,7 +429,7 @@ void GSSetupPrimCodeGenerator::Color() if (i < 4 || many_regs) THREEARG(mulps, xym0, XYm(4 + i), xym2); else - vmulps(ymm0, ymm2, ptr[g_const_256b.m_shift[i + 1]]); + vmulps(ymm0, ymm2, ptr[&g_const_256b.m_shift[8 - i]]); cvttps2dq(xym0, xym0); packssdw(xym0, xym0); @@ -433,7 +438,7 @@ void GSSetupPrimCodeGenerator::Color() if (i < 4 || many_regs) THREEARG(mulps, xym1, XYm(4 + i), xym3); else - vmulps(ymm1, ymm3, ptr[g_const_256b.m_shift[i + 1]]); + vmulps(ymm1, ymm3, ptr[&g_const_256b.m_shift[8 - i]]); cvttps2dq(xym1, xym1); packssdw(xym1, xym1); @@ -460,7 +465,7 @@ void GSSetupPrimCodeGenerator::Color() if (i < 4 || many_regs) THREEARG(mulps, xym0, XYm(4 + i), xym2); else - vmulps(ymm0, ymm2, ptr[g_const_256b.m_shift[i + 1]]); + vmulps(ymm0, ymm2, ptr[&g_const_256b.m_shift[8 - i]]); cvttps2dq(xym0, xym0); packssdw(xym0, xym0); @@ -469,7 +474,7 @@ void GSSetupPrimCodeGenerator::Color() if (i < 4 || many_regs) THREEARG(mulps, xym1, XYm(4 + i), xym3); else - vmulps(ymm1, ymm3, ptr[g_const_256b.m_shift[i + 1]]); + vmulps(ymm1, ymm3, ptr[&g_const_256b.m_shift[8 - i]]); cvttps2dq(xym1, xym1); packssdw(xym1, xym1);