// Review notes for this patch (commentary only; the patch text below is unchanged).
//
// Format: unified git diff touching four files under pcsx2/GS/ (GSBlock.h,
// GSLocalMemory.cpp, GSLocalMemory.h, GSVector4i.h). Line breaks are collapsed in
// this copy, so the hunk line counts cannot be checked against the hunk bodies.
// NOTE(review): template argument lists look stripped in this copy (bare
// `reinterpret_cast(dst)`, bare `#include`, bare `template`) - presumably lost in
// extraction; confirm against the real patch before applying it.
//
// What the visible hunks do:
//  - GSBlock.h @1072: replaces the per-row `gather32_8` call with an inner loop
//    (k = 0..3) that reads four bytes at `src + j * 16 + k * 4` and builds each
//    output vector from four scalar `pal[...]` lookups.
//  - GSBlock.h @1543: adds ReadAndExpandBlock8_32HSW, which calls ReadBlock8 into an
//    `alignas(32)` local `u8 block[16 * 16]` (third argument `sizeof(block) / 16`)
//    and then calls ExpandBlock8_32 on that buffer.
//  - GSBlock.h @1829 / @1853: adds ReadAndExpandBlock8H_32HSW, which walks four
//    64-byte groups and looks up `pal[...]` for the bytes at offsets 4n+3
//    (3, 7, 11, ... 63), writing two rows per group; the `#else` branch shown in
//    ReadAndExpandBlock8H_32 now calls it instead of the removed
//    sw64 + `(v >> 24).gather32_32` sequence.
//  - GSLocalMemory.cpp @269: under `_M_SSE == 0x501`, computes `slowVPGATHERDD`
//    (Intel: true when `displayModel` is 0x46, 0x45 or 0x3c; non-Intel: always true).
//    If the SLOW_VPGATHERDD_OVERRIDE environment variable is set, it replaces that
//    value: true when its first character is 'Y', 'y' or '1', false otherwise.
//    When the flag is true, the PSM_PSMT8 / PSM_PSMT8H `rtx` and `rtxb` entries are
//    pointed at the new *HSW readers.
//  - GSLocalMemory.cpp / GSLocalMemory.h: adds ReadTexture8HSW, ReadTexture8HHSW,
//    ReadTextureBlock8HSW and ReadTextureBlock8HHSW, all guarded by
//    `_M_SSE == 0x501`; each forwards to the matching GSBlock *HSW function.
//  - GSVector4i.h @81: the (x, y, z, w) constructor now assigns
//    `m = _mm_set_epi32(w, z, y, x)` in place of the load/upl32 sequence.
//
// NOTE(review): the Intel branch compares `displayModel` only; no family check is
// visible here - confirm these model numbers cannot match non-Haswell parts, and
// whether any other Haswell model numbers should be listed.
// NOTE(review): the @1072 hunk changes a function whose name and enclosing
// preprocessor condition are outside the visible context - confirm which build
// configurations pick up the scalar loop.
diff --git a/pcsx2/GS/GSBlock.h b/pcsx2/GS/GSBlock.h index d8b2198b36..c921dcdde9 100644 --- a/pcsx2/GS/GSBlock.h +++ b/pcsx2/GS/GSBlock.h @@ -1072,7 +1072,12 @@ public: { for (int j = 0; j < 16; j++, dst += dstpitch) { - ((const GSVector4i*)src)[j].gather32_8(pal, (GSVector4i*)dst); + for (int k = 0; k < 4; k++) + { + const u8* s = src + j * 16 + k * 4; + GSVector4i v = GSVector4i(pal[s[0]], pal[s[1]], pal[s[2]], pal[s[3]]); + reinterpret_cast(dst)[k] = v; + } } } @@ -1543,6 +1548,15 @@ public: #endif } + /// ReadAndExpandBlock8 for AVX2 platforms with slow VPGATHERDD (Haswell, Zen, Zen2, Zen3) + /// This is faster than the one in ReadAndExpandBlock8_32 on HSW+ due to a port 5 traffic jam, should be about the same on Zen + __forceinline static void ReadAndExpandBlock8_32HSW(const u8* RESTRICT src, u8* RESTRICT dst, int dstpitch, const u32* RESTRICT pal) + { + alignas(32) u8 block[16 * 16]; + ReadBlock8(src, (u8*)block, sizeof(block) / 16); + ExpandBlock8_32(block, dst, dstpitch, pal); + } + __forceinline static void ReadAndExpandBlock8_32(const u8* RESTRICT src, u8* RESTRICT dst, int dstpitch, const u32* RESTRICT pal) { //printf("ReadAndExpandBlock8_32\n"); @@ -1829,6 +1843,25 @@ public: // TODO: ReadAndExpandBlock4_16 + // ReadAndExpandBlock8H for AVX2 platforms with slow VPGATHERDD (Haswell, Zen, Zen2, Zen3) + // Also serves as the implementation for AVX / SSE + __forceinline static void ReadAndExpandBlock8H_32HSW(const u8* RESTRICT src, u8* RESTRICT dst, int dstpitch, const u32* RESTRICT pal) + { + for (int i = 0; i < 4; i++) + { + const u8* s = src + i * 64; + GSVector4i* d0 = reinterpret_cast(dst + dstpitch * 0); + GSVector4i* d1 = reinterpret_cast(dst + dstpitch * 1); + + d0[0] = GSVector4i(pal[s[ 3]], pal[s[ 7]], pal[s[19]], pal[s[23]]); + d0[1] = GSVector4i(pal[s[35]], pal[s[39]], pal[s[51]], pal[s[55]]); + d1[0] = GSVector4i(pal[s[11]], pal[s[15]], pal[s[27]], pal[s[31]]); + d1[1] = GSVector4i(pal[s[43]], pal[s[47]], pal[s[59]], pal[s[63]]); + + dst += 
dstpitch * 2; + } + } + __forceinline static void ReadAndExpandBlock8H_32(const u8* RESTRICT src, u8* RESTRICT dst, int dstpitch, const u32* RESTRICT pal) { //printf("ReadAndExpandBlock8H_32\n"); @@ -1853,29 +1886,7 @@ public: #else - const GSVector4i* s = (const GSVector4i*)src; - - GSVector4i v0, v1, v2, v3; - - for (int i = 0; i < 4; i++) - { - v0 = s[i * 4 + 0]; - v1 = s[i * 4 + 1]; - v2 = s[i * 4 + 2]; - v3 = s[i * 4 + 3]; - - GSVector4i::sw64(v0, v1, v2, v3); - - (v0 >> 24).gather32_32<>(pal, (GSVector4i*)&dst[0]); - (v1 >> 24).gather32_32<>(pal, (GSVector4i*)&dst[16]); - - dst += dstpitch; - - (v2 >> 24).gather32_32<>(pal, (GSVector4i*)&dst[0]); - (v3 >> 24).gather32_32<>(pal, (GSVector4i*)&dst[16]); - - dst += dstpitch; - } + ReadAndExpandBlock8H_32HSW(src, dst, dstpitch, pal); #endif } diff --git a/pcsx2/GS/GSLocalMemory.cpp b/pcsx2/GS/GSLocalMemory.cpp index 3fba2a66d8..a1d00f20c3 100644 --- a/pcsx2/GS/GSLocalMemory.cpp +++ b/pcsx2/GS/GSLocalMemory.cpp @@ -17,6 +17,7 @@ #include "GSLocalMemory.h" #include "GS.h" #include "GSExtra.h" +#include #include template @@ -269,6 +270,35 @@ GSLocalMemory::GSLocalMemory() m_psm[PSM_PSMZ16].rtxbP = &GSLocalMemory::ReadTextureBlock16; m_psm[PSM_PSMZ16S].rtxbP = &GSLocalMemory::ReadTextureBlock16; +#if _M_SSE == 0x501 + Xbyak::util::Cpu cpu; + bool slowVPGATHERDD; + if (cpu.has(Xbyak::util::Cpu::tINTEL)) + { + // Slow on Haswell + // CPUID data from https://en.wikichip.org/wiki/intel/cpuid + slowVPGATHERDD = cpu.displayModel == 0x46 || cpu.displayModel == 0x45 || cpu.displayModel == 0x3c; + } + else + { + // Currently no Zen CPUs with fast VPGATHERDD + // Check https://uops.info/table.html as new CPUs come out for one that doesn't split it into like 40 µops + // Doing it manually is about 28 µops (8x xmm -> gpr, 6x extr, 8x load, 6x insr) + slowVPGATHERDD = true; + } + if (const char* over = getenv("SLOW_VPGATHERDD_OVERRIDE")) // Easy override for comparing on vs off + { + slowVPGATHERDD = over[0] == 'Y' || over[0] == 
'y' || over[0] == '1'; + } + if (slowVPGATHERDD) + { + m_psm[PSM_PSMT8].rtx = &GSLocalMemory::ReadTexture8HSW; + m_psm[PSM_PSMT8H].rtx = &GSLocalMemory::ReadTexture8HHSW; + m_psm[PSM_PSMT8].rtxb = &GSLocalMemory::ReadTextureBlock8HSW; + m_psm[PSM_PSMT8H].rtxb = &GSLocalMemory::ReadTextureBlock8HHSW; + } +#endif + m_psm[PSM_PSGPU24].bpp = 16; m_psm[PSM_PSMCT16].bpp = m_psm[PSM_PSMCT16S].bpp = 16; m_psm[PSM_PSMT8].bpp = 8; @@ -1460,6 +1490,28 @@ void GSLocalMemory::ReadTexture8H(const GSOffset& off, const GSVector4i& r, u8* }); } +#if _M_SSE == 0x501 +void GSLocalMemory::ReadTexture8HSW(const GSOffset& off, const GSVector4i& r, u8* dst, int dstpitch, const GIFRegTEXA& TEXA) +{ + const u32* pal = m_clut; + + foreachBlock(off.assertSizesMatch(swizzle8), this, r, dst, dstpitch, 32, [&](u8* read_dst, const u8* src) + { + GSBlock::ReadAndExpandBlock8_32HSW(src, read_dst, dstpitch, pal); + }); +} + +void GSLocalMemory::ReadTexture8HHSW(const GSOffset& off, const GSVector4i& r, u8* dst, int dstpitch, const GIFRegTEXA& TEXA) +{ + const u32* pal = m_clut; + + foreachBlock(off.assertSizesMatch(swizzle32), this, r, dst, dstpitch, 32, [&](u8* read_dst, const u8* src) + { + GSBlock::ReadAndExpandBlock8H_32HSW(src, read_dst, dstpitch, pal); + }); +} +#endif + void GSLocalMemory::ReadTexture4HL(const GSOffset& off, const GSVector4i& r, u8* dst, int dstpitch, const GIFRegTEXA& TEXA) { const u32* pal = m_clut; @@ -1538,6 +1590,22 @@ void GSLocalMemory::ReadTextureBlock8H(u32 bp, u8* dst, int dstpitch, const GIFR GSBlock::ReadAndExpandBlock8H_32(BlockPtr(bp), dst, dstpitch, m_clut); } +#if _M_SSE == 0x501 +void GSLocalMemory::ReadTextureBlock8HSW(u32 bp, u8* dst, int dstpitch, const GIFRegTEXA& TEXA) const +{ + ALIGN_STACK(32); + + GSBlock::ReadAndExpandBlock8_32HSW(BlockPtr(bp), dst, dstpitch, m_clut); +} + +void GSLocalMemory::ReadTextureBlock8HHSW(u32 bp, u8* dst, int dstpitch, const GIFRegTEXA& TEXA) const +{ + ALIGN_STACK(32); + + GSBlock::ReadAndExpandBlock8H_32HSW(BlockPtr(bp), 
dst, dstpitch, m_clut); +} +#endif + void GSLocalMemory::ReadTextureBlock4HL(u32 bp, u8* dst, int dstpitch, const GIFRegTEXA& TEXA) const { ALIGN_STACK(32); diff --git a/pcsx2/GS/GSLocalMemory.h b/pcsx2/GS/GSLocalMemory.h index 03c916d1d9..6dc3f0484a 100644 --- a/pcsx2/GS/GSLocalMemory.h +++ b/pcsx2/GS/GSLocalMemory.h @@ -1156,6 +1156,13 @@ public: void ReadTextureBlock4HL(u32 bp, u8* dst, int dstpitch, const GIFRegTEXA& TEXA) const; void ReadTextureBlock4HH(u32 bp, u8* dst, int dstpitch, const GIFRegTEXA& TEXA) const; +#if _M_SSE == 0x501 + void ReadTexture8HSW(const GSOffset& off, const GSVector4i& r, u8* dst, int dstpitch, const GIFRegTEXA& TEXA); + void ReadTexture8HHSW(const GSOffset& off, const GSVector4i& r, u8* dst, int dstpitch, const GIFRegTEXA& TEXA); + void ReadTextureBlock8HSW(u32 bp, u8* dst, int dstpitch, const GIFRegTEXA& TEXA) const; + void ReadTextureBlock8HHSW(u32 bp, u8* dst, int dstpitch, const GIFRegTEXA& TEXA) const; +#endif + // pal ? 8 : 32 void ReadTexture8P(const GSOffset& off, const GSVector4i& r, u8* dst, int dstpitch, const GIFRegTEXA& TEXA); diff --git a/pcsx2/GS/GSVector4i.h b/pcsx2/GS/GSVector4i.h index 568bb348ca..72f6602564 100644 --- a/pcsx2/GS/GSVector4i.h +++ b/pcsx2/GS/GSVector4i.h @@ -81,16 +81,7 @@ public: __forceinline GSVector4i(int x, int y, int z, int w) { - // 4 gprs - - // m = _mm_set_epi32(w, z, y, x); - - // 2 gprs - - GSVector4i xz = load(x).upl32(load(z)); - GSVector4i yw = load(y).upl32(load(w)); - - *this = xz.upl32(yw); + m = _mm_set_epi32(w, z, y, x); } __forceinline GSVector4i(int x, int y)