mirror of https://github.com/PCSX2/pcsx2.git
GS: Faster palette lookup for AVX2 platforms with slow VPGATHERDD
This commit is contained in:
parent
e2169bc1da
commit
a6887715c7
|
@ -1072,7 +1072,12 @@ public:
|
|||
{
|
||||
for (int j = 0; j < 16; j++, dst += dstpitch)
|
||||
{
|
||||
((const GSVector4i*)src)[j].gather32_8(pal, (GSVector4i*)dst);
|
||||
for (int k = 0; k < 4; k++)
|
||||
{
|
||||
const u8* s = src + j * 16 + k * 4;
|
||||
GSVector4i v = GSVector4i(pal[s[0]], pal[s[1]], pal[s[2]], pal[s[3]]);
|
||||
reinterpret_cast<GSVector4i*>(dst)[k] = v;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -1543,6 +1548,15 @@ public:
|
|||
#endif
|
||||
}
|
||||
|
||||
/// ReadAndExpandBlock8 for AVX2 platforms with slow VPGATHERDD (Haswell, Zen, Zen2, Zen3)
|
||||
/// This is faster than the one in ReadAndExpandBlock8_32 on HSW+ due to a port 5 traffic jam, should be about the same on Zen
|
||||
__forceinline static void ReadAndExpandBlock8_32HSW(const u8* RESTRICT src, u8* RESTRICT dst, int dstpitch, const u32* RESTRICT pal)
|
||||
{
|
||||
alignas(32) u8 block[16 * 16];
|
||||
ReadBlock8(src, (u8*)block, sizeof(block) / 16);
|
||||
ExpandBlock8_32(block, dst, dstpitch, pal);
|
||||
}
|
||||
|
||||
__forceinline static void ReadAndExpandBlock8_32(const u8* RESTRICT src, u8* RESTRICT dst, int dstpitch, const u32* RESTRICT pal)
|
||||
{
|
||||
//printf("ReadAndExpandBlock8_32\n");
|
||||
|
@ -1829,6 +1843,25 @@ public:
|
|||
|
||||
// TODO: ReadAndExpandBlock4_16
|
||||
|
||||
// ReadAndExpandBlock8H for AVX2 platforms with slow VPGATHERDD (Haswell, Zen, Zen2, Zen3)
|
||||
// Also serves as the implementation for AVX / SSE
|
||||
__forceinline static void ReadAndExpandBlock8H_32HSW(const u8* RESTRICT src, u8* RESTRICT dst, int dstpitch, const u32* RESTRICT pal)
|
||||
{
|
||||
for (int i = 0; i < 4; i++)
|
||||
{
|
||||
const u8* s = src + i * 64;
|
||||
GSVector4i* d0 = reinterpret_cast<GSVector4i*>(dst + dstpitch * 0);
|
||||
GSVector4i* d1 = reinterpret_cast<GSVector4i*>(dst + dstpitch * 1);
|
||||
|
||||
d0[0] = GSVector4i(pal[s[ 3]], pal[s[ 7]], pal[s[19]], pal[s[23]]);
|
||||
d0[1] = GSVector4i(pal[s[35]], pal[s[39]], pal[s[51]], pal[s[55]]);
|
||||
d1[0] = GSVector4i(pal[s[11]], pal[s[15]], pal[s[27]], pal[s[31]]);
|
||||
d1[1] = GSVector4i(pal[s[43]], pal[s[47]], pal[s[59]], pal[s[63]]);
|
||||
|
||||
dst += dstpitch * 2;
|
||||
}
|
||||
}
|
||||
|
||||
__forceinline static void ReadAndExpandBlock8H_32(const u8* RESTRICT src, u8* RESTRICT dst, int dstpitch, const u32* RESTRICT pal)
|
||||
{
|
||||
//printf("ReadAndExpandBlock8H_32\n");
|
||||
|
@ -1853,29 +1886,7 @@ public:
|
|||
|
||||
#else
|
||||
|
||||
const GSVector4i* s = (const GSVector4i*)src;
|
||||
|
||||
GSVector4i v0, v1, v2, v3;
|
||||
|
||||
for (int i = 0; i < 4; i++)
|
||||
{
|
||||
v0 = s[i * 4 + 0];
|
||||
v1 = s[i * 4 + 1];
|
||||
v2 = s[i * 4 + 2];
|
||||
v3 = s[i * 4 + 3];
|
||||
|
||||
GSVector4i::sw64(v0, v1, v2, v3);
|
||||
|
||||
(v0 >> 24).gather32_32<>(pal, (GSVector4i*)&dst[0]);
|
||||
(v1 >> 24).gather32_32<>(pal, (GSVector4i*)&dst[16]);
|
||||
|
||||
dst += dstpitch;
|
||||
|
||||
(v2 >> 24).gather32_32<>(pal, (GSVector4i*)&dst[0]);
|
||||
(v3 >> 24).gather32_32<>(pal, (GSVector4i*)&dst[16]);
|
||||
|
||||
dst += dstpitch;
|
||||
}
|
||||
ReadAndExpandBlock8H_32HSW(src, dst, dstpitch, pal);
|
||||
|
||||
#endif
|
||||
}
|
||||
|
|
|
@ -17,6 +17,7 @@
|
|||
#include "GSLocalMemory.h"
|
||||
#include "GS.h"
|
||||
#include "GSExtra.h"
|
||||
#include <xbyak/xbyak_util.h>
|
||||
#include <unordered_set>
|
||||
|
||||
template <typename Fn>
|
||||
|
@ -269,6 +270,35 @@ GSLocalMemory::GSLocalMemory()
|
|||
m_psm[PSM_PSMZ16].rtxbP = &GSLocalMemory::ReadTextureBlock16;
|
||||
m_psm[PSM_PSMZ16S].rtxbP = &GSLocalMemory::ReadTextureBlock16;
|
||||
|
||||
#if _M_SSE == 0x501
|
||||
Xbyak::util::Cpu cpu;
|
||||
bool slowVPGATHERDD;
|
||||
if (cpu.has(Xbyak::util::Cpu::tINTEL))
|
||||
{
|
||||
// Slow on Haswell
|
||||
// CPUID data from https://en.wikichip.org/wiki/intel/cpuid
|
||||
slowVPGATHERDD = cpu.displayModel == 0x46 || cpu.displayModel == 0x45 || cpu.displayModel == 0x3c;
|
||||
}
|
||||
else
|
||||
{
|
||||
// Currently no Zen CPUs with fast VPGATHERDD
|
||||
// Check https://uops.info/table.html as new CPUs come out for one that doesn't split it into like 40 µops
|
||||
// Doing it manually is about 28 µops (8x xmm -> gpr, 6x extr, 8x load, 6x insr)
|
||||
slowVPGATHERDD = true;
|
||||
}
|
||||
if (const char* over = getenv("SLOW_VPGATHERDD_OVERRIDE")) // Easy override for comparing on vs off
|
||||
{
|
||||
slowVPGATHERDD = over[0] == 'Y' || over[0] == 'y' || over[0] == '1';
|
||||
}
|
||||
if (slowVPGATHERDD)
|
||||
{
|
||||
m_psm[PSM_PSMT8].rtx = &GSLocalMemory::ReadTexture8HSW;
|
||||
m_psm[PSM_PSMT8H].rtx = &GSLocalMemory::ReadTexture8HHSW;
|
||||
m_psm[PSM_PSMT8].rtxb = &GSLocalMemory::ReadTextureBlock8HSW;
|
||||
m_psm[PSM_PSMT8H].rtxb = &GSLocalMemory::ReadTextureBlock8HHSW;
|
||||
}
|
||||
#endif
|
||||
|
||||
m_psm[PSM_PSGPU24].bpp = 16;
|
||||
m_psm[PSM_PSMCT16].bpp = m_psm[PSM_PSMCT16S].bpp = 16;
|
||||
m_psm[PSM_PSMT8].bpp = 8;
|
||||
|
@ -1460,6 +1490,28 @@ void GSLocalMemory::ReadTexture8H(const GSOffset& off, const GSVector4i& r, u8*
|
|||
});
|
||||
}
|
||||
|
||||
#if _M_SSE == 0x501
|
||||
void GSLocalMemory::ReadTexture8HSW(const GSOffset& off, const GSVector4i& r, u8* dst, int dstpitch, const GIFRegTEXA& TEXA)
|
||||
{
|
||||
const u32* pal = m_clut;
|
||||
|
||||
foreachBlock(off.assertSizesMatch(swizzle8), this, r, dst, dstpitch, 32, [&](u8* read_dst, const u8* src)
|
||||
{
|
||||
GSBlock::ReadAndExpandBlock8_32HSW(src, read_dst, dstpitch, pal);
|
||||
});
|
||||
}
|
||||
|
||||
void GSLocalMemory::ReadTexture8HHSW(const GSOffset& off, const GSVector4i& r, u8* dst, int dstpitch, const GIFRegTEXA& TEXA)
|
||||
{
|
||||
const u32* pal = m_clut;
|
||||
|
||||
foreachBlock(off.assertSizesMatch(swizzle32), this, r, dst, dstpitch, 32, [&](u8* read_dst, const u8* src)
|
||||
{
|
||||
GSBlock::ReadAndExpandBlock8H_32HSW(src, read_dst, dstpitch, pal);
|
||||
});
|
||||
}
|
||||
#endif
|
||||
|
||||
void GSLocalMemory::ReadTexture4HL(const GSOffset& off, const GSVector4i& r, u8* dst, int dstpitch, const GIFRegTEXA& TEXA)
|
||||
{
|
||||
const u32* pal = m_clut;
|
||||
|
@ -1538,6 +1590,22 @@ void GSLocalMemory::ReadTextureBlock8H(u32 bp, u8* dst, int dstpitch, const GIFR
|
|||
GSBlock::ReadAndExpandBlock8H_32(BlockPtr(bp), dst, dstpitch, m_clut);
|
||||
}
|
||||
|
||||
#if _M_SSE == 0x501
|
||||
// Single-block read installed as m_psm[PSM_PSMT8].rtxb when VPGATHERDD is considered slow.
// Expands the block at BlockPtr(bp) through m_clut into `dst`. TEXA is not used here.
void GSLocalMemory::ReadTextureBlock8HSW(u32 bp, u8* dst, int dstpitch, const GIFRegTEXA& TEXA) const
{
	ALIGN_STACK(32);

	const u8* const block = BlockPtr(bp);
	const u32* const clut = m_clut;

	GSBlock::ReadAndExpandBlock8_32HSW(block, dst, dstpitch, clut);
}
|
||||
|
||||
// Single-block read installed as m_psm[PSM_PSMT8H].rtxb when VPGATHERDD is considered slow.
// Expands the block at BlockPtr(bp) through m_clut into `dst`. TEXA is not used here.
void GSLocalMemory::ReadTextureBlock8HHSW(u32 bp, u8* dst, int dstpitch, const GIFRegTEXA& TEXA) const
{
	ALIGN_STACK(32);

	const u8* const block = BlockPtr(bp);
	const u32* const clut = m_clut;

	GSBlock::ReadAndExpandBlock8H_32HSW(block, dst, dstpitch, clut);
}
|
||||
#endif
|
||||
|
||||
void GSLocalMemory::ReadTextureBlock4HL(u32 bp, u8* dst, int dstpitch, const GIFRegTEXA& TEXA) const
|
||||
{
|
||||
ALIGN_STACK(32);
|
||||
|
|
|
@ -1156,6 +1156,13 @@ public:
|
|||
void ReadTextureBlock4HL(u32 bp, u8* dst, int dstpitch, const GIFRegTEXA& TEXA) const;
|
||||
void ReadTextureBlock4HH(u32 bp, u8* dst, int dstpitch, const GIFRegTEXA& TEXA) const;
|
||||
|
||||
#if _M_SSE == 0x501
|
||||
void ReadTexture8HSW(const GSOffset& off, const GSVector4i& r, u8* dst, int dstpitch, const GIFRegTEXA& TEXA);
|
||||
void ReadTexture8HHSW(const GSOffset& off, const GSVector4i& r, u8* dst, int dstpitch, const GIFRegTEXA& TEXA);
|
||||
void ReadTextureBlock8HSW(u32 bp, u8* dst, int dstpitch, const GIFRegTEXA& TEXA) const;
|
||||
void ReadTextureBlock8HHSW(u32 bp, u8* dst, int dstpitch, const GIFRegTEXA& TEXA) const;
|
||||
#endif
|
||||
|
||||
// pal ? 8 : 32
|
||||
|
||||
void ReadTexture8P(const GSOffset& off, const GSVector4i& r, u8* dst, int dstpitch, const GIFRegTEXA& TEXA);
|
||||
|
|
|
@ -81,16 +81,7 @@ public:
|
|||
|
||||
/// Builds a vector whose four 32-bit lanes are, from lowest to highest, x, y, z, w.
__forceinline GSVector4i(int x, int y, int z, int w)
{
	// _mm_set_epi32 takes its arguments from the highest lane down to the lowest,
	// hence the reversed order. The previous load/upl32 construction produced the
	// same lanes but was overwritten by this assignment, so it has been dropped.
	m = _mm_set_epi32(w, z, y, x);
}
|
||||
|
||||
__forceinline GSVector4i(int x, int y)
|
||||
|
|
Loading…
Reference in New Issue