diff --git a/pcsx2/GS/GSBlock.cpp b/pcsx2/GS/GSBlock.cpp index e5e03ef06f..64b0ba86aa 100644 --- a/pcsx2/GS/GSBlock.cpp +++ b/pcsx2/GS/GSBlock.cpp @@ -19,6 +19,7 @@ CONSTINIT const GSVector4i GSBlock::m_r16mask(0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15); CONSTINIT const GSVector4i GSBlock::m_r8mask(0, 4, 2, 6, 8, 12, 10, 14, 1, 5, 3, 7, 9, 13, 11, 15); CONSTINIT const GSVector4i GSBlock::m_r4mask(0, 8, 4, 12, 1, 9, 5, 13, 2, 10, 6, 14, 3, 11, 7, 15); +CONSTINIT const GSVector4i GSBlock::m_palvec_mask(0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15); CONSTINIT const GSVector4i GSBlock::m_avx2_r8mask1(0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15); CONSTINIT const GSVector4i GSBlock::m_avx2_r8mask2(1, 5, 9, 13, 0, 4, 8, 12, 3, 7, 11, 15, 2, 6, 10, 14); diff --git a/pcsx2/GS/GSBlock.h b/pcsx2/GS/GSBlock.h index 7f5aa9c23e..39fab9c384 100644 --- a/pcsx2/GS/GSBlock.h +++ b/pcsx2/GS/GSBlock.h @@ -24,6 +24,7 @@ class GSBlock static const GSVector4i m_r16mask; static const GSVector4i m_r8mask; static const GSVector4i m_r4mask; + static const GSVector4i m_palvec_mask; static const GSVector4i m_avx2_r8mask1; static const GSVector4i m_avx2_r8mask2; @@ -1759,72 +1760,106 @@ public: // TODO: ReadAndExpandBlock8_16 - __forceinline static void ReadAndExpandBlock4_32(const u8* RESTRICT src, u8* RESTRICT dst, int dstpitch, const u64* RESTRICT pal) + /// Load 16-element palette into four vectors, with each u32 split across the four vectors + template <typename V> + __forceinline static void LoadPalVecs(const u32* RESTRICT pal, V& p0, V& p1, V& p2, V& p3) + { + const GSVector4i* p = (const GSVector4i*)pal; + p0 = V::broadcast128(p[0]).shuffle8(V::broadcast128(m_palvec_mask)); + p1 = V::broadcast128(p[1]).shuffle8(V::broadcast128(m_palvec_mask)); + p2 = V::broadcast128(p[2]).shuffle8(V::broadcast128(m_palvec_mask)); + p3 = V::broadcast128(p[3]).shuffle8(V::broadcast128(m_palvec_mask)); + V::sw32(p0, p1, p2, p3); + V::sw64(p0, p1, p2, p3); + std::swap(p1, p2); + } + + 
template <typename V> + __forceinline static void ReadClut4AndWrite(const V& p0, const V& p1, const V& p2, const V& p3, const V& src, V* dst, int dstride) + { + V r0 = p0.shuffle8(src); + V r1 = p1.shuffle8(src); + V r2 = p2.shuffle8(src); + V r3 = p3.shuffle8(src); + + V::sw8(r0, r1, r2, r3); + V::sw16(r0, r1, r2, r3); + + dst[dstride * 0] = r0; + dst[dstride * 1] = r2; + dst[dstride * 2] = r1; + dst[dstride * 3] = r3; + } + + __forceinline static void ReadAndExpandBlock4_32(const u8* RESTRICT src, u8* RESTRICT dst, int dstpitch, const u32* RESTRICT pal) { //printf("ReadAndExpandBlock4_32\n"); const GSVector4i* s = (const GSVector4i*)src; + GSVector4i p0, p1, p2, p3; + LoadPalVecs(pal, p0, p1, p2, p3); + GSVector4i mask(0x0f0f0f0f); + GSVector4i v0, v1, v2, v3; - GSVector4 v0f, v1f, v2f, v3f; for (int i = 0; i < 2; i++) { + GSVector4i* d0 = reinterpret_cast<GSVector4i*>(dst + dstpitch * 0); + GSVector4i* d1 = reinterpret_cast<GSVector4i*>(dst + dstpitch * 1); + GSVector4i* d2 = reinterpret_cast<GSVector4i*>(dst + dstpitch * 2); + GSVector4i* d3 = reinterpret_cast<GSVector4i*>(dst + dstpitch * 3); + v0 = s[i * 8 + 0]; v1 = s[i * 8 + 1]; v2 = s[i * 8 + 2]; v3 = s[i * 8 + 3]; - GSVector4i::sw32_inv(v0, v1, v2, v3); - GSVector4i::mix4(v0, v1); - GSVector4i::mix4(v2, v3); + GSVector4i::sw64(v0, v1, v2, v3); - v0f = GSVector4::cast(v0); - v1f = GSVector4::cast(v1); - v2f = GSVector4::cast(v2); - v3f = GSVector4::cast(v3); + v0 = v0.shuffle8(m_palvec_mask); + v1 = v1.shuffle8(m_palvec_mask); + v2 = v2.shuffle8(m_palvec_mask); + v3 = v3.shuffle8(m_palvec_mask); - v0 = GSVector4i::cast(v0f.xzxz(v2f)).shuffle8(m_r4mask); - v1 = GSVector4i::cast(v0f.ywyw(v2f)).shuffle8(m_r4mask); - v2 = GSVector4i::cast(v1f.zxzx(v3f)).shuffle8(m_r4mask); - v3 = GSVector4i::cast(v1f.wywy(v3f)).shuffle8(m_r4mask); + ReadClut4AndWrite(p0, p1, p2, p3, v0 & mask, d0 + 0, 2); + ReadClut4AndWrite(p0, p1, p2, p3, (v0 >> 4) & mask, d2 + 1, 2); + ReadClut4AndWrite(p0, p1, p2, p3, v1 & mask, d0 + 1, 2); + ReadClut4AndWrite(p0, p1, p2, p3, (v1 >> 4) & mask, d2 + 0, 
2); + ReadClut4AndWrite(p0, p1, p2, p3, v2 & mask, d1 + 0, 2); + ReadClut4AndWrite(p0, p1, p2, p3, (v2 >> 4) & mask, d3 + 1, 2); + ReadClut4AndWrite(p0, p1, p2, p3, v3 & mask, d1 + 1, 2); + ReadClut4AndWrite(p0, p1, p2, p3, (v3 >> 4) & mask, d3 + 0, 2); - v0.gather64_8<>(pal, (GSVector4i*)dst); - dst += dstpitch; - v1.gather64_8<>(pal, (GSVector4i*)dst); - dst += dstpitch; - v2.gather64_8<>(pal, (GSVector4i*)dst); - dst += dstpitch; - v3.gather64_8<>(pal, (GSVector4i*)dst); - dst += dstpitch; + dst += dstpitch * 4; + + d0 = reinterpret_cast<GSVector4i*>(dst + dstpitch * 0); + d1 = reinterpret_cast<GSVector4i*>(dst + dstpitch * 1); + d2 = reinterpret_cast<GSVector4i*>(dst + dstpitch * 2); + d3 = reinterpret_cast<GSVector4i*>(dst + dstpitch * 3); v0 = s[i * 8 + 4]; v1 = s[i * 8 + 5]; v2 = s[i * 8 + 6]; v3 = s[i * 8 + 7]; - GSVector4i::sw32_inv(v0, v1, v2, v3); - GSVector4i::mix4(v0, v1); - GSVector4i::mix4(v2, v3); + GSVector4i::sw64(v0, v1, v2, v3); - v0f = GSVector4::cast(v0); - v1f = GSVector4::cast(v1); - v2f = GSVector4::cast(v2); - v3f = GSVector4::cast(v3); + v0 = v0.shuffle8(m_palvec_mask); + v1 = v1.shuffle8(m_palvec_mask); + v2 = v2.shuffle8(m_palvec_mask); + v3 = v3.shuffle8(m_palvec_mask); - v0 = GSVector4i::cast(v0f.zxzx(v2f)).shuffle8(m_r4mask); - v1 = GSVector4i::cast(v0f.wywy(v2f)).shuffle8(m_r4mask); - v2 = GSVector4i::cast(v1f.xzxz(v3f)).shuffle8(m_r4mask); - v3 = GSVector4i::cast(v1f.ywyw(v3f)).shuffle8(m_r4mask); + ReadClut4AndWrite(p0, p1, p2, p3, v0 & mask, d0 + 1, 2); + ReadClut4AndWrite(p0, p1, p2, p3, (v0 >> 4) & mask, d2 + 0, 2); + ReadClut4AndWrite(p0, p1, p2, p3, v1 & mask, d0 + 0, 2); + ReadClut4AndWrite(p0, p1, p2, p3, (v1 >> 4) & mask, d2 + 1, 2); + ReadClut4AndWrite(p0, p1, p2, p3, v2 & mask, d1 + 1, 2); + ReadClut4AndWrite(p0, p1, p2, p3, (v2 >> 4) & mask, d3 + 0, 2); + ReadClut4AndWrite(p0, p1, p2, p3, v3 & mask, d1 + 0, 2); + ReadClut4AndWrite(p0, p1, p2, p3, (v3 >> 4) & mask, d3 + 1, 2); - v0.gather64_8<>(pal, (GSVector4i*)dst); - dst += dstpitch; - v1.gather64_8<>(pal, 
(GSVector4i*)dst); - dst += dstpitch; - v2.gather64_8<>(pal, (GSVector4i*)dst); - dst += dstpitch; - v3.gather64_8<>(pal, (GSVector4i*)dst); - dst += dstpitch; + dst += dstpitch * 4; } } diff --git a/pcsx2/GS/GSLocalMemory.cpp b/pcsx2/GS/GSLocalMemory.cpp index 50ce07ed52..3fba2a66d8 100644 --- a/pcsx2/GS/GSLocalMemory.cpp +++ b/pcsx2/GS/GSLocalMemory.cpp @@ -1442,7 +1442,7 @@ void GSLocalMemory::ReadTexture8(const GSOffset& off, const GSVector4i& r, u8* d void GSLocalMemory::ReadTexture4(const GSOffset& off, const GSVector4i& r, u8* dst, int dstpitch, const GIFRegTEXA& TEXA) { - const u64* pal = m_clut; + const u32* pal = m_clut; foreachBlock(off.assertSizesMatch(swizzle4), this, r, dst, dstpitch, 32, [&](u8* read_dst, const u8* src) { diff --git a/pcsx2/GS/GSVector4i.h b/pcsx2/GS/GSVector4i.h index e520c9482c..b8c59e576a 100644 --- a/pcsx2/GS/GSVector4i.h +++ b/pcsx2/GS/GSVector4i.h @@ -2005,6 +2005,13 @@ public: // clang-format on + /// Noop, here so broadcast128 can be used generically over all vectors + __forceinline static GSVector4i broadcast128(const GSVector4i& v) + { + return v; + } + + __forceinline static GSVector4i zero() { return GSVector4i(_mm_setzero_si128()); } __forceinline static GSVector4i xffffffff() { return zero() == zero(); } diff --git a/tests/ctest/GS/swizzle_test_main.cpp b/tests/ctest/GS/swizzle_test_main.cpp index c3339fc8b6..9057aa49b4 100644 --- a/tests/ctest/GS/swizzle_test_main.cpp +++ b/tests/ctest/GS/swizzle_test_main.cpp @@ -432,7 +432,7 @@ TEST(ReadAndExpandTest, Read4) { TestData expected = swizzle4(&columnTable4[0][0], data, true); expected = expand4(expected.prepareExpand()); - GSBlock::ReadAndExpandBlock4_32(data.block, data.output, 128, data.clut64); + GSBlock::ReadAndExpandBlock4_32(data.block, data.output, 128, data.clut32); assertEqual(expected, data, "ReadAndExpand4", 16, 32, 32); }); }