diff --git a/pcsx2/GS/GSBlock.cpp b/pcsx2/GS/GSBlock.cpp index 64b0ba86aa..f89353c14e 100644 --- a/pcsx2/GS/GSBlock.cpp +++ b/pcsx2/GS/GSBlock.cpp @@ -19,10 +19,13 @@ CONSTINIT const GSVector4i GSBlock::m_r16mask(0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15); CONSTINIT const GSVector4i GSBlock::m_r8mask(0, 4, 2, 6, 8, 12, 10, 14, 1, 5, 3, 7, 9, 13, 11, 15); CONSTINIT const GSVector4i GSBlock::m_r4mask(0, 8, 4, 12, 1, 9, 5, 13, 2, 10, 6, 14, 3, 11, 7, 15); +CONSTINIT const GSVector4i GSBlock::m_w4mask(0, 4, 8, 12, 2, 6, 10, 14, 1, 5, 9, 13, 3, 7, 11, 15); CONSTINIT const GSVector4i GSBlock::m_palvec_mask(0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15); CONSTINIT const GSVector4i GSBlock::m_avx2_r8mask1(0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15); CONSTINIT const GSVector4i GSBlock::m_avx2_r8mask2(1, 5, 9, 13, 0, 4, 8, 12, 3, 7, 11, 15, 2, 6, 10, 14); +CONSTINIT const GSVector4i GSBlock::m_avx2_w8mask1(0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15); +CONSTINIT const GSVector4i GSBlock::m_avx2_w8mask2(4, 0, 12, 8, 5, 1, 13, 9, 6, 2, 14, 10, 7, 3, 15, 11); CONSTINIT const GSVector4i GSBlock::m_uw8hmask0(0, 0, 0, 0, 1, 1, 1, 1, 8, 8, 8, 8, 9, 9, 9, 9); CONSTINIT const GSVector4i GSBlock::m_uw8hmask1(2, 2, 2, 2, 3, 3, 3, 3, 10, 10, 10, 10, 11, 11, 11, 11); diff --git a/pcsx2/GS/GSBlock.h b/pcsx2/GS/GSBlock.h index 5a8c11dfd6..cc91198cef 100644 --- a/pcsx2/GS/GSBlock.h +++ b/pcsx2/GS/GSBlock.h @@ -24,10 +24,13 @@ class GSBlock static const GSVector4i m_r16mask; static const GSVector4i m_r8mask; static const GSVector4i m_r4mask; + static const GSVector4i m_w4mask; static const GSVector4i m_palvec_mask; static const GSVector4i m_avx2_r8mask1; static const GSVector4i m_avx2_r8mask2; + static const GSVector4i m_avx2_w8mask1; + static const GSVector4i m_avx2_w8mask2; static const GSVector4i m_uw8hmask0; static const GSVector4i m_uw8hmask1; @@ -216,24 +219,20 @@ public: GSVector8i v0(v4, v5); GSVector8i v1(v6, v7); + GSVector8i v2 = v0.blend32<0xaa>(v1).shuffle8(GSVector8i::broadcast128(m_avx2_w8mask1)).acbd(); + GSVector8i v3 = v1.blend32<0xaa>(v0).shuffle8(GSVector8i::broadcast128(m_avx2_w8mask2)).acbd(); + if ((i & 1) == 0) { - v1 = v1.yxwz(); + ((GSVector8i*)dst)[i * 2 + 0] = v2; + ((GSVector8i*)dst)[i * 2 + 1] = v3; } else { - v0 = v0.yxwz(); + ((GSVector8i*)dst)[i * 2 + 0] = v3; + ((GSVector8i*)dst)[i * 2 + 1] = v2; } - GSVector8i::sw8(v0, v1); - GSVector8i::sw16(v0, v1); - - v0 = v0.acbd(); - v1 = v1.acbd(); - - ((GSVector8i*)dst)[i * 2 + 0] = v0; - ((GSVector8i*)dst)[i * 2 + 1] = v1; - #else GSVector4i v0 = GSVector4i::load(&src[srcpitch * 0]); @@ -273,6 +272,33 @@ public: // TODO: pshufb +#if _M_SSE >= 0x501 + + GSVector8i v0 = GSVector8i(GSVector4i::load(&src[srcpitch * 0]), GSVector4i::load(&src[srcpitch * 1])); + GSVector8i v1 = GSVector8i(GSVector4i::load(&src[srcpitch * 2]), GSVector4i::load(&src[srcpitch * 3])); + + v0 = v0.shuffle8(GSVector8i::broadcast128(m_w4mask)).acbd(); + v1 = v1.shuffle8(GSVector8i::broadcast128(m_w4mask)).acbd(); + + if ((i & 1) == 0) + { + v0 = v0.xzyw(); + v1 = v1.ywxz(); + } + else + { + v0 = v0.ywxz(); + v1 = v1.xzyw(); + } + + GSVector8i::mix4(v0, v1); + GSVector8i::sw32(v0, v1); + + ((GSVector8i*)dst)[i * 2 + 0] = v0; + ((GSVector8i*)dst)[i * 2 + 1] = v1; + +#else + GSVector4i v0 = GSVector4i::load(&src[srcpitch * 0]); GSVector4i v1 = GSVector4i::load(&src[srcpitch * 1]); GSVector4i v2 = GSVector4i::load(&src[srcpitch * 2]); @@ -298,6 +324,8 @@ public: ((GSVector4i*)dst)[i * 4 + 1] = v1; ((GSVector4i*)dst)[i * 4 + 2] = v2; ((GSVector4i*)dst)[i * 4 + 3] = v3; + +#endif } template