From 4139da82b992c3ac8e24dd8d9df7f10ba6f1cec9 Mon Sep 17 00:00:00 2001 From: TellowKrinkle Date: Tue, 6 Apr 2021 01:58:57 -0500 Subject: [PATCH] GS: Remove special casing alignment on AVX+ Extra code, unneeded as AVX+ has fast unaligned loads --- pcsx2/GS/GSBlock.h | 109 +++++++++++++------------------------ pcsx2/GS/GSLocalMemory.cpp | 8 +++ pcsx2/PCSX2Base.h | 8 +++ 3 files changed, 54 insertions(+), 71 deletions(-) diff --git a/pcsx2/GS/GSBlock.h b/pcsx2/GS/GSBlock.h index d9457e6d87..32404b6688 100644 --- a/pcsx2/GS/GSBlock.h +++ b/pcsx2/GS/GSBlock.h @@ -42,50 +42,10 @@ public: #if _M_SSE >= 0x501 - GSVector8i v0, v1; + GSVector8i v0 = GSVector8i::load(s0).acbd(); + GSVector8i v1 = GSVector8i::load(s1).acbd(); - if (alignment == 32) - { - v0 = GSVector8i::load(s0).acbd(); - v1 = GSVector8i::load(s1).acbd(); - - GSVector8i::sw64(v0, v1); - } - else - { - if (alignment == 16) - { - v0 = GSVector8i::load(&s0[0], &s0[16]).acbd(); - v1 = GSVector8i::load(&s1[0], &s1[16]).acbd(); - - GSVector8i::sw64(v0, v1); - } - else - { - //v0 = GSVector8i::load(&s0[0], &s0[16], &s0[8], &s0[24]); - //v1 = GSVector8i::load(&s1[0], &s1[16], &s1[8], &s1[24]); - - GSVector4i v4 = GSVector4i::load(&s0[0], &s1[0]); - GSVector4i v5 = GSVector4i::load(&s0[8], &s1[8]); - GSVector4i v6 = GSVector4i::load(&s0[16], &s1[16]); - GSVector4i v7 = GSVector4i::load(&s0[24], &s1[24]); - - if (mask == 0xffffffff) - { - // just write them out directly - - ((GSVector4i*)dst)[i * 4 + 0] = v4; - ((GSVector4i*)dst)[i * 4 + 1] = v5; - ((GSVector4i*)dst)[i * 4 + 2] = v6; - ((GSVector4i*)dst)[i * 4 + 3] = v7; - - return; - } - - v0 = GSVector8i::cast(v4).insert<1>(v5); - v1 = GSVector8i::cast(v6).insert<1>(v7); - } - } + GSVector8i::sw64(v0, v1); if (mask == 0xffffffff) { @@ -112,6 +72,17 @@ public: GSVector4i v0, v1, v2, v3; +#if FAST_UNALIGNED + + v0 = GSVector4i::load(&s0[0]); + v1 = GSVector4i::load(&s0[16]); + v2 = GSVector4i::load(&s1[0]); + v3 = GSVector4i::load(&s1[16]); + + GSVector4i::sw64(v0, v2, v1, v3); + +#else + if (alignment != 0) { v0 = GSVector4i::load(&s0[0]); @@ -129,6 +100,8 @@ public: v3 = GSVector4i::load(&s0[24], &s1[24]); } +#endif + if (mask == 0xffffffff) { ((GSVector4i*)dst)[i * 4 + 0] = v0; @@ -169,31 +142,11 @@ public: #if _M_SSE >= 0x501 - GSVector8i v0, v1; + GSVector8i v0 = GSVector8i::load(s0); + GSVector8i v1 = GSVector8i::load(s1); - if (alignment == 32) - { - v0 = GSVector8i::load(s0); - v1 = GSVector8i::load(s1); - - GSVector8i::sw128(v0, v1); - GSVector8i::sw16(v0, v1); - } - else - { - if (alignment == 16) - { - v0 = GSVector8i::load(&s0[0], &s1[0]); - v1 = GSVector8i::load(&s0[16], &s1[16]); - } - else - { - v0 = GSVector8i::load(&s0[0], &s0[8], &s1[0], &s1[8]); - v1 = GSVector8i::load(&s0[16], &s0[24], &s1[16], &s1[24]); - } - - GSVector8i::sw16(v0, v1); - } + GSVector8i::sw128(v0, v1); + GSVector8i::sw16(v0, v1); v0 = v0.acbd(); v1 = v1.acbd(); @@ -205,6 +158,18 @@ public: GSVector4i v0, v1, v2, v3; +#if FAST_UNALIGNED + + v0 = GSVector4i::load(&s0[0]); + v1 = GSVector4i::load(&s0[16]); + v2 = GSVector4i::load(&s1[0]); + v3 = GSVector4i::load(&s1[16]); + + GSVector4i::sw16(v0, v1, v2, v3); + GSVector4i::sw64(v0, v1, v2, v3); + +#else + if (alignment != 0) { v0 = GSVector4i::load(&s0[0]); @@ -225,6 +190,8 @@ public: GSVector4i::sw64(v0, v1, v2, v3); } +#endif + ((GSVector4i*)dst)[i * 4 + 0] = v0; ((GSVector4i*)dst)[i * 4 + 1] = v2; ((GSVector4i*)dst)[i * 4 + 2] = v1; @@ -240,10 +207,10 @@ public: #if _M_SSE >= 0x501 - GSVector4i v4 = GSVector4i::load(&src[srcpitch * 0]); - GSVector4i v5 = GSVector4i::load(&src[srcpitch * 1]); - GSVector4i v6 = GSVector4i::load(&src[srcpitch * 2]); - GSVector4i v7 = GSVector4i::load(&src[srcpitch * 3]); + GSVector4i v4 = GSVector4i::load(&src[srcpitch * 0]); + GSVector4i v5 = GSVector4i::load(&src[srcpitch * 1]); + GSVector4i v6 = GSVector4i::load(&src[srcpitch * 2]); + GSVector4i v7 = GSVector4i::load(&src[srcpitch * 3]); GSVector8i v0(v4, v5); GSVector8i v1(v6, v7); diff --git a/pcsx2/GS/GSLocalMemory.cpp b/pcsx2/GS/GSLocalMemory.cpp index 33e2bb7919..50ce07ed52 100644 --- a/pcsx2/GS/GSLocalMemory.cpp +++ b/pcsx2/GS/GSLocalMemory.cpp @@ -689,6 +689,9 @@ void GSLocalMemory::WriteImageTopBottom(int l, int r, int y, int h, const u8* sr if (h2 > 0) { +#if FAST_UNALIGNED + WriteImageColumn(l, r, y, h2, src, srcpitch, BITBLTBUF); +#else size_t addr = (size_t)&src[l * trbpp >> 3]; if ((addr & 31) == 0 && (srcpitch & 31) == 0) @@ -703,6 +706,7 @@ void GSLocalMemory::WriteImageTopBottom(int l, int r, int y, int h, const u8* sr { WriteImageColumn(l, r, y, h2, src, srcpitch, BITBLTBUF); } +#endif src += srcpitch * h2; y += h2; @@ -839,6 +843,9 @@ void GSLocalMemory::WriteImage(int& tx, int& ty, const u8* src, int len, GIFRegB if (h2 > 0) { +#if FAST_UNALIGNED + WriteImageBlock(la, ra, ty, h2, s, srcpitch, BITBLTBUF); +#else size_t addr = (size_t)&s[la * trbpp >> 3]; if ((addr & 31) == 0 && (srcpitch & 31) == 0) @@ -853,6 +860,7 @@ void GSLocalMemory::WriteImage(int& tx, int& ty, const u8* src, int len, GIFRegB { WriteImageBlock(la, ra, ty, h2, s, srcpitch, BITBLTBUF); } +#endif s += srcpitch * h2; ty += h2; diff --git a/pcsx2/PCSX2Base.h b/pcsx2/PCSX2Base.h index 1c3f2e9a43..96aa82ef9b 100644 --- a/pcsx2/PCSX2Base.h +++ b/pcsx2/PCSX2Base.h @@ -35,3 +35,11 @@ #elif _M_SSE < 0x401 #error PCSX2 requires compiling for at least SSE 4.1 #endif + +// Starting with AVX, processors have fast unaligned loads +// Reduce code duplication by not compiling multiple versions +#if _M_SSE >= 0x500 + #define FAST_UNALIGNED 1 +#else + #define FAST_UNALIGNED 0 +#endif