GS: Remove special-casing of alignment on AVX+

Drops extra code that is unneeded, since AVX and later CPUs have fast unaligned loads
Authored by TellowKrinkle on 2021-04-06 01:58:57 -05:00, committed by refractionpcsx2
parent bdc7dc2cd8
commit 4139da82b9
3 changed files with 54 additions and 71 deletions
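
The change rests on a single observation: from AVX onward, the unaligned load instructions run as fast as the aligned ones when the pointer happens to be aligned, so keeping separate 32-byte-aligned, 16-byte-aligned and unaligned code paths only duplicates code. A minimal sketch of the idea follows; the load256 helper is hypothetical and merely stands in for the project's GSVector load<aligned> wrappers.

#include <immintrin.h>

// Hypothetical GSVector-style helper: the template parameter selects the
// aligned or unaligned 256-bit load intrinsic.
template <bool aligned>
static __m256i load256(const void* p)
{
    if constexpr (aligned)
        return _mm256_load_si256((const __m256i*)p);
    else
        return _mm256_loadu_si256((const __m256i*)p);
}

// On AVX/AVX2 hardware load256<false> runs at full speed even when src is
// 32-byte aligned, so the unaligned variant can serve every caller and the
// per-alignment branches removed in the diff below become unnecessary.
static __m256i read_block(const unsigned char* src)
{
    return load256<false>(src);
}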

View File

@@ -42,50 +42,10 @@ public:
#if _M_SSE >= 0x501
GSVector8i v0, v1;
GSVector8i v0 = GSVector8i::load<false>(s0).acbd();
GSVector8i v1 = GSVector8i::load<false>(s1).acbd();
if (alignment == 32)
{
v0 = GSVector8i::load<true>(s0).acbd();
v1 = GSVector8i::load<true>(s1).acbd();
GSVector8i::sw64(v0, v1);
}
else
{
if (alignment == 16)
{
v0 = GSVector8i::load(&s0[0], &s0[16]).acbd();
v1 = GSVector8i::load(&s1[0], &s1[16]).acbd();
GSVector8i::sw64(v0, v1);
}
else
{
//v0 = GSVector8i::load(&s0[0], &s0[16], &s0[8], &s0[24]);
//v1 = GSVector8i::load(&s1[0], &s1[16], &s1[8], &s1[24]);
GSVector4i v4 = GSVector4i::load(&s0[0], &s1[0]);
GSVector4i v5 = GSVector4i::load(&s0[8], &s1[8]);
GSVector4i v6 = GSVector4i::load(&s0[16], &s1[16]);
GSVector4i v7 = GSVector4i::load(&s0[24], &s1[24]);
if (mask == 0xffffffff)
{
// just write them out directly
((GSVector4i*)dst)[i * 4 + 0] = v4;
((GSVector4i*)dst)[i * 4 + 1] = v5;
((GSVector4i*)dst)[i * 4 + 2] = v6;
((GSVector4i*)dst)[i * 4 + 3] = v7;
return;
}
v0 = GSVector8i::cast(v4).insert<1>(v5);
v1 = GSVector8i::cast(v6).insert<1>(v7);
}
}
GSVector8i::sw64(v0, v1);
if (mask == 0xffffffff)
{
@@ -112,6 +72,17 @@ public:
GSVector4i v0, v1, v2, v3;
#if FAST_UNALIGNED
v0 = GSVector4i::load<false>(&s0[0]);
v1 = GSVector4i::load<false>(&s0[16]);
v2 = GSVector4i::load<false>(&s1[0]);
v3 = GSVector4i::load<false>(&s1[16]);
GSVector4i::sw64(v0, v2, v1, v3);
#else
if (alignment != 0)
{
v0 = GSVector4i::load<true>(&s0[0]);
@@ -129,6 +100,8 @@ public:
v3 = GSVector4i::load(&s0[24], &s1[24]);
}
#endif
if (mask == 0xffffffff)
{
((GSVector4i*)dst)[i * 4 + 0] = v0;
@@ -169,31 +142,11 @@ public:
#if _M_SSE >= 0x501
GSVector8i v0, v1;
GSVector8i v0 = GSVector8i::load<false>(s0);
GSVector8i v1 = GSVector8i::load<false>(s1);
if (alignment == 32)
{
v0 = GSVector8i::load<true>(s0);
v1 = GSVector8i::load<true>(s1);
GSVector8i::sw128(v0, v1);
GSVector8i::sw16(v0, v1);
}
else
{
if (alignment == 16)
{
v0 = GSVector8i::load(&s0[0], &s1[0]);
v1 = GSVector8i::load(&s0[16], &s1[16]);
}
else
{
v0 = GSVector8i::load(&s0[0], &s0[8], &s1[0], &s1[8]);
v1 = GSVector8i::load(&s0[16], &s0[24], &s1[16], &s1[24]);
}
GSVector8i::sw16(v0, v1);
}
GSVector8i::sw128(v0, v1);
GSVector8i::sw16(v0, v1);
v0 = v0.acbd();
v1 = v1.acbd();
@@ -205,6 +158,18 @@ public:
GSVector4i v0, v1, v2, v3;
#if FAST_UNALIGNED
v0 = GSVector4i::load<false>(&s0[0]);
v1 = GSVector4i::load<false>(&s0[16]);
v2 = GSVector4i::load<false>(&s1[0]);
v3 = GSVector4i::load<false>(&s1[16]);
GSVector4i::sw16(v0, v1, v2, v3);
GSVector4i::sw64(v0, v1, v2, v3);
#else
if (alignment != 0)
{
v0 = GSVector4i::load<true>(&s0[0]);
@@ -225,6 +190,8 @@ public:
GSVector4i::sw64(v0, v1, v2, v3);
}
#endif
((GSVector4i*)dst)[i * 4 + 0] = v0;
((GSVector4i*)dst)[i * 4 + 1] = v2;
((GSVector4i*)dst)[i * 4 + 2] = v1;
@@ -240,10 +207,10 @@ public:
#if _M_SSE >= 0x501
GSVector4i v4 = GSVector4i::load<alignment != 0>(&src[srcpitch * 0]);
GSVector4i v5 = GSVector4i::load<alignment != 0>(&src[srcpitch * 1]);
GSVector4i v6 = GSVector4i::load<alignment != 0>(&src[srcpitch * 2]);
GSVector4i v7 = GSVector4i::load<alignment != 0>(&src[srcpitch * 3]);
GSVector4i v4 = GSVector4i::load<false>(&src[srcpitch * 0]);
GSVector4i v5 = GSVector4i::load<false>(&src[srcpitch * 1]);
GSVector4i v6 = GSVector4i::load<false>(&src[srcpitch * 2]);
GSVector4i v7 = GSVector4i::load<false>(&src[srcpitch * 3]);
GSVector8i v0(v4, v5);
GSVector8i v1(v6, v7);

View File

@@ -689,6 +689,9 @@ void GSLocalMemory::WriteImageTopBottom(int l, int r, int y, int h, const u8* sr
if (h2 > 0)
{
#if FAST_UNALIGNED
WriteImageColumn<psm, bsx, bsy, 0>(l, r, y, h2, src, srcpitch, BITBLTBUF);
#else
size_t addr = (size_t)&src[l * trbpp >> 3];
if ((addr & 31) == 0 && (srcpitch & 31) == 0)
@@ -703,6 +706,7 @@ void GSLocalMemory::WriteImageTopBottom(int l, int r, int y, int h, const u8* sr
{
WriteImageColumn<psm, bsx, bsy, 0>(l, r, y, h2, src, srcpitch, BITBLTBUF);
}
#endif
src += srcpitch * h2;
y += h2;
@@ -839,6 +843,9 @@ void GSLocalMemory::WriteImage(int& tx, int& ty, const u8* src, int len, GIFRegB
if (h2 > 0)
{
#if FAST_UNALIGNED
WriteImageBlock<psm, bsx, bsy, 0>(la, ra, ty, h2, s, srcpitch, BITBLTBUF);
#else
size_t addr = (size_t)&s[la * trbpp >> 3];
if ((addr & 31) == 0 && (srcpitch & 31) == 0)
@@ -853,6 +860,7 @@ void GSLocalMemory::WriteImage(int& tx, int& ty, const u8* src, int len, GIFRegB
{
WriteImageBlock<psm, bsx, bsy, 0>(la, ra, ty, h2, s, srcpitch, BITBLTBUF);
}
#endif
s += srcpitch * h2;
ty += h2;
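
Both GSLocalMemory call sites follow the same pattern: the old code probed the source address and pitch at run time to choose between the aligned and unaligned template instantiations, while the new code always takes the unaligned (alignment == 0) one when FAST_UNALIGNED is set. A condensed sketch of that dispatch, with the hypothetical write_column standing in for the WriteImageColumn/WriteImageBlock family and simplified to the 32-byte check visible in the hunks above:

#include <cstdint>

#ifndef FAST_UNALIGNED
#define FAST_UNALIGNED 1 // stand-in definition so the sketch is self-contained
#endif

// Stand-in for WriteImageColumn<psm, bsx, bsy, alignment>: the alignment
// parameter only decides whether aligned vector loads may be used.
template <int alignment>
static void write_column(const std::uint8_t* src, int srcpitch)
{
    (void)src;
    (void)srcpitch; // the actual block copy is omitted in this sketch
}

// The dispatch the patch collapses: pre-AVX builds keep the run-time
// alignment probe, AVX+ builds always use the unaligned instantiation.
static void write_dispatch(const std::uint8_t* src, int srcpitch)
{
#if FAST_UNALIGNED
    write_column<0>(src, srcpitch);
#else
    if (((std::uintptr_t)src & 31) == 0 && (srcpitch & 31) == 0)
        write_column<32>(src, srcpitch);
    else
        write_column<0>(src, srcpitch);
#endif
}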

View File

@@ -35,3 +35,11 @@
#elif _M_SSE < 0x401
#error PCSX2 requires compiling for at least SSE 4.1
#endif
// Starting with AVX, processors have fast unaligned loads
// Reduce code duplication by not compiling multiple versions
#if _M_SSE >= 0x500
#define FAST_UNALIGNED 1
#else
#define FAST_UNALIGNED 0
#endif
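
The threshold follows PCSX2's _M_SSE encoding (0x401 = SSE4.1, 0x500 = AVX, 0x501 = AVX2), so the single unaligned path is enabled on every AVX-class build. A small illustrative compile-time check of that relationship, not part of the patch:

// Illustrative only, assuming the _M_SSE values above: FAST_UNALIGNED
// should be set exactly on AVX-class builds.
#if FAST_UNALIGNED
static_assert(_M_SSE >= 0x500, "FAST_UNALIGNED expects an AVX-class build");
#else
static_assert(_M_SSE < 0x500, "pre-AVX builds keep the aligned/unaligned code paths");
#endif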