GS: Consolidate repeated BlockH code

This commit is contained in:
TellowKrinkle 2021-04-07 17:35:34 -05:00 committed by refractionpcsx2
parent 89f3b46a9e
commit e2169bc1da
3 changed files with 256 additions and 455 deletions

View File

@ -855,47 +855,40 @@ public:
#endif
}
__forceinline static void ReadBlock8HP(const u8* RESTRICT src, u8* RESTRICT dst, int dstpitch)
template <u32 shift, u32 mask>
__forceinline static void ReadBlockHP(const u8* RESTRICT src, u8* RESTRICT dst, int dstpitch)
{
#if _M_SSE >= 0x501
u8* RESTRICT d0 = &dst[dstpitch * 0];
u8* RESTRICT d1 = &dst[dstpitch * 4];
const GSVector8i* s = (const GSVector8i*)src;
GSVector8i v0, v1, v2, v3;
GSVector4i v4, v5;
v0 = s[0].acbd();
v1 = s[1].acbd();
v2 = s[2].acbd();
v3 = s[3].acbd();
GSVector8i maskvec = GSVector8i(mask);
v0 = (v0 >> 24).ps32(v1 >> 24).pu16((v2 >> 24).ps32(v3 >> 24));
for (int i = 0; i < 2; i++)
{
v0 = s[i * 4 + 0].acbd();
v1 = s[i * 4 + 1].acbd();
v2 = s[i * 4 + 2].acbd();
v3 = s[i * 4 + 3].acbd();
v0 = (v0 >> shift).ps32(v1 >> shift).pu16((v2 >> shift).ps32(v3 >> shift));
if (mask != 0xffffffff)
v0 = v0 & maskvec;
v4 = v0.extract<0>();
v5 = v0.extract<1>();
GSVector4i::storel(&d0[dstpitch * 0], v4);
GSVector4i::storel(&d0[dstpitch * 1], v5);
GSVector4i::storeh(&d0[dstpitch * 2], v4);
GSVector4i::storeh(&d0[dstpitch * 3], v5);
GSVector4i::storel(&dst[dstpitch * 0], v4);
GSVector4i::storel(&dst[dstpitch * 1], v5);
GSVector4i::storeh(&dst[dstpitch * 2], v4);
GSVector4i::storeh(&dst[dstpitch * 3], v5);
v0 = s[4].acbd();
v1 = s[5].acbd();
v2 = s[6].acbd();
v3 = s[7].acbd();
v0 = (v0 >> 24).ps32(v1 >> 24).pu16((v2 >> 24).ps32(v3 >> 24));
v4 = v0.extract<0>();
v5 = v0.extract<1>();
GSVector4i::storel(&d1[dstpitch * 0], v4);
GSVector4i::storel(&d1[dstpitch * 1], v5);
GSVector4i::storeh(&d1[dstpitch * 2], v4);
GSVector4i::storeh(&d1[dstpitch * 3], v5);
dst += dstpitch * 4;
}
#else
@ -903,6 +896,8 @@ public:
GSVector4i v0, v1, v2, v3;
GSVector4i maskvec(mask);
for (int i = 0; i < 4; i++)
{
v0 = s[i * 4 + 0];
@ -912,7 +907,10 @@ public:
GSVector4i::sw64(v0, v1, v2, v3);
v0 = ((v0 >> 24).ps32(v1 >> 24)).pu16((v2 >> 24).ps32(v3 >> 24));
v0 = ((v0 >> shift).ps32(v1 >> shift)).pu16((v2 >> shift).ps32(v3 >> shift));
if (mask != 0xffffffff)
v0 = v0 & maskvec;
GSVector4i::storel(dst, v0);
@ -926,149 +924,19 @@ public:
#endif
}
__forceinline static void ReadBlock8HP(const u8* RESTRICT src, u8* RESTRICT dst, int dstpitch)
{
ReadBlockHP<24, 0xffffffff>(src, dst, dstpitch);
}
__forceinline static void ReadBlock4HLP(const u8* RESTRICT src, u8* RESTRICT dst, int dstpitch)
{
#if _M_SSE >= 0x501
u8* RESTRICT d0 = &dst[dstpitch * 0];
u8* RESTRICT d1 = &dst[dstpitch * 4];
const GSVector8i* s = (const GSVector8i*)src;
GSVector8i v0, v1, v2, v3;
GSVector4i v4, v5;
GSVector8i mask(0x0f0f0f0f);
v0 = s[0].acbd();
v1 = s[1].acbd();
v2 = s[2].acbd();
v3 = s[3].acbd();
v0 = (v0 >> 24).ps32(v1 >> 24).pu16((v2 >> 24).ps32(v3 >> 24)) & mask;
v4 = v0.extract<0>();
v5 = v0.extract<1>();
GSVector4i::storel(&d0[dstpitch * 0], v4);
GSVector4i::storel(&d0[dstpitch * 1], v5);
GSVector4i::storeh(&d0[dstpitch * 2], v4);
GSVector4i::storeh(&d0[dstpitch * 3], v5);
v0 = s[4].acbd();
v1 = s[5].acbd();
v2 = s[6].acbd();
v3 = s[7].acbd();
v0 = (v0 >> 24).ps32(v1 >> 24).pu16((v2 >> 24).ps32(v3 >> 24)) & mask;
v4 = v0.extract<0>();
v5 = v0.extract<1>();
GSVector4i::storel(&d1[dstpitch * 0], v4);
GSVector4i::storel(&d1[dstpitch * 1], v5);
GSVector4i::storeh(&d1[dstpitch * 2], v4);
GSVector4i::storeh(&d1[dstpitch * 3], v5);
#else
const GSVector4i* s = (const GSVector4i*)src;
GSVector4i v0, v1, v2, v3;
GSVector4i mask(0x0f0f0f0f);
for (int i = 0; i < 4; i++)
{
v0 = s[i * 4 + 0];
v1 = s[i * 4 + 1];
v2 = s[i * 4 + 2];
v3 = s[i * 4 + 3];
GSVector4i::sw64(v0, v1, v2, v3);
v0 = ((v0 >> 24).ps32(v1 >> 24)).pu16((v2 >> 24).ps32(v3 >> 24)) & mask;
GSVector4i::storel(dst, v0);
dst += dstpitch;
GSVector4i::storeh(dst, v0);
dst += dstpitch;
}
#endif
ReadBlockHP<24, 0x0f0f0f0f>(src, dst, dstpitch);
}
__forceinline static void ReadBlock4HHP(const u8* RESTRICT src, u8* RESTRICT dst, int dstpitch)
{
#if _M_SSE >= 0x501
u8* RESTRICT d0 = &dst[dstpitch * 0];
u8* RESTRICT d1 = &dst[dstpitch * 4];
const GSVector8i* s = (const GSVector8i*)src;
GSVector8i v0, v1, v2, v3;
GSVector4i v4, v5;
v0 = s[0].acbd();
v1 = s[1].acbd();
v2 = s[2].acbd();
v3 = s[3].acbd();
v0 = (v0 >> 28).ps32(v1 >> 28).pu16((v2 >> 28).ps32(v3 >> 28));
v4 = v0.extract<0>();
v5 = v0.extract<1>();
GSVector4i::storel(&d0[dstpitch * 0], v4);
GSVector4i::storel(&d0[dstpitch * 1], v5);
GSVector4i::storeh(&d0[dstpitch * 2], v4);
GSVector4i::storeh(&d0[dstpitch * 3], v5);
v0 = s[4].acbd();
v1 = s[5].acbd();
v2 = s[6].acbd();
v3 = s[7].acbd();
v0 = (v0 >> 28).ps32(v1 >> 28).pu16((v2 >> 28).ps32(v3 >> 28));
v4 = v0.extract<0>();
v5 = v0.extract<1>();
GSVector4i::storel(&d1[dstpitch * 0], v4);
GSVector4i::storel(&d1[dstpitch * 1], v5);
GSVector4i::storeh(&d1[dstpitch * 2], v4);
GSVector4i::storeh(&d1[dstpitch * 3], v5);
#else
const GSVector4i* s = (const GSVector4i*)src;
GSVector4i v0, v1, v2, v3;
for (int i = 0; i < 4; i++)
{
v0 = s[i * 4 + 0];
v1 = s[i * 4 + 1];
v2 = s[i * 4 + 2];
v3 = s[i * 4 + 3];
GSVector4i::sw64(v0, v1, v2, v3);
v0 = ((v0 >> 28).ps32(v1 >> 28)).pu16((v2 >> 28).ps32(v3 >> 28));
GSVector4i::storel(dst, v0);
dst += dstpitch;
GSVector4i::storeh(dst, v0);
dst += dstpitch;
}
#endif
ReadBlockHP<28, 0xffffffff>(src, dst, dstpitch);
}
template <bool AEM, class V>
@ -1232,77 +1100,74 @@ public:
}
}
__forceinline static void ExpandBlock8H_32(u32* RESTRICT src, u8* RESTRICT dst, int dstpitch, const u32* RESTRICT pal)
template <u32 shift, u8 mask>
__forceinline static void ExpandBlockH_32(u32* RESTRICT src, u8* RESTRICT dst, int dstpitch, const u32* RESTRICT pal)
{
for (int j = 0; j < 8; j++, dst += dstpitch)
{
const GSVector4i* s = (const GSVector4i*)src;
((GSVector4i*)dst)[0] = (s[j * 2 + 0] >> 24).gather32_32<>(pal);
((GSVector4i*)dst)[1] = (s[j * 2 + 1] >> 24).gather32_32<>(pal);
GSVector4i v0 = s[j * 2 + 0] >> shift;
GSVector4i v1 = s[j * 2 + 1] >> shift;
if (mask != 0xff)
{
v0 = v0 & mask;
v1 = v1 & mask;
}
((GSVector4i*)dst)[0] = v0.gather32_32<>(pal);
((GSVector4i*)dst)[1] = v1.gather32_32<>(pal);
}
}
template <u32 shift, u8 mask>
__forceinline static void ExpandBlockH_16(u32* RESTRICT src, u8* RESTRICT dst, int dstpitch, const u32* RESTRICT pal)
{
for (int j = 0; j < 8; j++, dst += dstpitch)
{
const GSVector4i* s = (const GSVector4i*)src;
GSVector4i v0 = s[j * 2 + 0] >> shift;
GSVector4i v1 = s[j * 2 + 1] >> shift;
if (mask != 0xff)
{
v0 = v0 & mask;
v1 = v1 & mask;
}
v0 = v0.gather32_32<>(pal);
v1 = v1.gather32_32<>(pal);
((GSVector4i*)dst)[0] = v0.pu32(v1);
}
}
__forceinline static void ExpandBlock8H_32(u32* RESTRICT src, u8* RESTRICT dst, int dstpitch, const u32* RESTRICT pal)
{
ExpandBlockH_32<24, 0xff>(src, dst, dstpitch, pal);
}
__forceinline static void ExpandBlock8H_16(u32* RESTRICT src, u8* RESTRICT dst, int dstpitch, const u32* RESTRICT pal)
{
for (int j = 0; j < 8; j++, dst += dstpitch)
{
const GSVector4i* s = (const GSVector4i*)src;
GSVector4i v0 = (s[j * 2 + 0] >> 24).gather32_32<>(pal);
GSVector4i v1 = (s[j * 2 + 1] >> 24).gather32_32<>(pal);
((GSVector4i*)dst)[0] = v0.pu32(v1);
}
ExpandBlockH_16<24, 0xff>(src, dst, dstpitch, pal);
}
__forceinline static void ExpandBlock4HL_32(u32* RESTRICT src, u8* RESTRICT dst, int dstpitch, const u32* RESTRICT pal)
{
for (int j = 0; j < 8; j++, dst += dstpitch)
{
const GSVector4i* s = (const GSVector4i*)src;
((GSVector4i*)dst)[0] = ((s[j * 2 + 0] >> 24) & 0xf).gather32_32<>(pal);
((GSVector4i*)dst)[1] = ((s[j * 2 + 1] >> 24) & 0xf).gather32_32<>(pal);
}
ExpandBlockH_32<24, 0x0f>(src, dst, dstpitch, pal);
}
__forceinline static void ExpandBlock4HL_16(u32* RESTRICT src, u8* RESTRICT dst, int dstpitch, const u32* RESTRICT pal)
{
for (int j = 0; j < 8; j++, dst += dstpitch)
{
const GSVector4i* s = (const GSVector4i*)src;
GSVector4i v0 = ((s[j * 2 + 0] >> 24) & 0xf).gather32_32<>(pal);
GSVector4i v1 = ((s[j * 2 + 1] >> 24) & 0xf).gather32_32<>(pal);
((GSVector4i*)dst)[0] = v0.pu32(v1);
}
ExpandBlockH_16<24, 0x0f>(src, dst, dstpitch, pal);
}
__forceinline static void ExpandBlock4HH_32(u32* RESTRICT src, u8* RESTRICT dst, int dstpitch, const u32* RESTRICT pal)
{
for (int j = 0; j < 8; j++, dst += dstpitch)
{
const GSVector4i* s = (const GSVector4i*)src;
((GSVector4i*)dst)[0] = (s[j * 2 + 0] >> 28).gather32_32<>(pal);
((GSVector4i*)dst)[1] = (s[j * 2 + 1] >> 28).gather32_32<>(pal);
}
ExpandBlockH_32<28, 0xff>(src, dst, dstpitch, pal);
}
__forceinline static void ExpandBlock4HH_16(u32* RESTRICT src, u8* RESTRICT dst, int dstpitch, const u32* RESTRICT pal)
{
for (int j = 0; j < 8; j++, dst += dstpitch)
{
const GSVector4i* s = (const GSVector4i*)src;
GSVector4i v0 = (s[j * 2 + 0] >> 28).gather32_32<>(pal);
GSVector4i v1 = (s[j * 2 + 1] >> 28).gather32_32<>(pal);
((GSVector4i*)dst)[0] = v0.pu32(v1);
}
ExpandBlockH_16<28, 0xff>(src, dst, dstpitch, pal);
}
__forceinline static void UnpackAndWriteBlock24(const u8* RESTRICT src, int srcpitch, u8* RESTRICT dst)
@ -1392,16 +1257,18 @@ public:
#endif
}
__forceinline static void UnpackAndWriteBlock8H(const u8* RESTRICT src, int srcpitch, u8* RESTRICT dst)
template <u32 mask>
__forceinline static void UnpackAndWriteBlockH(const u8* RESTRICT src, int srcpitch, u8* RESTRICT dst)
{
GSVector4i v4, v5, v6, v7;
#if _M_SSE >= 0x501
GSVector4i v4, v5, v6, v7;
GSVector8i v0, v1, v2, v3;
GSVector8i mask = GSVector8i::xff000000();
GSVector8i* d = reinterpret_cast<GSVector8i*>(dst);
for (int i = 0; i < 2; i++, src += srcpitch * 4)
for (int i = 0; i < 2; i++, src += srcpitch * 4, d += 4)
{
if (mask == 0xff000000)
{
v4 = GSVector4i::loadl(&src[srcpitch * 0]);
v5 = GSVector4i::loadl(&src[srcpitch * 1]);
@ -1410,45 +1277,114 @@ public:
v4 = v4.upl16(v5);
v5 = v6.upl16(v7);
}
else
{
v4 = GSVector4i::load(*(u32*)&src[srcpitch * 0]).upl32(GSVector4i::load(*(u32*)&src[srcpitch * 2]));
v5 = GSVector4i::load(*(u32*)&src[srcpitch * 1]).upl32(GSVector4i::load(*(u32*)&src[srcpitch * 3]));
v0 = GSVector8i::u8to32(v4) << 24;
v1 = GSVector8i::u8to32(v4.zwzw()) << 24;
v2 = GSVector8i::u8to32(v5) << 24;
v3 = GSVector8i::u8to32(v5.zwzw()) << 24;
if (mask == 0x0f000000)
{
v6 = v4.upl8(v4 >> 4);
v7 = v5.upl8(v5 >> 4);
}
else if (mask == 0xf0000000)
{
v6 = (v4 << 4).upl8(v4);
v7 = (v5 << 4).upl8(v5);
}
else
{
ASSERT(0);
}
((GSVector8i*)dst)[i * 4 + 0] = ((GSVector8i*)dst)[i * 4 + 0].blend(v0, mask);
((GSVector8i*)dst)[i * 4 + 1] = ((GSVector8i*)dst)[i * 4 + 1].blend(v1, mask);
((GSVector8i*)dst)[i * 4 + 2] = ((GSVector8i*)dst)[i * 4 + 2].blend(v2, mask);
((GSVector8i*)dst)[i * 4 + 3] = ((GSVector8i*)dst)[i * 4 + 3].blend(v3, mask);
v4 = v6.upl16(v7);
v5 = v6.uph16(v7);
}
GSVector8i v0 = GSVector8i::u8to32(v4) << 24;
GSVector8i v1 = GSVector8i::u8to32(v4.zwzw()) << 24;
GSVector8i v2 = GSVector8i::u8to32(v5) << 24;
GSVector8i v3 = GSVector8i::u8to32(v5.zwzw()) << 24;
d[0] = d[0].smartblend<mask>(v0);
d[1] = d[1].smartblend<mask>(v1);
d[2] = d[2].smartblend<mask>(v2);
d[3] = d[3].smartblend<mask>(v3);
}
#else
GSVector4i v0, v1, v2, v3, v4;
GSVector4i mask = GSVector4i::xff000000();
GSVector4i v0, v1, v2, v3;
GSVector4i mask0 = m_uw8hmask0;
GSVector4i mask1 = m_uw8hmask1;
GSVector4i mask2 = m_uw8hmask2;
GSVector4i mask3 = m_uw8hmask3;
for (int i = 0; i < 4; i++, src += srcpitch * 2)
GSVector4i* d = reinterpret_cast<GSVector4i*>(dst);
for (int i = 0; i < 2; i++, src += srcpitch * 4, d += 8)
{
v4 = GSVector4i::load(src, src + srcpitch);
if (mask == 0xff000000)
{
v4 = GSVector4i::load(src + srcpitch * 0, src + srcpitch * 1);
v5 = GSVector4i::load(src + srcpitch * 2, src + srcpitch * 3);
}
else
{
v6 = GSVector4i::load(*(u32*)&src[srcpitch * 0]);
v7 = GSVector4i::load(*(u32*)&src[srcpitch * 1]);
v4 = v6.upl32(v7);
v6 = GSVector4i::load(*(u32*)&src[srcpitch * 2]);
v7 = GSVector4i::load(*(u32*)&src[srcpitch * 3]);
v5 = v6.upl32(v7);
if (mask == 0x0f000000)
{
v4 = v4.upl8(v4 >> 4);
v5 = v5.upl8(v5 >> 4);
}
else if (mask == 0xf0000000)
{
v4 = (v4 << 4).upl8(v4);
v5 = (v5 << 4).upl8(v5);
}
else
{
ASSERT(0);
}
}
v0 = v4.shuffle8(mask0);
v1 = v4.shuffle8(mask1);
v2 = v4.shuffle8(mask2);
v3 = v4.shuffle8(mask3);
((GSVector4i*)dst)[i * 4 + 0] = ((GSVector4i*)dst)[i * 4 + 0].blend8(v0, mask);
((GSVector4i*)dst)[i * 4 + 1] = ((GSVector4i*)dst)[i * 4 + 1].blend8(v1, mask);
((GSVector4i*)dst)[i * 4 + 2] = ((GSVector4i*)dst)[i * 4 + 2].blend8(v2, mask);
((GSVector4i*)dst)[i * 4 + 3] = ((GSVector4i*)dst)[i * 4 + 3].blend8(v3, mask);
d[0] = d[0].smartblend<mask>(v0);
d[1] = d[1].smartblend<mask>(v1);
d[2] = d[2].smartblend<mask>(v2);
d[3] = d[3].smartblend<mask>(v3);
v0 = v5.shuffle8(mask0);
v1 = v5.shuffle8(mask1);
v2 = v5.shuffle8(mask2);
v3 = v5.shuffle8(mask3);
d[4] = d[4].smartblend<mask>(v0);
d[5] = d[5].smartblend<mask>(v1);
d[6] = d[6].smartblend<mask>(v2);
d[7] = d[7].smartblend<mask>(v3);
}
#endif
}
__forceinline static void UnpackAndWriteBlock8H(const u8* RESTRICT src, int srcpitch, u8* RESTRICT dst)
{
UnpackAndWriteBlockH<0xff000000>(src, srcpitch, dst);
}
__forceinline static void UnpackAndWriteBlock4HL(const u8* RESTRICT src, int srcpitch, u8* RESTRICT dst)
{
//printf("4HL\n");
@ -1461,144 +1397,12 @@ public:
s[i] = (columnTable32[j][i * 2] & 0x0f) | (columnTable32[j][i * 2 + 1] << 4);
}
#if _M_SSE >= 0x501
GSVector4i v4, v5, v6, v7;
GSVector8i v0, v1, v2, v3;
GSVector8i mask(0x0f000000);
for (int i = 0; i < 2; i++, src += srcpitch * 4)
{
v4 = GSVector4i::load(*(u32*)&src[srcpitch * 0]).upl32(GSVector4i::load(*(u32*)&src[srcpitch * 2]));
v5 = GSVector4i::load(*(u32*)&src[srcpitch * 1]).upl32(GSVector4i::load(*(u32*)&src[srcpitch * 3]));
v6 = v4.upl8(v4 >> 4);
v7 = v5.upl8(v5 >> 4);
v4 = v6.upl16(v7);
v5 = v6.uph16(v7);
v0 = GSVector8i::u8to32(v4) << 24;
v1 = GSVector8i::u8to32(v4.zwzw()) << 24;
v2 = GSVector8i::u8to32(v5) << 24;
v3 = GSVector8i::u8to32(v5.zwzw()) << 24;
((GSVector8i*)dst)[i * 4 + 0] = ((GSVector8i*)dst)[i * 4 + 0].blend(v0, mask);
((GSVector8i*)dst)[i * 4 + 1] = ((GSVector8i*)dst)[i * 4 + 1].blend(v1, mask);
((GSVector8i*)dst)[i * 4 + 2] = ((GSVector8i*)dst)[i * 4 + 2].blend(v2, mask);
((GSVector8i*)dst)[i * 4 + 3] = ((GSVector8i*)dst)[i * 4 + 3].blend(v3, mask);
}
#else
GSVector4i v0, v1, v2, v3, v4, v5;
GSVector4i mask = GSVector4i(0x0f000000);
GSVector4i mask0 = m_uw8hmask0;
GSVector4i mask1 = m_uw8hmask1;
GSVector4i mask2 = m_uw8hmask2;
GSVector4i mask3 = m_uw8hmask3;
for (int i = 0; i < 2; i++, src += srcpitch * 4)
{
v4 = GSVector4i::load(*(u32*)&src[srcpitch * 0]).upl32(GSVector4i::load(*(u32*)&src[srcpitch * 1]));
v5 = GSVector4i::load(*(u32*)&src[srcpitch * 2]).upl32(GSVector4i::load(*(u32*)&src[srcpitch * 3]));
v4 = v4.upl8(v4 >> 4);
v5 = v5.upl8(v5 >> 4);
v0 = v4.shuffle8(mask0);
v1 = v4.shuffle8(mask1);
v2 = v4.shuffle8(mask2);
v3 = v4.shuffle8(mask3);
((GSVector4i*)dst)[i * 8 + 0] = ((GSVector4i*)dst)[i * 8 + 0].blend(v0, mask);
((GSVector4i*)dst)[i * 8 + 1] = ((GSVector4i*)dst)[i * 8 + 1].blend(v1, mask);
((GSVector4i*)dst)[i * 8 + 2] = ((GSVector4i*)dst)[i * 8 + 2].blend(v2, mask);
((GSVector4i*)dst)[i * 8 + 3] = ((GSVector4i*)dst)[i * 8 + 3].blend(v3, mask);
v0 = v5.shuffle8(mask0);
v1 = v5.shuffle8(mask1);
v2 = v5.shuffle8(mask2);
v3 = v5.shuffle8(mask3);
((GSVector4i*)dst)[i * 8 + 4] = ((GSVector4i*)dst)[i * 8 + 4].blend(v0, mask);
((GSVector4i*)dst)[i * 8 + 5] = ((GSVector4i*)dst)[i * 8 + 5].blend(v1, mask);
((GSVector4i*)dst)[i * 8 + 6] = ((GSVector4i*)dst)[i * 8 + 6].blend(v2, mask);
((GSVector4i*)dst)[i * 8 + 7] = ((GSVector4i*)dst)[i * 8 + 7].blend(v3, mask);
}
#endif
UnpackAndWriteBlockH<0x0f000000>(src, srcpitch, dst);
}
__forceinline static void UnpackAndWriteBlock4HH(const u8* RESTRICT src, int srcpitch, u8* RESTRICT dst)
{
#if _M_SSE >= 0x501
GSVector4i v4, v5, v6, v7;
GSVector8i v0, v1, v2, v3;
GSVector8i mask = GSVector8i::xf0000000();
for (int i = 0; i < 2; i++, src += srcpitch * 4)
{
v4 = GSVector4i::load(*(u32*)&src[srcpitch * 0]).upl32(GSVector4i::load(*(u32*)&src[srcpitch * 2]));
v5 = GSVector4i::load(*(u32*)&src[srcpitch * 1]).upl32(GSVector4i::load(*(u32*)&src[srcpitch * 3]));
v6 = (v4 << 4).upl8(v4);
v7 = (v5 << 4).upl8(v5);
v4 = v6.upl16(v7);
v5 = v6.uph16(v7);
v0 = GSVector8i::u8to32(v4) << 24;
v1 = GSVector8i::u8to32(v4.zwzw()) << 24;
v2 = GSVector8i::u8to32(v5) << 24;
v3 = GSVector8i::u8to32(v5.zwzw()) << 24;
((GSVector8i*)dst)[i * 4 + 0] = ((GSVector8i*)dst)[i * 4 + 0].blend(v0, mask);
((GSVector8i*)dst)[i * 4 + 1] = ((GSVector8i*)dst)[i * 4 + 1].blend(v1, mask);
((GSVector8i*)dst)[i * 4 + 2] = ((GSVector8i*)dst)[i * 4 + 2].blend(v2, mask);
((GSVector8i*)dst)[i * 4 + 3] = ((GSVector8i*)dst)[i * 4 + 3].blend(v3, mask);
}
#else
GSVector4i v0, v1, v2, v3, v4, v5;
GSVector4i mask = GSVector4i::xf0000000();
GSVector4i mask0 = m_uw8hmask0;
GSVector4i mask1 = m_uw8hmask1;
GSVector4i mask2 = m_uw8hmask2;
GSVector4i mask3 = m_uw8hmask3;
for (int i = 0; i < 2; i++, src += srcpitch * 4)
{
v4 = GSVector4i::load(*(u32*)&src[srcpitch * 0]).upl32(GSVector4i::load(*(u32*)&src[srcpitch * 1]));
v5 = GSVector4i::load(*(u32*)&src[srcpitch * 2]).upl32(GSVector4i::load(*(u32*)&src[srcpitch * 3]));
v4 = (v4 << 4).upl8(v4);
v5 = (v5 << 4).upl8(v5);
v0 = v4.shuffle8(mask0);
v1 = v4.shuffle8(mask1);
v2 = v4.shuffle8(mask2);
v3 = v4.shuffle8(mask3);
((GSVector4i*)dst)[i * 8 + 0] = ((GSVector4i*)dst)[i * 8 + 0].blend(v0, mask);
((GSVector4i*)dst)[i * 8 + 1] = ((GSVector4i*)dst)[i * 8 + 1].blend(v1, mask);
((GSVector4i*)dst)[i * 8 + 2] = ((GSVector4i*)dst)[i * 8 + 2].blend(v2, mask);
((GSVector4i*)dst)[i * 8 + 3] = ((GSVector4i*)dst)[i * 8 + 3].blend(v3, mask);
v0 = v5.shuffle8(mask0);
v1 = v5.shuffle8(mask1);
v2 = v5.shuffle8(mask2);
v3 = v5.shuffle8(mask3);
((GSVector4i*)dst)[i * 8 + 4] = ((GSVector4i*)dst)[i * 8 + 4].blend(v0, mask);
((GSVector4i*)dst)[i * 8 + 5] = ((GSVector4i*)dst)[i * 8 + 5].blend(v1, mask);
((GSVector4i*)dst)[i * 8 + 6] = ((GSVector4i*)dst)[i * 8 + 6].blend(v2, mask);
((GSVector4i*)dst)[i * 8 + 7] = ((GSVector4i*)dst)[i * 8 + 7].blend(v3, mask);
}
#endif
UnpackAndWriteBlockH<0xf0000000>(src, srcpitch, dst);
}
template <bool AEM>
@ -2078,17 +1882,16 @@ public:
// TODO: ReadAndExpandBlock8H_16
__forceinline static void ReadAndExpandBlock4HL_32(const u8* RESTRICT src, u8* RESTRICT dst, int dstpitch, const u32* RESTRICT pal)
template <u32 shift, u32 mask>
__forceinline static void ReadAndExpandBlock4H_32(const u8* RESTRICT src, u8* RESTRICT dst, int dstpitch, const u32* RESTRICT pal)
{
//printf("ReadAndExpandBlock4HL_32\n");
#if _M_SSE >= 0x501
const GSVector8i* s = (const GSVector8i*)src;
GSVector8i p0, p1, p2, p3;
LoadPalVecs(pal, p0, p1, p2, p3);
GSVector8i mask(0x0f0f0f0f);
GSVector8i maskvec(mask);
GSVector8i v0, v1, v2, v3;
@ -2099,81 +1902,10 @@ public:
GSVector8i* d2 = reinterpret_cast<GSVector8i*>(dst + dstpitch * 2);
GSVector8i* d3 = reinterpret_cast<GSVector8i*>(dst + dstpitch * 3);
v0 = s[i * 4 + 0] >> 24;
v1 = s[i * 4 + 1] >> 24;
v2 = s[i * 4 + 2] >> 24;
v3 = s[i * 4 + 3] >> 24;
GSVector8i::sw128(v0, v1);
GSVector8i::sw64(v0, v1);
GSVector8i::sw128(v2, v3);
GSVector8i::sw64(v2, v3);
GSVector8i all = v0.ps32(v1).pu16(v2.ps32(v3)) & mask;
ReadClut4(p0, p1, p2, p3, all, *d0, *d1, *d2, *d3);
dst += dstpitch * 4;
}
#else
const GSVector4i* s = (const GSVector4i*)src;
GSVector4i p0, p1, p2, p3;
LoadPalVecs(pal, p0, p1, p2, p3);
GSVector4i mask(0x0f0f0f0f);
GSVector4i v0, v1, v2, v3;
for (int i = 0; i < 4; i++)
{
GSVector4i* d0 = reinterpret_cast<GSVector4i*>(dst);
GSVector4i* d1 = reinterpret_cast<GSVector4i*>(dst + dstpitch);
v0 = s[i * 4 + 0] >> 24;
v1 = s[i * 4 + 1] >> 24;
v2 = s[i * 4 + 2] >> 24;
v3 = s[i * 4 + 3] >> 24;
GSVector4i::sw64(v0, v1, v2, v3);
GSVector4i all = v0.ps32(v1).pu16(v2.ps32(v3)) & mask;
ReadClut4(p0, p1, p2, p3, all, d0[0], d0[1], d1[0], d1[1]);
dst += dstpitch * 2;
}
#endif
}
// TODO: ReadAndExpandBlock4HL_16
__forceinline static void ReadAndExpandBlock4HH_32(const u8* RESTRICT src, u8* RESTRICT dst, int dstpitch, const u32* RESTRICT pal)
{
//printf("ReadAndExpandBlock4HH_32\n");
#if _M_SSE >= 0x501
const GSVector8i* s = (const GSVector8i*)src;
GSVector8i p0, p1, p2, p3;
LoadPalVecs(pal, p0, p1, p2, p3);
GSVector8i v0, v1, v2, v3;
for (int i = 0; i < 2; i++)
{
GSVector8i* d0 = reinterpret_cast<GSVector8i*>(dst);
GSVector8i* d1 = reinterpret_cast<GSVector8i*>(dst + dstpitch);
GSVector8i* d2 = reinterpret_cast<GSVector8i*>(dst + dstpitch * 2);
GSVector8i* d3 = reinterpret_cast<GSVector8i*>(dst + dstpitch * 3);
v0 = s[i * 4 + 0] >> 28;
v1 = s[i * 4 + 1] >> 28;
v2 = s[i * 4 + 2] >> 28;
v3 = s[i * 4 + 3] >> 28;
v0 = s[i * 4 + 0] >> shift;
v1 = s[i * 4 + 1] >> shift;
v2 = s[i * 4 + 2] >> shift;
v3 = s[i * 4 + 3] >> shift;
GSVector8i::sw128(v0, v1);
GSVector8i::sw64(v0, v1);
@ -2181,6 +1913,8 @@ public:
GSVector8i::sw64(v2, v3);
GSVector8i all = v0.ps32(v1).pu16(v2.ps32(v3));
if (mask != 0xffffffff)
all = all & mask;
ReadClut4(p0, p1, p2, p3, all, *d0, *d1, *d2, *d3);
@ -2193,6 +1927,7 @@ public:
GSVector4i p0, p1, p2, p3;
LoadPalVecs(pal, p0, p1, p2, p3);
GSVector4i maskvec(mask);
GSVector4i v0, v1, v2, v3;
@ -2201,14 +1936,16 @@ public:
GSVector4i* d0 = reinterpret_cast<GSVector4i*>(dst);
GSVector4i* d1 = reinterpret_cast<GSVector4i*>(dst + dstpitch);
v0 = s[i * 4 + 0] >> 28;
v1 = s[i * 4 + 1] >> 28;
v2 = s[i * 4 + 2] >> 28;
v3 = s[i * 4 + 3] >> 28;
v0 = s[i * 4 + 0] >> shift;
v1 = s[i * 4 + 1] >> shift;
v2 = s[i * 4 + 2] >> shift;
v3 = s[i * 4 + 3] >> shift;
GSVector4i::sw64(v0, v1, v2, v3);
GSVector4i all = v0.ps32(v1).pu16(v2.ps32(v3));
if (mask != 0xffffffff)
all = all & mask;
ReadClut4(p0, p1, p2, p3, all, d0[0], d0[1], d1[0], d1[1]);
@ -2218,5 +1955,19 @@ public:
#endif
}
__forceinline static void ReadAndExpandBlock4HL_32(const u8* RESTRICT src, u8* RESTRICT dst, int dstpitch, const u32* RESTRICT pal)
{
//printf("ReadAndExpandBlock4HL_32\n");
ReadAndExpandBlock4H_32<24, 0x0f0f0f0f>(src, dst, dstpitch, pal);
}
// TODO: ReadAndExpandBlock4HL_16
__forceinline static void ReadAndExpandBlock4HH_32(const u8* RESTRICT src, u8* RESTRICT dst, int dstpitch, const u32* RESTRICT pal)
{
//printf("ReadAndExpandBlock4HH_32\n");
ReadAndExpandBlock4H_32<28, 0xffffffff>(src, dst, dstpitch, pal);
}
// TODO: ReadAndExpandBlock4HH_16
};

View File

@ -449,6 +449,31 @@ public:
#endif
}
/// Equivalent to blend with the given mask broadcasted across the vector
/// May be faster than blend in some cases
template <u32 mask>
__forceinline GSVector4i smartblend(const GSVector4i& a) const
{
if (mask == 0)
return *this;
if (mask == 0xffffffff)
return a;
if (mask == 0x0000ffff)
return blend16<0x55>(a);
if (mask == 0xffff0000)
return blend16<0xaa>(a);
for (int i = 0; i < 32; i += 8)
{
u8 byte = (mask >> i) & 0xff;
if (byte != 0xff && byte != 0)
return blend(a, GSVector4i(mask));
}
return blend8(a, GSVector4i(mask));
}
__forceinline GSVector4i blend(const GSVector4i& a, const GSVector4i& mask) const
{
return GSVector4i(_mm_or_si128(_mm_andnot_si128(mask, m), _mm_and_si128(mask, a)));

View File

@ -327,6 +327,31 @@ public:
return GSVector8i(_mm256_or_si256(_mm256_andnot_si256(mask, m), _mm256_and_si256(mask, a)));
}
/// Equivalent to blend with the given mask broadcasted across the vector
/// May be faster than blend in some cases
template <u32 mask>
__forceinline GSVector8i smartblend(const GSVector8i& a) const
{
if (mask == 0)
return *this;
if (mask == 0xffffffff)
return a;
if (mask == 0x0000ffff)
return blend16<0x55>(a);
if (mask == 0xffff0000)
return blend16<0xaa>(a);
for (int i = 0; i < 32; i += 8)
{
u8 byte = (mask >> i) & 0xff;
if (byte != 0xff && byte != 0)
return blend(a, GSVector8i(mask));
}
return blend8(a, GSVector8i(mask));
}
__forceinline GSVector8i mix16(const GSVector8i& a) const
{
return blend16<0xaa>(a);