mirror of https://github.com/PCSX2/pcsx2.git
GS:SW: Use unaligned loads to reduce constant size on AVX2
Allows more instructions to use 1-byte offsets
This commit is contained in:
parent
0d434d69be
commit
eb0b341e61
|
@ -207,10 +207,11 @@ void GSDrawScanline::CSetupPrim(const GSVertexSW* vertex, const u16* index, cons
|
||||||
constexpr int vlen = sizeof(VectorF) / sizeof(float);
|
constexpr int vlen = sizeof(VectorF) / sizeof(float);
|
||||||
|
|
||||||
#if _M_SSE >= 0x501
|
#if _M_SSE >= 0x501
|
||||||
const GSVector8* shift = (GSVector8*)g_const_256b.m_shift;
|
auto load_shift = [](int i) { return GSVector8::load<false>(&g_const_256b.m_shift[8 - i]); };
|
||||||
const GSVector4 step_shift = GSVector4::broadcast32(&shift[0]);
|
const GSVector4 step_shift = GSVector4::broadcast32(&g_const_256b.m_shift[0]);
|
||||||
#else
|
#else
|
||||||
const GSVector4* shift = (GSVector4*)g_const_128b.m_shift;
|
static const GSVector4* shift = reinterpret_cast<const GSVector4*>(g_const_128b.m_shift);
|
||||||
|
auto load_shift = [](int i) { return shift[1 + i]; };
|
||||||
const GSVector4 step_shift = shift[0];
|
const GSVector4 step_shift = shift[0];
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
@ -234,22 +235,23 @@ void GSDrawScanline::CSetupPrim(const GSVertexSW* vertex, const u16* index, cons
|
||||||
|
|
||||||
for (int i = 0; i < vlen; i++)
|
for (int i = 0; i < vlen; i++)
|
||||||
{
|
{
|
||||||
local.d[i].f = VectorI(df * shift[1 + i]).xxzzlh();
|
local.d[i].f = VectorI(df * load_shift(i)).xxzzlh();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (has_z && !sel.zequal)
|
if (has_z && !sel.zequal)
|
||||||
{
|
{
|
||||||
const GSVector4 dz = GSVector4::broadcast64(&dscan.p.z);
|
|
||||||
const VectorF dzf(static_cast<float>(dscan.p.F64[1]));
|
const VectorF dzf(static_cast<float>(dscan.p.F64[1]));
|
||||||
#if _M_SSE >= 0x501
|
#if _M_SSE >= 0x501
|
||||||
GSVector4::storel(&local.d8.p.z, dz.mul64(GSVector4::f32to64(shift)));
|
double dz = dscan.p.F64[1] * g_const_256b.m_shift[0];
|
||||||
|
memcpy(&local.d8.p.z, &dz, sizeof(dz));
|
||||||
#else
|
#else
|
||||||
|
const GSVector4 dz = GSVector4::broadcast64(&dscan.p.z);
|
||||||
local.d4.z = dz.mul64(GSVector4::f32to64(shift));
|
local.d4.z = dz.mul64(GSVector4::f32to64(shift));
|
||||||
#endif
|
#endif
|
||||||
for (int i = 0; i < vlen; i++)
|
for (int i = 0; i < vlen; i++)
|
||||||
{
|
{
|
||||||
local.d[i].z = dzf * shift[i + 1];
|
local.d[i].z = dzf * load_shift(i);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -297,7 +299,7 @@ void GSDrawScanline::CSetupPrim(const GSVertexSW* vertex, const u16* index, cons
|
||||||
|
|
||||||
for (int i = 0; i < vlen; i++)
|
for (int i = 0; i < vlen; i++)
|
||||||
{
|
{
|
||||||
VectorF v = dstq * shift[1 + i];
|
VectorF v = dstq * load_shift(i);
|
||||||
|
|
||||||
if (sel.fst)
|
if (sel.fst)
|
||||||
{
|
{
|
||||||
|
@ -336,8 +338,8 @@ void GSDrawScanline::CSetupPrim(const GSVertexSW* vertex, const u16* index, cons
|
||||||
|
|
||||||
for (int i = 0; i < vlen; i++)
|
for (int i = 0; i < vlen; i++)
|
||||||
{
|
{
|
||||||
VectorI r = VectorI(dr * shift[1 + i]).ps32();
|
VectorI r = VectorI(dr * load_shift(i)).ps32();
|
||||||
VectorI b = VectorI(db * shift[1 + i]).ps32();
|
VectorI b = VectorI(db * load_shift(i)).ps32();
|
||||||
|
|
||||||
local.d[i].rb = r.upl16(b);
|
local.d[i].rb = r.upl16(b);
|
||||||
}
|
}
|
||||||
|
@ -347,8 +349,8 @@ void GSDrawScanline::CSetupPrim(const GSVertexSW* vertex, const u16* index, cons
|
||||||
|
|
||||||
for (int i = 0; i < vlen; i++)
|
for (int i = 0; i < vlen; i++)
|
||||||
{
|
{
|
||||||
VectorI g = VectorI(dg * shift[1 + i]).ps32();
|
VectorI g = VectorI(dg * load_shift(i)).ps32();
|
||||||
VectorI a = VectorI(da * shift[1 + i]).ps32();
|
VectorI a = VectorI(da * load_shift(i)).ps32();
|
||||||
|
|
||||||
local.d[i].ga = g.upl16(a);
|
local.d[i].ga = g.upl16(a);
|
||||||
}
|
}
|
||||||
|
@ -515,7 +517,7 @@ __ri void GSDrawScanline::CDrawScanline(int pixels, int left, int top, const GSV
|
||||||
steps = pixels + skip - vlen;
|
steps = pixels + skip - vlen;
|
||||||
left -= skip;
|
left -= skip;
|
||||||
#if _M_SSE >= 0x501
|
#if _M_SSE >= 0x501
|
||||||
test = GSVector8i::i8to32(g_const_256b.m_test[skip]) | GSVector8i::i8to32(g_const_256b.m_test[15 + (steps & (steps >> 31))]);
|
test = GSVector8i::i8to32(&g_const_256b.m_test[16 - skip]) | GSVector8i::i8to32(&g_const_256b.m_test[0 - (steps & (steps >> 31))]);
|
||||||
#else
|
#else
|
||||||
test = const_test[skip] | const_test[7 + (steps & (steps >> 31))];
|
test = const_test[skip] | const_test[7 + (steps & (steps >> 31))];
|
||||||
#endif
|
#endif
|
||||||
|
@ -1756,7 +1758,7 @@ __ri void GSDrawScanline::CDrawScanline(int pixels, int left, int top, const GSV
|
||||||
if (!sel.notest)
|
if (!sel.notest)
|
||||||
{
|
{
|
||||||
#if _M_SSE >= 0x501
|
#if _M_SSE >= 0x501
|
||||||
test = GSVector8i::i8to32(g_const_256b.m_test[15 + (steps & (steps >> 31))]);
|
test = GSVector8i::i8to32(&g_const_256b.m_test[0 - (steps & (steps >> 31))]);
|
||||||
#else
|
#else
|
||||||
test = const_test[7 + (steps & (steps >> 31))];
|
test = const_test[7 + (steps & (steps >> 31))];
|
||||||
#endif
|
#endif
|
||||||
|
|
|
@ -661,25 +661,29 @@ void GSDrawScanlineCodeGenerator::Init()
|
||||||
|
|
||||||
lea(a0.cvt32(), ptr[a0 + a1 - vecints]);
|
lea(a0.cvt32(), ptr[a0 + a1 - vecints]);
|
||||||
|
|
||||||
// GSVector4i test = m_test[skip] | m_test[7 + (steps & (steps >> 31))];
|
|
||||||
|
|
||||||
mov(eax, a0.cvt32());
|
|
||||||
sar(eax, 31); // GH: 31 to extract the sign of the register
|
|
||||||
and_(eax, a0.cvt32());
|
|
||||||
if (isXmm)
|
|
||||||
shl(eax, 4); // * sizeof(m_test[0])
|
|
||||||
cdqe();
|
|
||||||
|
|
||||||
if (isXmm)
|
if (isXmm)
|
||||||
{
|
{
|
||||||
|
// GSVector4i test = m_test[skip] | m_test[7 + (steps & (steps >> 31))];
|
||||||
|
mov(eax, a0.cvt32());
|
||||||
|
sar(eax, 31); // GH: 31 to extract the sign of the register
|
||||||
|
and_(eax, a0.cvt32());
|
||||||
|
shl(eax, 4); // * sizeof(m_test[0])
|
||||||
|
cdqe();
|
||||||
shl(a1.cvt32(), 4); // * sizeof(m_test[0])
|
shl(a1.cvt32(), 4); // * sizeof(m_test[0])
|
||||||
movdqa(_test, ptr[a1 + _m_const + offsetof(GSScanlineConstantData128B, m_test[0])]);
|
movdqa(_test, ptr[a1 + _m_const + offsetof(GSScanlineConstantData128B, m_test[0])]);
|
||||||
por(_test, ptr[rax + _m_const + offsetof(GSScanlineConstantData128B, m_test[7])]);
|
por(_test, ptr[rax + _m_const + offsetof(GSScanlineConstantData128B, m_test[7])]);
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
pmovsxbd(_test, ptr[a1 * 8 + _m_const + offsetof(GSScanlineConstantData256B, m_test[0])]);
|
// GSVector8i test = loadu(&m_test[16 - skip]) | loadu(&m_test[steps >= 0 ? 0 : -steps]);
|
||||||
pmovsxbd(xym0, ptr[rax * 8 + _m_const + offsetof(GSScanlineConstantData256B, m_test[15])]);
|
mov(eax, a1.cvt32());
|
||||||
|
neg(rax); // rax = -skip
|
||||||
|
pmovsxbd(_test, ptr[rax + _m_const + offsetof(GSScanlineConstantData256B, m_test[16])]);
|
||||||
|
xor_(t0.cvt32(), t0.cvt32());
|
||||||
|
mov(eax, a0.cvt32());
|
||||||
|
neg(eax); // eax = -steps
|
||||||
|
cmovs(eax, t0.cvt32()); // if (eax < 0) eax = 0
|
||||||
|
pmovsxbd(xym0, ptr[rax + _m_const + offsetof(GSScanlineConstantData256B, m_test[0])]);
|
||||||
por(_test, xym0);
|
por(_test, xym0);
|
||||||
shl(a1.cvt32(), 5); // * sizeof(m_test[0])
|
shl(a1.cvt32(), 5); // * sizeof(m_test[0])
|
||||||
}
|
}
|
||||||
|
@ -922,7 +926,7 @@ void GSDrawScanlineCodeGenerator::Init()
|
||||||
/// Inputs: a0=steps, t0=fza_offset
|
/// Inputs: a0=steps, t0=fza_offset
|
||||||
/// Outputs[x86]: xym0=z xym2=s, xym3=t, xym4=q, xym5=rb, xym6=ga, xym7=test
|
/// Outputs[x86]: xym0=z xym2=s, xym3=t, xym4=q, xym5=rb, xym6=ga, xym7=test
|
||||||
/// Destroys[x86]: all
|
/// Destroys[x86]: all
|
||||||
/// Destroys[x64]: xym0, xym1, xym2, xym3
|
/// Destroys[x64]: xym0, xym1, xym2, xym3, t2
|
||||||
void GSDrawScanlineCodeGenerator::Step()
|
void GSDrawScanlineCodeGenerator::Step()
|
||||||
{
|
{
|
||||||
// steps -= 4;
|
// steps -= 4;
|
||||||
|
@ -1048,19 +1052,22 @@ void GSDrawScanlineCodeGenerator::Step()
|
||||||
|
|
||||||
if (!m_sel.notest)
|
if (!m_sel.notest)
|
||||||
{
|
{
|
||||||
|
#if USING_XMM
|
||||||
// test = m_test[7 + (steps & (steps >> 31))];
|
// test = m_test[7 + (steps & (steps >> 31))];
|
||||||
|
|
||||||
mov(eax, a0.cvt32());
|
mov(eax, a0.cvt32());
|
||||||
sar(eax, 31); // GH: 31 to extract the sign of the register
|
sar(eax, 31); // GH: 31 to extract the sign of the register
|
||||||
and_(eax, a0.cvt32());
|
and_(eax, a0.cvt32());
|
||||||
if (isXmm)
|
shl(eax, 4);
|
||||||
shl(eax, 4);
|
|
||||||
cdqe();
|
cdqe();
|
||||||
|
|
||||||
#if USING_XMM
|
|
||||||
movdqa(_test, ptr[rax + _m_const + offsetof(GSScanlineConstantData128B, m_test[7])]);
|
movdqa(_test, ptr[rax + _m_const + offsetof(GSScanlineConstantData128B, m_test[7])]);
|
||||||
#else
|
#else
|
||||||
pmovsxbd(_test, ptr[rax * 8 + _m_const + offsetof(GSScanlineConstantData256B, m_test[15])]);
|
// test = loadu(&m_test[steps >= 0 ? 0 : -steps]);
|
||||||
|
xor_(t2.cvt32(), t2.cvt32());
|
||||||
|
mov(eax, a0.cvt32());
|
||||||
|
neg(eax); // eax = -steps
|
||||||
|
cmovs(eax, t2.cvt32()); // if (eax < 0) eax = 0;
|
||||||
|
pmovsxbd(_test, ptr[rax + _m_const + offsetof(GSScanlineConstantData256B, m_test[0])]);
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -1655,29 +1662,54 @@ void GSDrawScanlineCodeGenerator::SampleTextureLOD()
|
||||||
pslld(xym4, 9);
|
pslld(xym4, 9);
|
||||||
psrld(xym4, 9);
|
psrld(xym4, 9);
|
||||||
|
|
||||||
auto log2_coeff = [this](int i) -> Address
|
#if USING_YMM
|
||||||
|
auto load_log2_coeff = [this](const XYm& reg, int i)
|
||||||
{
|
{
|
||||||
ptr[_m_const + log2_coeff_offset(i)];
|
vbroadcastss(reg, ptr[_m_const + log2_coeff_offset(i)]);
|
||||||
};
|
};
|
||||||
|
auto log2_coeff = [this, &load_log2_coeff](int i)
|
||||||
|
{
|
||||||
|
load_log2_coeff(xym6, i);
|
||||||
|
return xym6;
|
||||||
|
};
|
||||||
|
#else
|
||||||
|
auto log2_coeff = [this](int i) -> Operand
|
||||||
|
{
|
||||||
|
return ptr[_m_const + log2_coeff_offset(i)];
|
||||||
|
};
|
||||||
|
auto load_log2_coeff = [this, &log2_coeff](const XYm& reg, int i)
|
||||||
|
{
|
||||||
|
movaps(reg, log2_coeff(i));
|
||||||
|
};
|
||||||
|
#endif
|
||||||
|
|
||||||
orps(xym4, log2_coeff(3));
|
load_log2_coeff(xym1, 3);
|
||||||
|
orps(xym4, xym1);
|
||||||
|
|
||||||
// xym4 = mant(q) | 1.0f
|
// xym4 = mant(q) | 1.0f
|
||||||
|
|
||||||
if (hasFMA)
|
if (hasFMA)
|
||||||
{
|
{
|
||||||
movaps(xym5, log2_coeff(0)); // c0
|
load_log2_coeff(xym5, 0); // c0
|
||||||
vfmadd213ps(xym5, xym4, log2_coeff(1)); // c0 * xym4 + c1
|
vfmadd213ps(xym5, xym4, log2_coeff(1)); // c0 * xym4 + c1
|
||||||
vfmadd213ps(xym5, xym4, log2_coeff(2)); // (c0 * xym4 + c1) * xym4 + c2
|
vfmadd213ps(xym5, xym4, log2_coeff(2)); // (c0 * xym4 + c1) * xym4 + c2
|
||||||
subps(xym4, log2_coeff(3)); // xym4 - 1.0f
|
subps(xym4, xym1); // xym4 - 1.0f
|
||||||
vfmadd213ps(xym4, xym5, xym0); // ((c0 * xym4 + c1) * xym4 + c2) * (xym4 - 1.0f) + xym0
|
vfmadd213ps(xym4, xym5, xym0); // ((c0 * xym4 + c1) * xym4 + c2) * (xym4 - 1.0f) + xym0
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
THREEARG(mulps, xym5, xym4, log2_coeff(0));
|
if (hasAVX)
|
||||||
|
{
|
||||||
|
vmulps(xym5, xym4, log2_coeff(0));
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
load_log2_coeff(xym5, 0);
|
||||||
|
mulps(xym5, xym4);
|
||||||
|
}
|
||||||
addps(xym5, log2_coeff(1));
|
addps(xym5, log2_coeff(1));
|
||||||
mulps(xym5, xym4);
|
mulps(xym5, xym4);
|
||||||
subps(xym4, log2_coeff(3));
|
subps(xym4, xym1);
|
||||||
addps(xym5, log2_coeff(2));
|
addps(xym5, log2_coeff(2));
|
||||||
mulps(xym4, xym5);
|
mulps(xym4, xym5);
|
||||||
addps(xym4, xym0);
|
addps(xym4, xym0);
|
||||||
|
|
|
@ -204,10 +204,12 @@ public:
|
||||||
FORWARD_OO_OI(or_)
|
FORWARD_OO_OI(or_)
|
||||||
FORWARD_OO_OI(sub)
|
FORWARD_OO_OI(sub)
|
||||||
FORWARD_OO_OI(xor_)
|
FORWARD_OO_OI(xor_)
|
||||||
|
FORWARD(2, BASE, cmovs, const Reg&, const Operand&)
|
||||||
FORWARD(2, BASE, lea, const Reg&, const Address&)
|
FORWARD(2, BASE, lea, const Reg&, const Address&)
|
||||||
FORWARD(2, BASE, mov, const Operand&, size_t)
|
FORWARD(2, BASE, mov, const Operand&, size_t)
|
||||||
FORWARD(2, BASE, mov, ARGS_OO)
|
FORWARD(2, BASE, mov, ARGS_OO)
|
||||||
FORWARD(2, BASE, movzx, const Reg&, const Operand&)
|
FORWARD(2, BASE, movzx, const Reg&, const Operand&)
|
||||||
|
FORWARD(1, BASE, neg, const Operand&)
|
||||||
FORWARD(1, BASE, not_, const Operand&)
|
FORWARD(1, BASE, not_, const Operand&)
|
||||||
FORWARD(1, BASE, pop, const Operand&)
|
FORWARD(1, BASE, pop, const Operand&)
|
||||||
FORWARD(1, BASE, push, const Operand&)
|
FORWARD(1, BASE, push, const Operand&)
|
||||||
|
@ -243,6 +245,8 @@ public:
|
||||||
AFORWARD(2, minps, ARGS_XO)
|
AFORWARD(2, minps, ARGS_XO)
|
||||||
SFORWARD(2, movaps, ARGS_XO)
|
SFORWARD(2, movaps, ARGS_XO)
|
||||||
SFORWARD(2, movaps, const Address&, const Xmm&)
|
SFORWARD(2, movaps, const Address&, const Xmm&)
|
||||||
|
SFORWARD(2, movups, ARGS_XO)
|
||||||
|
SFORWARD(2, movups, const Address&, const Xmm&)
|
||||||
SFORWARD(2, movd, const Address&, const Xmm&)
|
SFORWARD(2, movd, const Address&, const Xmm&)
|
||||||
SFORWARD(2, movd, const Reg32&, const Xmm&)
|
SFORWARD(2, movd, const Reg32&, const Xmm&)
|
||||||
SFORWARD(2, movd, const Xmm&, const Address&)
|
SFORWARD(2, movd, const Xmm&, const Address&)
|
||||||
|
|
|
@ -256,46 +256,25 @@ namespace GSScanlineConstantData
|
||||||
// Constant shared by all threads (to reduce cache miss)
|
// Constant shared by all threads (to reduce cache miss)
|
||||||
struct alignas(64) GSScanlineConstantData256B
|
struct alignas(64) GSScanlineConstantData256B
|
||||||
{
|
{
|
||||||
alignas(32) u8 m_test[16][8] = {
|
// All AVX processors support unaligned access with little to no penalty as long as you don't cross a cache line.
|
||||||
{0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
|
// Take advantage of that to store single vectors that we index with single-element alignment
|
||||||
{0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
|
alignas(32) u8 m_test[24] = {
|
||||||
{0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
|
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
|
||||||
{0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00},
|
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
|
||||||
{0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00},
|
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
|
||||||
{0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00},
|
|
||||||
{0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00},
|
|
||||||
{0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00},
|
|
||||||
{0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
|
|
||||||
{0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
|
|
||||||
{0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff},
|
|
||||||
{0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff},
|
|
||||||
{0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff},
|
|
||||||
{0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff},
|
|
||||||
{0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff},
|
|
||||||
{0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
|
|
||||||
};
|
};
|
||||||
alignas(32) float m_shift[9][8] = {
|
float m_log2_coef[4] = {};
|
||||||
{ 8.0f , 8.0f , 8.0f , 8.0f , 8.0f , 8.0f , 8.0f , 8.0f},
|
alignas(64) float m_shift[16] = {
|
||||||
{ 0.0f , 1.0f , 2.0f , 3.0f , 4.0f , 5.0f , 6.0f , 7.0f},
|
8.0f, -7.0f, -6.0f, -5.0f, -4.0f, -3.0f, -2.0f, -1.0f,
|
||||||
{ -1.0f , 0.0f , 1.0f , 2.0f , 3.0f , 4.0f , 5.0f , 6.0f},
|
0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f,
|
||||||
{ -2.0f , -1.0f , 0.0f , 1.0f , 2.0f , 3.0f , 4.0f , 5.0f},
|
|
||||||
{ -3.0f , -2.0f , -1.0f , 0.0f , 1.0f , 2.0f , 3.0f , 4.0f},
|
|
||||||
{ -4.0f , -3.0f , -2.0f , -1.0f , 0.0f , 1.0f , 2.0f , 3.0f},
|
|
||||||
{ -5.0f , -4.0f , -3.0f , -2.0f , -1.0f , 0.0f , 1.0f , 2.0f},
|
|
||||||
{ -6.0f , -5.0f , -4.0f , -3.0f , -2.0f , -1.0f , 0.0f , 1.0f},
|
|
||||||
{ -7.0f , -6.0f , -5.0f , -4.0f , -3.0f , -2.0f , -1.0f , 0.0f},
|
|
||||||
};
|
};
|
||||||
alignas(32) float m_log2_coef[4][8] = {};
|
|
||||||
|
|
||||||
constexpr GSScanlineConstantData256B()
|
constexpr GSScanlineConstantData256B()
|
||||||
{
|
{
|
||||||
using namespace GSScanlineConstantData;
|
using namespace GSScanlineConstantData;
|
||||||
for (size_t n = 0; n < std::size(log2_coef); ++n)
|
for (size_t n = 0; n < std::size(log2_coef); ++n)
|
||||||
{
|
{
|
||||||
for (size_t i = 0; i < 8; ++i)
|
m_log2_coef[n] = log2_coef[n];
|
||||||
{
|
|
||||||
m_log2_coef[n][i] = log2_coef[n];
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
|
@ -110,7 +110,12 @@ void GSSetupPrimCodeGenerator::Generate()
|
||||||
|
|
||||||
for (int i = 0; i < (m_sel.notest ? 2 : many_regs ? 9 : 5); i++)
|
for (int i = 0; i < (m_sel.notest ? 2 : many_regs ? 9 : 5); i++)
|
||||||
{
|
{
|
||||||
movaps(XYm(3 + i), ptr[rax + i * vecsize]);
|
if (isXmm)
|
||||||
|
movaps(XYm(3 + i), ptr[rax + i * vecsize]);
|
||||||
|
else if (i == 0)
|
||||||
|
vbroadcastss(xym3, ptr[rax]);
|
||||||
|
else
|
||||||
|
movups(XYm(3 + i), ptr[rax + (9 - i) * sizeof(float)]);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -253,7 +258,7 @@ void GSSetupPrimCodeGenerator::Depth_YMM()
|
||||||
if (i < 4 || many_regs)
|
if (i < 4 || many_regs)
|
||||||
vmulps(ymm0, Ymm(4 + i), ymm1);
|
vmulps(ymm0, Ymm(4 + i), ymm1);
|
||||||
else
|
else
|
||||||
vmulps(ymm0, ymm1, ptr[g_const_256b.m_shift[i + 1]]);
|
vmulps(ymm0, ymm1, ptr[&g_const_256b.m_shift[8 - i]]);
|
||||||
cvttps2dq(ymm0, ymm0);
|
cvttps2dq(ymm0, ymm0);
|
||||||
pshuflw(ymm0, ymm0, _MM_SHUFFLE(2, 2, 0, 0));
|
pshuflw(ymm0, ymm0, _MM_SHUFFLE(2, 2, 0, 0));
|
||||||
pshufhw(ymm0, ymm0, _MM_SHUFFLE(2, 2, 0, 0));
|
pshufhw(ymm0, ymm0, _MM_SHUFFLE(2, 2, 0, 0));
|
||||||
|
@ -281,7 +286,7 @@ void GSSetupPrimCodeGenerator::Depth_YMM()
|
||||||
if (i < 4 || many_regs)
|
if (i < 4 || many_regs)
|
||||||
vmulps(ymm1, Ymm(4 + i), ymm0);
|
vmulps(ymm1, Ymm(4 + i), ymm0);
|
||||||
else
|
else
|
||||||
vmulps(ymm1, ymm0, ptr[g_const_256b.m_shift[i + 1]]);
|
vmulps(ymm1, ymm0, ptr[&g_const_256b.m_shift[8 - i]]);
|
||||||
movaps(_rip_local_di(i, z), ymm1);
|
movaps(_rip_local_di(i, z), ymm1);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -356,7 +361,7 @@ void GSSetupPrimCodeGenerator::Texture()
|
||||||
if (i < 4 || many_regs)
|
if (i < 4 || many_regs)
|
||||||
THREEARG(mulps, xym2, XYm(4 + i), xym1);
|
THREEARG(mulps, xym2, XYm(4 + i), xym1);
|
||||||
else
|
else
|
||||||
vmulps(ymm2, ymm1, ptr[g_const_256b.m_shift[i + 1]]);
|
vmulps(ymm2, ymm1, ptr[&g_const_256b.m_shift[8 - i]]);
|
||||||
|
|
||||||
if (m_sel.fst)
|
if (m_sel.fst)
|
||||||
{
|
{
|
||||||
|
@ -424,7 +429,7 @@ void GSSetupPrimCodeGenerator::Color()
|
||||||
if (i < 4 || many_regs)
|
if (i < 4 || many_regs)
|
||||||
THREEARG(mulps, xym0, XYm(4 + i), xym2);
|
THREEARG(mulps, xym0, XYm(4 + i), xym2);
|
||||||
else
|
else
|
||||||
vmulps(ymm0, ymm2, ptr[g_const_256b.m_shift[i + 1]]);
|
vmulps(ymm0, ymm2, ptr[&g_const_256b.m_shift[8 - i]]);
|
||||||
cvttps2dq(xym0, xym0);
|
cvttps2dq(xym0, xym0);
|
||||||
packssdw(xym0, xym0);
|
packssdw(xym0, xym0);
|
||||||
|
|
||||||
|
@ -433,7 +438,7 @@ void GSSetupPrimCodeGenerator::Color()
|
||||||
if (i < 4 || many_regs)
|
if (i < 4 || many_regs)
|
||||||
THREEARG(mulps, xym1, XYm(4 + i), xym3);
|
THREEARG(mulps, xym1, XYm(4 + i), xym3);
|
||||||
else
|
else
|
||||||
vmulps(ymm1, ymm3, ptr[g_const_256b.m_shift[i + 1]]);
|
vmulps(ymm1, ymm3, ptr[&g_const_256b.m_shift[8 - i]]);
|
||||||
cvttps2dq(xym1, xym1);
|
cvttps2dq(xym1, xym1);
|
||||||
packssdw(xym1, xym1);
|
packssdw(xym1, xym1);
|
||||||
|
|
||||||
|
@ -460,7 +465,7 @@ void GSSetupPrimCodeGenerator::Color()
|
||||||
if (i < 4 || many_regs)
|
if (i < 4 || many_regs)
|
||||||
THREEARG(mulps, xym0, XYm(4 + i), xym2);
|
THREEARG(mulps, xym0, XYm(4 + i), xym2);
|
||||||
else
|
else
|
||||||
vmulps(ymm0, ymm2, ptr[g_const_256b.m_shift[i + 1]]);
|
vmulps(ymm0, ymm2, ptr[&g_const_256b.m_shift[8 - i]]);
|
||||||
cvttps2dq(xym0, xym0);
|
cvttps2dq(xym0, xym0);
|
||||||
packssdw(xym0, xym0);
|
packssdw(xym0, xym0);
|
||||||
|
|
||||||
|
@ -469,7 +474,7 @@ void GSSetupPrimCodeGenerator::Color()
|
||||||
if (i < 4 || many_regs)
|
if (i < 4 || many_regs)
|
||||||
THREEARG(mulps, xym1, XYm(4 + i), xym3);
|
THREEARG(mulps, xym1, XYm(4 + i), xym3);
|
||||||
else
|
else
|
||||||
vmulps(ymm1, ymm3, ptr[g_const_256b.m_shift[i + 1]]);
|
vmulps(ymm1, ymm3, ptr[&g_const_256b.m_shift[8 - i]]);
|
||||||
cvttps2dq(xym1, xym1);
|
cvttps2dq(xym1, xym1);
|
||||||
packssdw(xym1, xym1);
|
packssdw(xym1, xym1);
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue