GS:SW: Use doubles for Z calculation

This commit is contained in:
TellowKrinkle 2022-05-11 13:31:30 -05:00 committed by tellowkrinkle
parent 9be7eb67d8
commit 4ddf897719
8 changed files with 206 additions and 134 deletions

View File

@ -131,9 +131,11 @@ void GSDrawScanline::CSetupPrim(const GSVertexSW* vertex, const u32* index, cons
#if _M_SSE >= 0x501
const GSVector8* shift = (GSVector8*)g_const->m_shift_256b;
const GSVector4* half_shift = reinterpret_cast<const GSVector4*>(shift);
const GSVector4 step_shift = GSVector4::broadcast32(&shift[0]);
#else
const GSVector4* shift = (GSVector4*)g_const->m_shift_128b;
const u64* half_shift = reinterpret_cast<const u64*>(shift);
const GSVector4 step_shift = shift[0];
#endif
@ -163,18 +165,16 @@ void GSDrawScanline::CSetupPrim(const GSVertexSW* vertex, const u32* index, cons
if (has_z)
{
const VectorF dz = VectorF::broadcast64(&dscan.p.z);
#if _M_SSE >= 0x501
const VectorF dz = VectorF::broadcast32(&dscan.p.z);
local.d8.p.z = (dz.extract<0>() * step_shift).extract32<0>();
GSVector4::storel(&local.d8.p.z, dz.extract<0>().mul64(GSVector4::f32to64(shift)));
#else
const GSVector4 dz = dscan.p.zzzz();
local.d4.z = dz * step_shift;
local.d4.z = dz.mul64(GSVector4::f32to64(shift));
#endif
for (int i = 0; i < vlen; i++)
{
local.d[i].z = dz * shift[1 + i];
local.d[i].z0 = dz.mul64(VectorF::f32to64(&half_shift[2 * i + 2]));
local.d[i].z1 = dz.mul64(VectorF::f32to64(&half_shift[2 * i + 3]));
}
}
}
@ -312,7 +312,7 @@ void GSDrawScanline::CDrawScanline(int pixels, int left, int top, const GSVertex
const GSVector4i* const_test = (GSVector4i*)g_const->m_test_128b;
#endif
VectorI test;
VectorF zo;
VectorF zo0, zo1;
VectorI f;
VectorF s, t, q;
VectorI uf, vf;
@ -358,7 +358,8 @@ void GSDrawScanline::CDrawScanline(int pixels, int left, int top, const GSVertex
if (sel.zb)
{
zo = local.d[skip].z;
zo0 = local.d[skip].z0;
zo1 = local.d[skip].z1;
}
}
@ -450,19 +451,32 @@ void GSDrawScanline::CDrawScanline(int pixels, int left, int top, const GSVertex
if (sel.prim != GS_SPRITE_CLASS)
{
// Need to handle when the float converts incorrectly
#if _M_SSE >= 0x501
GSVector8 z = GSVector8::broadcast32(&scan.p.z) + zo;
#else
GSVector4 z = scan.p.zzzz() + zo;
#endif
VectorF zbase = VectorF::broadcast64(&scan.p.z);
VectorF z0 = zbase.add64(zo0);
VectorF z1 = zbase.add64(zo1);
if (sel.zoverflow)
{
zs = (VectorI(z * 0.5f) << 1) | (VectorI(z) & VectorI::x00000001());
// SSE only has double to int32 conversion, no double to uint32
// Work around this by subtracting 0x80000000 before converting, then adding it back after
// Since we've subtracted 0x80000000, truncating now rounds up for numbers less than 0x80000000
// So approximate the truncation by subtracting an extra (0.5 - ulp) and rounding instead
GSVector4i zl = z0.add64(VectorF::m_xc1e00000000fffff).f64toi32(false);
GSVector4i zh = z1.add64(VectorF::m_xc1e00000000fffff).f64toi32(false);
#if _M_SSE >= 0x501
zs = GSVector8i(zl, zh);
#else
zs = zl.upl64(zh);
#endif
zs += VectorI::x80000000();
}
else
{
zs = VectorI(z);
#if _M_SSE >= 0x501
zs = GSVector8i(z0.f64toi32(), z1.f64toi32());
#else
zs = z0.f64toi32().upl64(z1.f64toi32());
#endif
}
if (sel.zclamp)
@ -1487,10 +1501,12 @@ void GSDrawScanline::CDrawScanline(int pixels, int left, int top, const GSVertex
if (sel.zb)
{
#if _M_SSE >= 0x501
zo += GSVector8::broadcast32(&local.d8.p.z);
GSVector8 add = GSVector8::broadcast64(&local.d8.p.z);
#else
zo += local.d4.z;
GSVector4 add = local.d4.z;
#endif
zo0 = zo0.add64(add);
zo1 = zo1.add64(add);
}
if (sel.fwrite && sel.fge)

View File

@ -30,7 +30,8 @@ using namespace Xbyak;
// If use_lod, m_local.gd->tex, else m_local.gd->tex[0]
#define _64_m_local__gd__tex r14
#define _rip_local(field) ((m_rip) ? ptr[rip + (char*)&m_local.field] : ptr[_m_local + OFFSETOF(GSScanlineLocalData, field)])
#define _rip_local_(ptrtype, field) ((m_rip) ? ptrtype[rip + (char*)&m_local.field] : ptrtype[_m_local + OFFSETOF(GSScanlineLocalData, field)])
#define _rip_local(field) _rip_local_(ptr, field)
#define _rip_global(field) ((m_rip) ? ptr[rip + (char*)&m_local.gd->field] : ptr[_m_local__gd + OFFSETOF(GSScanlineGlobalData, field)])
/// On AVX, does a v-prefixed separate destination operation
@ -99,7 +100,7 @@ GSDrawScanlineCodeGenerator2::GSDrawScanlineCodeGenerator2(Xbyak::CodeGenerator*
, _m_local__gd(chooseLocal(m_local.gd, _64_m_local__gd))
, _m_local__gd__vm(chooseLocal(m_local.gd->vm, _64_m_local__gd__vm))
, _rb(xym5), _ga(xym6), _fm(xym3), _zm(xym4), _fd(xym2), _test(xym15)
, _z(xym8), _f(xym9), _s(xym10), _t(xym11), _q(xym12), _f_rb(xym13), _f_ga(xym14)
, _f(xym9), _s(xym10), _t(xym11), _q(xym12), _f_rb(xym13), _f_ga(xym14)
{
m_sel.key = key;
use_lod = m_sel.mmin;
@ -169,6 +170,15 @@ void GSDrawScanlineCodeGenerator2::pbroadcastwLocal(const XYm& reg, const Addres
#endif
}
/// Broadcast a 64-bit value (one double) from `mem` into every 64-bit lane of `reg`
/// Emits vbroadcastsd when USING_YMM is set, and movddup (load 64 bits and duplicate them) otherwise
void GSDrawScanlineCodeGenerator2::broadcastsd(const XYm& reg, const Address& mem)
{
#if USING_YMM
vbroadcastsd(reg, mem);
#else
movddup(reg, mem);
#endif
}
void GSDrawScanlineCodeGenerator2::broadcastGPRToVec(const XYm& vec, const Xbyak::Reg32& gpr)
{
movd(Xmm(vec.getIdx()), gpr);
@ -691,7 +701,6 @@ void GSDrawScanlineCodeGenerator2::Init()
// Free: rax
const XYm& f = _f;
const XYm& z = _z;
if (m_sel.prim != GS_SPRITE_CLASS)
{
@ -714,26 +723,25 @@ void GSDrawScanlineCodeGenerator2::Init()
if (m_sel.zb)
{
// z = vp.zzzz() + m_local.d[skip].z;
broadcastsd(xym1, ptr[a3 + offsetof(GSVertexSW, p.z)]); // v.p.z
if (hasAVX)
{
vbroadcastss(z, ptr[a3 + offsetof(GSVertexSW, p.z)]);
vaddpd(xym0, xym1, ptr[a1 + offsetof(GSScanlineLocalData::skip, z0)]);
vaddpd(xym1, xym1, ptr[a1 + offsetof(GSScanlineLocalData::skip, z1)]);
}
else
{
movss(z, ptr[a3 + offsetof(GSVertexSW, p.z)]);
shufps(z, z, _MM_SHUFFLE(0, 0, 0, 0));
movaps(xym0, ptr[a1 + offsetof(GSScanlineLocalData::skip, z0)]);
addpd(xym0, xym1);
addpd(xym1, ptr[a1 + offsetof(GSScanlineLocalData::skip, z1)]);
}
addps(z, ptr[a1 + offsetof(GSScanlineLocalData::skip, z)]);
movaps(_rip_local(temp.z0), xym0);
movaps(_rip_local(temp.z1), xym1);
}
}
}
else
{
if (m_sel.ztest)
{
pbroadcastdLocal(z, _rip_local(p.z));
}
if (m_sel.fwrite && m_sel.fge)
pbroadcastwLocal(_f, _rip_local(p.f));
}
@ -902,8 +910,7 @@ void GSDrawScanlineCodeGenerator2::Step()
add(t0, vecsize / 2);
const XYm& z =_z;
const XYm& f =_f;
const XYm& f = _f;
if (m_sel.prim != GS_SPRITE_CLASS)
{
@ -911,7 +918,20 @@ void GSDrawScanlineCodeGenerator2::Step()
if (m_sel.zb)
{
BROADCAST_AND_OP(vbroadcastss, addps, z, xym0, _rip_local_d_p(z));
broadcastsd(xym1, _rip_local_d_p(z));
if (hasAVX)
{
vaddpd(xym0, xym1, _rip_local(temp.z0));
vaddpd(xym1, xym1, _rip_local(temp.z1));
}
else
{
movaps(xym0, _rip_local(temp.z0));
addpd(xym0, xym1);
addpd(xym1, _rip_local(temp.z1));
}
movaps(_rip_local(temp.z0), xym0);
movaps(_rip_local(temp.z1), xym1);
}
// f = f.add16(m_local.d4.f);
@ -1042,8 +1062,6 @@ void GSDrawScanlineCodeGenerator2::TestZ(const XYm& temp1, const XYm& temp2)
return;
}
const XYm& z = _z;
// int za = fza_base.y + fza_offset->y;
mov(t2.cvt32(), dword[t1 + 4]);
@ -1056,36 +1074,51 @@ void GSDrawScanlineCodeGenerator2::TestZ(const XYm& temp1, const XYm& temp2)
{
if (m_sel.zoverflow)
{
// zs = (GSVector4i(z * 0.5f) << 1) | (GSVector4i(z) & GSVector4i::x00000001());
/*GSVector8 z = GSVector8::broadcast32(&scan.p.z) + zo;
z /= 2;
zs = GSVector8i(z, true);
zs = zs.min_u32(GSVector8i::x7fffffff());
zs = zs.sll32(1) | 1;*/
// GSVector4i zl = z0.add64(VectorF::m_xc1e00000000fffff).f64toi32(false);
// GSVector4i zh = z1.add64(VectorF::m_xc1e00000000fffff).f64toi32(false);
// zs = GSVector8i(zl, zh);
// zs += VectorI::x80000000();
auto m_half = loadAddress(rax, &GSVector4::m_half);
auto m_imin = loadAddress(rax, &GSVector4::m_xc1e00000000fffff);
broadcastsd(temp1, ptr[m_imin]);
if (hasAVX)
vbroadcastss(temp1, ptr[m_half]);
{
vaddpd(xym0, temp1, _rip_local(temp.z0));
vaddpd(temp1, temp1, _rip_local(temp.z1));
}
else
movaps(temp1, ptr[m_half]);
{
movaps(xym0, _rip_local(temp.z0));
addpd(xym0, temp1);
addpd(temp1, _rip_local(temp.z1));
}
cvtpd2dq(xmm0, xym0);
cvtpd2dq(Xmm(temp1.getIdx()), temp1);
mulps(temp1, z);
cvttps2dq(temp1, temp1);
pslld(temp1, 1);
#if USING_YMM
vinserti128(xym0, xym0, Xmm(temp1.getIdx()), 1);
#else
punpcklqdq(xym0, temp1);
#endif
cvttps2dq(xym0, z);
pcmpeqd(temp2, temp2);
psrld(temp2, 31);
pand(xym0, temp2);
por(xym0, temp1);
pcmpeqd(temp1, temp1);
pslld(temp1, 31);
paddd(xym0, temp1);
}
else
{
// zs = GSVector4i(z);
// zs = GSVector8i(z0.f64toi32(), z1.f64toi32());
cvttps2dq(xym0, z);
#if USING_YMM
cvttpd2dq(xmm0, _rip_local_(yword, temp.z0));
cvttpd2dq(Xmm(temp1.getIdx()), _rip_local_(yword, temp.z1));
vinserti128(xym0, xym0, Xmm(temp1.getIdx()), 1);
#else
cvttpd2dq(xmm0, _rip_local_(xword, temp.z0));
cvttpd2dq(temp1, _rip_local_(xword, temp.z1));
punpcklqdq(xym0, temp1);
#endif
}
// Clamp Z to ZPSM_FMT_MAX
@ -1104,7 +1137,7 @@ void GSDrawScanlineCodeGenerator2::TestZ(const XYm& temp1, const XYm& temp2)
}
else
{
movdqa(xym0, _z);
pbroadcastdLocal(xym0, _rip_local(p.z));
}
if (m_sel.ztest)

View File

@ -82,7 +82,7 @@ class GSDrawScanlineCodeGenerator2 : public GSNewCodeGenerator
/// Available on both x86 and x64, not always valid
const XYm _rb, _ga, _fm, _zm, _fd, _test;
/// Always valid if needed, x64 only
const XYm _z, _f, _s, _t, _q, _f_rb, _f_ga;
const XYm _f, _s, _t, _q, _f_rb, _f_ga;
/// Returns the first arg on 32-bit, second on 64-bit
static LocalAddr chooseLocal(const void* addr32, AddressReg reg64)
@ -117,6 +117,7 @@ private:
/// On YMM registers this will be a broadcast from a 16-bit value
/// On XMM registers this will be a load of a full 128-bit value, with the broadcast happening before storing to the local data
void pbroadcastwLocal(const XYm& reg, const Xbyak::Address& mem);
void broadcastsd(const XYm& reg, const Xbyak::Address& mem);
/// Broadcast a 32-bit GPR to a vector register
void broadcastGPRToVec(const XYm& vec, const Xbyak::Reg32& gpr);
void modulate16(const XYm& a, const Xbyak::Operand& f, u8 shift);

View File

@ -463,7 +463,7 @@ void GSRasterizer::DrawTriangle(const GSVertexSW* vertex, const u32* index)
GSVertexSW2 dv1 = v2 - v0;
GSVertexSW2 dv2 = v2 - v1;
GSVector4 cross = dv0.p * dv1.p.yxwz();
GSVector4 cross = GSVector4::loadl(&dv0.p) * GSVector4::loadl(&dv1.p).yxwz();
cross = (cross - cross.yxwz()).yyyy(); // select the second component, the negated cross product
// the longest horizontal span would be cross.x / dv1.p.y, but we don't need its actual value
@ -487,18 +487,10 @@ void GSRasterizer::DrawTriangle(const GSVertexSW* vertex, const u32* index)
ddx[2] = ddx[0].xzyw();
// Precision is important here. Don't use reciprocal, it will break Jak3/Xenosaga1
GSVector8 _dxy01c(dxy01 / cross);
GSVector8 dxy01c(dxy01 / cross);
/*
dscan = dv1 * dxy01c.yyyy() - dv0 * dxy01c.wwww();
dedge = dv0 * dxy01c.zzzz() - dv1 * dxy01c.xxxx();
*/
dscan.p = dv1.p * _dxy01c.yyyy().extract<0>() - dv0.p * _dxy01c.wwww().extract<0>();
dscan.tc = dv1.tc * _dxy01c.yyyy() - dv0.tc * _dxy01c.wwww();
dedge.p = dv0.p * _dxy01c.zzzz().extract<0>() - dv1.p * _dxy01c.xxxx().extract<0>();
dedge.tc = dv0.tc * _dxy01c.zzzz() - dv1.tc * _dxy01c.xxxx();
if (m1 & 1)
{
@ -567,13 +559,12 @@ void GSRasterizer::DrawTriangleSection(int top, int bottom, GSVertexSW2& edge, c
while (top < bottom)
{
GSVector8 dy(GSVector4(top) - p0.yyyy());
const float dy = static_cast<float>(top) - p0.y;
GSVector8 dyv(dy);
GSVertexSW2 scan;
GSVector4 xy = GSVector4::loadl(&edge.p) + GSVector4::loadl(&dedge.p) * dyv.extract<0>();
scan.p = edge.p + dedge.p * dy.extract<0>();
GSVector4 lrf = scan.p.ceil();
GSVector4 lrf = xy.ceil();
GSVector4 l = lrf.max(scissor);
GSVector4 r = lrf.min(scissor);
GSVector4i lr = GSVector4i(l.xxyy(r));
@ -585,12 +576,13 @@ void GSRasterizer::DrawTriangleSection(int top, int bottom, GSVertexSW2& edge, c
if (pixels > 0)
{
scan.tc = edge.tc + dedge.tc * dy;
float prestep = l.x - p0.x;
GSVector8 prestepv(prestep);
GSVector8 prestep((l - p0).xxxx());
scan.p = scan.p + dscan.p * prestep.extract<0>();
scan.tc = scan.tc + dscan.tc * prestep;
GSVertexSW2 scan;
GSVector4::storel(&scan.p, xy + GSVector4::loadl(&dscan.p) * prestepv.extract<0>());
scan.p.F64[1] = edge.p.F64[1] + dedge.p.F64[1] * dy + dscan.p.F64[1] * prestep;
scan.tc = edge.tc + dedge.tc * dyv + dscan.tc * prestepv;
AddScanline(e++, pixels, left, top, (GSVertexSW&)scan);
}
@ -652,7 +644,7 @@ void GSRasterizer::DrawTriangle(const GSVertexSW* vertex, const u32* index)
GSVertexSW dv1 = v2 - v0;
GSVertexSW dv2 = v2 - v1;
GSVector4 cross = dv0.p * dv1.p.yxwz();
GSVector4 cross = GSVector4::loadl(&dv0.p) * GSVector4::loadl(&dv1.p).yxwz();
cross = (cross - cross.yxwz()).yyyy(); // select the second component, the negated cross product
// the longest horizontal span would be cross.x / dv1.p.y, but we don't need its actual value
@ -678,18 +670,8 @@ void GSRasterizer::DrawTriangle(const GSVertexSW* vertex, const u32* index)
// Precision is important here. Don't use reciprocal, it will break Jak3/Xenosaga1
GSVector4 dxy01c = dxy01 / cross;
/*
dscan = dv1 * dxy01c.yyyy() - dv0 * dxy01c.wwww();
dedge = dv0 * dxy01c.zzzz() - dv1 * dxy01c.xxxx();
*/
dscan.p = dv1.p * dxy01c.yyyy() - dv0.p * dxy01c.wwww();
dscan.t = dv1.t * dxy01c.yyyy() - dv0.t * dxy01c.wwww();
dscan.c = dv1.c * dxy01c.yyyy() - dv0.c * dxy01c.wwww();
dedge.p = dv0.p * dxy01c.zzzz() - dv1.p * dxy01c.xxxx();
dedge.t = dv0.t * dxy01c.zzzz() - dv1.t * dxy01c.xxxx();
dedge.c = dv0.c * dxy01c.zzzz() - dv1.c * dxy01c.xxxx();
if (m1 & 1)
{
@ -758,13 +740,11 @@ void GSRasterizer::DrawTriangleSection(int top, int bottom, GSVertexSW& edge, co
while (top < bottom)
{
GSVector4 dy = GSVector4(top) - p0.yyyy();
const float dy = static_cast<float>(top) - p0.y;
GSVertexSW scan;
GSVector4 xy = GSVector4::loadl(&edge.p) + GSVector4::loadl(&dedge.p) * dy;
scan.p = edge.p + dedge.p * dy;
GSVector4 lrf = scan.p.ceil();
GSVector4 lrf = xy.ceil();
GSVector4 l = lrf.max(scissor);
GSVector4 r = lrf.min(scissor);
GSVector4i lr = GSVector4i(l.xxyy(r));
@ -776,14 +756,13 @@ void GSRasterizer::DrawTriangleSection(int top, int bottom, GSVertexSW& edge, co
if (pixels > 0)
{
scan.t = edge.t + dedge.t * dy;
scan.c = edge.c + dedge.c * dy;
const float prestep = l.x - p0.x;
GSVector4 prestep = (l - p0).xxxx();
scan.p = scan.p + dscan.p * prestep;
scan.t = scan.t + dscan.t * prestep;
scan.c = scan.c + dscan.c * prestep;
GSVertexSW scan;
GSVector4::storel(&scan.p, xy + GSVector4::loadl(&dscan.p) * prestep);
scan.p.F64[1] = edge.p.F64[1] + dedge.p.F64[1] * dy + dscan.p.F64[1] * prestep;
scan.t = edge.t + dedge.t * dy + dscan.t * prestep;
scan.c = edge.c + dedge.c * dy + dscan.c * prestep;
AddScanline(e++, pixels, left, top, scan);
}

View File

@ -250,8 +250,8 @@ void GSRendererSW::ConvertVertexBuffer(GSVertexSW* RESTRICT dst, const GSVertex*
}
else
{
float z = static_cast<float>(static_cast<u32>(xyzuvf.extract32<1>()));
dst->p = (GSVector4(xy) * m_pos_scale).upld(GSVector4(z, 0.0, 0.0, 0.0));
double z = static_cast<double>(static_cast<u32>(xyzuvf.extract32<1>()));
dst->p = (GSVector4(xy) * m_pos_scale).upld(GSVector4::f64(z, 0.0));
t = t.blend32<8>(GSVector4(xyzuvf << 7));
}

View File

@ -170,16 +170,16 @@ struct alignas(32) GSScanlineLocalData // per prim variables, each thread has it
{
#if _M_SSE >= 0x501
struct skip { GSVector8 z, s, t, q; GSVector8i rb, ga, f, _pad; } d[8];
struct step { GSVector4 stq; struct { u32 rb, ga; } c; struct { u32 z, f; } p; } d8;
struct { GSVector8i rb, ga; } c;
struct skip { GSVector8 z0, z1, s, t, q; GSVector8i rb, ga, f; } d[8];
struct step { GSVector4 stq; struct { u32 rb, ga; } c; struct { u64 z; u32 f; } p; } d8;
struct { u32 z, f; } p;
struct { GSVector8i rb, ga; } c;
// these should be stored on stack as normal local variables (no free regs to use, esp cannot be saved to anywhere, and we need an aligned stack)
struct
{
GSVector8 z, zo;
GSVector8 z0, z1;
GSVector8i f;
GSVector8 s, t, q;
GSVector8i rb, ga;
@ -198,7 +198,7 @@ struct alignas(32) GSScanlineLocalData // per prim variables, each thread has it
#else
struct skip { GSVector4 z, s, t, q; GSVector4i rb, ga, f, _pad; } d[4];
struct skip { GSVector4 z0, z1, s, t, q; GSVector4i rb, ga, f; } d[4];
struct step { GSVector4 z, stq; GSVector4i c, f; } d4;
struct { GSVector4i rb, ga; } c;
struct { GSVector4i z, f; } p;
@ -207,7 +207,7 @@ struct alignas(32) GSScanlineLocalData // per prim variables, each thread has it
struct
{
GSVector4 z, zo;
GSVector4 z0, z1;
GSVector4i f;
GSVector4 s, t, q;
GSVector4i rb, ga;

View File

@ -192,21 +192,26 @@ void GSSetupPrimCodeGenerator2::Depth_XMM()
if (m_en.z)
{
// GSVector4 dz = p.zzzz();
// VectorF dz = VectorF::broadcast64(&dscan.p.z)
movddup(xmm0, ptr[_dscan + offsetof(GSVertexSW, p.z)]);
broadcastss(xym0, ptr[_dscan + offsetof(GSVertexSW, p.z)]);
// m_local.d4.z = dz * 4.0f;
THREEARG(mulps, xmm1, xmm0, xmm3);
movdqa(_rip_local_d_p(z), xmm1);
// m_local.d4.z = dz.mul64(GSVector4::f32to64(shift));
cvtps2pd(xmm1, xmm3);
mulpd(xmm1, xmm0);
movaps(_rip_local_d_p(z), xmm1);
for (int i = 0; i < (m_sel.notest ? 1 : 4); i++)
{
// m_local.d[i].z = dz * m_shift[i];
// m_local.d[i].z0 = dz.mul64(VectorF::f32to64(half_shift[2 * i + 2]));
// m_local.d[i].z1 = dz.mul64(VectorF::f32to64(half_shift[2 * i + 3]));
THREEARG(mulps, xmm1, xmm0, XYm(4 + i));
movdqa(_rip_local(d[i].z), xmm1);
cvtps2pd(xmm1, XYm(4 + i));
pshufd(xmm2, XYm(4 + i), _MM_SHUFFLE(1, 0, 3, 2));
cvtps2pd(xmm2, xmm2);
mulpd(xmm1, xmm0);
mulpd(xmm2, xmm0);
movaps(_rip_local(d[i].z0), xmm1);
movaps(_rip_local(d[i].z1), xmm2);
}
}
}
@ -276,23 +281,34 @@ void GSSetupPrimCodeGenerator2::Depth_YMM()
if (m_en.z)
{
// const VectorF dz = VectorF::broadcast32(&dscan.p.z);
vbroadcastss(ymm0, ptr[_dscan + offsetof(GSVertexSW, p.z)]);
// const VectorF dz = VectorF::broadcast64(&dscan.p.z);
vbroadcastsd(ymm0, ptr[_dscan + offsetof(GSVertexSW, p.z)]);
// local.d8.p.z = (dz.extract<0>() * step_shift).extract32<0>();
vmulss(xmm1, xmm0, xmm3);
movss(_rip_local_d_p(z), xmm1);
// GSVector4::storel(&local.d8.p.z, dz.extract<0>().mul64(GSVector4::f32to64(shift)));
cvtss2sd(xmm1, xmm3);
vmulsd(xmm1, xmm0, xmm1);
movsd(_rip_local_d_p(z), xmm1);
for (int i = 0; i < (m_sel.notest ? 1 : dsize); i++)
{
// m_local.d[i].z = dz * shift[1 + i];
// m_local.d[i].z0 = dz.mul64(VectorF::f32to64(half_shift[2 * i + 2]));
// m_local.d[i].z1 = dz.mul64(VectorF::f32to64(half_shift[2 * i + 3]));
// Save a byte in the encoding for ymm8-11 by swapping with ymm0 (multiplication is commutative)
if (i < 4 || many_regs)
vmulps(ymm1, Ymm(4 + i), ymm0);
{
cvtps2pd(ymm1, Xmm(4 + i));
vextracti128(xmm2, Ymm(4 + i), 1);
cvtps2pd(ymm2, xmm2);
}
else
vmulps(ymm1, ymm0, ptr[g_const->m_shift_256b[i + 1]]);
movaps(_rip_local(d[i].z), ymm1);
{
cvtps2pd(ymm1, ptr[&g_const->m_shift_256b[i + 1][0]]);
cvtps2pd(ymm2, ptr[&g_const->m_shift_256b[i + 1][4]]);
}
mulpd(ymm1, ymm0);
mulpd(ymm2, ymm0);
movaps(_rip_local(d[i].z0), ymm1);
movaps(_rip_local(d[i].z1), ymm2);
}
}
}

View File

@ -19,6 +19,16 @@
struct alignas(32) GSVertexSW
{
// When drawing sprites:
// p: x y _ f
// t: s t q z
// c: r g b a
// Otherwise:
// p: x y zl zh
// t: s t q f
// c: r g b a
// cov is placed in x since by the time it's known, xy are no longer needed
GSVector4 p, _pad, t, c;
__forceinline GSVertexSW() {}
@ -43,7 +53,8 @@ struct alignas(32) GSVertexSW
__forceinline void operator+=(const GSVertexSW& v)
{
p += v.p;
GSVector4::storel(&p, GSVector4::loadl(&p) + GSVector4::loadl(&v.p));
p.F64[1] += v.p.F64[1];
t += v.t;
c += v.c;
}
@ -52,7 +63,8 @@ struct alignas(32) GSVertexSW
{
GSVertexSW v;
v.p = a.p + b.p;
GSVector4::storel(&v.p, GSVector4::loadl(&a.p) + GSVector4::loadl(&b.p));
v.p.F64[1] = a.p.F64[1] + b.p.F64[1];
v.t = a.t + b.t;
v.c = a.c + b.c;
@ -63,7 +75,8 @@ struct alignas(32) GSVertexSW
{
GSVertexSW v;
v.p = a.p - b.p;
GSVector4::storel(&v.p, GSVector4::loadl(&a.p) - GSVector4::loadl(&b.p));
v.p.F64[1] = a.p.F64[1] - b.p.F64[1];
v.t = a.t - b.t;
v.c = a.c - b.c;
@ -74,7 +87,8 @@ struct alignas(32) GSVertexSW
{
GSVertexSW v;
v.p = a.p * b;
GSVector4::storel(&v.p, GSVector4::loadl(&a.p) * b);
v.p.F64[1] = a.p.F64[1] * b.F32[0];
v.t = a.t * b;
v.c = a.c * b;
@ -85,7 +99,8 @@ struct alignas(32) GSVertexSW
{
GSVertexSW v;
v.p = a.p / b;
GSVector4::storel(&v.p, GSVector4::loadl(&a.p) / b);
v.p.F64[1] = a.p.F64[1] / b.F32[0];
v.t = a.t / b;
v.c = a.c / b;
@ -249,11 +264,23 @@ struct alignas(32) GSVertexSW2
{
GSVertexSW2 v;
v.p = a.p - b.p;
GSVector4::storel(&v.p, GSVector4::loadl(&a.p) - GSVector4::loadl(&b.p));
v.p.F64[1] = a.p.F64[1] - b.p.F64[1];
v.tc = a.tc - b.tc;
return v;
}
/// Scales every component of vertex `a` by `b` and returns the result.
/// `p` is handled in two pieces: one piece goes through loadl/storel as packed floats,
/// and the other piece is read and written as a double (F64[1]) scaled by b.F32[0] only.
/// NOTE(review): presumably `b` holds the same scalar in every lane and p.F64[1] is the double-precision Z — confirm against callers
__forceinline friend GSVertexSW2 operator*(const GSVertexSW2& a, const GSVector8& b)
{
GSVertexSW2 v;
// Packed-float part of p (presumably the low 64 bits, i.e. x/y — confirm), multiplied by b.extract<0>()
GSVector4::storel(&v.p, GSVector4::loadl(&a.p) * b.extract<0>());
// This element is a double, so it is scaled separately as a scalar instead of via the packed-float multiply
v.p.F64[1] = a.p.F64[1] * b.F32[0];
v.tc = a.tc * b;
return v;
}
};
#endif