mirror of https://github.com/PCSX2/pcsx2.git
GS:SW: Use doubles for Z calculation
This commit is contained in:
parent
9be7eb67d8
commit
4ddf897719
|
@ -131,9 +131,11 @@ void GSDrawScanline::CSetupPrim(const GSVertexSW* vertex, const u32* index, cons
|
|||
|
||||
#if _M_SSE >= 0x501
|
||||
const GSVector8* shift = (GSVector8*)g_const->m_shift_256b;
|
||||
const GSVector4* half_shift = reinterpret_cast<const GSVector4*>(shift);
|
||||
const GSVector4 step_shift = GSVector4::broadcast32(&shift[0]);
|
||||
#else
|
||||
const GSVector4* shift = (GSVector4*)g_const->m_shift_128b;
|
||||
const u64* half_shift = reinterpret_cast<const u64*>(shift);
|
||||
const GSVector4 step_shift = shift[0];
|
||||
#endif
|
||||
|
||||
|
@ -163,18 +165,16 @@ void GSDrawScanline::CSetupPrim(const GSVertexSW* vertex, const u32* index, cons
|
|||
|
||||
if (has_z)
|
||||
{
|
||||
const VectorF dz = VectorF::broadcast64(&dscan.p.z);
|
||||
#if _M_SSE >= 0x501
|
||||
const VectorF dz = VectorF::broadcast32(&dscan.p.z);
|
||||
|
||||
local.d8.p.z = (dz.extract<0>() * step_shift).extract32<0>();
|
||||
GSVector4::storel(&local.d8.p.z, dz.extract<0>().mul64(GSVector4::f32to64(shift)));
|
||||
#else
|
||||
const GSVector4 dz = dscan.p.zzzz();
|
||||
|
||||
local.d4.z = dz * step_shift;
|
||||
local.d4.z = dz.mul64(GSVector4::f32to64(shift));
|
||||
#endif
|
||||
for (int i = 0; i < vlen; i++)
|
||||
{
|
||||
local.d[i].z = dz * shift[1 + i];
|
||||
local.d[i].z0 = dz.mul64(VectorF::f32to64(&half_shift[2 * i + 2]));
|
||||
local.d[i].z1 = dz.mul64(VectorF::f32to64(&half_shift[2 * i + 3]));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -312,7 +312,7 @@ void GSDrawScanline::CDrawScanline(int pixels, int left, int top, const GSVertex
|
|||
const GSVector4i* const_test = (GSVector4i*)g_const->m_test_128b;
|
||||
#endif
|
||||
VectorI test;
|
||||
VectorF zo;
|
||||
VectorF zo0, zo1;
|
||||
VectorI f;
|
||||
VectorF s, t, q;
|
||||
VectorI uf, vf;
|
||||
|
@ -358,7 +358,8 @@ void GSDrawScanline::CDrawScanline(int pixels, int left, int top, const GSVertex
|
|||
|
||||
if (sel.zb)
|
||||
{
|
||||
zo = local.d[skip].z;
|
||||
zo0 = local.d[skip].z0;
|
||||
zo1 = local.d[skip].z1;
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -450,19 +451,32 @@ void GSDrawScanline::CDrawScanline(int pixels, int left, int top, const GSVertex
|
|||
if (sel.prim != GS_SPRITE_CLASS)
|
||||
{
|
||||
// Need to handle when the float converts incorrectly
|
||||
#if _M_SSE >= 0x501
|
||||
GSVector8 z = GSVector8::broadcast32(&scan.p.z) + zo;
|
||||
#else
|
||||
GSVector4 z = scan.p.zzzz() + zo;
|
||||
#endif
|
||||
VectorF zbase = VectorF::broadcast64(&scan.p.z);
|
||||
VectorF z0 = zbase.add64(zo0);
|
||||
VectorF z1 = zbase.add64(zo1);
|
||||
|
||||
if (sel.zoverflow)
|
||||
{
|
||||
zs = (VectorI(z * 0.5f) << 1) | (VectorI(z) & VectorI::x00000001());
|
||||
// SSE only has double to int32 conversion, no double to uint32
|
||||
// Work around this by subtracting 0x80000000 before converting, then adding it back after
|
||||
// Since we've subtracted 0x80000000, truncating now rounds up for numbers less than 0x80000000
|
||||
// So approximate the truncation by subtracting an extra (0.5 - ulp) and rounding instead
|
||||
GSVector4i zl = z0.add64(VectorF::m_xc1e00000000fffff).f64toi32(false);
|
||||
GSVector4i zh = z1.add64(VectorF::m_xc1e00000000fffff).f64toi32(false);
|
||||
#if _M_SSE >= 0x501
|
||||
zs = GSVector8i(zl, zh);
|
||||
#else
|
||||
zs = zl.upl64(zh);
|
||||
#endif
|
||||
zs += VectorI::x80000000();
|
||||
}
|
||||
else
|
||||
{
|
||||
zs = VectorI(z);
|
||||
#if _M_SSE >= 0x501
|
||||
zs = GSVector8i(z0.f64toi32(), z1.f64toi32());
|
||||
#else
|
||||
zs = z0.f64toi32().upl64(z1.f64toi32());
|
||||
#endif
|
||||
}
|
||||
|
||||
if (sel.zclamp)
|
||||
|
@ -1487,10 +1501,12 @@ void GSDrawScanline::CDrawScanline(int pixels, int left, int top, const GSVertex
|
|||
if (sel.zb)
|
||||
{
|
||||
#if _M_SSE >= 0x501
|
||||
zo += GSVector8::broadcast32(&local.d8.p.z);
|
||||
GSVector8 add = GSVector8::broadcast64(&local.d8.p.z);
|
||||
#else
|
||||
zo += local.d4.z;
|
||||
GSVector4 add = local.d4.z;
|
||||
#endif
|
||||
zo0 = zo0.add64(add);
|
||||
zo1 = zo1.add64(add);
|
||||
}
|
||||
|
||||
if (sel.fwrite && sel.fge)
|
||||
|
|
|
@ -30,7 +30,8 @@ using namespace Xbyak;
|
|||
// If use_lod, m_local.gd->tex, else m_local.gd->tex[0]
|
||||
#define _64_m_local__gd__tex r14
|
||||
|
||||
#define _rip_local(field) ((m_rip) ? ptr[rip + (char*)&m_local.field] : ptr[_m_local + OFFSETOF(GSScanlineLocalData, field)])
|
||||
#define _rip_local_(ptrtype, field) ((m_rip) ? ptrtype[rip + (char*)&m_local.field] : ptrtype[_m_local + OFFSETOF(GSScanlineLocalData, field)])
|
||||
#define _rip_local(field) _rip_local_(ptr, field)
|
||||
#define _rip_global(field) ((m_rip) ? ptr[rip + (char*)&m_local.gd->field] : ptr[_m_local__gd + OFFSETOF(GSScanlineGlobalData, field)])
|
||||
|
||||
/// On AVX, does a v-prefixed separate destination operation
|
||||
|
@ -99,7 +100,7 @@ GSDrawScanlineCodeGenerator2::GSDrawScanlineCodeGenerator2(Xbyak::CodeGenerator*
|
|||
, _m_local__gd(chooseLocal(m_local.gd, _64_m_local__gd))
|
||||
, _m_local__gd__vm(chooseLocal(m_local.gd->vm, _64_m_local__gd__vm))
|
||||
, _rb(xym5), _ga(xym6), _fm(xym3), _zm(xym4), _fd(xym2), _test(xym15)
|
||||
, _z(xym8), _f(xym9), _s(xym10), _t(xym11), _q(xym12), _f_rb(xym13), _f_ga(xym14)
|
||||
, _f(xym9), _s(xym10), _t(xym11), _q(xym12), _f_rb(xym13), _f_ga(xym14)
|
||||
{
|
||||
m_sel.key = key;
|
||||
use_lod = m_sel.mmin;
|
||||
|
@ -169,6 +170,15 @@ void GSDrawScanlineCodeGenerator2::pbroadcastwLocal(const XYm& reg, const Addres
|
|||
#endif
|
||||
}
|
||||
|
||||
/// Emits a load of the 64-bit value at `mem` replicated across `reg`.
/// When USING_YMM is set this emits vbroadcastsd; otherwise it emits movddup.
void GSDrawScanlineCodeGenerator2::broadcastsd(const XYm& reg, const Address& mem)
|
||||
{
|
||||
#if USING_YMM
|
||||
vbroadcastsd(reg, mem);
|
||||
#else
|
||||
movddup(reg, mem);
|
||||
#endif
|
||||
}
|
||||
|
||||
void GSDrawScanlineCodeGenerator2::broadcastGPRToVec(const XYm& vec, const Xbyak::Reg32& gpr)
|
||||
{
|
||||
movd(Xmm(vec.getIdx()), gpr);
|
||||
|
@ -691,7 +701,6 @@ void GSDrawScanlineCodeGenerator2::Init()
|
|||
// Free: rax
|
||||
|
||||
const XYm& f = _f;
|
||||
const XYm& z = _z;
|
||||
|
||||
if (m_sel.prim != GS_SPRITE_CLASS)
|
||||
{
|
||||
|
@ -714,26 +723,25 @@ void GSDrawScanlineCodeGenerator2::Init()
|
|||
if (m_sel.zb)
|
||||
{
|
||||
// z = vp.zzzz() + m_local.d[skip].z;
|
||||
broadcastsd(xym1, ptr[a3 + offsetof(GSVertexSW, p.z)]); // v.p.z
|
||||
if (hasAVX)
|
||||
{
|
||||
vbroadcastss(z, ptr[a3 + offsetof(GSVertexSW, p.z)]);
|
||||
vaddpd(xym0, xym1, ptr[a1 + offsetof(GSScanlineLocalData::skip, z0)]);
|
||||
vaddpd(xym1, xym1, ptr[a1 + offsetof(GSScanlineLocalData::skip, z1)]);
|
||||
}
|
||||
else
|
||||
{
|
||||
movss(z, ptr[a3 + offsetof(GSVertexSW, p.z)]);
|
||||
shufps(z, z, _MM_SHUFFLE(0, 0, 0, 0));
|
||||
movaps(xym0, ptr[a1 + offsetof(GSScanlineLocalData::skip, z0)]);
|
||||
addpd(xym0, xym1);
|
||||
addpd(xym1, ptr[a1 + offsetof(GSScanlineLocalData::skip, z1)]);
|
||||
}
|
||||
addps(z, ptr[a1 + offsetof(GSScanlineLocalData::skip, z)]);
|
||||
movaps(_rip_local(temp.z0), xym0);
|
||||
movaps(_rip_local(temp.z1), xym1);
|
||||
}
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
if (m_sel.ztest)
|
||||
{
|
||||
pbroadcastdLocal(z, _rip_local(p.z));
|
||||
}
|
||||
|
||||
if (m_sel.fwrite && m_sel.fge)
|
||||
pbroadcastwLocal(_f, _rip_local(p.f));
|
||||
}
|
||||
|
@ -902,8 +910,7 @@ void GSDrawScanlineCodeGenerator2::Step()
|
|||
|
||||
add(t0, vecsize / 2);
|
||||
|
||||
const XYm& z =_z;
|
||||
const XYm& f =_f;
|
||||
const XYm& f = _f;
|
||||
|
||||
if (m_sel.prim != GS_SPRITE_CLASS)
|
||||
{
|
||||
|
@ -911,7 +918,20 @@ void GSDrawScanlineCodeGenerator2::Step()
|
|||
|
||||
if (m_sel.zb)
|
||||
{
|
||||
BROADCAST_AND_OP(vbroadcastss, addps, z, xym0, _rip_local_d_p(z));
|
||||
broadcastsd(xym1, _rip_local_d_p(z));
|
||||
if (hasAVX)
|
||||
{
|
||||
vaddpd(xym0, xym1, _rip_local(temp.z0));
|
||||
vaddpd(xym1, xym1, _rip_local(temp.z1));
|
||||
}
|
||||
else
|
||||
{
|
||||
movaps(xym0, _rip_local(temp.z0));
|
||||
addpd(xym0, xym1);
|
||||
addpd(xym1, _rip_local(temp.z1));
|
||||
}
|
||||
movaps(_rip_local(temp.z0), xym0);
|
||||
movaps(_rip_local(temp.z1), xym1);
|
||||
}
|
||||
|
||||
// f = f.add16(m_local.d4.f);
|
||||
|
@ -1042,8 +1062,6 @@ void GSDrawScanlineCodeGenerator2::TestZ(const XYm& temp1, const XYm& temp2)
|
|||
return;
|
||||
}
|
||||
|
||||
const XYm& z = _z;
|
||||
|
||||
// int za = fza_base.y + fza_offset->y;
|
||||
|
||||
mov(t2.cvt32(), dword[t1 + 4]);
|
||||
|
@ -1056,36 +1074,51 @@ void GSDrawScanlineCodeGenerator2::TestZ(const XYm& temp1, const XYm& temp2)
|
|||
{
|
||||
if (m_sel.zoverflow)
|
||||
{
|
||||
// zs = (GSVector4i(z * 0.5f) << 1) | (GSVector4i(z) & GSVector4i::x00000001());
|
||||
/*GSVector8 z = GSVector8::broadcast32(&scan.p.z) + zo;
|
||||
z /= 2;
|
||||
zs = GSVector8i(z, true);
|
||||
zs = zs.min_u32(GSVector8i::x7fffffff());
|
||||
zs = zs.sll32(1) | 1;*/
|
||||
// GSVector4i zl = z0.add64(VectorF::m_xc1e00000000fffff).f64toi32();
|
||||
// GSVector4i zh = z1.add64(VectorF::m_xc1e00000000fffff).f64toi32();
|
||||
// zs = GSVector8i(zl, zh);
|
||||
// zs += VectorI::x80000000();
|
||||
|
||||
auto m_half = loadAddress(rax, &GSVector4::m_half);
|
||||
auto m_imin = loadAddress(rax, &GSVector4::m_xc1e00000000fffff);
|
||||
broadcastsd(temp1, ptr[m_imin]);
|
||||
|
||||
if (hasAVX)
|
||||
vbroadcastss(temp1, ptr[m_half]);
|
||||
{
|
||||
vaddpd(xym0, temp1, _rip_local(temp.z0));
|
||||
vaddpd(temp1, temp1, _rip_local(temp.z1));
|
||||
}
|
||||
else
|
||||
movaps(temp1, ptr[m_half]);
|
||||
{
|
||||
movaps(xym0, _rip_local(temp.z0));
|
||||
addpd(xym0, temp1);
|
||||
addpd(temp1, _rip_local(temp.z1));
|
||||
}
|
||||
cvtpd2dq(xmm0, xym0);
|
||||
cvtpd2dq(Xmm(temp1.getIdx()), temp1);
|
||||
|
||||
mulps(temp1, z);
|
||||
cvttps2dq(temp1, temp1);
|
||||
pslld(temp1, 1);
|
||||
#if USING_YMM
|
||||
vinserti128(xym0, xym0, Xmm(temp1.getIdx()), 1);
|
||||
#else
|
||||
punpcklqdq(xym0, temp1);
|
||||
#endif
|
||||
|
||||
cvttps2dq(xym0, z);
|
||||
pcmpeqd(temp2, temp2);
|
||||
psrld(temp2, 31);
|
||||
pand(xym0, temp2);
|
||||
|
||||
por(xym0, temp1);
|
||||
pcmpeqd(temp1, temp1);
|
||||
pslld(temp1, 31);
|
||||
paddd(xym0, temp1);
|
||||
}
|
||||
else
|
||||
{
|
||||
// zs = GSVector4i(z);
|
||||
// zs = GSVector8i(z0.f64toi32(), z1.f64toi32());
|
||||
|
||||
cvttps2dq(xym0, z);
|
||||
#if USING_YMM
|
||||
cvttpd2dq(xmm0, _rip_local_(yword, temp.z0));
|
||||
cvttpd2dq(Xmm(temp1.getIdx()), _rip_local_(yword, temp.z1));
|
||||
vinserti128(xym0, xym0, Xmm(temp1.getIdx()), 1);
|
||||
#else
|
||||
cvttpd2dq(xmm0, _rip_local_(xword, temp.z0));
|
||||
cvttpd2dq(temp1, _rip_local_(xword, temp.z1));
|
||||
punpcklqdq(xym0, temp1);
|
||||
#endif
|
||||
}
|
||||
|
||||
// Clamp Z to ZPSM_FMT_MAX
|
||||
|
@ -1104,7 +1137,7 @@ void GSDrawScanlineCodeGenerator2::TestZ(const XYm& temp1, const XYm& temp2)
|
|||
}
|
||||
else
|
||||
{
|
||||
movdqa(xym0, _z);
|
||||
pbroadcastdLocal(xym0, _rip_local(p.z));
|
||||
}
|
||||
|
||||
if (m_sel.ztest)
|
||||
|
|
|
@ -82,7 +82,7 @@ class GSDrawScanlineCodeGenerator2 : public GSNewCodeGenerator
|
|||
/// Available on both x86 and x64, not always valid
|
||||
const XYm _rb, _ga, _fm, _zm, _fd, _test;
|
||||
/// Always valid if needed, x64 only
|
||||
const XYm _z, _f, _s, _t, _q, _f_rb, _f_ga;
|
||||
const XYm _f, _s, _t, _q, _f_rb, _f_ga;
|
||||
|
||||
/// Returns the first arg on 32-bit, second on 64-bit
|
||||
static LocalAddr chooseLocal(const void* addr32, AddressReg reg64)
|
||||
|
@ -117,6 +117,7 @@ private:
|
|||
/// On YMM registers this will be a broadcast from a 16-bit value
|
||||
/// On XMM registers this will be a load of a full 128-bit value, with the broadcast happening before storing to the local data
|
||||
void pbroadcastwLocal(const XYm& reg, const Xbyak::Address& mem);
|
||||
void broadcastsd(const XYm& reg, const Xbyak::Address& mem);
|
||||
/// Broadcast a 32-bit GPR to a vector register
|
||||
void broadcastGPRToVec(const XYm& vec, const Xbyak::Reg32& gpr);
|
||||
void modulate16(const XYm& a, const Xbyak::Operand& f, u8 shift);
|
||||
|
|
|
@ -463,7 +463,7 @@ void GSRasterizer::DrawTriangle(const GSVertexSW* vertex, const u32* index)
|
|||
GSVertexSW2 dv1 = v2 - v0;
|
||||
GSVertexSW2 dv2 = v2 - v1;
|
||||
|
||||
GSVector4 cross = dv0.p * dv1.p.yxwz();
|
||||
GSVector4 cross = GSVector4::loadl(&dv0.p) * GSVector4::loadl(&dv1.p).yxwz();
|
||||
|
||||
cross = (cross - cross.yxwz()).yyyy(); // select the second component, the negated cross product
|
||||
// the longest horizontal span would be cross.x / dv1.p.y, but we don't need its actual value
|
||||
|
@ -487,18 +487,10 @@ void GSRasterizer::DrawTriangle(const GSVertexSW* vertex, const u32* index)
|
|||
ddx[2] = ddx[0].xzyw();
|
||||
|
||||
// Precision is important here. Don't use reciprocal, it will break Jak3/Xenosaga1
|
||||
GSVector8 _dxy01c(dxy01 / cross);
|
||||
GSVector8 dxy01c(dxy01 / cross);
|
||||
|
||||
/*
|
||||
dscan = dv1 * dxy01c.yyyy() - dv0 * dxy01c.wwww();
|
||||
dedge = dv0 * dxy01c.zzzz() - dv1 * dxy01c.xxxx();
|
||||
*/
|
||||
|
||||
dscan.p = dv1.p * _dxy01c.yyyy().extract<0>() - dv0.p * _dxy01c.wwww().extract<0>();
|
||||
dscan.tc = dv1.tc * _dxy01c.yyyy() - dv0.tc * _dxy01c.wwww();
|
||||
|
||||
dedge.p = dv0.p * _dxy01c.zzzz().extract<0>() - dv1.p * _dxy01c.xxxx().extract<0>();
|
||||
dedge.tc = dv0.tc * _dxy01c.zzzz() - dv1.tc * _dxy01c.xxxx();
|
||||
|
||||
if (m1 & 1)
|
||||
{
|
||||
|
@ -567,13 +559,12 @@ void GSRasterizer::DrawTriangleSection(int top, int bottom, GSVertexSW2& edge, c
|
|||
|
||||
while (top < bottom)
|
||||
{
|
||||
GSVector8 dy(GSVector4(top) - p0.yyyy());
|
||||
const float dy = static_cast<float>(top) - p0.y;
|
||||
GSVector8 dyv(dy);
|
||||
|
||||
GSVertexSW2 scan;
|
||||
GSVector4 xy = GSVector4::loadl(&edge.p) + GSVector4::loadl(&dedge.p) * dyv.extract<0>();
|
||||
|
||||
scan.p = edge.p + dedge.p * dy.extract<0>();
|
||||
|
||||
GSVector4 lrf = scan.p.ceil();
|
||||
GSVector4 lrf = xy.ceil();
|
||||
GSVector4 l = lrf.max(scissor);
|
||||
GSVector4 r = lrf.min(scissor);
|
||||
GSVector4i lr = GSVector4i(l.xxyy(r));
|
||||
|
@ -585,12 +576,13 @@ void GSRasterizer::DrawTriangleSection(int top, int bottom, GSVertexSW2& edge, c
|
|||
|
||||
if (pixels > 0)
|
||||
{
|
||||
scan.tc = edge.tc + dedge.tc * dy;
|
||||
float prestep = l.x - p0.x;
|
||||
GSVector8 prestepv(prestep);
|
||||
|
||||
GSVector8 prestep((l - p0).xxxx());
|
||||
|
||||
scan.p = scan.p + dscan.p * prestep.extract<0>();
|
||||
scan.tc = scan.tc + dscan.tc * prestep;
|
||||
GSVertexSW2 scan;
|
||||
GSVector4::storel(&scan.p, xy + GSVector4::loadl(&dscan.p) * prestepv.extract<0>());
|
||||
scan.p.F64[1] = edge.p.F64[1] + dedge.p.F64[1] * dy + dscan.p.F64[1] * prestep;
|
||||
scan.tc = edge.tc + dedge.tc * dyv + dscan.tc * prestepv;
|
||||
|
||||
AddScanline(e++, pixels, left, top, (GSVertexSW&)scan);
|
||||
}
|
||||
|
@ -652,7 +644,7 @@ void GSRasterizer::DrawTriangle(const GSVertexSW* vertex, const u32* index)
|
|||
GSVertexSW dv1 = v2 - v0;
|
||||
GSVertexSW dv2 = v2 - v1;
|
||||
|
||||
GSVector4 cross = dv0.p * dv1.p.yxwz();
|
||||
GSVector4 cross = GSVector4::loadl(&dv0.p) * GSVector4::loadl(&dv1.p).yxwz();
|
||||
|
||||
cross = (cross - cross.yxwz()).yyyy(); // select the second component, the negated cross product
|
||||
// the longest horizontal span would be cross.x / dv1.p.y, but we don't need its actual value
|
||||
|
@ -678,18 +670,8 @@ void GSRasterizer::DrawTriangle(const GSVertexSW* vertex, const u32* index)
|
|||
// Precision is important here. Don't use reciprocal, it will break Jak3/Xenosaga1
|
||||
GSVector4 dxy01c = dxy01 / cross;
|
||||
|
||||
/*
|
||||
dscan = dv1 * dxy01c.yyyy() - dv0 * dxy01c.wwww();
|
||||
dedge = dv0 * dxy01c.zzzz() - dv1 * dxy01c.xxxx();
|
||||
*/
|
||||
|
||||
dscan.p = dv1.p * dxy01c.yyyy() - dv0.p * dxy01c.wwww();
|
||||
dscan.t = dv1.t * dxy01c.yyyy() - dv0.t * dxy01c.wwww();
|
||||
dscan.c = dv1.c * dxy01c.yyyy() - dv0.c * dxy01c.wwww();
|
||||
|
||||
dedge.p = dv0.p * dxy01c.zzzz() - dv1.p * dxy01c.xxxx();
|
||||
dedge.t = dv0.t * dxy01c.zzzz() - dv1.t * dxy01c.xxxx();
|
||||
dedge.c = dv0.c * dxy01c.zzzz() - dv1.c * dxy01c.xxxx();
|
||||
|
||||
if (m1 & 1)
|
||||
{
|
||||
|
@ -758,13 +740,11 @@ void GSRasterizer::DrawTriangleSection(int top, int bottom, GSVertexSW& edge, co
|
|||
|
||||
while (top < bottom)
|
||||
{
|
||||
GSVector4 dy = GSVector4(top) - p0.yyyy();
|
||||
const float dy = static_cast<float>(top) - p0.y;
|
||||
|
||||
GSVertexSW scan;
|
||||
GSVector4 xy = GSVector4::loadl(&edge.p) + GSVector4::loadl(&dedge.p) * dy;
|
||||
|
||||
scan.p = edge.p + dedge.p * dy;
|
||||
|
||||
GSVector4 lrf = scan.p.ceil();
|
||||
GSVector4 lrf = xy.ceil();
|
||||
GSVector4 l = lrf.max(scissor);
|
||||
GSVector4 r = lrf.min(scissor);
|
||||
GSVector4i lr = GSVector4i(l.xxyy(r));
|
||||
|
@ -776,14 +756,13 @@ void GSRasterizer::DrawTriangleSection(int top, int bottom, GSVertexSW& edge, co
|
|||
|
||||
if (pixels > 0)
|
||||
{
|
||||
scan.t = edge.t + dedge.t * dy;
|
||||
scan.c = edge.c + dedge.c * dy;
|
||||
const float prestep = l.x - p0.x;
|
||||
|
||||
GSVector4 prestep = (l - p0).xxxx();
|
||||
|
||||
scan.p = scan.p + dscan.p * prestep;
|
||||
scan.t = scan.t + dscan.t * prestep;
|
||||
scan.c = scan.c + dscan.c * prestep;
|
||||
GSVertexSW scan;
|
||||
GSVector4::storel(&scan.p, xy + GSVector4::loadl(&dscan.p) * prestep);
|
||||
scan.p.F64[1] = edge.p.F64[1] + dedge.p.F64[1] * dy + dscan.p.F64[1] * prestep;
|
||||
scan.t = edge.t + dedge.t * dy + dscan.t * prestep;
|
||||
scan.c = edge.c + dedge.c * dy + dscan.c * prestep;
|
||||
|
||||
AddScanline(e++, pixels, left, top, scan);
|
||||
}
|
||||
|
|
|
@ -250,8 +250,8 @@ void GSRendererSW::ConvertVertexBuffer(GSVertexSW* RESTRICT dst, const GSVertex*
|
|||
}
|
||||
else
|
||||
{
|
||||
float z = static_cast<float>(static_cast<u32>(xyzuvf.extract32<1>()));
|
||||
dst->p = (GSVector4(xy) * m_pos_scale).upld(GSVector4(z, 0.0, 0.0, 0.0));
|
||||
double z = static_cast<double>(static_cast<u32>(xyzuvf.extract32<1>()));
|
||||
dst->p = (GSVector4(xy) * m_pos_scale).upld(GSVector4::f64(z, 0.0));
|
||||
t = t.blend32<8>(GSVector4(xyzuvf << 7));
|
||||
}
|
||||
|
||||
|
|
|
@ -170,16 +170,16 @@ struct alignas(32) GSScanlineLocalData // per prim variables, each thread has it
|
|||
{
|
||||
#if _M_SSE >= 0x501
|
||||
|
||||
struct skip { GSVector8 z, s, t, q; GSVector8i rb, ga, f, _pad; } d[8];
|
||||
struct step { GSVector4 stq; struct { u32 rb, ga; } c; struct { u32 z, f; } p; } d8;
|
||||
struct { GSVector8i rb, ga; } c;
|
||||
struct skip { GSVector8 z0, z1, s, t, q; GSVector8i rb, ga, f; } d[8];
|
||||
struct step { GSVector4 stq; struct { u32 rb, ga; } c; struct { u64 z; u32 f; } p; } d8;
|
||||
struct { u32 z, f; } p;
|
||||
struct { GSVector8i rb, ga; } c;
|
||||
|
||||
// these should be stored on stack as normal local variables (no free regs to use, esp cannot be saved to anywhere, and we need an aligned stack)
|
||||
|
||||
struct
|
||||
{
|
||||
GSVector8 z, zo;
|
||||
GSVector8 z0, z1;
|
||||
GSVector8i f;
|
||||
GSVector8 s, t, q;
|
||||
GSVector8i rb, ga;
|
||||
|
@ -198,7 +198,7 @@ struct alignas(32) GSScanlineLocalData // per prim variables, each thread has it
|
|||
|
||||
#else
|
||||
|
||||
struct skip { GSVector4 z, s, t, q; GSVector4i rb, ga, f, _pad; } d[4];
|
||||
struct skip { GSVector4 z0, z1, s, t, q; GSVector4i rb, ga, f; } d[4];
|
||||
struct step { GSVector4 z, stq; GSVector4i c, f; } d4;
|
||||
struct { GSVector4i rb, ga; } c;
|
||||
struct { GSVector4i z, f; } p;
|
||||
|
@ -207,7 +207,7 @@ struct alignas(32) GSScanlineLocalData // per prim variables, each thread has it
|
|||
|
||||
struct
|
||||
{
|
||||
GSVector4 z, zo;
|
||||
GSVector4 z0, z1;
|
||||
GSVector4i f;
|
||||
GSVector4 s, t, q;
|
||||
GSVector4i rb, ga;
|
||||
|
|
|
@ -192,21 +192,26 @@ void GSSetupPrimCodeGenerator2::Depth_XMM()
|
|||
|
||||
if (m_en.z)
|
||||
{
|
||||
// GSVector4 dz = p.zzzz();
|
||||
// VectorF dz = VectorF::broadcast64(&dscan.p.z)
|
||||
movddup(xmm0, ptr[_dscan + offsetof(GSVertexSW, p.z)]);
|
||||
|
||||
broadcastss(xym0, ptr[_dscan + offsetof(GSVertexSW, p.z)]);
|
||||
|
||||
// m_local.d4.z = dz * 4.0f;
|
||||
|
||||
THREEARG(mulps, xmm1, xmm0, xmm3);
|
||||
movdqa(_rip_local_d_p(z), xmm1);
|
||||
// m_local.d4.z = dz.mul64(GSVector4::f32to64(shift));
|
||||
cvtps2pd(xmm1, xmm3);
|
||||
mulpd(xmm1, xmm0);
|
||||
movaps(_rip_local_d_p(z), xmm1);
|
||||
|
||||
for (int i = 0; i < (m_sel.notest ? 1 : 4); i++)
|
||||
{
|
||||
// m_local.d[i].z = dz * m_shift[i];
|
||||
// m_local.d[i].z0 = dz.mul64(VectorF::f32to64(half_shift[2 * i + 2]));
|
||||
// m_local.d[i].z1 = dz.mul64(VectorF::f32to64(half_shift[2 * i + 3]));
|
||||
|
||||
THREEARG(mulps, xmm1, xmm0, XYm(4 + i));
|
||||
movdqa(_rip_local(d[i].z), xmm1);
|
||||
cvtps2pd(xmm1, XYm(4 + i));
|
||||
pshufd(xmm2, XYm(4 + i), _MM_SHUFFLE(1, 0, 3, 2));
|
||||
cvtps2pd(xmm2, xmm2);
|
||||
mulpd(xmm1, xmm0);
|
||||
mulpd(xmm2, xmm0);
|
||||
movaps(_rip_local(d[i].z0), xmm1);
|
||||
movaps(_rip_local(d[i].z1), xmm2);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -276,23 +281,34 @@ void GSSetupPrimCodeGenerator2::Depth_YMM()
|
|||
|
||||
if (m_en.z)
|
||||
{
|
||||
// const VectorF dz = VectorF::broadcast32(&dscan.p.z);
|
||||
vbroadcastss(ymm0, ptr[_dscan + offsetof(GSVertexSW, p.z)]);
|
||||
// const VectorF dz = VectorF::broadcast64(&dscan.p.z);
|
||||
vbroadcastsd(ymm0, ptr[_dscan + offsetof(GSVertexSW, p.z)]);
|
||||
|
||||
// local.d8.p.z = (dz.extract<0>() * step_shift).extract32<0>();
|
||||
vmulss(xmm1, xmm0, xmm3);
|
||||
movss(_rip_local_d_p(z), xmm1);
|
||||
// GSVector4::storel(&local.d8.p.z, dz.extract<0>().mul64(GSVector4::f32to64(shift)));
|
||||
cvtss2sd(xmm1, xmm3);
|
||||
vmulsd(xmm1, xmm0, xmm1);
|
||||
movsd(_rip_local_d_p(z), xmm1);
|
||||
|
||||
for (int i = 0; i < (m_sel.notest ? 1 : dsize); i++)
|
||||
{
|
||||
// m_local.d[i].z = dz * shift[1 + i];
|
||||
// m_local.d[i].z0 = dz.mul64(VectorF::f32to64(half_shift[2 * i + 2]));
|
||||
// m_local.d[i].z1 = dz.mul64(VectorF::f32to64(half_shift[2 * i + 3]));
|
||||
|
||||
// Save a byte in the encoding for ymm8-11 by swapping with ymm0 (multiplication is commutative)
|
||||
if (i < 4 || many_regs)
|
||||
vmulps(ymm1, Ymm(4 + i), ymm0);
|
||||
{
|
||||
cvtps2pd(ymm1, Xmm(4 + i));
|
||||
vextracti128(xmm2, Ymm(4 + i), 1);
|
||||
cvtps2pd(ymm2, xmm2);
|
||||
}
|
||||
else
|
||||
vmulps(ymm1, ymm0, ptr[g_const->m_shift_256b[i + 1]]);
|
||||
movaps(_rip_local(d[i].z), ymm1);
|
||||
{
|
||||
cvtps2pd(ymm1, ptr[&g_const->m_shift_256b[i + 1][0]]);
|
||||
cvtps2pd(ymm2, ptr[&g_const->m_shift_256b[i + 1][4]]);
|
||||
}
|
||||
mulpd(ymm1, ymm0);
|
||||
mulpd(ymm2, ymm0);
|
||||
movaps(_rip_local(d[i].z0), ymm1);
|
||||
movaps(_rip_local(d[i].z1), ymm2);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -19,6 +19,16 @@
|
|||
|
||||
struct alignas(32) GSVertexSW
|
||||
{
|
||||
// When drawing sprites:
|
||||
// p: x y _ f
|
||||
// t: s t q z
|
||||
// c: r g b a
|
||||
// Otherwise:
|
||||
// p: x y zl zh
|
||||
// t: s t q f
|
||||
// c: r g b a
|
||||
// cov is placed in x since by the time it's known, xy are no longer needed
|
||||
|
||||
GSVector4 p, _pad, t, c;
|
||||
|
||||
__forceinline GSVertexSW() {}
|
||||
|
@ -43,7 +53,8 @@ struct alignas(32) GSVertexSW
|
|||
|
||||
__forceinline void operator+=(const GSVertexSW& v)
|
||||
{
|
||||
p += v.p;
|
||||
GSVector4::storel(&p, GSVector4::loadl(&p) + GSVector4::loadl(&v.p));
|
||||
p.F64[1] += v.p.F64[1];
|
||||
t += v.t;
|
||||
c += v.c;
|
||||
}
|
||||
|
@ -52,7 +63,8 @@ struct alignas(32) GSVertexSW
|
|||
{
|
||||
GSVertexSW v;
|
||||
|
||||
v.p = a.p + b.p;
|
||||
GSVector4::storel(&v.p, GSVector4::loadl(&a.p) + GSVector4::loadl(&b.p));
|
||||
v.p.F64[1] = a.p.F64[1] + b.p.F64[1];
|
||||
v.t = a.t + b.t;
|
||||
v.c = a.c + b.c;
|
||||
|
||||
|
@ -63,7 +75,8 @@ struct alignas(32) GSVertexSW
|
|||
{
|
||||
GSVertexSW v;
|
||||
|
||||
v.p = a.p - b.p;
|
||||
GSVector4::storel(&v.p, GSVector4::loadl(&a.p) - GSVector4::loadl(&b.p));
|
||||
v.p.F64[1] = a.p.F64[1] - b.p.F64[1];
|
||||
v.t = a.t - b.t;
|
||||
v.c = a.c - b.c;
|
||||
|
||||
|
@ -74,7 +87,8 @@ struct alignas(32) GSVertexSW
|
|||
{
|
||||
GSVertexSW v;
|
||||
|
||||
v.p = a.p * b;
|
||||
GSVector4::storel(&v.p, GSVector4::loadl(&a.p) * b);
|
||||
v.p.F64[1] = a.p.F64[1] * b.F32[0];
|
||||
v.t = a.t * b;
|
||||
v.c = a.c * b;
|
||||
|
||||
|
@ -85,7 +99,8 @@ struct alignas(32) GSVertexSW
|
|||
{
|
||||
GSVertexSW v;
|
||||
|
||||
v.p = a.p / b;
|
||||
GSVector4::storel(&v.p, GSVector4::loadl(&a.p) / b);
|
||||
v.p.F64[1] = a.p.F64[1] / b.F32[0];
|
||||
v.t = a.t / b;
|
||||
v.c = a.c / b;
|
||||
|
||||
|
@ -249,11 +264,23 @@ struct alignas(32) GSVertexSW2
|
|||
{
|
||||
GSVertexSW2 v;
|
||||
|
||||
v.p = a.p - b.p;
|
||||
GSVector4::storel(&v.p, GSVector4::loadl(&a.p) - GSVector4::loadl(&b.p));
|
||||
v.p.F64[1] = a.p.F64[1] - b.p.F64[1];
|
||||
v.tc = a.tc - b.tc;
|
||||
|
||||
return v;
|
||||
}
|
||||
|
||||
/// Scales a vertex by `b` and returns the result as a new GSVertexSW2.
/// `p` is handled in two pieces instead of as one vector multiply:
/// the part accessed through loadl/storel is multiplied by b.extract<0>(),
/// and the double in p.F64[1] is multiplied by the scalar b.F32[0].
/// NOTE(review): presumably loadl/storel touch only the low 64 bits (x, y) of p — confirm against GSVector4.
__forceinline friend GSVertexSW2 operator*(const GSVertexSW2& a, const GSVector8& b)
|
||||
{
|
||||
GSVertexSW2 v;
|
||||
|
||||
GSVector4::storel(&v.p, GSVector4::loadl(&a.p) * b.extract<0>());
|
||||
// The F64[1] slot is scaled as a double by the first float lane of b
v.p.F64[1] = a.p.F64[1] * b.F32[0];
|
||||
v.tc = a.tc * b;
|
||||
|
||||
return v;
|
||||
}
|
||||
};
|
||||
|
||||
#endif
|
||||
|
|
Loading…
Reference in New Issue