mirror of https://github.com/PCSX2/pcsx2.git
GS:SW: Rearrange GSVertexSW members to better match planned DoubleZ arrangement
Note: Removes zequal. DoubleZ will fix, but until then things will break
This commit is contained in:
parent
56bba522ac
commit
9be7eb67d8
|
@ -98,7 +98,6 @@ void GSDrawScanline::BeginDraw(const GSRasterizerData* data)
|
|||
sel.fb = m_global.sel.fb;
|
||||
sel.zb = m_global.sel.zb;
|
||||
sel.zoverflow = m_global.sel.zoverflow;
|
||||
sel.zequal = m_global.sel.zequal;
|
||||
sel.notest = m_global.sel.notest;
|
||||
|
||||
m_sp = m_sp_map[sel];
|
||||
|
@ -138,23 +137,22 @@ void GSDrawScanline::CSetupPrim(const GSVertexSW* vertex, const u32* index, cons
|
|||
const GSVector4 step_shift = shift[0];
|
||||
#endif
|
||||
|
||||
GSVector4 tstep = dscan.t * step_shift;
|
||||
|
||||
if (has_z || has_f)
|
||||
{
|
||||
if (sel.prim != GS_SPRITE_CLASS)
|
||||
{
|
||||
#if _M_SSE >= 0x501
|
||||
GSVector4 dp8 = dscan.p * step_shift;
|
||||
#endif
|
||||
if (has_f)
|
||||
{
|
||||
#if _M_SSE >= 0x501
|
||||
local.d8.p.f = GSVector4i(dp8).extract32<3>();
|
||||
local.d8.p.f = GSVector4i(tstep).extract32<3>();
|
||||
|
||||
GSVector8 df = GSVector8::broadcast32(&dscan.p.w);
|
||||
GSVector8 df = GSVector8::broadcast32(&dscan.t.w);
|
||||
#else
|
||||
GSVector4 df = dscan.p.wwww();
|
||||
GSVector4 df = dscan.t.wwww();
|
||||
|
||||
local.d4.f = GSVector4i(df * shift[0]).xxzzlh();
|
||||
local.d4.f = GSVector4i(tstep).zzzzh().wwww();
|
||||
#endif
|
||||
|
||||
for (int i = 0; i < vlen; i++)
|
||||
|
@ -165,25 +163,18 @@ void GSDrawScanline::CSetupPrim(const GSVertexSW* vertex, const u32* index, cons
|
|||
|
||||
if (has_z)
|
||||
{
|
||||
if (sel.zequal)
|
||||
{
|
||||
local.p.z = vertex[index[1]].t.U32[3];
|
||||
}
|
||||
|
||||
{
|
||||
#if _M_SSE >= 0x501
|
||||
local.d8.p.z = dp8.extract32<2>();
|
||||
const VectorF dz = VectorF::broadcast32(&dscan.p.z);
|
||||
|
||||
const GSVector8 dz = GSVector8::broadcast32(&dscan.p.z);
|
||||
local.d8.p.z = (dz.extract<0>() * step_shift).extract32<0>();
|
||||
#else
|
||||
const GSVector4 dz = dscan.p.zzzz();
|
||||
const GSVector4 dz = dscan.p.zzzz();
|
||||
|
||||
local.d4.z = dz * shift[0];
|
||||
local.d4.z = dz * step_shift;
|
||||
#endif
|
||||
for (int i = 0; i < vlen; i++)
|
||||
{
|
||||
local.d[i].z = dz * shift[1 + i];
|
||||
}
|
||||
for (int i = 0; i < vlen; i++)
|
||||
{
|
||||
local.d[i].z = dz * shift[1 + i];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -207,8 +198,6 @@ void GSDrawScanline::CSetupPrim(const GSVertexSW* vertex, const u32* index, cons
|
|||
|
||||
if (has_t)
|
||||
{
|
||||
GSVector4 tstep = dscan.t * step_shift;
|
||||
|
||||
if (sel.fst)
|
||||
{
|
||||
LOCAL_STEP.stq = GSVector4::cast(GSVector4i(tstep));
|
||||
|
@ -361,9 +350,9 @@ void GSDrawScanline::CDrawScanline(int pixels, int left, int top, const GSVertex
|
|||
if (sel.fwrite && sel.fge)
|
||||
{
|
||||
#if _M_SSE >= 0x501
|
||||
f = GSVector8i::broadcast16(GSVector4i(scan.p).srl<12>()).add16(local.d[skip].f);
|
||||
f = GSVector8i::broadcast16(GSVector4i(scan.t).srl<12>()).add16(local.d[skip].f);
|
||||
#else
|
||||
f = GSVector4i(scan.p).zzzzh().zzzz().add16(local.d[skip].f);
|
||||
f = GSVector4i(scan.t).zzzzh().zzzz().add16(local.d[skip].f);
|
||||
#endif
|
||||
}
|
||||
|
||||
|
@ -378,9 +367,9 @@ void GSDrawScanline::CDrawScanline(int pixels, int left, int top, const GSVertex
|
|||
if (sel.edge)
|
||||
{
|
||||
#if _M_SSE >= 0x501
|
||||
cov = GSVector8i::broadcast16(GSVector4i::cast(scan.t).srl<12>()).srl16(9);
|
||||
cov = GSVector8i::broadcast16(GSVector4i::cast(scan.p)).srl16(9);
|
||||
#else
|
||||
cov = GSVector4i::cast(scan.t).zzzzh().wwww().srl16(9);
|
||||
cov = GSVector4i::cast(scan.p).xxxxl().xxxx().srl16(9);
|
||||
#endif
|
||||
}
|
||||
|
||||
|
@ -467,11 +456,7 @@ void GSDrawScanline::CDrawScanline(int pixels, int left, int top, const GSVertex
|
|||
GSVector4 z = scan.p.zzzz() + zo;
|
||||
#endif
|
||||
|
||||
if (sel.zequal)
|
||||
{
|
||||
zs = local.p.z;
|
||||
}
|
||||
else if (sel.zoverflow)
|
||||
if (sel.zoverflow)
|
||||
{
|
||||
zs = (VectorI(z * 0.5f) << 1) | (VectorI(z) & VectorI::x00000001());
|
||||
}
|
||||
|
|
|
@ -697,28 +697,33 @@ void GSDrawScanlineCodeGenerator2::Init()
|
|||
{
|
||||
if (m_sel.fwrite && m_sel.fge || m_sel.zb)
|
||||
{
|
||||
broadcastf128(z, ptr[a3 + offsetof(GSVertexSW, p)]); // v.p
|
||||
|
||||
if (m_sel.fwrite && m_sel.fge)
|
||||
{
|
||||
// f = GSVector4i(vp).zzzzh().zzzz().add16(m_local.d[skip].f);
|
||||
// f = GSVector4i(v.t).zzzzh().zzzz().add16(m_local.d[skip].f);
|
||||
if (isYmm)
|
||||
vbroadcastss(f, ptr[a3 + offsetof(GSVertexSW, t.w)]);
|
||||
else
|
||||
movss(f, ptr[a3 + offsetof(GSVertexSW, t.w)]); // v.t.w
|
||||
|
||||
cvttps2dq(f, z);
|
||||
pshufhw(f, f, _MM_SHUFFLE(2, 2, 2, 2));
|
||||
pshufd(f, f, _MM_SHUFFLE(2, 2, 2, 2));
|
||||
cvttps2dq(f, f);
|
||||
punpcklwd(f, f);
|
||||
pshufd(f, f, _MM_SHUFFLE(0, 0, 0, 0));
|
||||
paddw(f, ptr[a1 + offsetof(GSScanlineLocalData::skip, f)]);
|
||||
}
|
||||
|
||||
if (m_sel.zb)
|
||||
{
|
||||
if (!m_sel.zequal)
|
||||
// z = vp.zzzz() + m_local.d[skip].z;
|
||||
if (hasAVX)
|
||||
{
|
||||
// z = vp.zzzz() + m_local.d[skip].z;
|
||||
shufps(z, z, _MM_SHUFFLE(2, 2, 2, 2));
|
||||
addps(z, ptr[a1 + offsetof(GSScanlineLocalData::skip, z)]);
|
||||
vbroadcastss(z, ptr[a3 + offsetof(GSVertexSW, p.z)]);
|
||||
}
|
||||
else
|
||||
pbroadcastdLocal(z, _rip_local(p.z));
|
||||
{
|
||||
movss(z, ptr[a3 + offsetof(GSVertexSW, p.z)]);
|
||||
shufps(z, z, _MM_SHUFFLE(0, 0, 0, 0));
|
||||
}
|
||||
addps(z, ptr[a1 + offsetof(GSScanlineLocalData::skip, z)]);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -733,21 +738,22 @@ void GSDrawScanlineCodeGenerator2::Init()
|
|||
pbroadcastwLocal(_f, _rip_local(p.f));
|
||||
}
|
||||
|
||||
const XYm& vt = xym4;
|
||||
|
||||
if (m_sel.fb)
|
||||
{
|
||||
if (m_sel.edge || m_sel.tfx != TFX_NONE)
|
||||
{
|
||||
broadcastf128(vt, ptr[a3 + offsetof(GSVertexSW, t)]); // v.t
|
||||
}
|
||||
|
||||
if (m_sel.edge)
|
||||
{
|
||||
// m_local.temp.cov = GSVector4i::cast(v.t).zzzzh().wwww().srl16(9);
|
||||
// m_local.temp.cov = GSVector8i::broadcast16(GSVector4i::cast(scan.p)).srl16(9);
|
||||
|
||||
pshufhw(xym3, vt, _MM_SHUFFLE(2, 2, 2, 2));
|
||||
pshufd(xym3, xym3, _MM_SHUFFLE(3, 3, 3, 3));
|
||||
if (hasAVX2)
|
||||
{
|
||||
vpbroadcastw(xym3, ptr[a3 + offsetof(GSVertexSW, p.x)]);
|
||||
}
|
||||
else
|
||||
{
|
||||
movd(xmm3, ptr[a3 + offsetof(GSVertexSW, p.x)]);
|
||||
punpcklwd(xmm3, xmm3);
|
||||
pshufd(xmm3, xmm3, _MM_SHUFFLE(0, 0, 0, 0));
|
||||
}
|
||||
psrlw(xym3, 9);
|
||||
|
||||
movdqa(_rip_local(temp.cov), xym3);
|
||||
|
@ -755,6 +761,10 @@ void GSDrawScanlineCodeGenerator2::Init()
|
|||
|
||||
if (m_sel.tfx != TFX_NONE)
|
||||
{
|
||||
const XYm& vt = xym4;
|
||||
|
||||
broadcastf128(vt, ptr[a3 + offsetof(GSVertexSW, t)]); // v.t
|
||||
|
||||
// a1 = &m_local.d[skip]
|
||||
|
||||
const XYm& s = _s;
|
||||
|
@ -901,14 +911,7 @@ void GSDrawScanlineCodeGenerator2::Step()
|
|||
|
||||
if (m_sel.zb)
|
||||
{
|
||||
if (m_sel.zequal)
|
||||
{
|
||||
pbroadcastdLocal(z, _rip_local(p.z));
|
||||
}
|
||||
else
|
||||
{
|
||||
BROADCAST_AND_OP(vbroadcastss, addps, z, xym0, _rip_local_d_p(z));
|
||||
}
|
||||
BROADCAST_AND_OP(vbroadcastss, addps, z, xym0, _rip_local_d_p(z));
|
||||
}
|
||||
|
||||
// f = f.add16(m_local.d4.f);
|
||||
|
@ -1051,11 +1054,7 @@ void GSDrawScanlineCodeGenerator2::TestZ(const XYm& temp1, const XYm& temp2)
|
|||
|
||||
if (m_sel.prim != GS_SPRITE_CLASS)
|
||||
{
|
||||
if (m_sel.zequal)
|
||||
{
|
||||
movdqa(xym0, _z);
|
||||
}
|
||||
else if (m_sel.zoverflow)
|
||||
if (m_sel.zoverflow)
|
||||
{
|
||||
// zs = (GSVector4i(z * 0.5f) << 1) | (GSVector4i(z) & GSVector4i::x00000001());
|
||||
/*GSVector8 z = GSVector8::broadcast32(&scan.p.z) + zo;
|
||||
|
|
|
@ -963,7 +963,7 @@ void GSRasterizer::DrawEdge(const GSVertexSW& v0, const GSVertexSW& v1, const GS
|
|||
{
|
||||
AddScanline(e, 1, xi, top, edge);
|
||||
|
||||
e->t.U32[3] = (0x10000 - xf) & 0xffff;
|
||||
e->p.U32[0] = (0x10000 - xf) & 0xffff;
|
||||
|
||||
e++;
|
||||
}
|
||||
|
@ -986,7 +986,7 @@ void GSRasterizer::DrawEdge(const GSVertexSW& v0, const GSVertexSW& v1, const GS
|
|||
{
|
||||
AddScanline(e, 1, xi, top, edge);
|
||||
|
||||
e->t.U32[3] = xf;
|
||||
e->p.U32[0] = xf;
|
||||
|
||||
e++;
|
||||
}
|
||||
|
@ -1053,7 +1053,7 @@ void GSRasterizer::DrawEdge(const GSVertexSW& v0, const GSVertexSW& v1, const GS
|
|||
{
|
||||
AddScanline(e, 1, left, yi, edge);
|
||||
|
||||
e->t.U32[3] = (0x10000 - yf) & 0xffff;
|
||||
e->p.U32[0] = (0x10000 - yf) & 0xffff;
|
||||
|
||||
e++;
|
||||
}
|
||||
|
@ -1076,7 +1076,7 @@ void GSRasterizer::DrawEdge(const GSVertexSW& v0, const GSVertexSW& v1, const GS
|
|||
{
|
||||
AddScanline(e, 1, left, yi, edge);
|
||||
|
||||
e->t.U32[3] = yf;
|
||||
e->p.U32[0] = yf;
|
||||
|
||||
e++;
|
||||
}
|
||||
|
|
|
@ -198,74 +198,6 @@ void GSRendererSW::ConvertVertexBuffer(GSVertexSW* RESTRICT dst, const GSVertex*
|
|||
{
|
||||
// FIXME q_div wasn't added to AVX2 code path.
|
||||
|
||||
#if 0 //_M_SSE >= 0x501
|
||||
|
||||
// TODO: something isn't right here, this makes other functions slower (split load/store? old sse code in 3rd party lib?)
|
||||
|
||||
GSVector8i o2((GSVector4i)m_context->XYOFFSET);
|
||||
GSVector8 tsize2(GSVector4(0x10000 << m_context->TEX0.TW, 0x10000 << m_context->TEX0.TH, 1, 0));
|
||||
|
||||
for(int i = (int)m_vertex.next; i > 0; i -= 2, src += 2, dst += 2) // ok to overflow, allocator makes sure there is one more dummy vertex
|
||||
{
|
||||
GSVector8i v0 = GSVector8i::load<true>(src[0].m);
|
||||
GSVector8i v1 = GSVector8i::load<true>(src[1].m);
|
||||
|
||||
GSVector8 stcq = GSVector8::cast(v0.ac(v1));
|
||||
GSVector8i xyzuvf = v0.bd(v1);
|
||||
|
||||
//GSVector8 stcq = GSVector8::load(&src[0].m[0], &src[1].m[0]);
|
||||
//GSVector8i xyzuvf = GSVector8i::load(&src[0].m[1], &src[1].m[1]);
|
||||
|
||||
GSVector8i xy = xyzuvf.upl16() - o2;
|
||||
GSVector8i zf = xyzuvf.ywww().min_u32(GSVector8i::xffffff00());
|
||||
|
||||
GSVector8 p = GSVector8(xy).xyxy(GSVector8(zf) + (GSVector8::m_x4f800000 & GSVector8::cast(zf.sra32(31)))) * m_pos_scale2;
|
||||
GSVector8 c = GSVector8(GSVector8i::cast(stcq).uph8().upl16() << 7);
|
||||
|
||||
GSVector8 t = GSVector8::zero();
|
||||
|
||||
if(tme)
|
||||
{
|
||||
if(fst)
|
||||
{
|
||||
t = GSVector8(xyzuvf.uph16() << (16 - 4));
|
||||
}
|
||||
else
|
||||
{
|
||||
t = stcq.xyww() * tsize2;
|
||||
}
|
||||
}
|
||||
|
||||
if(primclass == GS_SPRITE_CLASS)
|
||||
{
|
||||
t = t.insert32<1, 3>(GSVector8::cast(xyzuvf));
|
||||
}
|
||||
|
||||
GSVector8::storel(&dst[0].p, p);
|
||||
|
||||
if(tme || primclass == GS_SPRITE_CLASS)
|
||||
{
|
||||
GSVector8::store<true>(&dst[0].t, t.ac(c));
|
||||
}
|
||||
else
|
||||
{
|
||||
GSVector8::storel(&dst[0].c, c);
|
||||
}
|
||||
|
||||
GSVector8::storeh(&dst[1].p, p);
|
||||
|
||||
if(tme || primclass == GS_SPRITE_CLASS)
|
||||
{
|
||||
GSVector8::store<true>(&dst[1].t, t.bd(c));
|
||||
}
|
||||
else
|
||||
{
|
||||
GSVector8::storeh(&dst[1].c, c);
|
||||
}
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
GSVector4i off = (GSVector4i)m_context->XYOFFSET;
|
||||
GSVector4 tsize = GSVector4(0x10000 << m_context->TEX0.TW, 0x10000 << m_context->TEX0.TH, 1, 0);
|
||||
GSVector4i z_max = GSVector4i::xffffffff().srl32(GSLocalMemory::m_psm[m_context->ZBUF.PSM].fmt * 8);
|
||||
|
@ -277,9 +209,7 @@ void GSRendererSW::ConvertVertexBuffer(GSVertexSW* RESTRICT dst, const GSVertex*
|
|||
GSVector4i xyzuvf(src->m[1]);
|
||||
|
||||
GSVector4i xy = xyzuvf.upl16() - off;
|
||||
GSVector4i zf = xyzuvf.ywww().min_u32(GSVector4i::xffffff00());
|
||||
|
||||
dst->p = GSVector4(xy).xyxy(GSVector4(zf) + (GSVector4::m_x4f800000 & GSVector4::cast(zf.sra32(31)))) * m_pos_scale;
|
||||
dst->c = GSVector4(GSVector4i::cast(stcq).zzzz().u8to32() << 7);
|
||||
|
||||
GSVector4 t = GSVector4::zero();
|
||||
|
@ -311,11 +241,19 @@ void GSRendererSW::ConvertVertexBuffer(GSVertexSW* RESTRICT dst, const GSVertex*
|
|||
}
|
||||
}
|
||||
|
||||
if (primclass == GS_SPRITE_CLASS || m_vt.m_eq.z)
|
||||
if (primclass == GS_SPRITE_CLASS)
|
||||
{
|
||||
dst->p = GSVector4(xy).xyyw(GSVector4(xyzuvf)) * m_pos_scale;
|
||||
|
||||
xyzuvf = xyzuvf.min_u32(z_max);
|
||||
t = t.insert32<1, 3>(GSVector4::cast(xyzuvf));
|
||||
}
|
||||
else
|
||||
{
|
||||
float z = static_cast<float>(static_cast<u32>(xyzuvf.extract32<1>()));
|
||||
dst->p = (GSVector4(xy) * m_pos_scale).upld(GSVector4(z, 0.0, 0.0, 0.0));
|
||||
t = t.blend32<8>(GSVector4(xyzuvf << 7));
|
||||
}
|
||||
|
||||
dst->t = t;
|
||||
|
||||
|
@ -325,8 +263,6 @@ void GSRendererSW::ConvertVertexBuffer(GSVertexSW* RESTRICT dst, const GSVertex*
|
|||
|
||||
#endif
|
||||
}
|
||||
|
||||
#endif
|
||||
}
|
||||
|
||||
void GSRendererSW::Draw()
|
||||
|
@ -1352,7 +1288,6 @@ bool GSRendererSW::GetScanlineGlobalData(SharedData* data)
|
|||
|
||||
gd.sel.zpsm = GSLocalMemory::m_psm[context->ZBUF.PSM].fmt;
|
||||
gd.sel.ztst = ztest ? context->TEST.ZTST : (int)ZTST_ALWAYS;
|
||||
gd.sel.zequal = !!m_vt.m_eq.z;
|
||||
gd.sel.zoverflow = (u32)GSVector4i(m_vt.m_max.p).z == 0x80000000U;
|
||||
gd.sel.zclamp = (u32)GSVector4i(m_vt.m_max.p).z > z_max;
|
||||
}
|
||||
|
|
|
@ -64,7 +64,6 @@ union GSScanlineSelector
|
|||
u32 mmin : 2; // 54
|
||||
u32 notest : 1; // 55 (no ztest, no atest, no date, no scissor test, and horizontally aligned to 4 pixels)
|
||||
// TODO: 1D texture flag? could save 2 texture reads and 4 lerps with bilinear, and also the texture coordinate clamp/wrap code in one direction
|
||||
u32 zequal : 1; // 56
|
||||
u32 breakpoint : 1; // Insert a trap to stop the program, helpful to stop debugger on a program
|
||||
};
|
||||
|
||||
|
|
|
@ -82,6 +82,19 @@ void GSSetupPrimCodeGenerator2::broadcastf128(const XYm& reg, const Address& mem
|
|||
#endif
|
||||
}
|
||||
|
||||
void GSSetupPrimCodeGenerator2::broadcastss(const XYm& reg, const Address& mem)
|
||||
{
|
||||
if (hasAVX)
|
||||
{
|
||||
vbroadcastss(reg, mem);
|
||||
}
|
||||
else
|
||||
{
|
||||
movss(reg, mem);
|
||||
shufps(reg, reg, _MM_SHUFFLE(0, 0, 0, 0));
|
||||
}
|
||||
}
|
||||
|
||||
void GSSetupPrimCodeGenerator2::Generate()
|
||||
{
|
||||
// Technically we just need the delta < 2GB
|
||||
|
@ -152,16 +165,10 @@ void GSSetupPrimCodeGenerator2::Depth_XMM()
|
|||
|
||||
if (m_sel.prim != GS_SPRITE_CLASS)
|
||||
{
|
||||
// GSVector4 p = dscan.p;
|
||||
|
||||
|
||||
movaps(xmm0, ptr[_dscan + offsetof(GSVertexSW, p)]);
|
||||
|
||||
if (m_en.f)
|
||||
{
|
||||
// GSVector4 df = p.wwww();
|
||||
|
||||
THREEARG(shufps, xmm1, xmm0, xmm0, _MM_SHUFFLE(3, 3, 3, 3));
|
||||
// GSVector4 df = t.wwww();
|
||||
broadcastss(xym1, ptr[_dscan + offsetof(GSVertexSW, t.w)]);
|
||||
|
||||
// m_local.d4.f = GSVector4i(df * 4.0f).xxzzlh();
|
||||
|
||||
|
@ -185,38 +192,21 @@ void GSSetupPrimCodeGenerator2::Depth_XMM()
|
|||
|
||||
if (m_en.z)
|
||||
{
|
||||
if (m_sel.zequal)
|
||||
// GSVector4 dz = p.zzzz();
|
||||
|
||||
broadcastss(xym0, ptr[_dscan + offsetof(GSVertexSW, p.z)]);
|
||||
|
||||
// m_local.d4.z = dz * 4.0f;
|
||||
|
||||
THREEARG(mulps, xmm1, xmm0, xmm3);
|
||||
movdqa(_rip_local_d_p(z), xmm1);
|
||||
|
||||
for (int i = 0; i < (m_sel.notest ? 1 : 4); i++)
|
||||
{
|
||||
u32 offset = 0;
|
||||
if (m_sel.prim != GS_POINT_CLASS)
|
||||
offset = sizeof(u32) * 1;
|
||||
// m_local.d[i].z = dz * m_shift[i];
|
||||
|
||||
mov(eax, ptr[_index + offset]);
|
||||
shl(eax, 6); // * sizeof(GSVertexSW)
|
||||
add(rax, _64_vertex);
|
||||
|
||||
movdqa(xmm0, ptr[rax + offsetof(GSVertexSW, t)]);
|
||||
pshufd(xmm0, xmm0, _MM_SHUFFLE(3, 3, 3, 3));
|
||||
movdqa(_rip_local(p.z), xmm0);
|
||||
}
|
||||
else
|
||||
{
|
||||
// GSVector4 dz = p.zzzz();
|
||||
|
||||
shufps(xmm0, xmm0, _MM_SHUFFLE(2, 2, 2, 2));
|
||||
|
||||
// m_local.d4.z = dz * 4.0f;
|
||||
|
||||
THREEARG(mulps, xmm1, xmm0, xmm3);
|
||||
movdqa(_rip_local_d_p(z), xmm1);
|
||||
|
||||
for (int i = 0; i < (m_sel.notest ? 1 : 4); i++)
|
||||
{
|
||||
// m_local.d[i].z = dz * m_shift[i];
|
||||
|
||||
THREEARG(mulps, xmm1, xmm0, XYm(4 + i));
|
||||
movdqa(_rip_local(d[i].z), xmm1);
|
||||
}
|
||||
THREEARG(mulps, xmm1, xmm0, XYm(4 + i));
|
||||
movdqa(_rip_local(d[i].z), xmm1);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -259,68 +249,19 @@ void GSSetupPrimCodeGenerator2::Depth_YMM()
|
|||
|
||||
if (m_sel.prim != GS_SPRITE_CLASS)
|
||||
{
|
||||
// GSVector4 dp8 = dscan.p * GSVector4::broadcast32(&shift[0]);
|
||||
|
||||
broadcastf128(xym0, ptr[_dscan + offsetof(GSVertexSW, p)]);
|
||||
|
||||
vmulps(ymm1, ymm0, ymm3);
|
||||
|
||||
if (m_en.z)
|
||||
{
|
||||
if (m_sel.zequal)
|
||||
{
|
||||
u32 offset = 0;
|
||||
if (m_sel.prim != GS_POINT_CLASS)
|
||||
offset = sizeof(u32) * 1;
|
||||
|
||||
mov(eax, ptr[_index + offset]);
|
||||
shl(eax, 6); // * sizeof(GSVertexSW)
|
||||
add(rax, _64_vertex);
|
||||
|
||||
mov(t1.cvt32(), ptr[rax + offsetof(GSVertexSW, t.w)]);
|
||||
mov(_rip_local(p.z), t1.cvt32());
|
||||
}
|
||||
else
|
||||
{
|
||||
// m_local.d8.p.z = dp8.extract32<2>();
|
||||
|
||||
extractps(_rip_local_d_p(z), xmm1, 2);
|
||||
|
||||
// GSVector8 dz = GSVector8(dscan.p).zzzz();
|
||||
|
||||
vshufps(ymm2, ymm0, ymm0, _MM_SHUFFLE(2, 2, 2, 2));
|
||||
}
|
||||
}
|
||||
|
||||
if (m_en.f)
|
||||
{
|
||||
// m_local.d8.p.f = GSVector4i(dp8).extract32<3>();
|
||||
// GSVector8 df = GSVector8::broadcast32(&dscan.t.w);
|
||||
vbroadcastss(ymm1, ptr[_dscan + offsetof(GSVertexSW, t.w)]);
|
||||
|
||||
cvtps2dq(ymm1, ymm1);
|
||||
pextrd(_rip_local_d_p(f), xmm1, 3);
|
||||
// local.d8.p.f = GSVector4i(tstep).extract32<3>();
|
||||
vmulps(xmm0, xmm1, xmm3);
|
||||
cvtps2dq(xmm0, xmm0);
|
||||
movd(_rip_local_d_p(f), xmm0);
|
||||
|
||||
// GSVector8 df = GSVector8(dscan.p).wwww();
|
||||
|
||||
vshufps(ymm1, ymm0, ymm0, _MM_SHUFFLE(3, 3, 3, 3));
|
||||
}
|
||||
|
||||
for (int i = 0; i < (m_sel.notest ? 1 : dsize); i++)
|
||||
{
|
||||
if (m_en.z)
|
||||
for (int i = 0; i < (m_sel.notest ? 1 : dsize); i++)
|
||||
{
|
||||
// m_local.d[i].z = dz * shift[1 + i];
|
||||
|
||||
// Save a byte in the encoding for ymm8-11 by swapping with ymm2 (multiplication is commutative)
|
||||
if (i < 4 || many_regs)
|
||||
vmulps(ymm0, Ymm(4 + i), ymm2);
|
||||
else
|
||||
vmulps(ymm0, ymm2, ptr[g_const->m_shift_256b[i + 1]]);
|
||||
movaps(_rip_local(d[i].z), ymm0);
|
||||
}
|
||||
|
||||
if (m_en.f)
|
||||
{
|
||||
// m_local.d[i].f = GSVector8i(df * m_shift[i]).xxzzlh();
|
||||
// m_local.d[i].f = GSVectorI(df * m_shift[i]).xxzzlh();
|
||||
|
||||
if (i < 4 || many_regs)
|
||||
vmulps(ymm0, Ymm(4 + i), ymm1);
|
||||
|
@ -332,6 +273,28 @@ void GSSetupPrimCodeGenerator2::Depth_YMM()
|
|||
movdqa(_rip_local(d[i].f), ymm0);
|
||||
}
|
||||
}
|
||||
|
||||
if (m_en.z)
|
||||
{
|
||||
// const VectorF dz = VectorF::broadcast32(&dscan.p.z);
|
||||
vbroadcastss(ymm0, ptr[_dscan + offsetof(GSVertexSW, p.z)]);
|
||||
|
||||
// local.d8.p.z = (dz.extract<0>() * step_shift).extract32<0>();
|
||||
vmulss(xmm1, xmm0, xmm3);
|
||||
movss(_rip_local_d_p(z), xmm1);
|
||||
|
||||
for (int i = 0; i < (m_sel.notest ? 1 : dsize); i++)
|
||||
{
|
||||
// m_local.d[i].z = dz * shift[1 + i];
|
||||
|
||||
// Save a byte in the encoding for ymm8-11 by swapping with ymm0 (multiplication is commutative)
|
||||
if (i < 4 || many_regs)
|
||||
vmulps(ymm1, Ymm(4 + i), ymm0);
|
||||
else
|
||||
vmulps(ymm1, ymm0, ptr[g_const->m_shift_256b[i + 1]]);
|
||||
movaps(_rip_local(d[i].z), ymm1);
|
||||
}
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
|
|
|
@ -69,6 +69,8 @@ public:
|
|||
private:
|
||||
/// Broadcast 128 bits of floats from memory to the whole register, whatever size that register might be
|
||||
void broadcastf128(const XYm& reg, const Xbyak::Address& mem);
|
||||
/// Broadcast a 32-bit float to the whole register, whatever size that register might be
|
||||
void broadcastss(const XYm& reg, const Xbyak::Address& mem);
|
||||
|
||||
void Depth_XMM();
|
||||
void Depth_YMM();
|
||||
|
|
Loading…
Reference in New Issue