mirror of https://github.com/PCSX2/pcsx2.git
gsdx sw x64: update setup prim generator x64 SSE&AVX
This commit is contained in:
parent
4a47224ac1
commit
8e29e09943
|
@ -29,18 +29,20 @@ using namespace Xbyak;
|
||||||
|
|
||||||
void GSSetupPrimCodeGenerator::Generate()
|
void GSSetupPrimCodeGenerator::Generate()
|
||||||
{
|
{
|
||||||
|
#ifdef _WIN64
|
||||||
sub(rsp, 8 + 2 * 16);
|
sub(rsp, 8 + 2 * 16);
|
||||||
|
|
||||||
vmovdqa(ptr[rsp + 0], xmm6);
|
vmovdqa(ptr[rsp + 0], xmm6);
|
||||||
vmovdqa(ptr[rsp + 16], xmm7);
|
vmovdqa(ptr[rsp + 16], xmm7);
|
||||||
|
#endif
|
||||||
|
|
||||||
mov(r8, (size_t)&m_local);
|
mov(t0, (size_t)&m_local);
|
||||||
|
|
||||||
if((m_en.z || m_en.f) && m_sel.prim != GS_SPRITE_CLASS || m_en.t || m_en.c && m_sel.iip)
|
if((m_en.z || m_en.f) && m_sel.prim != GS_SPRITE_CLASS || m_en.t || m_en.c && m_sel.iip)
|
||||||
{
|
{
|
||||||
mov(rax, (size_t)&m_shift[0]);
|
mov(rax, (size_t)&m_shift[0]);
|
||||||
|
|
||||||
for(int i = 0; i < 5; i++)
|
for(int i = 0; i < (m_sel.notest ? 2 : 5); i++)
|
||||||
{
|
{
|
||||||
vmovaps(Xmm(3 + i), ptr[rax + i * 16]);
|
vmovaps(Xmm(3 + i), ptr[rax + i * 16]);
|
||||||
}
|
}
|
||||||
|
@ -52,10 +54,12 @@ void GSSetupPrimCodeGenerator::Generate()
|
||||||
|
|
||||||
Color();
|
Color();
|
||||||
|
|
||||||
|
#ifdef _WIN64
|
||||||
vmovdqa(xmm6, ptr[rsp + 0]);
|
vmovdqa(xmm6, ptr[rsp + 0]);
|
||||||
vmovdqa(xmm7, ptr[rsp + 16]);
|
vmovdqa(xmm7, ptr[rsp + 16]);
|
||||||
|
|
||||||
add(rsp, 8 + 2 * 16);
|
add(rsp, 8 + 2 * 16);
|
||||||
|
#endif
|
||||||
|
|
||||||
ret();
|
ret();
|
||||||
}
|
}
|
||||||
|
@ -71,7 +75,7 @@ void GSSetupPrimCodeGenerator::Depth()
|
||||||
{
|
{
|
||||||
// GSVector4 p = dscan.p;
|
// GSVector4 p = dscan.p;
|
||||||
|
|
||||||
vmovaps(xmm0, ptr[rdx + offsetof(GSVertexSW, p)]);
|
vmovaps(xmm0, ptr[a2 + offsetof(GSVertexSW, p)]);
|
||||||
|
|
||||||
if(m_en.f)
|
if(m_en.f)
|
||||||
{
|
{
|
||||||
|
@ -85,9 +89,9 @@ void GSSetupPrimCodeGenerator::Depth()
|
||||||
vcvttps2dq(xmm2, xmm2);
|
vcvttps2dq(xmm2, xmm2);
|
||||||
vpshuflw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0));
|
vpshuflw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0));
|
||||||
vpshufhw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0));
|
vpshufhw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0));
|
||||||
vmovdqa(ptr[r8 + offsetof(GSScanlineLocalData, d4.f)], xmm2);
|
vmovdqa(ptr[t0 + offsetof(GSScanlineLocalData, d4.f)], xmm2);
|
||||||
|
|
||||||
for(int i = 0; i < 4; i++)
|
for(int i = 0; i < (m_sel.notest ? 1 : 4); i++)
|
||||||
{
|
{
|
||||||
// m_local.d[i].f = GSVector4i(df * m_shift[i]).xxzzlh();
|
// m_local.d[i].f = GSVector4i(df * m_shift[i]).xxzzlh();
|
||||||
|
|
||||||
|
@ -97,7 +101,7 @@ void GSSetupPrimCodeGenerator::Depth()
|
||||||
vpshufhw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0));
|
vpshufhw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0));
|
||||||
|
|
||||||
const size_t variableOffset = offsetof(GSScanlineLocalData, d[0].f) + (i * sizeof(GSScanlineLocalData::d[0]));
|
const size_t variableOffset = offsetof(GSScanlineLocalData, d[0].f) + (i * sizeof(GSScanlineLocalData::d[0]));
|
||||||
vmovdqa(ptr[r8 + variableOffset], xmm2);
|
vmovdqa(ptr[t0 + variableOffset], xmm2);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -110,24 +114,28 @@ void GSSetupPrimCodeGenerator::Depth()
|
||||||
// m_local.d4.z = dz * 4.0f;
|
// m_local.d4.z = dz * 4.0f;
|
||||||
|
|
||||||
vmulps(xmm1, xmm0, xmm3);
|
vmulps(xmm1, xmm0, xmm3);
|
||||||
vmovdqa(ptr[r8 + offsetof(GSScanlineLocalData, d4.z)], xmm1);
|
vmovdqa(ptr[t0 + offsetof(GSScanlineLocalData, d4.z)], xmm1);
|
||||||
|
|
||||||
for(int i = 0; i < 4; i++)
|
for(int i = 0; i < (m_sel.notest ? 1 : 4); i++)
|
||||||
{
|
{
|
||||||
// m_local.d[i].z = dz * m_shift[i];
|
// m_local.d[i].z = dz * m_shift[i];
|
||||||
|
|
||||||
vmulps(xmm1, xmm0, Xmm(4 + i));
|
vmulps(xmm1, xmm0, Xmm(4 + i));
|
||||||
|
|
||||||
const size_t variableOffset = offsetof(GSScanlineLocalData, d[0].z) + (i * sizeof(GSScanlineLocalData::d[0]));
|
const size_t variableOffset = offsetof(GSScanlineLocalData, d[0].z) + (i * sizeof(GSScanlineLocalData::d[0]));
|
||||||
vmovdqa(ptr[r8 + variableOffset], xmm1);
|
vmovdqa(ptr[t0 + variableOffset], xmm1);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
// GSVector4 p = vertices[0].p;
|
// GSVector4 p = vertex[index[1]].p;
|
||||||
|
|
||||||
vmovaps(xmm0, ptr[rcx + offsetof(GSVertexSW, p)]);
|
mov(eax, ptr[a1 + sizeof(uint32) * 1]);
|
||||||
|
shl(eax, 6); // * sizeof(GSVertexSW)
|
||||||
|
add(rax, a0);
|
||||||
|
|
||||||
|
vmovaps(xmm0, ptr[rax + offsetof(GSVertexSW, p)]);
|
||||||
|
|
||||||
if(m_en.f)
|
if(m_en.f)
|
||||||
{
|
{
|
||||||
|
@ -136,41 +144,16 @@ void GSSetupPrimCodeGenerator::Depth()
|
||||||
vcvttps2dq(xmm1, xmm0);
|
vcvttps2dq(xmm1, xmm0);
|
||||||
vpshufhw(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2));
|
vpshufhw(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2));
|
||||||
vpshufd(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2));
|
vpshufd(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2));
|
||||||
vmovdqa(ptr[r8 + offsetof(GSScanlineLocalData, p.f)], xmm1);
|
vmovdqa(ptr[t0 + offsetof(GSScanlineLocalData, p.f)], xmm1);
|
||||||
}
|
}
|
||||||
|
|
||||||
if(m_en.z)
|
if(m_en.z)
|
||||||
{
|
{
|
||||||
// GSVector4 z = p.zzzz();
|
// uint32 z is bypassed in t.w
|
||||||
|
|
||||||
vshufps(xmm0, xmm0, _MM_SHUFFLE(2, 2, 2, 2));
|
vmovdqa(xmm0, ptr[rax + offsetof(GSVertexSW, t)]);
|
||||||
|
vpshufd(xmm0, xmm0, _MM_SHUFFLE(3, 3, 3, 3));
|
||||||
if(m_sel.zoverflow)
|
vmovdqa(ptr[t0 + offsetof(GSScanlineLocalData, p.z)], xmm0);
|
||||||
{
|
|
||||||
// m_local.p.z = (GSVector4i(z * 0.5f) << 1) | (GSVector4i(z) & GSVector4i::x00000001());
|
|
||||||
|
|
||||||
mov(r9, (size_t)&GSVector4::m_half);
|
|
||||||
|
|
||||||
vbroadcastss(xmm1, ptr[r9]);
|
|
||||||
vmulps(xmm1, xmm0);
|
|
||||||
vcvttps2dq(xmm1, xmm1);
|
|
||||||
vpslld(xmm1, 1);
|
|
||||||
|
|
||||||
vcvttps2dq(xmm0, xmm0);
|
|
||||||
vpcmpeqd(xmm2, xmm2);
|
|
||||||
vpsrld(xmm2, 31);
|
|
||||||
vpand(xmm0, xmm2);
|
|
||||||
|
|
||||||
vpor(xmm0, xmm1);
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
// m_local.p.z = GSVector4i(z);
|
|
||||||
|
|
||||||
vcvttps2dq(xmm0, xmm0);
|
|
||||||
}
|
|
||||||
|
|
||||||
vmovdqa(ptr[r8 + offsetof(GSScanlineLocalData, p.z)], xmm0);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -184,7 +167,7 @@ void GSSetupPrimCodeGenerator::Texture()
|
||||||
|
|
||||||
// GSVector4 t = dscan.t;
|
// GSVector4 t = dscan.t;
|
||||||
|
|
||||||
vmovaps(xmm0, ptr[rdx + offsetof(GSVertexSW, t)]);
|
vmovaps(xmm0, ptr[a2 + offsetof(GSVertexSW, t)]);
|
||||||
|
|
||||||
vmulps(xmm1, xmm0, xmm3);
|
vmulps(xmm1, xmm0, xmm3);
|
||||||
|
|
||||||
|
@ -194,13 +177,13 @@ void GSSetupPrimCodeGenerator::Texture()
|
||||||
|
|
||||||
vcvttps2dq(xmm1, xmm1);
|
vcvttps2dq(xmm1, xmm1);
|
||||||
|
|
||||||
vmovdqa(ptr[r8 + offsetof(GSScanlineLocalData, d4.stq)], xmm1);
|
vmovdqa(ptr[t0 + offsetof(GSScanlineLocalData, d4.stq)], xmm1);
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
// m_local.d4.stq = t * 4.0f;
|
// m_local.d4.stq = t * 4.0f;
|
||||||
|
|
||||||
vmovaps(ptr[r8 + offsetof(GSScanlineLocalData, d4.stq)], xmm1);
|
vmovaps(ptr[t0 + offsetof(GSScanlineLocalData, d4.stq)], xmm1);
|
||||||
}
|
}
|
||||||
|
|
||||||
for(int j = 0, k = m_sel.fst ? 2 : 3; j < k; j++)
|
for(int j = 0, k = m_sel.fst ? 2 : 3; j < k; j++)
|
||||||
|
@ -211,7 +194,7 @@ void GSSetupPrimCodeGenerator::Texture()
|
||||||
|
|
||||||
vshufps(xmm1, xmm0, xmm0, (uint8)_MM_SHUFFLE(j, j, j, j));
|
vshufps(xmm1, xmm0, xmm0, (uint8)_MM_SHUFFLE(j, j, j, j));
|
||||||
|
|
||||||
for(int i = 0; i < 4; i++)
|
for(int i = 0; i < (m_sel.notest ? 1 : 4); i++)
|
||||||
{
|
{
|
||||||
// GSVector4 v = ds/dt * m_shift[i];
|
// GSVector4 v = ds/dt * m_shift[i];
|
||||||
|
|
||||||
|
@ -228,8 +211,8 @@ void GSSetupPrimCodeGenerator::Texture()
|
||||||
|
|
||||||
switch(j)
|
switch(j)
|
||||||
{
|
{
|
||||||
case 0: vmovdqa(ptr[r8 + variableOffsetS], xmm2); break;
|
case 0: vmovdqa(ptr[t0 + variableOffsetS], xmm2); break;
|
||||||
case 1: vmovdqa(ptr[r8 + variableOffsetT], xmm2); break;
|
case 1: vmovdqa(ptr[t0 + variableOffsetT], xmm2); break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
|
@ -242,9 +225,9 @@ void GSSetupPrimCodeGenerator::Texture()
|
||||||
|
|
||||||
switch(j)
|
switch(j)
|
||||||
{
|
{
|
||||||
case 0: vmovaps(ptr[r8 + variableOffsetS], xmm2); break;
|
case 0: vmovaps(ptr[t0 + variableOffsetS], xmm2); break;
|
||||||
case 1: vmovaps(ptr[r8 + variableOffsetT], xmm2); break;
|
case 1: vmovaps(ptr[t0 + variableOffsetT], xmm2); break;
|
||||||
case 2: vmovaps(ptr[r8 + variableOffsetQ], xmm2); break;
|
case 2: vmovaps(ptr[t0 + variableOffsetQ], xmm2); break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -262,7 +245,7 @@ void GSSetupPrimCodeGenerator::Color()
|
||||||
{
|
{
|
||||||
// GSVector4 c = dscan.c;
|
// GSVector4 c = dscan.c;
|
||||||
|
|
||||||
vmovaps(xmm0, ptr[rdx + offsetof(GSVertexSW, c)]);
|
vmovaps(xmm0, ptr[a2 + offsetof(GSVertexSW, c)]);
|
||||||
|
|
||||||
// m_local.d4.c = GSVector4i(c * 4.0f).xzyw().ps32();
|
// m_local.d4.c = GSVector4i(c * 4.0f).xzyw().ps32();
|
||||||
|
|
||||||
|
@ -270,7 +253,7 @@ void GSSetupPrimCodeGenerator::Color()
|
||||||
vcvttps2dq(xmm1, xmm1);
|
vcvttps2dq(xmm1, xmm1);
|
||||||
vpshufd(xmm1, xmm1, _MM_SHUFFLE(3, 1, 2, 0));
|
vpshufd(xmm1, xmm1, _MM_SHUFFLE(3, 1, 2, 0));
|
||||||
vpackssdw(xmm1, xmm1);
|
vpackssdw(xmm1, xmm1);
|
||||||
vmovdqa(ptr[r8 + offsetof(GSScanlineLocalData, d4.c)], xmm1);
|
vmovdqa(ptr[t0 + offsetof(GSScanlineLocalData, d4.c)], xmm1);
|
||||||
|
|
||||||
// xmm3 is not needed anymore
|
// xmm3 is not needed anymore
|
||||||
|
|
||||||
|
@ -299,12 +282,12 @@ void GSSetupPrimCodeGenerator::Color()
|
||||||
vpunpcklwd(xmm0, xmm1);
|
vpunpcklwd(xmm0, xmm1);
|
||||||
|
|
||||||
const size_t variableOffset = offsetof(GSScanlineLocalData, d[0].rb) + (i * sizeof(GSScanlineLocalData::d[0]));
|
const size_t variableOffset = offsetof(GSScanlineLocalData, d[0].rb) + (i * sizeof(GSScanlineLocalData::d[0]));
|
||||||
vmovdqa(ptr[r8 + variableOffset], xmm0);
|
vmovdqa(ptr[t0 + variableOffset], xmm0);
|
||||||
}
|
}
|
||||||
|
|
||||||
// GSVector4 c = dscan.c;
|
// GSVector4 c = dscan.c;
|
||||||
|
|
||||||
vmovaps(xmm0, ptr[rdx + offsetof(GSVertexSW, c)]); // not enough regs, have to reload it
|
vmovaps(xmm0, ptr[a2 + offsetof(GSVertexSW, c)]); // not enough regs, have to reload it
|
||||||
|
|
||||||
// GSVector4 dg = c.yyyy();
|
// GSVector4 dg = c.yyyy();
|
||||||
// GSVector4 da = c.wwww();
|
// GSVector4 da = c.wwww();
|
||||||
|
@ -312,7 +295,7 @@ void GSSetupPrimCodeGenerator::Color()
|
||||||
vshufps(xmm2, xmm0, xmm0, _MM_SHUFFLE(1, 1, 1, 1));
|
vshufps(xmm2, xmm0, xmm0, _MM_SHUFFLE(1, 1, 1, 1));
|
||||||
vshufps(xmm3, xmm0, xmm0, _MM_SHUFFLE(3, 3, 3, 3));
|
vshufps(xmm3, xmm0, xmm0, _MM_SHUFFLE(3, 3, 3, 3));
|
||||||
|
|
||||||
for(int i = 0; i < 4; i++)
|
for(int i = 0; i < (m_sel.notest ? 1 : 4); i++)
|
||||||
{
|
{
|
||||||
// GSVector4i g = GSVector4i(dg * m_shift[i]).ps32();
|
// GSVector4i g = GSVector4i(dg * m_shift[i]).ps32();
|
||||||
|
|
||||||
|
@ -331,14 +314,31 @@ void GSSetupPrimCodeGenerator::Color()
|
||||||
vpunpcklwd(xmm0, xmm1);
|
vpunpcklwd(xmm0, xmm1);
|
||||||
|
|
||||||
const size_t variableOffset = offsetof(GSScanlineLocalData, d[0].ga) + (i * sizeof(GSScanlineLocalData::d[0]));
|
const size_t variableOffset = offsetof(GSScanlineLocalData, d[0].ga) + (i * sizeof(GSScanlineLocalData::d[0]));
|
||||||
vmovdqa(ptr[r8 + variableOffset], xmm0);
|
vmovdqa(ptr[t0 + variableOffset], xmm0);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
// GSVector4i c = GSVector4i(vertices[0].c);
|
// GSVector4i c = GSVector4i(vertex[index[last].c);
|
||||||
|
|
||||||
vcvttps2dq(xmm0, ptr[rcx + offsetof(GSVertexSW, c)]);
|
int last = 0;
|
||||||
|
|
||||||
|
switch(m_sel.prim)
|
||||||
|
{
|
||||||
|
case GS_POINT_CLASS: last = 0; break;
|
||||||
|
case GS_LINE_CLASS: last = 1; break;
|
||||||
|
case GS_TRIANGLE_CLASS: last = 2; break;
|
||||||
|
case GS_SPRITE_CLASS: last = 1; break;
|
||||||
|
}
|
||||||
|
|
||||||
|
if(!(m_sel.prim == GS_SPRITE_CLASS && (m_en.z || m_en.f))) // if this is a sprite, the last vertex was already loaded in Depth()
|
||||||
|
{
|
||||||
|
mov(eax, ptr[a1 + sizeof(uint32) * last]);
|
||||||
|
shl(eax, 6); // * sizeof(GSVertexSW)
|
||||||
|
add(rax, a0);
|
||||||
|
}
|
||||||
|
|
||||||
|
vcvttps2dq(xmm0, ptr[rax + offsetof(GSVertexSW, c)]);
|
||||||
|
|
||||||
// c = c.upl16(c.zwxy());
|
// c = c.upl16(c.zwxy());
|
||||||
|
|
||||||
|
@ -358,8 +358,8 @@ void GSSetupPrimCodeGenerator::Color()
|
||||||
vpshufd(xmm1, xmm0, _MM_SHUFFLE(0, 0, 0, 0));
|
vpshufd(xmm1, xmm0, _MM_SHUFFLE(0, 0, 0, 0));
|
||||||
vpshufd(xmm2, xmm0, _MM_SHUFFLE(2, 2, 2, 2));
|
vpshufd(xmm2, xmm0, _MM_SHUFFLE(2, 2, 2, 2));
|
||||||
|
|
||||||
vmovdqa(ptr[r8 + offsetof(GSScanlineLocalData, c.rb)], xmm1);
|
vmovdqa(ptr[t0 + offsetof(GSScanlineLocalData, c.rb)], xmm1);
|
||||||
vmovdqa(ptr[r8 + offsetof(GSScanlineLocalData, c.ga)], xmm2);
|
vmovdqa(ptr[t0 + offsetof(GSScanlineLocalData, c.ga)], xmm2);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -29,16 +29,20 @@ using namespace Xbyak;
|
||||||
|
|
||||||
void GSSetupPrimCodeGenerator::Generate()
|
void GSSetupPrimCodeGenerator::Generate()
|
||||||
{
|
{
|
||||||
|
#ifdef _WIN64
|
||||||
sub(rsp, 8 + 2 * 16);
|
sub(rsp, 8 + 2 * 16);
|
||||||
|
|
||||||
vmovdqa(ptr[rsp + 0], xmm6);
|
vmovdqa(ptr[rsp + 0], xmm6);
|
||||||
vmovdqa(ptr[rsp + 16], xmm7);
|
vmovdqa(ptr[rsp + 16], xmm7);
|
||||||
|
#endif
|
||||||
|
|
||||||
mov(r8, (size_t)&m_local);
|
mov(t0, (size_t)&m_local);
|
||||||
|
|
||||||
if((m_en.z || m_en.f) && m_sel.prim != GS_SPRITE_CLASS || m_en.t || m_en.c && m_sel.iip)
|
if((m_en.z || m_en.f) && m_sel.prim != GS_SPRITE_CLASS || m_en.t || m_en.c && m_sel.iip)
|
||||||
{
|
{
|
||||||
for(int i = 0; i < 5; i++)
|
mov(rax, (size_t)&m_shift[0]);
|
||||||
|
|
||||||
|
for(int i = 0; i < (m_sel.notest ? 2 : 5); i++)
|
||||||
{
|
{
|
||||||
movaps(Xmm(3 + i), ptr[rax + i * 16]);
|
movaps(Xmm(3 + i), ptr[rax + i * 16]);
|
||||||
}
|
}
|
||||||
|
@ -50,10 +54,12 @@ void GSSetupPrimCodeGenerator::Generate()
|
||||||
|
|
||||||
Color();
|
Color();
|
||||||
|
|
||||||
|
#ifdef _WIN64
|
||||||
vmovdqa(xmm6, ptr[rsp + 0]);
|
vmovdqa(xmm6, ptr[rsp + 0]);
|
||||||
vmovdqa(xmm7, ptr[rsp + 16]);
|
vmovdqa(xmm7, ptr[rsp + 16]);
|
||||||
|
|
||||||
add(rsp, 8 + 2 * 16);
|
add(rsp, 8 + 2 * 16);
|
||||||
|
#endif
|
||||||
|
|
||||||
ret();
|
ret();
|
||||||
}
|
}
|
||||||
|
@ -69,7 +75,7 @@ void GSSetupPrimCodeGenerator::Depth()
|
||||||
{
|
{
|
||||||
// GSVector4 p = dscan.p;
|
// GSVector4 p = dscan.p;
|
||||||
|
|
||||||
movaps(xmm0, ptr[rdx + offsetof(GSVertexSW, p)]);
|
movaps(xmm0, ptr[a2 + offsetof(GSVertexSW, p)]);
|
||||||
|
|
||||||
if(m_en.f)
|
if(m_en.f)
|
||||||
{
|
{
|
||||||
|
@ -85,9 +91,9 @@ void GSSetupPrimCodeGenerator::Depth()
|
||||||
cvttps2dq(xmm2, xmm2);
|
cvttps2dq(xmm2, xmm2);
|
||||||
pshuflw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0));
|
pshuflw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0));
|
||||||
pshufhw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0));
|
pshufhw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0));
|
||||||
movdqa(ptr[r8 + offsetof(GSScanlineLocalData, d4.f)], xmm2);
|
movdqa(ptr[t0 + offsetof(GSScanlineLocalData, d4.f)], xmm2);
|
||||||
|
|
||||||
for(int i = 0; i < 4; i++)
|
for(int i = 0; i < (m_sel.notest ? 1 : 4); i++)
|
||||||
{
|
{
|
||||||
// m_local.d[i].f = GSVector4i(df * m_shift[i]).xxzzlh();
|
// m_local.d[i].f = GSVector4i(df * m_shift[i]).xxzzlh();
|
||||||
|
|
||||||
|
@ -98,7 +104,7 @@ void GSSetupPrimCodeGenerator::Depth()
|
||||||
pshufhw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0));
|
pshufhw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0));
|
||||||
|
|
||||||
const size_t variableOffset = offsetof(GSScanlineLocalData, d[0].f) + (i * sizeof(GSScanlineLocalData::d[0]));
|
const size_t variableOffset = offsetof(GSScanlineLocalData, d[0].f) + (i * sizeof(GSScanlineLocalData::d[0]));
|
||||||
movdqa(ptr[r8 + variableOffset], xmm2);
|
movdqa(ptr[t0 + variableOffset], xmm2);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -112,9 +118,9 @@ void GSSetupPrimCodeGenerator::Depth()
|
||||||
|
|
||||||
movaps(xmm1, xmm0);
|
movaps(xmm1, xmm0);
|
||||||
mulps(xmm1, xmm3);
|
mulps(xmm1, xmm3);
|
||||||
movdqa(ptr[r8 + offsetof(GSScanlineLocalData, d4.z)], xmm1);
|
movdqa(ptr[t0 + offsetof(GSScanlineLocalData, d4.z)], xmm1);
|
||||||
|
|
||||||
for(int i = 0; i < 4; i++)
|
for(int i = 0; i < (m_sel.notest ? 1 : 4); i++)
|
||||||
{
|
{
|
||||||
// m_local.d[i].z = dz * m_shift[i];
|
// m_local.d[i].z = dz * m_shift[i];
|
||||||
|
|
||||||
|
@ -122,15 +128,19 @@ void GSSetupPrimCodeGenerator::Depth()
|
||||||
mulps(xmm1, Xmm(4 + i));
|
mulps(xmm1, Xmm(4 + i));
|
||||||
|
|
||||||
const size_t variableOffset = offsetof(GSScanlineLocalData, d[0].z) + (i * sizeof(GSScanlineLocalData::d[0]));
|
const size_t variableOffset = offsetof(GSScanlineLocalData, d[0].z) + (i * sizeof(GSScanlineLocalData::d[0]));
|
||||||
movdqa(ptr[r8 + variableOffset], xmm1);
|
movdqa(ptr[t0 + variableOffset], xmm1);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
// GSVector4 p = vertices[0].p;
|
// GSVector4 p = vertex[index[1]].p;
|
||||||
|
|
||||||
movaps(xmm0, ptr[rcx + offsetof(GSVertexSW, p)]);
|
mov(eax, ptr[a1 + sizeof(uint32) * 1]);
|
||||||
|
shl(eax, 6); // * sizeof(GSVertexSW)
|
||||||
|
add(rax, a0);
|
||||||
|
|
||||||
|
movaps(xmm0, ptr[rax + offsetof(GSVertexSW, p)]);
|
||||||
|
|
||||||
if(m_en.f)
|
if(m_en.f)
|
||||||
{
|
{
|
||||||
|
@ -139,42 +149,16 @@ void GSSetupPrimCodeGenerator::Depth()
|
||||||
cvttps2dq(xmm1, xmm0);
|
cvttps2dq(xmm1, xmm0);
|
||||||
pshufhw(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2));
|
pshufhw(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2));
|
||||||
pshufd(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2));
|
pshufd(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2));
|
||||||
movdqa(ptr[r8 + offsetof(GSScanlineLocalData, p.f)], xmm1);
|
movdqa(ptr[t0 + offsetof(GSScanlineLocalData, p.f)], xmm1);
|
||||||
}
|
}
|
||||||
|
|
||||||
if(m_en.z)
|
if(m_en.z)
|
||||||
{
|
{
|
||||||
// GSVector4 z = p.zzzz();
|
// uint32 z is bypassed in t.w
|
||||||
|
|
||||||
shufps(xmm0, xmm0, _MM_SHUFFLE(2, 2, 2, 2));
|
vmovdqa(xmm0, ptr[rax + offsetof(GSVertexSW, t)]);
|
||||||
|
vpshufd(xmm0, xmm0, _MM_SHUFFLE(3, 3, 3, 3));
|
||||||
if(m_sel.zoverflow)
|
vmovdqa(ptr[t0 + offsetof(GSScanlineLocalData, p.z)], xmm0);
|
||||||
{
|
|
||||||
// m_local.p.z = (GSVector4i(z * 0.5f) << 1) | (GSVector4i(z) & GSVector4i::x00000001());
|
|
||||||
|
|
||||||
mov(r9, (size_t)&GSVector4::m_half);
|
|
||||||
|
|
||||||
movss(xmm1, ptr[r9]);
|
|
||||||
shufps(xmm1, xmm1, _MM_SHUFFLE(0, 0, 0, 0));
|
|
||||||
mulps(xmm1, xmm0);
|
|
||||||
cvttps2dq(xmm1, xmm1);
|
|
||||||
pslld(xmm1, 1);
|
|
||||||
|
|
||||||
cvttps2dq(xmm0, xmm0);
|
|
||||||
pcmpeqd(xmm2, xmm2);
|
|
||||||
psrld(xmm2, 31);
|
|
||||||
pand(xmm0, xmm2);
|
|
||||||
|
|
||||||
por(xmm0, xmm1);
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
// m_local.p.z = GSVector4i(z);
|
|
||||||
|
|
||||||
cvttps2dq(xmm0, xmm0);
|
|
||||||
}
|
|
||||||
|
|
||||||
movdqa(ptr[r8 + offsetof(GSScanlineLocalData, p.z)], xmm0);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -188,7 +172,7 @@ void GSSetupPrimCodeGenerator::Texture()
|
||||||
|
|
||||||
// GSVector4 t = dscan.t;
|
// GSVector4 t = dscan.t;
|
||||||
|
|
||||||
movaps(xmm0, ptr[rdx + offsetof(GSVertexSW, t)]);
|
movaps(xmm0, ptr[a2 + offsetof(GSVertexSW, t)]);
|
||||||
|
|
||||||
movaps(xmm1, xmm0);
|
movaps(xmm1, xmm0);
|
||||||
mulps(xmm1, xmm3);
|
mulps(xmm1, xmm3);
|
||||||
|
@ -199,13 +183,13 @@ void GSSetupPrimCodeGenerator::Texture()
|
||||||
|
|
||||||
cvttps2dq(xmm1, xmm1);
|
cvttps2dq(xmm1, xmm1);
|
||||||
|
|
||||||
movdqa(ptr[r8 + offsetof(GSScanlineLocalData, d4.stq)], xmm1);
|
movdqa(ptr[t0 + offsetof(GSScanlineLocalData, d4.stq)], xmm1);
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
// m_local.d4.stq = t * 4.0f;
|
// m_local.d4.stq = t * 4.0f;
|
||||||
|
|
||||||
movaps(ptr[r8 + offsetof(GSScanlineLocalData, d4.stq)], xmm1);
|
movaps(ptr[t0 + offsetof(GSScanlineLocalData, d4.stq)], xmm1);
|
||||||
}
|
}
|
||||||
|
|
||||||
for(int j = 0, k = m_sel.fst ? 2 : 3; j < k; j++)
|
for(int j = 0, k = m_sel.fst ? 2 : 3; j < k; j++)
|
||||||
|
@ -217,7 +201,7 @@ void GSSetupPrimCodeGenerator::Texture()
|
||||||
movaps(xmm1, xmm0);
|
movaps(xmm1, xmm0);
|
||||||
shufps(xmm1, xmm1, (uint8)_MM_SHUFFLE(j, j, j, j));
|
shufps(xmm1, xmm1, (uint8)_MM_SHUFFLE(j, j, j, j));
|
||||||
|
|
||||||
for(int i = 0; i < 4; i++)
|
for(int i = 0; i < (m_sel.notest ? 1 : 4); i++)
|
||||||
{
|
{
|
||||||
// GSVector4 v = ds/dt * m_shift[i];
|
// GSVector4 v = ds/dt * m_shift[i];
|
||||||
|
|
||||||
|
@ -235,8 +219,8 @@ void GSSetupPrimCodeGenerator::Texture()
|
||||||
|
|
||||||
switch(j)
|
switch(j)
|
||||||
{
|
{
|
||||||
case 0: movdqa(ptr[r8 + variableOffsetS], xmm2); break;
|
case 0: movdqa(ptr[t0 + variableOffsetS], xmm2); break;
|
||||||
case 1: movdqa(ptr[r8 + variableOffsetT], xmm2); break;
|
case 1: movdqa(ptr[t0 + variableOffsetT], xmm2); break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
|
@ -249,9 +233,9 @@ void GSSetupPrimCodeGenerator::Texture()
|
||||||
|
|
||||||
switch(j)
|
switch(j)
|
||||||
{
|
{
|
||||||
case 0: movaps(ptr[r8 + variableOffsetS], xmm2); break;
|
case 0: movaps(ptr[t0 + variableOffsetS], xmm2); break;
|
||||||
case 1: movaps(ptr[r8 + variableOffsetT], xmm2); break;
|
case 1: movaps(ptr[t0 + variableOffsetT], xmm2); break;
|
||||||
case 2: movaps(ptr[r8 + variableOffsetQ], xmm2); break;
|
case 2: movaps(ptr[t0 + variableOffsetQ], xmm2); break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -269,7 +253,7 @@ void GSSetupPrimCodeGenerator::Color()
|
||||||
{
|
{
|
||||||
// GSVector4 c = dscan.c;
|
// GSVector4 c = dscan.c;
|
||||||
|
|
||||||
movaps(xmm0, ptr[rdx + offsetof(GSVertexSW, c)]);
|
movaps(xmm0, ptr[a2 + offsetof(GSVertexSW, c)]);
|
||||||
movaps(xmm1, xmm0);
|
movaps(xmm1, xmm0);
|
||||||
|
|
||||||
// m_local.d4.c = GSVector4i(c * 4.0f).xzyw().ps32();
|
// m_local.d4.c = GSVector4i(c * 4.0f).xzyw().ps32();
|
||||||
|
@ -279,7 +263,7 @@ void GSSetupPrimCodeGenerator::Color()
|
||||||
cvttps2dq(xmm2, xmm2);
|
cvttps2dq(xmm2, xmm2);
|
||||||
pshufd(xmm2, xmm2, _MM_SHUFFLE(3, 1, 2, 0));
|
pshufd(xmm2, xmm2, _MM_SHUFFLE(3, 1, 2, 0));
|
||||||
packssdw(xmm2, xmm2);
|
packssdw(xmm2, xmm2);
|
||||||
movdqa(ptr[r8 + offsetof(GSScanlineLocalData, d4.c)], xmm2);
|
movdqa(ptr[t0 + offsetof(GSScanlineLocalData, d4.c)], xmm2);
|
||||||
|
|
||||||
// xmm3 is not needed anymore
|
// xmm3 is not needed anymore
|
||||||
|
|
||||||
|
@ -289,7 +273,7 @@ void GSSetupPrimCodeGenerator::Color()
|
||||||
shufps(xmm0, xmm0, _MM_SHUFFLE(0, 0, 0, 0));
|
shufps(xmm0, xmm0, _MM_SHUFFLE(0, 0, 0, 0));
|
||||||
shufps(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2));
|
shufps(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2));
|
||||||
|
|
||||||
for(int i = 0; i < 4; i++)
|
for(int i = 0; i < (m_sel.notest ? 1 : 4); i++)
|
||||||
{
|
{
|
||||||
// GSVector4i r = GSVector4i(dr * m_shift[i]).ps32();
|
// GSVector4i r = GSVector4i(dr * m_shift[i]).ps32();
|
||||||
|
|
||||||
|
@ -310,12 +294,12 @@ void GSSetupPrimCodeGenerator::Color()
|
||||||
punpcklwd(xmm2, xmm3);
|
punpcklwd(xmm2, xmm3);
|
||||||
|
|
||||||
const size_t variableOffset = offsetof(GSScanlineLocalData, d[0].rb) + (i * sizeof(GSScanlineLocalData::d[0]));
|
const size_t variableOffset = offsetof(GSScanlineLocalData, d[0].rb) + (i * sizeof(GSScanlineLocalData::d[0]));
|
||||||
movdqa(ptr[r8 + variableOffset], xmm2);
|
movdqa(ptr[t0 + variableOffset], xmm2);
|
||||||
}
|
}
|
||||||
|
|
||||||
// GSVector4 c = dscan.c;
|
// GSVector4 c = dscan.c;
|
||||||
|
|
||||||
movaps(xmm0, ptr[rdx + offsetof(GSVertexSW, c)]); // not enough regs, have to reload it
|
movaps(xmm0, ptr[a2 + offsetof(GSVertexSW, c)]); // not enough regs, have to reload it
|
||||||
movaps(xmm1, xmm0);
|
movaps(xmm1, xmm0);
|
||||||
|
|
||||||
// GSVector4 dg = c.yyyy();
|
// GSVector4 dg = c.yyyy();
|
||||||
|
@ -324,7 +308,7 @@ void GSSetupPrimCodeGenerator::Color()
|
||||||
shufps(xmm0, xmm0, _MM_SHUFFLE(1, 1, 1, 1));
|
shufps(xmm0, xmm0, _MM_SHUFFLE(1, 1, 1, 1));
|
||||||
shufps(xmm1, xmm1, _MM_SHUFFLE(3, 3, 3, 3));
|
shufps(xmm1, xmm1, _MM_SHUFFLE(3, 3, 3, 3));
|
||||||
|
|
||||||
for(int i = 0; i < 4; i++)
|
for(int i = 0; i < (m_sel.notest ? 1 : 4); i++)
|
||||||
{
|
{
|
||||||
// GSVector4i g = GSVector4i(dg * m_shift[i]).ps32();
|
// GSVector4i g = GSVector4i(dg * m_shift[i]).ps32();
|
||||||
|
|
||||||
|
@ -345,14 +329,31 @@ void GSSetupPrimCodeGenerator::Color()
|
||||||
punpcklwd(xmm2, xmm3);
|
punpcklwd(xmm2, xmm3);
|
||||||
|
|
||||||
const size_t variableOffset = offsetof(GSScanlineLocalData, d[0].ga) + (i * sizeof(GSScanlineLocalData::d[0]));
|
const size_t variableOffset = offsetof(GSScanlineLocalData, d[0].ga) + (i * sizeof(GSScanlineLocalData::d[0]));
|
||||||
movdqa(ptr[r8 + variableOffset], xmm2);
|
movdqa(ptr[t0 + variableOffset], xmm2);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
// GSVector4i c = GSVector4i(vertices[0].c);
|
// GSVector4i c = GSVector4i(vertex[index[last].c);
|
||||||
|
|
||||||
cvttps2dq(xmm0, ptr[rcx + offsetof(GSVertexSW, c)]);
|
int last = 0;
|
||||||
|
|
||||||
|
switch(m_sel.prim)
|
||||||
|
{
|
||||||
|
case GS_POINT_CLASS: last = 0; break;
|
||||||
|
case GS_LINE_CLASS: last = 1; break;
|
||||||
|
case GS_TRIANGLE_CLASS: last = 2; break;
|
||||||
|
case GS_SPRITE_CLASS: last = 1; break;
|
||||||
|
}
|
||||||
|
|
||||||
|
if(!(m_sel.prim == GS_SPRITE_CLASS && (m_en.z || m_en.f))) // if this is a sprite, the last vertex was already loaded in Depth()
|
||||||
|
{
|
||||||
|
mov(eax, ptr[a1 + sizeof(uint32) * last]);
|
||||||
|
shl(eax, 6); // * sizeof(GSVertexSW)
|
||||||
|
add(rax, a0);
|
||||||
|
}
|
||||||
|
|
||||||
|
cvttps2dq(xmm0, ptr[rax + offsetof(GSVertexSW, c)]);
|
||||||
|
|
||||||
// c = c.upl16(c.zwxy());
|
// c = c.upl16(c.zwxy());
|
||||||
|
|
||||||
|
@ -372,8 +373,8 @@ void GSSetupPrimCodeGenerator::Color()
|
||||||
pshufd(xmm1, xmm0, _MM_SHUFFLE(0, 0, 0, 0));
|
pshufd(xmm1, xmm0, _MM_SHUFFLE(0, 0, 0, 0));
|
||||||
pshufd(xmm2, xmm0, _MM_SHUFFLE(2, 2, 2, 2));
|
pshufd(xmm2, xmm0, _MM_SHUFFLE(2, 2, 2, 2));
|
||||||
|
|
||||||
movdqa(ptr[r8 + offsetof(GSScanlineLocalData, c.rb)], xmm1);
|
movdqa(ptr[t0 + offsetof(GSScanlineLocalData, c.rb)], xmm1);
|
||||||
movdqa(ptr[r8 + offsetof(GSScanlineLocalData, c.ga)], xmm2);
|
movdqa(ptr[t0 + offsetof(GSScanlineLocalData, c.ga)], xmm2);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue