gsdx sw x64: use RIP-relative addressing in the setup prim code generator

byte code: 9017 => 8736

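This also saves a register: t0 no longer has to be loaded with the address of m_local when RIP-relative addressing is used.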
This commit is contained in:
Gregory Hainaut 2016-11-20 14:07:58 +01:00
parent 923c297dfc
commit 7c06e87d59
4 changed files with 32 additions and 23 deletions

View File

@ -179,7 +179,7 @@ struct alignas(32) GSScanlineLocalData // per prim variables, each thread has it
// these should be stored on stack as normal local variables (no free regs to use, esp cannot be saved to anywhere, and we need an aligned stack) // these should be stored on stack as normal local variables (no free regs to use, esp cannot be saved to anywhere, and we need an aligned stack)
struct struct
{ {
GSVector8 z, zo; GSVector8 z, zo;
GSVector8i f; GSVector8i f;
@ -196,7 +196,7 @@ struct alignas(32) GSScanlineLocalData // per prim variables, each thread has it
GSVector8i uv_minmax[2]; GSVector8i uv_minmax[2];
GSVector8i trb, tga; GSVector8i trb, tga;
GSVector8i test; GSVector8i test;
} temp; } temp;
#else #else
@ -207,7 +207,7 @@ struct alignas(32) GSScanlineLocalData // per prim variables, each thread has it
// these should be stored on stack as normal local variables (no free regs to use, esp cannot be saved to anywhere, and we need an aligned stack) // these should be stored on stack as normal local variables (no free regs to use, esp cannot be saved to anywhere, and we need an aligned stack)
struct struct
{ {
GSVector4 z, zo; GSVector4 z, zo;
GSVector4i f; GSVector4i f;
@ -224,7 +224,7 @@ struct alignas(32) GSScanlineLocalData // per prim variables, each thread has it
GSVector4i uv_minmax[2]; GSVector4i uv_minmax[2];
GSVector4i trb, tga; GSVector4i trb, tga;
GSVector4i test; GSVector4i test;
} temp; } temp;
#endif #endif

View File

@ -67,6 +67,7 @@ void GSSetupPrimCodeGenerator::InitVectors()
GSSetupPrimCodeGenerator::GSSetupPrimCodeGenerator(void* param, uint64 key, void* code, size_t maxsize) GSSetupPrimCodeGenerator::GSSetupPrimCodeGenerator(void* param, uint64 key, void* code, size_t maxsize)
: GSCodeGenerator(code, maxsize) : GSCodeGenerator(code, maxsize)
, m_local(*(GSScanlineLocalData*)param) , m_local(*(GSScanlineLocalData*)param)
, m_rip(false)
{ {
m_sel.key = key; m_sel.key = key;

View File

@ -31,6 +31,7 @@ class GSSetupPrimCodeGenerator : public GSCodeGenerator
GSScanlineSelector m_sel; GSScanlineSelector m_sel;
GSScanlineLocalData& m_local; GSScanlineLocalData& m_local;
bool m_rip;
struct {uint32 z:1, f:1, t:1, c:1;} m_en; struct {uint32 z:1, f:1, t:1, c:1;} m_en;

View File

@ -27,8 +27,14 @@
using namespace Xbyak; using namespace Xbyak;
// Yields the memory operand used to access a member of m_local from generated code.
// `field` is a member path relative to GSScanlineLocalData (e.g. d4.f, p.z, c.rb).
//  - m_rip == true : the operand is rip + the absolute address of m_local.field.
//  - m_rip == false: the operand is t0 + offsetof(GSScanlineLocalData, field), so t0
//                    must already have been loaded with &m_local.
// Both branches are evaluated against the enclosing generator's m_rip, m_local and t0,
// so the macro is only usable inside its member functions.
// NOTE(review): the rip form presumably requires the target to be within a signed
// 32-bit displacement of the emitted code - confirm against the m_rip computation.
#define _rip_local(field) (m_rip ? ptr[rip + &m_local.field] : ptr[t0 + offsetof(GSScanlineLocalData, field)])
// Yields the memory operand for a member of m_local when the t0-relative displacement
// is supplied by the caller instead of being derived with offsetof inside the macro.
// Used for array elements selected by a loop index at code-generation time
// (e.g. d[i].f together with a precomputed variableOffset).
//  - m_rip == true : the operand is rip + the absolute address of m_local.field.
//  - m_rip == false: the operand is t0 + offset, so t0 must already hold &m_local.
// The caller is responsible for `offset` being the byte offset of `field` within
// GSScanlineLocalData; otherwise the two branches address different memory.
#define _rip_local_v(field, offset) (m_rip ? ptr[rip + &m_local.field] : ptr[t0 + offset])
void GSSetupPrimCodeGenerator::Generate_AVX() void GSSetupPrimCodeGenerator::Generate_AVX()
{ {
// Technically we just need the delta < 2GB
m_rip = (size_t)&m_local < 0x80000000 && (size_t)getCurr() < 0x80000000;
#ifdef _WIN64 #ifdef _WIN64
sub(rsp, 8 + 2 * 16); sub(rsp, 8 + 2 * 16);
@ -36,7 +42,8 @@ void GSSetupPrimCodeGenerator::Generate_AVX()
vmovdqa(ptr[rsp + 16], xmm7); vmovdqa(ptr[rsp + 16], xmm7);
#endif #endif
mov(t0, (size_t)&m_local); if (!m_rip)
mov(t0, (size_t)&m_local);
if((m_en.z || m_en.f) && m_sel.prim != GS_SPRITE_CLASS || m_en.t || m_en.c && m_sel.iip) if((m_en.z || m_en.f) && m_sel.prim != GS_SPRITE_CLASS || m_en.t || m_en.c && m_sel.iip)
{ {
@ -89,7 +96,7 @@ void GSSetupPrimCodeGenerator::Depth_AVX()
vcvttps2dq(xmm2, xmm2); vcvttps2dq(xmm2, xmm2);
vpshuflw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0)); vpshuflw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0));
vpshufhw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0)); vpshufhw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0));
vmovdqa(ptr[t0 + offsetof(GSScanlineLocalData, d4.f)], xmm2); vmovdqa(_rip_local(d4.f), xmm2);
for(int i = 0; i < (m_sel.notest ? 1 : 4); i++) for(int i = 0; i < (m_sel.notest ? 1 : 4); i++)
{ {
@ -101,7 +108,7 @@ void GSSetupPrimCodeGenerator::Depth_AVX()
vpshufhw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0)); vpshufhw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0));
const size_t variableOffset = offsetof(GSScanlineLocalData, d[0].f) + (i * sizeof(GSScanlineLocalData::d[0])); const size_t variableOffset = offsetof(GSScanlineLocalData, d[0].f) + (i * sizeof(GSScanlineLocalData::d[0]));
vmovdqa(ptr[t0 + variableOffset], xmm2); vmovdqa(_rip_local_v(d[i].f, variableOffset), xmm2);
} }
} }
@ -114,7 +121,7 @@ void GSSetupPrimCodeGenerator::Depth_AVX()
// m_local.d4.z = dz * 4.0f; // m_local.d4.z = dz * 4.0f;
vmulps(xmm1, xmm0, xmm3); vmulps(xmm1, xmm0, xmm3);
vmovdqa(ptr[t0 + offsetof(GSScanlineLocalData, d4.z)], xmm1); vmovdqa(_rip_local(d4.z), xmm1);
for(int i = 0; i < (m_sel.notest ? 1 : 4); i++) for(int i = 0; i < (m_sel.notest ? 1 : 4); i++)
{ {
@ -123,7 +130,7 @@ void GSSetupPrimCodeGenerator::Depth_AVX()
vmulps(xmm1, xmm0, Xmm(4 + i)); vmulps(xmm1, xmm0, Xmm(4 + i));
const size_t variableOffset = offsetof(GSScanlineLocalData, d[0].z) + (i * sizeof(GSScanlineLocalData::d[0])); const size_t variableOffset = offsetof(GSScanlineLocalData, d[0].z) + (i * sizeof(GSScanlineLocalData::d[0]));
vmovdqa(ptr[t0 + variableOffset], xmm1); vmovdqa(_rip_local_v(d[i].z, variableOffset), xmm1);
} }
} }
} }
@ -144,7 +151,7 @@ void GSSetupPrimCodeGenerator::Depth_AVX()
vcvttps2dq(xmm1, xmm0); vcvttps2dq(xmm1, xmm0);
vpshufhw(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2)); vpshufhw(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2));
vpshufd(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2)); vpshufd(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2));
vmovdqa(ptr[t0 + offsetof(GSScanlineLocalData, p.f)], xmm1); vmovdqa(_rip_local(p.f), xmm1);
} }
if(m_en.z) if(m_en.z)
@ -153,7 +160,7 @@ void GSSetupPrimCodeGenerator::Depth_AVX()
vmovdqa(xmm0, ptr[rax + offsetof(GSVertexSW, t)]); vmovdqa(xmm0, ptr[rax + offsetof(GSVertexSW, t)]);
vpshufd(xmm0, xmm0, _MM_SHUFFLE(3, 3, 3, 3)); vpshufd(xmm0, xmm0, _MM_SHUFFLE(3, 3, 3, 3));
vmovdqa(ptr[t0 + offsetof(GSScanlineLocalData, p.z)], xmm0); vmovdqa(_rip_local(p.z), xmm0);
} }
} }
} }
@ -177,13 +184,13 @@ void GSSetupPrimCodeGenerator::Texture_AVX()
vcvttps2dq(xmm1, xmm1); vcvttps2dq(xmm1, xmm1);
vmovdqa(ptr[t0 + offsetof(GSScanlineLocalData, d4.stq)], xmm1); vmovdqa(_rip_local(d4.stq), xmm1);
} }
else else
{ {
// m_local.d4.stq = t * 4.0f; // m_local.d4.stq = t * 4.0f;
vmovaps(ptr[t0 + offsetof(GSScanlineLocalData, d4.stq)], xmm1); vmovaps(_rip_local(d4.stq), xmm1);
} }
for(int j = 0, k = m_sel.fst ? 2 : 3; j < k; j++) for(int j = 0, k = m_sel.fst ? 2 : 3; j < k; j++)
@ -211,8 +218,8 @@ void GSSetupPrimCodeGenerator::Texture_AVX()
switch(j) switch(j)
{ {
case 0: vmovdqa(ptr[t0 + variableOffsetS], xmm2); break; case 0: vmovdqa(_rip_local_v(d[i].s, variableOffsetS), xmm2); break;
case 1: vmovdqa(ptr[t0 + variableOffsetT], xmm2); break; case 1: vmovdqa(_rip_local_v(d[i].t, variableOffsetT), xmm2); break;
} }
} }
else else
@ -225,9 +232,9 @@ void GSSetupPrimCodeGenerator::Texture_AVX()
switch(j) switch(j)
{ {
case 0: vmovaps(ptr[t0 + variableOffsetS], xmm2); break; case 0: vmovaps(_rip_local_v(d[i].s, variableOffsetS), xmm2); break;
case 1: vmovaps(ptr[t0 + variableOffsetT], xmm2); break; case 1: vmovaps(_rip_local_v(d[i].t, variableOffsetT), xmm2); break;
case 2: vmovaps(ptr[t0 + variableOffsetQ], xmm2); break; case 2: vmovaps(_rip_local_v(d[i].q, variableOffsetQ), xmm2); break;
} }
} }
} }
@ -253,7 +260,7 @@ void GSSetupPrimCodeGenerator::Color_AVX()
vcvttps2dq(xmm1, xmm1); vcvttps2dq(xmm1, xmm1);
vpshufd(xmm1, xmm1, _MM_SHUFFLE(3, 1, 2, 0)); vpshufd(xmm1, xmm1, _MM_SHUFFLE(3, 1, 2, 0));
vpackssdw(xmm1, xmm1); vpackssdw(xmm1, xmm1);
vmovdqa(ptr[t0 + offsetof(GSScanlineLocalData, d4.c)], xmm1); vmovdqa(_rip_local(d4.c), xmm1);
// xmm3 is not needed anymore // xmm3 is not needed anymore
@ -282,7 +289,7 @@ void GSSetupPrimCodeGenerator::Color_AVX()
vpunpcklwd(xmm0, xmm1); vpunpcklwd(xmm0, xmm1);
const size_t variableOffset = offsetof(GSScanlineLocalData, d[0].rb) + (i * sizeof(GSScanlineLocalData::d[0])); const size_t variableOffset = offsetof(GSScanlineLocalData, d[0].rb) + (i * sizeof(GSScanlineLocalData::d[0]));
vmovdqa(ptr[t0 + variableOffset], xmm0); vmovdqa(_rip_local_v(d[i].rb, variableOffset), xmm0);
} }
// GSVector4 c = dscan.c; // GSVector4 c = dscan.c;
@ -314,7 +321,7 @@ void GSSetupPrimCodeGenerator::Color_AVX()
vpunpcklwd(xmm0, xmm1); vpunpcklwd(xmm0, xmm1);
const size_t variableOffset = offsetof(GSScanlineLocalData, d[0].ga) + (i * sizeof(GSScanlineLocalData::d[0])); const size_t variableOffset = offsetof(GSScanlineLocalData, d[0].ga) + (i * sizeof(GSScanlineLocalData::d[0]));
vmovdqa(ptr[t0 + variableOffset], xmm0); vmovdqa(_rip_local_v(d[i].ga, variableOffset), xmm0);
} }
} }
else else
@ -358,8 +365,8 @@ void GSSetupPrimCodeGenerator::Color_AVX()
vpshufd(xmm1, xmm0, _MM_SHUFFLE(0, 0, 0, 0)); vpshufd(xmm1, xmm0, _MM_SHUFFLE(0, 0, 0, 0));
vpshufd(xmm2, xmm0, _MM_SHUFFLE(2, 2, 2, 2)); vpshufd(xmm2, xmm0, _MM_SHUFFLE(2, 2, 2, 2));
vmovdqa(ptr[t0 + offsetof(GSScanlineLocalData, c.rb)], xmm1); vmovdqa(_rip_local(c.rb), xmm1);
vmovdqa(ptr[t0 + offsetof(GSScanlineLocalData, c.ga)], xmm2); vmovdqa(_rip_local(c.ga), xmm2);
} }
} }