mirror of https://github.com/PCSX2/pcsx2.git
gsdx sw x64: use rip addressing on draw scan line
byte code 41997 => 41893 However it will allow to save 2 registers as future optimization
This commit is contained in:
parent
7c06e87d59
commit
994274623b
|
@ -108,6 +108,7 @@ void GSDrawScanlineCodeGenerator::InitVectors()
|
|||
GSDrawScanlineCodeGenerator::GSDrawScanlineCodeGenerator(void* param, uint64 key, void* code, size_t maxsize)
|
||||
: GSCodeGenerator(code, maxsize)
|
||||
, m_local(*(GSScanlineLocalData*)param)
|
||||
, m_rip(false)
|
||||
{
|
||||
m_sel.key = key;
|
||||
|
||||
|
|
|
@ -39,6 +39,7 @@ class GSDrawScanlineCodeGenerator : public GSCodeGenerator
|
|||
|
||||
GSScanlineSelector m_sel;
|
||||
GSScanlineLocalData& m_local;
|
||||
bool m_rip;
|
||||
|
||||
void Generate();
|
||||
|
||||
|
|
|
@ -23,6 +23,8 @@
|
|||
#include "GSDrawScanlineCodeGenerator.h"
|
||||
#include "GSVertexSW.h"
|
||||
|
||||
#if _M_SSE < 0x501 && (defined(_M_AMD64) || defined(_WIN64))
|
||||
|
||||
// Ease the reading of the code
|
||||
#define _m_local r11
|
||||
#define _m_local__gd r12
|
||||
|
@ -45,7 +47,8 @@
|
|||
#define _zm xmm5
|
||||
#define _fd xmm6
|
||||
|
||||
#if _M_SSE < 0x501 && (defined(_M_AMD64) || defined(_WIN64))
|
||||
#define _rip_local(field) (m_rip ? ptr[rip + &m_local.field] : ptr[_m_local + offsetof(GSScanlineLocalData, field)])
|
||||
#define _rip_global(field) (m_rip ? ptr[rip + &m_local.gd->field] : ptr[_m_local__gd + offsetof(GSScanlineGlobalData, field)])
|
||||
|
||||
#ifdef _WIN64
|
||||
#else
|
||||
|
@ -63,6 +66,9 @@ void GSDrawScanlineCodeGenerator::Generate_AVX()
|
|||
{
|
||||
bool need_tex = m_sel.fb && m_sel.tfx != TFX_NONE;
|
||||
bool need_clut = need_tex && m_sel.tlu;
|
||||
m_rip = (size_t)getCurr() < 0x80000000;
|
||||
m_rip &= (size_t)&m_local < 0x80000000;
|
||||
m_rip &= (size_t)&m_local.gd < 0x80000000;
|
||||
|
||||
#ifdef _WIN64
|
||||
push(rbx);
|
||||
|
@ -92,13 +98,13 @@ void GSDrawScanlineCodeGenerator::Generate_AVX()
|
|||
|
||||
mov(r10, (size_t)&m_test[0]);
|
||||
mov(_m_local, (size_t)&m_local);
|
||||
mov(_m_local__gd, ptr[_m_local + offsetof(GSScanlineLocalData, gd)]);
|
||||
mov(_m_local__gd, _rip_local(gd));
|
||||
|
||||
mov(_m_local__gd__vm, ptr[_m_local__gd + offsetof(GSScanlineGlobalData, vm)]);
|
||||
mov(_m_local__gd__vm, _rip_global(vm));
|
||||
if(need_clut)
|
||||
mov(_m_local__gd__clut, ptr[_m_local__gd + offsetof(GSScanlineGlobalData, clut)]);
|
||||
mov(_m_local__gd__clut, _rip_global(clut));
|
||||
if(need_tex)
|
||||
mov(_m_local__gd__tex, ptr[_m_local__gd + offsetof(GSScanlineGlobalData, tex)]);
|
||||
mov(_m_local__gd__tex, _rip_global(tex));
|
||||
|
||||
Init_AVX();
|
||||
|
||||
|
@ -321,19 +327,22 @@ void GSDrawScanlineCodeGenerator::Init_AVX()
|
|||
|
||||
// GSVector2i* fza_base = &m_local.gd->fzbr[top];
|
||||
|
||||
mov(rax, ptr[_m_local__gd + offsetof(GSScanlineGlobalData, fzbr)]);
|
||||
mov(rax, _rip_global(fzbr));
|
||||
lea(t1, ptr[rax + a2 * 8]);
|
||||
|
||||
// GSVector2i* fza_offset = &m_local.gd->fzbc[left >> 2];
|
||||
|
||||
mov(rax, ptr[_m_local__gd + offsetof(GSScanlineGlobalData, fzbc)]);
|
||||
mov(rax, _rip_global(fzbc));
|
||||
lea(t0, ptr[rax + rbx * 2]);
|
||||
|
||||
if(m_sel.prim != GS_SPRITE_CLASS && (m_sel.fwrite && m_sel.fge || m_sel.zb) || m_sel.fb && (m_sel.edge || m_sel.tfx != TFX_NONE || m_sel.iip))
|
||||
{
|
||||
// a1 = &m_local.d[skip] // note a1 was (skip << 4)
|
||||
|
||||
lea(a1, ptr[a1 * 8 + _m_local + offsetof(GSScanlineLocalData, d)]);
|
||||
// FIXME
|
||||
//lea(a1, ptr[a1 * 8 + _m_local + offsetof(GSScanlineLocalData, d)]);
|
||||
lea(rax, _rip_local(d));
|
||||
lea(a1, ptr[rax + a1 * 8]);
|
||||
}
|
||||
|
||||
if(m_sel.prim != GS_SPRITE_CLASS)
|
||||
|
@ -365,11 +374,11 @@ void GSDrawScanlineCodeGenerator::Init_AVX()
|
|||
{
|
||||
if(m_sel.ztest)
|
||||
{
|
||||
vmovdqa(_z, ptr[_m_local + offsetof(GSScanlineLocalData, p.z)]);
|
||||
vmovdqa(_z, _rip_local(p.z));
|
||||
}
|
||||
|
||||
if(m_sel.fwrite && m_sel.fge)
|
||||
vmovdqa(_f, ptr[_m_local + offsetof(GSScanlineLocalData, p.f)]);
|
||||
vmovdqa(_f, _rip_local(p.f));
|
||||
}
|
||||
|
||||
if(m_sel.fb)
|
||||
|
@ -388,7 +397,7 @@ void GSDrawScanlineCodeGenerator::Init_AVX()
|
|||
vpsrlw(xmm1, 9);
|
||||
|
||||
#ifdef _WIN64
|
||||
vmovdqa(ptr[_m_local + offsetof(GSScanlineLocalData, temp.cov)], xmm1);
|
||||
vmovdqa(_rip_local(temp.cov), xmm1);
|
||||
#else
|
||||
vmovdqa(ptr[rsp + _rz_cov], xmm1);
|
||||
#endif
|
||||
|
@ -463,8 +472,8 @@ void GSDrawScanlineCodeGenerator::Init_AVX()
|
|||
}
|
||||
else
|
||||
{
|
||||
vmovdqa(_f_rb, ptr[_m_local + offsetof(GSScanlineLocalData, c.rb)]);
|
||||
vmovdqa(_f_ga, ptr[_m_local + offsetof(GSScanlineLocalData, c.ga)]);
|
||||
vmovdqa(_f_rb, _rip_local(c.rb));
|
||||
vmovdqa(_f_ga, _rip_local(c.ga));
|
||||
}
|
||||
|
||||
vmovdqa(_rb, _f_rb);
|
||||
|
@ -496,14 +505,14 @@ void GSDrawScanlineCodeGenerator::Step_AVX()
|
|||
|
||||
if(m_sel.zb)
|
||||
{
|
||||
vaddps(_z, ptr[_m_local + offsetof(GSScanlineLocalData, d4.z)]);
|
||||
vaddps(_z, _rip_local(d4.z));
|
||||
}
|
||||
|
||||
// f = f.add16(m_local.d4.f);
|
||||
|
||||
if(m_sel.fwrite && m_sel.fge)
|
||||
{
|
||||
vpaddw(_f, ptr[_m_local + offsetof(GSScanlineLocalData, d4.f)]);
|
||||
vpaddw(_f, _rip_local(d4.f));
|
||||
}
|
||||
}
|
||||
else
|
||||
|
@ -524,7 +533,7 @@ void GSDrawScanlineCodeGenerator::Step_AVX()
|
|||
// si += st.xxxx();
|
||||
// if(!sprite) ti += st.yyyy();
|
||||
|
||||
vmovdqa(xmm0, ptr[_m_local + offsetof(GSScanlineLocalData, d4.stq)]);
|
||||
vmovdqa(xmm0, _rip_local(d4.stq));
|
||||
|
||||
vpshufd(xmm1, xmm0, _MM_SHUFFLE(0, 0, 0, 0));
|
||||
vpaddd(_s, xmm1);
|
||||
|
@ -543,7 +552,7 @@ void GSDrawScanlineCodeGenerator::Step_AVX()
|
|||
// t += stq.yyyy();
|
||||
// q += stq.zzzz();
|
||||
|
||||
vmovaps(xmm0, ptr[_m_local + offsetof(GSScanlineLocalData, d4.stq)]);
|
||||
vmovaps(xmm0, _rip_local(d4.stq));
|
||||
|
||||
vshufps(xmm1, xmm0, xmm0, _MM_SHUFFLE(0, 0, 0, 0));
|
||||
vshufps(xmm2, xmm0, xmm0, _MM_SHUFFLE(1, 1, 1, 1));
|
||||
|
@ -564,7 +573,7 @@ void GSDrawScanlineCodeGenerator::Step_AVX()
|
|||
// rb = rb.add16(c.xxxx());
|
||||
// ga = ga.add16(c.yyyy());
|
||||
|
||||
vmovdqa(xmm0, ptr[_m_local + offsetof(GSScanlineLocalData, d4.c)]);
|
||||
vmovdqa(xmm0, _rip_local(d4.c));
|
||||
|
||||
vpshufd(xmm1, xmm0, _MM_SHUFFLE(0, 0, 0, 0));
|
||||
vpshufd(xmm2, xmm0, _MM_SHUFFLE(1, 1, 1, 1));
|
||||
|
@ -648,7 +657,7 @@ void GSDrawScanlineCodeGenerator::TestZ_AVX(const Xmm& temp1, const Xmm& temp2)
|
|||
if(m_sel.zwrite)
|
||||
{
|
||||
#ifdef _WIN64
|
||||
vmovdqa(ptr[_m_local + offsetof(GSScanlineLocalData, temp.zs)], xmm0);
|
||||
vmovdqa(_rip_local(temp.zs), xmm0);
|
||||
#else
|
||||
vmovdqa(ptr[rsp + _rz_zs], xmm0);
|
||||
#endif
|
||||
|
@ -666,7 +675,7 @@ void GSDrawScanlineCodeGenerator::TestZ_AVX(const Xmm& temp1, const Xmm& temp2)
|
|||
if(m_sel.zwrite && m_sel.zpsm < 2)
|
||||
{
|
||||
#ifdef _WIN64
|
||||
vmovdqa(ptr[_m_local + offsetof(GSScanlineLocalData, temp.zd)], xmm1);
|
||||
vmovdqa(_rip_local(temp.zd), xmm1);
|
||||
#else
|
||||
vmovdqa(ptr[rsp + _rz_zd], xmm1);
|
||||
#endif
|
||||
|
@ -973,7 +982,7 @@ void GSDrawScanlineCodeGenerator::Wrap_AVX(const Xmm& uv)
|
|||
{
|
||||
if(region)
|
||||
{
|
||||
vpmaxsw(uv, ptr[_m_local__gd + offsetof(GSScanlineGlobalData, t.min)]);
|
||||
vpmaxsw(uv, _rip_global(t.min));
|
||||
}
|
||||
else
|
||||
{
|
||||
|
@ -981,23 +990,23 @@ void GSDrawScanlineCodeGenerator::Wrap_AVX(const Xmm& uv)
|
|||
vpmaxsw(uv, xmm0);
|
||||
}
|
||||
|
||||
vpminsw(uv, ptr[_m_local__gd + offsetof(GSScanlineGlobalData, t.max)]);
|
||||
vpminsw(uv, _rip_global(t.max));
|
||||
}
|
||||
else
|
||||
{
|
||||
vpand(uv, ptr[_m_local__gd + offsetof(GSScanlineGlobalData, t.min)]);
|
||||
vpand(uv, _rip_global(t.min));
|
||||
|
||||
if(region)
|
||||
{
|
||||
vpor(uv, ptr[_m_local__gd + offsetof(GSScanlineGlobalData, t.max)]);
|
||||
vpor(uv, _rip_global(t.max));
|
||||
}
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
vmovdqa(xmm2, ptr[_m_local__gd + offsetof(GSScanlineGlobalData, t.min)]);
|
||||
vmovdqa(xmm3, ptr[_m_local__gd + offsetof(GSScanlineGlobalData, t.max)]);
|
||||
vmovdqa(xmm0, ptr[_m_local__gd + offsetof(GSScanlineGlobalData, t.mask)]);
|
||||
vmovdqa(xmm2, _rip_global(t.min));
|
||||
vmovdqa(xmm3, _rip_global(t.max));
|
||||
vmovdqa(xmm0, _rip_global(t.mask));
|
||||
|
||||
// GSVector4i repeat = (t & m_local.gd->t.min) | m_local.gd->t.max;
|
||||
|
||||
|
@ -1034,7 +1043,7 @@ void GSDrawScanlineCodeGenerator::Wrap_AVX(const Xmm& uv0, const Xmm& uv1)
|
|||
{
|
||||
if(region)
|
||||
{
|
||||
vmovdqa(xmm0, ptr[_m_local__gd + offsetof(GSScanlineGlobalData, t.min)]);
|
||||
vmovdqa(xmm0, _rip_global(t.min));
|
||||
vpmaxsw(uv0, xmm0);
|
||||
vpmaxsw(uv1, xmm0);
|
||||
}
|
||||
|
@ -1045,19 +1054,19 @@ void GSDrawScanlineCodeGenerator::Wrap_AVX(const Xmm& uv0, const Xmm& uv1)
|
|||
vpmaxsw(uv1, xmm0);
|
||||
}
|
||||
|
||||
vmovdqa(xmm0, ptr[_m_local__gd + offsetof(GSScanlineGlobalData, t.max)]);
|
||||
vmovdqa(xmm0, _rip_global(t.max));
|
||||
vpminsw(uv0, xmm0);
|
||||
vpminsw(uv1, xmm0);
|
||||
}
|
||||
else
|
||||
{
|
||||
vmovdqa(xmm0, ptr[_m_local__gd + offsetof(GSScanlineGlobalData, t.min)]);
|
||||
vmovdqa(xmm0, _rip_global(t.min));
|
||||
vpand(uv0, xmm0);
|
||||
vpand(uv1, xmm0);
|
||||
|
||||
if(region)
|
||||
{
|
||||
vmovdqa(xmm0, ptr[_m_local__gd + offsetof(GSScanlineGlobalData, t.max)]);
|
||||
vmovdqa(xmm0, _rip_global(t.max));
|
||||
vpor(uv0, xmm0);
|
||||
vpor(uv1, xmm0);
|
||||
}
|
||||
|
@ -1065,9 +1074,9 @@ void GSDrawScanlineCodeGenerator::Wrap_AVX(const Xmm& uv0, const Xmm& uv1)
|
|||
}
|
||||
else
|
||||
{
|
||||
vmovdqa(xmm2, ptr[_m_local__gd + offsetof(GSScanlineGlobalData, t.min)]);
|
||||
vmovdqa(xmm3, ptr[_m_local__gd + offsetof(GSScanlineGlobalData, t.max)]);
|
||||
vmovdqa(xmm0, ptr[_m_local__gd + offsetof(GSScanlineGlobalData, t.mask)]);
|
||||
vmovdqa(xmm2, _rip_global(t.min));
|
||||
vmovdqa(xmm3, _rip_global(t.max));
|
||||
vmovdqa(xmm0, _rip_global(t.mask));
|
||||
|
||||
// uv0
|
||||
|
||||
|
@ -1217,7 +1226,7 @@ void GSDrawScanlineCodeGenerator::AlphaTFX_AVX()
|
|||
if(m_sel.edge)
|
||||
{
|
||||
#ifdef _WIN64
|
||||
vmovdqa(xmm0, ptr[_m_local + offsetof(GSScanlineLocalData, temp.cov)]);
|
||||
vmovdqa(xmm0, _rip_local(temp.cov));
|
||||
#else
|
||||
vmovdqa(xmm0, ptr[rsp + _rz_cov]);
|
||||
#endif
|
||||
|
@ -1242,7 +1251,7 @@ void GSDrawScanlineCodeGenerator::AlphaTFX_AVX()
|
|||
if(m_sel.edge)
|
||||
{
|
||||
#ifdef _WIN64
|
||||
vmovdqa(xmm1, ptr[_m_local + offsetof(GSScanlineLocalData, temp.cov)]);
|
||||
vmovdqa(xmm1, _rip_local(temp.cov));
|
||||
#else
|
||||
vmovdqa(xmm1, ptr[rsp + _rz_cov]);
|
||||
#endif
|
||||
|
@ -1265,12 +1274,12 @@ void GSDrawScanlineCodeGenerator::ReadMask_AVX()
|
|||
{
|
||||
if(m_sel.fwrite)
|
||||
{
|
||||
vmovdqa(_fm, ptr[_m_local__gd + offsetof(GSScanlineGlobalData, fm)]);
|
||||
vmovdqa(_fm, _rip_global(fm));
|
||||
}
|
||||
|
||||
if(m_sel.zwrite)
|
||||
{
|
||||
vmovdqa(_zm, ptr[_m_local__gd + offsetof(GSScanlineGlobalData, zm)]);
|
||||
vmovdqa(_zm, _rip_global(zm));
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -1290,13 +1299,13 @@ void GSDrawScanlineCodeGenerator::TestAlpha_AVX()
|
|||
case ATST_LEQUAL:
|
||||
// t = (ga >> 16) > m_local.gd->aref;
|
||||
vpsrld(xmm1, _ga, 16);
|
||||
vpcmpgtd(xmm1, ptr[_m_local__gd + offsetof(GSScanlineGlobalData, aref)]);
|
||||
vpcmpgtd(xmm1, _rip_global(aref));
|
||||
break;
|
||||
|
||||
case ATST_EQUAL:
|
||||
// t = (ga >> 16) != m_local.gd->aref;
|
||||
vpsrld(xmm1, _ga, 16);
|
||||
vpcmpeqd(xmm1, ptr[_m_local__gd + offsetof(GSScanlineGlobalData, aref)]);
|
||||
vpcmpeqd(xmm1, _rip_global(aref));
|
||||
vpcmpeqd(xmm0, xmm0);
|
||||
vpxor(xmm1, xmm0);
|
||||
break;
|
||||
|
@ -1305,14 +1314,14 @@ void GSDrawScanlineCodeGenerator::TestAlpha_AVX()
|
|||
case ATST_GREATER:
|
||||
// t = (ga >> 16) < m_local.gd->aref;
|
||||
vpsrld(xmm0, _ga, 16);
|
||||
vmovdqa(xmm1, ptr[_m_local__gd + offsetof(GSScanlineGlobalData, aref)]);
|
||||
vmovdqa(xmm1, _rip_global(aref));
|
||||
vpcmpgtd(xmm1, xmm0);
|
||||
break;
|
||||
|
||||
case ATST_NOTEQUAL:
|
||||
// t = (ga >> 16) == m_local.gd->aref;
|
||||
vpsrld(xmm1, _ga, 16);
|
||||
vpcmpeqd(xmm1, ptr[_m_local__gd + offsetof(GSScanlineGlobalData, aref)]);
|
||||
vpcmpeqd(xmm1, _rip_global(aref));
|
||||
break;
|
||||
}
|
||||
|
||||
|
@ -1422,8 +1431,8 @@ void GSDrawScanlineCodeGenerator::Fog_AVX()
|
|||
|
||||
vmovdqa(xmm6, _ga);
|
||||
|
||||
vmovdqa(xmm0, ptr[_m_local__gd + offsetof(GSScanlineGlobalData, frb)]);
|
||||
vmovdqa(xmm1, ptr[_m_local__gd + offsetof(GSScanlineGlobalData, fga)]);
|
||||
vmovdqa(xmm0, _rip_global(frb));
|
||||
vmovdqa(xmm1, _rip_global(fga));
|
||||
|
||||
lerp16(_rb, xmm0, _f, 0);
|
||||
lerp16(_ga, xmm1, _f, 0);
|
||||
|
@ -1551,19 +1560,19 @@ void GSDrawScanlineCodeGenerator::WriteZBuf_AVX()
|
|||
|
||||
if (m_sel.prim != GS_SPRITE_CLASS)
|
||||
#ifdef _WIN64
|
||||
vmovdqa(xmm1, ptr[_m_local + offsetof(GSScanlineLocalData, temp.zs)]);
|
||||
vmovdqa(xmm1, _rip_local(temp.zs));
|
||||
#else
|
||||
vmovdqa(xmm1, ptr[rsp + _rz_zs]);
|
||||
#endif
|
||||
else
|
||||
vmovdqa(xmm1, ptr[_m_local + offsetof(GSScanlineLocalData, p.z)]);
|
||||
vmovdqa(xmm1, _rip_local(p.z));
|
||||
|
||||
if(m_sel.ztest && m_sel.zpsm < 2)
|
||||
{
|
||||
// zs = zs.blend8(zd, zm);
|
||||
|
||||
#ifdef _WIN64
|
||||
vpblendvb(xmm1, ptr[_m_local + offsetof(GSScanlineLocalData, temp.zd)], _zm);
|
||||
vpblendvb(xmm1, _rip_local(temp.zd), _zm);
|
||||
#else
|
||||
vpblendvb(xmm1, ptr[rsp + _rz_zd], _zm);
|
||||
#endif
|
||||
|
@ -1676,7 +1685,7 @@ void GSDrawScanlineCodeGenerator::AlphaBlend_AVX()
|
|||
vpsllw(xmm15, 7);
|
||||
break;
|
||||
case 2:
|
||||
vmovdqa(xmm15, ptr[_m_local__gd + offsetof(GSScanlineGlobalData, afix)]);
|
||||
vmovdqa(xmm15, _rip_global(afix));
|
||||
break;
|
||||
}
|
||||
|
||||
|
@ -1807,7 +1816,7 @@ void GSDrawScanlineCodeGenerator::WriteFrame_AVX()
|
|||
|
||||
if(m_sel.fpsm == 2 && m_sel.dthe)
|
||||
{
|
||||
mov(a3, ptr[_m_local__gd + offsetof(GSScanlineGlobalData, dimx)]);
|
||||
mov(a3, _rip_global(dimx));
|
||||
|
||||
// y = (top & 3) << 5
|
||||
|
||||
|
|
Loading…
Reference in New Issue