GS/SW: Make local a function parameter to DrawScanline

This commit is contained in:
Stenzek 2023-01-25 20:51:13 +10:00 committed by refractionpcsx2
parent 3292121e67
commit ee4eadf7a6
8 changed files with 67 additions and 101 deletions

View File

@ -114,8 +114,9 @@ typedef GSVector4 VectorF;
#define LOCAL_STEP local.d4
#endif
void GSDrawScanline::CSetupPrim(const GSVertexSW* vertex, const u32* index, const GSVertexSW& dscan, GSScanlineLocalData& local, const GSScanlineGlobalData& global)
void GSDrawScanline::CSetupPrim(const GSVertexSW* vertex, const u32* index, const GSVertexSW& dscan, GSScanlineLocalData& local)
{
const GSScanlineGlobalData& global = *local.gd;
GSScanlineSelector sel = global.sel;
bool has_z = sel.zb != 0;
@ -297,8 +298,10 @@ void GSDrawScanline::CSetupPrim(const GSVertexSW* vertex, const u32* index, cons
}
}
void GSDrawScanline::CDrawScanline(int pixels, int left, int top, const GSVertexSW& scan, GSScanlineLocalData& local, const GSScanlineGlobalData& global)
void GSDrawScanline::CDrawScanline(int pixels, int left, int top, const GSVertexSW& scan, GSScanlineLocalData& local)
{
const GSScanlineGlobalData& global = *local.gd;
GSScanlineSelector sel = global.sel;
constexpr int vlen = sizeof(VectorF) / sizeof(float);
@ -1575,13 +1578,13 @@ void GSDrawScanline::CDrawScanline(int pixels, int left, int top, const GSVertex
#ifndef ENABLE_JIT_RASTERIZER
void GSDrawScanline::SetupPrim(const GSVertexSW* vertex, const u32* index, const GSVertexSW& dscan, GSScanlineLocalData& local)
{
CSetupPrim(vertex, index, dscan, local, *local.gd);
CSetupPrim(vertex, index, dscan, local);
}
void GSDrawScanline::DrawScanline(int pixels, int left, int top, const GSVertexSW& scan)
void GSDrawScanline::DrawScanline(int pixels, int left, int top, const GSVertexSW& scan, GSScanlineLocalData& local)
{
CDrawScanline(pixels, left, top, scan, m_local, m_global);
CDrawScanline(pixels, left, top, scan, local);
}
void GSDrawScanline::DrawEdge(int pixels, int left, int top, const GSVertexSW& scan)
void GSDrawScanline::DrawEdge(int pixels, int left, int top, const GSVertexSW& scan, GSScanlineLocalData& local)
{
u32 zwrite = m_global.sel.zwrite;
u32 edge = m_global.sel.edge;
@ -1589,7 +1592,7 @@ void GSDrawScanline::DrawEdge(int pixels, int left, int top, const GSVertexSW& s
m_global.sel.zwrite = 0;
m_global.sel.edge = 1;
CDrawScanline(pixels, left, top, scan, m_local, m_global);
CDrawScanline(pixels, left, top, scan, local);
m_global.sel.zwrite = zwrite;
m_global.sel.edge = edge;

View File

@ -34,7 +34,7 @@ public:
};
typedef void (*SetupPrimPtr)(const GSVertexSW* vertex, const u32* index, const GSVertexSW& dscan, GSScanlineLocalData& local);
typedef void (*DrawScanlinePtr)(int pixels, int left, int top, const GSVertexSW& scan);
typedef void (*DrawScanlinePtr)(int pixels, int left, int top, const GSVertexSW& scan, GSScanlineLocalData& local);
protected:
GSScanlineGlobalData m_global = {};
@ -80,8 +80,8 @@ public:
void BeginDraw(const GSRasterizerData* data);
void EndDraw(u64 frame, u64 ticks, int actual, int total, int prims);
static void CSetupPrim(const GSVertexSW* vertex, const u32* index, const GSVertexSW& dscan, GSScanlineLocalData& local, const GSScanlineGlobalData& global);
static void CDrawScanline(int pixels, int left, int top, const GSVertexSW& scan, GSScanlineLocalData& local, const GSScanlineGlobalData& global);
static void CSetupPrim(const GSVertexSW* vertex, const u32* index, const GSVertexSW& dscan, GSScanlineLocalData& local);
static void CDrawScanline(int pixels, int left, int top, const GSVertexSW& scan, GSScanlineLocalData& local);
template<class T> static bool TestAlpha(T& test, T& fm, T& zm, const T& ga, const GSScanlineGlobalData& global);
template<class T> static void WritePixel(const T& src, int addr, int i, u32 psm, const GSScanlineGlobalData& global);
@ -89,14 +89,14 @@ public:
#ifdef ENABLE_JIT_RASTERIZER
__forceinline void SetupPrim(const GSVertexSW* vertex, const u32* index, const GSVertexSW& dscan, GSScanlineLocalData& local) { m_sp(vertex, index, dscan, local); }
__forceinline void DrawScanline(int pixels, int left, int top, const GSVertexSW& scan) { m_ds(pixels, left, top, scan); }
__forceinline void DrawEdge(int pixels, int left, int top, const GSVertexSW& scan) { m_de(pixels, left, top, scan); }
__forceinline void DrawScanline(int pixels, int left, int top, const GSVertexSW& scan, GSScanlineLocalData& local) { m_ds(pixels, left, top, scan, local); }
__forceinline void DrawEdge(int pixels, int left, int top, const GSVertexSW& scan, GSScanlineLocalData& local) { m_de(pixels, left, top, scan, local); }
#else
void SetupPrim(const GSVertexSW* vertex, const u32* index, const GSVertexSW& dscan, GSScanlineLocalData& local);
void DrawScanline(int pixels, int left, int top, const GSVertexSW& scan);
void DrawEdge(int pixels, int left, int top, const GSVertexSW& scan);
void DrawScanline(int pixels, int left, int top, const GSVertexSW& scan, GSScanlineLocalData& local);
void DrawEdge(int pixels, int left, int top, const GSVertexSW& scan, GSScanlineLocalData& local);
#endif

View File

@ -21,19 +21,8 @@
MULTI_ISA_UNSHARED_IMPL;
using namespace Xbyak;
// Ease the reading of the code
// Note, there are versions without the _64 prefix that can be used as source (but not destination) operands on both 32 and 64 bit
#define _64_g_const r10
#define _64_m_local r12
#define _64_m_local__gd r13
#define _64_m_local__gd__vm t3
#define _64_m_local__gd__clut r11
// If use_lod, m_local.gd->tex, else m_local.gd->tex[0]
#define _64_m_local__gd__tex r14
#define _rip_local_(ptrtype, field) ((m_rip) ? ptrtype[rip + (char*)&m_local.field] : ptrtype[_m_local + OFFSETOF(GSScanlineLocalData, field)])
#define _rip_local(field) _rip_local_(ptr, field)
#define _rip_global(field) ((m_rip) ? ptr[rip + (char*)&m_local.gd->field] : ptr[_m_local__gd + OFFSETOF(GSScanlineGlobalData, field)])
#define _rip_local(field) ptr[_m_local + OFFSETOF(GSScanlineLocalData, field)]
#define _rip_global(field) ptr[_m_local__gd + OFFSETOF(GSScanlineGlobalData, field)]
/// On AVX, does a v-prefixed separate destination operation
/// On SSE, moves src1 into dst using movdqa, then does the operation
@ -84,22 +73,29 @@ using namespace Xbyak;
GSDrawScanlineCodeGenerator2::GSDrawScanlineCodeGenerator2(Xbyak::CodeGenerator* base, const ProcessorFeatures& cpu, void* param, u64 key)
: _parent(base, cpu)
, m_local(*(GSScanlineLocalData*)param)
, m_rip(false)
#ifdef _WIN32
, a0(rcx), a1(rdx)
, a2(r8) , a3(r9)
, t0(rdi), t1(rsi)
, t2(r8) , t3(r9)
, _g_const(r10)
, _m_local(r12)
, _m_local__gd(r13)
, _m_local__gd__vm(t3)
, _m_local__gd__clut(r11)
, _m_local__gd__tex(r14)
#else
, a0(rdi), a1(rsi)
, a2(rdx), a3(rcx)
, t0(r8) , t1(r9)
, t0(r12), t1(r9)
, t2(rcx), t3(rsi)
, _g_const(r10)
, _m_local(r8)
, _m_local__gd(r13)
, _m_local__gd__vm(t3)
, _m_local__gd__clut(r11)
, _m_local__gd__tex(r14)
#endif
, _g_const(chooseLocal(&g_const, _64_g_const))
, _m_local(chooseLocal(&m_local, _64_m_local))
, _m_local__gd(chooseLocal(m_local.gd, _64_m_local__gd))
, _m_local__gd__vm(chooseLocal(m_local.gd->vm, _64_m_local__gd__vm))
, _rb(xym5), _ga(xym6), _fm(xym3), _zm(xym4), _fd(xym2), _test(xym15)
, _z(xym8), _f(xym9), _s(xym10), _t(xym11), _q(xym12), _f_rb(xym13), _f_ga(xym14)
{
@ -111,10 +107,10 @@ GSDrawScanlineCodeGenerator2::GSDrawScanlineCodeGenerator2(Xbyak::CodeGenerator*
// MARK: - Helpers
GSDrawScanlineCodeGenerator2::LocalAddr GSDrawScanlineCodeGenerator2::loadAddress(AddressReg reg, const void* addr)
GSDrawScanlineCodeGenerator2::AddressReg GSDrawScanlineCodeGenerator2::loadAddress(AddressReg reg, const void* addr)
{
mov(reg, (size_t)addr);
return choose3264((size_t)addr, reg);
return reg;
}
void GSDrawScanlineCodeGenerator2::broadcastf128(const XYm& reg, const Address& mem)
@ -335,14 +331,11 @@ void GSDrawScanlineCodeGenerator2::split16_2x8(const XYm& l, const XYm& h, const
void GSDrawScanlineCodeGenerator2::Generate()
{
bool need_tex = m_sel.fb && m_sel.tfx != TFX_NONE;
bool need_clut = need_tex && m_sel.tlu;
m_rip = (size_t)getCurr() < 0x80000000;
m_rip &= (size_t)&m_local < 0x80000000;
m_rip &= (size_t)&m_local.gd < 0x80000000;
const bool need_tex = m_sel.fb && m_sel.tfx != TFX_NONE;
const bool need_clut = need_tex && m_sel.tlu;
push(rbp);
mov(rbp, rsp); // Stack traces look much nicer this way
mov(rbp, rsp); // Stack traces look much nicer this way, TODO drop in release builds
#ifdef _WIN32
push(rbx);
push(rsi);
@ -359,23 +352,23 @@ void GSDrawScanlineCodeGenerator2::Generate()
}
#else
mov(ptr[rsp + _64_rz_rbx], rbx);
if (!m_rip)
{
mov(ptr[rsp + _64_rz_r12], r12);
mov(ptr[rsp + _64_rz_r13], r13);
}
mov(ptr[rsp + _64_rz_r12], r12);
mov(ptr[rsp + _64_rz_r13], r13);
mov(ptr[rsp + _64_rz_r14], r14);
mov(ptr[rsp + _64_rz_r15], r15);
#endif
mov(_64_g_const, (size_t)&g_const);
if (!m_rip)
{
mov(_64_m_local, (size_t)&m_local);
mov(_64_m_local__gd, _rip_local(gd));
}
mov(_g_const, (size_t)&g_const);
#ifdef _WIN32
// Local (5th arg) is passed on the stack in Windows.
// 32 bytes of shadow space plus the 7 pushed registers and the return address = 96.
mov(_m_local, ptr[rsp + _64_win_stack_size + 96]);
#endif
mov(_m_local__gd, _rip_local(gd));
if (need_clut)
mov(_64_m_local__gd__clut, _rip_global(clut));
mov(_m_local__gd__clut, _rip_global(clut));
Init();
@ -398,7 +391,7 @@ L("loop");
// xym7 = test | z0
// xym15 = | test
bool tme = m_sel.tfx != TFX_NONE;
const bool tme = m_sel.tfx != TFX_NONE;
TestZ(tme ? xym5 : xym2, tme ? xym6 : xym3);
@ -609,11 +602,8 @@ L("exit");
pop(rbx);
#else
mov(rbx, ptr[rsp + _64_rz_rbx]);
if (!m_rip)
{
mov(r12, ptr[rsp + _64_rz_r12]);
mov(r13, ptr[rsp + _64_rz_r13]);
}
mov(r12, ptr[rsp + _64_rz_r12]);
mov(r13, ptr[rsp + _64_rz_r13]);
mov(r14, ptr[rsp + _64_rz_r14]);
mov(r15, ptr[rsp + _64_rz_r15]);
#endif
@ -889,13 +879,13 @@ void GSDrawScanlineCodeGenerator2::Init()
mov(ptr[rsp + _top], a2);
}
mov(_64_m_local__gd__vm, _rip_global(vm));
mov(_m_local__gd__vm, _rip_global(vm));
if (m_sel.fb && m_sel.tfx != TFX_NONE)
{
if (use_lod)
lea(_64_m_local__gd__tex, _rip_global(tex));
lea(_m_local__gd__tex, _rip_global(tex));
else
mov(_64_m_local__gd__tex, _rip_global(tex));
mov(_m_local__gd__tex, _rip_global(tex));
}
}
@ -3177,7 +3167,7 @@ void GSDrawScanlineCodeGenerator2::ReadTexelImpl(
void GSDrawScanlineCodeGenerator2::ReadTexelImplLoadTexLOD(int lod, int mip_offset)
{
AddressReg texIn = _64_m_local__gd__tex;
AddressReg texIn = _m_local__gd__tex;
Address lod_addr = m_sel.lcm ? _rip_global(lod.i.U32[lod]) : _rip_local(temp.lod.i.U32[lod]);
mov(ebx, lod_addr);
mov(rbx, ptr[texIn + rbx * wordsize + mip_offset]);
@ -3230,7 +3220,7 @@ void GSDrawScanlineCodeGenerator2::ReadTexelImplYmm(
}
else
{
AddressReg tex = texInRBX ? rbx : _64_m_local__gd__tex;
AddressReg tex = texInRBX ? rbx : _m_local__gd__tex;
if (!m_sel.tlu)
{
pcmpeqd(t1[i], t1[i]);
@ -3309,8 +3299,8 @@ void GSDrawScanlineCodeGenerator2::ReadTexelImpl(const Xmm& dst, const Xmm& addr
{
ASSERT(i < 4);
AddressReg clut = _64_m_local__gd__clut;
AddressReg tex = texInRBX ? rbx : _64_m_local__gd__tex;
AddressReg clut = _m_local__gd__clut;
AddressReg tex = texInRBX ? rbx : _m_local__gd__tex;
Address src = m_sel.tlu ? ptr[clut + rax * 4] : ptr[tex + rax * 4];
// Extract address offset

View File

@ -38,10 +38,6 @@ class GSDrawScanlineCodeGenerator2 : public GSNewCodeGenerator
using _parent = GSNewCodeGenerator;
using XYm = DRAW_SCANLINE_VECTOR_REGISTER;
/// On x86-64 we reserve a bunch of GPRs for holding addresses of locals that would otherwise be hard to reach
/// On x86-32 the same values are just raw 32-bit addresses
using LocalAddr = Choose3264<size_t, AddressReg>::type;
constexpr static bool isXmm = std::is_same<XYm, Xbyak::Xmm>::value;
constexpr static bool isYmm = std::is_same<XYm, Xbyak::Ymm>::value;
constexpr static int wordsize = 8;
@ -74,32 +70,27 @@ class GSDrawScanlineCodeGenerator2 : public GSNewCodeGenerator
GSScanlineSelector m_sel;
GSScanlineLocalData& m_local;
bool m_rip;
bool use_lod;
const XYm xym0{0}, xym1{1}, xym2{2}, xym3{3}, xym4{4}, xym5{5}, xym6{6}, xym7{7}, xym8{8}, xym9{9}, xym10{10}, xym11{11}, xym12{12}, xym13{13}, xym14{14}, xym15{15};
/// Note: a2 and t3 are only available on x86-64
/// Outside of Init, usable registers are a0, t0, t1, t2, t3[x64], rax, rbx, rdx, r10+
const AddressReg a0, a1, a2, a3, t0, t1, t2, t3;
const LocalAddr _g_const, _m_local, _m_local__gd, _m_local__gd__vm;
const AddressReg _g_const, _m_local, _m_local__gd, _m_local__gd__vm, _m_local__gd__clut;
// If use_lod, m_local.gd->tex, else m_local.gd->tex[0]
const AddressReg _m_local__gd__tex;
/// Available on both x86 and x64, not always valid
const XYm _rb, _ga, _fm, _zm, _fd, _test;
/// Always valid if needed, x64 only
const XYm _z, _f, _s, _t, _q, _f_rb, _f_ga;
/// Returns the first arg on 32-bit, second on 64-bit
static LocalAddr chooseLocal(const void* addr32, AddressReg reg64)
{
return choose3264((size_t)addr32, reg64);
}
public:
GSDrawScanlineCodeGenerator2(Xbyak::CodeGenerator* base, const ProcessorFeatures& cpu, void* param, u64 key);
void Generate();
private:
/// Loads the given address into the given register and returns that register, which can be used in a `ptr[]`
LocalAddr loadAddress(AddressReg reg, const void* addr);
AddressReg loadAddress(AddressReg reg, const void* addr);
/// Broadcast 128 bits of floats from memory to the whole register, whatever size that register might be
void broadcastf128(const XYm& reg, const Xbyak::Address& mem);
/// Broadcast 128 bits of integers from memory to the whole register, whatever size that register might be

View File

@ -84,7 +84,6 @@ static bool shouldUseCDrawScanline(u64 key)
GSDrawScanlineCodeGenerator::GSDrawScanlineCodeGenerator(void* param, u64 key, void* code, size_t maxsize)
: Xbyak::CodeGenerator(maxsize, code)
, m_local(*(GSScanlineLocalData*)param)
, m_rip(false)
{
m_sel.key = key;
@ -93,18 +92,8 @@ GSDrawScanlineCodeGenerator::GSDrawScanlineCodeGenerator(void* param, u64 key, v
if (shouldUseCDrawScanline(key))
{
#if defined(_WIN32)
mov(r8, reinterpret_cast<size_t>(&m_local));
push(ptr[r8 + offsetof(GSScanlineLocalData, gd)]);
push(r8);
sub(rsp, 32); // CC required shadow space
call(reinterpret_cast<void*>(GSDrawScanline::CDrawScanline));
ret(48);
#else
mov(r8, reinterpret_cast<size_t>(&m_local));
mov(r9, ptr[r8 + offsetof(GSScanlineLocalData, gd)]);
jmp(reinterpret_cast<void*>(GSDrawScanline::CDrawScanline));
#endif
mov(rax, reinterpret_cast<size_t>(GSDrawScanline::CDrawScanline)); // TODO: Get rid of once we move to memory map
jmp(rax);
return;
}

View File

@ -39,7 +39,6 @@ class GSDrawScanlineCodeGenerator : public Xbyak::CodeGenerator
GSScanlineSelector m_sel;
GSScanlineLocalData& m_local;
bool m_rip;
public:
GSDrawScanlineCodeGenerator(void* param, u64 key, void* code, size_t maxsize);

View File

@ -91,12 +91,6 @@ public:
using AddressReg = Xbyak::Reg64;
using RipType = Xbyak::RegRip;
template <typename T32, typename T64>
struct Choose3264 { using type = T64; };
template <typename T32, typename T64>
static T64 choose3264(T32 t32, T64 t64) { return t64; }
const bool hasAVX, hasAVX2, hasFMA;
const Xmm xmm0{0}, xmm1{1}, xmm2{2}, xmm3{3}, xmm4{4}, xmm5{5}, xmm6{6}, xmm7{7}, xmm8{8}, xmm9{9}, xmm10{10}, xmm11{11}, xmm12{12}, xmm13{13}, xmm14{14}, xmm15{15};

View File

@ -1140,7 +1140,7 @@ void GSRasterizer::DrawScanline(int pixels, int left, int top, const GSVertexSW&
ASSERT(m_pixels.actual <= m_pixels.total);
m_ds->DrawScanline(pixels, left, top, scan);
m_ds->DrawScanline(pixels, left, top, scan, m_ds->GetLocalData());
}
void GSRasterizer::DrawEdge(int pixels, int left, int top, const GSVertexSW& scan)
@ -1151,7 +1151,7 @@ void GSRasterizer::DrawEdge(int pixels, int left, int top, const GSVertexSW& sca
ASSERT(m_pixels.actual <= m_pixels.total);
m_ds->DrawEdge(pixels, left, top, scan);
m_ds->DrawEdge(pixels, left, top, scan, m_ds->GetLocalData());
}
void GSRasterizer::Sync()