diff --git a/pcsx2/GS/Renderers/SW/GSDrawScanline.cpp b/pcsx2/GS/Renderers/SW/GSDrawScanline.cpp index 9f58dc2d3e..a249f4e5b0 100644 --- a/pcsx2/GS/Renderers/SW/GSDrawScanline.cpp +++ b/pcsx2/GS/Renderers/SW/GSDrawScanline.cpp @@ -114,8 +114,9 @@ typedef GSVector4 VectorF; #define LOCAL_STEP local.d4 #endif -void GSDrawScanline::CSetupPrim(const GSVertexSW* vertex, const u32* index, const GSVertexSW& dscan, GSScanlineLocalData& local, const GSScanlineGlobalData& global) +void GSDrawScanline::CSetupPrim(const GSVertexSW* vertex, const u32* index, const GSVertexSW& dscan, GSScanlineLocalData& local) { + const GSScanlineGlobalData& global = *local.gd; GSScanlineSelector sel = global.sel; bool has_z = sel.zb != 0; @@ -297,8 +298,10 @@ void GSDrawScanline::CSetupPrim(const GSVertexSW* vertex, const u32* index, cons } } -void GSDrawScanline::CDrawScanline(int pixels, int left, int top, const GSVertexSW& scan, GSScanlineLocalData& local, const GSScanlineGlobalData& global) +void GSDrawScanline::CDrawScanline(int pixels, int left, int top, const GSVertexSW& scan, GSScanlineLocalData& local) { + const GSScanlineGlobalData& global = *local.gd; + GSScanlineSelector sel = global.sel; constexpr int vlen = sizeof(VectorF) / sizeof(float); @@ -1575,13 +1578,13 @@ void GSDrawScanline::CDrawScanline(int pixels, int left, int top, const GSVertex #ifndef ENABLE_JIT_RASTERIZER void GSDrawScanline::SetupPrim(const GSVertexSW* vertex, const u32* index, const GSVertexSW& dscan, GSScanlineLocalData& local) { - CSetupPrim(vertex, index, dscan, local, *local.gd); + CSetupPrim(vertex, index, dscan, local); } -void GSDrawScanline::DrawScanline(int pixels, int left, int top, const GSVertexSW& scan) +void GSDrawScanline::DrawScanline(int pixels, int left, int top, const GSVertexSW& scan, GSScanlineLocalData& local) { - CDrawScanline(pixels, left, top, scan, m_local, m_global); + CDrawScanline(pixels, left, top, scan, local); } -void GSDrawScanline::DrawEdge(int pixels, int left, int top, const GSVertexSW& scan) +void GSDrawScanline::DrawEdge(int pixels, int left, int top, const GSVertexSW& scan, GSScanlineLocalData& local) { u32 zwrite = m_global.sel.zwrite; u32 edge = m_global.sel.edge; @@ -1589,7 +1592,7 @@ void GSDrawScanline::DrawEdge(int pixels, int left, int top, const GSVertexSW& s m_global.sel.zwrite = 0; m_global.sel.edge = 1; - CDrawScanline(pixels, left, top, scan, m_local, m_global); + CDrawScanline(pixels, left, top, scan, local); m_global.sel.zwrite = zwrite; m_global.sel.edge = edge; diff --git a/pcsx2/GS/Renderers/SW/GSDrawScanline.h b/pcsx2/GS/Renderers/SW/GSDrawScanline.h index f4a425b4c4..41bf25a469 100644 --- a/pcsx2/GS/Renderers/SW/GSDrawScanline.h +++ b/pcsx2/GS/Renderers/SW/GSDrawScanline.h @@ -34,7 +34,7 @@ public: }; typedef void (*SetupPrimPtr)(const GSVertexSW* vertex, const u32* index, const GSVertexSW& dscan, GSScanlineLocalData& local); - typedef void (*DrawScanlinePtr)(int pixels, int left, int top, const GSVertexSW& scan); + typedef void (*DrawScanlinePtr)(int pixels, int left, int top, const GSVertexSW& scan, GSScanlineLocalData& local); protected: GSScanlineGlobalData m_global = {}; @@ -80,8 +80,8 @@ public: void BeginDraw(const GSRasterizerData* data); void EndDraw(u64 frame, u64 ticks, int actual, int total, int prims); - static void CSetupPrim(const GSVertexSW* vertex, const u32* index, const GSVertexSW& dscan, GSScanlineLocalData& local, const GSScanlineGlobalData& global); - static void CDrawScanline(int pixels, int left, int top, const GSVertexSW& scan, GSScanlineLocalData& local, const GSScanlineGlobalData& global); + static void CSetupPrim(const GSVertexSW* vertex, const u32* index, const GSVertexSW& dscan, GSScanlineLocalData& local); + static void CDrawScanline(int pixels, int left, int top, const GSVertexSW& scan, GSScanlineLocalData& local); template static bool TestAlpha(T& test, T& fm, T& zm, const T& ga, const GSScanlineGlobalData& global); template static void WritePixel(const T& src, int addr, int i, u32 psm, const GSScanlineGlobalData& global); @@ -89,14 +89,14 @@ public: #ifdef ENABLE_JIT_RASTERIZER __forceinline void SetupPrim(const GSVertexSW* vertex, const u32* index, const GSVertexSW& dscan, GSScanlineLocalData& local) { m_sp(vertex, index, dscan, local); } - __forceinline void DrawScanline(int pixels, int left, int top, const GSVertexSW& scan) { m_ds(pixels, left, top, scan); } - __forceinline void DrawEdge(int pixels, int left, int top, const GSVertexSW& scan) { m_de(pixels, left, top, scan); } + __forceinline void DrawScanline(int pixels, int left, int top, const GSVertexSW& scan, GSScanlineLocalData& local) { m_ds(pixels, left, top, scan, local); } + __forceinline void DrawEdge(int pixels, int left, int top, const GSVertexSW& scan, GSScanlineLocalData& local) { m_de(pixels, left, top, scan, local); } #else void SetupPrim(const GSVertexSW* vertex, const u32* index, const GSVertexSW& dscan, GSScanlineLocalData& local); - void DrawScanline(int pixels, int left, int top, const GSVertexSW& scan); - void DrawEdge(int pixels, int left, int top, const GSVertexSW& scan); + void DrawScanline(int pixels, int left, int top, const GSVertexSW& scan, GSScanlineLocalData& local); + void DrawEdge(int pixels, int left, int top, const GSVertexSW& scan, GSScanlineLocalData& local); #endif diff --git a/pcsx2/GS/Renderers/SW/GSDrawScanlineCodeGenerator.all.cpp b/pcsx2/GS/Renderers/SW/GSDrawScanlineCodeGenerator.all.cpp index 32e96ee54a..cb0f7dc8d9 100644 --- a/pcsx2/GS/Renderers/SW/GSDrawScanlineCodeGenerator.all.cpp +++ b/pcsx2/GS/Renderers/SW/GSDrawScanlineCodeGenerator.all.cpp @@ -21,19 +21,8 @@ MULTI_ISA_UNSHARED_IMPL; using namespace Xbyak; -// Ease the reading of the code -// Note, there are versions without the _64 prefix that can be used as source (but not destination) operands on both 32 and 64 bit -#define _64_g_const r10 -#define _64_m_local r12 -#define _64_m_local__gd r13 -#define _64_m_local__gd__vm t3 -#define _64_m_local__gd__clut r11 -// If use_lod, m_local.gd->tex, else m_local.gd->tex[0] -#define _64_m_local__gd__tex r14 - -#define _rip_local_(ptrtype, field) ((m_rip) ? ptrtype[rip + (char*)&m_local.field] : ptrtype[_m_local + OFFSETOF(GSScanlineLocalData, field)]) -#define _rip_local(field) _rip_local_(ptr, field) -#define _rip_global(field) ((m_rip) ? ptr[rip + (char*)&m_local.gd->field] : ptr[_m_local__gd + OFFSETOF(GSScanlineGlobalData, field)]) +#define _rip_local(field) ptr[_m_local + OFFSETOF(GSScanlineLocalData, field)] +#define _rip_global(field) ptr[_m_local__gd + OFFSETOF(GSScanlineGlobalData, field)] /// On AVX, does a v-prefixed separate destination operation /// On SSE, moves src1 into dst using movdqa, then does the operation @@ -84,22 +73,29 @@ using namespace Xbyak; GSDrawScanlineCodeGenerator2::GSDrawScanlineCodeGenerator2(Xbyak::CodeGenerator* base, const ProcessorFeatures& cpu, void* param, u64 key) : _parent(base, cpu) , m_local(*(GSScanlineLocalData*)param) - , m_rip(false) #ifdef _WIN32 , a0(rcx), a1(rdx) , a2(r8) , a3(r9) , t0(rdi), t1(rsi) , t2(r8) , t3(r9) + , _g_const(r10) + , _m_local(r12) + , _m_local__gd(r13) + , _m_local__gd__vm(t3) + , _m_local__gd__clut(r11) + , _m_local__gd__tex(r14) #else , a0(rdi), a1(rsi) , a2(rdx), a3(rcx) - , t0(r8) , t1(r9) + , t0(r12), t1(r9) , t2(rcx), t3(rsi) + , _g_const(r10) + , _m_local(r8) + , _m_local__gd(r13) + , _m_local__gd__vm(t3) + , _m_local__gd__clut(r11) + , _m_local__gd__tex(r14) #endif - , _g_const(chooseLocal(&g_const, _64_g_const)) - , _m_local(chooseLocal(&m_local, _64_m_local)) - , _m_local__gd(chooseLocal(m_local.gd, _64_m_local__gd)) - , _m_local__gd__vm(chooseLocal(m_local.gd->vm, _64_m_local__gd__vm)) , _rb(xym5), _ga(xym6), _fm(xym3), _zm(xym4), _fd(xym2), _test(xym15) , _z(xym8), _f(xym9), _s(xym10), _t(xym11), _q(xym12), _f_rb(xym13), _f_ga(xym14) { @@ -111,10 +107,10 @@ GSDrawScanlineCodeGenerator2::GSDrawScanlineCodeGenerator2(Xbyak::CodeGenerator* // MARK: - Helpers -GSDrawScanlineCodeGenerator2::LocalAddr GSDrawScanlineCodeGenerator2::loadAddress(AddressReg reg, const void* addr) +GSDrawScanlineCodeGenerator2::AddressReg GSDrawScanlineCodeGenerator2::loadAddress(AddressReg reg, const void* addr) { mov(reg, (size_t)addr); - return choose3264((size_t)addr, reg); + return reg; } void GSDrawScanlineCodeGenerator2::broadcastf128(const XYm& reg, const Address& mem) @@ -335,14 +331,11 @@ void GSDrawScanlineCodeGenerator2::split16_2x8(const XYm& l, const XYm& h, const void GSDrawScanlineCodeGenerator2::Generate() { - bool need_tex = m_sel.fb && m_sel.tfx != TFX_NONE; - bool need_clut = need_tex && m_sel.tlu; - m_rip = (size_t)getCurr() < 0x80000000; - m_rip &= (size_t)&m_local < 0x80000000; - m_rip &= (size_t)&m_local.gd < 0x80000000; + const bool need_tex = m_sel.fb && m_sel.tfx != TFX_NONE; + const bool need_clut = need_tex && m_sel.tlu; push(rbp); - mov(rbp, rsp); // Stack traces look much nicer this way + mov(rbp, rsp); // Stack traces look much nicer this way, TODO drop in release builds #ifdef _WIN32 push(rbx); push(rsi); @@ -359,23 +352,23 @@ void GSDrawScanlineCodeGenerator2::Generate() } #else mov(ptr[rsp + _64_rz_rbx], rbx); - if (!m_rip) - { - mov(ptr[rsp + _64_rz_r12], r12); - mov(ptr[rsp + _64_rz_r13], r13); - } + mov(ptr[rsp + _64_rz_r12], r12); + mov(ptr[rsp + _64_rz_r13], r13); mov(ptr[rsp + _64_rz_r14], r14); mov(ptr[rsp + _64_rz_r15], r15); #endif - mov(_64_g_const, (size_t)&g_const); - if (!m_rip) - { - mov(_64_m_local, (size_t)&m_local); - mov(_64_m_local__gd, _rip_local(gd)); - } + mov(_g_const, (size_t)&g_const); + +#ifdef _WIN32 + // Local (5th arg) is passed on the stack in Windows. + // 32 bytes shadow space less the 7 pushed registers and return address = 96. + mov(_m_local, ptr[rsp + _64_win_stack_size + 96]); +#endif + + mov(_m_local__gd, _rip_local(gd)); if (need_clut) - mov(_64_m_local__gd__clut, _rip_global(clut)); + mov(_m_local__gd__clut, _rip_global(clut)); Init(); @@ -398,7 +391,7 @@ L("loop"); // xym7 = test | z0 // xym15 = | test - bool tme = m_sel.tfx != TFX_NONE; + const bool tme = m_sel.tfx != TFX_NONE; TestZ(tme ? xym5 : xym2, tme ? xym6 : xym3); @@ -609,11 +602,8 @@ L("exit"); pop(rbx); #else mov(rbx, ptr[rsp + _64_rz_rbx]); - if (!m_rip) - { - mov(r12, ptr[rsp + _64_rz_r12]); - mov(r13, ptr[rsp + _64_rz_r13]); - } + mov(r12, ptr[rsp + _64_rz_r12]); + mov(r13, ptr[rsp + _64_rz_r13]); mov(r14, ptr[rsp + _64_rz_r14]); mov(r15, ptr[rsp + _64_rz_r15]); #endif @@ -889,13 +879,13 @@ void GSDrawScanlineCodeGenerator2::Init() mov(ptr[rsp + _top], a2); } - mov(_64_m_local__gd__vm, _rip_global(vm)); + mov(_m_local__gd__vm, _rip_global(vm)); if (m_sel.fb && m_sel.tfx != TFX_NONE) { if (use_lod) - lea(_64_m_local__gd__tex, _rip_global(tex)); + lea(_m_local__gd__tex, _rip_global(tex)); else - mov(_64_m_local__gd__tex, _rip_global(tex)); + mov(_m_local__gd__tex, _rip_global(tex)); } } @@ -3177,7 +3167,7 @@ void GSDrawScanlineCodeGenerator2::ReadTexelImpl( void GSDrawScanlineCodeGenerator2::ReadTexelImplLoadTexLOD(int lod, int mip_offset) { - AddressReg texIn = _64_m_local__gd__tex; + AddressReg texIn = _m_local__gd__tex; Address lod_addr = m_sel.lcm ? _rip_global(lod.i.U32[lod]) : _rip_local(temp.lod.i.U32[lod]); mov(ebx, lod_addr); mov(rbx, ptr[texIn + rbx * wordsize + mip_offset]); @@ -3230,7 +3220,7 @@ void GSDrawScanlineCodeGenerator2::ReadTexelImplYmm( } else { - AddressReg tex = texInRBX ? rbx : _64_m_local__gd__tex; + AddressReg tex = texInRBX ? rbx : _m_local__gd__tex; if (!m_sel.tlu) { pcmpeqd(t1[i], t1[i]); @@ -3309,8 +3299,8 @@ void GSDrawScanlineCodeGenerator2::ReadTexelImpl(const Xmm& dst, const Xmm& addr { ASSERT(i < 4); - AddressReg clut = _64_m_local__gd__clut; - AddressReg tex = texInRBX ? rbx : _64_m_local__gd__tex; + AddressReg clut = _m_local__gd__clut; + AddressReg tex = texInRBX ? rbx : _m_local__gd__tex; Address src = m_sel.tlu ? ptr[clut + rax * 4] : ptr[tex + rax * 4]; // Extract address offset diff --git a/pcsx2/GS/Renderers/SW/GSDrawScanlineCodeGenerator.all.h b/pcsx2/GS/Renderers/SW/GSDrawScanlineCodeGenerator.all.h index 0f2d363321..395d759216 100644 --- a/pcsx2/GS/Renderers/SW/GSDrawScanlineCodeGenerator.all.h +++ b/pcsx2/GS/Renderers/SW/GSDrawScanlineCodeGenerator.all.h @@ -38,10 +38,6 @@ class GSDrawScanlineCodeGenerator2 : public GSNewCodeGenerator using _parent = GSNewCodeGenerator; using XYm = DRAW_SCANLINE_VECTOR_REGISTER; - /// On x86-64 we reserve a bunch of GPRs for holding addresses of locals that would otherwise be hard to reach - /// On x86-32 the same values are just raw 32-bit addresses - using LocalAddr = Choose3264::type; - constexpr static bool isXmm = std::is_same::value; constexpr static bool isYmm = std::is_same::value; constexpr static int wordsize = 8; @@ -74,32 +70,27 @@ class GSDrawScanlineCodeGenerator2 : public GSNewCodeGenerator GSScanlineSelector m_sel; GSScanlineLocalData& m_local; - bool m_rip; bool use_lod; const XYm xym0{0}, xym1{1}, xym2{2}, xym3{3}, xym4{4}, xym5{5}, xym6{6}, xym7{7}, xym8{8}, xym9{9}, xym10{10}, xym11{11}, xym12{12}, xym13{13}, xym14{14}, xym15{15}; /// Note: a2 and t3 are only available on x86-64 /// Outside of Init, usable registers are a0, t0, t1, t2, t3[x64], rax, rbx, rdx, r10+ const AddressReg a0, a1, a2, a3, t0, t1, t2, t3; - const LocalAddr _g_const, _m_local, _m_local__gd, _m_local__gd__vm; + const AddressReg _g_const, _m_local, _m_local__gd, _m_local__gd__vm, _m_local__gd__clut; + // If use_lod, m_local.gd->tex, else m_local.gd->tex[0] + const AddressReg _m_local__gd__tex; /// Available on both x86 and x64, not always valid const XYm _rb, _ga, _fm, _zm, _fd, _test; /// Always valid if needed, x64 only const XYm _z, _f, _s, _t, _q, _f_rb, _f_ga; - /// Returns the first arg on 32-bit, second on 64-bit - static LocalAddr chooseLocal(const void* addr32, AddressReg reg64) - { - return choose3264((size_t)addr32, reg64); - } - public: GSDrawScanlineCodeGenerator2(Xbyak::CodeGenerator* base, const ProcessorFeatures& cpu, void* param, u64 key); void Generate(); private: /// Loads the given address into the given register if needed, and returns something that can be used in a `ptr[]` - LocalAddr loadAddress(AddressReg reg, const void* addr); + AddressReg loadAddress(AddressReg reg, const void* addr); /// Broadcast 128 bits of floats from memory to the whole register, whatever size that register might be void broadcastf128(const XYm& reg, const Xbyak::Address& mem); /// Broadcast 128 bits of integers from memory to the whole register, whatever size that register might be diff --git a/pcsx2/GS/Renderers/SW/GSDrawScanlineCodeGenerator.cpp b/pcsx2/GS/Renderers/SW/GSDrawScanlineCodeGenerator.cpp index 5ff37c8278..9b214a6cb0 100644 --- a/pcsx2/GS/Renderers/SW/GSDrawScanlineCodeGenerator.cpp +++ b/pcsx2/GS/Renderers/SW/GSDrawScanlineCodeGenerator.cpp @@ -84,7 +84,6 @@ static bool shouldUseCDrawScanline(u64 key) GSDrawScanlineCodeGenerator::GSDrawScanlineCodeGenerator(void* param, u64 key, void* code, size_t maxsize) : Xbyak::CodeGenerator(maxsize, code) , m_local(*(GSScanlineLocalData*)param) - , m_rip(false) { m_sel.key = key; @@ -93,18 +92,8 @@ GSDrawScanlineCodeGenerator::GSDrawScanlineCodeGenerator(void* param, u64 key, v if (shouldUseCDrawScanline(key)) { -#if defined(_WIN32) - mov(r8, reinterpret_cast(&m_local)); - push(ptr[r8 + offsetof(GSScanlineLocalData, gd)]); - push(r8); - sub(rsp, 32); // CC required shadow space - call(reinterpret_cast(GSDrawScanline::CDrawScanline)); - ret(48); -#else - mov(r8, reinterpret_cast(&m_local)); - mov(r9, ptr[r8 + offsetof(GSScanlineLocalData, gd)]); - jmp(reinterpret_cast(GSDrawScanline::CDrawScanline)); -#endif + mov(rax, reinterpret_cast(GSDrawScanline::CDrawScanline)); // TODO: Get rid of once we move to memory map + jmp(rax); return; } diff --git a/pcsx2/GS/Renderers/SW/GSDrawScanlineCodeGenerator.h b/pcsx2/GS/Renderers/SW/GSDrawScanlineCodeGenerator.h index 3221c72000..879bb8aa1b 100644 --- a/pcsx2/GS/Renderers/SW/GSDrawScanlineCodeGenerator.h +++ b/pcsx2/GS/Renderers/SW/GSDrawScanlineCodeGenerator.h @@ -39,7 +39,6 @@ class GSDrawScanlineCodeGenerator : public Xbyak::CodeGenerator GSScanlineSelector m_sel; GSScanlineLocalData& m_local; - bool m_rip; public: GSDrawScanlineCodeGenerator(void* param, u64 key, void* code, size_t maxsize); diff --git a/pcsx2/GS/Renderers/SW/GSNewCodeGenerator.h b/pcsx2/GS/Renderers/SW/GSNewCodeGenerator.h index 8b01522e21..da84a071cc 100644 --- a/pcsx2/GS/Renderers/SW/GSNewCodeGenerator.h +++ b/pcsx2/GS/Renderers/SW/GSNewCodeGenerator.h @@ -91,12 +91,6 @@ public: using AddressReg = Xbyak::Reg64; using RipType = Xbyak::RegRip; - template - struct Choose3264 { using type = T64; }; - - template - static T64 choose3264(T32 t32, T64 t64) { return t64; } - const bool hasAVX, hasAVX2, hasFMA; const Xmm xmm0{0}, xmm1{1}, xmm2{2}, xmm3{3}, xmm4{4}, xmm5{5}, xmm6{6}, xmm7{7}, xmm8{8}, xmm9{9}, xmm10{10}, xmm11{11}, xmm12{12}, xmm13{13}, xmm14{14}, xmm15{15}; diff --git a/pcsx2/GS/Renderers/SW/GSRasterizer.cpp b/pcsx2/GS/Renderers/SW/GSRasterizer.cpp index 6cd2ceebdc..0e062afff9 100644 --- a/pcsx2/GS/Renderers/SW/GSRasterizer.cpp +++ b/pcsx2/GS/Renderers/SW/GSRasterizer.cpp @@ -1140,7 +1140,7 @@ void GSRasterizer::DrawScanline(int pixels, int left, int top, const GSVertexSW& ASSERT(m_pixels.actual <= m_pixels.total); - m_ds->DrawScanline(pixels, left, top, scan); + m_ds->DrawScanline(pixels, left, top, scan, m_ds->GetLocalData()); } void GSRasterizer::DrawEdge(int pixels, int left, int top, const GSVertexSW& scan) @@ -1151,7 +1151,7 @@ void GSRasterizer::DrawEdge(int pixels, int left, int top, const GSVertexSW& sca ASSERT(m_pixels.actual <= m_pixels.total); - m_ds->DrawEdge(pixels, left, top, scan); + m_ds->DrawEdge(pixels, left, top, scan, m_ds->GetLocalData()); } void GSRasterizer::Sync()