From aaca1b01e58728c7513882bc9806d0b64e6e87be Mon Sep 17 00:00:00 2001 From: Fiora Date: Mon, 25 Aug 2014 13:56:01 -0700 Subject: [PATCH] JIT64: clean up and unify float load/store code While we're at it, support a bunch of float load/store variants that weren't implemented in the JIT. Might not have a big speed impact on typical games but they're used at least a bit in povray and luabench. 694 -> 644 seconds on povray. --- Source/Core/Core/PowerPC/Jit64/Jit.h | 9 +- .../Core/Core/PowerPC/Jit64/Jit64_Tables.cpp | 34 +-- .../PowerPC/Jit64/Jit_LoadStoreFloating.cpp | 231 ++++++++++-------- 3 files changed, 149 insertions(+), 125 deletions(-) diff --git a/Source/Core/Core/PowerPC/Jit64/Jit.h b/Source/Core/Core/PowerPC/Jit64/Jit.h index 1c6e082fd1..5a0585f306 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit.h +++ b/Source/Core/Core/PowerPC/Jit64/Jit.h @@ -192,11 +192,9 @@ public: void cntlzwx(UGeckoInstruction inst); - void lfs(UGeckoInstruction inst); - void lfd(UGeckoInstruction inst); - void stfd(UGeckoInstruction inst); - void stfs(UGeckoInstruction inst); - void stfsx(UGeckoInstruction inst); + void lfXXX(UGeckoInstruction inst); + void stfXXX(UGeckoInstruction inst); + void stfiwx(UGeckoInstruction inst); void psq_l(UGeckoInstruction inst); void psq_st(UGeckoInstruction inst); @@ -211,7 +209,6 @@ public: void srwx(UGeckoInstruction inst); void dcbst(UGeckoInstruction inst); void dcbz(UGeckoInstruction inst); - void lfsx(UGeckoInstruction inst); void subfic(UGeckoInstruction inst); void subfcx(UGeckoInstruction inst); diff --git a/Source/Core/Core/PowerPC/Jit64/Jit64_Tables.cpp b/Source/Core/Core/PowerPC/Jit64/Jit64_Tables.cpp index c64cbdca66..5a9c51472f 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit64_Tables.cpp +++ b/Source/Core/Core/PowerPC/Jit64/Jit64_Tables.cpp @@ -82,15 +82,15 @@ static GekkoOPTemplate primarytable[] = {46, &Jit64::lmw}, //"lmw", OPTYPE_SYSTEM, FL_EVIL, 10}}, {47, &Jit64::stmw}, //"stmw", OPTYPE_SYSTEM, FL_EVIL, 10}}, - {48, &Jit64::lfs}, //"lfs", OPTYPE_LOADFP, FL_IN_A}}, - {49, &Jit64::FallBackToInterpreter}, //"lfsu", OPTYPE_LOADFP, FL_OUT_A | FL_IN_A}}, - {50, &Jit64::lfd}, //"lfd", OPTYPE_LOADFP, FL_IN_A}}, - {51, &Jit64::FallBackToInterpreter}, //"lfdu", OPTYPE_LOADFP, FL_OUT_A | FL_IN_A}}, + {48, &Jit64::lfXXX}, //"lfs", OPTYPE_LOADFP, FL_IN_A}}, + {49, &Jit64::lfXXX}, //"lfsu", OPTYPE_LOADFP, FL_OUT_A | FL_IN_A}}, + {50, &Jit64::lfXXX}, //"lfd", OPTYPE_LOADFP, FL_IN_A}}, + {51, &Jit64::lfXXX}, //"lfdu", OPTYPE_LOADFP, FL_OUT_A | FL_IN_A}}, - {52, &Jit64::stfs}, //"stfs", OPTYPE_STOREFP, FL_IN_A}}, - {53, &Jit64::FallBackToInterpreter}, //"stfsu", OPTYPE_STOREFP, FL_OUT_A | FL_IN_A}}, - {54, &Jit64::stfd}, //"stfd", OPTYPE_STOREFP, FL_IN_A}}, - {55, &Jit64::FallBackToInterpreter}, //"stfdu", OPTYPE_STOREFP, FL_OUT_A | FL_IN_A}}, + {52, &Jit64::stfXXX}, //"stfs", OPTYPE_STOREFP, FL_IN_A}}, + {53, &Jit64::stfXXX}, //"stfsu", OPTYPE_STOREFP, FL_OUT_A | FL_IN_A}}, + {54, &Jit64::stfXXX}, //"stfd", OPTYPE_STOREFP, FL_IN_A}}, + {55, &Jit64::stfXXX}, //"stfdu", OPTYPE_STOREFP, FL_OUT_A | FL_IN_A}}, {56, &Jit64::psq_l}, //"psq_l", OPTYPE_PS, FL_IN_A}}, {57, &Jit64::psq_l}, //"psq_lu", OPTYPE_PS, FL_OUT_A | FL_IN_A}}, @@ -253,16 +253,16 @@ static GekkoOPTemplate table31[] = {725, &Jit64::FallBackToInterpreter}, //"stswi", OPTYPE_STORE, FL_EVIL}}, // fp load/store - {535, &Jit64::lfsx}, //"lfsx", OPTYPE_LOADFP, FL_IN_A0 | FL_IN_B}}, - {567, &Jit64::FallBackToInterpreter}, //"lfsux", OPTYPE_LOADFP, FL_IN_A | FL_IN_B}}, - {599, &Jit64::FallBackToInterpreter}, //"lfdx", OPTYPE_LOADFP, FL_IN_A0 | FL_IN_B}}, - {631, &Jit64::FallBackToInterpreter}, //"lfdux", OPTYPE_LOADFP, FL_IN_A | FL_IN_B}}, + {535, &Jit64::lfXXX}, //"lfsx", OPTYPE_LOADFP, FL_IN_A0 | FL_IN_B}}, + {567, &Jit64::lfXXX}, //"lfsux", OPTYPE_LOADFP, FL_IN_A | FL_IN_B}}, + {599, &Jit64::lfXXX}, //"lfdx", OPTYPE_LOADFP, FL_IN_A0 | FL_IN_B}}, + {631, &Jit64::lfXXX}, //"lfdux", OPTYPE_LOADFP, FL_IN_A | FL_IN_B}}, - {663, &Jit64::stfsx}, //"stfsx", OPTYPE_STOREFP, FL_IN_A0 | FL_IN_B}}, - {695, &Jit64::FallBackToInterpreter}, //"stfsux", OPTYPE_STOREFP, FL_IN_A | FL_IN_B}}, - {727, &Jit64::FallBackToInterpreter}, //"stfdx", OPTYPE_STOREFP, FL_IN_A0 | FL_IN_B}}, - {759, &Jit64::FallBackToInterpreter}, //"stfdux", OPTYPE_STOREFP, FL_IN_A | FL_IN_B}}, - {983, &Jit64::FallBackToInterpreter}, //"stfiwx", OPTYPE_STOREFP, FL_IN_A0 | FL_IN_B}}, + {663, &Jit64::stfXXX}, //"stfsx", OPTYPE_STOREFP, FL_IN_A0 | FL_IN_B}}, + {695, &Jit64::stfXXX}, //"stfsux", OPTYPE_STOREFP, FL_IN_A | FL_IN_B}}, + {727, &Jit64::stfXXX}, //"stfdx", OPTYPE_STOREFP, FL_IN_A0 | FL_IN_B}}, + {759, &Jit64::stfXXX}, //"stfdux", OPTYPE_STOREFP, FL_IN_A | FL_IN_B}}, + {983, &Jit64::stfiwx}, //"stfiwx", OPTYPE_STOREFP, FL_IN_A0 | FL_IN_B}}, {19, &Jit64::mfcr}, //"mfcr", OPTYPE_SYSTEM, FL_OUT_D}}, {83, &Jit64::mfmsr}, //"mfmsr", OPTYPE_SYSTEM, FL_OUT_D}}, diff --git a/Source/Core/Core/PowerPC/Jit64/Jit_LoadStoreFloating.cpp b/Source/Core/Core/PowerPC/Jit64/Jit_LoadStoreFloating.cpp index b49de4cea5..d1f7ca9f8f 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit_LoadStoreFloating.cpp +++ b/Source/Core/Core/PowerPC/Jit64/Jit_LoadStoreFloating.cpp @@ -14,134 +14,161 @@ using namespace Gen; // TODO: Add peephole optimizations for multiple consecutive lfd/lfs/stfd/stfs since they are so common, // and pshufb could help a lot. -void Jit64::lfs(UGeckoInstruction inst) +void Jit64::lfXXX(UGeckoInstruction inst) { INSTRUCTION_START JITDISABLE(bJITLoadStoreFloatingOff); + bool indexed = inst.OPCD == 31; + bool update = indexed ? !!(inst.SUBOP10 & 0x20) : !!(inst.OPCD & 1); + bool single = indexed ? !(inst.SUBOP10 & 0x40) : !(inst.OPCD & 2); + update &= indexed || inst.SIMM_16; int d = inst.RD; int a = inst.RA; - FALLBACK_IF(!a); + int b = inst.RB; - s32 offset = (s32)(s16)inst.SIMM_16; + FALLBACK_IF(!indexed && !a); - SafeLoadToReg(EAX, gpr.R(a), 32, offset, CallerSavedRegistersInUse(), false); + if (update) + gpr.BindToRegister(a, true, true); + s32 offset = 0; + OpArg addr = gpr.R(a); + if (indexed) + { + if (update) + { + ADD(32, addr, gpr.R(b)); + } + else + { + addr = R(EAX); + if (a && gpr.R(a).IsSimpleReg() && gpr.R(b).IsSimpleReg()) + LEA(32, EAX, MComplex(gpr.RX(a), gpr.RX(b), SCALE_1, 0)); + else + { + MOV(32, addr, gpr.R(b)); + if (a) + ADD(32, addr, gpr.R(a)); + } + } + } + else + { + if (update) + ADD(32, addr, Imm32((s32)(s16)inst.SIMM_16)); + else + offset = (s32)(s16)inst.SIMM_16; + } + + SafeLoadToReg(RAX, addr, single ? 32 : 64, offset, CallerSavedRegistersInUse(), false); fpr.Lock(d); - fpr.BindToRegister(d, js.memcheck); + fpr.BindToRegister(d, js.memcheck || !single); MEMCHECK_START - ConvertSingleToDouble(fpr.RX(d), EAX, true); + if (single) + { + ConvertSingleToDouble(fpr.RX(d), EAX, true); + } + else + { + MOVQ_xmm(XMM0, R(RAX)); + MOVSD(fpr.RX(d), R(XMM0)); + } MEMCHECK_END - fpr.UnlockAll(); + gpr.UnlockAll(); } - -void Jit64::lfd(UGeckoInstruction inst) +void Jit64::stfXXX(UGeckoInstruction inst) { INSTRUCTION_START JITDISABLE(bJITLoadStoreFloatingOff); - FALLBACK_IF(!inst.RA); - - int d = inst.RD; - int a = inst.RA; - - s32 offset = (s32)(s16)inst.SIMM_16; - - SafeLoadToReg(RAX, gpr.R(a), 64, offset, CallerSavedRegistersInUse(), false); - - fpr.Lock(d); - fpr.BindToRegister(d, true); - - MEMCHECK_START - MOVQ_xmm(XMM0, R(RAX)); - MOVSD(fpr.RX(d), R(XMM0)); - MEMCHECK_END - - fpr.UnlockAll(); -} - - -void Jit64::stfd(UGeckoInstruction inst) -{ - INSTRUCTION_START - JITDISABLE(bJITLoadStoreFloatingOff); - FALLBACK_IF(!inst.RA); + bool indexed = inst.OPCD == 31; + bool update = indexed ? !!(inst.SUBOP10&0x20) : !!(inst.OPCD&1); + bool single = indexed ? !(inst.SUBOP10&0x40) : !(inst.OPCD&2); + update &= indexed || inst.SIMM_16; int s = inst.RS; int a = inst.RA; + int b = inst.RB; + + FALLBACK_IF(!indexed && !a); + + s32 offset = 0; + gpr.FlushLockX(ABI_PARAM1); + if (indexed) + { + if (update) + { + gpr.BindToRegister(a, true, true); + ADD(32, gpr.R(a), gpr.R(b)); + MOV(32, R(ABI_PARAM1), gpr.R(a)); + } + else + { + if (a && gpr.R(a).IsSimpleReg() && gpr.R(b).IsSimpleReg()) + LEA(32, ABI_PARAM1, MComplex(gpr.RX(a), gpr.RX(b), SCALE_1, 0)); + else + { + MOV(32, R(ABI_PARAM1), gpr.R(b)); + if (a) + ADD(32, R(ABI_PARAM1), gpr.R(a)); + } + } + } + else + { + if (update) + { + gpr.BindToRegister(a, true, true); + ADD(32, gpr.R(a), Imm32((s32)(s16)inst.SIMM_16)); + } + else + { + offset = (s32)(s16)inst.SIMM_16; + } + MOV(32, R(ABI_PARAM1), gpr.R(a)); + } + + if (single) + { + fpr.BindToRegister(s, true, false); + ConvertDoubleToSingle(XMM0, fpr.RX(s)); + SafeWriteF32ToReg(XMM0, ABI_PARAM1, offset, CallerSavedRegistersInUse()); + fpr.UnlockAll(); + } + else + { + if (fpr.R(s).IsSimpleReg()) + MOVQ_xmm(R(RAX), fpr.RX(s)); + else + MOV(64, R(RAX), fpr.R(s)); + SafeWriteRegToReg(RAX, ABI_PARAM1, 64, offset, CallerSavedRegistersInUse()); + } + gpr.UnlockAll(); + gpr.UnlockAllX(); +} + +// This one is a little bit weird; it stores the low 32 bits of a double without converting it +void Jit64::stfiwx(UGeckoInstruction inst) +{ + INSTRUCTION_START + JITDISABLE(bJITLoadStoreFloatingOff); + + int s = inst.RS; + int a = inst.RA; + int b = inst.RB; gpr.FlushLockX(ABI_PARAM1); - MOV(32, R(ABI_PARAM1), gpr.R(a)); + MOV(32, R(ABI_PARAM1), gpr.R(b)); + if (a) + ADD(32, R(ABI_PARAM1), gpr.R(a)); if (fpr.R(s).IsSimpleReg()) - MOVQ_xmm(R(RAX), fpr.RX(s)); + MOVD_xmm(R(EAX), fpr.RX(s)); else - MOV(64, R(RAX), fpr.R(s)); - - s32 offset = (s32)(s16)inst.SIMM_16; - SafeWriteRegToReg(RAX, ABI_PARAM1, 64, offset, CallerSavedRegistersInUse()); - - gpr.UnlockAllX(); -} - -void Jit64::stfs(UGeckoInstruction inst) -{ - INSTRUCTION_START - JITDISABLE(bJITLoadStoreFloatingOff); - FALLBACK_IF(!inst.RA); - - int s = inst.RS; - int a = inst.RA; - s32 offset = (s32)(s16)inst.SIMM_16; - - fpr.BindToRegister(s, true, false); - ConvertDoubleToSingle(XMM0, fpr.RX(s)); - gpr.FlushLockX(ABI_PARAM1); - MOV(32, R(ABI_PARAM1), gpr.R(a)); - SafeWriteF32ToReg(XMM0, ABI_PARAM1, offset, CallerSavedRegistersInUse()); - fpr.UnlockAll(); - gpr.UnlockAllX(); -} - -void Jit64::stfsx(UGeckoInstruction inst) -{ - INSTRUCTION_START - JITDISABLE(bJITLoadStoreFloatingOff); - - gpr.FlushLockX(ABI_PARAM1); - MOV(32, R(ABI_PARAM1), gpr.R(inst.RB)); - if (inst.RA) - ADD(32, R(ABI_PARAM1), gpr.R(inst.RA)); - - int s = inst.RS; - fpr.Lock(s); - fpr.BindToRegister(s, true, false); - ConvertDoubleToSingle(XMM0, fpr.RX(s)); - SafeWriteF32ToReg(XMM0, ABI_PARAM1, 0, CallerSavedRegistersInUse()); - fpr.UnlockAll(); - gpr.UnlockAllX(); -} - -void Jit64::lfsx(UGeckoInstruction inst) -{ - INSTRUCTION_START - JITDISABLE(bJITLoadStoreFloatingOff); - - MOV(32, R(EAX), gpr.R(inst.RB)); - if (inst.RA) - ADD(32, R(EAX), gpr.R(inst.RA)); - - SafeLoadToReg(EAX, R(EAX), 32, 0, CallerSavedRegistersInUse(), false); - - fpr.Lock(inst.RS); - fpr.BindToRegister(inst.RS, js.memcheck); - - MEMCHECK_START - ConvertSingleToDouble(fpr.RX(inst.RS), EAX, true); - MEMCHECK_END - - fpr.UnlockAll(); + MOV(32, R(EAX), fpr.R(s)); + SafeWriteRegToReg(EAX, ABI_PARAM1, 32, 0, CallerSavedRegistersInUse()); gpr.UnlockAllX(); }