Jit64: inline unquantized paired loads/stores when the block's GQR is zero

Summary of what this patch does (derived from the hunks below):

  * PPCAnalyzer::Analyze records which GQRs a block's paired load/store
    instructions reference (m_gqr_used) and which GQRs the block writes via
    mtspr (m_gqr_modified).
  * Jit64::DoJit: if the block uses exactly one GQR, does not modify it, the
    GQR is zero at compile time, and the block start is not listed in
    js.pairedQuantizeAddresses, it emits a run-time "GQR == 0" check and sets
    js.assumeNoPairedQuantize. When the check fails at run time, the far-code
    path sets PC to the block start and calls
    JitInterface::CompileExceptionCheck(EXCEPTIONS_PAIRED_QUANTIZE), which
    inserts PC into js.pairedQuantizeAddresses and invalidates the block, so
    the next compile of that block skips the assumption.
  * Jit64::psq_lXX / psq_stXX: when js.assumeNoPairedQuantize is set, emit
    plain float load/store code instead of going through the quantized
    routines.
  * Supporting changes: BitSet8/BitSet16 typedefs plus u8/u16 overloads of
    CountSetBits/LeastSignificantSetBit; psq_* interpreter table entries are
    tagged OPTYPE_LOADPS/OPTYPE_STOREPS; pbswapShuffle1x4, pbswapShuffle2x4
    and m_one lose internal linkage and are declared in JitAsmCommon.h.

Review notes:

  * psq_lXX (memcheck path) clears the destination FPR's bit with
    registersInUse[fpr.RX(s) + 16]. The earlier "<< 16" form produced an index
    outside a 32-bit set for every host register except register 0.
    NOTE(review): this assumes XMM registers occupy bits 16..31 of the
    BitSet32 returned by CallerSavedRegistersInUse() -- confirm there.
  * Template arguments on BS::BitSet and std::unordered_set are written out
    explicitly. NOTE(review): <u32> for the address sets is inferred from the
    u32 keys used with them (js.blockStart, the u32 loop index in
    InvalidateICache) -- confirm against JitBase.h.

diff --git a/Source/Core/Common/BitSet.h b/Source/Core/Common/BitSet.h
index 9be7ccbe92..70b9204fa5 100644
--- a/Source/Core/Common/BitSet.h
+++ b/Source/Core/Common/BitSet.h
@@ -21,6 +21,18 @@ static inline int CountSetBits(T v)
 	v = (v + (v >> 4)) & (T)~(T)0/255*15;
 	return (T)(v * ((T)~(T)0/255)) >> (sizeof(T) - 1) * 8;
 }
+static inline int LeastSignificantSetBit(u8 val)
+{
+	unsigned long index;
+	_BitScanForward(&index, val);
+	return (int)index;
+}
+static inline int LeastSignificantSetBit(u16 val)
+{
+	unsigned long index;
+	_BitScanForward(&index, val);
+	return (int)index;
+}
 static inline int LeastSignificantSetBit(u32 val)
 {
 	unsigned long index;
@@ -34,8 +46,12 @@ static inline int LeastSignificantSetBit(u64 val)
 	return (int)index;
 }
 #else
+static inline int CountSetBits(u8 val) { return __builtin_popcount(val); }
+static inline int CountSetBits(u16 val) { return __builtin_popcount(val); }
 static inline int CountSetBits(u32 val) { return __builtin_popcount(val); }
 static inline int CountSetBits(u64 val) { return __builtin_popcountll(val); }
+static inline int LeastSignificantSetBit(u8 val) { return __builtin_ctz(val); }
+static inline int LeastSignificantSetBit(u16 val) { return __builtin_ctz(val); }
 static inline int LeastSignificantSetBit(u32 val) { return __builtin_ctz(val); }
 static inline int LeastSignificantSetBit(u64 val) { return __builtin_ctzll(val); }
 #endif
@@ -163,5 +179,7 @@ public:
 
 }
 
+typedef BS::BitSet<u8> BitSet8;
+typedef BS::BitSet<u16> BitSet16;
 typedef BS::BitSet<u32> BitSet32;
 typedef BS::BitSet<u64> BitSet64;
diff --git a/Source/Core/Core/PowerPC/Interpreter/Interpreter_Tables.cpp b/Source/Core/Core/PowerPC/Interpreter/Interpreter_Tables.cpp
index b6ff10b52e..639a899f57 100644
--- a/Source/Core/Core/PowerPC/Interpreter/Interpreter_Tables.cpp
+++ b/Source/Core/Core/PowerPC/Interpreter/Interpreter_Tables.cpp
@@ -137,10 +137,10 @@ static GekkoOPTemplate table4_2[] =
 
 static GekkoOPTemplate table4_3[] =
 {
-	{6, Interpreter::psq_lx, {"psq_lx", OPTYPE_PS, FL_OUT_FLOAT_S | FL_IN_A0B | FL_USE_FPU | FL_LOADSTORE, 1, 0, 0, 0}},
-	{7, Interpreter::psq_stx, {"psq_stx", OPTYPE_PS, FL_IN_FLOAT_S | FL_IN_A0B | FL_USE_FPU | FL_LOADSTORE, 1, 0, 0, 0}},
-	{38, Interpreter::psq_lux, {"psq_lux", OPTYPE_PS, FL_OUT_FLOAT_S | FL_OUT_A | FL_IN_AB | FL_USE_FPU | FL_LOADSTORE, 1, 0, 0, 0}},
-	{39, Interpreter::psq_stux, {"psq_stux", OPTYPE_PS, FL_IN_FLOAT_S | FL_OUT_A | FL_IN_AB | FL_USE_FPU | FL_LOADSTORE, 1, 0, 0, 0}},
+	{6, Interpreter::psq_lx, {"psq_lx", OPTYPE_LOADPS, FL_OUT_FLOAT_S | FL_IN_A0B | FL_USE_FPU | FL_LOADSTORE, 1, 0, 0, 0}},
+	{7, Interpreter::psq_stx, {"psq_stx", OPTYPE_STOREPS, FL_IN_FLOAT_S | FL_IN_A0B | FL_USE_FPU | FL_LOADSTORE, 1, 0, 0, 0}},
+	{38, Interpreter::psq_lux, {"psq_lux", OPTYPE_LOADPS, FL_OUT_FLOAT_S | FL_OUT_A | FL_IN_AB | FL_USE_FPU | FL_LOADSTORE, 1, 0, 0, 0}},
+	{39, Interpreter::psq_stux, {"psq_stux", OPTYPE_STOREPS, FL_IN_FLOAT_S | FL_OUT_A | FL_IN_AB | FL_USE_FPU | FL_LOADSTORE, 1, 0, 0, 0}},
 };
 
 static GekkoOPTemplate table19[] =
diff --git a/Source/Core/Core/PowerPC/Jit64/Jit.cpp b/Source/Core/Core/PowerPC/Jit64/Jit.cpp
index 539bb1b84b..2ccd045291 100644
--- a/Source/Core/Core/PowerPC/Jit64/Jit.cpp
+++ b/Source/Core/Core/PowerPC/Jit64/Jit.cpp
@@ -15,6 +15,7 @@
 #include "Core/PatchEngine.h"
 #include "Core/HLE/HLE.h"
 #include "Core/HW/ProcessorInterface.h"
+#include "Core/PowerPC/JitInterface.h"
 #include "Core/PowerPC/Profiler.h"
 #include "Core/PowerPC/Jit64/Jit.h"
 #include "Core/PowerPC/Jit64/Jit64_Tables.h"
@@ -605,6 +606,35 @@ const u8* Jit64::DoJit(u32 em_address, PPCAnalyst::CodeBuffer *code_buf, JitBloc
 	js.skipnext = false;
 	js.carryFlagSet = false;
 	js.carryFlagInverted = false;
+	js.assumeNoPairedQuantize = false;
+
+	// If the block only uses one GQR and the GQR is zero at compile time, make a guess that the block
+	// never uses quantized loads/stores. Many paired-heavy games use largely float loads and stores,
+	// which are significantly faster when inlined (especially in MMU mode, where this lets them use
+	// fastmem).
+	// Insert a check that the GQR is still zero at the start of the block in case our guess turns out
+	// wrong.
+	// TODO: support any other constant GQR value, not merely zero/unquantized: we can optimize quantized
+	// loadstores too, it'd just be more code.
+	if (code_block.m_gqr_used.Count() == 1 && js.pairedQuantizeAddresses.find(js.blockStart) == js.pairedQuantizeAddresses.end())
+	{
+		int gqr = *code_block.m_gqr_used.begin();
+		if (!code_block.m_gqr_modified[gqr] && !GQR(gqr))
+		{
+			CMP(32, PPCSTATE(spr[SPR_GQR0 + gqr]), Imm8(0));
+			FixupBranch failure = J_CC(CC_NZ, true);
+			SwitchToFarCode();
+			SetJumpTarget(failure);
+			MOV(32, PPCSTATE(pc), Imm32(js.blockStart));
+			ABI_PushRegistersAndAdjustStack({}, 0);
+			ABI_CallFunctionC((void *)&JitInterface::CompileExceptionCheck, (u32)JitInterface::ExceptionType::EXCEPTIONS_PAIRED_QUANTIZE);
+			ABI_PopRegistersAndAdjustStack({}, 0);
+			JMP(asm_routines.dispatcher, true);
+			SwitchToNearCode();
+			js.assumeNoPairedQuantize = true;
+		}
+	}
+
 	// Translate instructions
 	for (u32 i = 0; i < code_block.m_num_instructions; i++)
 	{
diff --git a/Source/Core/Core/PowerPC/Jit64/Jit_LoadStorePaired.cpp b/Source/Core/Core/PowerPC/Jit64/Jit_LoadStorePaired.cpp
index b6dac78f86..0196f5deec 100644
--- a/Source/Core/Core/PowerPC/Jit64/Jit_LoadStorePaired.cpp
+++ b/Source/Core/Core/PowerPC/Jit64/Jit_LoadStorePaired.cpp
@@ -11,6 +11,7 @@
 #include "Core/PowerPC/Jit64/Jit.h"
 #include "Core/PowerPC/Jit64/JitAsm.h"
 #include "Core/PowerPC/Jit64/JitRegCache.h"
+#include "Core/PowerPC/JitCommon/JitAsmCommon.h"
 
 using namespace Gen;
 
@@ -20,7 +21,6 @@ void Jit64::psq_stXX(UGeckoInstruction inst)
 {
 	INSTRUCTION_START
 	JITDISABLE(bJITLoadStorePairedOff);
-	FALLBACK_IF(!inst.RA);
 
 	s32 offset = inst.SIMM_12;
 	bool indexed = inst.OPCD == 4;
@@ -30,12 +30,75 @@ void Jit64::psq_stXX(UGeckoInstruction inst)
 	int s = inst.FS;
 	int i = indexed ? inst.Ix : inst.I;
 	int w = indexed ? inst.Wx : inst.W;
+	FALLBACK_IF(!a);
 
 	gpr.Lock(a, b);
+	if (js.assumeNoPairedQuantize)
+	{
+		int storeOffset = 0;
+		gpr.BindToRegister(a, true, update);
+		X64Reg addr = gpr.RX(a);
+		if (update && js.memcheck)
+		{
+			addr = RSCRATCH2;
+			MOV(32, R(addr), gpr.R(a));
+		}
+		if (indexed)
+		{
+			if (update)
+			{
+				ADD(32, R(addr), gpr.R(b));
+			}
+			else
+			{
+				addr = RSCRATCH2;
+				if (a && gpr.R(a).IsSimpleReg() && gpr.R(b).IsSimpleReg())
+				{
+					LEA(32, addr, MComplex(gpr.RX(a), gpr.RX(b), SCALE_1, 0));
+				}
+				else
+				{
+					MOV(32, R(addr), gpr.R(b));
+					if (a)
+						ADD(32, R(addr), gpr.R(a));
+				}
+			}
+		}
+		else
+		{
+			if (update)
+				ADD(32, R(addr), Imm32(offset));
+			else
+				storeOffset = offset;
+		}
+
+		fpr.Lock(s);
+		if (w)
+		{
+			CVTSD2SS(XMM0, fpr.R(s));
+			MOVD_xmm(R(RSCRATCH), XMM0);
+		}
+		else
+		{
+			CVTPD2PS(XMM0, fpr.R(s));
+			MOVQ_xmm(R(RSCRATCH), XMM0);
+			ROL(64, R(RSCRATCH), Imm8(32));
+		}
+
+		BitSet32 registersInUse = CallerSavedRegistersInUse();
+		if (update && js.memcheck)
+			registersInUse[addr] = true;
+		SafeWriteRegToReg(RSCRATCH, addr, w ? 32 : 64, storeOffset, registersInUse);
+		MemoryExceptionCheck();
+		if (update && js.memcheck)
+			MOV(32, gpr.R(a), R(addr));
+		gpr.UnlockAll();
+		fpr.UnlockAll();
+		return;
+	}
 	gpr.FlushLockX(RSCRATCH_EXTRA);
 	if (update)
 		gpr.BindToRegister(a, true, true);
-	fpr.BindToRegister(s, true, false);
 	if (gpr.R(a).IsSimpleReg() && gpr.R(b).IsSimpleReg() && (indexed || offset))
 	{
 		if (indexed)
@@ -92,7 +155,6 @@ void Jit64::psq_lXX(UGeckoInstruction inst)
 {
 	INSTRUCTION_START
 	JITDISABLE(bJITLoadStorePairedOff);
-	FALLBACK_IF(!inst.RA);
 
 	s32 offset = inst.SIMM_12;
 	bool indexed = inst.OPCD == 4;
@@ -102,8 +164,116 @@ void Jit64::psq_lXX(UGeckoInstruction inst)
 	int s = inst.FS;
 	int i = indexed ? inst.Ix : inst.I;
 	int w = indexed ? inst.Wx : inst.W;
+	FALLBACK_IF(!a);
 
 	gpr.Lock(a, b);
+	if (js.assumeNoPairedQuantize)
+	{
+		s32 loadOffset = 0;
+		gpr.BindToRegister(a, true, update);
+		X64Reg addr = gpr.RX(a);
+		if (update && js.memcheck)
+		{
+			addr = RSCRATCH2;
+			MOV(32, R(addr), gpr.R(a));
+		}
+		if (indexed)
+		{
+			if (update)
+			{
+				ADD(32, R(addr), gpr.R(b));
+			}
+			else
+			{
+				addr = RSCRATCH2;
+				if (a && gpr.R(a).IsSimpleReg() && gpr.R(b).IsSimpleReg())
+				{
+					LEA(32, addr, MComplex(gpr.RX(a), gpr.RX(b), SCALE_1, 0));
+				}
+				else
+				{
+					MOV(32, R(addr), gpr.R(b));
+					if (a)
+						ADD(32, R(addr), gpr.R(a));
+				}
+			}
+		}
+		else
+		{
+			if (update)
+				ADD(32, R(addr), Imm32(offset));
+			else
+				loadOffset = offset;
+		}
+
+		fpr.Lock(s);
+		if (js.memcheck)
+		{
+			fpr.StoreFromRegister(s);
+			js.revertFprLoad = s;
+		}
+		fpr.BindToRegister(s, false);
+
+		// Let's mirror the JitAsmCommon code and assume all non-MMU loads go to RAM.
+		if (!js.memcheck)
+		{
+			if (w)
+			{
+				if (cpu_info.bSSSE3)
+				{
+					MOVD_xmm(XMM0, MComplex(RMEM, addr, SCALE_1, loadOffset));
+					PSHUFB(XMM0, M(pbswapShuffle1x4));
+					UNPCKLPS(XMM0, M(m_one));
+				}
+				else
+				{
+					LoadAndSwap(32, RSCRATCH, MComplex(RMEM, addr, SCALE_1, loadOffset));
+					MOVD_xmm(XMM0, R(RSCRATCH));
+					UNPCKLPS(XMM0, M(m_one));
+				}
+			}
+			else
+			{
+				if (cpu_info.bSSSE3)
+				{
+					MOVQ_xmm(XMM0, MComplex(RMEM, addr, SCALE_1, loadOffset));
+					PSHUFB(XMM0, M(pbswapShuffle2x4));
+				}
+				else
+				{
+					LoadAndSwap(64, RSCRATCH, MComplex(RMEM, addr, SCALE_1, loadOffset));
+					ROL(64, R(RSCRATCH), Imm8(32));
+					MOVQ_xmm(XMM0, R(RSCRATCH));
+				}
+			}
+			CVTPS2PD(fpr.RX(s), R(XMM0));
+		}
+		else
+		{
+			BitSet32 registersInUse = CallerSavedRegistersInUse();
+			registersInUse[fpr.RX(s) + 16] = false;
+			if (update)
+				registersInUse[addr] = true;
+			SafeLoadToReg(RSCRATCH, R(addr), w ? 32 : 64, loadOffset, registersInUse, false);
+			MemoryExceptionCheck();
+			if (w)
+			{
+				MOVD_xmm(XMM0, R(RSCRATCH));
+				UNPCKLPS(XMM0, M(m_one));
+			}
+			else
+			{
+				ROL(64, R(RSCRATCH), Imm8(32));
+				MOVQ_xmm(XMM0, R(RSCRATCH));
+			}
+			CVTPS2PD(fpr.RX(s), R(XMM0));
+			if (update)
+				MOV(32, gpr.R(a), R(addr));
+		}
+		gpr.UnlockAll();
+		fpr.UnlockAll();
+		return;
+	}
 	gpr.FlushLockX(RSCRATCH_EXTRA);
 	gpr.BindToRegister(a, true, update);
 	fpr.BindToRegister(s, false, true);
diff --git a/Source/Core/Core/PowerPC/JitCommon/JitAsmCommon.cpp b/Source/Core/Core/PowerPC/JitCommon/JitAsmCommon.cpp
index 0f95402983..c7b3995510 100644
--- a/Source/Core/Core/PowerPC/JitCommon/JitAsmCommon.cpp
+++ b/Source/Core/Core/PowerPC/JitCommon/JitAsmCommon.cpp
@@ -191,8 +191,8 @@ void CommonAsmRoutines::GenMfcr()
 
 // Safe + Fast Quantizers, originally from JITIL by magumagu
 
-static const u8 GC_ALIGNED16(pbswapShuffle1x4[16]) = { 3, 2, 1, 0, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 };
-static const u8 GC_ALIGNED16(pbswapShuffle2x4[16]) = { 3, 2, 1, 0, 7, 6, 5, 4, 8, 9, 10, 11, 12, 13, 14, 15 };
+const u8 GC_ALIGNED16(pbswapShuffle1x4[16]) = { 3, 2, 1, 0, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 };
+const u8 GC_ALIGNED16(pbswapShuffle2x4[16]) = { 3, 2, 1, 0, 7, 6, 5, 4, 8, 9, 10, 11, 12, 13, 14, 15 };
 
 static const float GC_ALIGNED16(m_quantizeTableS[]) =
 {
@@ -257,7 +257,7 @@ static const float GC_ALIGNED16(m_255) = 255.0f;
 static const float GC_ALIGNED16(m_127) = 127.0f;
 static const float GC_ALIGNED16(m_m128) = -128.0f;
 
-static const float GC_ALIGNED16(m_one[]) = {1.0f, 0.0f, 0.0f, 0.0f};
+const float GC_ALIGNED16(m_one[]) = { 1.0f, 0.0f, 0.0f, 0.0f };
 
 #define QUANTIZE_OVERFLOW_SAFE
 
diff --git a/Source/Core/Core/PowerPC/JitCommon/JitAsmCommon.h b/Source/Core/Core/PowerPC/JitCommon/JitAsmCommon.h
index 34a7232a45..b41bc26875 100644
--- a/Source/Core/Core/PowerPC/JitCommon/JitAsmCommon.h
+++ b/Source/Core/Core/PowerPC/JitCommon/JitAsmCommon.h
@@ -6,6 +6,10 @@
 
 #include "Core/PowerPC/JitCommon/Jit_Util.h"
 
+extern const u8 GC_ALIGNED16(pbswapShuffle1x4[16]);
+extern const u8 GC_ALIGNED16(pbswapShuffle2x4[16]);
+extern const float GC_ALIGNED16(m_one[]);
+
 class CommonAsmRoutinesBase
 {
 public:
diff --git a/Source/Core/Core/PowerPC/JitCommon/JitBase.h b/Source/Core/Core/PowerPC/JitCommon/JitBase.h
index cb79f3f511..96dfdf7510 100644
--- a/Source/Core/Core/PowerPC/JitCommon/JitBase.h
+++ b/Source/Core/Core/PowerPC/JitCommon/JitBase.h
@@ -84,6 +84,7 @@ protected:
 		int revertGprLoad;
 		int revertFprLoad;
 
+		bool assumeNoPairedQuantize;
 		bool firstFPInstructionFound;
 		bool isLastInstruction;
 		bool memcheck;
@@ -104,6 +105,7 @@ protected:
 		JitBlock *curBlock;
 
 		std::unordered_set<u32> fifoWriteAddresses;
+		std::unordered_set<u32> pairedQuantizeAddresses;
 	};
 
 	PPCAnalyst::CodeBlock code_block;
diff --git a/Source/Core/Core/PowerPC/JitCommon/JitCache.cpp b/Source/Core/Core/PowerPC/JitCommon/JitCache.cpp
index a3aee33ed4..f4c84be65f 100644
--- a/Source/Core/Core/PowerPC/JitCommon/JitCache.cpp
+++ b/Source/Core/Core/PowerPC/JitCommon/JitCache.cpp
@@ -65,6 +65,7 @@ using namespace Gen;
 			Core::DisplayMessage("Clearing code cache.", 3000);
 #endif
 		jit->js.fifoWriteAddresses.clear();
+		jit->js.pairedQuantizeAddresses.clear();
 		for (int i = 0; i < num_blocks; i++)
 		{
 			DestroyBlock(i, false);
@@ -311,7 +312,10 @@ using namespace Gen;
 		if (!forced)
 		{
 			for (u32 i = address; i < address + length; i += 4)
+			{
 				jit->js.fifoWriteAddresses.erase(i);
+				jit->js.pairedQuantizeAddresses.erase(i);
+			}
 		}
 	}
 }
diff --git a/Source/Core/Core/PowerPC/JitInterface.cpp b/Source/Core/Core/PowerPC/JitInterface.cpp
index 87cb0c6d43..d3ad766733 100644
--- a/Source/Core/Core/PowerPC/JitInterface.cpp
+++ b/Source/Core/Core/PowerPC/JitInterface.cpp
@@ -240,18 +240,24 @@ namespace JitInterface
 		case ExceptionType::EXCEPTIONS_FIFO_WRITE:
 			exception_addresses = &jit->js.fifoWriteAddresses;
 			break;
+		case ExceptionType::EXCEPTIONS_PAIRED_QUANTIZE:
+			exception_addresses = &jit->js.pairedQuantizeAddresses;
+			break;
 		}
 
 		if (PC != 0 && (exception_addresses->find(PC)) == (exception_addresses->end()))
 		{
-			int optype = GetOpInfo(Memory::ReadUnchecked_U32(PC))->type;
-			if (optype == OPTYPE_STORE || optype == OPTYPE_STOREFP || (optype == OPTYPE_STOREPS))
+			if (type == ExceptionType::EXCEPTIONS_FIFO_WRITE)
 			{
-				exception_addresses->insert(PC);
-
-				// Invalidate the JIT block so that it gets recompiled with the external exception check included.
-				jit->GetBlockCache()->InvalidateICache(PC, 4, true);
+				// Check in case the code has been replaced since: do we need to do this?
+				int optype = GetOpInfo(Memory::ReadUnchecked_U32(PC))->type;
+				if (optype != OPTYPE_STORE && optype != OPTYPE_STOREFP && (optype != OPTYPE_STOREPS))
+					return;
 			}
+			exception_addresses->insert(PC);
+
+			// Invalidate the JIT block so that it gets recompiled with the external exception check included.
+			jit->GetBlockCache()->InvalidateICache(PC, 4, true);
 		}
 	}
 
diff --git a/Source/Core/Core/PowerPC/JitInterface.h b/Source/Core/Core/PowerPC/JitInterface.h
index 1fe8b85086..3c56683af1 100644
--- a/Source/Core/Core/PowerPC/JitInterface.h
+++ b/Source/Core/Core/PowerPC/JitInterface.h
@@ -13,7 +13,8 @@ namespace JitInterface
 {
 	enum class ExceptionType
 	{
-		EXCEPTIONS_FIFO_WRITE
+		EXCEPTIONS_FIFO_WRITE,
+		EXCEPTIONS_PAIRED_QUANTIZE
 	};
 
 	void DoState(PointerWrap &p);
diff --git a/Source/Core/Core/PowerPC/PPCAnalyst.cpp b/Source/Core/Core/PowerPC/PPCAnalyst.cpp
index 5a1f859bf2..c2068a2e61 100644
--- a/Source/Core/Core/PowerPC/PPCAnalyst.cpp
+++ b/Source/Core/Core/PowerPC/PPCAnalyst.cpp
@@ -638,6 +638,7 @@ u32 PPCAnalyzer::Analyze(u32 address, CodeBlock *block, CodeBuffer *buffer, u32
 	block->m_broken = false;
 	block->m_memory_exception = false;
 	block->m_num_instructions = 0;
+	block->m_gqr_used = BitSet8(0);
 
 	if (address == 0)
 	{
@@ -865,6 +866,7 @@ u32 PPCAnalyzer::Analyze(u32 address, CodeBlock *block, CodeBuffer *buffer, u32
 
 	// Forward scan, for flags that need the other direction for calculation.
 	BitSet32 fprIsSingle, fprIsDuplicated, fprIsStoreSafe;
+	BitSet8 gqrUsed, gqrModified;
 	for (u32 i = 0; i < block->m_num_instructions; i++)
 	{
 		code[i].fprIsSingle = fprIsSingle;
@@ -903,7 +905,22 @@ u32 PPCAnalyzer::Analyze(u32 address, CodeBlock *block, CodeBuffer *buffer, u32
 			if (!strncmp(code[i].opinfo->opname, "mtfs", 4))
 				fprIsStoreSafe = BitSet32(0);
 		}
+
+		if (code[i].opinfo->type == OPTYPE_STOREPS || code[i].opinfo->type == OPTYPE_LOADPS)
+		{
+			int gqr = code[i].inst.OPCD == 4 ? code[i].inst.Ix : code[i].inst.I;
+			gqrUsed[gqr] = true;
+		}
+
+		if (code[i].inst.OPCD == 31 && code[i].inst.SUBOP10 == 467) // mtspr
+		{
+			int gqr = ((code[i].inst.SPRU << 5) | code[i].inst.SPRL) - SPR_GQR0;
+			if (gqr >= 0 && gqr <= 7)
+				gqrModified[gqr] = true;
+		}
 	}
+	block->m_gqr_used = gqrUsed;
+	block->m_gqr_modified = gqrModified;
 	return address;
 }
 
diff --git a/Source/Core/Core/PowerPC/PPCAnalyst.h b/Source/Core/Core/PowerPC/PPCAnalyst.h
index 59c637e5b2..468d036a8e 100644
--- a/Source/Core/Core/PowerPC/PPCAnalyst.h
+++ b/Source/Core/Core/PowerPC/PPCAnalyst.h
@@ -154,6 +154,12 @@ struct CodeBlock
 
 	// Did we have a memory_exception?
 	bool m_memory_exception;
+
+	// Which GQRs this block uses, if any.
+	BitSet8 m_gqr_used;
+
+	// Which GQRs this block modifies, if any.
+	BitSet8 m_gqr_modified;
 };
 
 class PPCAnalyzer