diff --git a/Source/Core/Core/PowerPC/CachedInterpreter.cpp b/Source/Core/Core/PowerPC/CachedInterpreter.cpp index de7aa218ae..c57ab1e8d8 100644 --- a/Source/Core/Core/PowerPC/CachedInterpreter.cpp +++ b/Source/Core/Core/PowerPC/CachedInterpreter.cpp @@ -144,7 +144,7 @@ void CachedInterpreter::Jit(u32 address) js.blockStart = PC; js.firstFPInstructionFound = false; - js.fifoBytesThisBlock = 0; + js.fifoBytesSinceCheck = 0; js.downcountAmount = 0; js.curBlock = b; diff --git a/Source/Core/Core/PowerPC/Jit64/Jit.cpp b/Source/Core/Core/PowerPC/Jit64/Jit.cpp index 0b9d1e6f5b..40c5cbeb06 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit.cpp +++ b/Source/Core/Core/PowerPC/Jit64/Jit.cpp @@ -349,7 +349,7 @@ bool Jit64::Cleanup() { bool did_something = false; - if (jo.optimizeGatherPipe && js.fifoBytesThisBlock > 0) + if (jo.optimizeGatherPipe && js.fifoBytesSinceCheck > 0) { ABI_PushRegistersAndAdjustStack({}, 0); ABI_CallFunction(GPFifo::FastCheckGatherPipe); @@ -597,7 +597,7 @@ const u8* Jit64::DoJit(u32 em_address, PPCAnalyst::CodeBuffer* code_buf, JitBloc js.firstFPInstructionFound = false; js.isLastInstruction = false; js.blockStart = em_address; - js.fifoBytesThisBlock = 0; + js.fifoBytesSinceCheck = 0; js.mustCheckFifo = false; js.curBlock = b; js.numLoadStoreInst = 0; @@ -690,6 +690,12 @@ const u8* Jit64::DoJit(u32 em_address, PPCAnalyst::CodeBuffer* code_buf, JitBloc } } + if (js.noSpeculativeConstantsAddresses.find(js.blockStart) == + js.noSpeculativeConstantsAddresses.end()) + { + IntializeSpeculativeConstants(); + } + // Translate instructions for (u32 i = 0; i < code_block.m_num_instructions; i++) { @@ -724,10 +730,9 @@ const u8* Jit64::DoJit(u32 em_address, PPCAnalyst::CodeBuffer* code_buf, JitBloc js.fifoWriteAddresses.find(ops[i].address) != js.fifoWriteAddresses.end(); // Gather pipe writes using an immediate address are explicitly tracked. 
- if (jo.optimizeGatherPipe && (js.fifoBytesThisBlock >= 32 || js.mustCheckFifo)) + if (jo.optimizeGatherPipe && (js.fifoBytesSinceCheck >= 32 || js.mustCheckFifo)) { - if (js.fifoBytesThisBlock >= 32) - js.fifoBytesThisBlock -= 32; + js.fifoBytesSinceCheck = 0; js.mustCheckFifo = false; BitSet32 registersInUse = CallerSavedRegistersInUse(); ABI_PushRegistersAndAdjustStack(registersInUse, 0); @@ -967,3 +972,39 @@ void Jit64::EnableOptimization() analyzer.SetOption(PPCAnalyst::PPCAnalyzer::OPTION_CROR_MERGE); analyzer.SetOption(PPCAnalyst::PPCAnalyzer::OPTION_CARRY_MERGE); } + +void Jit64::IntializeSpeculativeConstants() +{ + // If the block depends on an input register which looks like a gather pipe or MMIO related + // constant, guess that it is actually a constant input, and specialize the block based on this + // assumption. This happens when there are branches in code writing to the gather pipe, but only + // the first block loads the constant. + // Insert a check at the start of the block to verify that the value is actually constant. + // This can save a lot of backpatching and optimize gather pipe writes in more places. 
+ const u8* target = nullptr; + for (auto i : code_block.m_gpr_inputs) + { + u32 compileTimeValue = PowerPC::ppcState.gpr[i]; + if (PowerPC::IsOptimizableGatherPipeWrite(compileTimeValue) || + PowerPC::IsOptimizableGatherPipeWrite(compileTimeValue - 0x8000) || + compileTimeValue == 0xCC000000) + { + if (!target) + { + SwitchToFarCode(); + target = GetCodePtr(); + MOV(32, PPCSTATE(pc), Imm32(js.blockStart)); + ABI_PushRegistersAndAdjustStack({}, 0); + ABI_CallFunctionC( + reinterpret_cast<void*>(&JitInterface::CompileExceptionCheck), + static_cast<u32>(JitInterface::ExceptionType::EXCEPTIONS_SPECULATIVE_CONSTANTS)); + ABI_PopRegistersAndAdjustStack({}, 0); + JMP(asm_routines.dispatcher, true); + SwitchToNearCode(); + } + CMP(32, PPCSTATE(gpr[i]), Imm32(compileTimeValue)); + J_CC(CC_NZ, target); + gpr.SetImmediate32(i, compileTimeValue, false); + } + } +} diff --git a/Source/Core/Core/PowerPC/Jit64/Jit.h b/Source/Core/Core/PowerPC/Jit64/Jit.h index 4fce2e91bf..9ff580d1a2 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit.h +++ b/Source/Core/Core/PowerPC/Jit64/Jit.h @@ -68,6 +68,8 @@ public: BitSet32 CallerSavedRegistersInUse() const; BitSet8 ComputeStaticGQRs(const PPCAnalyst::CodeBlock&) const; + void IntializeSpeculativeConstants(); + JitBlockCache* GetBlockCache() override { return &blocks; } void Trace(); diff --git a/Source/Core/Core/PowerPC/Jit64/JitRegCache.cpp b/Source/Core/Core/PowerPC/Jit64/JitRegCache.cpp index bc5384d481..67baa2f485 100644 --- a/Source/Core/Core/PowerPC/Jit64/JitRegCache.cpp +++ b/Source/Core/Core/PowerPC/Jit64/JitRegCache.cpp @@ -227,10 +227,12 @@ void RegCache::DiscardRegContentsIfCached(size_t preg) } } -void GPRRegCache::SetImmediate32(size_t preg, u32 immValue) +void GPRRegCache::SetImmediate32(size_t preg, u32 immValue, bool dirty) { + // "dirty" can be false to avoid redundantly flushing an immediate when + // processing speculative constants. 
DiscardRegContentsIfCached(preg); - regs[preg].away = true; + regs[preg].away |= dirty; regs[preg].location = Imm32(immValue); } @@ -282,10 +284,7 @@ void RegCache::KillImmediate(size_t preg, bool doLoad, bool makeDirty) void RegCache::BindToRegister(size_t i, bool doLoad, bool makeDirty) { - if (!regs[i].away && regs[i].location.IsImm()) - PanicAlert("Bad immediate"); - - if (!regs[i].away || (regs[i].away && regs[i].location.IsImm())) + if (!regs[i].away || regs[i].location.IsImm()) { X64Reg xr = GetFreeXReg(); if (xregs[xr].dirty) @@ -294,7 +293,7 @@ void RegCache::BindToRegister(size_t i, bool doLoad, bool makeDirty) PanicAlert("GetFreeXReg returned locked register"); xregs[xr].free = false; xregs[xr].ppcReg = i; - xregs[xr].dirty = makeDirty || regs[i].location.IsImm(); + xregs[xr].dirty = makeDirty || regs[i].away; if (doLoad) LoadRegister(i, xr); for (size_t j = 0; j < regs.size(); j++) diff --git a/Source/Core/Core/PowerPC/Jit64/JitRegCache.h b/Source/Core/Core/PowerPC/Jit64/JitRegCache.h index 14ed139a42..f47e57e377 100644 --- a/Source/Core/Core/PowerPC/Jit64/JitRegCache.h +++ b/Source/Core/Core/PowerPC/Jit64/JitRegCache.h @@ -161,7 +161,7 @@ public: void LoadRegister(size_t preg, Gen::X64Reg newLoc) override; Gen::OpArg GetDefaultLocation(size_t reg) const override; const Gen::X64Reg* GetAllocationOrder(size_t* count) override; - void SetImmediate32(size_t preg, u32 immValue); + void SetImmediate32(size_t preg, u32 immValue, bool dirty = true); BitSet32 GetRegUtilization() override; BitSet32 CountRegsIn(size_t preg, u32 lookahead) override; }; diff --git a/Source/Core/Core/PowerPC/Jit64/Jit_LoadStore.cpp b/Source/Core/Core/PowerPC/Jit64/Jit_LoadStore.cpp index f834282ea9..24eb65dc1d 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit_LoadStore.cpp +++ b/Source/Core/Core/PowerPC/Jit64/Jit_LoadStore.cpp @@ -592,6 +592,6 @@ void Jit64::eieio(UGeckoInstruction inst) // optimizeGatherPipe generally postpones FIFO checks to the end of the JIT block, // which is 
generally safe. However postponing FIFO writes across eieio instructions // is incorrect (would crash NBA2K11 strap screen if we improve our FIFO detection). - if (jo.optimizeGatherPipe && js.fifoBytesThisBlock > 0) + if (jo.optimizeGatherPipe && js.fifoBytesSinceCheck > 0) js.mustCheckFifo = true; } diff --git a/Source/Core/Core/PowerPC/Jit64IL/JitIL.cpp b/Source/Core/Core/PowerPC/Jit64IL/JitIL.cpp index 3849d3f014..62259cf579 100644 --- a/Source/Core/Core/PowerPC/Jit64IL/JitIL.cpp +++ b/Source/Core/Core/PowerPC/Jit64IL/JitIL.cpp @@ -515,7 +515,7 @@ const u8* JitIL::DoJit(u32 em_address, PPCAnalyst::CodeBuffer* code_buf, JitBloc { js.isLastInstruction = false; js.blockStart = em_address; - js.fifoBytesThisBlock = 0; + js.fifoBytesSinceCheck = 0; js.curBlock = b; jit->js.numLoadStoreInst = 0; jit->js.numFloatingPointInst = 0; diff --git a/Source/Core/Core/PowerPC/JitArm64/Jit.cpp b/Source/Core/Core/PowerPC/JitArm64/Jit.cpp index 3564c91c01..c203c1e002 100644 --- a/Source/Core/Core/PowerPC/JitArm64/Jit.cpp +++ b/Source/Core/Core/PowerPC/JitArm64/Jit.cpp @@ -170,7 +170,7 @@ void JitArm64::Break(UGeckoInstruction inst) void JitArm64::Cleanup() { - if (jo.optimizeGatherPipe && js.fifoBytesThisBlock > 0) + if (jo.optimizeGatherPipe && js.fifoBytesSinceCheck > 0) { gpr.Lock(W0); MOVP2R(X0, &GPFifo::FastCheckGatherPipe); @@ -424,7 +424,7 @@ const u8* JitArm64::DoJit(u32 em_address, PPCAnalyst::CodeBuffer* code_buf, JitB js.firstFPInstructionFound = false; js.assumeNoPairedQuantize = false; js.blockStart = em_address; - js.fifoBytesThisBlock = 0; + js.fifoBytesSinceCheck = 0; js.mustCheckFifo = false; js.downcountAmount = 0; js.skipInstructions = 0; @@ -512,10 +512,9 @@ const u8* JitArm64::DoJit(u32 em_address, PPCAnalyst::CodeBuffer* code_buf, JitB bool gatherPipeIntCheck = jit->js.fifoWriteAddresses.find(ops[i].address) != jit->js.fifoWriteAddresses.end(); - if (jo.optimizeGatherPipe && (js.fifoBytesThisBlock >= 32 || js.mustCheckFifo)) + if (jo.optimizeGatherPipe && 
(js.fifoBytesSinceCheck >= 32 || js.mustCheckFifo)) { - if (js.fifoBytesThisBlock >= 32) - js.fifoBytesThisBlock -= 32; + js.fifoBytesSinceCheck = 0; js.mustCheckFifo = false; gpr.Lock(W30); diff --git a/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStore.cpp b/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStore.cpp index 6a66bf1907..cddaddfcb0 100644 --- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStore.cpp +++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStore.cpp @@ -333,7 +333,7 @@ void JitArm64::SafeStoreFromReg(s32 dest, u32 value, s32 regOffset, u32 flags, s } ADD(W0, W0, accessSize >> 3); STR(INDEX_UNSIGNED, W0, X30, count_off); - js.fifoBytesThisBlock += accessSize >> 3; + js.fifoBytesSinceCheck += accessSize >> 3; if (accessSize != 8) gpr.Unlock(WA); @@ -833,6 +833,6 @@ void JitArm64::eieio(UGeckoInstruction inst) // optimizeGatherPipe generally postpones FIFO checks to the end of the JIT block, // which is generally safe. However postponing FIFO writes across eieio instructions // is incorrect (would crash NBA2K11 strap screen if we improve our FIFO detection). 
- if (jo.optimizeGatherPipe && js.fifoBytesThisBlock > 0) + if (jo.optimizeGatherPipe && js.fifoBytesSinceCheck > 0) js.mustCheckFifo = true; } diff --git a/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStoreFloating.cpp b/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStoreFloating.cpp index 256b724c72..f1ecdaba36 100644 --- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStoreFloating.cpp +++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStoreFloating.cpp @@ -442,7 +442,7 @@ void JitArm64::stfXX(UGeckoInstruction inst) ADD(W0, W0, accessSize >> 3); STR(INDEX_UNSIGNED, W0, X30, count_off); - js.fifoBytesThisBlock += accessSize >> 3; + js.fifoBytesSinceCheck += accessSize >> 3; if (update) { diff --git a/Source/Core/Core/PowerPC/JitCommon/JitBase.h b/Source/Core/Core/PowerPC/JitCommon/JitBase.h index 489bb5ba38..611395e3b8 100644 --- a/Source/Core/Core/PowerPC/JitCommon/JitBase.h +++ b/Source/Core/Core/PowerPC/JitCommon/JitBase.h @@ -104,7 +104,7 @@ protected: u8* trampolineExceptionHandler; bool mustCheckFifo; - int fifoBytesThisBlock; + int fifoBytesSinceCheck; PPCAnalyst::BlockStats st; PPCAnalyst::BlockRegStats gpa; @@ -116,6 +116,7 @@ protected: std::unordered_set<u32> fifoWriteAddresses; std::unordered_set<u32> pairedQuantizeAddresses; + std::unordered_set<u32> noSpeculativeConstantsAddresses; }; PPCAnalyst::CodeBlock code_block; diff --git a/Source/Core/Core/PowerPC/JitCommon/Jit_Util.cpp b/Source/Core/Core/PowerPC/JitCommon/Jit_Util.cpp index 8b2bc45024..bc873f4026 100644 --- a/Source/Core/Core/PowerPC/JitCommon/Jit_Util.cpp +++ b/Source/Core/Core/PowerPC/JitCommon/Jit_Util.cpp @@ -462,7 +462,7 @@ void EmuCodeBlock::UnsafeWriteGatherPipe(int accessSize) CALL(jit->GetAsmRoutines()->fifoDirectWrite64); break; } - jit->js.fifoBytesThisBlock += accessSize >> 3; + jit->js.fifoBytesSinceCheck += accessSize >> 3; } bool EmuCodeBlock::WriteToConstAddress(int accessSize, OpArg arg, u32 address, diff --git a/Source/Core/Core/PowerPC/JitInterface.cpp 
b/Source/Core/Core/PowerPC/JitInterface.cpp index 8ace00aa0e..8b81f07fb5 100644 --- a/Source/Core/Core/PowerPC/JitInterface.cpp +++ b/Source/Core/Core/PowerPC/JitInterface.cpp @@ -260,6 +260,9 @@ void CompileExceptionCheck(ExceptionType type) case ExceptionType::EXCEPTIONS_PAIRED_QUANTIZE: exception_addresses = &jit->js.pairedQuantizeAddresses; break; + case ExceptionType::EXCEPTIONS_SPECULATIVE_CONSTANTS: + exception_addresses = &jit->js.noSpeculativeConstantsAddresses; + break; } if (PC != 0 && (exception_addresses->find(PC)) == (exception_addresses->end())) diff --git a/Source/Core/Core/PowerPC/JitInterface.h b/Source/Core/Core/PowerPC/JitInterface.h index 4608f306a1..01fd6bf911 100644 --- a/Source/Core/Core/PowerPC/JitInterface.h +++ b/Source/Core/Core/PowerPC/JitInterface.h @@ -15,7 +15,8 @@ namespace JitInterface enum class ExceptionType { EXCEPTIONS_FIFO_WRITE, - EXCEPTIONS_PAIRED_QUANTIZE + EXCEPTIONS_PAIRED_QUANTIZE, + EXCEPTIONS_SPECULATIVE_CONSTANTS }; void DoState(PointerWrap& p); diff --git a/Source/Core/Core/PowerPC/PPCAnalyst.cpp b/Source/Core/Core/PowerPC/PPCAnalyst.cpp index 9010eea42e..8e6df5d8ce 100644 --- a/Source/Core/Core/PowerPC/PPCAnalyst.cpp +++ b/Source/Core/Core/PowerPC/PPCAnalyst.cpp @@ -843,10 +843,13 @@ u32 PPCAnalyzer::Analyze(u32 address, CodeBlock* block, CodeBuffer* buffer, u32 } // Forward scan, for flags that need the other direction for calculation. 
- BitSet32 fprIsSingle, fprIsDuplicated, fprIsStoreSafe; + BitSet32 fprIsSingle, fprIsDuplicated, fprIsStoreSafe, gprDefined, gprBlockInputs; BitSet8 gqrUsed, gqrModified; for (u32 i = 0; i < block->m_num_instructions; i++) { + gprBlockInputs |= code[i].regsIn & ~gprDefined; + gprDefined |= code[i].regsOut; + code[i].fprIsSingle = fprIsSingle; code[i].fprIsDuplicated = fprIsDuplicated; code[i].fprIsStoreSafe = fprIsStoreSafe; @@ -899,6 +902,7 @@ u32 PPCAnalyzer::Analyze(u32 address, CodeBlock* block, CodeBuffer* buffer, u32 } block->m_gqr_used = gqrUsed; block->m_gqr_modified = gqrModified; + block->m_gpr_inputs = gprBlockInputs; return address; } diff --git a/Source/Core/Core/PowerPC/PPCAnalyst.h b/Source/Core/Core/PowerPC/PPCAnalyst.h index 2fddaa59f1..42625757b4 100644 --- a/Source/Core/Core/PowerPC/PPCAnalyst.h +++ b/Source/Core/Core/PowerPC/PPCAnalyst.h @@ -154,6 +154,9 @@ struct CodeBlock // Which GQRs this block modifies, if any. BitSet8 m_gqr_modified; + + // Which GPRs this block reads from before defining, if any. + BitSet32 m_gpr_inputs; }; class PPCAnalyzer