Jit: FIFO optimization improvements
This introduces speculative constants, allowing FIFO writes to be optimized in more places. It also clarifies the guarantees of the FIFO optimization, changing the location of some of the checks and potentially avoiding redundant checks.
This commit is contained in:
parent 0bd5db3e05
commit 789975e350
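The core idea: when a block's input GPR currently holds a value that looks like the gather pipe or MMIO base address, the JIT guesses that the register is effectively constant, compiles the block under that assumption, and guards the block entry with a runtime check that bails out to recompilation if the guess ever fails. Below is a minimal standalone C++ sketch of that flow; the names (`SpeculationPlan`, `PlanSpeculation`, `GuardHolds`) and the concrete addresses are assumptions for illustration only, not Dolphin API — the real logic is the `Jit64::IntializeSpeculativeConstants` emitter added in the diff.

```cpp
#include <cstdint>
#include <unordered_set>
#include <vector>

// Heuristic from the patch: a value is worth speculating on if it looks like
// the gather pipe address (with or without the common 0x8000 store offset) or
// the MMIO base 0xCC000000. Treating 0xCC008000 as the gather pipe address is
// an assumption made for this sketch.
static bool LooksLikeIOConstant(uint32_t value)
{
  return value == 0xCC008000 || value - 0x8000 == 0xCC008000 || value == 0xCC000000;
}

struct SpeculationPlan
{
  int reg;         // GPR index to specialize on
  uint32_t value;  // value observed at compile time
};

// Compile time: pick block-input GPRs whose current values look like I/O
// bases, unless speculation already failed once for this block address.
std::vector<SpeculationPlan> PlanSpeculation(const std::vector<int>& block_inputs,
                                             const uint32_t gprs[32], uint32_t block_start,
                                             const std::unordered_set<uint32_t>& no_speculate)
{
  std::vector<SpeculationPlan> plans;
  if (no_speculate.count(block_start))
    return plans;
  for (int reg : block_inputs)
    if (LooksLikeIOConstant(gprs[reg]))
      plans.push_back({reg, gprs[reg]});
  return plans;
}

// Run time: the block entry guard. On mismatch the real JIT jumps to far code
// that records the block in noSpeculativeConstantsAddresses and recompiles it
// without speculation.
bool GuardHolds(const std::vector<SpeculationPlan>& plans, const uint32_t gprs[32])
{
  for (const SpeculationPlan& p : plans)
    if (gprs[p.reg] != p.value)
      return false;
  return true;
}

int main()
{
  uint32_t gprs[32] = {};
  gprs[3] = 0xCC008000;  // r3 looks like the gather pipe pointer
  std::unordered_set<uint32_t> no_speculate;
  auto plans = PlanSpeculation({3, 4}, gprs, 0x80001234, no_speculate);
  return GuardHolds(plans, gprs) ? 0 : 1;  // guard holds: run the specialized block
}
```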
@@ -136,7 +136,7 @@ void CachedInterpreter::Jit(u32 address)
   js.blockStart = PC;
   js.firstFPInstructionFound = false;
-  js.fifoBytesThisBlock = 0;
+  js.fifoBytesSinceCheck = 0;
   js.downcountAmount = 0;
   js.curBlock = b;

@@ -349,7 +349,7 @@ bool Jit64::Cleanup()
 {
   bool did_something = false;

-  if (jo.optimizeGatherPipe && js.fifoBytesThisBlock > 0)
+  if (jo.optimizeGatherPipe && js.fifoBytesSinceCheck > 0)
   {
     ABI_PushRegistersAndAdjustStack({}, 0);
     ABI_CallFunction((void*)&GPFifo::FastCheckGatherPipe);

@@ -597,7 +597,7 @@ const u8* Jit64::DoJit(u32 em_address, PPCAnalyst::CodeBuffer* code_buf, JitBloc
   js.firstFPInstructionFound = false;
   js.isLastInstruction = false;
   js.blockStart = em_address;
-  js.fifoBytesThisBlock = 0;
+  js.fifoBytesSinceCheck = 0;
   js.mustCheckFifo = false;
   js.curBlock = b;
   js.numLoadStoreInst = 0;

@@ -690,6 +690,12 @@ const u8* Jit64::DoJit(u32 em_address, PPCAnalyst::CodeBuffer* code_buf, JitBloc
     }
   }

+  if (js.noSpeculativeConstantsAddresses.find(js.blockStart) ==
+      js.noSpeculativeConstantsAddresses.end())
+  {
+    IntializeSpeculativeConstants();
+  }
+
   // Translate instructions
   for (u32 i = 0; i < code_block.m_num_instructions; i++)
   {

@@ -724,10 +730,9 @@ const u8* Jit64::DoJit(u32 em_address, PPCAnalyst::CodeBuffer* code_buf, JitBloc
         js.fifoWriteAddresses.find(ops[i].address) != js.fifoWriteAddresses.end();

     // Gather pipe writes using an immediate address are explicitly tracked.
-    if (jo.optimizeGatherPipe && (js.fifoBytesThisBlock >= 32 || js.mustCheckFifo))
+    if (jo.optimizeGatherPipe && (js.fifoBytesSinceCheck >= 32 || js.mustCheckFifo))
     {
-      if (js.fifoBytesThisBlock >= 32)
-        js.fifoBytesThisBlock -= 32;
+      js.fifoBytesSinceCheck = 0;
       js.mustCheckFifo = false;
       BitSet32 registersInUse = CallerSavedRegistersInUse();
       ABI_PushRegistersAndAdjustStack(registersInUse, 0);

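A note on the hunk above: the old counter kept residual state after a check (`fifoBytesThisBlock -= 32`), while the new one resets to zero, because the emitted `FastCheckGatherPipe` call processes everything written so far, not just 32 bytes. A hedged sketch of the two bookkeeping policies (standalone, with hypothetical helper names):

```cpp
#include <cstdio>

// Before: the counter modeled "bytes this block", so a check only consumed
// 32 bytes of the accumulated debt, and leftover debt could force another
// check later in the block.
int BytesAfterCheck_Old(int fifo_bytes) { return fifo_bytes >= 32 ? fifo_bytes - 32 : fifo_bytes; }

// After: the counter models "bytes since the last check"; one emitted check
// covers everything accumulated so far, so it resets to zero.
int BytesAfterCheck_New(int /*fifo_bytes*/) { return 0; }

int main()
{
  // With 48 bytes pending, the old scheme left 16 bytes of residual debt,
  // potentially triggering a redundant check at the end of the block.
  std::printf("old: %d, new: %d\n", BytesAfterCheck_Old(48), BytesAfterCheck_New(48));
}
```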
@@ -967,3 +972,39 @@ void Jit64::EnableOptimization()
   analyzer.SetOption(PPCAnalyst::PPCAnalyzer::OPTION_CROR_MERGE);
   analyzer.SetOption(PPCAnalyst::PPCAnalyzer::OPTION_CARRY_MERGE);
 }
+
+void Jit64::IntializeSpeculativeConstants()
+{
+  // If the block depends on an input register which looks like a gather pipe or MMIO related
+  // constant, guess that it is actually a constant input, and specialize the block based on this
+  // assumption. This happens when there are branches in code writing to the gather pipe, but only
+  // the first block loads the constant.
+  // Insert a check at the start of the block to verify that the value is actually constant.
+  // This can save a lot of backpatching and optimize gather pipe writes in more places.
+  const u8* target = nullptr;
+  for (auto i : code_block.m_gpr_inputs)
+  {
+    u32 compileTimeValue = PowerPC::ppcState.gpr[i];
+    if (PowerPC::IsOptimizableGatherPipeWrite(compileTimeValue) ||
+        PowerPC::IsOptimizableGatherPipeWrite(compileTimeValue - 0x8000) ||
+        compileTimeValue == 0xCC000000)
+    {
+      if (!target)
+      {
+        SwitchToFarCode();
+        target = GetCodePtr();
+        MOV(32, PPCSTATE(pc), Imm32(js.blockStart));
+        ABI_PushRegistersAndAdjustStack({}, 0);
+        ABI_CallFunctionC(
+            reinterpret_cast<void*>(&JitInterface::CompileExceptionCheck),
+            static_cast<u32>(JitInterface::ExceptionType::EXCEPTIONS_SPECULATIVE_CONSTANTS));
+        ABI_PopRegistersAndAdjustStack({}, 0);
+        JMP(asm_routines.dispatcher, true);
+        SwitchToNearCode();
+      }
+      CMP(32, PPCSTATE(gpr[i]), Imm32(compileTimeValue));
+      J_CC(CC_NZ, target);
+      gpr.SetImmediate32(i, compileTimeValue, false);
+    }
+  }
+}

@@ -68,6 +68,8 @@ public:
   BitSet32 CallerSavedRegistersInUse() const;
   BitSet8 ComputeStaticGQRs(const PPCAnalyst::CodeBlock&) const;

+  void IntializeSpeculativeConstants();
+
   JitBlockCache* GetBlockCache() override { return &blocks; }
   void Trace();

@@ -227,10 +227,12 @@ void RegCache::DiscardRegContentsIfCached(size_t preg)
   }
 }

-void GPRRegCache::SetImmediate32(size_t preg, u32 immValue)
+void GPRRegCache::SetImmediate32(size_t preg, u32 immValue, bool dirty)
 {
+  // "dirty" can be false to avoid redundantly flushing an immediate when
+  // processing speculative constants.
   DiscardRegContentsIfCached(preg);
-  regs[preg].away = true;
+  regs[preg].away |= dirty;
   regs[preg].location = Imm32(immValue);
 }

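Why `dirty` may be false here: an ordinary immediate (say, the result of `li`) changes the guest register and must eventually be stored back, but a speculative constant was read out of `ppcState` and verified by the entry guard, so the in-memory copy is already correct and a write-back would be redundant. A simplified, hypothetical model of that distinction (not the real register cache):

```cpp
#include <cstdint>
#include <optional>

struct CacheEntry
{
  std::optional<uint32_t> imm;  // known immediate value, if any
  bool dirty = false;           // must be written back to guest state on flush
};

// Normal immediates change the guest register, so they must be stored back.
// Speculative constants were loaded from the guest register and verified by
// the block entry guard, so storing them back is a no-op.
void SetImmediate32(CacheEntry& e, uint32_t value, bool dirty = true)
{
  e.imm = value;
  e.dirty |= dirty;  // mirrors `regs[preg].away |= dirty`: never clears dirtiness
}

int main()
{
  CacheEntry r3;
  SetImmediate32(r3, 0xCC008000, /*dirty=*/false);  // speculative constant
  return r3.dirty ? 1 : 0;                          // stays clean: nothing to flush
}
```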
@@ -282,10 +284,7 @@ void RegCache::KillImmediate(size_t preg, bool doLoad, bool makeDirty)

 void RegCache::BindToRegister(size_t i, bool doLoad, bool makeDirty)
 {
-  if (!regs[i].away && regs[i].location.IsImm())
-    PanicAlert("Bad immediate");
-
-  if (!regs[i].away || (regs[i].away && regs[i].location.IsImm()))
+  if (!regs[i].away || regs[i].location.IsImm())
   {
     X64Reg xr = GetFreeXReg();
     if (xregs[xr].dirty)

@@ -294,7 +293,7 @@ void RegCache::BindToRegister(size_t i, bool doLoad, bool makeDirty)
       PanicAlert("GetFreeXReg returned locked register");
     xregs[xr].free = false;
     xregs[xr].ppcReg = i;
-    xregs[xr].dirty = makeDirty || regs[i].location.IsImm();
+    xregs[xr].dirty = makeDirty || regs[i].away;
     if (doLoad)
       LoadRegister(i, xr);
     for (size_t j = 0; j < regs.size(); j++)

@@ -161,7 +161,7 @@ public:
   void LoadRegister(size_t preg, Gen::X64Reg newLoc) override;
   Gen::OpArg GetDefaultLocation(size_t reg) const override;
   const Gen::X64Reg* GetAllocationOrder(size_t* count) override;
-  void SetImmediate32(size_t preg, u32 immValue);
+  void SetImmediate32(size_t preg, u32 immValue, bool dirty = true);
   BitSet32 GetRegUtilization() override;
   BitSet32 CountRegsIn(size_t preg, u32 lookahead) override;
 };

@@ -607,6 +607,6 @@ void Jit64::eieio(UGeckoInstruction inst)
   // optimizeGatherPipe generally postpones FIFO checks to the end of the JIT block,
   // which is generally safe. However postponing FIFO writes across eieio instructions
   // is incorrect (would crash NBA2K11 strap screen if we improve our FIFO detection).
-  if (jo.optimizeGatherPipe && js.fifoBytesThisBlock > 0)
+  if (jo.optimizeGatherPipe && js.fifoBytesSinceCheck > 0)
     js.mustCheckFifo = true;
 }

@@ -515,7 +515,7 @@ const u8* JitIL::DoJit(u32 em_address, PPCAnalyst::CodeBuffer* code_buf, JitBloc
 {
   js.isLastInstruction = false;
   js.blockStart = em_address;
-  js.fifoBytesThisBlock = 0;
+  js.fifoBytesSinceCheck = 0;
   js.curBlock = b;
   jit->js.numLoadStoreInst = 0;
   jit->js.numFloatingPointInst = 0;

@@ -150,7 +150,7 @@ void JitArm64::Break(UGeckoInstruction inst)

 void JitArm64::Cleanup()
 {
-  if (jo.optimizeGatherPipe && js.fifoBytesThisBlock > 0)
+  if (jo.optimizeGatherPipe && js.fifoBytesSinceCheck > 0)
   {
     gpr.Lock(W0);
     MOVI2R(X0, (u64)&GPFifo::FastCheckGatherPipe);

@@ -404,7 +404,7 @@ const u8* JitArm64::DoJit(u32 em_address, PPCAnalyst::CodeBuffer* code_buf, JitB
   js.firstFPInstructionFound = false;
   js.assumeNoPairedQuantize = false;
   js.blockStart = em_address;
-  js.fifoBytesThisBlock = 0;
+  js.fifoBytesSinceCheck = 0;
   js.mustCheckFifo = false;
   js.downcountAmount = 0;
   js.skipInstructions = 0;

@@ -492,10 +492,9 @@ const u8* JitArm64::DoJit(u32 em_address, PPCAnalyst::CodeBuffer* code_buf, JitB
     bool gatherPipeIntCheck =
         jit->js.fifoWriteAddresses.find(ops[i].address) != jit->js.fifoWriteAddresses.end();

-    if (jo.optimizeGatherPipe && (js.fifoBytesThisBlock >= 32 || js.mustCheckFifo))
+    if (jo.optimizeGatherPipe && (js.fifoBytesSinceCheck >= 32 || js.mustCheckFifo))
     {
-      if (js.fifoBytesThisBlock >= 32)
-        js.fifoBytesThisBlock -= 32;
+      js.fifoBytesSinceCheck = 0;
       js.mustCheckFifo = false;

       gpr.Lock(W30);

@@ -333,7 +333,7 @@ void JitArm64::SafeStoreFromReg(s32 dest, u32 value, s32 regOffset, u32 flags, s
   }
   ADD(W0, W0, accessSize >> 3);
   STR(INDEX_UNSIGNED, W0, X30, count_off);
-  js.fifoBytesThisBlock += accessSize >> 3;
+  js.fifoBytesSinceCheck += accessSize >> 3;

   if (accessSize != 8)
     gpr.Unlock(WA);

@@ -862,6 +862,6 @@ void JitArm64::eieio(UGeckoInstruction inst)
   // optimizeGatherPipe generally postpones FIFO checks to the end of the JIT block,
   // which is generally safe. However postponing FIFO writes across eieio instructions
   // is incorrect (would crash NBA2K11 strap screen if we improve our FIFO detection).
-  if (jo.optimizeGatherPipe && js.fifoBytesThisBlock > 0)
+  if (jo.optimizeGatherPipe && js.fifoBytesSinceCheck > 0)
     js.mustCheckFifo = true;
 }

@@ -442,7 +442,7 @@ void JitArm64::stfXX(UGeckoInstruction inst)

   ADD(W0, W0, accessSize >> 3);
   STR(INDEX_UNSIGNED, W0, X30, count_off);
-  js.fifoBytesThisBlock += accessSize >> 3;
+  js.fifoBytesSinceCheck += accessSize >> 3;

   if (update)
   {

@@ -104,7 +104,7 @@ protected:
   u8* trampolineExceptionHandler;

   bool mustCheckFifo;
-  int fifoBytesThisBlock;
+  int fifoBytesSinceCheck;

   PPCAnalyst::BlockStats st;
   PPCAnalyst::BlockRegStats gpa;

@@ -116,6 +116,7 @@ protected:

   std::unordered_set<u32> fifoWriteAddresses;
   std::unordered_set<u32> pairedQuantizeAddresses;
+  std::unordered_set<u32> noSpeculativeConstantsAddresses;
 };

 PPCAnalyst::CodeBlock code_block;

@@ -475,7 +475,7 @@ void EmuCodeBlock::UnsafeWriteGatherPipe(int accessSize)
     CALL(jit->GetAsmRoutines()->fifoDirectWrite64);
     break;
   }
-  jit->js.fifoBytesThisBlock += accessSize >> 3;
+  jit->js.fifoBytesSinceCheck += accessSize >> 3;
 }

 bool EmuCodeBlock::WriteToConstAddress(int accessSize, OpArg arg, u32 address,

@@ -260,6 +260,9 @@ void CompileExceptionCheck(ExceptionType type)
   case ExceptionType::EXCEPTIONS_PAIRED_QUANTIZE:
     exception_addresses = &jit->js.pairedQuantizeAddresses;
     break;
+  case ExceptionType::EXCEPTIONS_SPECULATIVE_CONSTANTS:
+    exception_addresses = &jit->js.noSpeculativeConstantsAddresses;
+    break;
   }

   if (PC != 0 && (exception_addresses->find(PC)) == (exception_addresses->end()))

@@ -15,7 +15,8 @@ namespace JitInterface
 enum class ExceptionType
 {
   EXCEPTIONS_FIFO_WRITE,
-  EXCEPTIONS_PAIRED_QUANTIZE
+  EXCEPTIONS_PAIRED_QUANTIZE,
+  EXCEPTIONS_SPECULATIVE_CONSTANTS
 };

 void DoState(PointerWrap& p);

@@ -849,10 +849,13 @@ u32 PPCAnalyzer::Analyze(u32 address, CodeBlock* block, CodeBuffer* buffer, u32
   }

   // Forward scan, for flags that need the other direction for calculation.
-  BitSet32 fprIsSingle, fprIsDuplicated, fprIsStoreSafe;
+  BitSet32 fprIsSingle, fprIsDuplicated, fprIsStoreSafe, gprDefined, gprBlockInputs;
   BitSet8 gqrUsed, gqrModified;
   for (u32 i = 0; i < block->m_num_instructions; i++)
   {
+    gprBlockInputs |= code[i].regsIn & ~gprDefined;
+    gprDefined |= code[i].regsOut;
+
     code[i].fprIsSingle = fprIsSingle;
     code[i].fprIsDuplicated = fprIsDuplicated;
     code[i].fprIsStoreSafe = fprIsStoreSafe;

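The two lines added to the forward scan compute a classic read-before-write (live-in) set: a GPR counts as a block input only if some instruction reads it before any earlier instruction in the block defines it. A self-contained sketch of the same computation, with a hypothetical `Op` type standing in for `PPCAnalyst::CodeOp`:

```cpp
#include <bitset>
#include <vector>

struct Op { std::bitset<32> regsIn, regsOut; };

// A register is a block input if it is read before any earlier instruction
// in the same block wrote it.
std::bitset<32> ComputeBlockInputs(const std::vector<Op>& code)
{
  std::bitset<32> defined, inputs;
  for (const Op& op : code)
  {
    inputs |= op.regsIn & ~defined;  // read-before-write => live-in
    defined |= op.regsOut;
  }
  return inputs;
}

int main()
{
  // op0 writes r1; op1 reads r1 and r2 — so only r2 is a block input.
  std::vector<Op> code(2);
  code[0].regsOut.set(1);
  code[1].regsIn.set(1);
  code[1].regsIn.set(2);
  return ComputeBlockInputs(code).test(2) ? 0 : 1;
}
```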
@@ -905,6 +908,7 @@ u32 PPCAnalyzer::Analyze(u32 address, CodeBlock* block, CodeBuffer* buffer, u32
   }
   block->m_gqr_used = gqrUsed;
   block->m_gqr_modified = gqrModified;
+  block->m_gpr_inputs = gprBlockInputs;
   return address;
 }

@@ -154,6 +154,9 @@ struct CodeBlock

   // Which GQRs this block modifies, if any.
   BitSet8 m_gqr_modified;
+
+  // Which GPRs this block reads from before defining, if any.
+  BitSet32 m_gpr_inputs;
 };

 class PPCAnalyzer