Jit: FIFO optimization improvements

This introduces speculative constants, allowing FIFO writes to be
optimized in more places.

It also clarifies the guarantees of the FIFO optimization, moving some
of the checks and potentially avoiding redundant ones. As part of this,
fifoBytesThisBlock is renamed to fifoBytesSinceCheck, reflecting that it
counts bytes since the last emitted check and is reset rather than
decremented.
hthh 2016-06-17 21:31:27 +10:00
parent 0bd5db3e05
commit 789975e350
16 changed files with 81 additions and 28 deletions
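
A minimal sketch of the idea in plain C++, with hypothetical names (the real implementation is Jit64::InitializeSpeculativeConstants in the diff below): when compiling a block, the JIT samples an input GPR that currently holds a gather-pipe-looking address, assumes it is constant, and starts the block with a guard that bails out to recompilation if the assumption ever breaks.

#include <cstdint>

// Toy model, not Dolphin's emitter API: a compiled block records one
// guessed (register, value) pair and begins with a guard over it.
struct SpeculatedInput
{
  int gpr_index;   // block input GPR guessed to be constant
  uint32_t guess;  // the value it held when the block was compiled
};

// true: run the fast path, treating the GPR as the immediate 'guess'.
// false: the guess broke; recompile the block without speculation.
bool SpeculationGuard(const SpeculatedInput& s, const uint32_t* gpr)
{
  return gpr[s.gpr_index] == s.guess;
}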

View File

@@ -136,7 +136,7 @@ void CachedInterpreter::Jit(u32 address)
js.blockStart = PC;
js.firstFPInstructionFound = false;
-js.fifoBytesThisBlock = 0;
+js.fifoBytesSinceCheck = 0;
js.downcountAmount = 0;
js.curBlock = b;

View File

@@ -349,7 +349,7 @@ bool Jit64::Cleanup()
{
bool did_something = false;
-if (jo.optimizeGatherPipe && js.fifoBytesThisBlock > 0)
+if (jo.optimizeGatherPipe && js.fifoBytesSinceCheck > 0)
{
ABI_PushRegistersAndAdjustStack({}, 0);
ABI_CallFunction((void*)&GPFifo::FastCheckGatherPipe);
@@ -597,7 +597,7 @@ const u8* Jit64::DoJit(u32 em_address, PPCAnalyst::CodeBuffer* code_buf, JitBloc
js.firstFPInstructionFound = false;
js.isLastInstruction = false;
js.blockStart = em_address;
-js.fifoBytesThisBlock = 0;
+js.fifoBytesSinceCheck = 0;
js.mustCheckFifo = false;
js.curBlock = b;
js.numLoadStoreInst = 0;
@@ -690,6 +690,12 @@ const u8* Jit64::DoJit(u32 em_address, PPCAnalyst::CodeBuffer* code_buf, JitBloc
}
}
+if (js.noSpeculativeConstantsAddresses.find(js.blockStart) ==
+js.noSpeculativeConstantsAddresses.end())
+{
+InitializeSpeculativeConstants();
+}
+
// Translate instructions
for (u32 i = 0; i < code_block.m_num_instructions; i++)
{
@@ -724,10 +730,9 @@ const u8* Jit64::DoJit(u32 em_address, PPCAnalyst::CodeBuffer* code_buf, JitBloc
js.fifoWriteAddresses.find(ops[i].address) != js.fifoWriteAddresses.end();
// Gather pipe writes using an immediate address are explicitly tracked.
-if (jo.optimizeGatherPipe && (js.fifoBytesThisBlock >= 32 || js.mustCheckFifo))
+if (jo.optimizeGatherPipe && (js.fifoBytesSinceCheck >= 32 || js.mustCheckFifo))
{
-if (js.fifoBytesThisBlock >= 32)
-js.fifoBytesThisBlock -= 32;
+js.fifoBytesSinceCheck = 0;
js.mustCheckFifo = false;
BitSet32 registersInUse = CallerSavedRegistersInUse();
ABI_PushRegistersAndAdjustStack(registersInUse, 0);
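
The renamed counter makes the policy explicit: fifoBytesSinceCheck counts bytes written since the last emitted check and is simply reset to zero when a check is placed, replacing the old 32-byte subtraction from a per-block total. A hedged sketch of the per-instruction decision, modeling just the two fields involved:

struct FifoState
{
  int fifoBytesSinceCheck = 0;  // bytes written since the last check
  bool mustCheckFifo = false;   // a forced check is pending (eieio)
};

// Mirrors the condition above: emit a FastCheckGatherPipe call once 32
// or more bytes are pending, or when an early check was demanded.
bool ShouldEmitFifoCheck(const FifoState& js)
{
  return js.fifoBytesSinceCheck >= 32 || js.mustCheckFifo;
}

After the check is emitted, both fields are cleared, matching the two assignments in the hunk above.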
@@ -967,3 +972,39 @@ void Jit64::EnableOptimization()
analyzer.SetOption(PPCAnalyst::PPCAnalyzer::OPTION_CROR_MERGE);
analyzer.SetOption(PPCAnalyst::PPCAnalyzer::OPTION_CARRY_MERGE);
}
+
+void Jit64::InitializeSpeculativeConstants()
+{
+// If the block depends on an input register which looks like a gather pipe or MMIO related
+// constant, guess that it is actually a constant input, and specialize the block based on this
+// assumption. This happens when there are branches in code writing to the gather pipe, but only
+// the first block loads the constant.
+// Insert a check at the start of the block to verify that the value is actually constant.
+// This can save a lot of backpatching and optimize gather pipe writes in more places.
+const u8* target = nullptr;
+for (auto i : code_block.m_gpr_inputs)
+{
+u32 compileTimeValue = PowerPC::ppcState.gpr[i];
+if (PowerPC::IsOptimizableGatherPipeWrite(compileTimeValue) ||
+PowerPC::IsOptimizableGatherPipeWrite(compileTimeValue - 0x8000) ||
+compileTimeValue == 0xCC000000)
+{
+if (!target)
+{
+SwitchToFarCode();
+target = GetCodePtr();
+MOV(32, PPCSTATE(pc), Imm32(js.blockStart));
+ABI_PushRegistersAndAdjustStack({}, 0);
+ABI_CallFunctionC(
+reinterpret_cast<void*>(&JitInterface::CompileExceptionCheck),
+static_cast<u32>(JitInterface::ExceptionType::EXCEPTIONS_SPECULATIVE_CONSTANTS));
+ABI_PopRegistersAndAdjustStack({}, 0);
+JMP(asm_routines.dispatcher, true);
+SwitchToNearCode();
+}
+CMP(32, PPCSTATE(gpr[i]), Imm32(compileTimeValue));
+J_CC(CC_NZ, target);
+gpr.SetImmediate32(i, compileTimeValue, false);
+}
+}
+}
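
The payoff of a successful guess: once gpr[i] is pinned to an immediate, address classification that would otherwise happen in emitted code happens at compile time instead. A rough stand-in for PowerPC::IsOptimizableGatherPipeWrite (0xCC008000, the gather pipe's MMIO address on GameCube/Wii, is assumed here for illustration):

#include <cstdint>

// With a speculative constant the store address is known while
// compiling, so this test runs in the compiler rather than at run time.
constexpr uint32_t GATHER_PIPE = 0xCC008000;

bool LooksLikeGatherPipeWrite(uint32_t address)
{
  return address == GATHER_PIPE;
}

Note that the loop above also accepts compileTimeValue - 0x8000 and the bare 0xCC000000 MMIO base, catching code that keeps a base pointer in a register and stores through a 0x8000 offset.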

View File

@@ -68,6 +68,8 @@ public:
BitSet32 CallerSavedRegistersInUse() const;
BitSet8 ComputeStaticGQRs(const PPCAnalyst::CodeBlock&) const;
+void InitializeSpeculativeConstants();
+
JitBlockCache* GetBlockCache() override { return &blocks; }
void Trace();

View File

@@ -227,10 +227,12 @@ void RegCache::DiscardRegContentsIfCached(size_t preg)
}
}
-void GPRRegCache::SetImmediate32(size_t preg, u32 immValue)
+void GPRRegCache::SetImmediate32(size_t preg, u32 immValue, bool dirty)
{
+// "dirty" can be false to avoid redundantly flushing an immediate when
+// processing speculative constants.
DiscardRegContentsIfCached(preg);
-regs[preg].away = true;
+regs[preg].away |= dirty;
regs[preg].location = Imm32(immValue);
}
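
Why dirty can be false here: a speculative constant is read out of ppcState.gpr at compile time, so the cached immediate already equals the in-memory value, and marking it away would only make the allocator write the same value back. A sketch of the flush rule this relies on, with simplified types:

#include <cstdint>

struct CachedGpr
{
  bool away = false;  // cached value differs from ppcState ("dirty")
  uint32_t imm = 0;   // the cached immediate
};

// Simplified flush rule: only 'away' registers are written back, so a
// clean speculative constant never costs a store.
void FlushGpr(CachedGpr& r, uint32_t& ppc_gpr)
{
  if (r.away)
    ppc_gpr = r.imm;
  r.away = false;
}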
@@ -282,10 +284,7 @@ void RegCache::KillImmediate(size_t preg, bool doLoad, bool makeDirty)
void RegCache::BindToRegister(size_t i, bool doLoad, bool makeDirty)
{
-if (!regs[i].away && regs[i].location.IsImm())
-PanicAlert("Bad immediate");
-
-if (!regs[i].away || (regs[i].away && regs[i].location.IsImm()))
+if (!regs[i].away || regs[i].location.IsImm())
{
X64Reg xr = GetFreeXReg();
if (xregs[xr].dirty)
@@ -294,7 +293,7 @@ void RegCache::BindToRegister(size_t i, bool doLoad, bool makeDirty)
PanicAlert("GetFreeXReg returned locked register");
xregs[xr].free = false;
xregs[xr].ppcReg = i;
-xregs[xr].dirty = makeDirty || regs[i].location.IsImm();
+xregs[xr].dirty = makeDirty || regs[i].away;
if (doLoad)
LoadRegister(i, xr);
for (size_t j = 0; j < regs.size(); j++)
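
The dirty-bit change is the other half of the non-dirty immediate: a speculative constant sits in the cache as an immediate that already matches ppcState, so binding it to a host register must not force a write-back. A toy model of the old and new rules, with fields simplified:

struct ToyReg
{
  bool away;    // cached value differs from ppcState
  bool is_imm;  // value is cached as an immediate
};

// Dirty bit of the host register after binding a PPC register.
bool HostRegDirty(ToyReg r, bool makeDirty)
{
  // Old rule: makeDirty || r.is_imm, which flushed clean speculative
  // constants (immediates that are not 'away') back redundantly.
  // New rule, as in the diff: an immediate forces a write-back only
  // when it is actually 'away'.
  return makeDirty || r.away;
}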

View File

@@ -161,7 +161,7 @@ public:
void LoadRegister(size_t preg, Gen::X64Reg newLoc) override;
Gen::OpArg GetDefaultLocation(size_t reg) const override;
const Gen::X64Reg* GetAllocationOrder(size_t* count) override;
-void SetImmediate32(size_t preg, u32 immValue);
+void SetImmediate32(size_t preg, u32 immValue, bool dirty = true);
BitSet32 GetRegUtilization() override;
BitSet32 CountRegsIn(size_t preg, u32 lookahead) override;
};

View File

@@ -607,6 +607,6 @@ void Jit64::eieio(UGeckoInstruction inst)
// optimizeGatherPipe generally postpones FIFO checks to the end of the JIT block,
// which is generally safe. However postponing FIFO writes across eieio instructions
// is incorrect (would crash NBA2K11 strap screen if we improve our FIFO detection).
-if (jo.optimizeGatherPipe && js.fifoBytesThisBlock > 0)
+if (jo.optimizeGatherPipe && js.fifoBytesSinceCheck > 0)
js.mustCheckFifo = true;
}
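
A concrete (hypothetical) failure mode motivating this: guest code writes a GX command to the gather pipe, issues eieio as a barrier, then touches an MMIO register expecting the FIFO bytes to be visible first; deferring the check past the barrier would reorder those effects. The rule itself is tiny:

struct FifoModel
{
  int fifoBytesSinceCheck = 0;
  bool mustCheckFifo = false;
};

// eieio emits no check itself; it forces one at the next check point,
// and only when gather pipe writes are actually pending.
void OnEieio(FifoModel& js)
{
  if (js.fifoBytesSinceCheck > 0)
    js.mustCheckFifo = true;
}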

View File

@@ -515,7 +515,7 @@ const u8* JitIL::DoJit(u32 em_address, PPCAnalyst::CodeBuffer* code_buf, JitBloc
{
js.isLastInstruction = false;
js.blockStart = em_address;
-js.fifoBytesThisBlock = 0;
+js.fifoBytesSinceCheck = 0;
js.curBlock = b;
jit->js.numLoadStoreInst = 0;
jit->js.numFloatingPointInst = 0;

View File

@@ -150,7 +150,7 @@ void JitArm64::Break(UGeckoInstruction inst)
void JitArm64::Cleanup()
{
-if (jo.optimizeGatherPipe && js.fifoBytesThisBlock > 0)
+if (jo.optimizeGatherPipe && js.fifoBytesSinceCheck > 0)
{
gpr.Lock(W0);
MOVI2R(X0, (u64)&GPFifo::FastCheckGatherPipe);
@@ -404,7 +404,7 @@ const u8* JitArm64::DoJit(u32 em_address, PPCAnalyst::CodeBuffer* code_buf, JitB
js.firstFPInstructionFound = false;
js.assumeNoPairedQuantize = false;
js.blockStart = em_address;
-js.fifoBytesThisBlock = 0;
+js.fifoBytesSinceCheck = 0;
js.mustCheckFifo = false;
js.downcountAmount = 0;
js.skipInstructions = 0;
@@ -492,10 +492,9 @@ const u8* JitArm64::DoJit(u32 em_address, PPCAnalyst::CodeBuffer* code_buf, JitB
bool gatherPipeIntCheck =
jit->js.fifoWriteAddresses.find(ops[i].address) != jit->js.fifoWriteAddresses.end();
-if (jo.optimizeGatherPipe && (js.fifoBytesThisBlock >= 32 || js.mustCheckFifo))
+if (jo.optimizeGatherPipe && (js.fifoBytesSinceCheck >= 32 || js.mustCheckFifo))
{
-if (js.fifoBytesThisBlock >= 32)
-js.fifoBytesThisBlock -= 32;
+js.fifoBytesSinceCheck = 0;
js.mustCheckFifo = false;
gpr.Lock(W30);

View File

@@ -333,7 +333,7 @@ void JitArm64::SafeStoreFromReg(s32 dest, u32 value, s32 regOffset, u32 flags, s
}
ADD(W0, W0, accessSize >> 3);
STR(INDEX_UNSIGNED, W0, X30, count_off);
-js.fifoBytesThisBlock += accessSize >> 3;
+js.fifoBytesSinceCheck += accessSize >> 3;
if (accessSize != 8)
gpr.Unlock(WA);
@@ -862,6 +862,6 @@ void JitArm64::eieio(UGeckoInstruction inst)
// optimizeGatherPipe generally postpones FIFO checks to the end of the JIT block,
// which is generally safe. However postponing FIFO writes across eieio instructions
// is incorrect (would crash NBA2K11 strap screen if we improve our FIFO detection).
-if (jo.optimizeGatherPipe && js.fifoBytesThisBlock > 0)
+if (jo.optimizeGatherPipe && js.fifoBytesSinceCheck > 0)
js.mustCheckFifo = true;
}

View File

@@ -442,7 +442,7 @@ void JitArm64::stfXX(UGeckoInstruction inst)
ADD(W0, W0, accessSize >> 3);
STR(INDEX_UNSIGNED, W0, X30, count_off);
-js.fifoBytesThisBlock += accessSize >> 3;
+js.fifoBytesSinceCheck += accessSize >> 3;
if (update)
{

View File

@@ -104,7 +104,7 @@ protected:
u8* trampolineExceptionHandler;
bool mustCheckFifo;
-int fifoBytesThisBlock;
+int fifoBytesSinceCheck;
PPCAnalyst::BlockStats st;
PPCAnalyst::BlockRegStats gpa;
@@ -116,6 +116,7 @@ protected:
std::unordered_set<u32> fifoWriteAddresses;
std::unordered_set<u32> pairedQuantizeAddresses;
+std::unordered_set<u32> noSpeculativeConstantsAddresses;
};
PPCAnalyst::CodeBlock code_block;

View File

@@ -475,7 +475,7 @@ void EmuCodeBlock::UnsafeWriteGatherPipe(int accessSize)
CALL(jit->GetAsmRoutines()->fifoDirectWrite64);
break;
}
-jit->js.fifoBytesThisBlock += accessSize >> 3;
+jit->js.fifoBytesSinceCheck += accessSize >> 3;
}
bool EmuCodeBlock::WriteToConstAddress(int accessSize, OpArg arg, u32 address,

View File

@@ -260,6 +260,9 @@ void CompileExceptionCheck(ExceptionType type)
case ExceptionType::EXCEPTIONS_PAIRED_QUANTIZE:
exception_addresses = &jit->js.pairedQuantizeAddresses;
break;
+case ExceptionType::EXCEPTIONS_SPECULATIVE_CONSTANTS:
+exception_addresses = &jit->js.noSpeculativeConstantsAddresses;
+break;
}
if (PC != 0 && (exception_addresses->find(PC)) == (exception_addresses->end()))
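
This is the recovery path for a failed guess: the far code emitted by InitializeSpeculativeConstants sets PC to the block start and calls CompileExceptionCheck with EXCEPTIONS_SPECULATIVE_CONSTANTS, which lands here, records the address, and forces a recompile; the lookup in DoJit then skips speculation for that block. A condensed model, using the container type from JitBase.h:

#include <cstdint>
#include <unordered_set>

// Record the failing block so the next compilation skips
// InitializeSpeculativeConstants for it (DoJit consults
// noSpeculativeConstantsAddresses before speculating).
void OnSpeculationFailed(uint32_t block_start,
                         std::unordered_set<uint32_t>& no_speculation)
{
  no_speculation.insert(block_start);
  // The real code then invalidates the block so it is rebuilt,
  // this time without speculative constants.
}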

View File

@@ -15,7 +15,8 @@ namespace JitInterface
enum class ExceptionType
{
EXCEPTIONS_FIFO_WRITE,
-EXCEPTIONS_PAIRED_QUANTIZE
+EXCEPTIONS_PAIRED_QUANTIZE,
+EXCEPTIONS_SPECULATIVE_CONSTANTS
};
void DoState(PointerWrap& p);

View File

@@ -849,10 +849,13 @@ u32 PPCAnalyzer::Analyze(u32 address, CodeBlock* block, CodeBuffer* buffer, u32
}
// Forward scan, for flags that need the other direction for calculation.
-BitSet32 fprIsSingle, fprIsDuplicated, fprIsStoreSafe;
+BitSet32 fprIsSingle, fprIsDuplicated, fprIsStoreSafe, gprDefined, gprBlockInputs;
BitSet8 gqrUsed, gqrModified;
for (u32 i = 0; i < block->m_num_instructions; i++)
{
+gprBlockInputs |= code[i].regsIn & ~gprDefined;
+gprDefined |= code[i].regsOut;
+
code[i].fprIsSingle = fprIsSingle;
code[i].fprIsDuplicated = fprIsDuplicated;
code[i].fprIsStoreSafe = fprIsStoreSafe;
@@ -905,6 +908,7 @@ u32 PPCAnalyzer::Analyze(u32 address, CodeBlock* block, CodeBuffer* buffer, u32
}
block->m_gqr_used = gqrUsed;
block->m_gqr_modified = gqrModified;
+block->m_gpr_inputs = gprBlockInputs;
return address;
}
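
The two new masks implement a classic liveness-style scan: a GPR is a block input if some instruction reads it before any earlier instruction in the same block has defined it. The same computation, sketched with plain bitmasks in place of BitSet32:

#include <cstdint>

// regs_in[i] and regs_out[i] are masks of the GPRs that instruction i
// reads and writes, respectively.
uint32_t ComputeBlockInputs(const uint32_t* regs_in,
                            const uint32_t* regs_out, int n)
{
  uint32_t defined = 0, inputs = 0;
  for (int i = 0; i < n; i++)
  {
    inputs |= regs_in[i] & ~defined;  // read before any write: an input
    defined |= regs_out[i];           // now defined within the block
  }
  return inputs;
}

For example, if the block's first instruction is add r4, r3, r3, then r3 becomes a block input, while a later read of r4 does not.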

View File

@@ -154,6 +154,9 @@ struct CodeBlock
// Which GQRs this block modifies, if any.
BitSet8 m_gqr_modified;
+
+// Which GPRs this block reads from before defining, if any.
+BitSet32 m_gpr_inputs;
};
class PPCAnalyzer