Merge pull request #4123 from hthh/improve-const-stores

Jit: FIFO optimization improvements
This commit is contained in:
Markus Wick 2016-10-04 08:32:48 +02:00 committed by GitHub
commit cf3513f7fc
16 changed files with 81 additions and 28 deletions

View File

@ -144,7 +144,7 @@ void CachedInterpreter::Jit(u32 address)
js.blockStart = PC; js.blockStart = PC;
js.firstFPInstructionFound = false; js.firstFPInstructionFound = false;
js.fifoBytesThisBlock = 0; js.fifoBytesSinceCheck = 0;
js.downcountAmount = 0; js.downcountAmount = 0;
js.curBlock = b; js.curBlock = b;

View File

@ -349,7 +349,7 @@ bool Jit64::Cleanup()
{ {
bool did_something = false; bool did_something = false;
if (jo.optimizeGatherPipe && js.fifoBytesThisBlock > 0) if (jo.optimizeGatherPipe && js.fifoBytesSinceCheck > 0)
{ {
ABI_PushRegistersAndAdjustStack({}, 0); ABI_PushRegistersAndAdjustStack({}, 0);
ABI_CallFunction(GPFifo::FastCheckGatherPipe); ABI_CallFunction(GPFifo::FastCheckGatherPipe);
@ -597,7 +597,7 @@ const u8* Jit64::DoJit(u32 em_address, PPCAnalyst::CodeBuffer* code_buf, JitBloc
js.firstFPInstructionFound = false; js.firstFPInstructionFound = false;
js.isLastInstruction = false; js.isLastInstruction = false;
js.blockStart = em_address; js.blockStart = em_address;
js.fifoBytesThisBlock = 0; js.fifoBytesSinceCheck = 0;
js.mustCheckFifo = false; js.mustCheckFifo = false;
js.curBlock = b; js.curBlock = b;
js.numLoadStoreInst = 0; js.numLoadStoreInst = 0;
@ -690,6 +690,12 @@ const u8* Jit64::DoJit(u32 em_address, PPCAnalyst::CodeBuffer* code_buf, JitBloc
} }
} }
if (js.noSpeculativeConstantsAddresses.find(js.blockStart) ==
js.noSpeculativeConstantsAddresses.end())
{
IntializeSpeculativeConstants();
}
// Translate instructions // Translate instructions
for (u32 i = 0; i < code_block.m_num_instructions; i++) for (u32 i = 0; i < code_block.m_num_instructions; i++)
{ {
@ -724,10 +730,9 @@ const u8* Jit64::DoJit(u32 em_address, PPCAnalyst::CodeBuffer* code_buf, JitBloc
js.fifoWriteAddresses.find(ops[i].address) != js.fifoWriteAddresses.end(); js.fifoWriteAddresses.find(ops[i].address) != js.fifoWriteAddresses.end();
// Gather pipe writes using an immediate address are explicitly tracked. // Gather pipe writes using an immediate address are explicitly tracked.
if (jo.optimizeGatherPipe && (js.fifoBytesThisBlock >= 32 || js.mustCheckFifo)) if (jo.optimizeGatherPipe && (js.fifoBytesSinceCheck >= 32 || js.mustCheckFifo))
{ {
if (js.fifoBytesThisBlock >= 32) js.fifoBytesSinceCheck = 0;
js.fifoBytesThisBlock -= 32;
js.mustCheckFifo = false; js.mustCheckFifo = false;
BitSet32 registersInUse = CallerSavedRegistersInUse(); BitSet32 registersInUse = CallerSavedRegistersInUse();
ABI_PushRegistersAndAdjustStack(registersInUse, 0); ABI_PushRegistersAndAdjustStack(registersInUse, 0);
@ -967,3 +972,39 @@ void Jit64::EnableOptimization()
analyzer.SetOption(PPCAnalyst::PPCAnalyzer::OPTION_CROR_MERGE); analyzer.SetOption(PPCAnalyst::PPCAnalyzer::OPTION_CROR_MERGE);
analyzer.SetOption(PPCAnalyst::PPCAnalyzer::OPTION_CARRY_MERGE); analyzer.SetOption(PPCAnalyst::PPCAnalyzer::OPTION_CARRY_MERGE);
} }
// NOTE(review): name is misspelled ("Intialize" -> "Initialize"); it matches the
// declaration and call site elsewhere in this change, so renaming must be done
// across all three at once.
void Jit64::IntializeSpeculativeConstants()
{
// If the block depends on an input register which looks like a gather pipe or MMIO related
// constant, guess that it is actually a constant input, and specialize the block based on this
// assumption. This happens when there are branches in code writing to the gather pipe, but only
// the first block loads the constant.
// Insert a check at the start of the block to verify that the value is actually constant.
// This can save a lot of backpatching and optimize gather pipe writes in more places.
// Shared out-of-line bailout stub; emitted lazily the first time any input
// register qualifies, then reused as the jump target for every guard below.
const u8* target = nullptr;
// m_gpr_inputs: GPRs this block reads before defining (computed by PPCAnalyst).
for (auto i : code_block.m_gpr_inputs)
{
// Current value of the register at compile time — the value we speculate on.
u32 compileTimeValue = PowerPC::ppcState.gpr[i];
// Speculate only on values that look like gather-pipe/MMIO pointers:
// a direct gather-pipe write address, one offset by 0x8000 (presumably a
// base kept +0x8000 so signed 16-bit displacements reach it — TODO confirm),
// or the MMIO base constant 0xCC000000.
if (PowerPC::IsOptimizableGatherPipeWrite(compileTimeValue) ||
PowerPC::IsOptimizableGatherPipeWrite(compileTimeValue - 0x8000) ||
compileTimeValue == 0xCC000000)
{
if (!target)
{
// Build the bailout once, in the far-code region: restore PC to the block
// start, report a speculative-constants failure to JitInterface (which
// blacklists this address so recompilation skips speculation), then
// re-enter the dispatcher.
SwitchToFarCode();
target = GetCodePtr();
MOV(32, PPCSTATE(pc), Imm32(js.blockStart));
ABI_PushRegistersAndAdjustStack({}, 0);
ABI_CallFunctionC(
reinterpret_cast<void*>(&JitInterface::CompileExceptionCheck),
static_cast<u32>(JitInterface::ExceptionType::EXCEPTIONS_SPECULATIVE_CONSTANTS));
ABI_PopRegistersAndAdjustStack({}, 0);
JMP(asm_routines.dispatcher, true);
SwitchToNearCode();
}
// Guard: if the register's runtime value differs from the speculated
// constant, jump to the bailout stub.
CMP(32, PPCSTATE(gpr[i]), Imm32(compileTimeValue));
J_CC(CC_NZ, target);
// Seed the register cache with the constant; dirty=false avoids a
// redundant flush since ppcState already holds this exact value.
gpr.SetImmediate32(i, compileTimeValue, false);
}
}
}

View File

@ -68,6 +68,8 @@ public:
BitSet32 CallerSavedRegistersInUse() const; BitSet32 CallerSavedRegistersInUse() const;
BitSet8 ComputeStaticGQRs(const PPCAnalyst::CodeBlock&) const; BitSet8 ComputeStaticGQRs(const PPCAnalyst::CodeBlock&) const;
void IntializeSpeculativeConstants();
JitBlockCache* GetBlockCache() override { return &blocks; } JitBlockCache* GetBlockCache() override { return &blocks; }
void Trace(); void Trace();

View File

@ -227,10 +227,12 @@ void RegCache::DiscardRegContentsIfCached(size_t preg)
} }
} }
void GPRRegCache::SetImmediate32(size_t preg, u32 immValue) void GPRRegCache::SetImmediate32(size_t preg, u32 immValue, bool dirty)
{ {
// "dirty" can be false to avoid redundantly flushing an immediate when
// processing speculative constants.
DiscardRegContentsIfCached(preg); DiscardRegContentsIfCached(preg);
regs[preg].away = true; regs[preg].away |= dirty;
regs[preg].location = Imm32(immValue); regs[preg].location = Imm32(immValue);
} }
@ -282,10 +284,7 @@ void RegCache::KillImmediate(size_t preg, bool doLoad, bool makeDirty)
void RegCache::BindToRegister(size_t i, bool doLoad, bool makeDirty) void RegCache::BindToRegister(size_t i, bool doLoad, bool makeDirty)
{ {
if (!regs[i].away && regs[i].location.IsImm()) if (!regs[i].away || regs[i].location.IsImm())
PanicAlert("Bad immediate");
if (!regs[i].away || (regs[i].away && regs[i].location.IsImm()))
{ {
X64Reg xr = GetFreeXReg(); X64Reg xr = GetFreeXReg();
if (xregs[xr].dirty) if (xregs[xr].dirty)
@ -294,7 +293,7 @@ void RegCache::BindToRegister(size_t i, bool doLoad, bool makeDirty)
PanicAlert("GetFreeXReg returned locked register"); PanicAlert("GetFreeXReg returned locked register");
xregs[xr].free = false; xregs[xr].free = false;
xregs[xr].ppcReg = i; xregs[xr].ppcReg = i;
xregs[xr].dirty = makeDirty || regs[i].location.IsImm(); xregs[xr].dirty = makeDirty || regs[i].away;
if (doLoad) if (doLoad)
LoadRegister(i, xr); LoadRegister(i, xr);
for (size_t j = 0; j < regs.size(); j++) for (size_t j = 0; j < regs.size(); j++)

View File

@ -161,7 +161,7 @@ public:
void LoadRegister(size_t preg, Gen::X64Reg newLoc) override; void LoadRegister(size_t preg, Gen::X64Reg newLoc) override;
Gen::OpArg GetDefaultLocation(size_t reg) const override; Gen::OpArg GetDefaultLocation(size_t reg) const override;
const Gen::X64Reg* GetAllocationOrder(size_t* count) override; const Gen::X64Reg* GetAllocationOrder(size_t* count) override;
void SetImmediate32(size_t preg, u32 immValue); void SetImmediate32(size_t preg, u32 immValue, bool dirty = true);
BitSet32 GetRegUtilization() override; BitSet32 GetRegUtilization() override;
BitSet32 CountRegsIn(size_t preg, u32 lookahead) override; BitSet32 CountRegsIn(size_t preg, u32 lookahead) override;
}; };

View File

@ -592,6 +592,6 @@ void Jit64::eieio(UGeckoInstruction inst)
// optimizeGatherPipe generally postpones FIFO checks to the end of the JIT block, // optimizeGatherPipe generally postpones FIFO checks to the end of the JIT block,
// which is generally safe. However postponing FIFO writes across eieio instructions // which is generally safe. However postponing FIFO writes across eieio instructions
// is incorrect (would crash NBA2K11 strap screen if we improve our FIFO detection). // is incorrect (would crash NBA2K11 strap screen if we improve our FIFO detection).
if (jo.optimizeGatherPipe && js.fifoBytesThisBlock > 0) if (jo.optimizeGatherPipe && js.fifoBytesSinceCheck > 0)
js.mustCheckFifo = true; js.mustCheckFifo = true;
} }

View File

@ -515,7 +515,7 @@ const u8* JitIL::DoJit(u32 em_address, PPCAnalyst::CodeBuffer* code_buf, JitBloc
{ {
js.isLastInstruction = false; js.isLastInstruction = false;
js.blockStart = em_address; js.blockStart = em_address;
js.fifoBytesThisBlock = 0; js.fifoBytesSinceCheck = 0;
js.curBlock = b; js.curBlock = b;
jit->js.numLoadStoreInst = 0; jit->js.numLoadStoreInst = 0;
jit->js.numFloatingPointInst = 0; jit->js.numFloatingPointInst = 0;

View File

@ -170,7 +170,7 @@ void JitArm64::Break(UGeckoInstruction inst)
void JitArm64::Cleanup() void JitArm64::Cleanup()
{ {
if (jo.optimizeGatherPipe && js.fifoBytesThisBlock > 0) if (jo.optimizeGatherPipe && js.fifoBytesSinceCheck > 0)
{ {
gpr.Lock(W0); gpr.Lock(W0);
MOVP2R(X0, &GPFifo::FastCheckGatherPipe); MOVP2R(X0, &GPFifo::FastCheckGatherPipe);
@ -424,7 +424,7 @@ const u8* JitArm64::DoJit(u32 em_address, PPCAnalyst::CodeBuffer* code_buf, JitB
js.firstFPInstructionFound = false; js.firstFPInstructionFound = false;
js.assumeNoPairedQuantize = false; js.assumeNoPairedQuantize = false;
js.blockStart = em_address; js.blockStart = em_address;
js.fifoBytesThisBlock = 0; js.fifoBytesSinceCheck = 0;
js.mustCheckFifo = false; js.mustCheckFifo = false;
js.downcountAmount = 0; js.downcountAmount = 0;
js.skipInstructions = 0; js.skipInstructions = 0;
@ -512,10 +512,9 @@ const u8* JitArm64::DoJit(u32 em_address, PPCAnalyst::CodeBuffer* code_buf, JitB
bool gatherPipeIntCheck = bool gatherPipeIntCheck =
jit->js.fifoWriteAddresses.find(ops[i].address) != jit->js.fifoWriteAddresses.end(); jit->js.fifoWriteAddresses.find(ops[i].address) != jit->js.fifoWriteAddresses.end();
if (jo.optimizeGatherPipe && (js.fifoBytesThisBlock >= 32 || js.mustCheckFifo)) if (jo.optimizeGatherPipe && (js.fifoBytesSinceCheck >= 32 || js.mustCheckFifo))
{ {
if (js.fifoBytesThisBlock >= 32) js.fifoBytesSinceCheck = 0;
js.fifoBytesThisBlock -= 32;
js.mustCheckFifo = false; js.mustCheckFifo = false;
gpr.Lock(W30); gpr.Lock(W30);

View File

@ -333,7 +333,7 @@ void JitArm64::SafeStoreFromReg(s32 dest, u32 value, s32 regOffset, u32 flags, s
} }
ADD(W0, W0, accessSize >> 3); ADD(W0, W0, accessSize >> 3);
STR(INDEX_UNSIGNED, W0, X30, count_off); STR(INDEX_UNSIGNED, W0, X30, count_off);
js.fifoBytesThisBlock += accessSize >> 3; js.fifoBytesSinceCheck += accessSize >> 3;
if (accessSize != 8) if (accessSize != 8)
gpr.Unlock(WA); gpr.Unlock(WA);
@ -833,6 +833,6 @@ void JitArm64::eieio(UGeckoInstruction inst)
// optimizeGatherPipe generally postpones FIFO checks to the end of the JIT block, // optimizeGatherPipe generally postpones FIFO checks to the end of the JIT block,
// which is generally safe. However postponing FIFO writes across eieio instructions // which is generally safe. However postponing FIFO writes across eieio instructions
// is incorrect (would crash NBA2K11 strap screen if we improve our FIFO detection). // is incorrect (would crash NBA2K11 strap screen if we improve our FIFO detection).
if (jo.optimizeGatherPipe && js.fifoBytesThisBlock > 0) if (jo.optimizeGatherPipe && js.fifoBytesSinceCheck > 0)
js.mustCheckFifo = true; js.mustCheckFifo = true;
} }

View File

@ -442,7 +442,7 @@ void JitArm64::stfXX(UGeckoInstruction inst)
ADD(W0, W0, accessSize >> 3); ADD(W0, W0, accessSize >> 3);
STR(INDEX_UNSIGNED, W0, X30, count_off); STR(INDEX_UNSIGNED, W0, X30, count_off);
js.fifoBytesThisBlock += accessSize >> 3; js.fifoBytesSinceCheck += accessSize >> 3;
if (update) if (update)
{ {

View File

@ -104,7 +104,7 @@ protected:
u8* trampolineExceptionHandler; u8* trampolineExceptionHandler;
bool mustCheckFifo; bool mustCheckFifo;
int fifoBytesThisBlock; int fifoBytesSinceCheck;
PPCAnalyst::BlockStats st; PPCAnalyst::BlockStats st;
PPCAnalyst::BlockRegStats gpa; PPCAnalyst::BlockRegStats gpa;
@ -116,6 +116,7 @@ protected:
std::unordered_set<u32> fifoWriteAddresses; std::unordered_set<u32> fifoWriteAddresses;
std::unordered_set<u32> pairedQuantizeAddresses; std::unordered_set<u32> pairedQuantizeAddresses;
std::unordered_set<u32> noSpeculativeConstantsAddresses;
}; };
PPCAnalyst::CodeBlock code_block; PPCAnalyst::CodeBlock code_block;

View File

@ -462,7 +462,7 @@ void EmuCodeBlock::UnsafeWriteGatherPipe(int accessSize)
CALL(jit->GetAsmRoutines()->fifoDirectWrite64); CALL(jit->GetAsmRoutines()->fifoDirectWrite64);
break; break;
} }
jit->js.fifoBytesThisBlock += accessSize >> 3; jit->js.fifoBytesSinceCheck += accessSize >> 3;
} }
bool EmuCodeBlock::WriteToConstAddress(int accessSize, OpArg arg, u32 address, bool EmuCodeBlock::WriteToConstAddress(int accessSize, OpArg arg, u32 address,

View File

@ -260,6 +260,9 @@ void CompileExceptionCheck(ExceptionType type)
case ExceptionType::EXCEPTIONS_PAIRED_QUANTIZE: case ExceptionType::EXCEPTIONS_PAIRED_QUANTIZE:
exception_addresses = &jit->js.pairedQuantizeAddresses; exception_addresses = &jit->js.pairedQuantizeAddresses;
break; break;
case ExceptionType::EXCEPTIONS_SPECULATIVE_CONSTANTS:
exception_addresses = &jit->js.noSpeculativeConstantsAddresses;
break;
} }
if (PC != 0 && (exception_addresses->find(PC)) == (exception_addresses->end())) if (PC != 0 && (exception_addresses->find(PC)) == (exception_addresses->end()))

View File

@ -15,7 +15,8 @@ namespace JitInterface
enum class ExceptionType enum class ExceptionType
{ {
EXCEPTIONS_FIFO_WRITE, EXCEPTIONS_FIFO_WRITE,
EXCEPTIONS_PAIRED_QUANTIZE EXCEPTIONS_PAIRED_QUANTIZE,
EXCEPTIONS_SPECULATIVE_CONSTANTS
}; };
void DoState(PointerWrap& p); void DoState(PointerWrap& p);

View File

@ -843,10 +843,13 @@ u32 PPCAnalyzer::Analyze(u32 address, CodeBlock* block, CodeBuffer* buffer, u32
} }
// Forward scan, for flags that need the other direction for calculation. // Forward scan, for flags that need the other direction for calculation.
BitSet32 fprIsSingle, fprIsDuplicated, fprIsStoreSafe; BitSet32 fprIsSingle, fprIsDuplicated, fprIsStoreSafe, gprDefined, gprBlockInputs;
BitSet8 gqrUsed, gqrModified; BitSet8 gqrUsed, gqrModified;
for (u32 i = 0; i < block->m_num_instructions; i++) for (u32 i = 0; i < block->m_num_instructions; i++)
{ {
gprBlockInputs |= code[i].regsIn & ~gprDefined;
gprDefined |= code[i].regsOut;
code[i].fprIsSingle = fprIsSingle; code[i].fprIsSingle = fprIsSingle;
code[i].fprIsDuplicated = fprIsDuplicated; code[i].fprIsDuplicated = fprIsDuplicated;
code[i].fprIsStoreSafe = fprIsStoreSafe; code[i].fprIsStoreSafe = fprIsStoreSafe;
@ -899,6 +902,7 @@ u32 PPCAnalyzer::Analyze(u32 address, CodeBlock* block, CodeBuffer* buffer, u32
} }
block->m_gqr_used = gqrUsed; block->m_gqr_used = gqrUsed;
block->m_gqr_modified = gqrModified; block->m_gqr_modified = gqrModified;
block->m_gpr_inputs = gprBlockInputs;
return address; return address;
} }

View File

@ -154,6 +154,9 @@ struct CodeBlock
// Which GQRs this block modifies, if any. // Which GQRs this block modifies, if any.
BitSet8 m_gqr_modified; BitSet8 m_gqr_modified;
// Which GPRs this block reads from before defining, if any.
BitSet32 m_gpr_inputs;
}; };
class PPCAnalyzer class PPCAnalyzer