Fastmem: jump to trampolines instead of calling them

Should be slightly faster, and also lets us skip the nops on the way back.

Remove the trampoline cache, since it isn't really useful anymore with this change.
Fiora 2015-01-02 14:47:44 -08:00
parent 9923d705df
commit 2a8936312e
4 changed files with 18 additions and 96 deletions
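
The gist of the change, condensed from the BackPatch hunks below: the patched load/store site used to CALL a cached trampoline that ended in RET, so execution came back right after the CALL and had to walk through the padding NOPs (and the exception handler had to pop the leftover return address). Now the site JMPs to a freshly generated trampoline that ends by jumping to a precomputed return pointer just past the padding. A sketch of the new read path, reusing the variables already in scope in Jitx86Base::BackPatch (codePtr, totalSize, info, registersInUse, exceptionHandler), so it is illustrative rather than standalone code:

	XEmitter emitter(codePtr);
	int padding = totalSize - BACKPATCH_SIZE;  // BACKPATCH_SIZE is 5: the size of the JMP itself
	u8* returnPtr = codePtr + 5 + padding;     // first byte after the padded patch site
	const u8* trampoline = trampolines.GenerateReadTrampoline(info, registersInUse, exceptionHandler, returnPtr);
	emitter.JMP(trampoline, true);             // the trampoline now ends with JMP(returnPtr, true) instead of RET
	if (padding > 0)
	{
		emitter.NOP(padding);                  // dead space; control jumps straight over it
	}

The write path gets the same treatment, with the padding computed from start + 5. Since the return address is baked into each generated trampoline, trampolines are specific to one call site and no longer worth caching, which is why the TrampolineCacheKey/hasher machinery goes away; and since nothing CALLs a trampoline anymore, the exception handler no longer needs to pop a stale return address off RSP.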

@@ -785,8 +785,6 @@ const u8* Jit64::DoJit(u32 em_address, PPCAnalyst::CodeBuffer *code_buf, JitBloc
 			else
 			{
 				exceptionHandlerAtLoc[js.fastmemLoadStore] = GetWritableCodePtr();
-				// the fastmem trampoline is jumping here, so we need to pop the return stack
-				ADD(64, R(RSP), Imm8(8));
 			}
 
 			gpr.Flush(FLUSH_MAINTAIN_STATE);

@@ -83,7 +83,6 @@ bool Jitx86Base::BackPatch(u32 emAddress, SContext* ctx)
 	if (!info.isMemoryWrite)
 	{
-		XEmitter emitter(codePtr);
 		int bswapNopCount;
 		if (info.byteSwap || info.operandSize == 1)
 			bswapNopCount = 0;
@@ -109,9 +108,11 @@ bool Jitx86Base::BackPatch(u32 emAddress, SContext* ctx)
 			totalSize += 3;
 		}
 
-		const u8 *trampoline = trampolines.GetReadTrampoline(info, registersInUse, exceptionHandler);
-		emitter.CALL((void *)trampoline);
+		XEmitter emitter(codePtr);
 		int padding = totalSize - BACKPATCH_SIZE;
+		u8* returnPtr = codePtr + 5 + padding;
+		const u8* trampoline = trampolines.GenerateReadTrampoline(info, registersInUse, exceptionHandler, returnPtr);
+		emitter.JMP(trampoline, true);
 		if (padding > 0)
 		{
 			emitter.NOP(padding);
@@ -162,9 +163,10 @@ bool Jitx86Base::BackPatch(u32 emAddress, SContext* ctx)
 			start = codePtr - bswapSize;
 		}
 		XEmitter emitter(start);
-		const u8 *trampoline = trampolines.GetWriteTrampoline(info, registersInUse, exceptionHandler, pc);
-		emitter.CALL((void *)trampoline);
-		ptrdiff_t padding = (codePtr - emitter.GetCodePtr()) + info.instructionSize;
+		ptrdiff_t padding = (codePtr - (start + 5)) + info.instructionSize;
+		u8* returnPtr = start + 5 + padding;
+		const u8* trampoline = trampolines.GenerateWriteTrampoline(info, registersInUse, exceptionHandler, returnPtr, pc);
+		emitter.JMP(trampoline, true);
 		if (padding > 0)
 		{
 			emitter.NOP(padding);

@@ -27,29 +27,14 @@ void TrampolineCache::Init()
 void TrampolineCache::ClearCodeSpace()
 {
 	X64CodeBlock::ClearCodeSpace();
-	cachedTrampolines.clear();
 }
 
 void TrampolineCache::Shutdown()
 {
 	FreeCodeSpace();
-	cachedTrampolines.clear();
 }
 
-const u8* TrampolineCache::GetReadTrampoline(const InstructionInfo &info, BitSet32 registersInUse, u8* exceptionHandler)
-{
-	TrampolineCacheKey key = { registersInUse, exceptionHandler, 0, info };
-
-	auto it = cachedTrampolines.find(key);
-	if (it != cachedTrampolines.end())
-		return it->second;
-
-	const u8* trampoline = GenerateReadTrampoline(info, registersInUse, exceptionHandler);
-	cachedTrampolines[key] = trampoline;
-	return trampoline;
-}
-
-const u8* TrampolineCache::GenerateReadTrampoline(const InstructionInfo &info, BitSet32 registersInUse, u8* exceptionHandler)
+const u8* TrampolineCache::GenerateReadTrampoline(const InstructionInfo &info, BitSet32 registersInUse, u8* exceptionHandler, u8* returnPtr)
 {
 	if (GetSpaceLeft() < 1024)
 		PanicAlert("Trampoline cache full");
@@ -60,9 +45,7 @@ const u8* TrampolineCache::GenerateReadTrampoline(const InstructionInfo &info, B
 	registersInUse[addrReg] = true;
 	registersInUse[dataReg] = false;
 
 	// It's a read. Easy.
-	// RSP alignment here is 8 due to the call.
-	ABI_PushRegistersAndAdjustStack(registersInUse, 8);
+	ABI_PushRegistersAndAdjustStack(registersInUse, 0);
 
 	int dataRegSize = info.operandSize == 8 ? 64 : 32;
 	MOVTwo(dataRegSize, ABI_PARAM1, addrReg, ABI_PARAM2, dataReg);
@@ -89,30 +72,17 @@ const u8* TrampolineCache::GenerateReadTrampoline(const InstructionInfo &info, B
 	if (dataReg != ABI_RETURN)
 		MOV(dataRegSize, R(dataReg), R(ABI_RETURN));
 
-	ABI_PopRegistersAndAdjustStack(registersInUse, 8);
+	ABI_PopRegistersAndAdjustStack(registersInUse, 0);
 	if (exceptionHandler)
 	{
 		TEST(32, PPCSTATE(Exceptions), Imm32(EXCEPTION_DSI));
 		J_CC(CC_NZ, exceptionHandler);
 	}
-	RET();
+	JMP(returnPtr, true);
 	return trampoline;
 }
 
-const u8* TrampolineCache::GetWriteTrampoline(const InstructionInfo &info, BitSet32 registersInUse, u8* exceptionHandler, u32 pc)
-{
-	TrampolineCacheKey key = { registersInUse, exceptionHandler, pc, info };
-
-	auto it = cachedTrampolines.find(key);
-	if (it != cachedTrampolines.end())
-		return it->second;
-
-	const u8* trampoline = GenerateWriteTrampoline(info, registersInUse, exceptionHandler, pc);
-	cachedTrampolines[key] = trampoline;
-	return trampoline;
-}
-
-const u8* TrampolineCache::GenerateWriteTrampoline(const InstructionInfo &info, BitSet32 registersInUse, u8* exceptionHandler, u32 pc)
+const u8* TrampolineCache::GenerateWriteTrampoline(const InstructionInfo &info, BitSet32 registersInUse, u8* exceptionHandler, u8* returnPtr, u32 pc)
 {
 	if (GetSpaceLeft() < 1024)
 		PanicAlert("Trampoline cache full");
@@ -122,15 +92,13 @@ const u8* TrampolineCache::GenerateWriteTrampoline(const InstructionInfo &info,
 	X64Reg dataReg = (X64Reg)info.regOperandReg;
 	X64Reg addrReg = (X64Reg)info.scaledReg;
 
 	// It's a write. Yay. Remember that we don't have to be super efficient since it's "just" a
 	// hardware access - we can take shortcuts.
 	// Don't treat FIFO writes specially for now because they require a burst
 	// check anyway.
 
 	// PC is used by memory watchpoints (if enabled) or to print accurate PC locations in debug logs
 	MOV(32, PPCSTATE(pc), Imm32(pc));
-	ABI_PushRegistersAndAdjustStack(registersInUse, 8);
+	ABI_PushRegistersAndAdjustStack(registersInUse, 0);
 	if (info.hasImmediate)
 	{
@@ -178,38 +146,13 @@ const u8* TrampolineCache::GenerateWriteTrampoline(const InstructionInfo &info,
 		break;
 	}
 
-	ABI_PopRegistersAndAdjustStack(registersInUse, 8);
+	ABI_PopRegistersAndAdjustStack(registersInUse, 0);
 	if (exceptionHandler)
 	{
 		TEST(32, PPCSTATE(Exceptions), Imm32(EXCEPTION_DSI));
 		J_CC(CC_NZ, exceptionHandler);
 	}
-	RET();
+	JMP(returnPtr, true);
 	return trampoline;
 }
-
-size_t TrampolineCacheKeyHasher::operator()(const TrampolineCacheKey& k) const
-{
-	size_t res = std::hash<int>()(k.registersInUse.m_val);
-	res ^= std::hash<int>()(k.info.operandSize) >> 1;
-	res ^= std::hash<int>()(k.info.regOperandReg) >> 2;
-	res ^= std::hash<int>()(k.info.scaledReg) >> 3;
-	res ^= std::hash<u64>()(k.info.immediate) >> 4;
-	res ^= std::hash<int>()(k.pc) >> 5;
-	res ^= std::hash<int>()(k.info.displacement) << 1;
-	res ^= std::hash<bool>()(k.info.signExtend) << 2;
-	res ^= std::hash<bool>()(k.info.hasImmediate) << 3;
-	res ^= std::hash<bool>()(k.info.isMemoryWrite) << 4;
-	res ^= std::hash<u8*>()(k.exceptionHandler) << 5;
-	return res;
-}
-
-bool TrampolineCacheKey::operator==(const TrampolineCacheKey &other) const
-{
-	return pc == other.pc &&
-	       registersInUse == other.registersInUse &&
-	       exceptionHandler == other.exceptionHandler &&
-	       info == other.info;
-}

@@ -14,34 +14,13 @@
 // We need at least this many bytes for backpatching.
 const int BACKPATCH_SIZE = 5;
 
-struct TrampolineCacheKey
-{
-	BitSet32 registersInUse;
-	u8* exceptionHandler;
-	u32 pc;
-	InstructionInfo info;
-	bool operator==(const TrampolineCacheKey &other) const;
-};
-
-struct TrampolineCacheKeyHasher
-{
-	size_t operator()(const TrampolineCacheKey& k) const;
-};
-
 class TrampolineCache : public Gen::X64CodeBlock
 {
 public:
 	void Init();
 	void Shutdown();
-	const u8* GetReadTrampoline(const InstructionInfo &info, BitSet32 registersInUse, u8* exceptionHandler);
-	const u8* GetWriteTrampoline(const InstructionInfo &info, BitSet32 registersInUse, u8* exceptionHandler, u32 pc);
+	const u8* GenerateReadTrampoline(const InstructionInfo &info, BitSet32 registersInUse, u8* exceptionHandler, u8* returnPtr);
+	const u8* GenerateWriteTrampoline(const InstructionInfo &info, BitSet32 registersInUse, u8* exceptionHandler, u8* returnPtr, u32 pc);
 	void ClearCodeSpace();
-
-private:
-	const u8* GenerateReadTrampoline(const InstructionInfo &info, BitSet32 registersInUse, u8* exceptionHandler);
-	const u8* GenerateWriteTrampoline(const InstructionInfo &info, BitSet32 registersInUse, u8* exceptionHandler, u32 pc);
-	std::unordered_map<TrampolineCacheKey, const u8*, TrampolineCacheKeyHasher> cachedTrampolines;
 };