JIT: use fastmem loads in MMU mode.

Even in games that require MMU mode, loads outside the area specified by
the BAT are rare, so fastmem is a substantial improvement.

All of the interesting changes are in the backpatch handler, to make it
generate DSI exceptions correctly.
commit 43d56febc4 (parent 7ebca647b3)
magumagu, 2014-12-11 14:12:20 -08:00
6 changed files with 110 additions and 35 deletions
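For orientation, the fastmem scheme this commit extends works roughly as follows: guest effective addresses are used directly as offsets into a large host mapping of guest memory, so a BAT-mapped load is a single host instruction; an address the BATs don't cover lands on an unmapped host page, the resulting fault is caught, and the faulting load is backpatched into a call to a slow-path trampoline that can raise a DSI. A rough sketch of the before/after code shape (the registers and exact instruction sequence below are illustrative assumptions, not taken from this commit):

```cpp
// Illustrative sketch only -- not code from this commit.
// Fast path emitted by the JIT for a 32-bit guest load (the base register
// and byteswap sequence shown here are assumptions):
//
//   mov   eax, dword ptr [rbx + rsi]   ; rbx = fastmem base, rsi = guest EA
//   bswap eax                          ; guest memory is big-endian
//
// After the first fault on an address the BATs don't map, the backpatch
// handler rewrites those bytes in place as:
//
//   call  read_trampoline              ; MMU-checked read; raises DSI on a miss
//   nop                                ; padding up to the original length
```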

View File

@@ -96,6 +96,13 @@ u16 Read_U16(const u32 _Address);
 u32 Read_U32(const u32 _Address);
 u64 Read_U64(const u32 _Address);
 
+u32 Read_S8_Val(u32 address, u32 val);
+u32 Read_U8_Val(u32 address, u32 val);
+u32 Read_S16_Val(u32 address, u32 val);
+u32 Read_U16_Val(u32 address, u32 val);
+u32 Read_U32_Val(u32 address, u32 val);
+u64 Read_U64_Val(u32 address, u64 val);
+
 // Useful helper functions, used by ARM JIT
 float Read_F32(const u32 _Address);
 double Read_F64(const u32 _Address);

View File

@@ -57,10 +57,12 @@ GXPeekZ
 // ----------------
 // Overloaded byteswap functions, for use within the templated functions below.
-inline u8 bswap(u8 val) {return val;}
-inline u16 bswap(u16 val) {return Common::swap16(val);}
-inline u32 bswap(u32 val) {return Common::swap32(val);}
-inline u64 bswap(u64 val) {return Common::swap64(val);}
+inline u8 bswap(u8 val) { return val; }
+inline s8 bswap(s8 val) { return val; }
+inline u16 bswap(u16 val) { return Common::swap16(val); }
+inline s16 bswap(s16 val) { return Common::swap16(val); }
+inline u32 bswap(u32 val) { return Common::swap32(val); }
+inline u64 bswap(u64 val) { return Common::swap64(val); }
 // =================
@@ -89,8 +91,8 @@ static u32 EFB_Read(const u32 addr)
 static void GenerateDSIException(u32 _EffectiveAddress, bool _bWrite);
 
-template <typename T>
-inline void ReadFromHardware(T &_var, const u32 em_address, Memory::XCheckTLBFlag flag)
+template <typename T, typename U>
+inline void ReadFromHardware(U &_var, const u32 em_address, Memory::XCheckTLBFlag flag)
 {
 	// TODO: Figure out the fastest order of tests for both read and write (they are probably different).
 	if ((em_address & 0xC8000000) == 0xC8000000)

@@ -98,7 +100,7 @@ inline void ReadFromHardware(T &_var, const u32 em_address, Memory::XCheckTLBFlag flag)
 		if (em_address < 0xcc000000)
 			_var = EFB_Read(em_address);
 		else
-			_var = mmio_mapping->Read<T>(em_address);
+			_var = (T)mmio_mapping->Read<typename std::make_unsigned<T>::type>(em_address);
 	}
 	else if (((em_address & 0xF0000000) == 0x80000000) ||
 		((em_address & 0xF0000000) == 0xC0000000) ||
@@ -449,6 +451,42 @@ float Read_F32(const u32 _Address)
 	return cvt.d;
 }
 
+u32 Read_U8_Val(u32 address, u32 val)
+{
+	ReadFromHardware<u8>(val, address, FLAG_READ);
+	return val;
+}
+
+u32 Read_S8_Val(u32 address, u32 val)
+{
+	ReadFromHardware<s8>(val, address, FLAG_READ);
+	return val;
+}
+
+u32 Read_U16_Val(u32 address, u32 val)
+{
+	ReadFromHardware<u16>(val, address, FLAG_READ);
+	return val;
+}
+
+u32 Read_S16_Val(u32 address, u32 val)
+{
+	ReadFromHardware<s16>(val, address, FLAG_READ);
+	return val;
+}
+
+u32 Read_U32_Val(u32 address, u32 val)
+{
+	ReadFromHardware<u32>(val, address, FLAG_READ);
+	return val;
+}
+
+u64 Read_U64_Val(u32 address, u64 val)
+{
+	ReadFromHardware<u64>(val, address, FLAG_READ);
+	return val;
+}
+
 u32 Read_U8_ZX(const u32 _Address)
 {
 	return (u32)Read_U8(_Address);
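A hedged note on the new *_Val helpers above: passing the destination register's current value in and returning it appears intended to preserve PowerPC semantics when the access faults, since ReadFromHardware raises a DSI and leaves _var untouched on a translation miss. A small illustration (addr and reg are placeholder names, and the fault behavior described is my reading of ReadFromHardware, not something stated in the diff):

```cpp
// Placeholder values; assumes 'addr' maps to a guest halfword containing 0x8000.
u32 reg = 0xDEADBEEF;                      // previous contents of the target register
u32 z = Memory::Read_U16_Val(addr, reg);   // 0x00008000 on a successful read
u32 s = Memory::Read_S16_Val(addr, reg);   // 0xFFFF8000: the s16 sign-extends into the u32
// If translation fails, ReadFromHardware queues a DSI exception and the helper
// returns 'reg' (0xDEADBEEF) unchanged, so the guest register keeps its old value.
```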

View File

@@ -444,7 +444,7 @@ void CommonAsmRoutines::GenQuantizedLoads()
 	const u8* loadPairedFloatTwo = AlignCode4();
 	if (jit->js.memcheck)
 	{
-		SafeLoadToReg(RSCRATCH_EXTRA, R(RSCRATCH_EXTRA), 64, 0, QUANTIZED_REGS_TO_SAVE, false, SAFE_LOADSTORE_NO_PROLOG);
+		SafeLoadToReg(RSCRATCH_EXTRA, R(RSCRATCH_EXTRA), 64, 0, QUANTIZED_REGS_TO_SAVE, false, SAFE_LOADSTORE_NO_FASTMEM | SAFE_LOADSTORE_NO_PROLOG);
 		ROL(64, R(RSCRATCH_EXTRA), Imm8(32));
 		MOVQ_xmm(XMM0, R(RSCRATCH_EXTRA));
 	}

@@ -464,7 +464,7 @@ void CommonAsmRoutines::GenQuantizedLoads()
 	const u8* loadPairedFloatOne = AlignCode4();
 	if (jit->js.memcheck)
 	{
-		SafeLoadToReg(RSCRATCH_EXTRA, R(RSCRATCH_EXTRA), 32, 0, QUANTIZED_REGS_TO_SAVE, false, SAFE_LOADSTORE_NO_PROLOG);
+		SafeLoadToReg(RSCRATCH_EXTRA, R(RSCRATCH_EXTRA), 32, 0, QUANTIZED_REGS_TO_SAVE, false, SAFE_LOADSTORE_NO_FASTMEM | SAFE_LOADSTORE_NO_PROLOG);
 		MOVD_xmm(XMM0, R(RSCRATCH_EXTRA));
 		UNPCKLPS(XMM0, M(m_one));
 	}

@@ -486,7 +486,7 @@ void CommonAsmRoutines::GenQuantizedLoads()
 	if (jit->js.memcheck)
 	{
 		// TODO: Support not swapping in safeLoadToReg to avoid bswapping twice
-		SafeLoadToReg(RSCRATCH_EXTRA, R(RSCRATCH_EXTRA), 16, 0, QUANTIZED_REGS_TO_SAVE_LOAD, false, SAFE_LOADSTORE_NO_PROLOG);
+		SafeLoadToReg(RSCRATCH_EXTRA, R(RSCRATCH_EXTRA), 16, 0, QUANTIZED_REGS_TO_SAVE_LOAD, false, SAFE_LOADSTORE_NO_FASTMEM | SAFE_LOADSTORE_NO_PROLOG);
 		ROR(16, R(RSCRATCH_EXTRA), Imm8(8));
 	}
 	else

@@ -512,7 +512,7 @@ void CommonAsmRoutines::GenQuantizedLoads()
 	const u8* loadPairedU8One = AlignCode4();
 	if (jit->js.memcheck)
-		SafeLoadToReg(RSCRATCH_EXTRA, R(RSCRATCH_EXTRA), 8, 0, QUANTIZED_REGS_TO_SAVE_LOAD, false, SAFE_LOADSTORE_NO_PROLOG);
+		SafeLoadToReg(RSCRATCH_EXTRA, R(RSCRATCH_EXTRA), 8, 0, QUANTIZED_REGS_TO_SAVE_LOAD, false, SAFE_LOADSTORE_NO_FASTMEM | SAFE_LOADSTORE_NO_PROLOG);
 	else
 		UnsafeLoadRegToRegNoSwap(RSCRATCH_EXTRA, RSCRATCH_EXTRA, 8, 0); // RSCRATCH_EXTRA = 0x000000xx
 	CVTSI2SS(XMM0, R(RSCRATCH_EXTRA));

@@ -525,7 +525,7 @@ void CommonAsmRoutines::GenQuantizedLoads()
 	if (jit->js.memcheck)
 	{
 		// TODO: Support not swapping in safeLoadToReg to avoid bswapping twice
-		SafeLoadToReg(RSCRATCH_EXTRA, R(RSCRATCH_EXTRA), 16, 0, QUANTIZED_REGS_TO_SAVE_LOAD, false, SAFE_LOADSTORE_NO_PROLOG);
+		SafeLoadToReg(RSCRATCH_EXTRA, R(RSCRATCH_EXTRA), 16, 0, QUANTIZED_REGS_TO_SAVE_LOAD, false, SAFE_LOADSTORE_NO_FASTMEM | SAFE_LOADSTORE_NO_PROLOG);
 		ROR(16, R(RSCRATCH_EXTRA), Imm8(8));
 	}
 	else

@@ -551,7 +551,7 @@ void CommonAsmRoutines::GenQuantizedLoads()
 	const u8* loadPairedS8One = AlignCode4();
 	if (jit->js.memcheck)
-		SafeLoadToReg(RSCRATCH_EXTRA, R(RSCRATCH_EXTRA), 8, 0, QUANTIZED_REGS_TO_SAVE_LOAD, true, SAFE_LOADSTORE_NO_PROLOG);
+		SafeLoadToReg(RSCRATCH_EXTRA, R(RSCRATCH_EXTRA), 8, 0, QUANTIZED_REGS_TO_SAVE_LOAD, true, SAFE_LOADSTORE_NO_FASTMEM | SAFE_LOADSTORE_NO_PROLOG);
 	else
 		UnsafeLoadRegToRegNoSwap(RSCRATCH_EXTRA, RSCRATCH_EXTRA, 8, 0, true);
 	CVTSI2SS(XMM0, R(RSCRATCH_EXTRA));

@@ -563,7 +563,7 @@ void CommonAsmRoutines::GenQuantizedLoads()
 	const u8* loadPairedU16Two = AlignCode4();
 	// TODO: Support not swapping in (un)safeLoadToReg to avoid bswapping twice
 	if (jit->js.memcheck)
-		SafeLoadToReg(RSCRATCH_EXTRA, R(RSCRATCH_EXTRA), 32, 0, QUANTIZED_REGS_TO_SAVE_LOAD, false, SAFE_LOADSTORE_NO_PROLOG);
+		SafeLoadToReg(RSCRATCH_EXTRA, R(RSCRATCH_EXTRA), 32, 0, QUANTIZED_REGS_TO_SAVE_LOAD, false, SAFE_LOADSTORE_NO_FASTMEM | SAFE_LOADSTORE_NO_PROLOG);
 	else
 		UnsafeLoadRegToReg(RSCRATCH_EXTRA, RSCRATCH_EXTRA, 32, 0, false);
 	ROL(32, R(RSCRATCH_EXTRA), Imm8(16));

@@ -585,7 +585,7 @@ void CommonAsmRoutines::GenQuantizedLoads()
 	const u8* loadPairedU16One = AlignCode4();
 	if (jit->js.memcheck)
-		SafeLoadToReg(RSCRATCH_EXTRA, R(RSCRATCH_EXTRA), 16, 0, QUANTIZED_REGS_TO_SAVE_LOAD, false, SAFE_LOADSTORE_NO_PROLOG);
+		SafeLoadToReg(RSCRATCH_EXTRA, R(RSCRATCH_EXTRA), 16, 0, QUANTIZED_REGS_TO_SAVE_LOAD, false, SAFE_LOADSTORE_NO_FASTMEM | SAFE_LOADSTORE_NO_PROLOG);
 	else
 		UnsafeLoadRegToReg(RSCRATCH_EXTRA, RSCRATCH_EXTRA, 16, 0, false);
 	CVTSI2SS(XMM0, R(RSCRATCH_EXTRA));

@@ -596,7 +596,7 @@ void CommonAsmRoutines::GenQuantizedLoads()
 	const u8* loadPairedS16Two = AlignCode4();
 	if (jit->js.memcheck)
-		SafeLoadToReg(RSCRATCH_EXTRA, R(RSCRATCH_EXTRA), 32, 0, QUANTIZED_REGS_TO_SAVE_LOAD, false, SAFE_LOADSTORE_NO_PROLOG);
+		SafeLoadToReg(RSCRATCH_EXTRA, R(RSCRATCH_EXTRA), 32, 0, QUANTIZED_REGS_TO_SAVE_LOAD, false, SAFE_LOADSTORE_NO_FASTMEM | SAFE_LOADSTORE_NO_PROLOG);
 	else
 		UnsafeLoadRegToReg(RSCRATCH_EXTRA, RSCRATCH_EXTRA, 32, 0, false);
 	ROL(32, R(RSCRATCH_EXTRA), Imm8(16));

@@ -618,7 +618,7 @@ void CommonAsmRoutines::GenQuantizedLoads()
 	const u8* loadPairedS16One = AlignCode4();
 	if (jit->js.memcheck)
-		SafeLoadToReg(RSCRATCH_EXTRA, R(RSCRATCH_EXTRA), 16, 0, QUANTIZED_REGS_TO_SAVE_LOAD, true, SAFE_LOADSTORE_NO_PROLOG);
+		SafeLoadToReg(RSCRATCH_EXTRA, R(RSCRATCH_EXTRA), 16, 0, QUANTIZED_REGS_TO_SAVE_LOAD, true, SAFE_LOADSTORE_NO_FASTMEM | SAFE_LOADSTORE_NO_PROLOG);
 	else
 		UnsafeLoadRegToReg(RSCRATCH_EXTRA, RSCRATCH_EXTRA, 16, 0, true);
 	CVTSI2SS(XMM0, R(RSCRATCH_EXTRA));

View File

@@ -85,9 +85,25 @@ bool Jitx86Base::BackPatch(u32 emAddress, SContext* ctx)
 		else
 			bswapNopCount = 2;
 
+		int totalSize = info.instructionSize + bswapNopCount;
+		if (info.operandSize == 2 && !info.byteSwap)
+		{
+			if ((codePtr[totalSize] & 0xF0) == 0x40)
+			{
+				++totalSize;
+			}
+			if (codePtr[totalSize] != 0xc1 || codePtr[totalSize + 2] != 0x10)
+			{
+				PanicAlert("BackPatch: didn't find expected shift %p", codePtr);
+				return nullptr;
+			}
+			info.signExtend = (codePtr[totalSize + 1] & 0x10) != 0;
+			totalSize += 3;
+		}
+
 		const u8 *trampoline = trampolines.GetReadTrampoline(info, registersInUse);
 		emitter.CALL((void *)trampoline);
-		int padding = info.instructionSize + bswapNopCount - BACKPATCH_SIZE;
+		int padding = totalSize - BACKPATCH_SIZE;
 		if (padding > 0)
 		{
 			emitter.NOP(padding);
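Some context on the scan added above, as I read it: for a 16-bit load that does not use a byteswapping load instruction, the JIT emits the byteswap and the zero/sign extension as separate instructions after the faulting load, so the handler must fold the final shift-by-16 into the patched region and recover the sign-extension flag from it. A sketch of the byte pattern being matched (the concrete registers and the MOVZX/BSWAP shape are assumptions, not spelled out in this diff):

```cpp
// Illustrative only: the rough host-code shape the new scan expects for a
// 16-bit load in the !info.byteSwap case (register choices invented).
//
//   0F B7 04 33   movzx eax, word ptr [rbx + rsi]  ; faulting load (info.instructionSize)
//   0F C8         bswap eax                        ; covered by bswapNopCount
//   C1 E8 10      shr   eax, 16                    ; unsigned load -> signExtend = false
//   C1 F8 10      sar   eax, 16                    ; signed load   -> signExtend = true
//
// 0xC1 is the x86 shift-group opcode with an imm8 count: an optional REX
// prefix (0x40-0x4F) is skipped first, the imm8 at codePtr[totalSize + 2]
// must be 0x10 (a shift by 16), and bit 0x10 of the ModRM byte separates
// SAR (sign-extending) from SHR.
```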

View File

@@ -296,8 +296,7 @@ void EmuCodeBlock::SafeLoadToReg(X64Reg reg_value, const Gen::OpArg & opAddress,
 	{
 		registersInUse[reg_value] = false;
 	}
-	if (!jit->js.memcheck &&
-		SConfig::GetInstance().m_LocalCoreStartupParameter.bFastmem &&
+	if (SConfig::GetInstance().m_LocalCoreStartupParameter.bFastmem &&
 		!opAddress.IsImm() &&
 		!(flags & (SAFE_LOADSTORE_NO_SWAP | SAFE_LOADSTORE_NO_FASTMEM))
 #ifdef ENABLE_MEM_CHECK

View File

@@ -57,40 +57,55 @@ const u8* TrampolineCache::GenerateReadTrampoline(const InstructionInfo &info, B
 	const u8* trampoline = GetCodePtr();
 	X64Reg addrReg = (X64Reg)info.scaledReg;
 	X64Reg dataReg = (X64Reg)info.regOperandReg;
+	registersInUse[addrReg] = true;
+	registersInUse[dataReg] = false;
 
 	// It's a read. Easy.
 	// RSP alignment here is 8 due to the call.
 	ABI_PushRegistersAndAdjustStack(registersInUse, 8);
 
+	int dataRegSize = info.operandSize == 8 ? 64 : 32;
+	if (dataReg == ABI_PARAM1)
+	{
+		if (addrReg == ABI_PARAM2)
+		{
+			XCHG(dataRegSize, R(ABI_PARAM1), R(ABI_PARAM2));
+		}
+		else
+		{
+			MOV(dataRegSize, R(ABI_PARAM2), R(dataReg));
+			MOV(32, R(ABI_PARAM1), R(addrReg));
+		}
+	}
+	else
+	{
 		if (addrReg != ABI_PARAM1)
 			MOV(32, R(ABI_PARAM1), R(addrReg));
+		if (dataReg != ABI_PARAM2)
+			MOV(dataRegSize, R(ABI_PARAM2), R(dataReg));
+	}
 
 	if (info.displacement)
 		ADD(32, R(ABI_PARAM1), Imm32(info.displacement));
 
 	switch (info.operandSize)
 	{
+	case 8:
+		CALL((void *)&Memory::Read_U64_Val);
+		break;
 	case 4:
-		CALL((void *)&Memory::Read_U32);
+		CALL((void *)&Memory::Read_U32_Val);
 		break;
 	case 2:
-		CALL((void *)&Memory::Read_U16);
-		SHL(32, R(ABI_RETURN), Imm8(16));
+		CALL(info.signExtend ? (void *)&Memory::Read_S16_Val : (void *)&Memory::Read_U16_Val);
 		break;
 	case 1:
-		CALL((void *)&Memory::Read_U8);
+		CALL(info.signExtend ? (void *)&Memory::Read_S8_Val : (void *)&Memory::Read_U8_Val);
 		break;
 	}
 
-	if (info.signExtend && info.operandSize == 1)
-	{
-		// Need to sign extend value from Read_U8.
-		MOVSX(32, 8, dataReg, R(ABI_RETURN));
-	}
-	else if (dataReg != EAX)
-	{
-		MOV(32, R(dataReg), R(ABI_RETURN));
-	}
+	MOV(dataRegSize, R(dataReg), R(ABI_RETURN));
 
 	ABI_PopRegistersAndAdjustStack(registersInUse, 8);
 	RET();
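One note on the argument shuffle above: because the new helpers take the destination register's old value as their second argument, the trampoline now has to move two live registers into ABI_PARAM1/ABI_PARAM2 without clobbering either, hence the XCHG special case. A hedged example (assuming the System V x86-64 ABI, where ABI_PARAM1 is RDI and ABI_PARAM2 is RSI; the instruction shown is invented for illustration):

```cpp
// Illustrative only. Suppose the faulting load was
//     movzx edi, word ptr [rbx + rsi]    ; dataReg = RDI, addrReg = RSI
// Copying the address first with "mov edi, esi" would destroy the data
// register's old value before it could be passed, so the trampoline swaps:
//     xchg edi, esi                      ; address -> RDI, old data value -> RSI
```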