From b1296a7825078ac8354e363117613d5e3fbb64df Mon Sep 17 00:00:00 2001 From: Matt Mastracci Date: Sun, 28 Feb 2016 14:33:53 -0700 Subject: [PATCH] Refactor fastmem/trampoline code. Simplification to avoid reading back the generated instructions, allowing us to handle all possible cases. --- Source/Core/Common/CMakeLists.txt | 1 - Source/Core/Common/Common.vcxproj | 2 - Source/Core/Common/Common.vcxproj.filters | 2 - Source/Core/Common/x64Analyzer.cpp | 233 --------------- Source/Core/Common/x64Analyzer.h | 44 --- Source/Core/Common/x64Emitter.cpp | 24 +- Source/Core/Common/x64Emitter.h | 35 ++- Source/Core/Core/MemTools.cpp | 1 - Source/Core/Core/PowerPC/Jit64/Jit.h | 1 + .../Core/Core/PowerPC/Jit64/Jit_LoadStore.cpp | 12 - .../PowerPC/Jit64/Jit_LoadStoreFloating.cpp | 4 - .../PowerPC/Jit64/Jit_LoadStorePaired.cpp | 234 ++------------- .../PowerPC/Jit64Common/Jit64AsmCommon.cpp | 2 - Source/Core/Core/PowerPC/Jit64IL/JitIL.h | 1 + .../Core/PowerPC/JitCommon/JitBackpatch.cpp | 180 ++++------- Source/Core/Core/PowerPC/JitCommon/JitBase.h | 3 + .../Core/Core/PowerPC/JitCommon/Jit_Util.cpp | 279 +++++++++++------- Source/Core/Core/PowerPC/JitCommon/Jit_Util.h | 66 ++++- .../PowerPC/JitCommon/TrampolineCache.cpp | 143 ++------- .../Core/PowerPC/JitCommon/TrampolineCache.h | 12 +- 20 files changed, 385 insertions(+), 894 deletions(-) delete mode 100644 Source/Core/Common/x64Analyzer.cpp delete mode 100644 Source/Core/Common/x64Analyzer.h diff --git a/Source/Core/Common/CMakeLists.txt b/Source/Core/Common/CMakeLists.txt index d1ec894a70..e35b7cd746 100644 --- a/Source/Core/Common/CMakeLists.txt +++ b/Source/Core/Common/CMakeLists.txt @@ -29,7 +29,6 @@ set(SRCS Analytics.cpp TraversalClient.cpp Version.cpp x64ABI.cpp - x64Analyzer.cpp x64Emitter.cpp Crypto/bn.cpp Crypto/ec.cpp diff --git a/Source/Core/Common/Common.vcxproj b/Source/Core/Common/Common.vcxproj index 101b082f0e..20edab8a65 100644 --- a/Source/Core/Common/Common.vcxproj +++ b/Source/Core/Common/Common.vcxproj @@ -133,7 +133,6 @@ - @@ -178,7 +177,6 @@ - diff --git a/Source/Core/Common/Common.vcxproj.filters b/Source/Core/Common/Common.vcxproj.filters index 6bb69f6a17..634730bf0a 100644 --- a/Source/Core/Common/Common.vcxproj.filters +++ b/Source/Core/Common/Common.vcxproj.filters @@ -62,7 +62,6 @@ - Logging @@ -253,7 +252,6 @@ - diff --git a/Source/Core/Common/x64Analyzer.cpp b/Source/Core/Common/x64Analyzer.cpp deleted file mode 100644 index 773f6ebdbc..0000000000 --- a/Source/Core/Common/x64Analyzer.cpp +++ /dev/null @@ -1,233 +0,0 @@ -// Copyright 2008 Dolphin Emulator Project -// Licensed under GPLv2+ -// Refer to the license.txt file included.
- -#include "Common/x64Analyzer.h" - -bool DisassembleMov(const unsigned char* codePtr, InstructionInfo* info) -{ - unsigned const char* startCodePtr = codePtr; - u8 rex = 0; - u32 opcode; - int opcode_length; - - // Check for regular prefix - info->operandSize = 4; - info->zeroExtend = false; - info->signExtend = false; - info->hasImmediate = false; - info->isMemoryWrite = false; - info->byteSwap = false; - - u8 modRMbyte = 0; - u8 sibByte = 0; - bool hasModRM = false; - - int displacementSize = 0; - - if (*codePtr == 0x66) - { - info->operandSize = 2; - codePtr++; - } - else if (*codePtr == 0x67) - { - codePtr++; - } - - // Check for REX prefix - if ((*codePtr & 0xF0) == 0x40) - { - rex = *codePtr; - if (rex & 8) // REX.W - { - info->operandSize = 8; - } - codePtr++; - } - - opcode = *codePtr++; - opcode_length = 1; - if (opcode == 0x0F) - { - opcode = (opcode << 8) | *codePtr++; - opcode_length = 2; - if ((opcode & 0xFB) == 0x38) - { - opcode = (opcode << 8) | *codePtr++; - opcode_length = 3; - } - } - - switch (opcode_length) - { - case 1: - if ((opcode & 0xF0) == 0x80 || ((opcode & 0xF8) == 0xC0 && (opcode & 0x0E) != 0x02)) - { - modRMbyte = *codePtr++; - hasModRM = true; - } - break; - case 2: - if (((opcode & 0xF0) == 0x00 && (opcode & 0x0F) >= 0x04 && (opcode & 0x0D) != 0x0D) || - ((opcode & 0xF0) == 0xA0 && (opcode & 0x07) <= 0x02) || (opcode & 0xF0) == 0x30 || - (opcode & 0xFF) == 0x77 || (opcode & 0xF0) == 0x80 || (opcode & 0xF8) == 0xC8) - { - // No mod R/M byte - } - else - { - modRMbyte = *codePtr++; - hasModRM = true; - } - break; - case 3: - // TODO: support more 3-byte opcode instructions - if ((opcode & 0xFE) == 0xF0) - { - modRMbyte = *codePtr++; - hasModRM = true; - } - break; - } - - if (hasModRM) - { - ModRM mrm(modRMbyte, rex); - info->regOperandReg = mrm.reg; - if (mrm.mod < 3) - { - if (mrm.rm == 4) - { - // SIB byte - sibByte = *codePtr++; - info->scaledReg = (sibByte >> 3) & 7; - info->otherReg = (sibByte & 7); - if (rex & 2) - info->scaledReg += 8; - if (rex & 1) - info->otherReg += 8; - } - else - { - // info->scaledReg = - } - } - if (mrm.mod == 1 || mrm.mod == 2) - { - if (mrm.mod == 1) - displacementSize = 1; - else - displacementSize = 4; - } - } - - if (displacementSize == 1) - info->displacement = (s32)(s8)*codePtr; - else - info->displacement = *((s32*)codePtr); - codePtr += displacementSize; - - switch (opcode) - { - case 0xC6: // mem <- imm8 - info->isMemoryWrite = true; - info->hasImmediate = true; - info->immediate = *codePtr; - info->operandSize = 1; - codePtr++; - break; - - case 0xC7: // mem <- imm16/32 - info->isMemoryWrite = true; - switch (info->operandSize) - { - case 2: - info->hasImmediate = true; - info->immediate = *(u16*)codePtr; - codePtr += 2; - break; - - case 4: - info->hasImmediate = true; - info->immediate = *(u32*)codePtr; - codePtr += 4; - break; - - case 8: - info->zeroExtend = true; - info->immediate = *(u32*)codePtr; - codePtr += 4; - break; - } - break; - - case 0x88: // mem <- r8 - info->isMemoryWrite = true; - if (info->operandSize != 4) - { - return false; - } - info->operandSize = 1; - break; - - case 0x89: // mem <- r16/32/64 - info->isMemoryWrite = true; - break; - - case 0x8A: // r8 <- mem - if (info->operandSize != 4) - { - return false; - } - info->operandSize = 1; - break; - - case 0x8B: // r16/32/64 <- mem - break; - - case 0x0FB6: // movzx on byte - info->zeroExtend = true; - info->operandSize = 1; - break; - - case 0x0FB7: // movzx on short - info->zeroExtend = true; - info->operandSize = 2; - break; - - case 
0x0FBE: // movsx on byte - info->signExtend = true; - info->operandSize = 1; - break; - - case 0x0FBF: // movsx on short - info->signExtend = true; - info->operandSize = 2; - break; - - case 0x0F38F0: // movbe read - info->byteSwap = true; - break; - - case 0x0F38F1: // movbe write - info->byteSwap = true; - info->isMemoryWrite = true; - break; - - default: - return false; - } - info->instructionSize = (int)(codePtr - startCodePtr); - return true; -} - -bool InstructionInfo::operator==(const InstructionInfo& other) const -{ - return operandSize == other.operandSize && instructionSize == other.instructionSize && - regOperandReg == other.regOperandReg && otherReg == other.otherReg && - scaledReg == other.scaledReg && zeroExtend == other.zeroExtend && - signExtend == other.signExtend && hasImmediate == other.hasImmediate && - isMemoryWrite == other.isMemoryWrite && byteSwap == other.byteSwap && - immediate == other.immediate && displacement == other.displacement; -} diff --git a/Source/Core/Common/x64Analyzer.h b/Source/Core/Common/x64Analyzer.h deleted file mode 100644 index de21f6ff8f..0000000000 --- a/Source/Core/Common/x64Analyzer.h +++ /dev/null @@ -1,44 +0,0 @@ -// Copyright 2008 Dolphin Emulator Project -// Licensed under GPLv2+ -// Refer to the license.txt file included. - -#pragma once - -#include "Common/CommonTypes.h" - -struct InstructionInfo -{ - int operandSize; // 8, 16, 32, 64 - int instructionSize; - int regOperandReg; - int otherReg; - int scaledReg; - bool zeroExtend; - bool signExtend; - bool hasImmediate; - bool isMemoryWrite; - bool byteSwap; - u64 immediate; - s32 displacement; - - bool operator==(const InstructionInfo& other) const; -}; - -struct ModRM -{ - int mod, reg, rm; - ModRM(u8 modRM, u8 rex) - { - mod = modRM >> 6; - reg = ((modRM >> 3) & 7) | ((rex & 4) ? 
8 : 0); - rm = modRM & 7; - } -}; - -enum AccessType -{ - OP_ACCESS_READ = 0, - OP_ACCESS_WRITE = 1 -}; - -bool DisassembleMov(const unsigned char* codePtr, InstructionInfo* info); diff --git a/Source/Core/Common/x64Emitter.cpp b/Source/Core/Common/x64Emitter.cpp index 841d90a431..8ad1d46e40 100644 --- a/Source/Core/Common/x64Emitter.cpp +++ b/Source/Core/Common/x64Emitter.cpp @@ -1046,8 +1046,14 @@ void XEmitter::MOVBE(int bits, const OpArg& dest, X64Reg src) WriteMOVBE(bits, 0xF1, src, dest); } -void XEmitter::LoadAndSwap(int size, X64Reg dst, const OpArg& src, bool sign_extend) +void XEmitter::LoadAndSwap(int size, X64Reg dst, const OpArg& src, bool sign_extend, MovInfo* info) { + if (info) + { + info->address = GetWritableCodePtr(); + info->nonAtomicSwapStore = false; + } + switch (size) { case 8: @@ -1083,20 +1089,28 @@ void XEmitter::LoadAndSwap(int size, X64Reg dst, const OpArg& src, bool sign_ext } } -u8* XEmitter::SwapAndStore(int size, const OpArg& dst, X64Reg src) +void XEmitter::SwapAndStore(int size, const OpArg& dst, X64Reg src, MovInfo* info) { - u8* mov_location = GetWritableCodePtr(); if (cpu_info.bMOVBE) { + if (info) + { + info->address = GetWritableCodePtr(); + info->nonAtomicSwapStore = false; + } MOVBE(size, dst, src); } else { BSWAP(size, src); - mov_location = GetWritableCodePtr(); + if (info) + { + info->address = GetWritableCodePtr(); + info->nonAtomicSwapStore = true; + info->nonAtomicSwapStoreSrc = src; + } MOV(size, dst, R(src)); } - return mov_location; } void XEmitter::LEA(int bits, X64Reg dest, OpArg src) diff --git a/Source/Core/Common/x64Emitter.h b/Source/Core/Common/x64Emitter.h index b939e79cd5..73f1d69721 100644 --- a/Source/Core/Common/x64Emitter.h +++ b/Source/Core/Common/x64Emitter.h @@ -203,6 +203,15 @@ enum FloatOp class XEmitter; +// Information about a generated MOV op +struct MovInfo final +{ + u8* address; + bool nonAtomicSwapStore; + // valid iff nonAtomicSwapStore is true + X64Reg nonAtomicSwapStoreSrc; +}; + // RIP addressing does not benefit from micro op fusion on Core arch struct OpArg { @@ -272,6 +281,27 @@ struct OpArg return (s8)offset; } + OpArg AsImm64() const + { + _dbg_assert_(DYNA_REC, IsImm()); + return OpArg((u64)offset, SCALE_IMM64); + } + OpArg AsImm32() const + { + _dbg_assert_(DYNA_REC, IsImm()); + return OpArg((u32)offset, SCALE_IMM32); + } + OpArg AsImm16() const + { + _dbg_assert_(DYNA_REC, IsImm()); + return OpArg((u16)offset, SCALE_IMM16); + } + OpArg AsImm8() const + { + _dbg_assert_(DYNA_REC, IsImm()); + return OpArg((u8)offset, SCALE_IMM8); + } + void WriteNormalOp(XEmitter* emit, bool toRM, NormalOp op, const OpArg& operand, int bits) const; bool IsImm() const { @@ -625,8 +655,9 @@ public: // Available only on Atom or >= Haswell so far. Test with cpu_info.bMOVBE. 
void MOVBE(int bits, X64Reg dest, const OpArg& src); void MOVBE(int bits, const OpArg& dest, X64Reg src); - void LoadAndSwap(int size, X64Reg dst, const OpArg& src, bool sign_extend = false); - u8* SwapAndStore(int size, const OpArg& dst, X64Reg src); + void LoadAndSwap(int size, X64Reg dst, const OpArg& src, bool sign_extend = false, + MovInfo* info = nullptr); + void SwapAndStore(int size, const OpArg& dst, X64Reg src, MovInfo* info = nullptr); // Available only on AMD >= Phenom or Intel >= Haswell void LZCNT(int bits, X64Reg dest, const OpArg& src); diff --git a/Source/Core/Core/MemTools.cpp b/Source/Core/Core/MemTools.cpp index 0788e8f176..a54cd0cfab 100644 --- a/Source/Core/Core/MemTools.cpp +++ b/Source/Core/Core/MemTools.cpp @@ -8,7 +8,6 @@ #include "Common/CommonFuncs.h" #include "Common/CommonTypes.h" #include "Common/Thread.h" -#include "Common/x64Analyzer.h" #include "Core/HW/Memmap.h" #include "Core/MachineContext.h" diff --git a/Source/Core/Core/PowerPC/Jit64/Jit.h b/Source/Core/Core/PowerPC/Jit64/Jit.h index 990bbd6e17..b00b63845b 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit.h +++ b/Source/Core/Core/PowerPC/Jit64/Jit.h @@ -19,6 +19,7 @@ #pragma once #include "Common/CommonTypes.h" +#include "Common/x64ABI.h" #include "Common/x64Emitter.h" #include "Core/PowerPC/Jit64/JitAsm.h" #include "Core/PowerPC/Jit64/JitRegCache.h" diff --git a/Source/Core/Core/PowerPC/Jit64/Jit_LoadStore.cpp b/Source/Core/Core/PowerPC/Jit64/Jit_LoadStore.cpp index 66ad4da6f1..b28297df0a 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit_LoadStore.cpp +++ b/Source/Core/Core/PowerPC/Jit64/Jit_LoadStore.cpp @@ -287,17 +287,11 @@ void Jit64::lXXx(UGeckoInstruction inst) SafeLoadToReg(gpr.RX(d), opAddress, accessSize, loadOffset, registersInUse, signExtend); if (update && storeAddress) - { - MemoryExceptionCheck(); MOV(32, gpr.R(a), opAddress); - } // TODO: support no-swap in SafeLoadToReg instead if (byte_reversed) - { - MemoryExceptionCheck(); BSWAP(accessSize, gpr.RX(d)); - } gpr.UnlockAll(); gpr.UnlockAllX(); @@ -507,10 +501,7 @@ void Jit64::stX(UGeckoInstruction inst) } if (update) - { - MemoryExceptionCheck(); ADD(32, gpr.R(a), Imm32((u32)offset)); - } } gpr.UnlockAll(); } @@ -589,10 +580,7 @@ void Jit64::stXx(UGeckoInstruction inst) } if (update) - { - MemoryExceptionCheck(); MOV(32, gpr.R(a), R(RSCRATCH2)); - } gpr.UnlockAll(); gpr.UnlockAllX(); diff --git a/Source/Core/Core/PowerPC/Jit64/Jit_LoadStoreFloating.cpp b/Source/Core/Core/PowerPC/Jit64/Jit_LoadStoreFloating.cpp index 6dbf8b14c4..aba308c458 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit_LoadStoreFloating.cpp +++ b/Source/Core/Core/PowerPC/Jit64/Jit_LoadStoreFloating.cpp @@ -80,7 +80,6 @@ void Jit64::lfXXX(UGeckoInstruction inst) registersInUse[RSCRATCH2] = true; SafeLoadToReg(RSCRATCH, addr, single ? 
32 : 64, offset, registersInUse, false); - MemoryExceptionCheck(); if (single) { ConvertSingleToDouble(fpr.RX(d), RSCRATCH, true); @@ -193,10 +192,7 @@ void Jit64::stfXXX(UGeckoInstruction inst) SafeWriteRegToReg(RSCRATCH, RSCRATCH2, accessSize, offset, registersInUse); if (update) - { - MemoryExceptionCheck(); MOV(32, gpr.R(a), R(RSCRATCH2)); - } fpr.UnlockAll(); gpr.UnlockAll(); diff --git a/Source/Core/Core/PowerPC/Jit64/Jit_LoadStorePaired.cpp b/Source/Core/Core/PowerPC/Jit64/Jit_LoadStorePaired.cpp index a1d1a223c9..0784fa1f77 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit_LoadStorePaired.cpp +++ b/Source/Core/Core/PowerPC/Jit64/Jit_LoadStorePaired.cpp @@ -40,74 +40,6 @@ void Jit64::psq_stXX(UGeckoInstruction inst) u32 gqrValue = gqrIsConstant ? it->second & 0xffff : 0; gpr.Lock(a, b); - if (gqrIsConstant && gqrValue == 0) - { - int storeOffset = 0; - gpr.BindToRegister(a, true, update); - X64Reg addr = gpr.RX(a); - // TODO: this is kind of ugly :/ we should probably create a universal load/store address - // calculation - // function that handles all these weird cases, e.g. how non-fastmem loadstores clobber - // addresses. - bool storeAddress = (update && jo.memcheck) || !jo.fastmem; - if (storeAddress) - { - addr = RSCRATCH2; - MOV(32, R(addr), gpr.R(a)); - } - if (indexed) - { - if (update) - { - ADD(32, R(addr), gpr.R(b)); - } - else - { - addr = RSCRATCH2; - if (a && gpr.R(a).IsSimpleReg() && gpr.R(b).IsSimpleReg()) - { - LEA(32, addr, MRegSum(gpr.RX(a), gpr.RX(b))); - } - else - { - MOV(32, R(addr), gpr.R(b)); - if (a) - ADD(32, R(addr), gpr.R(a)); - } - } - } - else - { - if (update) - ADD(32, R(addr), Imm32(offset)); - else - storeOffset = offset; - } - - fpr.Lock(s); - if (w) - { - CVTSD2SS(XMM0, fpr.R(s)); - MOVD_xmm(R(RSCRATCH), XMM0); - } - else - { - CVTPD2PS(XMM0, fpr.R(s)); - MOVQ_xmm(R(RSCRATCH), XMM0); - ROL(64, R(RSCRATCH), Imm8(32)); - } - - BitSet32 registersInUse = CallerSavedRegistersInUse(); - if (update && storeAddress) - registersInUse[addr] = true; - SafeWriteRegToReg(RSCRATCH, addr, w ? 32 : 64, storeOffset, registersInUse); - MemoryExceptionCheck(); - if (update && storeAddress) - MOV(32, gpr.R(a), R(addr)); - gpr.UnlockAll(); - fpr.UnlockAll(); - return; - } gpr.FlushLockX(RSCRATCH_EXTRA); if (update) gpr.BindToRegister(a, true, true); @@ -130,44 +62,35 @@ void Jit64::psq_stXX(UGeckoInstruction inst) if (update && !jo.memcheck) MOV(32, gpr.R(a), R(RSCRATCH_EXTRA)); + if (w) + CVTSD2SS(XMM0, fpr.R(s)); // one + else + CVTPD2PS(XMM0, fpr.R(s)); // pair + if (gqrIsConstant) { -// Paired stores don't yield any real change in performance right now, but if we can -// improve fastmem support this might change -//#define INLINE_PAIRED_STORES -#ifdef INLINE_PAIRED_STORES - if (w) - { - // One value - CVTSD2SS(XMM0, fpr.R(s)); - GenQuantizedStore(true, static_cast(gqrValue & 0x7), (gqrValue & 0x3F00) >> 8); - } - else - { - // Pair of values - CVTPD2PS(XMM0, fpr.R(s)); - GenQuantizedStore(false, static_cast(gqrValue & 0x7), - (gqrValue & 0x3F00) >> 8); - } -#else - // We know what GQR is here, so we can load RSCRATCH2 and call into the store method directly - // with just the scale bits. 
int type = gqrValue & 0x7; - MOV(32, R(RSCRATCH2), Imm32(gqrValue & 0x3F00)); - if (w) + // Paired stores (other than w/type zero) don't yield any real change in + // performance right now, but if we can improve fastmem support this might change + if (gqrValue == 0) { - // One value - CVTSD2SS(XMM0, fpr.R(s)); - CALL(asm_routines.singleStoreQuantized[type]); + if (w) + GenQuantizedStore(true, static_cast(type), (gqrValue & 0x3F00) >> 8); + else + GenQuantizedStore(false, static_cast(type), (gqrValue & 0x3F00) >> 8); } else { - // Pair of values - CVTPD2PS(XMM0, fpr.R(s)); - CALL(asm_routines.pairedStoreQuantized[type]); + // We know what GQR is here, so we can load RSCRATCH2 and call into the store method directly + // with just the scale bits. + MOV(32, R(RSCRATCH2), Imm32(gqrValue & 0x3F00)); + + if (w) + CALL(asm_routines.singleStoreQuantized[type]); + else + CALL(asm_routines.pairedStoreQuantized[type]); } -#endif } else { @@ -180,22 +103,13 @@ void Jit64::psq_stXX(UGeckoInstruction inst) MOVZX(32, 8, RSCRATCH, R(RSCRATCH2)); if (w) - { - // One value - CVTSD2SS(XMM0, fpr.R(s)); CALLptr(MScaled(RSCRATCH, SCALE_8, (u32)(u64)asm_routines.singleStoreQuantized)); - } else - { - // Pair of values - CVTPD2PS(XMM0, fpr.R(s)); CALLptr(MScaled(RSCRATCH, SCALE_8, (u32)(u64)asm_routines.pairedStoreQuantized)); - } } if (update && jo.memcheck) { - MemoryExceptionCheck(); if (indexed) ADD(32, gpr.R(a), gpr.R(b)); else @@ -226,113 +140,6 @@ void Jit64::psq_lXX(UGeckoInstruction inst) gpr.Lock(a, b); - if (gqrIsConstant && gqrValue == 0) - { - s32 loadOffset = 0; - gpr.BindToRegister(a, true, update); - X64Reg addr = gpr.RX(a); - if (update && jo.memcheck) - { - addr = RSCRATCH2; - MOV(32, R(addr), gpr.R(a)); - } - if (indexed) - { - if (update) - { - ADD(32, R(addr), gpr.R(b)); - } - else - { - addr = RSCRATCH2; - if (a && gpr.R(a).IsSimpleReg() && gpr.R(b).IsSimpleReg()) - { - LEA(32, addr, MRegSum(gpr.RX(a), gpr.RX(b))); - } - else - { - MOV(32, R(addr), gpr.R(b)); - if (a) - ADD(32, R(addr), gpr.R(a)); - } - } - } - else - { - if (update) - ADD(32, R(addr), Imm32(offset)); - else - loadOffset = offset; - } - - fpr.Lock(s); - if (jo.memcheck) - { - fpr.StoreFromRegister(s); - js.revertFprLoad = s; - } - fpr.BindToRegister(s, false); - - // Let's mirror the JitAsmCommon code and assume all non-MMU loads go to RAM. - if (!jo.memcheck) - { - if (w) - { - if (cpu_info.bSSSE3) - { - MOVD_xmm(XMM0, MComplex(RMEM, addr, SCALE_1, loadOffset)); - PSHUFB(XMM0, M(pbswapShuffle1x4)); - UNPCKLPS(XMM0, M(m_one)); - } - else - { - LoadAndSwap(32, RSCRATCH, MComplex(RMEM, addr, SCALE_1, loadOffset)); - MOVD_xmm(XMM0, R(RSCRATCH)); - UNPCKLPS(XMM0, M(m_one)); - } - } - else - { - if (cpu_info.bSSSE3) - { - MOVQ_xmm(XMM0, MComplex(RMEM, addr, SCALE_1, loadOffset)); - PSHUFB(XMM0, M(pbswapShuffle2x4)); - } - else - { - LoadAndSwap(64, RSCRATCH, MComplex(RMEM, addr, SCALE_1, loadOffset)); - ROL(64, R(RSCRATCH), Imm8(32)); - MOVQ_xmm(XMM0, R(RSCRATCH)); - } - } - CVTPS2PD(fpr.RX(s), R(XMM0)); - } - else - { - BitSet32 registersInUse = CallerSavedRegistersInUse(); - registersInUse[fpr.RX(s) << 16] = false; - if (update) - registersInUse[addr] = true; - SafeLoadToReg(RSCRATCH, R(addr), w ? 
32 : 64, loadOffset, registersInUse, false); - MemoryExceptionCheck(); - if (w) - { - MOVD_xmm(XMM0, R(RSCRATCH)); - UNPCKLPS(XMM0, M(m_one)); - } - else - { - ROL(64, R(RSCRATCH), Imm8(32)); - MOVQ_xmm(XMM0, R(RSCRATCH)); - } - CVTPS2PD(fpr.RX(s), R(XMM0)); - if (update) - MOV(32, gpr.R(a), R(addr)); - } - gpr.UnlockAll(); - fpr.UnlockAll(); - return; - } gpr.FlushLockX(RSCRATCH_EXTRA); gpr.BindToRegister(a, true, update); fpr.BindToRegister(s, false, true); @@ -373,7 +180,6 @@ void Jit64::psq_lXX(UGeckoInstruction inst) CALLptr(MScaled(RSCRATCH, SCALE_8, (u32)(u64)(&asm_routines.pairedLoadQuantized[w * 8]))); } - MemoryExceptionCheck(); CVTPS2PD(fpr.RX(s), R(XMM0)); if (update && jo.memcheck) { diff --git a/Source/Core/Core/PowerPC/Jit64Common/Jit64AsmCommon.cpp b/Source/Core/Core/PowerPC/Jit64Common/Jit64AsmCommon.cpp index 38d0e4a6e5..1dacdca430 100644 --- a/Source/Core/Core/PowerPC/Jit64Common/Jit64AsmCommon.cpp +++ b/Source/Core/Core/PowerPC/Jit64Common/Jit64AsmCommon.cpp @@ -572,8 +572,6 @@ void QuantizedMemoryRoutines::GenQuantizedLoad(bool single, EQuantizeType type, MULPS(XMM0, R(XMM1)); } } - - return; } void QuantizedMemoryRoutines::GenQuantizedLoadFloat(bool single, bool isInline) diff --git a/Source/Core/Core/PowerPC/Jit64IL/JitIL.h b/Source/Core/Core/PowerPC/Jit64IL/JitIL.h index ac9950883e..90f0f95a49 100644 --- a/Source/Core/Core/PowerPC/Jit64IL/JitIL.h +++ b/Source/Core/Core/PowerPC/Jit64IL/JitIL.h @@ -17,6 +17,7 @@ #pragma once #include "Common/CommonTypes.h" +#include "Common/x64ABI.h" #include "Common/x64Emitter.h" #include "Core/PowerPC/Gekko.h" #include "Core/PowerPC/Jit64/JitAsm.h" diff --git a/Source/Core/Core/PowerPC/JitCommon/JitBackpatch.cpp b/Source/Core/Core/PowerPC/JitCommon/JitBackpatch.cpp index 33efc2ee18..83119d2189 100644 --- a/Source/Core/Core/PowerPC/JitCommon/JitBackpatch.cpp +++ b/Source/Core/Core/PowerPC/JitCommon/JitBackpatch.cpp @@ -12,27 +12,12 @@ #include "Common/CommonFuncs.h" #include "Common/CommonTypes.h" #include "Common/MsgHandler.h" -#include "Common/x64Analyzer.h" #include "Common/x64Emitter.h" #include "Core/HW/Memmap.h" #include "Core/PowerPC/JitCommon/JitBase.h" using namespace Gen; -static void BackPatchError(const std::string& text, u8* codePtr, u32 emAddress) -{ - u64 code_addr = (u64)codePtr; - disassembler disasm; - char disbuf[256]; - memset(disbuf, 0, 256); - disasm.disasm64(0, code_addr, codePtr, disbuf); - PanicAlert("%s\n\n" - "Error encountered accessing emulated address %08x.\n" - "Culprit instruction: \n%s\nat %#" PRIx64, - text.c_str(), emAddress, disbuf, code_addr); - return; -} - // This generates some fairly heavy trampolines, but it doesn't really hurt. // Only instructions that access I/O will get these, and there won't be that // many of them in a typical program/game. @@ -56,36 +41,14 @@ bool Jitx86Base::BackPatch(u32 emAddress, SContext* ctx) if (!IsInSpace(codePtr)) return false; // this will become a regular crash real soon after this - InstructionInfo info = {}; - - if (!DisassembleMov(codePtr, &info)) - { - BackPatchError("BackPatch - failed to disassemble MOV instruction", codePtr, emAddress); - return false; - } - - if (info.otherReg != RMEM) - { - PanicAlert("BackPatch : Base reg not RMEM." 
- "\n\nAttempted to access %08x.", - emAddress); - return false; - } - - if (info.byteSwap && info.instructionSize < BACKPATCH_SIZE) - { - PanicAlert("BackPatch: MOVBE is too small"); - return false; - } - - auto it = registersInUseAtLoc.find(codePtr); - if (it == registersInUseAtLoc.end()) + auto it = backPatchInfo.find(codePtr); + if (it == backPatchInfo.end()) { PanicAlert("BackPatch: no register use entry for address %p", codePtr); return false; } - BitSet32 registersInUse = it->second; + TrampolineInfo& info = it->second; u8* exceptionHandler = nullptr; if (jit->jo.memcheck) @@ -95,110 +58,67 @@ bool Jitx86Base::BackPatch(u32 emAddress, SContext* ctx) exceptionHandler = it2->second; } - // Compute the start and length of the memory operation, including - // any byteswapping. - int totalSize = info.instructionSize; - u8* start = codePtr; - if (!info.isMemoryWrite) - { - // MOVBE and single bytes don't need to be swapped. - if (!info.byteSwap && info.operandSize > 1) - { - // REX - if ((codePtr[totalSize] & 0xF0) == 0x40) - totalSize++; - - // BSWAP - if (codePtr[totalSize] == 0x0F && (codePtr[totalSize + 1] & 0xF8) == 0xC8) - totalSize += 2; - - if (info.operandSize == 2) - { - // operand size override - if (codePtr[totalSize] == 0x66) - totalSize++; - // REX - if ((codePtr[totalSize] & 0xF0) == 0x40) - totalSize++; - // SAR/ROL - _assert_(codePtr[totalSize] == 0xC1 && - (codePtr[totalSize + 2] == 0x10 || codePtr[totalSize + 2] == 0x08)); - info.signExtend = (codePtr[totalSize + 1] & 0x10) != 0; - totalSize += 3; - } - } - } - else - { - if (info.byteSwap || info.hasImmediate) - { - // The instruction is a MOVBE but it failed so the value is still in little-endian byte order. - } - else - { - // We entered here with a BSWAP-ed register. We'll have to swap it back. - u64* ptr = ContextRN(ctx, info.regOperandReg); - int bswapSize = 0; - switch (info.operandSize) - { - case 1: - bswapSize = 0; - break; - case 2: - bswapSize = 4 + (info.regOperandReg >= 8 ? 1 : 0); - *ptr = Common::swap16((u16)*ptr); - break; - case 4: - bswapSize = 2 + (info.regOperandReg >= 8 ? 1 : 0); - *ptr = Common::swap32((u32)*ptr); - break; - case 8: - bswapSize = 3; - *ptr = Common::swap64(*ptr); - break; - } - start = codePtr - bswapSize; - totalSize += bswapSize; - } - } - // In the trampoline code, we jump back into the block at the beginning // of the next instruction. The next instruction comes immediately // after the backpatched operation, or BACKPATCH_SIZE bytes after the start // of the backpatched operation, whichever comes last. (The JIT inserts NOPs // into the original code if necessary to ensure there is enough space // to insert the backpatch jump.) - int padding = totalSize > BACKPATCH_SIZE ? totalSize - BACKPATCH_SIZE : 0; - u8* returnPtr = start + 5 + padding; + + jit->js.generatingTrampoline = true; + jit->js.trampolineExceptionHandler = exceptionHandler; // Generate the trampoline. - const u8* trampoline; - if (info.isMemoryWrite) - { - // TODO: special case FIFO writes. 
- auto it3 = pcAtLoc.find(codePtr); - if (it3 == pcAtLoc.end()) - { - PanicAlert("BackPatch: no pc entry for address %p", codePtr); - return false; - } + const u8* trampoline = trampolines.GenerateTrampoline(info); + jit->js.generatingTrampoline = false; + jit->js.trampolineExceptionHandler = nullptr; - u32 pc = it3->second; - trampoline = - trampolines.GenerateWriteTrampoline(info, registersInUse, exceptionHandler, returnPtr, pc); - } - else - { - trampoline = - trampolines.GenerateReadTrampoline(info, registersInUse, exceptionHandler, returnPtr); - } + u8* start = info.start; // Patch the original memory operation. XEmitter emitter(start); emitter.JMP(trampoline, true); - for (int i = 0; i < padding; ++i) + // NOPs become dead code + const u8* end = info.start + info.len; + for (const u8* i = emitter.GetCodePtr(); i < end; ++i) emitter.INT3(); - ctx->CTX_PC = (u64)start; + + // Rewind time to just before the start of the write block. If we swapped memory + // before faulting (eg: the store+swap was not an atomic op like MOVBE), let's + // swap it back so that the swap can happen again (this double swap isn't ideal but + // only happens the first time we fault). + if (info.nonAtomicSwapStoreSrc != INVALID_REG) + { + u64* ptr = ContextRN(ctx, info.nonAtomicSwapStoreSrc); + switch (info.accessSize << 3) + { + case 8: + // No need to swap a byte + break; + case 16: + *ptr = Common::swap16(static_cast(*ptr)); + break; + case 32: + *ptr = Common::swap32(static_cast(*ptr)); + break; + case 64: + *ptr = Common::swap64(static_cast(*ptr)); + break; + default: + _dbg_assert_(DYNA_REC, 0); + break; + } + } + + // This is special code to undo the LEA in SafeLoadToReg if it clobbered the address + // register in the case where reg_value shared the same location as opAddress. + if (info.offsetAddedToAddress) + { + u64* ptr = ContextRN(ctx, info.op_arg.GetSimpleReg()); + *ptr -= static_cast(info.offset); + } + + ctx->CTX_PC = reinterpret_cast(trampoline); return true; } diff --git a/Source/Core/Core/PowerPC/JitCommon/JitBase.h b/Source/Core/Core/PowerPC/JitCommon/JitBase.h index 245f73df66..af69116fd6 100644 --- a/Source/Core/Core/PowerPC/JitCommon/JitBase.h +++ b/Source/Core/Core/PowerPC/JitCommon/JitBase.h @@ -96,6 +96,9 @@ protected: bool carryFlagSet; bool carryFlagInverted; + bool generatingTrampoline; + u8* trampolineExceptionHandler; + int fifoBytesThisBlock; PPCAnalyst::BlockStats st; diff --git a/Source/Core/Core/PowerPC/JitCommon/Jit_Util.cpp b/Source/Core/Core/PowerPC/JitCommon/Jit_Util.cpp index ef4d5f3eef..fb0998f284 100644 --- a/Source/Core/Core/PowerPC/JitCommon/Jit_Util.cpp +++ b/Source/Core/Core/PowerPC/JitCommon/Jit_Util.cpp @@ -18,6 +18,26 @@ using namespace Gen; void EmuCodeBlock::MemoryExceptionCheck() { + // TODO: We really should untangle the trampolines, exception handlers and + // memory checks. + + // If we are currently generating a trampoline for a failed fastmem + // load/store, the trampoline generator will have stashed the exception + // handler (that we previously generated after the fastmem instruction) in + // trampolineExceptionHandler. + if (jit->js.generatingTrampoline) + { + if (jit->js.trampolineExceptionHandler) + { + TEST(32, PPCSTATE(Exceptions), Gen::Imm32(EXCEPTION_DSI)); + J_CC(CC_NZ, jit->js.trampolineExceptionHandler); + } + return; + } + + // If memcheck (ie: MMU) mode is enabled and we haven't generated an + // exception handler for this instruction yet, we will generate an + // exception check. 
if (jit->jo.memcheck && !jit->js.fastmemLoadStore && !jit->js.fixupExceptionHandler) { TEST(32, PPCSTATE(Exceptions), Gen::Imm32(EXCEPTION_DSI)); @@ -42,10 +62,10 @@ void EmuCodeBlock::UnsafeLoadRegToRegNoSwap(X64Reg reg_addr, X64Reg reg_value, i MOVZX(32, accessSize, reg_value, MComplex(RMEM, reg_addr, SCALE_1, offset)); } -u8* EmuCodeBlock::UnsafeLoadToReg(X64Reg reg_value, OpArg opAddress, int accessSize, s32 offset, - bool signExtend) +bool EmuCodeBlock::UnsafeLoadToReg(X64Reg reg_value, OpArg opAddress, int accessSize, s32 offset, + bool signExtend, MovInfo* info) { - u8* result; + bool offsetAddedToAddress = false; OpArg memOperand; if (opAddress.IsSimpleReg()) { @@ -57,6 +77,11 @@ u8* EmuCodeBlock::UnsafeLoadToReg(X64Reg reg_value, OpArg opAddress, int accessS // place to address the issue.) if ((u32)offset >= 0x1000) { + // This method can potentially clobber the address if it shares a register + // with the load target. In this case we can just subtract offset from the + // register (see JitBackpatch for this implementation). + offsetAddedToAddress = (reg_value == opAddress.GetSimpleReg()); + LEA(32, reg_value, MDisp(opAddress.GetSimpleReg(), offset)); opAddress = R(reg_value); offset = 0; @@ -74,9 +99,8 @@ u8* EmuCodeBlock::UnsafeLoadToReg(X64Reg reg_value, OpArg opAddress, int accessS memOperand = MComplex(RMEM, reg_value, SCALE_1, offset); } - result = GetWritableCodePtr(); - LoadAndSwap(accessSize, reg_value, memOperand, signExtend); - return result; + LoadAndSwap(accessSize, reg_value, memOperand, signExtend, info); + return offsetAddedToAddress; } // Visitor that generates code to read a MMIO value. @@ -231,72 +255,43 @@ FixupBranch EmuCodeBlock::CheckIfSafeAddress(const OpArg& reg_value, X64Reg reg_ void EmuCodeBlock::SafeLoadToReg(X64Reg reg_value, const Gen::OpArg& opAddress, int accessSize, s32 offset, BitSet32 registersInUse, bool signExtend, int flags) { - registersInUse[reg_value] = false; - if (jit->jo.fastmem && !opAddress.IsImm() && - !(flags & (SAFE_LOADSTORE_NO_SWAP | SAFE_LOADSTORE_NO_FASTMEM))) - { - u8* mov = UnsafeLoadToReg(reg_value, opAddress, accessSize, offset, signExtend); + bool slowmem = (flags & SAFE_LOADSTORE_FORCE_SLOWMEM) != 0; - registersInUseAtLoc[mov] = registersInUse; - jit->js.fastmemLoadStore = mov; + registersInUse[reg_value] = false; + if (jit->jo.fastmem && !(flags & SAFE_LOADSTORE_NO_FASTMEM) && !slowmem) + { + u8* backpatchStart = GetWritableCodePtr(); + MovInfo mov; + bool offsetAddedToAddress = + UnsafeLoadToReg(reg_value, opAddress, accessSize, offset, signExtend, &mov); + TrampolineInfo& info = backPatchInfo[mov.address]; + info.pc = jit->js.compilerPC; + info.nonAtomicSwapStoreSrc = mov.nonAtomicSwapStore ? 
mov.nonAtomicSwapStoreSrc : INVALID_REG; + info.start = backpatchStart; + info.read = true; + info.op_reg = reg_value; + info.op_arg = opAddress; + info.offsetAddedToAddress = offsetAddedToAddress; + info.accessSize = accessSize >> 3; + info.offset = offset; + info.registersInUse = registersInUse; + info.flags = flags; + info.signExtend = signExtend; + ptrdiff_t padding = BACKPATCH_SIZE - (GetCodePtr() - backpatchStart); + if (padding > 0) + { + NOP(padding); + } + info.len = static_cast(GetCodePtr() - info.start); + + jit->js.fastmemLoadStore = mov.address; return; } - u32 mem_mask = Memory::ADDR_MASK_HW_ACCESS; - - // The following masks the region used by the GC/Wii virtual memory lib - mem_mask |= Memory::ADDR_MASK_MEM1; - if (opAddress.IsImm()) { u32 address = opAddress.Imm32() + offset; - - // If the address is known to be RAM, just load it directly. - if (PowerPC::IsOptimizableRAMAddress(address)) - { - UnsafeLoadToReg(reg_value, opAddress, accessSize, offset, signExtend); - return; - } - - // If the address maps to an MMIO register, inline MMIO read code. - u32 mmioAddress = PowerPC::IsOptimizableMMIOAccess(address, accessSize); - if (accessSize != 64 && mmioAddress) - { - MMIOLoadToReg(Memory::mmio_mapping.get(), reg_value, registersInUse, mmioAddress, accessSize, - signExtend); - return; - } - - // Fall back to general-case code. - ABI_PushRegistersAndAdjustStack(registersInUse, 0); - switch (accessSize) - { - case 64: - ABI_CallFunctionC((void*)&PowerPC::Read_U64, address); - break; - case 32: - ABI_CallFunctionC((void*)&PowerPC::Read_U32, address); - break; - case 16: - ABI_CallFunctionC((void*)&PowerPC::Read_U16_ZX, address); - break; - case 8: - ABI_CallFunctionC((void*)&PowerPC::Read_U8_ZX, address); - break; - } - ABI_PopRegistersAndAdjustStack(registersInUse, 0); - - MemoryExceptionCheck(); - if (signExtend && accessSize < 32) - { - // Need to sign extend values coming from the Read_U* functions. - MOVSX(32, accessSize, reg_value, R(ABI_RETURN)); - } - else if (reg_value != ABI_RETURN) - { - MOVZX(64, accessSize, reg_value, R(ABI_RETURN)); - } - + SafeLoadToRegImmediate(reg_value, address, accessSize, registersInUse, signExtend); return; } @@ -310,8 +305,13 @@ void EmuCodeBlock::SafeLoadToReg(X64Reg reg_value, const Gen::OpArg& opAddress, } FixupBranch exit; - if (!jit->jo.alwaysUseMemFuncs) + if (!jit->jo.alwaysUseMemFuncs && !slowmem) { + u32 mem_mask = Memory::ADDR_MASK_HW_ACCESS; + + // The following masks the region used by the GC/Wii virtual memory lib + mem_mask |= Memory::ADDR_MASK_MEM1; + FixupBranch slow = CheckIfSafeAddress(R(reg_value), reg_addr, registersInUse, mem_mask); UnsafeLoadToReg(reg_value, R(reg_addr), accessSize, 0, signExtend); if (farcode.Enabled()) @@ -350,7 +350,7 @@ void EmuCodeBlock::SafeLoadToReg(X64Reg reg_value, const Gen::OpArg& opAddress, MOVZX(64, accessSize, reg_value, R(ABI_RETURN)); } - if (!jit->jo.alwaysUseMemFuncs) + if (!jit->jo.alwaysUseMemFuncs && !slowmem) { if (farcode.Enabled()) { @@ -361,6 +361,56 @@ void EmuCodeBlock::SafeLoadToReg(X64Reg reg_value, const Gen::OpArg& opAddress, } } +void EmuCodeBlock::SafeLoadToRegImmediate(X64Reg reg_value, u32 address, int accessSize, + BitSet32 registersInUse, bool signExtend) +{ + // If the address is known to be RAM, just load it directly. + if (PowerPC::IsOptimizableRAMAddress(address)) + { + UnsafeLoadToReg(reg_value, Imm32(address), accessSize, 0, signExtend); + return; + } + + // If the address maps to an MMIO register, inline MMIO read code. 
+ u32 mmioAddress = PowerPC::IsOptimizableMMIOAccess(address, accessSize); + if (accessSize != 64 && mmioAddress) + { + MMIOLoadToReg(Memory::mmio_mapping.get(), reg_value, registersInUse, mmioAddress, accessSize, + signExtend); + return; + } + + // Fall back to general-case code. + ABI_PushRegistersAndAdjustStack(registersInUse, 0); + switch (accessSize) + { + case 64: + ABI_CallFunctionC(reinterpret_cast(&PowerPC::Read_U64), address); + break; + case 32: + ABI_CallFunctionC(reinterpret_cast(&PowerPC::Read_U32), address); + break; + case 16: + ABI_CallFunctionC(reinterpret_cast(&PowerPC::Read_U16_ZX), address); + break; + case 8: + ABI_CallFunctionC(reinterpret_cast(&PowerPC::Read_U8_ZX), address); + break; + } + ABI_PopRegistersAndAdjustStack(registersInUse, 0); + + MemoryExceptionCheck(); + if (signExtend && accessSize < 32) + { + // Need to sign extend values coming from the Read_U* functions. + MOVSX(32, accessSize, reg_value, R(ABI_RETURN)); + } + else if (reg_value != ABI_RETURN) + { + MOVZX(64, accessSize, reg_value, R(ABI_RETURN)); + } +} + static OpArg SwapImmediate(int accessSize, const OpArg& reg_value) { if (accessSize == 32) @@ -371,10 +421,15 @@ static OpArg SwapImmediate(int accessSize, const OpArg& reg_value) return Imm8(reg_value.Imm8()); } -u8* EmuCodeBlock::UnsafeWriteRegToReg(OpArg reg_value, X64Reg reg_addr, int accessSize, s32 offset, - bool swap) +void EmuCodeBlock::UnsafeWriteRegToReg(OpArg reg_value, X64Reg reg_addr, int accessSize, s32 offset, + bool swap, MovInfo* info) { - u8* result = GetWritableCodePtr(); + if (info) + { + info->address = GetWritableCodePtr(); + info->nonAtomicSwapStore = false; + } + OpArg dest = MComplex(RMEM, reg_addr, SCALE_1, offset); if (reg_value.IsImm()) { @@ -384,22 +439,19 @@ u8* EmuCodeBlock::UnsafeWriteRegToReg(OpArg reg_value, X64Reg reg_addr, int acce } else if (swap) { - result = SwapAndStore(accessSize, dest, reg_value.GetSimpleReg()); + SwapAndStore(accessSize, dest, reg_value.GetSimpleReg(), info); } else { MOV(accessSize, dest, reg_value); } - - return result; } static OpArg FixImmediate(int accessSize, OpArg arg) { if (arg.IsImm()) { - arg = accessSize == 8 ? Imm8((u8)arg.Imm32()) : accessSize == 16 ? Imm16((u16)arg.Imm32()) : - Imm32((u32)arg.Imm32()); + arg = accessSize == 8 ? arg.AsImm8() : accessSize == 16 ? arg.AsImm16() : arg.AsImm32(); } return arg; } @@ -475,25 +527,38 @@ bool EmuCodeBlock::WriteToConstAddress(int accessSize, OpArg arg, u32 address, void EmuCodeBlock::SafeWriteRegToReg(OpArg reg_value, X64Reg reg_addr, int accessSize, s32 offset, BitSet32 registersInUse, int flags) { + bool swap = !(flags & SAFE_LOADSTORE_NO_SWAP); + bool slowmem = (flags & SAFE_LOADSTORE_FORCE_SLOWMEM) != 0; + // set the correct immediate format reg_value = FixImmediate(accessSize, reg_value); - // TODO: support byte-swapped non-immediate fastmem stores - if (jit->jo.fastmem && !(flags & SAFE_LOADSTORE_NO_FASTMEM) && - (reg_value.IsImm() || !(flags & SAFE_LOADSTORE_NO_SWAP))) + if (jit->jo.fastmem && !(flags & SAFE_LOADSTORE_NO_FASTMEM) && !slowmem) { - const u8* backpatchStart = GetCodePtr(); - u8* mov = UnsafeWriteRegToReg(reg_value, reg_addr, accessSize, offset, - !(flags & SAFE_LOADSTORE_NO_SWAP)); + u8* backpatchStart = GetWritableCodePtr(); + MovInfo mov; + UnsafeWriteRegToReg(reg_value, reg_addr, accessSize, offset, swap, &mov); + TrampolineInfo& info = backPatchInfo[mov.address]; + info.pc = jit->js.compilerPC; + info.nonAtomicSwapStoreSrc = mov.nonAtomicSwapStore ? 
mov.nonAtomicSwapStoreSrc : INVALID_REG; + info.start = backpatchStart; + info.read = false; + info.op_arg = reg_value; + info.op_reg = reg_addr; + info.offsetAddedToAddress = false; + info.accessSize = accessSize >> 3; + info.offset = offset; + info.registersInUse = registersInUse; + info.flags = flags; ptrdiff_t padding = BACKPATCH_SIZE - (GetCodePtr() - backpatchStart); if (padding > 0) { NOP(padding); } + info.len = static_cast(GetCodePtr() - info.start); + + jit->js.fastmemLoadStore = mov.address; - registersInUseAtLoc[mov] = registersInUse; - pcAtLoc[mov] = jit->js.compilerPC; - jit->js.fastmemLoadStore = mov; return; } @@ -510,21 +575,22 @@ void EmuCodeBlock::SafeWriteRegToReg(OpArg reg_value, X64Reg reg_addr, int acces } } - u32 mem_mask = Memory::ADDR_MASK_HW_ACCESS; - - // The following masks the region used by the GC/Wii virtual memory lib - mem_mask |= Memory::ADDR_MASK_MEM1; - - bool swap = !(flags & SAFE_LOADSTORE_NO_SWAP); - FixupBranch slow, exit; - slow = CheckIfSafeAddress(reg_value, reg_addr, registersInUse, mem_mask); - UnsafeWriteRegToReg(reg_value, reg_addr, accessSize, 0, swap); - if (farcode.Enabled()) - SwitchToFarCode(); - else - exit = J(true); - SetJumpTarget(slow); + if (!slowmem) + { + u32 mem_mask = Memory::ADDR_MASK_HW_ACCESS; + + // The following masks the region used by the GC/Wii virtual memory lib + mem_mask |= Memory::ADDR_MASK_MEM1; + + slow = CheckIfSafeAddress(reg_value, reg_addr, registersInUse, mem_mask); + UnsafeWriteRegToReg(reg_value, reg_addr, accessSize, 0, swap); + if (farcode.Enabled()) + SwitchToFarCode(); + else + exit = J(true); + SetJumpTarget(slow); + } // PC is used by memory watchpoints (if enabled) or to print accurate PC locations in debug logs MOV(32, PPCSTATE(pc), Imm32(jit->js.compilerPC)); @@ -563,12 +629,18 @@ void EmuCodeBlock::SafeWriteRegToReg(OpArg reg_value, X64Reg reg_addr, int acces break; } ABI_PopRegistersAndAdjustStack(registersInUse, rsp_alignment); - if (farcode.Enabled()) + + MemoryExceptionCheck(); + + if (!slowmem) { - exit = J(true); - SwitchToNearCode(); + if (farcode.Enabled()) + { + exit = J(true); + SwitchToNearCode(); + } + SetJumpTarget(exit); } - SetJumpTarget(exit); } void EmuCodeBlock::WriteToConstRamAddress(int accessSize, OpArg arg, u32 address, bool swap) @@ -1055,7 +1127,6 @@ void EmuCodeBlock::JitClearCA() void EmuCodeBlock::Clear() { - registersInUseAtLoc.clear(); - pcAtLoc.clear(); + backPatchInfo.clear(); exceptionHandlerAtLoc.clear(); } diff --git a/Source/Core/Core/PowerPC/JitCommon/Jit_Util.h b/Source/Core/Core/PowerPC/JitCommon/Jit_Util.h index e50ad9af6e..54deb7e7a1 100644 --- a/Source/Core/Core/PowerPC/JitCommon/Jit_Util.h +++ b/Source/Core/Core/PowerPC/JitCommon/Jit_Util.h @@ -59,6 +59,47 @@ static const int FARCODE_SIZE_MMU = 1024 * 1024 * 48; static const int TRAMPOLINE_CODE_SIZE = 1024 * 1024 * 8; static const int TRAMPOLINE_CODE_SIZE_MMU = 1024 * 1024 * 32; +// Stores information we need to batch-patch a MOV with a call to the slow read/write path after +// it faults. There will be 10s of thousands of these structs live, so be wary of making this too +// big. 
+struct TrampolineInfo final +{ + // The start of the store operation that failed -- we will patch a JMP here + u8* start; + + // The start + len = end of the store operation (points to the next instruction) + u32 len; + + // The PPC PC for the current load/store block + u32 pc; + + // Saved because we need these to make the ABI call in the trampoline + BitSet32 registersInUse; + + // The MOV operation + Gen::X64Reg nonAtomicSwapStoreSrc; + + // src/dest for load/store + s32 offset; + Gen::X64Reg op_reg; + Gen::OpArg op_arg; + + // Original SafeLoadXXX/SafeStoreXXX flags + u8 flags; + + // Memory access size (in bytes) + u8 accessSize : 4; + + // true if this is a read op vs a write + bool read : 1; + + // for read operations, true if needs sign-extension after load + bool signExtend : 1; + + // Set to true if we added the offset to the address and need to undo it + bool offsetAddedToAddress : 1; +}; + // Like XCodeBlock but has some utilities for memory access. class EmuCodeBlock : public Gen::X64CodeBlock { @@ -88,15 +129,15 @@ public: void UnsafeLoadRegToRegNoSwap(Gen::X64Reg reg_addr, Gen::X64Reg reg_value, int accessSize, s32 offset, bool signExtend = false); // these return the address of the MOV, for backpatching - u8* UnsafeWriteRegToReg(Gen::OpArg reg_value, Gen::X64Reg reg_addr, int accessSize, - s32 offset = 0, bool swap = true); - u8* UnsafeWriteRegToReg(Gen::X64Reg reg_value, Gen::X64Reg reg_addr, int accessSize, - s32 offset = 0, bool swap = true) + void UnsafeWriteRegToReg(Gen::OpArg reg_value, Gen::X64Reg reg_addr, int accessSize, + s32 offset = 0, bool swap = true, Gen::MovInfo* info = nullptr); + void UnsafeWriteRegToReg(Gen::X64Reg reg_value, Gen::X64Reg reg_addr, int accessSize, + s32 offset = 0, bool swap = true, Gen::MovInfo* info = nullptr) { - return UnsafeWriteRegToReg(R(reg_value), reg_addr, accessSize, offset, swap); + UnsafeWriteRegToReg(R(reg_value), reg_addr, accessSize, offset, swap, info); } - u8* UnsafeLoadToReg(Gen::X64Reg reg_value, Gen::OpArg opAddress, int accessSize, s32 offset, - bool signExtend); + bool UnsafeLoadToReg(Gen::X64Reg reg_value, Gen::OpArg opAddress, int accessSize, s32 offset, + bool signExtend, Gen::MovInfo* info = nullptr); void UnsafeWriteGatherPipe(int accessSize); // Generate a load/write from the MMIO handler for a given address. Only @@ -108,12 +149,18 @@ public: { SAFE_LOADSTORE_NO_SWAP = 1, SAFE_LOADSTORE_NO_PROLOG = 2, + // This indicates that the write being generated cannot be patched (and thus can't use fastmem) SAFE_LOADSTORE_NO_FASTMEM = 4, - SAFE_LOADSTORE_CLOBBER_RSCRATCH_INSTEAD_OF_ADDR = 8 + SAFE_LOADSTORE_CLOBBER_RSCRATCH_INSTEAD_OF_ADDR = 8, + // Force slowmem (used when generating fallbacks in trampolines) + SAFE_LOADSTORE_FORCE_SLOWMEM = 16, }; void SafeLoadToReg(Gen::X64Reg reg_value, const Gen::OpArg& opAddress, int accessSize, s32 offset, BitSet32 registersInUse, bool signExtend, int flags = 0); + void SafeLoadToRegImmediate(Gen::X64Reg reg_value, u32 address, int accessSize, + BitSet32 registersInUse, bool signExtend); + // Clobbers RSCRATCH or reg_addr depending on the relevant flag. Preserves // reg_value if the load fails and js.memcheck is enabled. // Works with immediate inputs and simple registers only. 
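The Jit_Util.h hunk above is the heart of the refactor: instead of disassembling the faulting MOV after the fact, the emitter now records a TrampolineInfo entry (keyed by the MOV's address) at the moment the fastmem access is generated, replacing the old registersInUseAtLoc/pcAtLoc pair of maps with a single backPatchInfo map. The fragment below is a minimal, self-contained sketch of that bookkeeping pattern; the FaultRecord and BackpatchBook names and the trimmed-down field set are illustrative stand-ins, not the actual Dolphin types.

// Sketch of the bookkeeping pattern this patch adopts: the JIT records, at emit
// time, everything it would need to regenerate the access on the slow path,
// keyed by the address of the emitted MOV. Types here are simplified stand-ins.
#include <cstdint>
#include <optional>
#include <unordered_map>

using BitSet32 = uint32_t;  // stand-in for Dolphin's BitSet32

struct FaultRecord  // simplified analogue of TrampolineInfo
{
  uint8_t* start;          // first byte of the fastmem access; the JMP is patched here
  uint32_t len;            // bytes covered, padded to at least BACKPATCH_SIZE with NOPs
  uint32_t pc;             // guest PC of the load/store
  BitSet32 registersInUse; // needed to make the ABI call in the trampoline
  int32_t offset;
  uint8_t accessSize;      // in bytes
  bool read;
  bool signExtend;
};

class BackpatchBook
{
public:
  // Called right after emitting a fastmem MOV: remember how to redo it slowly.
  void Record(uint8_t* movAddress, const FaultRecord& info) { m_info[movAddress] = info; }

  // Called from the fault handler: look up the MOV that faulted, if we emitted it.
  std::optional<FaultRecord> Lookup(uint8_t* movAddress) const
  {
    auto it = m_info.find(movAddress);
    if (it == m_info.end())
      return std::nullopt;
    return it->second;
  }

private:
  std::unordered_map<uint8_t*, FaultRecord> m_info;
};

Keeping one record per access also makes it cheap to carry the extra state this patch needs (nonAtomicSwapStoreSrc, offsetAddedToAddress, flags) without growing yet more parallel maps.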
@@ -158,7 +205,6 @@ public: void Clear(); protected: - std::unordered_map registersInUseAtLoc; - std::unordered_map pcAtLoc; + std::unordered_map backPatchInfo; std::unordered_map exceptionHandlerAtLoc; }; diff --git a/Source/Core/Core/PowerPC/JitCommon/TrampolineCache.cpp b/Source/Core/Core/PowerPC/JitCommon/TrampolineCache.cpp index 12b72b1035..79c0a5abee 100644 --- a/Source/Core/Core/PowerPC/JitCommon/TrampolineCache.cpp +++ b/Source/Core/Core/PowerPC/JitCommon/TrampolineCache.cpp @@ -9,7 +9,6 @@ #include "Common/CommonTypes.h" #include "Common/JitRegister.h" #include "Common/x64ABI.h" -#include "Common/x64Analyzer.h" #include "Common/x64Emitter.h" #include "Core/PowerPC/JitCommon/JitBase.h" #include "Core/PowerPC/JitCommon/Jit_Util.h" @@ -37,150 +36,50 @@ void TrampolineCache::Shutdown() FreeCodeSpace(); } -const u8* TrampolineCache::GenerateReadTrampoline(const InstructionInfo& info, - BitSet32 registersInUse, u8* exceptionHandler, - u8* returnPtr) +const u8* TrampolineCache::GenerateTrampoline(const TrampolineInfo& info) +{ + if (info.read) + { + return GenerateReadTrampoline(info); + } + + return GenerateWriteTrampoline(info); +} + +const u8* TrampolineCache::GenerateReadTrampoline(const TrampolineInfo& info) { if (GetSpaceLeft() < 1024) PanicAlert("Trampoline cache full"); const u8* trampoline = GetCodePtr(); - X64Reg addrReg = (X64Reg)info.scaledReg; - X64Reg dataReg = (X64Reg)info.regOperandReg; - int stack_offset = 0; - bool push_param1 = registersInUse[ABI_PARAM1]; - if (push_param1) - { - PUSH(ABI_PARAM1); - stack_offset = 8; - registersInUse[ABI_PARAM1] = 0; - } + SafeLoadToReg(info.op_reg, info.op_arg, info.accessSize << 3, info.offset, info.registersInUse, + info.signExtend, info.flags | SAFE_LOADSTORE_FORCE_SLOWMEM); - int dataRegSize = info.operandSize == 8 ? 
64 : 32; - if (addrReg != ABI_PARAM1 && info.displacement) - LEA(32, ABI_PARAM1, MDisp(addrReg, info.displacement)); - else if (addrReg != ABI_PARAM1) - MOV(32, R(ABI_PARAM1), R(addrReg)); - else if (info.displacement) - ADD(32, R(ABI_PARAM1), Imm32(info.displacement)); + JMP(info.start + info.len, true); - ABI_PushRegistersAndAdjustStack(registersInUse, stack_offset); - - switch (info.operandSize) - { - case 8: - CALL((void*)&PowerPC::Read_U64); - break; - case 4: - CALL((void*)&PowerPC::Read_U32); - break; - case 2: - CALL((void*)&PowerPC::Read_U16); - break; - case 1: - CALL((void*)&PowerPC::Read_U8); - break; - } - - ABI_PopRegistersAndAdjustStack(registersInUse, stack_offset); - - if (push_param1) - POP(ABI_PARAM1); - - if (exceptionHandler) - { - TEST(32, PPCSTATE(Exceptions), Imm32(EXCEPTION_DSI)); - J_CC(CC_NZ, exceptionHandler); - } - - if (info.signExtend) - MOVSX(dataRegSize, info.operandSize * 8, dataReg, R(ABI_RETURN)); - else if (dataReg != ABI_RETURN || info.operandSize < 4) - MOVZX(dataRegSize, info.operandSize * 8, dataReg, R(ABI_RETURN)); - - JMP(returnPtr, true); - - JitRegister::Register(trampoline, GetCodePtr(), "JIT_ReadTrampoline"); + JitRegister::Register(trampoline, GetCodePtr(), "JIT_ReadTrampoline_%x", info.pc); return trampoline; } -const u8* TrampolineCache::GenerateWriteTrampoline(const InstructionInfo& info, - BitSet32 registersInUse, u8* exceptionHandler, - u8* returnPtr, u32 pc) +const u8* TrampolineCache::GenerateWriteTrampoline(const TrampolineInfo& info) { if (GetSpaceLeft() < 1024) PanicAlert("Trampoline cache full"); const u8* trampoline = GetCodePtr(); - X64Reg dataReg = (X64Reg)info.regOperandReg; - X64Reg addrReg = (X64Reg)info.scaledReg; - // Don't treat FIFO writes specially for now because they require a burst // check anyway. // PC is used by memory watchpoints (if enabled) or to print accurate PC locations in debug logs - MOV(32, PPCSTATE(pc), Imm32(pc)); + MOV(32, PPCSTATE(pc), Imm32(info.pc)); - ABI_PushRegistersAndAdjustStack(registersInUse, 0); + SafeWriteRegToReg(info.op_arg, info.op_reg, info.accessSize << 3, info.offset, + info.registersInUse, info.flags | SAFE_LOADSTORE_FORCE_SLOWMEM); - if (info.hasImmediate) - { - if (addrReg != ABI_PARAM2 && info.displacement) - LEA(32, ABI_PARAM2, MDisp(addrReg, info.displacement)); - else if (addrReg != ABI_PARAM2) - MOV(32, R(ABI_PARAM2), R(addrReg)); - else if (info.displacement) - ADD(32, R(ABI_PARAM2), Imm32(info.displacement)); + JMP(info.start + info.len, true); - // we have to swap back the immediate to pass it to the write functions - switch (info.operandSize) - { - case 8: - PanicAlert("Invalid 64-bit immediate!"); - break; - case 4: - MOV(32, R(ABI_PARAM1), Imm32(Common::swap32((u32)info.immediate))); - break; - case 2: - MOV(16, R(ABI_PARAM1), Imm16(Common::swap16((u16)info.immediate))); - break; - case 1: - MOV(8, R(ABI_PARAM1), Imm8((u8)info.immediate)); - break; - } - } - else - { - int dataRegSize = info.operandSize == 8 ? 
64 : 32; - MOVTwo(dataRegSize, ABI_PARAM2, addrReg, info.displacement, ABI_PARAM1, dataReg); - } - - switch (info.operandSize) - { - case 8: - CALL((void*)&PowerPC::Write_U64); - break; - case 4: - CALL((void*)&PowerPC::Write_U32); - break; - case 2: - CALL((void*)&PowerPC::Write_U16); - break; - case 1: - CALL((void*)&PowerPC::Write_U8); - break; - } - - ABI_PopRegistersAndAdjustStack(registersInUse, 0); - if (exceptionHandler) - { - TEST(32, PPCSTATE(Exceptions), Imm32(EXCEPTION_DSI)); - J_CC(CC_NZ, exceptionHandler); - } - JMP(returnPtr, true); - - JitRegister::Register(trampoline, GetCodePtr(), "JIT_WriteTrampoline_%x", pc); + JitRegister::Register(trampoline, GetCodePtr(), "JIT_WriteTrampoline_%x", info.pc); return trampoline; } diff --git a/Source/Core/Core/PowerPC/JitCommon/TrampolineCache.h b/Source/Core/Core/PowerPC/JitCommon/TrampolineCache.h index 7852bae6b9..c43668dc8d 100644 --- a/Source/Core/Core/PowerPC/JitCommon/TrampolineCache.h +++ b/Source/Core/Core/PowerPC/JitCommon/TrampolineCache.h @@ -7,21 +7,21 @@ #include "Common/BitSet.h" #include "Common/CommonTypes.h" #include "Common/x64Emitter.h" +#include "Core/PowerPC/JitCommon/Jit_Util.h" struct InstructionInfo; // We need at least this many bytes for backpatching. const int BACKPATCH_SIZE = 5; -class TrampolineCache : public Gen::X64CodeBlock +class TrampolineCache : public EmuCodeBlock { + const u8* GenerateReadTrampoline(const TrampolineInfo& info); + const u8* GenerateWriteTrampoline(const TrampolineInfo& info); + public: void Init(int size); void Shutdown(); - - const u8* GenerateReadTrampoline(const InstructionInfo& info, BitSet32 registersInUse, - u8* exceptionHandler, u8* returnPtr); - const u8* GenerateWriteTrampoline(const InstructionInfo& info, BitSet32 registersInUse, - u8* exceptionHandler, u8* returnPtr, u32 pc); + const u8* GenerateTrampoline(const TrampolineInfo& info); void ClearCodeSpace(); };
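Taken together, the new flow on a fastmem fault is: BackPatch() looks up the TrampolineInfo recorded for the faulting MOV, TrampolineCache::GenerateTrampoline() re-emits the access through the ordinary SafeLoadToReg/SafeWriteRegToReg slow path (with SAFE_LOADSTORE_FORCE_SLOWMEM), and the original access is overwritten with a JMP to that trampoline, with any leftover bytes filled with INT3. The fragment below sketches only that final patching step, under the assumption that the trampoline lies within rel32 range of the patched site; it is a simplified illustration, not the XEmitter-based code above.

// Sketch of the patching step performed once the trampoline has been generated:
// overwrite the faulting MOV with a 5-byte rel32 JMP to the trampoline and fill
// the rest of the recorded region with INT3 so any stale bytes trap loudly.
#include <cstdint>
#include <cstring>

constexpr int BACKPATCH_SIZE = 5;  // size of a rel32 JMP, as in TrampolineCache.h

void PatchJumpToTrampoline(uint8_t* start, uint32_t len, const uint8_t* trampoline)
{
  // rel32 displacement is measured from the end of the 5-byte JMP instruction.
  // Assumes the trampoline is close enough for the displacement to fit in 32 bits.
  const int64_t disp = trampoline - (start + BACKPATCH_SIZE);
  const int32_t disp32 = static_cast<int32_t>(disp);

  uint8_t jmp[BACKPATCH_SIZE] = {0xE9, 0, 0, 0, 0};  // 0xE9 = JMP rel32
  std::memcpy(jmp + 1, &disp32, sizeof(disp32));
  std::memcpy(start, jmp, sizeof(jmp));

  // Whatever remains of the original (possibly longer) access is now dead code;
  // INT3 (0xCC) makes accidental execution of it obvious.
  for (uint32_t i = BACKPATCH_SIZE; i < len; ++i)
    start[i] = 0xCC;
}

Because the trampoline itself ends with a JMP back to info.start + info.len, execution resumes at the first instruction after the patched region, which is why the emitters pad short accesses up to BACKPATCH_SIZE with NOPs at generation time.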