From ea23ce2726a32ee8920c6f5cf560efd60351ddcd Mon Sep 17 00:00:00 2001 From: Fiora Date: Thu, 1 Jan 2015 12:38:43 -0800 Subject: [PATCH 01/19] MMU: fix rollback in DSIs on page-crossing stores I don't know if this affected anything, but it was subtly wrong. Also reorganize the loads to match, for consistency. --- Source/Core/Core/HW/MemmapFunctions.cpp | 93 +++++++++++-------------- 1 file changed, 40 insertions(+), 53 deletions(-) diff --git a/Source/Core/Core/HW/MemmapFunctions.cpp b/Source/Core/Core/HW/MemmapFunctions.cpp index 0f91e4ea87..8db4be1920 100644 --- a/Source/Core/Core/HW/MemmapFunctions.cpp +++ b/Source/Core/Core/HW/MemmapFunctions.cpp @@ -130,43 +130,35 @@ __forceinline void ReadFromHardware(U &_var, const u32 em_address, Memory::XChec // Handle loads that cross page boundaries (ewwww) if (sizeof(T) > 1 && (em_address & (HW_PAGE_SIZE - 1)) > HW_PAGE_SIZE - sizeof(T)) { - _var = 0; // This could be unaligned down to the byte level... hopefully this is rare, so doing it this // way isn't too terrible. // TODO: floats on non-word-aligned boundaries should technically cause alignment exceptions. // Note that "word" means 32-bit, so paired singles or doubles might still be 32-bit aligned! u32 tlb_addr = TranslateAddress(em_address, flag); + u32 em_address_next_page = (em_address + sizeof(T) - 1) & ~(HW_PAGE_SIZE - 1); + u32 tlb_addr_next_page = TranslateAddress(em_address_next_page, flag); + if (tlb_addr == 0 || tlb_addr_next_page == 0) + { + if (flag == FLAG_READ) + { + u32 exception_addr = tlb_addr == 0 ? em_address : em_address_next_page; + if (!SConfig::GetInstance().m_LocalCoreStartupParameter.bMMU) + PanicAlertT("Invalid Read at 0x%08x, PC = 0x%08x ", exception_addr, PC); + else + GenerateDSIException(exception_addr, false); + } + return; + } + _var = 0; for (u32 addr = em_address; addr < em_address + sizeof(T); addr++, tlb_addr++) { - // Start of the new page... translate the address again! 
- if (!(addr & (HW_PAGE_SIZE-1))) - tlb_addr = TranslateAddress(addr, flag); - // Important: we need to generate the DSI on the first store that caused the fault, NOT - // the address of the start of the load. - if (tlb_addr == 0) - { - if (flag == FLAG_READ) - { - if (!SConfig::GetInstance().m_LocalCoreStartupParameter.bMMU) - PanicAlertT("Invalid Read at 0x%08x, PC = 0x%08x ", em_address, PC); - else - GenerateDSIException(addr, false); - break; - } - } + if (addr == em_address_next_page) + tlb_addr = tlb_addr_next_page; + _var <<= 8; + if (m_pEXRAM && (tlb_addr & 0xF0000000) == 0x10000000) + _var |= m_pEXRAM[tlb_addr & EXRAM_MASK]; else - { - if (m_pEXRAM && (tlb_addr & 0xF0000000) == 0x10000000) - { - _var <<= 8; - _var |= m_pEXRAM[tlb_addr & EXRAM_MASK]; - } - else - { - _var <<= 8; - _var |= m_pRAM[tlb_addr & RAM_MASK]; - } - } + _var |= m_pRAM[tlb_addr & RAM_MASK]; } } else @@ -271,35 +263,30 @@ __forceinline void WriteToHardware(u32 em_address, const T data, Memory::XCheckT if (sizeof(T) > 1 && (em_address & (HW_PAGE_SIZE-1)) > HW_PAGE_SIZE - sizeof(T)) { T val = bswap(data); + // We need to check both addresses before writing in case there's a DSI. 
u32 tlb_addr = TranslateAddress(em_address, flag); - for (u32 addr = em_address; addr < em_address + sizeof(T); addr++, tlb_addr++) + u32 em_address_next_page = (em_address + sizeof(T) - 1) & ~(HW_PAGE_SIZE - 1); + u32 tlb_addr_next_page = TranslateAddress(em_address_next_page, flag); + if (tlb_addr == 0 || tlb_addr_next_page == 0) { - if (!(addr & (HW_PAGE_SIZE-1))) - tlb_addr = TranslateAddress(addr, flag); - if (tlb_addr == 0) + if (flag == FLAG_WRITE) { - if (flag == FLAG_WRITE) - { - if (!SConfig::GetInstance().m_LocalCoreStartupParameter.bMMU) - PanicAlertT("Invalid Write to 0x%08x, PC = 0x%08x ", em_address, PC); - else - GenerateDSIException(addr, true); - break; - } - } - else - { - if (m_pEXRAM && (tlb_addr & 0xF0000000) == 0x10000000) - { - m_pEXRAM[tlb_addr & EXRAM_MASK] = (u8)val; - val >>= 8; - } + u32 exception_addr = tlb_addr == 0 ? em_address : em_address_next_page; + if (!SConfig::GetInstance().m_LocalCoreStartupParameter.bMMU) + PanicAlertT("Invalid Write to 0x%08x, PC = 0x%08x ", exception_addr, PC); else - { - m_pRAM[tlb_addr & RAM_MASK] = (u8)val; - val >>= 8; - } + GenerateDSIException(exception_addr, true); } + return; + } + for (u32 addr = em_address; addr < em_address + sizeof(T); addr++, tlb_addr++, val >>= 8) + { + if (addr == em_address_next_page) + tlb_addr = tlb_addr_next_page; + if (m_pEXRAM && (tlb_addr & 0xF0000000) == 0x10000000) + m_pEXRAM[tlb_addr & EXRAM_MASK] = (u8)val; + else + m_pRAM[tlb_addr & RAM_MASK] = (u8)val; } } else From c2ed29fe0d2ca221ba5ce9fae087b0c4ee4225f6 Mon Sep 17 00:00:00 2001 From: Fiora Date: Tue, 30 Dec 2014 18:12:47 -0800 Subject: [PATCH 02/19] MemmapFunctions: various MMU optimizations Small TLB lookup optimizations: this is the hot path for MMU code, so try to make it better. Template the TLB lookup functions based on the lookup type (opcode, data, no exception). Clean up the Read/Write functions and make them more consistent. Add an early-exit path for MMU accesses to ReadFromHardware/WriteToHardware. 
--- .../Core/Core/Debugger/PPCDebugInterface.cpp | 2 +- Source/Core/Core/HW/Memmap.h | 30 +- Source/Core/Core/HW/MemmapFunctions.cpp | 296 ++++++++---------- Source/Core/Core/PowerPC/JitInterface.cpp | 2 +- Source/Core/Core/PowerPC/PPCAnalyst.cpp | 2 +- 5 files changed, 155 insertions(+), 177 deletions(-) diff --git a/Source/Core/Core/Debugger/PPCDebugInterface.cpp b/Source/Core/Core/Debugger/PPCDebugInterface.cpp index ddc2fec12e..1434fdb5bd 100644 --- a/Source/Core/Core/Debugger/PPCDebugInterface.cpp +++ b/Source/Core/Core/Debugger/PPCDebugInterface.cpp @@ -28,7 +28,7 @@ std::string PPCDebugInterface::Disassemble(unsigned int address) if (!Memory::IsRAMAddress(address, true, true)) { if (!SConfig::GetInstance().m_LocalCoreStartupParameter.bMMU || !((address & JIT_ICACHE_VMEM_BIT) && - Memory::TranslateAddress(address, Memory::FLAG_NO_EXCEPTION))) + Memory::TranslateAddress(address))) { return "(No RAM here)"; } diff --git a/Source/Core/Core/HW/Memmap.h b/Source/Core/Core/HW/Memmap.h index 8a8b5b42e1..31e2e9979f 100644 --- a/Source/Core/Core/HW/Memmap.h +++ b/Source/Core/Core/HW/Memmap.h @@ -96,12 +96,12 @@ u16 Read_U16(const u32 _Address); u32 Read_U32(const u32 _Address); u64 Read_U64(const u32 _Address); -u32 Read_S8_Val(u32 address, u32 val); -u32 Read_U8_Val(u32 address, u32 val); -u32 Read_S16_Val(u32 address, u32 val); -u32 Read_U16_Val(u32 address, u32 val); -u32 Read_U32_Val(u32 address, u32 val); -u64 Read_U64_Val(u32 address, u64 val); +u32 Read_S8_Val(const u32 _Address, u32 _var); +u32 Read_U8_Val(const u32 _Address, u32 _var); +u32 Read_S16_Val(const u32 _Address, u32 _var); +u32 Read_U16_Val(const u32 _Address, u32 _var); +u32 Read_U32_Val(const u32 _Address, u32 _var); +u64 Read_U64_Val(const u32 _Address, u64 _var); // Useful helper functions, used by ARM JIT float Read_F32(const u32 _Address); @@ -111,17 +111,17 @@ double Read_F64(const u32 _Address); u32 Read_U8_ZX(const u32 _Address); u32 Read_U16_ZX(const u32 _Address); -void Write_U8(const u8 
_Data, const u32 _Address); -void Write_U16(const u16 _Data, const u32 _Address); -void Write_U32(const u32 _Data, const u32 _Address); -void Write_U64(const u64 _Data, const u32 _Address); +void Write_U8(const u8 _var, const u32 _Address); +void Write_U16(const u16 _var, const u32 _Address); +void Write_U32(const u32 _var, const u32 _Address); +void Write_U64(const u64 _var, const u32 _Address); -void Write_U16_Swap(const u16 _Data, const u32 _Address); -void Write_U32_Swap(const u32 _Data, const u32 _Address); -void Write_U64_Swap(const u64 _Data, const u32 _Address); +void Write_U16_Swap(const u16 _var, const u32 _Address); +void Write_U32_Swap(const u32 _var, const u32 _Address); +void Write_U64_Swap(const u64 _var, const u32 _Address); // Useful helper functions, used by ARM JIT -void Write_F64(const double _Data, const u32 _Address); +void Write_F64(const double _var, const u32 _Address); std::string GetString(u32 em_address, size_t size = 0); @@ -142,7 +142,7 @@ enum XCheckTLBFlag FLAG_WRITE, FLAG_OPCODE, }; -u32 TranslateAddress(u32 _Address, XCheckTLBFlag _Flag); +template u32 TranslateAddress(const u32 _Address); void InvalidateTLBEntry(u32 _Address); extern u32 pagetable_base; extern u32 pagetable_hashmask; diff --git a/Source/Core/Core/HW/MemmapFunctions.cpp b/Source/Core/Core/HW/MemmapFunctions.cpp index 8db4be1920..a8e3e8be63 100644 --- a/Source/Core/Core/HW/MemmapFunctions.cpp +++ b/Source/Core/Core/HW/MemmapFunctions.cpp @@ -16,6 +16,7 @@ // https://github.com/dolphin-emu/dolphin #include "Common/Atomic.h" +#include "Common/BitSet.h" #include "Common/CommonTypes.h" #include "Core/ConfigManager.h" @@ -91,9 +92,14 @@ static u32 EFB_Read(const u32 addr) static void GenerateDSIException(u32 _EffectiveAddress, bool _bWrite); -template -__forceinline void ReadFromHardware(U &_var, const u32 em_address, Memory::XCheckTLBFlag flag) +template +__forceinline void ReadFromHardware(U &_var, const u32 em_address) { + int segment = em_address >> 28; + // Quick 
check for an address that can't meet any of the following conditions, + // to speed up the MMU path. + if (BitSet32(0xCFC)[segment]) + goto translateaddress; // TODO: Figure out the fastest order of tests for both read and write (they are probably different). if ((em_address & 0xC8000000) == 0xC8000000) { @@ -102,30 +108,28 @@ __forceinline void ReadFromHardware(U &_var, const u32 em_address, Memory::XChec else _var = (T)mmio_mapping->Read::type>(em_address); } - else if (((em_address & 0xF0000000) == 0x80000000) || - ((em_address & 0xF0000000) == 0xC0000000) || - ((em_address & 0xF0000000) == 0x00000000)) + else if (segment == 0x8 || segment == 0xC || segment == 0x0) { _var = bswap((*(const T*)&m_pRAM[em_address & RAM_MASK])); } - else if (m_pEXRAM && (((em_address & 0xF0000000) == 0x90000000) || - ((em_address & 0xF0000000) == 0xD0000000) || - ((em_address & 0xF0000000) == 0x10000000))) + else if (m_pEXRAM && (segment == 0x9 || segment == 0xD || segment == 0x1)) { _var = bswap((*(const T*)&m_pEXRAM[em_address & EXRAM_MASK])); } - else if ((em_address >= 0xE0000000) && (em_address < (0xE0000000+L1_CACHE_SIZE))) + else if (segment == 0xE && (em_address < (0xE0000000+L1_CACHE_SIZE))) { _var = bswap((*(const T*)&m_pL1Cache[em_address & L1_CACHE_MASK])); } - else if ((bFakeVMEM && ((em_address &0xF0000000) == 0x70000000)) || - (bFakeVMEM && ((em_address &0xF0000000) == 0x40000000))) - { - // fake VMEM - _var = bswap((*(const T*)&m_pFakeVMEM[em_address & FAKEVMEM_MASK])); - } else { +translateaddress: + if (bFakeVMEM && (segment == 0x7 || segment == 0x4)) + { + // fake VMEM + _var = bswap((*(const T*)&m_pFakeVMEM[em_address & FAKEVMEM_MASK])); + return; + } + // MMU // Handle loads that cross page boundaries (ewwww) if (sizeof(T) > 1 && (em_address & (HW_PAGE_SIZE - 1)) > HW_PAGE_SIZE - sizeof(T)) @@ -134,9 +138,9 @@ __forceinline void ReadFromHardware(U &_var, const u32 em_address, Memory::XChec // way isn't too terrible. 
// TODO: floats on non-word-aligned boundaries should technically cause alignment exceptions. // Note that "word" means 32-bit, so paired singles or doubles might still be 32-bit aligned! - u32 tlb_addr = TranslateAddress(em_address, flag); + u32 tlb_addr = TranslateAddress(em_address); u32 em_address_next_page = (em_address + sizeof(T) - 1) & ~(HW_PAGE_SIZE - 1); - u32 tlb_addr_next_page = TranslateAddress(em_address_next_page, flag); + u32 tlb_addr_next_page = TranslateAddress(em_address_next_page); if (tlb_addr == 0 || tlb_addr_next_page == 0) { if (flag == FLAG_READ) @@ -163,7 +167,7 @@ __forceinline void ReadFromHardware(U &_var, const u32 em_address, Memory::XChec } else { - u32 tlb_addr = TranslateAddress(em_address, flag); + u32 tlb_addr = TranslateAddress(em_address); if (tlb_addr == 0) { if (flag == FLAG_READ) @@ -190,9 +194,14 @@ __forceinline void ReadFromHardware(U &_var, const u32 em_address, Memory::XChec } -template -__forceinline void WriteToHardware(u32 em_address, const T data, Memory::XCheckTLBFlag flag) +template +__forceinline void WriteToHardware(u32 em_address, const T data) { + int segment = em_address >> 28; + // Quick check for an address that can't meet any of the following conditions, + // to speed up the MMU path. 
+ if (BitSet32(0xCFC)[segment]) + goto translateaddress; // First, let's check for FIFO writes, since they are probably the most common // reason we end up in this function: if ((em_address & 0xFFFFF000) == 0xCC008000) @@ -231,42 +240,40 @@ __forceinline void WriteToHardware(u32 em_address, const T data, Memory::XCheckT return; } } - else if (((em_address & 0xF0000000) == 0x80000000) || - ((em_address & 0xF0000000) == 0xC0000000) || - ((em_address & 0xF0000000) == 0x00000000)) + else if (segment == 0x8 || segment == 0xC || segment == 0x0) { *(T*)&m_pRAM[em_address & RAM_MASK] = bswap(data); return; } - else if (m_pEXRAM && (((em_address & 0xF0000000) == 0x90000000) || - ((em_address & 0xF0000000) == 0xD0000000) || - ((em_address & 0xF0000000) == 0x10000000))) + else if (m_pEXRAM && (segment == 0x9 || segment == 0xD || segment == 0x1)) { *(T*)&m_pEXRAM[em_address & EXRAM_MASK] = bswap(data); return; } - else if ((em_address >= 0xE0000000) && (em_address < (0xE0000000+L1_CACHE_SIZE))) + else if (segment == 0xE && (em_address < (0xE0000000+L1_CACHE_SIZE))) { *(T*)&m_pL1Cache[em_address & L1_CACHE_MASK] = bswap(data); return; } - else if ((bFakeVMEM && ((em_address &0xF0000000) == 0x70000000)) || - (bFakeVMEM && ((em_address &0xF0000000) == 0x40000000))) - { - // fake VMEM - *(T*)&m_pFakeVMEM[em_address & FAKEVMEM_MASK] = bswap(data); - } - else + else { +translateaddress: + if (bFakeVMEM && (segment == 0x7 || segment == 0x4)) + { + // fake VMEM + *(T*)&m_pFakeVMEM[em_address & FAKEVMEM_MASK] = bswap(data); + return; + } + // MMU // Handle stores that cross page boundaries (ewwww) - if (sizeof(T) > 1 && (em_address & (HW_PAGE_SIZE-1)) > HW_PAGE_SIZE - sizeof(T)) + if (sizeof(T) > 1 && (em_address & (HW_PAGE_SIZE - 1)) > HW_PAGE_SIZE - sizeof(T)) { T val = bswap(data); // We need to check both addresses before writing in case there's a DSI. 
- u32 tlb_addr = TranslateAddress(em_address, flag); + u32 tlb_addr = TranslateAddress(em_address); u32 em_address_next_page = (em_address + sizeof(T) - 1) & ~(HW_PAGE_SIZE - 1); - u32 tlb_addr_next_page = TranslateAddress(em_address_next_page, flag); + u32 tlb_addr_next_page = TranslateAddress(em_address_next_page); if (tlb_addr == 0 || tlb_addr_next_page == 0) { if (flag == FLAG_WRITE) @@ -291,7 +298,7 @@ __forceinline void WriteToHardware(u32 em_address, const T data, Memory::XCheckT } else { - u32 tlb_addr = TranslateAddress(em_address, flag); + u32 tlb_addr = TranslateAddress(em_address); if (tlb_addr == 0) { if (flag == FLAG_WRITE) @@ -339,7 +346,7 @@ u32 Read_Opcode(u32 _Address) (_Address & ADDR_MASK_MEM1)) { // TODO: Check for MSR instruction address translation flag before translating - u32 tlb_addr = Memory::TranslateAddress(_Address, FLAG_OPCODE); + u32 tlb_addr = TranslateAddress(_Address); if (tlb_addr == 0) { GenerateISIException(_Address); @@ -354,63 +361,49 @@ u32 Read_Opcode(u32 _Address) return PowerPC::ppcState.iCache.ReadInstruction(_Address); } +#ifdef ENABLE_MEM_CHECK +#define MEMCHECK(write, size)\ +{\ +TMemCheck *mc = PowerPC::memchecks.GetMemCheck(_Address);\ +if (mc)\ +{\ + mc->numHits++;\ + mc->Action(&PowerPC::debug_interface, (u32)_var, _Address, write, size, PC);\ +}\ +} +#else +#define MEMCHECK(write, size) +#endif + u8 Read_U8(const u32 _Address) { u8 _var = 0; - ReadFromHardware(_var, _Address, FLAG_READ); -#ifdef ENABLE_MEM_CHECK - TMemCheck *mc = PowerPC::memchecks.GetMemCheck(_Address); - if (mc) - { - mc->numHits++; - mc->Action(&PowerPC::debug_interface, _var, _Address, false, 1, PC); - } -#endif + ReadFromHardware(_var, _Address); + MEMCHECK(false, 1); return (u8)_var; } u16 Read_U16(const u32 _Address) { u16 _var = 0; - ReadFromHardware(_var, _Address, FLAG_READ); -#ifdef ENABLE_MEM_CHECK - TMemCheck *mc = PowerPC::memchecks.GetMemCheck(_Address); - if (mc) - { - mc->numHits++; - mc->Action(&PowerPC::debug_interface, _var, 
_Address, false, 2, PC); - } -#endif + ReadFromHardware(_var, _Address); + MEMCHECK(false, 2); return (u16)_var; } u32 Read_U32(const u32 _Address) { u32 _var = 0; - ReadFromHardware(_var, _Address, FLAG_READ); -#ifdef ENABLE_MEM_CHECK - TMemCheck *mc = PowerPC::memchecks.GetMemCheck(_Address); - if (mc) - { - mc->numHits++; - mc->Action(&PowerPC::debug_interface, _var, _Address, false, 4, PC); - } -#endif + ReadFromHardware(_var, _Address); + MEMCHECK(false, 4); return _var; } u64 Read_U64(const u32 _Address) { u64 _var = 0; - ReadFromHardware(_var, _Address, FLAG_READ); -#ifdef ENABLE_MEM_CHECK - TMemCheck *mc = PowerPC::memchecks.GetMemCheck(_Address); - if (mc) - { - mc->numHits++; - mc->Action(&PowerPC::debug_interface, (u32)_var, _Address, false, 8, PC); - } -#endif + ReadFromHardware(_var, _Address); + MEMCHECK(false, 8); return _var; } @@ -438,40 +431,46 @@ float Read_F32(const u32 _Address) return cvt.d; } -u32 Read_U8_Val(u32 address, u32 val) +u32 Read_U8_Val(const u32 _Address, u32 _var) { - ReadFromHardware(val, address, FLAG_READ); - return val; + ReadFromHardware(_var, _Address); + MEMCHECK(false, 1); + return _var; } -u32 Read_S8_Val(u32 address, u32 val) +u32 Read_S8_Val(const u32 _Address, u32 _var) { - ReadFromHardware(val, address, FLAG_READ); - return val; + ReadFromHardware(_var, _Address); + MEMCHECK(false, 1); + return _var; } -u32 Read_U16_Val(u32 address, u32 val) +u32 Read_U16_Val(const u32 _Address, u32 _var) { - ReadFromHardware(val, address, FLAG_READ); - return val; + ReadFromHardware(_var, _Address); + MEMCHECK(false, 2); + return _var; } -u32 Read_S16_Val(u32 address, u32 val) +u32 Read_S16_Val(const u32 _Address, u32 _var) { - ReadFromHardware(val, address, FLAG_READ); - return val; + ReadFromHardware(_var, _Address); + MEMCHECK(false, 2); + return _var; } -u32 Read_U32_Val(u32 address, u32 val) +u32 Read_U32_Val(const u32 _Address, u32 _var) { - ReadFromHardware(val, address, FLAG_READ); - return val; + ReadFromHardware(_var, 
_Address); + MEMCHECK(false, 4); + return _var; } -u64 Read_U64_Val(u32 address, u64 val) +u64 Read_U64_Val(const u32 _Address, u64 _var) { - ReadFromHardware(val, address, FLAG_READ); - return val; + ReadFromHardware(_var, _Address); + MEMCHECK(false, 8); + return _var; } u32 Read_U8_ZX(const u32 _Address) @@ -484,88 +483,60 @@ u32 Read_U16_ZX(const u32 _Address) return (u32)Read_U16(_Address); } -void Write_U8(const u8 _Data, const u32 _Address) +void Write_U8(const u8 _var, const u32 _Address) { -#ifdef ENABLE_MEM_CHECK - TMemCheck *mc = PowerPC::memchecks.GetMemCheck(_Address); - if (mc) - { - mc->numHits++; - mc->Action(&PowerPC::debug_interface, _Data,_Address,true,1,PC); - } -#endif - WriteToHardware(_Address, _Data, FLAG_WRITE); + MEMCHECK(true, 1); + WriteToHardware(_Address, _var); +} + +void Write_U16(const u16 _var, const u32 _Address) +{ + MEMCHECK(true, 2); + WriteToHardware(_Address, _var); +} +void Write_U16_Swap(const u16 _var, const u32 _Address) +{ + MEMCHECK(true, 2); + Write_U16(Common::swap16(_var), _Address); } -void Write_U16(const u16 _Data, const u32 _Address) +void Write_U32(const u32 _var, const u32 _Address) { -#ifdef ENABLE_MEM_CHECK - TMemCheck *mc = PowerPC::memchecks.GetMemCheck(_Address); - if (mc) - { - mc->numHits++; - mc->Action(&PowerPC::debug_interface, _Data,_Address,true,2,PC); - } -#endif - - WriteToHardware(_Address, _Data, FLAG_WRITE); + MEMCHECK(true, 4); + WriteToHardware(_Address, _var); } -void Write_U16_Swap(const u16 _Data, const u32 _Address) +void Write_U32_Swap(const u32 _var, const u32 _Address) { - Write_U16(Common::swap16(_Data), _Address); + MEMCHECK(true, 4); + Write_U32(Common::swap32(_var), _Address); } - -void Write_U32(const u32 _Data, const u32 _Address) +void Write_U64(const u64 _var, const u32 _Address) { -#ifdef ENABLE_MEM_CHECK - TMemCheck *mc = PowerPC::memchecks.GetMemCheck(_Address); - if (mc) - { - mc->numHits++; - mc->Action(&PowerPC::debug_interface, _Data,_Address,true,4,PC); - } -#endif - 
WriteToHardware(_Address, _Data, FLAG_WRITE); + MEMCHECK(true, 8); + WriteToHardware(_Address, _var); } -void Write_U32_Swap(const u32 _Data, const u32 _Address) +void Write_U64_Swap(const u64 _var, const u32 _Address) { - Write_U32(Common::swap32(_Data), _Address); + MEMCHECK(true, 8); + Write_U64(Common::swap64(_var), _Address); } -void Write_U64(const u64 _Data, const u32 _Address) -{ -#ifdef ENABLE_MEM_CHECK - TMemCheck *mc = PowerPC::memchecks.GetMemCheck(_Address); - if (mc) - { - mc->numHits++; - mc->Action(&PowerPC::debug_interface, (u32)_Data,_Address,true,8,PC); - } -#endif - - WriteToHardware(_Address, _Data, FLAG_WRITE); -} -void Write_U64_Swap(const u64 _Data, const u32 _Address) -{ - Write_U64(Common::swap64(_Data), _Address); -} - -void Write_F64(const double _Data, const u32 _Address) +void Write_F64(const double _var, const u32 _Address) { union { u64 i; double d; } cvt; - cvt.d = _Data; + cvt.d = _var; Write_U64(cvt.i, _Address); } u8 ReadUnchecked_U8(const u32 _Address) { u8 _var = 0; - ReadFromHardware(_var, _Address, FLAG_NO_EXCEPTION); + ReadFromHardware(_var, _Address); return _var; } @@ -573,19 +544,19 @@ u8 ReadUnchecked_U8(const u32 _Address) u32 ReadUnchecked_U32(const u32 _Address) { u32 _var = 0; - ReadFromHardware(_var, _Address, FLAG_NO_EXCEPTION); + ReadFromHardware(_var, _Address); return _var; } void WriteUnchecked_U8(const u8 _iValue, const u32 _Address) { - WriteToHardware(_Address, _iValue, FLAG_NO_EXCEPTION); + WriteToHardware(_Address, _iValue); } void WriteUnchecked_U32(const u32 _iValue, const u32 _Address) { - WriteToHardware(_Address, _iValue, FLAG_NO_EXCEPTION); + WriteToHardware(_Address, _iValue); } // ********************************************************************************* @@ -731,8 +702,9 @@ void SDRUpdated() static __forceinline u32 LookupTLBPageAddress(const XCheckTLBFlag _Flag, const u32 vpa, u32 *paddr) { - PowerPC::tlb_entry *tlbe = PowerPC::ppcState.tlb[_Flag == FLAG_OPCODE][(vpa >> HW_PAGE_INDEX_SHIFT) 
& HW_PAGE_INDEX_MASK]; - if (tlbe[0].tag == (vpa & ~0xfff) && !(tlbe[0].flags & TLB_FLAG_INVALID)) + int tag = vpa >> HW_PAGE_INDEX_SHIFT; + PowerPC::tlb_entry *tlbe = PowerPC::ppcState.tlb[_Flag == FLAG_OPCODE][tag & HW_PAGE_INDEX_MASK]; + if (tlbe[0].tag == tag && !(tlbe[0].flags & TLB_FLAG_INVALID)) { // Check if C bit requires updating if (_Flag == FLAG_WRITE) @@ -757,7 +729,7 @@ static __forceinline u32 LookupTLBPageAddress(const XCheckTLBFlag _Flag, const u return 1; } - if (tlbe[1].tag == (vpa & ~0xfff) && !(tlbe[1].flags & TLB_FLAG_INVALID)) + if (tlbe[1].tag == tag && !(tlbe[1].flags & TLB_FLAG_INVALID)) { // Check if C bit requires updating if (_Flag == FLAG_WRITE) @@ -797,7 +769,7 @@ static __forceinline void UpdateTLBEntry(const XCheckTLBFlag _Flag, UPTE2 PTE2, tlbe[1].flags &= ~TLB_FLAG_MOST_RECENT; tlbe[0].paddr = PTE2.RPN << HW_PAGE_INDEX_SHIFT; tlbe[0].pte = PTE2.Hex; - tlbe[0].tag = vpa & ~0xfff; + tlbe[0].tag = vpa >> HW_PAGE_INDEX_SHIFT; } else { @@ -805,7 +777,7 @@ static __forceinline void UpdateTLBEntry(const XCheckTLBFlag _Flag, UPTE2 PTE2, tlbe[0].flags &= ~TLB_FLAG_MOST_RECENT; tlbe[1].paddr = PTE2.RPN << HW_PAGE_INDEX_SHIFT; tlbe[1].pte = PTE2.Hex; - tlbe[1].tag = vpa & ~0xfff; + tlbe[1].tag = vpa >> HW_PAGE_INDEX_SHIFT; } } @@ -952,7 +924,8 @@ static u32 TranslateBlockAddress(const u32 addr, const XCheckTLBFlag _Flag) } // Translate effective address using BAT or PAT. Returns 0 if the address cannot be translated. -u32 TranslateAddress(const u32 _Address, const XCheckTLBFlag _Flag) +template +u32 TranslateAddress(const u32 _Address) { // Check MSR[IR] bit before translating instruction addresses. Rogue Leader clears IR and DR?? 
//if ((_Flag == FLAG_OPCODE) && !(MSR & (1 << (31 - 26)))) return _Address; @@ -970,4 +943,9 @@ u32 TranslateAddress(const u32 _Address, const XCheckTLBFlag _Flag) } return TranslatePageAddress(_Address, _Flag); } + +template u32 TranslateAddress(const u32 _Address); +template u32 TranslateAddress(const u32 _Address); +template u32 TranslateAddress(const u32 _Address); +template u32 TranslateAddress(const u32 _Address); } // namespace diff --git a/Source/Core/Core/PowerPC/JitInterface.cpp b/Source/Core/Core/PowerPC/JitInterface.cpp index 86c17653d5..87cb0c6d43 100644 --- a/Source/Core/Core/PowerPC/JitInterface.cpp +++ b/Source/Core/Core/PowerPC/JitInterface.cpp @@ -211,7 +211,7 @@ namespace JitInterface { if (bMMU && !bFakeVMEM && (_Address & Memory::ADDR_MASK_MEM1)) { - _Address = Memory::TranslateAddress(_Address, Memory::FLAG_OPCODE); + _Address = Memory::TranslateAddress(_Address); if (_Address == 0) { return 0; diff --git a/Source/Core/Core/PowerPC/PPCAnalyst.cpp b/Source/Core/Core/PowerPC/PPCAnalyst.cpp index 1064e6af2e..5c6dd390c8 100644 --- a/Source/Core/Core/PowerPC/PPCAnalyst.cpp +++ b/Source/Core/Core/PowerPC/PPCAnalyst.cpp @@ -649,7 +649,7 @@ u32 PPCAnalyzer::Analyze(u32 address, CodeBlock *block, CodeBuffer *buffer, u32 bool virtualAddr = SConfig::GetInstance().m_LocalCoreStartupParameter.bMMU && (address & JIT_ICACHE_VMEM_BIT); if (virtualAddr) { - if (!Memory::TranslateAddress(address, Memory::FLAG_NO_EXCEPTION)) + if (!Memory::TranslateAddress(address)) { // Memory exception occurred during instruction fetch block->m_memory_exception = true; From a0e5c76a1f6182f12a0203061407c5b4414ef20e Mon Sep 17 00:00:00 2001 From: Fiora Date: Thu, 1 Jan 2015 13:02:33 -0800 Subject: [PATCH 03/19] Memmapfunctions: more refactoring and optimizations Try to clean up some redundant code, simplify a few checks, and simplify page accesses. 
--- Source/Core/Core/HW/Memmap.h | 44 ++-- Source/Core/Core/HW/MemmapFunctions.cpp | 318 +++++++++++------------- 2 files changed, 163 insertions(+), 199 deletions(-) diff --git a/Source/Core/Core/HW/Memmap.h b/Source/Core/Core/HW/Memmap.h index 31e2e9979f..c22d2b6a67 100644 --- a/Source/Core/Core/HW/Memmap.h +++ b/Source/Core/Core/HW/Memmap.h @@ -91,37 +91,37 @@ u32 Read_Instruction(const u32 _Address); // For use by emulator -u8 Read_U8(const u32 _Address); -u16 Read_U16(const u32 _Address); -u32 Read_U32(const u32 _Address); -u64 Read_U64(const u32 _Address); +u8 Read_U8(const u32 address); +u16 Read_U16(const u32 address); +u32 Read_U32(const u32 address); +u64 Read_U64(const u32 address); -u32 Read_S8_Val(const u32 _Address, u32 _var); -u32 Read_U8_Val(const u32 _Address, u32 _var); -u32 Read_S16_Val(const u32 _Address, u32 _var); -u32 Read_U16_Val(const u32 _Address, u32 _var); -u32 Read_U32_Val(const u32 _Address, u32 _var); -u64 Read_U64_Val(const u32 _Address, u64 _var); +u32 Read_S8_Val(const u32 address, u32 var); +u32 Read_U8_Val(const u32 address, u32 var); +u32 Read_S16_Val(const u32 address, u32 var); +u32 Read_U16_Val(const u32 address, u32 var); +u32 Read_U32_Val(const u32 address, u32 var); +u64 Read_U64_Val(const u32 address, u64 var); // Useful helper functions, used by ARM JIT -float Read_F32(const u32 _Address); -double Read_F64(const u32 _Address); +float Read_F32(const u32 address); +double Read_F64(const u32 address); // used by JIT. 
Return zero-extended 32bit values -u32 Read_U8_ZX(const u32 _Address); -u32 Read_U16_ZX(const u32 _Address); +u32 Read_U8_ZX(const u32 address); +u32 Read_U16_ZX(const u32 address); -void Write_U8(const u8 _var, const u32 _Address); -void Write_U16(const u16 _var, const u32 _Address); -void Write_U32(const u32 _var, const u32 _Address); -void Write_U64(const u64 _var, const u32 _Address); +void Write_U8(const u8 var, const u32 address); +void Write_U16(const u16 var, const u32 address); +void Write_U32(const u32 var, const u32 address); +void Write_U64(const u64 var, const u32 address); -void Write_U16_Swap(const u16 _var, const u32 _Address); -void Write_U32_Swap(const u32 _var, const u32 _Address); -void Write_U64_Swap(const u64 _var, const u32 _Address); +void Write_U16_Swap(const u16 var, const u32 address); +void Write_U32_Swap(const u32 var, const u32 address); +void Write_U64_Swap(const u64 var, const u32 address); // Useful helper functions, used by ARM JIT -void Write_F64(const double _var, const u32 _Address); +void Write_F64(const double var, const u32 address); std::string GetString(u32 em_address, size_t size = 0); diff --git a/Source/Core/Core/HW/MemmapFunctions.cpp b/Source/Core/Core/HW/MemmapFunctions.cpp index a8e3e8be63..d5a4bc94a3 100644 --- a/Source/Core/Core/HW/MemmapFunctions.cpp +++ b/Source/Core/Core/HW/MemmapFunctions.cpp @@ -129,28 +129,31 @@ translateaddress: _var = bswap((*(const T*)&m_pFakeVMEM[em_address & FAKEVMEM_MASK])); return; } - - // MMU + + // MMU: Do page table translation + u32 tlb_addr = TranslateAddress(em_address); + if (tlb_addr == 0) + { + if (flag == FLAG_READ) + GenerateDSIException(em_address, false); + return; + } + // Handle loads that cross page boundaries (ewwww) - if (sizeof(T) > 1 && (em_address & (HW_PAGE_SIZE - 1)) > HW_PAGE_SIZE - sizeof(T)) + // The alignment check isn't strictly necessary, but since this is a rare slow path, it provides a faster + // (1 instruction on x86) bailout. 
+ if (sizeof(T) > 1 && (em_address & (sizeof(T) - 1)) && (em_address & (HW_PAGE_SIZE - 1)) > HW_PAGE_SIZE - sizeof(T)) { // This could be unaligned down to the byte level... hopefully this is rare, so doing it this // way isn't too terrible. // TODO: floats on non-word-aligned boundaries should technically cause alignment exceptions. // Note that "word" means 32-bit, so paired singles or doubles might still be 32-bit aligned! - u32 tlb_addr = TranslateAddress(em_address); u32 em_address_next_page = (em_address + sizeof(T) - 1) & ~(HW_PAGE_SIZE - 1); u32 tlb_addr_next_page = TranslateAddress(em_address_next_page); if (tlb_addr == 0 || tlb_addr_next_page == 0) { if (flag == FLAG_READ) - { - u32 exception_addr = tlb_addr == 0 ? em_address : em_address_next_page; - if (!SConfig::GetInstance().m_LocalCoreStartupParameter.bMMU) - PanicAlertT("Invalid Read at 0x%08x, PC = 0x%08x ", exception_addr, PC); - else - GenerateDSIException(exception_addr, false); - } + GenerateDSIException(em_address_next_page, false); return; } _var = 0; @@ -158,37 +161,14 @@ translateaddress: { if (addr == em_address_next_page) tlb_addr = tlb_addr_next_page; - _var <<= 8; - if (m_pEXRAM && (tlb_addr & 0xF0000000) == 0x10000000) - _var |= m_pEXRAM[tlb_addr & EXRAM_MASK]; - else - _var |= m_pRAM[tlb_addr & RAM_MASK]; + _var = (_var << 8) | Memory::base[tlb_addr]; } + return; } else { - u32 tlb_addr = TranslateAddress(em_address); - if (tlb_addr == 0) - { - if (flag == FLAG_READ) - { - if (!SConfig::GetInstance().m_LocalCoreStartupParameter.bMMU) - PanicAlertT("Invalid Read at 0x%08x, PC = 0x%08x ", em_address, PC); - else - GenerateDSIException(em_address, false); - } - } - else - { - if (m_pEXRAM && (tlb_addr & 0xF0000000) == 0x10000000) - { - _var = bswap((*(const T*)&m_pEXRAM[tlb_addr & EXRAM_MASK])); - } - else - { - _var = bswap((*(const T*)&m_pRAM[tlb_addr & RAM_MASK])); - } - } + // The easy case! 
+ _var = bswap(*(const T*)&Memory::base[tlb_addr]); } } } @@ -255,7 +235,7 @@ __forceinline void WriteToHardware(u32 em_address, const T data) *(T*)&m_pL1Cache[em_address & L1_CACHE_MASK] = bswap(data); return; } - else + else { translateaddress: if (bFakeVMEM && (segment == 0x7 || segment == 0x4)) @@ -265,61 +245,40 @@ translateaddress: return; } - // MMU + // MMU: Do page table translation + u32 tlb_addr = TranslateAddress(em_address); + if (tlb_addr == 0) + { + if (flag == FLAG_WRITE) + GenerateDSIException(em_address, true); + return; + } + // Handle stores that cross page boundaries (ewwww) - if (sizeof(T) > 1 && (em_address & (HW_PAGE_SIZE - 1)) > HW_PAGE_SIZE - sizeof(T)) + if (sizeof(T) > 1 && (em_address & (sizeof(T) - 1)) && (em_address & (HW_PAGE_SIZE - 1)) > HW_PAGE_SIZE - sizeof(T)) { T val = bswap(data); + // We need to check both addresses before writing in case there's a DSI. - u32 tlb_addr = TranslateAddress(em_address); u32 em_address_next_page = (em_address + sizeof(T) - 1) & ~(HW_PAGE_SIZE - 1); u32 tlb_addr_next_page = TranslateAddress(em_address_next_page); - if (tlb_addr == 0 || tlb_addr_next_page == 0) + if (tlb_addr_next_page == 0) { if (flag == FLAG_WRITE) - { - u32 exception_addr = tlb_addr == 0 ? 
em_address : em_address_next_page; - if (!SConfig::GetInstance().m_LocalCoreStartupParameter.bMMU) - PanicAlertT("Invalid Write to 0x%08x, PC = 0x%08x ", exception_addr, PC); - else - GenerateDSIException(exception_addr, true); - } + GenerateDSIException(em_address_next_page, true); return; } for (u32 addr = em_address; addr < em_address + sizeof(T); addr++, tlb_addr++, val >>= 8) { if (addr == em_address_next_page) tlb_addr = tlb_addr_next_page; - if (m_pEXRAM && (tlb_addr & 0xF0000000) == 0x10000000) - m_pEXRAM[tlb_addr & EXRAM_MASK] = (u8)val; - else - m_pRAM[tlb_addr & RAM_MASK] = (u8)val; + Memory::base[tlb_addr] = (u8)val; } } else { - u32 tlb_addr = TranslateAddress(em_address); - if (tlb_addr == 0) - { - if (flag == FLAG_WRITE) - { - if (!SConfig::GetInstance().m_LocalCoreStartupParameter.bMMU) - PanicAlertT("Invalid Write to 0x%08x, PC = 0x%08x ", em_address, PC); - else - GenerateDSIException(em_address, true); - } - } - else - { - if (m_pEXRAM && (tlb_addr & 0xF0000000) == 0x10000000) - { - *(T*)&m_pEXRAM[tlb_addr & EXRAM_MASK] = bswap(data); - } - else - { - *(T*)&m_pRAM[tlb_addr & RAM_MASK] = bswap(data); - } - } + // The easy case! 
+ *(T*)&Memory::base[tlb_addr] = bswap(data); } } } @@ -361,53 +320,51 @@ u32 Read_Opcode(u32 _Address) return PowerPC::ppcState.iCache.ReadInstruction(_Address); } +static __forceinline void Memcheck(u32 address, u32 var, bool write, int size) +{ #ifdef ENABLE_MEM_CHECK -#define MEMCHECK(write, size)\ -{\ -TMemCheck *mc = PowerPC::memchecks.GetMemCheck(_Address);\ -if (mc)\ -{\ - mc->numHits++;\ - mc->Action(&PowerPC::debug_interface, (u32)_var, _Address, write, size, PC);\ -}\ -} -#else -#define MEMCHECK(write, size) + TMemCheck *mc = PowerPC::memchecks.GetMemCheck(address); + if (mc) + { + mc->numHits++; + mc->Action(&PowerPC::debug_interface, var, address, write, size, PC); + } #endif - -u8 Read_U8(const u32 _Address) -{ - u8 _var = 0; - ReadFromHardware(_var, _Address); - MEMCHECK(false, 1); - return (u8)_var; } -u16 Read_U16(const u32 _Address) +u8 Read_U8(const u32 address) { - u16 _var = 0; - ReadFromHardware(_var, _Address); - MEMCHECK(false, 2); - return (u16)_var; + u8 var = 0; + ReadFromHardware(var, address); + Memcheck(address, var, false, 1); + return (u8)var; } -u32 Read_U32(const u32 _Address) +u16 Read_U16(const u32 address) { - u32 _var = 0; - ReadFromHardware(_var, _Address); - MEMCHECK(false, 4); - return _var; + u16 var = 0; + ReadFromHardware(var, address); + Memcheck(address, var, false, 2); + return (u16)var; } -u64 Read_U64(const u32 _Address) +u32 Read_U32(const u32 address) { - u64 _var = 0; - ReadFromHardware(_var, _Address); - MEMCHECK(false, 8); - return _var; + u32 var = 0; + ReadFromHardware(var, address); + Memcheck(address, var, false, 4); + return var; } -double Read_F64(const u32 _Address) +u64 Read_U64(const u32 address) +{ + u64 var = 0; + ReadFromHardware(var, address); + Memcheck(address, (u32)var, false, 8); + return var; +} + +double Read_F64(const u32 address) { union { @@ -415,11 +372,11 @@ double Read_F64(const u32 _Address) double d; } cvt; - cvt.i = Read_U64(_Address); + cvt.i = Read_U64(address); return cvt.d; } 
-float Read_F32(const u32 _Address) +float Read_F32(const u32 address) { union { @@ -427,136 +384,136 @@ float Read_F32(const u32 _Address) float d; } cvt; - cvt.i = Read_U32(_Address); + cvt.i = Read_U32(address); return cvt.d; } -u32 Read_U8_Val(const u32 _Address, u32 _var) +u32 Read_U8_Val(const u32 address, u32 var) { - ReadFromHardware(_var, _Address); - MEMCHECK(false, 1); - return _var; + ReadFromHardware(var, address); + Memcheck(address, var, false, 1); + return var; } -u32 Read_S8_Val(const u32 _Address, u32 _var) +u32 Read_S8_Val(const u32 address, u32 var) { - ReadFromHardware(_var, _Address); - MEMCHECK(false, 1); - return _var; + ReadFromHardware(var, address); + Memcheck(address, var, false, 1); + return var; } -u32 Read_U16_Val(const u32 _Address, u32 _var) +u32 Read_U16_Val(const u32 address, u32 var) { - ReadFromHardware(_var, _Address); - MEMCHECK(false, 2); - return _var; + ReadFromHardware(var, address); + Memcheck(address, var, false, 2); + return var; } -u32 Read_S16_Val(const u32 _Address, u32 _var) +u32 Read_S16_Val(const u32 address, u32 var) { - ReadFromHardware(_var, _Address); - MEMCHECK(false, 2); - return _var; + ReadFromHardware(var, address); + Memcheck(address, var, false, 2); + return var; } -u32 Read_U32_Val(const u32 _Address, u32 _var) +u32 Read_U32_Val(const u32 address, u32 var) { - ReadFromHardware(_var, _Address); - MEMCHECK(false, 4); - return _var; + ReadFromHardware(var, address); + Memcheck(address, var, false, 4); + return var; } -u64 Read_U64_Val(const u32 _Address, u64 _var) +u64 Read_U64_Val(const u32 address, u64 var) { - ReadFromHardware(_var, _Address); - MEMCHECK(false, 8); - return _var; + ReadFromHardware(var, address); + Memcheck(address, (u32)var, false, 8); + return var; } -u32 Read_U8_ZX(const u32 _Address) +u32 Read_U8_ZX(const u32 address) { - return (u32)Read_U8(_Address); + return (u32)Read_U8(address); } -u32 Read_U16_ZX(const u32 _Address) +u32 Read_U16_ZX(const u32 address) { - return 
(u32)Read_U16(_Address); + return (u32)Read_U16(address); } -void Write_U8(const u8 _var, const u32 _Address) +void Write_U8(const u8 var, const u32 address) { - MEMCHECK(true, 1); - WriteToHardware(_Address, _var); + Memcheck(address, var, true, 1); + WriteToHardware(address, var); } -void Write_U16(const u16 _var, const u32 _Address) +void Write_U16(const u16 var, const u32 address) { - MEMCHECK(true, 2); - WriteToHardware(_Address, _var); + Memcheck(address, var, true, 2); + WriteToHardware(address, var); } -void Write_U16_Swap(const u16 _var, const u32 _Address) +void Write_U16_Swap(const u16 var, const u32 address) { - MEMCHECK(true, 2); - Write_U16(Common::swap16(_var), _Address); + Memcheck(address, var, true, 2); + Write_U16(Common::swap16(var), address); } -void Write_U32(const u32 _var, const u32 _Address) +void Write_U32(const u32 var, const u32 address) { - MEMCHECK(true, 4); - WriteToHardware(_Address, _var); + Memcheck(address, var, true, 4); + WriteToHardware(address, var); } -void Write_U32_Swap(const u32 _var, const u32 _Address) +void Write_U32_Swap(const u32 var, const u32 address) { - MEMCHECK(true, 4); - Write_U32(Common::swap32(_var), _Address); + Memcheck(address, var, true, 4); + Write_U32(Common::swap32(var), address); } -void Write_U64(const u64 _var, const u32 _Address) +void Write_U64(const u64 var, const u32 address) { - MEMCHECK(true, 8); - WriteToHardware(_Address, _var); + Memcheck(address, (u32)var, true, 8); + WriteToHardware(address, var); } -void Write_U64_Swap(const u64 _var, const u32 _Address) +void Write_U64_Swap(const u64 var, const u32 address) { - MEMCHECK(true, 8); - Write_U64(Common::swap64(_var), _Address); + Memcheck(address, (u32)var, true, 8); + Write_U64(Common::swap64(var), address); } -void Write_F64(const double _var, const u32 _Address) +void Write_F64(const double var, const u32 address) { union { u64 i; double d; } cvt; - cvt.d = _var; - Write_U64(cvt.i, _Address); + cvt.d = var; + Write_U64(cvt.i, address); } 
-u8 ReadUnchecked_U8(const u32 _Address) +u8 ReadUnchecked_U8(const u32 address) { - u8 _var = 0; - ReadFromHardware(_var, _Address); - return _var; + u8 var = 0; + ReadFromHardware(var, address); + return var; } -u32 ReadUnchecked_U32(const u32 _Address) +u32 ReadUnchecked_U32(const u32 address) { - u32 _var = 0; - ReadFromHardware(_var, _Address); - return _var; + u32 var = 0; + ReadFromHardware(var, address); + return var; } -void WriteUnchecked_U8(const u8 _iValue, const u32 _Address) +void WriteUnchecked_U8(const u8 var, const u32 address) { - WriteToHardware(_Address, _iValue); + WriteToHardware(address, var); } -void WriteUnchecked_U32(const u32 _iValue, const u32 _Address) +void WriteUnchecked_U32(const u32 var, const u32 address) { - WriteToHardware(_Address, _iValue); + WriteToHardware(address, var); } // ********************************************************************************* @@ -654,6 +611,13 @@ union UPTE2 static void GenerateDSIException(u32 _EffectiveAddress, bool _bWrite) { + // DSI exceptions are only supported in MMU mode. + if (!SConfig::GetInstance().m_LocalCoreStartupParameter.bMMU) + { + PanicAlertT("Invalid %s to 0x%08x, PC = 0x%08x ", _bWrite ? "Write to" : "Read from", _EffectiveAddress, PC); + return; + } + if (_bWrite) PowerPC::ppcState.spr[SPR_DSISR] = PPC_EXC_DSISR_PAGE | PPC_EXC_DSISR_STORE; else From dde8b24d004100414748921d59e51a0e89ae8ecb Mon Sep 17 00:00:00 2001 From: Fiora Date: Thu, 1 Jan 2015 14:35:17 -0800 Subject: [PATCH 04/19] MMU: small simplification of TLB structure We only need one "recent" per set, not NUM_WAYS recents. Slightly faster. Breaks savestate compatibility. 
--- Source/Core/Core/HW/MemmapFunctions.cpp | 66 ++++++++++--------------- Source/Core/Core/PowerPC/PowerPC.cpp | 8 +-- Source/Core/Core/PowerPC/PowerPC.h | 15 +++--- Source/Core/Core/State.cpp | 2 +- 4 files changed, 38 insertions(+), 53 deletions(-) diff --git a/Source/Core/Core/HW/MemmapFunctions.cpp b/Source/Core/Core/HW/MemmapFunctions.cpp index d5a4bc94a3..64674168b3 100644 --- a/Source/Core/Core/HW/MemmapFunctions.cpp +++ b/Source/Core/Core/HW/MemmapFunctions.cpp @@ -667,54 +667,48 @@ void SDRUpdated() static __forceinline u32 LookupTLBPageAddress(const XCheckTLBFlag _Flag, const u32 vpa, u32 *paddr) { int tag = vpa >> HW_PAGE_INDEX_SHIFT; - PowerPC::tlb_entry *tlbe = PowerPC::ppcState.tlb[_Flag == FLAG_OPCODE][tag & HW_PAGE_INDEX_MASK]; - if (tlbe[0].tag == tag && !(tlbe[0].flags & TLB_FLAG_INVALID)) + PowerPC::tlb_entry *tlbe = &PowerPC::ppcState.tlb[_Flag == FLAG_OPCODE][tag & HW_PAGE_INDEX_MASK]; + if (tlbe->tag[0] == tag) { // Check if C bit requires updating if (_Flag == FLAG_WRITE) { UPTE2 PTE2; - PTE2.Hex = tlbe[0].pte; + PTE2.Hex = tlbe->pte[0]; if (PTE2.C == 0) { PTE2.C = 1; - tlbe[0].pte = PTE2.Hex; + tlbe->pte[0] = PTE2.Hex; return 0; } } if (_Flag != FLAG_NO_EXCEPTION) - { - tlbe[0].flags |= TLB_FLAG_MOST_RECENT; - tlbe[1].flags &= ~TLB_FLAG_MOST_RECENT; - } + tlbe->recent = 0; - *paddr = tlbe[0].paddr | (vpa & 0xfff); + *paddr = tlbe->paddr[0] | (vpa & 0xfff); return 1; } - if (tlbe[1].tag == tag && !(tlbe[1].flags & TLB_FLAG_INVALID)) + if (tlbe->tag[1] == tag) { // Check if C bit requires updating if (_Flag == FLAG_WRITE) { UPTE2 PTE2; - PTE2.Hex = tlbe[1].pte; + PTE2.Hex = tlbe->pte[1]; if (PTE2.C == 0) { PTE2.C = 1; - tlbe[1].pte = PTE2.Hex; + tlbe->pte[1] = PTE2.Hex; return 0; } } if (_Flag != FLAG_NO_EXCEPTION) - { - tlbe[1].flags |= TLB_FLAG_MOST_RECENT; - tlbe[0].flags &= ~TLB_FLAG_MOST_RECENT; - } + tlbe->recent = 1; - *paddr = tlbe[1].paddr | (vpa & 0xfff); + *paddr = tlbe->paddr[1] | (vpa & 0xfff); return 1; } @@ -726,39 +720,31 @@ 
static __forceinline void UpdateTLBEntry(const XCheckTLBFlag _Flag, UPTE2 PTE2, if (_Flag == FLAG_NO_EXCEPTION) return; - PowerPC::tlb_entry *tlbe = PowerPC::ppcState.tlb[_Flag == FLAG_OPCODE][(vpa >> HW_PAGE_INDEX_SHIFT) & HW_PAGE_INDEX_MASK]; - if ((tlbe[0].flags & TLB_FLAG_MOST_RECENT) == 0 || (tlbe[0].flags & TLB_FLAG_INVALID)) - { - tlbe[0].flags = TLB_FLAG_MOST_RECENT; - tlbe[1].flags &= ~TLB_FLAG_MOST_RECENT; - tlbe[0].paddr = PTE2.RPN << HW_PAGE_INDEX_SHIFT; - tlbe[0].pte = PTE2.Hex; - tlbe[0].tag = vpa >> HW_PAGE_INDEX_SHIFT; - } - else - { - tlbe[1].flags = TLB_FLAG_MOST_RECENT; - tlbe[0].flags &= ~TLB_FLAG_MOST_RECENT; - tlbe[1].paddr = PTE2.RPN << HW_PAGE_INDEX_SHIFT; - tlbe[1].pte = PTE2.Hex; - tlbe[1].tag = vpa >> HW_PAGE_INDEX_SHIFT; - } + int tag = vpa >> HW_PAGE_INDEX_SHIFT; + PowerPC::tlb_entry *tlbe = &PowerPC::ppcState.tlb[_Flag == FLAG_OPCODE][tag & HW_PAGE_INDEX_MASK]; + int index = tlbe->recent == 0 && tlbe->tag[0] != TLB_TAG_INVALID; + tlbe->recent = index; + tlbe->paddr[index] = PTE2.RPN << HW_PAGE_INDEX_SHIFT; + tlbe->pte[index] = PTE2.Hex; + tlbe->tag[index] = tag; } void InvalidateTLBEntry(u32 vpa) { - PowerPC::tlb_entry *tlbe = PowerPC::ppcState.tlb[0][(vpa >> HW_PAGE_INDEX_SHIFT) & HW_PAGE_INDEX_MASK]; - tlbe[0].flags |= TLB_FLAG_INVALID; - tlbe[1].flags |= TLB_FLAG_INVALID; - PowerPC::tlb_entry *tlbe_i = PowerPC::ppcState.tlb[1][(vpa >> HW_PAGE_INDEX_SHIFT) & HW_PAGE_INDEX_MASK]; - tlbe_i[0].flags |= TLB_FLAG_INVALID; - tlbe_i[1].flags |= TLB_FLAG_INVALID; + PowerPC::tlb_entry *tlbe = &PowerPC::ppcState.tlb[0][(vpa >> HW_PAGE_INDEX_SHIFT) & HW_PAGE_INDEX_MASK]; + tlbe->tag[0] = TLB_TAG_INVALID; + tlbe->tag[1] = TLB_TAG_INVALID; + PowerPC::tlb_entry *tlbe_i = &PowerPC::ppcState.tlb[1][(vpa >> HW_PAGE_INDEX_SHIFT) & HW_PAGE_INDEX_MASK]; + tlbe_i->tag[0] = TLB_TAG_INVALID; + tlbe_i->tag[1] = TLB_TAG_INVALID; } // Page Address Translation static __forceinline u32 TranslatePageAddress(const u32 _Address, const XCheckTLBFlag _Flag) { // TLB 
cache + // This catches 99%+ of lookups in practice, so the actual page table entry code below doesn't benefit + // much from optimization. u32 translatedAddress = 0; if (LookupTLBPageAddress(_Flag, _Address, &translatedAddress)) return translatedAddress; diff --git a/Source/Core/Core/PowerPC/PowerPC.cpp b/Source/Core/Core/PowerPC/PowerPC.cpp index 35f96fb495..5f8bc9de00 100644 --- a/Source/Core/Core/PowerPC/PowerPC.cpp +++ b/Source/Core/Core/PowerPC/PowerPC.cpp @@ -125,12 +125,12 @@ void Init(int cpu_core) { for (int set = 0; set < 64; set++) { + ppcState.tlb[tlb][set].recent = 0; for (int way = 0; way < 2; way++) { - ppcState.tlb[tlb][set][way].flags = TLB_FLAG_INVALID; - ppcState.tlb[tlb][set][way].paddr = 0; - ppcState.tlb[tlb][set][way].pte = 0; - ppcState.tlb[tlb][set][way].tag = 0; + ppcState.tlb[tlb][set].paddr[way] = 0; + ppcState.tlb[tlb][set].pte[way] = 0; + ppcState.tlb[tlb][set].tag[way] = TLB_TAG_INVALID; } } } diff --git a/Source/Core/Core/PowerPC/PowerPC.h b/Source/Core/Core/PowerPC/PowerPC.h index 69eb0da28b..bb6d418065 100644 --- a/Source/Core/Core/PowerPC/PowerPC.h +++ b/Source/Core/Core/PowerPC/PowerPC.h @@ -29,22 +29,21 @@ enum CoreMode // TLB cache #define TLB_SIZE 128 -#define TLB_WAYS 2 #define NUM_TLBS 2 +#define TLB_WAYS 2 #define HW_PAGE_INDEX_SHIFT 12 #define HW_PAGE_INDEX_MASK 0x3f #define HW_PAGE_TAG_SHIFT 18 -#define TLB_FLAG_MOST_RECENT 0x01 -#define TLB_FLAG_INVALID 0x02 +#define TLB_TAG_INVALID 0xffffffff struct tlb_entry { - u32 tag; - u32 paddr; - u32 pte; - u8 flags; + u32 tag[TLB_WAYS]; + u32 paddr[TLB_WAYS]; + u32 pte[TLB_WAYS]; + u8 recent; }; // This contains the entire state of the emulated PowerPC "Gekko" CPU. @@ -107,7 +106,7 @@ struct GC_ALIGNED64(PowerPCState) // also for power management, but we don't care about that. 
u32 spr[1024]; - tlb_entry tlb[NUM_TLBS][TLB_SIZE / TLB_WAYS][TLB_WAYS]; + tlb_entry tlb[NUM_TLBS][TLB_SIZE / TLB_WAYS]; u32 pagetable_base; u32 pagetable_hashmask; diff --git a/Source/Core/Core/State.cpp b/Source/Core/Core/State.cpp index d156bb4adc..a63fafc2a5 100644 --- a/Source/Core/Core/State.cpp +++ b/Source/Core/Core/State.cpp @@ -64,7 +64,7 @@ static Common::Event g_compressAndDumpStateSyncEvent; static std::thread g_save_thread; // Don't forget to increase this after doing changes on the savestate system -static const u32 STATE_VERSION = 37; +static const u32 STATE_VERSION = 38; enum { From 6f028257d7277b3f5f671c8f7878ac97c021a365 Mon Sep 17 00:00:00 2001 From: Fiora Date: Thu, 1 Jan 2015 14:47:30 -0800 Subject: [PATCH 05/19] MMU: remove code that looks totally wrong I don't think this affects any games (who puts PTEs in MEM2?) but it didn't make any sense. --- Source/Core/Core/HW/MemmapFunctions.cpp | 3 --- 1 file changed, 3 deletions(-) diff --git a/Source/Core/Core/HW/MemmapFunctions.cpp b/Source/Core/Core/HW/MemmapFunctions.cpp index 64674168b3..22f21a2602 100644 --- a/Source/Core/Core/HW/MemmapFunctions.cpp +++ b/Source/Core/Core/HW/MemmapFunctions.cpp @@ -773,9 +773,6 @@ static __forceinline u32 TranslatePageAddress(const u32 _Address, const XCheckTL u32 pteg_addr = ((hash & PowerPC::ppcState.pagetable_hashmask) << 6) | PowerPC::ppcState.pagetable_base; - if ((pteg_addr >> 28) == 1) - base_mem = Memory::m_pEXRAM; - for (int i = 0; i < 8; i++) { u32 pte = bswap(*(u32*)&base_mem[pteg_addr]); From 8e1c92f2e705b6681c91cff7284dd02c87a304fe Mon Sep 17 00:00:00 2001 From: Fiora Date: Thu, 1 Jan 2015 17:56:14 -0800 Subject: [PATCH 06/19] MMU: remove goto in MMU fast past check Split into a separate patch to avoid rebasing conflicts and to split from functional changes. 
--- Source/Core/Core/HW/MemmapFunctions.cpp | 297 ++++++++++++------------ 1 file changed, 147 insertions(+), 150 deletions(-) diff --git a/Source/Core/Core/HW/MemmapFunctions.cpp b/Source/Core/Core/HW/MemmapFunctions.cpp index 22f21a2602..04736a650c 100644 --- a/Source/Core/Core/HW/MemmapFunctions.cpp +++ b/Source/Core/Core/HW/MemmapFunctions.cpp @@ -98,79 +98,79 @@ __forceinline void ReadFromHardware(U &_var, const u32 em_address) int segment = em_address >> 28; // Quick check for an address that can't meet any of the following conditions, // to speed up the MMU path. - if (BitSet32(0xCFC)[segment]) - goto translateaddress; - // TODO: Figure out the fastest order of tests for both read and write (they are probably different). - if ((em_address & 0xC8000000) == 0xC8000000) + if (!BitSet32(0xCFC)[segment]) { - if (em_address < 0xcc000000) - _var = EFB_Read(em_address); - else - _var = (T)mmio_mapping->Read::type>(em_address); - } - else if (segment == 0x8 || segment == 0xC || segment == 0x0) - { - _var = bswap((*(const T*)&m_pRAM[em_address & RAM_MASK])); - } - else if (m_pEXRAM && (segment == 0x9 || segment == 0xD || segment == 0x1)) - { - _var = bswap((*(const T*)&m_pEXRAM[em_address & EXRAM_MASK])); - } - else if (segment == 0xE && (em_address < (0xE0000000+L1_CACHE_SIZE))) - { - _var = bswap((*(const T*)&m_pL1Cache[em_address & L1_CACHE_MASK])); - } - else - { -translateaddress: - if (bFakeVMEM && (segment == 0x7 || segment == 0x4)) + // TODO: Figure out the fastest order of tests for both read and write (they are probably different). 
+ if ((em_address & 0xC8000000) == 0xC8000000) { - // fake VMEM - _var = bswap((*(const T*)&m_pFakeVMEM[em_address & FAKEVMEM_MASK])); + if (em_address < 0xcc000000) + _var = EFB_Read(em_address); + else + _var = (T)mmio_mapping->Read::type>(em_address); return; } + else if (segment == 0x8 || segment == 0xC || segment == 0x0) + { + _var = bswap((*(const T*)&m_pRAM[em_address & RAM_MASK])); + return; + } + else if (m_pEXRAM && (segment == 0x9 || segment == 0xD || segment == 0x1)) + { + _var = bswap((*(const T*)&m_pEXRAM[em_address & EXRAM_MASK])); + return; + } + else if (segment == 0xE && (em_address < (0xE0000000 + L1_CACHE_SIZE))) + { + _var = bswap((*(const T*)&m_pL1Cache[em_address & L1_CACHE_MASK])); + return; + } + } - // MMU: Do page table translation - u32 tlb_addr = TranslateAddress(em_address); - if (tlb_addr == 0) + if (bFakeVMEM && (segment == 0x7 || segment == 0x4)) + { + // fake VMEM + _var = bswap((*(const T*)&m_pFakeVMEM[em_address & FAKEVMEM_MASK])); + return; + } + + // MMU: Do page table translation + u32 tlb_addr = TranslateAddress(em_address); + if (tlb_addr == 0) + { + if (flag == FLAG_READ) + GenerateDSIException(em_address, false); + return; + } + + // Handle loads that cross page boundaries (ewwww) + // The alignment check isn't strictly necessary, but since this is a rare slow path, it provides a faster + // (1 instruction on x86) bailout. + if (sizeof(T) > 1 && (em_address & (sizeof(T) - 1)) && (em_address & (HW_PAGE_SIZE - 1)) > HW_PAGE_SIZE - sizeof(T)) + { + // This could be unaligned down to the byte level... hopefully this is rare, so doing it this + // way isn't too terrible. + // TODO: floats on non-word-aligned boundaries should technically cause alignment exceptions. + // Note that "word" means 32-bit, so paired singles or doubles might still be 32-bit aligned! 
+ u32 em_address_next_page = (em_address + sizeof(T) - 1) & ~(HW_PAGE_SIZE - 1); + u32 tlb_addr_next_page = TranslateAddress(em_address_next_page); + if (tlb_addr == 0 || tlb_addr_next_page == 0) { if (flag == FLAG_READ) - GenerateDSIException(em_address, false); + GenerateDSIException(em_address_next_page, false); return; } - - // Handle loads that cross page boundaries (ewwww) - // The alignment check isn't strictly necessary, but since this is a rare slow path, it provides a faster - // (1 instruction on x86) bailout. - if (sizeof(T) > 1 && (em_address & (sizeof(T) - 1)) && (em_address & (HW_PAGE_SIZE - 1)) > HW_PAGE_SIZE - sizeof(T)) + _var = 0; + for (u32 addr = em_address; addr < em_address + sizeof(T); addr++, tlb_addr++) { - // This could be unaligned down to the byte level... hopefully this is rare, so doing it this - // way isn't too terrible. - // TODO: floats on non-word-aligned boundaries should technically cause alignment exceptions. - // Note that "word" means 32-bit, so paired singles or doubles might still be 32-bit aligned! - u32 em_address_next_page = (em_address + sizeof(T) - 1) & ~(HW_PAGE_SIZE - 1); - u32 tlb_addr_next_page = TranslateAddress(em_address_next_page); - if (tlb_addr == 0 || tlb_addr_next_page == 0) - { - if (flag == FLAG_READ) - GenerateDSIException(em_address_next_page, false); - return; - } - _var = 0; - for (u32 addr = em_address; addr < em_address + sizeof(T); addr++, tlb_addr++) - { - if (addr == em_address_next_page) - tlb_addr = tlb_addr_next_page; - _var = (_var << 8) | Memory::base[tlb_addr]; - } - return; - } - else - { - // The easy case! - _var = bswap(*(const T*)&Memory::base[tlb_addr]); + if (addr == em_address_next_page) + tlb_addr = tlb_addr_next_page; + _var = (_var << 8) | Memory::base[tlb_addr]; } + return; } + + // The easy case! 
+ _var = bswap(*(const T*)&Memory::base[tlb_addr]); } @@ -180,107 +180,104 @@ __forceinline void WriteToHardware(u32 em_address, const T data) int segment = em_address >> 28; // Quick check for an address that can't meet any of the following conditions, // to speed up the MMU path. - if (BitSet32(0xCFC)[segment]) - goto translateaddress; - // First, let's check for FIFO writes, since they are probably the most common - // reason we end up in this function: - if ((em_address & 0xFFFFF000) == 0xCC008000) + if (!BitSet32(0xCFC)[segment]) { - switch (sizeof(T)) + // First, let's check for FIFO writes, since they are probably the most common + // reason we end up in this function: + if ((em_address & 0xFFFFF000) == 0xCC008000) { - case 1: GPFifo::Write8((u8)data, em_address); return; - case 2: GPFifo::Write16((u16)data, em_address); return; - case 4: GPFifo::Write32((u32)data, em_address); return; - case 8: GPFifo::Write64((u64)data, em_address); return; - } - } - if ((em_address & 0xC8000000) == 0xC8000000) - { - if (em_address < 0xcc000000) - { - int x = (em_address & 0xfff) >> 2; - int y = (em_address >> 12) & 0x3ff; - - // TODO figure out a way to send data without falling into the template trap - if (em_address & 0x00400000) + switch (sizeof(T)) { - g_video_backend->Video_AccessEFB(POKE_Z, x, y, (u32)data); - DEBUG_LOG(MEMMAP, "EFB Z Write %08x @ %i, %i", (u32)data, x, y); + case 1: GPFifo::Write8((u8)data, em_address); return; + case 2: GPFifo::Write16((u16)data, em_address); return; + case 4: GPFifo::Write32((u32)data, em_address); return; + case 8: GPFifo::Write64((u64)data, em_address); return; + } + } + if ((em_address & 0xC8000000) == 0xC8000000) + { + if (em_address < 0xcc000000) + { + int x = (em_address & 0xfff) >> 2; + int y = (em_address >> 12) & 0x3ff; + + // TODO figure out a way to send data without falling into the template trap + if (em_address & 0x00400000) + { + g_video_backend->Video_AccessEFB(POKE_Z, x, y, (u32)data); + DEBUG_LOG(MEMMAP, "EFB Z 
Write %08x @ %i, %i", (u32)data, x, y); + } + else + { + g_video_backend->Video_AccessEFB(POKE_COLOR, x, y, (u32)data); + DEBUG_LOG(MEMMAP, "EFB Color Write %08x @ %i, %i", (u32)data, x, y); + } + return; } else { - g_video_backend->Video_AccessEFB(POKE_COLOR, x, y,(u32)data); - DEBUG_LOG(MEMMAP, "EFB Color Write %08x @ %i, %i", (u32)data, x, y); - } - return; - } - else - { - mmio_mapping->Write(em_address, data); - return; - } - } - else if (segment == 0x8 || segment == 0xC || segment == 0x0) - { - *(T*)&m_pRAM[em_address & RAM_MASK] = bswap(data); - return; - } - else if (m_pEXRAM && (segment == 0x9 || segment == 0xD || segment == 0x1)) - { - *(T*)&m_pEXRAM[em_address & EXRAM_MASK] = bswap(data); - return; - } - else if (segment == 0xE && (em_address < (0xE0000000+L1_CACHE_SIZE))) - { - *(T*)&m_pL1Cache[em_address & L1_CACHE_MASK] = bswap(data); - return; - } - else - { -translateaddress: - if (bFakeVMEM && (segment == 0x7 || segment == 0x4)) - { - // fake VMEM - *(T*)&m_pFakeVMEM[em_address & FAKEVMEM_MASK] = bswap(data); - return; - } - - // MMU: Do page table translation - u32 tlb_addr = TranslateAddress(em_address); - if (tlb_addr == 0) - { - if (flag == FLAG_WRITE) - GenerateDSIException(em_address, true); - return; - } - - // Handle stores that cross page boundaries (ewwww) - if (sizeof(T) > 1 && (em_address & (sizeof(T) - 1)) && (em_address & (HW_PAGE_SIZE - 1)) > HW_PAGE_SIZE - sizeof(T)) - { - T val = bswap(data); - - // We need to check both addresses before writing in case there's a DSI. 
- u32 em_address_next_page = (em_address + sizeof(T) - 1) & ~(HW_PAGE_SIZE - 1); - u32 tlb_addr_next_page = TranslateAddress(em_address_next_page); - if (tlb_addr_next_page == 0) - { - if (flag == FLAG_WRITE) - GenerateDSIException(em_address_next_page, true); + mmio_mapping->Write(em_address, data); return; } - for (u32 addr = em_address; addr < em_address + sizeof(T); addr++, tlb_addr++, val >>= 8) - { - if (addr == em_address_next_page) - tlb_addr = tlb_addr_next_page; - Memory::base[tlb_addr] = (u8)val; - } } - else + else if (segment == 0x8 || segment == 0xC || segment == 0x0) { - // The easy case! - *(T*)&Memory::base[tlb_addr] = bswap(data); + *(T*)&m_pRAM[em_address & RAM_MASK] = bswap(data); + return; + } + else if (m_pEXRAM && (segment == 0x9 || segment == 0xD || segment == 0x1)) + { + *(T*)&m_pEXRAM[em_address & EXRAM_MASK] = bswap(data); + return; + } + else if (segment == 0xE && (em_address < (0xE0000000 + L1_CACHE_SIZE))) + { + *(T*)&m_pL1Cache[em_address & L1_CACHE_MASK] = bswap(data); + return; } } + + if (bFakeVMEM && (segment == 0x7 || segment == 0x4)) + { + // fake VMEM + *(T*)&m_pFakeVMEM[em_address & FAKEVMEM_MASK] = bswap(data); + return; + } + + // MMU: Do page table translation + u32 tlb_addr = TranslateAddress(em_address); + if (tlb_addr == 0) + { + if (flag == FLAG_WRITE) + GenerateDSIException(em_address, true); + return; + } + + // Handle stores that cross page boundaries (ewwww) + if (sizeof(T) > 1 && (em_address & (sizeof(T) - 1)) && (em_address & (HW_PAGE_SIZE - 1)) > HW_PAGE_SIZE - sizeof(T)) + { + T val = bswap(data); + + // We need to check both addresses before writing in case there's a DSI. 
+ u32 em_address_next_page = (em_address + sizeof(T) - 1) & ~(HW_PAGE_SIZE - 1); + u32 tlb_addr_next_page = TranslateAddress(em_address_next_page); + if (tlb_addr_next_page == 0) + { + if (flag == FLAG_WRITE) + GenerateDSIException(em_address_next_page, true); + return; + } + for (u32 addr = em_address; addr < em_address + sizeof(T); addr++, tlb_addr++, val >>= 8) + { + if (addr == em_address_next_page) + tlb_addr = tlb_addr_next_page; + Memory::base[tlb_addr] = (u8)val; + } + return; + } + + // The easy case! + *(T*)&Memory::base[tlb_addr] = bswap(data); } // ===================== From 190312e1a618a2895197b8c0f22ab82ee13d50bc Mon Sep 17 00:00:00 2001 From: Fiora Date: Fri, 2 Jan 2015 13:02:50 -0800 Subject: [PATCH 07/19] MMU: enable fastmem stores --- Source/Core/Core/PowerPC/JitCommon/Jit_Util.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/Source/Core/Core/PowerPC/JitCommon/Jit_Util.cpp b/Source/Core/Core/PowerPC/JitCommon/Jit_Util.cpp index ca217ee63d..7cd9cc57d6 100644 --- a/Source/Core/Core/PowerPC/JitCommon/Jit_Util.cpp +++ b/Source/Core/Core/PowerPC/JitCommon/Jit_Util.cpp @@ -547,8 +547,7 @@ void EmuCodeBlock::SafeWriteRegToReg(OpArg reg_value, X64Reg reg_addr, int acces reg_value = FixImmediate(accessSize, reg_value); // TODO: support byte-swapped non-immediate fastmem stores - if (!jit->js.memcheck && - SConfig::GetInstance().m_LocalCoreStartupParameter.bFastmem && + if (SConfig::GetInstance().m_LocalCoreStartupParameter.bFastmem && !(flags & SAFE_LOADSTORE_NO_FASTMEM) && (reg_value.IsImm() || !(flags & SAFE_LOADSTORE_NO_SWAP)) #ifdef ENABLE_MEM_CHECK From 0ff6ad5734fa66b994c3b9ae6e047bb50f2a16c7 Mon Sep 17 00:00:00 2001 From: Fiora Date: Fri, 2 Jan 2015 13:11:01 -0800 Subject: [PATCH 08/19] MMU: handle exception checks in fastmem Inspired by a patch by magumagu. 
--- Source/Core/Core/PowerPC/Jit64/Jit.cpp | 35 +++++++++---- .../Core/Core/PowerPC/Jit64/Jit_LoadStore.cpp | 40 +++++++------- .../PowerPC/Jit64/Jit_LoadStoreFloating.cpp | 52 +++++++++---------- .../PowerPC/Jit64/Jit_LoadStorePaired.cpp | 4 +- .../Core/PowerPC/JitCommon/JitBackpatch.cpp | 18 +++++-- Source/Core/Core/PowerPC/JitCommon/JitBase.h | 1 + .../Core/Core/PowerPC/JitCommon/Jit_Util.cpp | 6 ++- Source/Core/Core/PowerPC/JitCommon/Jit_Util.h | 11 ++-- .../PowerPC/JitCommon/TrampolineCache.cpp | 28 +++++++--- .../Core/PowerPC/JitCommon/TrampolineCache.h | 9 ++-- 10 files changed, 121 insertions(+), 83 deletions(-) diff --git a/Source/Core/Core/PowerPC/Jit64/Jit.cpp b/Source/Core/Core/PowerPC/Jit64/Jit.cpp index 99fea369f8..81d126ba3b 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit.cpp +++ b/Source/Core/Core/PowerPC/Jit64/Jit.cpp @@ -178,6 +178,7 @@ void Jit64::Init() jo.optimizeGatherPipe = true; jo.accurateSinglePrecision = true; js.memcheck = SConfig::GetInstance().m_LocalCoreStartupParameter.bMMU; + js.fastmemLoadStore = NULL; gpr.SetEmitter(this); fpr.SetEmitter(this); @@ -612,6 +613,7 @@ const u8* Jit64::DoJit(u32 em_address, PPCAnalyst::CodeBuffer *code_buf, JitBloc js.instructionsLeft = (code_block.m_num_instructions - 1) - i; const GekkoOPInfo *opinfo = ops[i].opinfo; js.downcountAmount += opinfo->numCycles; + js.fastmemLoadStore = NULL; if (i == (code_block.m_num_instructions - 1)) { @@ -761,19 +763,28 @@ const u8* Jit64::DoJit(u32 em_address, PPCAnalyst::CodeBuffer *code_buf, JitBloc Jit64Tables::CompileInstruction(ops[i]); - // If we have a register that will never be used again, flush it. 
- for (int j : ~ops[i].gprInUse) - gpr.StoreFromRegister(j); - for (int j : ~ops[i].fprInUse) - fpr.StoreFromRegister(j); - if (js.memcheck && (opinfo->flags & FL_LOADSTORE)) { - TEST(32, PPCSTATE(Exceptions), Imm32(EXCEPTION_DSI)); - FixupBranch memException = J_CC(CC_NZ, true); + // If we have a fastmem loadstore, we can omit the exception check and let fastmem handle it. + FixupBranch memException; + if (!js.fastmemLoadStore) + { + TEST(32, PPCSTATE(Exceptions), Imm32(EXCEPTION_DSI)); + memException = J_CC(CC_NZ, true); + } SwitchToFarCode(); - SetJumpTarget(memException); + if (!js.fastmemLoadStore) + { + exceptionHandlerAtLoc[js.fastmemLoadStore] = NULL; + SetJumpTarget(memException); + } + else + { + exceptionHandlerAtLoc[js.fastmemLoadStore] = GetWritableCodePtr(); + // the fastmem trampoline is jumping here, so we need to pop the return stack + ADD(64, R(RSP), Imm8(8)); + } gpr.Flush(FLUSH_MAINTAIN_STATE); fpr.Flush(FLUSH_MAINTAIN_STATE); @@ -785,6 +796,12 @@ const u8* Jit64::DoJit(u32 em_address, PPCAnalyst::CodeBuffer *code_buf, JitBloc SwitchToNearCode(); } + // If we have a register that will never be used again, flush it. + for (int j : ~ops[i].gprInUse) + gpr.StoreFromRegister(j); + for (int j : ~ops[i].fprInUse) + fpr.StoreFromRegister(j); + if (opinfo->flags & FL_LOADSTORE) ++jit->js.numLoadStoreInst; diff --git a/Source/Core/Core/PowerPC/Jit64/Jit_LoadStore.cpp b/Source/Core/Core/PowerPC/Jit64/Jit_LoadStore.cpp index bf6f76beda..ae6b50f9cf 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit_LoadStore.cpp +++ b/Source/Core/Core/PowerPC/Jit64/Jit_LoadStore.cpp @@ -246,19 +246,19 @@ void Jit64::lXXx(UGeckoInstruction inst) } gpr.Lock(a, b, d); + if (update && storeAddress) + gpr.BindToRegister(a, true, true); gpr.BindToRegister(d, js.memcheck, true); BitSet32 registersInUse = CallerSavedRegistersInUse(); + // We need to save the (usually scratch) address register for the update. 
if (update && storeAddress) - { - // We need to save the (usually scratch) address register for the update. registersInUse[RSCRATCH2] = true; - } + SafeLoadToReg(gpr.RX(d), opAddress, accessSize, loadOffset, registersInUse, signExtend); if (update && storeAddress) { - gpr.BindToRegister(a, true, true); - MEMCHECK_START(false) + MEMCHECK_START MOV(32, gpr.R(a), opAddress); MEMCHECK_END } @@ -266,7 +266,7 @@ void Jit64::lXXx(UGeckoInstruction inst) // TODO: support no-swap in SafeLoadToReg instead if (byte_reversed) { - MEMCHECK_START(false) + MEMCHECK_START BSWAP(accessSize, gpr.RX(d)); MEMCHECK_END } @@ -372,7 +372,7 @@ void Jit64::stX(UGeckoInstruction inst) else { gpr.KillImmediate(a, true, true); - MEMCHECK_START(false) + MEMCHECK_START ADD(32, gpr.R(a), Imm32((u32)offset)); MEMCHECK_END } @@ -404,7 +404,7 @@ void Jit64::stX(UGeckoInstruction inst) if (update) { - MEMCHECK_START(false) + MEMCHECK_START ADD(32, gpr.R(a), Imm32((u32)offset)); MEMCHECK_END } @@ -425,12 +425,9 @@ void Jit64::stXx(UGeckoInstruction inst) gpr.Lock(a, b, s); if (update) - { gpr.BindToRegister(a, true, true); - ADD(32, gpr.R(a), gpr.R(b)); - MOV(32, R(RSCRATCH2), gpr.R(a)); - } - else if (gpr.R(a).IsSimpleReg() && gpr.R(b).IsSimpleReg()) + + if (gpr.R(a).IsSimpleReg() && gpr.R(b).IsSimpleReg()) { LEA(32, RSCRATCH2, MComplex(gpr.RX(a), gpr.RX(b), SCALE_1, 0)); } @@ -462,7 +459,10 @@ void Jit64::stXx(UGeckoInstruction inst) if (gpr.R(s).IsImm()) { - SafeWriteRegToReg(gpr.R(s), RSCRATCH2, accessSize, 0, CallerSavedRegistersInUse(), byte_reverse ? SAFE_LOADSTORE_NO_SWAP : 0); + BitSet32 registersInUse = CallerSavedRegistersInUse(); + if (update) + registersInUse[RSCRATCH2] = true; + SafeWriteRegToReg(gpr.R(s), RSCRATCH2, accessSize, 0, registersInUse, byte_reverse ? 
SAFE_LOADSTORE_NO_SWAP : 0); } else { @@ -477,14 +477,16 @@ void Jit64::stXx(UGeckoInstruction inst) gpr.BindToRegister(s, true, false); reg_value = gpr.RX(s); } - SafeWriteRegToReg(reg_value, RSCRATCH2, accessSize, 0, CallerSavedRegistersInUse(), byte_reverse ? SAFE_LOADSTORE_NO_SWAP : 0); + BitSet32 registersInUse = CallerSavedRegistersInUse(); + if (update) + registersInUse[RSCRATCH2] = true; + SafeWriteRegToReg(reg_value, RSCRATCH2, accessSize, 0, registersInUse, byte_reverse ? SAFE_LOADSTORE_NO_SWAP : 0); } - if (update && js.memcheck) + if (update) { - // revert the address change if an exception occurred - MEMCHECK_START(true) - SUB(32, gpr.R(a), gpr.R(b)); + MEMCHECK_START + MOV(32, gpr.R(a), R(RSCRATCH2)); MEMCHECK_END; } diff --git a/Source/Core/Core/PowerPC/Jit64/Jit_LoadStoreFloating.cpp b/Source/Core/Core/PowerPC/Jit64/Jit_LoadStoreFloating.cpp index 2a246b3a0b..061759857c 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit_LoadStoreFloating.cpp +++ b/Source/Core/Core/PowerPC/Jit64/Jit_LoadStoreFloating.cpp @@ -46,9 +46,9 @@ void Jit64::lfXXX(UGeckoInstruction inst) } else { - addr = R(RSCRATCH); + addr = R(RSCRATCH2); if (a && gpr.R(a).IsSimpleReg() && gpr.R(b).IsSimpleReg()) - LEA(32, RSCRATCH, MComplex(gpr.RX(a), gpr.RX(b), SCALE_1, 0)); + LEA(32, RSCRATCH2, MComplex(gpr.RX(a), gpr.RX(b), SCALE_1, 0)); else { MOV(32, addr, gpr.R(b)); @@ -65,14 +65,14 @@ void Jit64::lfXXX(UGeckoInstruction inst) offset = (s16)inst.SIMM_16; } + fpr.Lock(d); + fpr.BindToRegister(d, js.memcheck || !single); BitSet32 registersInUse = CallerSavedRegistersInUse(); if (update && js.memcheck) registersInUse[RSCRATCH2] = true; SafeLoadToReg(RSCRATCH, addr, single ? 
32 : 64, offset, registersInUse, false); - fpr.Lock(d); - fpr.BindToRegister(d, js.memcheck || !single); - MEMCHECK_START(false) + MEMCHECK_START if (single) { ConvertSingleToDouble(fpr.RX(d), RSCRATCH, true); @@ -141,7 +141,7 @@ void Jit64::stfXXX(UGeckoInstruction inst) else { gpr.KillImmediate(a, true, true); - MEMCHECK_START(false) + MEMCHECK_START ADD(32, gpr.R(a), Imm32((u32)imm)); MEMCHECK_END } @@ -152,47 +152,43 @@ void Jit64::stfXXX(UGeckoInstruction inst) } s32 offset = 0; + if (update) + gpr.BindToRegister(a, true, true); if (indexed) { - if (update) - { - gpr.BindToRegister(a, true, true); - ADD(32, gpr.R(a), gpr.R(b)); - MOV(32, R(RSCRATCH2), gpr.R(a)); - } + if (a && gpr.R(a).IsSimpleReg() && gpr.R(b).IsSimpleReg()) + LEA(32, RSCRATCH2, MComplex(gpr.RX(a), gpr.RX(b), SCALE_1, 0)); else { - if (a && gpr.R(a).IsSimpleReg() && gpr.R(b).IsSimpleReg()) - LEA(32, RSCRATCH2, MComplex(gpr.RX(a), gpr.RX(b), SCALE_1, 0)); - else - { - MOV(32, R(RSCRATCH2), gpr.R(b)); - if (a) - ADD(32, R(RSCRATCH2), gpr.R(a)); - } + MOV(32, R(RSCRATCH2), gpr.R(b)); + if (a) + ADD(32, R(RSCRATCH2), gpr.R(a)); } } else { if (update) { - gpr.BindToRegister(a, true, true); - ADD(32, gpr.R(a), Imm32(imm)); + LEA(32, RSCRATCH2, MDisp(gpr.RX(a), imm)); } else { offset = imm; + MOV(32, R(RSCRATCH2), gpr.R(a)); } - MOV(32, R(RSCRATCH2), gpr.R(a)); } - SafeWriteRegToReg(RSCRATCH, RSCRATCH2, accessSize, offset, CallerSavedRegistersInUse()); + BitSet32 registersInUse = CallerSavedRegistersInUse(); + // We need to save the (usually scratch) address register for the update. + if (update) + registersInUse[RSCRATCH2] = true; - if (js.memcheck && update) + SafeWriteRegToReg(RSCRATCH, RSCRATCH2, accessSize, offset, registersInUse); + + if (update) { - // revert the address change if an exception occurred - MEMCHECK_START(true) - SUB(32, gpr.R(a), indexed ? 
gpr.R(b) : Imm32(imm)); + MEMCHECK_START + MOV(32, gpr.R(a), R(RSCRATCH2)); MEMCHECK_END } diff --git a/Source/Core/Core/PowerPC/Jit64/Jit_LoadStorePaired.cpp b/Source/Core/Core/PowerPC/Jit64/Jit_LoadStorePaired.cpp index 26a4f022e7..160547bf9a 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit_LoadStorePaired.cpp +++ b/Source/Core/Core/PowerPC/Jit64/Jit_LoadStorePaired.cpp @@ -78,7 +78,7 @@ void Jit64::psq_stXX(UGeckoInstruction inst) if (update && js.memcheck) { - MEMCHECK_START(false) + MEMCHECK_START if (indexed) ADD(32, gpr.R(a), gpr.R(b)); else @@ -137,7 +137,7 @@ void Jit64::psq_lXX(UGeckoInstruction inst) CALLptr(MScaled(RSCRATCH, SCALE_8, (u32)(u64)(&asm_routines.pairedLoadQuantized[w * 8]))); - MEMCHECK_START(false) + MEMCHECK_START CVTPS2PD(fpr.RX(s), R(XMM0)); if (update && js.memcheck) { diff --git a/Source/Core/Core/PowerPC/JitCommon/JitBackpatch.cpp b/Source/Core/Core/PowerPC/JitCommon/JitBackpatch.cpp index 3a693a3c71..d165915c99 100644 --- a/Source/Core/Core/PowerPC/JitCommon/JitBackpatch.cpp +++ b/Source/Core/Core/PowerPC/JitCommon/JitBackpatch.cpp @@ -73,6 +73,14 @@ bool Jitx86Base::BackPatch(u32 emAddress, SContext* ctx) BitSet32 registersInUse = it->second; + u8* exceptionHandler = NULL; + if (jit->js.memcheck) + { + auto it2 = exceptionHandlerAtLoc.find(codePtr); + if (it2 != exceptionHandlerAtLoc.end()) + exceptionHandler = it2->second; + } + if (!info.isMemoryWrite) { XEmitter emitter(codePtr); @@ -101,7 +109,7 @@ bool Jitx86Base::BackPatch(u32 emAddress, SContext* ctx) totalSize += 3; } - const u8 *trampoline = trampolines.GetReadTrampoline(info, registersInUse); + const u8 *trampoline = trampolines.GetReadTrampoline(info, registersInUse, exceptionHandler); emitter.CALL((void *)trampoline); int padding = totalSize - BACKPATCH_SIZE; if (padding > 0) @@ -113,14 +121,14 @@ bool Jitx86Base::BackPatch(u32 emAddress, SContext* ctx) else { // TODO: special case FIFO writes. Also, support 32-bit mode. 
- auto it2 = pcAtLoc.find(codePtr); - if (it2 == pcAtLoc.end()) + auto it3 = pcAtLoc.find(codePtr); + if (it3 == pcAtLoc.end()) { PanicAlert("BackPatch: no pc entry for address %p", codePtr); return nullptr; } - u32 pc = it2->second; + u32 pc = it3->second; u8 *start; if (info.byteSwap || info.hasImmediate) @@ -154,7 +162,7 @@ bool Jitx86Base::BackPatch(u32 emAddress, SContext* ctx) start = codePtr - bswapSize; } XEmitter emitter(start); - const u8 *trampoline = trampolines.GetWriteTrampoline(info, registersInUse, pc); + const u8 *trampoline = trampolines.GetWriteTrampoline(info, registersInUse, exceptionHandler, pc); emitter.CALL((void *)trampoline); ptrdiff_t padding = (codePtr - emitter.GetCodePtr()) + info.instructionSize; if (padding > 0) diff --git a/Source/Core/Core/PowerPC/JitCommon/JitBase.h b/Source/Core/Core/PowerPC/JitCommon/JitBase.h index 8af315a7fa..99d89e548e 100644 --- a/Source/Core/Core/PowerPC/JitCommon/JitBase.h +++ b/Source/Core/Core/PowerPC/JitCommon/JitBase.h @@ -73,6 +73,7 @@ protected: int downcountAmount; u32 numLoadStoreInst; u32 numFloatingPointInst; + u8* fastmemLoadStore; bool firstFPInstructionFound; bool isLastInstruction; diff --git a/Source/Core/Core/PowerPC/JitCommon/Jit_Util.cpp b/Source/Core/Core/PowerPC/JitCommon/Jit_Util.cpp index 7cd9cc57d6..35a74201ae 100644 --- a/Source/Core/Core/PowerPC/JitCommon/Jit_Util.cpp +++ b/Source/Core/Core/PowerPC/JitCommon/Jit_Util.cpp @@ -307,6 +307,7 @@ void EmuCodeBlock::SafeLoadToReg(X64Reg reg_value, const Gen::OpArg & opAddress, u8 *mov = UnsafeLoadToReg(reg_value, opAddress, accessSize, offset, signExtend); registersInUseAtLoc[mov] = registersInUse; + jit->js.fastmemLoadStore = mov; } else { @@ -349,7 +350,7 @@ void EmuCodeBlock::SafeLoadToReg(X64Reg reg_value, const Gen::OpArg & opAddress, } ABI_PopRegistersAndAdjustStack(registersInUse, 0); - MEMCHECK_START(false) + MEMCHECK_START if (signExtend && accessSize < 32) { // Need to sign extend values coming from the Read_U* functions. 
@@ -399,7 +400,7 @@ void EmuCodeBlock::SafeLoadToReg(X64Reg reg_value, const Gen::OpArg & opAddress, } ABI_PopRegistersAndAdjustStack(registersInUse, rsp_alignment); - MEMCHECK_START(false) + MEMCHECK_START if (signExtend && accessSize < 32) { // Need to sign extend values coming from the Read_U* functions. @@ -565,6 +566,7 @@ void EmuCodeBlock::SafeWriteRegToReg(OpArg reg_value, X64Reg reg_addr, int acces registersInUseAtLoc[mov] = registersInUse; pcAtLoc[mov] = jit->js.compilerPC; + jit->js.fastmemLoadStore = mov; return; } diff --git a/Source/Core/Core/PowerPC/JitCommon/Jit_Util.h b/Source/Core/Core/PowerPC/JitCommon/Jit_Util.h index 378465120c..322425b9d4 100644 --- a/Source/Core/Core/PowerPC/JitCommon/Jit_Util.h +++ b/Source/Core/Core/PowerPC/JitCommon/Jit_Util.h @@ -12,16 +12,14 @@ namespace MMIO { class Mapping; } -// If inv is true, invert the check (i.e. skip over the associated code if an exception hits, -// instead of skipping over the code if an exception isn't hit). -#define MEMCHECK_START(inv) \ +#define MEMCHECK_START \ Gen::FixupBranch memException; \ - if (jit->js.memcheck) \ + if (jit->js.memcheck && !jit->js.fastmemLoadStore) \ { TEST(32, PPCSTATE(Exceptions), Gen::Imm32(EXCEPTION_DSI)); \ - memException = J_CC((inv) ? 
Gen::CC_Z : Gen::CC_NZ, true); } + memException = J_CC(Gen::CC_NZ, true); } #define MEMCHECK_END \ - if (jit->js.memcheck) \ + if (jit->js.memcheck && !jit->js.fastmemLoadStore) \ SetJumpTarget(memException); // We offset by 0x80 because the range of one byte memory offsets is @@ -141,4 +139,5 @@ public: protected: std::unordered_map registersInUseAtLoc; std::unordered_map pcAtLoc; + std::unordered_map exceptionHandlerAtLoc; }; diff --git a/Source/Core/Core/PowerPC/JitCommon/TrampolineCache.cpp b/Source/Core/Core/PowerPC/JitCommon/TrampolineCache.cpp index 2561b436af..4f900621b2 100644 --- a/Source/Core/Core/PowerPC/JitCommon/TrampolineCache.cpp +++ b/Source/Core/Core/PowerPC/JitCommon/TrampolineCache.cpp @@ -36,20 +36,20 @@ void TrampolineCache::Shutdown() cachedTrampolines.clear(); } -const u8* TrampolineCache::GetReadTrampoline(const InstructionInfo &info, BitSet32 registersInUse) +const u8* TrampolineCache::GetReadTrampoline(const InstructionInfo &info, BitSet32 registersInUse, u8* exceptionHandler) { - TrampolineCacheKey key = { registersInUse, 0, info }; + TrampolineCacheKey key = { registersInUse, exceptionHandler, 0, info }; auto it = cachedTrampolines.find(key); if (it != cachedTrampolines.end()) return it->second; - const u8* trampoline = GenerateReadTrampoline(info, registersInUse); + const u8* trampoline = GenerateReadTrampoline(info, registersInUse, exceptionHandler); cachedTrampolines[key] = trampoline; return trampoline; } -const u8* TrampolineCache::GenerateReadTrampoline(const InstructionInfo &info, BitSet32 registersInUse) +const u8* TrampolineCache::GenerateReadTrampoline(const InstructionInfo &info, BitSet32 registersInUse, u8* exceptionHandler) { if (GetSpaceLeft() < 1024) PanicAlert("Trampoline cache full"); @@ -90,24 +90,29 @@ const u8* TrampolineCache::GenerateReadTrampoline(const InstructionInfo &info, B MOV(dataRegSize, R(dataReg), R(ABI_RETURN)); ABI_PopRegistersAndAdjustStack(registersInUse, 8); + if (exceptionHandler) + { + TEST(32, 
PPCSTATE(Exceptions), Imm32(EXCEPTION_DSI)); + J_CC(CC_NZ, exceptionHandler); + } RET(); return trampoline; } -const u8* TrampolineCache::GetWriteTrampoline(const InstructionInfo &info, BitSet32 registersInUse, u32 pc) +const u8* TrampolineCache::GetWriteTrampoline(const InstructionInfo &info, BitSet32 registersInUse, u8* exceptionHandler, u32 pc) { - TrampolineCacheKey key = { registersInUse, pc, info }; + TrampolineCacheKey key = { registersInUse, exceptionHandler, pc, info }; auto it = cachedTrampolines.find(key); if (it != cachedTrampolines.end()) return it->second; - const u8* trampoline = GenerateWriteTrampoline(info, registersInUse, pc); + const u8* trampoline = GenerateWriteTrampoline(info, registersInUse, exceptionHandler, pc); cachedTrampolines[key] = trampoline; return trampoline; } -const u8* TrampolineCache::GenerateWriteTrampoline(const InstructionInfo &info, BitSet32 registersInUse, u32 pc) +const u8* TrampolineCache::GenerateWriteTrampoline(const InstructionInfo &info, BitSet32 registersInUse, u8* exceptionHandler, u32 pc) { if (GetSpaceLeft() < 1024) PanicAlert("Trampoline cache full"); @@ -174,6 +179,11 @@ const u8* TrampolineCache::GenerateWriteTrampoline(const InstructionInfo &info, } ABI_PopRegistersAndAdjustStack(registersInUse, 8); + if (exceptionHandler) + { + TEST(32, PPCSTATE(Exceptions), Imm32(EXCEPTION_DSI)); + J_CC(CC_NZ, exceptionHandler); + } RET(); return trampoline; @@ -191,6 +201,7 @@ size_t TrampolineCacheKeyHasher::operator()(const TrampolineCacheKey& k) const res ^= std::hash()(k.info.signExtend) << 2; res ^= std::hash()(k.info.hasImmediate) << 3; res ^= std::hash()(k.info.isMemoryWrite) << 4; + res ^= std::hash()(k.exceptionHandler) << 5; return res; } @@ -199,5 +210,6 @@ bool TrampolineCacheKey::operator==(const TrampolineCacheKey &other) const { return pc == other.pc && registersInUse == other.registersInUse && + exceptionHandler == other.exceptionHandler && info == other.info; } diff --git 
a/Source/Core/Core/PowerPC/JitCommon/TrampolineCache.h b/Source/Core/Core/PowerPC/JitCommon/TrampolineCache.h index 16e293bce0..f2cdb2ba92 100644 --- a/Source/Core/Core/PowerPC/JitCommon/TrampolineCache.h +++ b/Source/Core/Core/PowerPC/JitCommon/TrampolineCache.h @@ -17,6 +17,7 @@ const int BACKPATCH_SIZE = 5; struct TrampolineCacheKey { BitSet32 registersInUse; + u8* exceptionHandler; u32 pc; InstructionInfo info; @@ -34,13 +35,13 @@ public: void Init(); void Shutdown(); - const u8* GetReadTrampoline(const InstructionInfo &info, BitSet32 registersInUse); - const u8* GetWriteTrampoline(const InstructionInfo &info, BitSet32 registersInUse, u32 pc); + const u8* GetReadTrampoline(const InstructionInfo &info, BitSet32 registersInUse, u8* exceptionHandler); + const u8* GetWriteTrampoline(const InstructionInfo &info, BitSet32 registersInUse, u8* exceptionHandler, u32 pc); void ClearCodeSpace(); private: - const u8* GenerateReadTrampoline(const InstructionInfo &info, BitSet32 registersInUse); - const u8* GenerateWriteTrampoline(const InstructionInfo &info, BitSet32 registersInUse, u32 pc); + const u8* GenerateReadTrampoline(const InstructionInfo &info, BitSet32 registersInUse, u8* exceptionHandler); + const u8* GenerateWriteTrampoline(const InstructionInfo &info, BitSet32 registersInUse, u8* exceptionHandler, u32 pc); std::unordered_map cachedTrampolines; }; From 6dc7cf29f3b938330ea0947756b827b5de493daf Mon Sep 17 00:00:00 2001 From: Fiora Date: Fri, 2 Jan 2015 13:38:24 -0800 Subject: [PATCH 09/19] JIT: implement crset special case Rebel Strike seems to use this one. --- Source/Core/Core/PowerPC/Jit64/Jit.h | 1 + .../PowerPC/Jit64/Jit_SystemRegisters.cpp | 42 +++++++++++++++++++ 2 files changed, 43 insertions(+) diff --git a/Source/Core/Core/PowerPC/Jit64/Jit.h b/Source/Core/Core/PowerPC/Jit64/Jit.h index 48b8e5ada3..7da231d70e 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit.h +++ b/Source/Core/Core/PowerPC/Jit64/Jit.h @@ -133,6 +133,7 @@ public: // Clobbers RDX. 
void SetCRFieldBit(int field, int bit, Gen::X64Reg in); void ClearCRFieldBit(int field, int bit); + void SetCRFieldBit(int field, int bit); // Generates a branch that will check if a given bit of a CR register part // is set or not. diff --git a/Source/Core/Core/PowerPC/Jit64/Jit_SystemRegisters.cpp b/Source/Core/Core/PowerPC/Jit64/Jit_SystemRegisters.cpp index af5a187f91..b5b91e9fb7 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit_SystemRegisters.cpp +++ b/Source/Core/Core/PowerPC/Jit64/Jit_SystemRegisters.cpp @@ -112,6 +112,41 @@ void Jit64::ClearCRFieldBit(int field, int bit) // We don't need to set bit 32; the cases where that's needed only come up when setting bits, not clearing. } +void Jit64::SetCRFieldBit(int field, int bit) +{ + MOV(64, R(RSCRATCH), PPCSTATE(cr_val[field])); + if (bit != CR_GT_BIT) + { + TEST(64, R(RSCRATCH), R(RSCRATCH)); + FixupBranch dont_clear_gt = J_CC(CC_NZ); + BTS(64, R(RSCRATCH), Imm8(63)); + SetJumpTarget(dont_clear_gt); + } + + switch (bit) + { + case CR_SO_BIT: + BTS(64, PPCSTATE(cr_val[field]), Imm8(61)); + break; + + case CR_EQ_BIT: + SHR(64, R(RSCRATCH), Imm8(32)); + SHL(64, R(RSCRATCH), Imm8(32)); + break; + + case CR_GT_BIT: + BTR(64, PPCSTATE(cr_val[field]), Imm8(63)); + break; + + case CR_LT_BIT: + BTS(64, PPCSTATE(cr_val[field]), Imm8(62)); + break; + } + + BTS(64, R(RSCRATCH), Imm8(32)); + MOV(64, PPCSTATE(cr_val[field]), R(RSCRATCH)); +} + FixupBranch Jit64::JumpIfCRFieldBit(int field, int bit, bool jump_if_set) { switch (bit) @@ -506,6 +541,13 @@ void Jit64::crXXX(UGeckoInstruction inst) return; } + // Special case: crset + if (inst.CRBA == inst.CRBB && inst.CRBA == inst.CRBD && inst.SUBOP10 == 289) + { + SetCRFieldBit(inst.CRBD >> 2, 3 - (inst.CRBD & 3)); + return; + } + // TODO(delroth): Potential optimizations could be applied here. For // instance, if the two CR bits being loaded are the same, two loads are // not required. 
From 9923d705df90fb276ced10cafda94e050305ca0c Mon Sep 17 00:00:00 2001 From: Fiora Date: Fri, 2 Jan 2015 13:55:16 -0800 Subject: [PATCH 10/19] JIT: simplify and optimize memcheck macros Instead of jumping over update code and similar, just jump directly to the handler. This avoids redundant exception checks in the case where we can't do fastmem memory operations (e.g. paired loadstore). --- Source/Core/Core/PowerPC/Jit64/Jit.cpp | 7 +++++-- Source/Core/Core/PowerPC/Jit64/Jit_LoadStore.cpp | 15 +++++---------- .../Core/PowerPC/Jit64/Jit_LoadStoreFloating.cpp | 9 +++------ .../Core/PowerPC/Jit64/Jit_LoadStorePaired.cpp | 6 ++---- Source/Core/Core/PowerPC/JitCommon/JitBase.h | 5 +++++ Source/Core/Core/PowerPC/JitCommon/Jit_Util.cpp | 16 ++++++++++++---- Source/Core/Core/PowerPC/JitCommon/Jit_Util.h | 12 ++---------- 7 files changed, 34 insertions(+), 36 deletions(-) diff --git a/Source/Core/Core/PowerPC/Jit64/Jit.cpp b/Source/Core/Core/PowerPC/Jit64/Jit.cpp index 81d126ba3b..77905ca8ae 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit.cpp +++ b/Source/Core/Core/PowerPC/Jit64/Jit.cpp @@ -614,6 +614,7 @@ const u8* Jit64::DoJit(u32 em_address, PPCAnalyst::CodeBuffer *code_buf, JitBloc const GekkoOPInfo *opinfo = ops[i].opinfo; js.downcountAmount += opinfo->numCycles; js.fastmemLoadStore = NULL; + js.fixupExceptionHandler = false; if (i == (code_block.m_num_instructions - 1)) { @@ -767,7 +768,9 @@ const u8* Jit64::DoJit(u32 em_address, PPCAnalyst::CodeBuffer *code_buf, JitBloc { // If we have a fastmem loadstore, we can omit the exception check and let fastmem handle it. 
FixupBranch memException; - if (!js.fastmemLoadStore) + _assert_msg_(DYNA_REC, !(js.fastmemLoadStore && js.fixupExceptionHandler), + "Fastmem loadstores shouldn't have exception handler fixups (PC=%x)!", ops[i].address); + if (!js.fastmemLoadStore && !js.fixupExceptionHandler) { TEST(32, PPCSTATE(Exceptions), Imm32(EXCEPTION_DSI)); memException = J_CC(CC_NZ, true); @@ -777,7 +780,7 @@ const u8* Jit64::DoJit(u32 em_address, PPCAnalyst::CodeBuffer *code_buf, JitBloc if (!js.fastmemLoadStore) { exceptionHandlerAtLoc[js.fastmemLoadStore] = NULL; - SetJumpTarget(memException); + SetJumpTarget(js.fixupExceptionHandler ? js.exceptionHandler : memException); } else { diff --git a/Source/Core/Core/PowerPC/Jit64/Jit_LoadStore.cpp b/Source/Core/Core/PowerPC/Jit64/Jit_LoadStore.cpp index ae6b50f9cf..1edd6b3868 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit_LoadStore.cpp +++ b/Source/Core/Core/PowerPC/Jit64/Jit_LoadStore.cpp @@ -258,17 +258,15 @@ void Jit64::lXXx(UGeckoInstruction inst) if (update && storeAddress) { - MEMCHECK_START + MemoryExceptionCheck(); MOV(32, gpr.R(a), opAddress); - MEMCHECK_END } // TODO: support no-swap in SafeLoadToReg instead if (byte_reversed) { - MEMCHECK_START + MemoryExceptionCheck(); BSWAP(accessSize, gpr.RX(d)); - MEMCHECK_END } gpr.UnlockAll(); @@ -372,9 +370,8 @@ void Jit64::stX(UGeckoInstruction inst) else { gpr.KillImmediate(a, true, true); - MEMCHECK_START + MemoryExceptionCheck(); ADD(32, gpr.R(a), Imm32((u32)offset)); - MEMCHECK_END } } } @@ -404,9 +401,8 @@ void Jit64::stX(UGeckoInstruction inst) if (update) { - MEMCHECK_START + MemoryExceptionCheck(); ADD(32, gpr.R(a), Imm32((u32)offset)); - MEMCHECK_END } } gpr.UnlockAll(); @@ -485,9 +481,8 @@ void Jit64::stXx(UGeckoInstruction inst) if (update) { - MEMCHECK_START + MemoryExceptionCheck(); MOV(32, gpr.R(a), R(RSCRATCH2)); - MEMCHECK_END; } gpr.UnlockAll(); diff --git a/Source/Core/Core/PowerPC/Jit64/Jit_LoadStoreFloating.cpp b/Source/Core/Core/PowerPC/Jit64/Jit_LoadStoreFloating.cpp 
index 061759857c..4cfbc3b756 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit_LoadStoreFloating.cpp +++ b/Source/Core/Core/PowerPC/Jit64/Jit_LoadStoreFloating.cpp @@ -72,7 +72,7 @@ void Jit64::lfXXX(UGeckoInstruction inst) registersInUse[RSCRATCH2] = true; SafeLoadToReg(RSCRATCH, addr, single ? 32 : 64, offset, registersInUse, false); - MEMCHECK_START + MemoryExceptionCheck(); if (single) { ConvertSingleToDouble(fpr.RX(d), RSCRATCH, true); @@ -84,7 +84,6 @@ void Jit64::lfXXX(UGeckoInstruction inst) } if (update && js.memcheck) MOV(32, gpr.R(a), addr); - MEMCHECK_END fpr.UnlockAll(); gpr.UnlockAll(); } @@ -141,9 +140,8 @@ void Jit64::stfXXX(UGeckoInstruction inst) else { gpr.KillImmediate(a, true, true); - MEMCHECK_START + MemoryExceptionCheck(); ADD(32, gpr.R(a), Imm32((u32)imm)); - MEMCHECK_END } } fpr.UnlockAll(); @@ -187,9 +185,8 @@ void Jit64::stfXXX(UGeckoInstruction inst) if (update) { - MEMCHECK_START + MemoryExceptionCheck(); MOV(32, gpr.R(a), R(RSCRATCH2)); - MEMCHECK_END } fpr.UnlockAll(); diff --git a/Source/Core/Core/PowerPC/Jit64/Jit_LoadStorePaired.cpp b/Source/Core/Core/PowerPC/Jit64/Jit_LoadStorePaired.cpp index 160547bf9a..b6dac78f86 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit_LoadStorePaired.cpp +++ b/Source/Core/Core/PowerPC/Jit64/Jit_LoadStorePaired.cpp @@ -78,12 +78,11 @@ void Jit64::psq_stXX(UGeckoInstruction inst) if (update && js.memcheck) { - MEMCHECK_START + MemoryExceptionCheck(); if (indexed) ADD(32, gpr.R(a), gpr.R(b)); else ADD(32, gpr.R(a), Imm32((u32)offset)); - MEMCHECK_END } gpr.UnlockAll(); gpr.UnlockAllX(); @@ -137,7 +136,7 @@ void Jit64::psq_lXX(UGeckoInstruction inst) CALLptr(MScaled(RSCRATCH, SCALE_8, (u32)(u64)(&asm_routines.pairedLoadQuantized[w * 8]))); - MEMCHECK_START + MemoryExceptionCheck(); CVTPS2PD(fpr.RX(s), R(XMM0)); if (update && js.memcheck) { @@ -146,7 +145,6 @@ void Jit64::psq_lXX(UGeckoInstruction inst) else ADD(32, gpr.R(a), Imm32((u32)offset)); } - MEMCHECK_END gpr.UnlockAll(); gpr.UnlockAllX(); diff --git 
a/Source/Core/Core/PowerPC/JitCommon/JitBase.h b/Source/Core/Core/PowerPC/JitCommon/JitBase.h index 99d89e548e..5a526f8f48 100644 --- a/Source/Core/Core/PowerPC/JitCommon/JitBase.h +++ b/Source/Core/Core/PowerPC/JitCommon/JitBase.h @@ -73,7 +73,12 @@ protected: int downcountAmount; u32 numLoadStoreInst; u32 numFloatingPointInst; + // If this is set, we need to generate an exception handler for the fastmem load. u8* fastmemLoadStore; + // If this is set, a load or store already prepared a jump to the exception handler for us, + // so just fixup that branch instead of testing for a DSI again. + bool fixupExceptionHandler; + Gen::FixupBranch exceptionHandler; bool firstFPInstructionFound; bool isLastInstruction; diff --git a/Source/Core/Core/PowerPC/JitCommon/Jit_Util.cpp b/Source/Core/Core/PowerPC/JitCommon/Jit_Util.cpp index 35a74201ae..fef3e90677 100644 --- a/Source/Core/Core/PowerPC/JitCommon/Jit_Util.cpp +++ b/Source/Core/Core/PowerPC/JitCommon/Jit_Util.cpp @@ -13,6 +13,16 @@ using namespace Gen; +void EmuCodeBlock::MemoryExceptionCheck() +{ + if (jit->js.memcheck && !jit->js.fastmemLoadStore && !jit->js.fixupExceptionHandler) + { + TEST(32, PPCSTATE(Exceptions), Gen::Imm32(EXCEPTION_DSI)); + jit->js.exceptionHandler = J_CC(Gen::CC_NZ, true); + jit->js.fixupExceptionHandler = true; + } +} + void EmuCodeBlock::LoadAndSwap(int size, Gen::X64Reg dst, const Gen::OpArg& src) { if (cpu_info.bMOVBE) @@ -350,7 +360,7 @@ void EmuCodeBlock::SafeLoadToReg(X64Reg reg_value, const Gen::OpArg & opAddress, } ABI_PopRegistersAndAdjustStack(registersInUse, 0); - MEMCHECK_START + MemoryExceptionCheck(); if (signExtend && accessSize < 32) { // Need to sign extend values coming from the Read_U* functions. 
@@ -360,7 +370,6 @@ void EmuCodeBlock::SafeLoadToReg(X64Reg reg_value, const Gen::OpArg & opAddress, { MOVZX(64, accessSize, reg_value, R(ABI_RETURN)); } - MEMCHECK_END } } else @@ -400,7 +409,7 @@ void EmuCodeBlock::SafeLoadToReg(X64Reg reg_value, const Gen::OpArg & opAddress, } ABI_PopRegistersAndAdjustStack(registersInUse, rsp_alignment); - MEMCHECK_START + MemoryExceptionCheck(); if (signExtend && accessSize < 32) { // Need to sign extend values coming from the Read_U* functions. @@ -410,7 +419,6 @@ void EmuCodeBlock::SafeLoadToReg(X64Reg reg_value, const Gen::OpArg & opAddress, { MOVZX(64, accessSize, reg_value, R(ABI_RETURN)); } - MEMCHECK_END if (farcode.Enabled()) { diff --git a/Source/Core/Core/PowerPC/JitCommon/Jit_Util.h b/Source/Core/Core/PowerPC/JitCommon/Jit_Util.h index 322425b9d4..6681808aa7 100644 --- a/Source/Core/Core/PowerPC/JitCommon/Jit_Util.h +++ b/Source/Core/Core/PowerPC/JitCommon/Jit_Util.h @@ -12,16 +12,6 @@ namespace MMIO { class Mapping; } -#define MEMCHECK_START \ - Gen::FixupBranch memException; \ - if (jit->js.memcheck && !jit->js.fastmemLoadStore) \ - { TEST(32, PPCSTATE(Exceptions), Gen::Imm32(EXCEPTION_DSI)); \ - memException = J_CC(Gen::CC_NZ, true); } - -#define MEMCHECK_END \ - if (jit->js.memcheck && !jit->js.fastmemLoadStore) \ - SetJumpTarget(memException); - // We offset by 0x80 because the range of one byte memory offsets is // -0x80..0x7f. #define PPCSTATE(x) MDisp(RPPCSTATE, \ @@ -59,6 +49,8 @@ public: FarCodeCache farcode; u8* nearcode; // Backed up when we switch to far code. + void MemoryExceptionCheck(); + // Simple functions to switch between near and far code emitting void SwitchToFarCode() { From 2a8936312e37cc787270972671b01f5a64b2d7ed Mon Sep 17 00:00:00 2001 From: Fiora Date: Fri, 2 Jan 2015 14:47:44 -0800 Subject: [PATCH 11/19] Fastmem: jump to trampolines instead of calling them Should be slightly faster, and also lets us skip the nops on the way back. 
Remove the trampoline cache, since it isn't really useful anymore with this. --- Source/Core/Core/PowerPC/Jit64/Jit.cpp | 2 - .../Core/PowerPC/JitCommon/JitBackpatch.cpp | 14 ++-- .../PowerPC/JitCommon/TrampolineCache.cpp | 73 ++----------------- .../Core/PowerPC/JitCommon/TrampolineCache.h | 25 +------ 4 files changed, 18 insertions(+), 96 deletions(-) diff --git a/Source/Core/Core/PowerPC/Jit64/Jit.cpp b/Source/Core/Core/PowerPC/Jit64/Jit.cpp index 77905ca8ae..436364d0de 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit.cpp +++ b/Source/Core/Core/PowerPC/Jit64/Jit.cpp @@ -785,8 +785,6 @@ const u8* Jit64::DoJit(u32 em_address, PPCAnalyst::CodeBuffer *code_buf, JitBloc else { exceptionHandlerAtLoc[js.fastmemLoadStore] = GetWritableCodePtr(); - // the fastmem trampoline is jumping here, so we need to pop the return stack - ADD(64, R(RSP), Imm8(8)); } gpr.Flush(FLUSH_MAINTAIN_STATE); diff --git a/Source/Core/Core/PowerPC/JitCommon/JitBackpatch.cpp b/Source/Core/Core/PowerPC/JitCommon/JitBackpatch.cpp index d165915c99..566ce38109 100644 --- a/Source/Core/Core/PowerPC/JitCommon/JitBackpatch.cpp +++ b/Source/Core/Core/PowerPC/JitCommon/JitBackpatch.cpp @@ -83,7 +83,6 @@ bool Jitx86Base::BackPatch(u32 emAddress, SContext* ctx) if (!info.isMemoryWrite) { - XEmitter emitter(codePtr); int bswapNopCount; if (info.byteSwap || info.operandSize == 1) bswapNopCount = 0; @@ -109,9 +108,11 @@ bool Jitx86Base::BackPatch(u32 emAddress, SContext* ctx) totalSize += 3; } - const u8 *trampoline = trampolines.GetReadTrampoline(info, registersInUse, exceptionHandler); - emitter.CALL((void *)trampoline); + XEmitter emitter(codePtr); int padding = totalSize - BACKPATCH_SIZE; + u8* returnPtr = codePtr + 5 + padding; + const u8* trampoline = trampolines.GenerateReadTrampoline(info, registersInUse, exceptionHandler, returnPtr); + emitter.JMP(trampoline, true); if (padding > 0) { emitter.NOP(padding); @@ -162,9 +163,10 @@ bool Jitx86Base::BackPatch(u32 emAddress, SContext* ctx) start = codePtr - 
bswapSize; } XEmitter emitter(start); - const u8 *trampoline = trampolines.GetWriteTrampoline(info, registersInUse, exceptionHandler, pc); - emitter.CALL((void *)trampoline); - ptrdiff_t padding = (codePtr - emitter.GetCodePtr()) + info.instructionSize; + ptrdiff_t padding = (codePtr - (start + 5)) + info.instructionSize; + u8* returnPtr = start + 5 + padding; + const u8* trampoline = trampolines.GenerateWriteTrampoline(info, registersInUse, exceptionHandler, returnPtr, pc); + emitter.JMP(trampoline, true); if (padding > 0) { emitter.NOP(padding); diff --git a/Source/Core/Core/PowerPC/JitCommon/TrampolineCache.cpp b/Source/Core/Core/PowerPC/JitCommon/TrampolineCache.cpp index 4f900621b2..7283e51ed8 100644 --- a/Source/Core/Core/PowerPC/JitCommon/TrampolineCache.cpp +++ b/Source/Core/Core/PowerPC/JitCommon/TrampolineCache.cpp @@ -27,29 +27,14 @@ void TrampolineCache::Init() void TrampolineCache::ClearCodeSpace() { X64CodeBlock::ClearCodeSpace(); - cachedTrampolines.clear(); } void TrampolineCache::Shutdown() { FreeCodeSpace(); - cachedTrampolines.clear(); } -const u8* TrampolineCache::GetReadTrampoline(const InstructionInfo &info, BitSet32 registersInUse, u8* exceptionHandler) -{ - TrampolineCacheKey key = { registersInUse, exceptionHandler, 0, info }; - - auto it = cachedTrampolines.find(key); - if (it != cachedTrampolines.end()) - return it->second; - - const u8* trampoline = GenerateReadTrampoline(info, registersInUse, exceptionHandler); - cachedTrampolines[key] = trampoline; - return trampoline; -} - -const u8* TrampolineCache::GenerateReadTrampoline(const InstructionInfo &info, BitSet32 registersInUse, u8* exceptionHandler) +const u8* TrampolineCache::GenerateReadTrampoline(const InstructionInfo &info, BitSet32 registersInUse, u8* exceptionHandler, u8* returnPtr) { if (GetSpaceLeft() < 1024) PanicAlert("Trampoline cache full"); @@ -60,9 +45,7 @@ const u8* TrampolineCache::GenerateReadTrampoline(const InstructionInfo &info, B registersInUse[addrReg] = true; 
registersInUse[dataReg] = false; - // It's a read. Easy. - // RSP alignment here is 8 due to the call. - ABI_PushRegistersAndAdjustStack(registersInUse, 8); + ABI_PushRegistersAndAdjustStack(registersInUse, 0); int dataRegSize = info.operandSize == 8 ? 64 : 32; MOVTwo(dataRegSize, ABI_PARAM1, addrReg, ABI_PARAM2, dataReg); @@ -89,30 +72,17 @@ const u8* TrampolineCache::GenerateReadTrampoline(const InstructionInfo &info, B if (dataReg != ABI_RETURN) MOV(dataRegSize, R(dataReg), R(ABI_RETURN)); - ABI_PopRegistersAndAdjustStack(registersInUse, 8); + ABI_PopRegistersAndAdjustStack(registersInUse, 0); if (exceptionHandler) { TEST(32, PPCSTATE(Exceptions), Imm32(EXCEPTION_DSI)); J_CC(CC_NZ, exceptionHandler); } - RET(); + JMP(returnPtr, true); return trampoline; } -const u8* TrampolineCache::GetWriteTrampoline(const InstructionInfo &info, BitSet32 registersInUse, u8* exceptionHandler, u32 pc) -{ - TrampolineCacheKey key = { registersInUse, exceptionHandler, pc, info }; - - auto it = cachedTrampolines.find(key); - if (it != cachedTrampolines.end()) - return it->second; - - const u8* trampoline = GenerateWriteTrampoline(info, registersInUse, exceptionHandler, pc); - cachedTrampolines[key] = trampoline; - return trampoline; -} - -const u8* TrampolineCache::GenerateWriteTrampoline(const InstructionInfo &info, BitSet32 registersInUse, u8* exceptionHandler, u32 pc) +const u8* TrampolineCache::GenerateWriteTrampoline(const InstructionInfo &info, BitSet32 registersInUse, u8* exceptionHandler, u8* returnPtr, u32 pc) { if (GetSpaceLeft() < 1024) PanicAlert("Trampoline cache full"); @@ -122,15 +92,13 @@ const u8* TrampolineCache::GenerateWriteTrampoline(const InstructionInfo &info, X64Reg dataReg = (X64Reg)info.regOperandReg; X64Reg addrReg = (X64Reg)info.scaledReg; - // It's a write. Yay. Remember that we don't have to be super efficient since it's "just" a - // hardware access - we can take shortcuts. 
// Don't treat FIFO writes specially for now because they require a burst // check anyway. // PC is used by memory watchpoints (if enabled) or to print accurate PC locations in debug logs MOV(32, PPCSTATE(pc), Imm32(pc)); - ABI_PushRegistersAndAdjustStack(registersInUse, 8); + ABI_PushRegistersAndAdjustStack(registersInUse, 0); if (info.hasImmediate) { @@ -178,38 +146,13 @@ const u8* TrampolineCache::GenerateWriteTrampoline(const InstructionInfo &info, break; } - ABI_PopRegistersAndAdjustStack(registersInUse, 8); + ABI_PopRegistersAndAdjustStack(registersInUse, 0); if (exceptionHandler) { TEST(32, PPCSTATE(Exceptions), Imm32(EXCEPTION_DSI)); J_CC(CC_NZ, exceptionHandler); } - RET(); + JMP(returnPtr, true); return trampoline; } - -size_t TrampolineCacheKeyHasher::operator()(const TrampolineCacheKey& k) const -{ - size_t res = std::hash()(k.registersInUse.m_val); - res ^= std::hash()(k.info.operandSize) >> 1; - res ^= std::hash()(k.info.regOperandReg) >> 2; - res ^= std::hash()(k.info.scaledReg) >> 3; - res ^= std::hash()(k.info.immediate) >> 4; - res ^= std::hash()(k.pc) >> 5; - res ^= std::hash()(k.info.displacement) << 1; - res ^= std::hash()(k.info.signExtend) << 2; - res ^= std::hash()(k.info.hasImmediate) << 3; - res ^= std::hash()(k.info.isMemoryWrite) << 4; - res ^= std::hash()(k.exceptionHandler) << 5; - - return res; -} - -bool TrampolineCacheKey::operator==(const TrampolineCacheKey &other) const -{ - return pc == other.pc && - registersInUse == other.registersInUse && - exceptionHandler == other.exceptionHandler && - info == other.info; -} diff --git a/Source/Core/Core/PowerPC/JitCommon/TrampolineCache.h b/Source/Core/Core/PowerPC/JitCommon/TrampolineCache.h index f2cdb2ba92..9b35950b66 100644 --- a/Source/Core/Core/PowerPC/JitCommon/TrampolineCache.h +++ b/Source/Core/Core/PowerPC/JitCommon/TrampolineCache.h @@ -14,34 +14,13 @@ // We need at least this many bytes for backpatching. 
const int BACKPATCH_SIZE = 5; -struct TrampolineCacheKey -{ - BitSet32 registersInUse; - u8* exceptionHandler; - u32 pc; - InstructionInfo info; - - bool operator==(const TrampolineCacheKey &other) const; -}; - -struct TrampolineCacheKeyHasher -{ - size_t operator()(const TrampolineCacheKey& k) const; -}; - class TrampolineCache : public Gen::X64CodeBlock { public: void Init(); void Shutdown(); - const u8* GetReadTrampoline(const InstructionInfo &info, BitSet32 registersInUse, u8* exceptionHandler); - const u8* GetWriteTrampoline(const InstructionInfo &info, BitSet32 registersInUse, u8* exceptionHandler, u32 pc); + const u8* GenerateReadTrampoline(const InstructionInfo &info, BitSet32 registersInUse, u8* exceptionHandler, u8* returnPtr); + const u8* GenerateWriteTrampoline(const InstructionInfo &info, BitSet32 registersInUse, u8* exceptionHandler, u8* returnPtr, u32 pc); void ClearCodeSpace(); - -private: - const u8* GenerateReadTrampoline(const InstructionInfo &info, BitSet32 registersInUse, u8* exceptionHandler); - const u8* GenerateWriteTrampoline(const InstructionInfo &info, BitSet32 registersInUse, u8* exceptionHandler, u32 pc); - - std::unordered_map cachedTrampolines; }; From 53b44ccb3a13435a17734a9f832d95933bfabea2 Mon Sep 17 00:00:00 2001 From: Fiora Date: Fri, 2 Jan 2015 15:32:23 -0800 Subject: [PATCH 12/19] x64ABI: enhance MOVTwo to take an offset This lets us merge displacements into MOVTwo in trampolines. --- Source/Core/Common/x64ABI.cpp | 18 ++++++++++++---- Source/Core/Common/x64Emitter.h | 2 +- .../PowerPC/JitCommon/TrampolineCache.cpp | 21 +++++++++---------- 3 files changed, 25 insertions(+), 16 deletions(-) diff --git a/Source/Core/Common/x64ABI.cpp b/Source/Core/Common/x64ABI.cpp index 44818ceb67..5885031411 100644 --- a/Source/Core/Common/x64ABI.cpp +++ b/Source/Core/Common/x64ABI.cpp @@ -181,20 +181,26 @@ void XEmitter::ABI_CallFunctionR(const void *func, X64Reg reg1) // Pass two registers as parameters. 
void XEmitter::ABI_CallFunctionRR(const void *func, X64Reg reg1, X64Reg reg2) { - MOVTwo(64, ABI_PARAM1, reg1, ABI_PARAM2, reg2); + MOVTwo(64, ABI_PARAM1, reg1, 0, ABI_PARAM2, reg2); ABI_CallFunction(func); } -void XEmitter::MOVTwo(int bits, Gen::X64Reg dst1, Gen::X64Reg src1, Gen::X64Reg dst2, Gen::X64Reg src2) +void XEmitter::MOVTwo(int bits, Gen::X64Reg dst1, Gen::X64Reg src1, s32 offset1, Gen::X64Reg dst2, Gen::X64Reg src2) { if (dst1 == src2 && dst2 == src1) { XCHG(bits, R(src1), R(src2)); + if (offset1) + ADD(bits, R(dst1), Imm32(offset1)); } else if (src2 != dst1) { - if (dst1 != src1) + if (dst1 != src1 && offset1) + LEA(bits, dst1, MDisp(src1, offset1)); + else if (dst1 != src1) MOV(bits, R(dst1), R(src1)); + else if (offset1) + ADD(bits, R(dst1), Imm32(offset1)); if (dst2 != src2) MOV(bits, R(dst2), R(src2)); } @@ -202,8 +208,12 @@ void XEmitter::MOVTwo(int bits, Gen::X64Reg dst1, Gen::X64Reg src1, Gen::X64Reg { if (dst2 != src2) MOV(bits, R(dst2), R(src2)); - if (dst1 != src1) + if (dst1 != src1 && offset1) + LEA(bits, dst1, MDisp(src1, offset1)); + else if (dst1 != src1) MOV(bits, R(dst1), R(src1)); + else if (offset1) + ADD(bits, R(dst1), Imm32(offset1)); } } diff --git a/Source/Core/Common/x64Emitter.h b/Source/Core/Common/x64Emitter.h index 03aeea4b13..142308e799 100644 --- a/Source/Core/Common/x64Emitter.h +++ b/Source/Core/Common/x64Emitter.h @@ -888,7 +888,7 @@ public: void ABI_CallFunctionRR(const void *func, X64Reg reg1, X64Reg reg2); // Helper method for the above, or can be used separately. - void MOVTwo(int bits, Gen::X64Reg dst1, Gen::X64Reg src1, Gen::X64Reg dst2, Gen::X64Reg src2); + void MOVTwo(int bits, Gen::X64Reg dst1, Gen::X64Reg src1, s32 offset, Gen::X64Reg dst2, Gen::X64Reg src2); // Saves/restores the registers and adjusts the stack to be aligned as // required by the ABI, where the previous alignment was as specified. 
diff --git a/Source/Core/Core/PowerPC/JitCommon/TrampolineCache.cpp b/Source/Core/Core/PowerPC/JitCommon/TrampolineCache.cpp index 7283e51ed8..b91a0f13ca 100644 --- a/Source/Core/Core/PowerPC/JitCommon/TrampolineCache.cpp +++ b/Source/Core/Core/PowerPC/JitCommon/TrampolineCache.cpp @@ -48,10 +48,7 @@ const u8* TrampolineCache::GenerateReadTrampoline(const InstructionInfo &info, B ABI_PushRegistersAndAdjustStack(registersInUse, 0); int dataRegSize = info.operandSize == 8 ? 64 : 32; - MOVTwo(dataRegSize, ABI_PARAM1, addrReg, ABI_PARAM2, dataReg); - - if (info.displacement) - ADD(32, R(ABI_PARAM1), Imm32(info.displacement)); + MOVTwo(dataRegSize, ABI_PARAM1, addrReg, info.displacement, ABI_PARAM2, dataReg); switch (info.operandSize) { @@ -102,8 +99,13 @@ const u8* TrampolineCache::GenerateWriteTrampoline(const InstructionInfo &info, if (info.hasImmediate) { - if (addrReg != ABI_PARAM2) - MOV(64, R(ABI_PARAM2), R(addrReg)); + if (addrReg != ABI_PARAM2 && info.displacement) + LEA(32, ABI_PARAM2, MDisp(addrReg, info.displacement)); + else if (addrReg != ABI_PARAM2) + MOV(32, R(ABI_PARAM2), R(addrReg)); + else if (info.displacement) + ADD(32, R(ABI_PARAM2), Imm32(info.displacement)); + // we have to swap back the immediate to pass it to the write functions switch (info.operandSize) { @@ -123,11 +125,8 @@ const u8* TrampolineCache::GenerateWriteTrampoline(const InstructionInfo &info, } else { - MOVTwo(64, ABI_PARAM1, dataReg, ABI_PARAM2, addrReg); - } - if (info.displacement) - { - ADD(32, R(ABI_PARAM2), Imm32(info.displacement)); + int dataRegSize = info.operandSize == 8 ? 
64 : 32; + MOVTwo(dataRegSize, ABI_PARAM2, addrReg, info.displacement, ABI_PARAM1, dataReg); } switch (info.operandSize) From 8903df7300fd76ca7202e884032107f2fb0b8734 Mon Sep 17 00:00:00 2001 From: Fiora Date: Fri, 2 Jan 2015 18:34:10 -0800 Subject: [PATCH 13/19] MMU: simplify code to restore original data register after failed load Instead of passing the value around constantly, just store it in the regcache, note where it is, and restore it on the exception path. This saves a whole bunch of pushing and popping and gives a ~5% speed boost in Rebel Strike. It's a bit ugly, but it simplifies a lot of code and is faster, too. --- Source/Core/Core/HW/Memmap.h | 7 -- Source/Core/Core/HW/MemmapFunctions.cpp | 93 ++++--------------- Source/Core/Core/PowerPC/Jit64/Jit.cpp | 12 ++- .../Core/Core/PowerPC/Jit64/JitRegCache.cpp | 4 +- Source/Core/Core/PowerPC/Jit64/JitRegCache.h | 2 +- .../Core/Core/PowerPC/Jit64/Jit_LoadStore.cpp | 16 +++- .../PowerPC/Jit64/Jit_LoadStoreFloating.cpp | 7 +- Source/Core/Core/PowerPC/JitCommon/JitBase.h | 4 + .../Core/Core/PowerPC/JitCommon/Jit_Util.cpp | 5 +- .../PowerPC/JitCommon/TrampolineCache.cpp | 41 +++++--- 10 files changed, 89 insertions(+), 102 deletions(-) diff --git a/Source/Core/Core/HW/Memmap.h b/Source/Core/Core/HW/Memmap.h index c22d2b6a67..603f1dc10b 100644 --- a/Source/Core/Core/HW/Memmap.h +++ b/Source/Core/Core/HW/Memmap.h @@ -96,13 +96,6 @@ u16 Read_U16(const u32 address); u32 Read_U32(const u32 address); u64 Read_U64(const u32 address); -u32 Read_S8_Val(const u32 address, u32 var); -u32 Read_U8_Val(const u32 address, u32 var); -u32 Read_S16_Val(const u32 address, u32 var); -u32 Read_U16_Val(const u32 address, u32 var); -u32 Read_U32_Val(const u32 address, u32 var); -u64 Read_U64_Val(const u32 address, u64 var); - // Useful helper functions, used by ARM JIT float Read_F32(const u32 address); double Read_F64(const u32 address); diff --git a/Source/Core/Core/HW/MemmapFunctions.cpp b/Source/Core/Core/HW/MemmapFunctions.cpp 
index 04736a650c..8fbe067a9f 100644 --- a/Source/Core/Core/HW/MemmapFunctions.cpp +++ b/Source/Core/Core/HW/MemmapFunctions.cpp @@ -92,8 +92,8 @@ static u32 EFB_Read(const u32 addr) static void GenerateDSIException(u32 _EffectiveAddress, bool _bWrite); -template -__forceinline void ReadFromHardware(U &_var, const u32 em_address) +template +__forceinline T ReadFromHardware(const u32 em_address) { int segment = em_address >> 28; // Quick check for an address that can't meet any of the following conditions, @@ -104,33 +104,28 @@ __forceinline void ReadFromHardware(U &_var, const u32 em_address) if ((em_address & 0xC8000000) == 0xC8000000) { if (em_address < 0xcc000000) - _var = EFB_Read(em_address); + return EFB_Read(em_address); else - _var = (T)mmio_mapping->Read::type>(em_address); - return; + return (T)mmio_mapping->Read::type>(em_address); } else if (segment == 0x8 || segment == 0xC || segment == 0x0) { - _var = bswap((*(const T*)&m_pRAM[em_address & RAM_MASK])); - return; + return bswap((*(const T*)&m_pRAM[em_address & RAM_MASK])); } else if (m_pEXRAM && (segment == 0x9 || segment == 0xD || segment == 0x1)) { - _var = bswap((*(const T*)&m_pEXRAM[em_address & EXRAM_MASK])); - return; + return bswap((*(const T*)&m_pEXRAM[em_address & EXRAM_MASK])); } else if (segment == 0xE && (em_address < (0xE0000000 + L1_CACHE_SIZE))) { - _var = bswap((*(const T*)&m_pL1Cache[em_address & L1_CACHE_MASK])); - return; + return bswap((*(const T*)&m_pL1Cache[em_address & L1_CACHE_MASK])); } } if (bFakeVMEM && (segment == 0x7 || segment == 0x4)) { // fake VMEM - _var = bswap((*(const T*)&m_pFakeVMEM[em_address & FAKEVMEM_MASK])); - return; + return bswap((*(const T*)&m_pFakeVMEM[em_address & FAKEVMEM_MASK])); } // MMU: Do page table translation @@ -139,7 +134,7 @@ __forceinline void ReadFromHardware(U &_var, const u32 em_address) { if (flag == FLAG_READ) GenerateDSIException(em_address, false); - return; + return 0; } // Handle loads that cross page boundaries (ewwww) @@ -157,20 
+152,20 @@ __forceinline void ReadFromHardware(U &_var, const u32 em_address) { if (flag == FLAG_READ) GenerateDSIException(em_address_next_page, false); - return; + return 0; } - _var = 0; + T var = 0; for (u32 addr = em_address; addr < em_address + sizeof(T); addr++, tlb_addr++) { if (addr == em_address_next_page) tlb_addr = tlb_addr_next_page; - _var = (_var << 8) | Memory::base[tlb_addr]; + var = (var << 8) | Memory::base[tlb_addr]; } - return; + return var; } // The easy case! - _var = bswap(*(const T*)&Memory::base[tlb_addr]); + return bswap(*(const T*)&Memory::base[tlb_addr]); } @@ -331,32 +326,28 @@ static __forceinline void Memcheck(u32 address, u32 var, bool write, int size) u8 Read_U8(const u32 address) { - u8 var = 0; - ReadFromHardware(var, address); + u8 var = ReadFromHardware(address); Memcheck(address, var, false, 1); return (u8)var; } u16 Read_U16(const u32 address) { - u16 var = 0; - ReadFromHardware(var, address); + u16 var = ReadFromHardware(address); Memcheck(address, var, false, 2); return (u16)var; } u32 Read_U32(const u32 address) { - u32 var = 0; - ReadFromHardware(var, address); + u32 var = ReadFromHardware(address); Memcheck(address, var, false, 4); return var; } u64 Read_U64(const u32 address) { - u64 var = 0; - ReadFromHardware(var, address); + u64 var = ReadFromHardware(address); Memcheck(address, (u32)var, false, 8); return var; } @@ -385,48 +376,6 @@ float Read_F32(const u32 address) return cvt.d; } -u32 Read_U8_Val(const u32 address, u32 var) -{ - ReadFromHardware(var, address); - Memcheck(address, var, false, 1); - return var; -} - -u32 Read_S8_Val(const u32 address, u32 var) -{ - ReadFromHardware(var, address); - Memcheck(address, var, false, 1); - return var; -} - -u32 Read_U16_Val(const u32 address, u32 var) -{ - ReadFromHardware(var, address); - Memcheck(address, var, false, 2); - return var; -} - -u32 Read_S16_Val(const u32 address, u32 var) -{ - ReadFromHardware(var, address); - Memcheck(address, var, false, 2); - return var; 
-} - -u32 Read_U32_Val(const u32 address, u32 var) -{ - ReadFromHardware(var, address); - Memcheck(address, var, false, 4); - return var; -} - -u64 Read_U64_Val(const u32 address, u64 var) -{ - ReadFromHardware(var, address); - Memcheck(address, (u32)var, false, 8); - return var; -} - u32 Read_U8_ZX(const u32 address) { return (u32)Read_U8(address); @@ -489,16 +438,14 @@ void Write_F64(const double var, const u32 address) } u8 ReadUnchecked_U8(const u32 address) { - u8 var = 0; - ReadFromHardware(var, address); + u8 var = ReadFromHardware(address); return var; } u32 ReadUnchecked_U32(const u32 address) { - u32 var = 0; - ReadFromHardware(var, address); + u32 var = ReadFromHardware(address); return var; } diff --git a/Source/Core/Core/PowerPC/Jit64/Jit.cpp b/Source/Core/Core/PowerPC/Jit64/Jit.cpp index 436364d0de..e10256f491 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit.cpp +++ b/Source/Core/Core/PowerPC/Jit64/Jit.cpp @@ -615,6 +615,8 @@ const u8* Jit64::DoJit(u32 em_address, PPCAnalyst::CodeBuffer *code_buf, JitBloc js.downcountAmount += opinfo->numCycles; js.fastmemLoadStore = NULL; js.fixupExceptionHandler = false; + js.revertGprLoad = -1; + js.revertFprLoad = -1; if (i == (code_block.m_num_instructions - 1)) { @@ -787,8 +789,14 @@ const u8* Jit64::DoJit(u32 em_address, PPCAnalyst::CodeBuffer *code_buf, JitBloc exceptionHandlerAtLoc[js.fastmemLoadStore] = GetWritableCodePtr(); } - gpr.Flush(FLUSH_MAINTAIN_STATE); - fpr.Flush(FLUSH_MAINTAIN_STATE); + BitSet32 gprToFlush = BitSet32::AllTrue(32); + BitSet32 fprToFlush = BitSet32::AllTrue(32); + if (js.revertGprLoad >= 0) + gprToFlush[js.revertGprLoad] = false; + if (js.revertFprLoad >= 0) + fprToFlush[js.revertFprLoad] = false; + gpr.Flush(FLUSH_MAINTAIN_STATE, gprToFlush); + fpr.Flush(FLUSH_MAINTAIN_STATE, fprToFlush); // If a memory exception occurs, the exception handler will read // from PC. Update PC with the latest value in case that happens. 
diff --git a/Source/Core/Core/PowerPC/Jit64/JitRegCache.cpp b/Source/Core/Core/PowerPC/Jit64/JitRegCache.cpp index f91694ba9e..334c46379e 100644 --- a/Source/Core/Core/PowerPC/Jit64/JitRegCache.cpp +++ b/Source/Core/Core/PowerPC/Jit64/JitRegCache.cpp @@ -401,7 +401,7 @@ void FPURegCache::StoreRegister(size_t preg, OpArg newLoc) emit->MOVAPD(newLoc, regs[preg].location.GetSimpleReg()); } -void RegCache::Flush(FlushMode mode) +void RegCache::Flush(FlushMode mode, BitSet32 regsToFlush) { for (unsigned int i = 0; i < xregs.size(); i++) { @@ -409,7 +409,7 @@ void RegCache::Flush(FlushMode mode) PanicAlert("Someone forgot to unlock X64 reg %u", i); } - for (unsigned int i = 0; i < regs.size(); i++) + for (unsigned int i : regsToFlush) { if (regs[i].locked) { diff --git a/Source/Core/Core/PowerPC/Jit64/JitRegCache.h b/Source/Core/Core/PowerPC/Jit64/JitRegCache.h index 3943e83852..0e2f2ea687 100644 --- a/Source/Core/Core/PowerPC/Jit64/JitRegCache.h +++ b/Source/Core/Core/PowerPC/Jit64/JitRegCache.h @@ -81,7 +81,7 @@ public: LockX(reg1); LockX(reg2); } - void Flush(FlushMode mode = FLUSH_ALL); + void Flush(FlushMode mode = FLUSH_ALL, BitSet32 regsToFlush = BitSet32::AllTrue(32)); void Flush(PPCAnalyst::CodeOp *op) {Flush();} int SanityCheck() const; void KillImmediate(size_t preg, bool doLoad, bool makeDirty); diff --git a/Source/Core/Core/PowerPC/Jit64/Jit_LoadStore.cpp b/Source/Core/Core/PowerPC/Jit64/Jit_LoadStore.cpp index 1edd6b3868..c322c2248f 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit_LoadStore.cpp +++ b/Source/Core/Core/PowerPC/Jit64/Jit_LoadStore.cpp @@ -246,9 +246,23 @@ void Jit64::lXXx(UGeckoInstruction inst) } gpr.Lock(a, b, d); + if (update && storeAddress) gpr.BindToRegister(a, true, true); - gpr.BindToRegister(d, js.memcheck, true); + + // A bit of an evil hack here. 
We need to retain the original value of this register for the + // exception path, but we'd rather not needlessly pass it around if we don't have to, since + // the exception path is very rare. So we store the value in the regcache, let the load path + // clobber it, then restore the value in the exception path. + // TODO: no other load has to do this at the moment, since no other loads go directly to the + // target registers, but if that ever changes, we need to do it there too. + if (js.memcheck) + { + gpr.StoreFromRegister(d); + js.revertGprLoad = d; + } + gpr.BindToRegister(d, false, true); + BitSet32 registersInUse = CallerSavedRegistersInUse(); // We need to save the (usually scratch) address register for the update. if (update && storeAddress) diff --git a/Source/Core/Core/PowerPC/Jit64/Jit_LoadStoreFloating.cpp b/Source/Core/Core/PowerPC/Jit64/Jit_LoadStoreFloating.cpp index 4cfbc3b756..bc61136a6c 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit_LoadStoreFloating.cpp +++ b/Source/Core/Core/PowerPC/Jit64/Jit_LoadStoreFloating.cpp @@ -66,7 +66,12 @@ void Jit64::lfXXX(UGeckoInstruction inst) } fpr.Lock(d); - fpr.BindToRegister(d, js.memcheck || !single); + if (js.memcheck && single) + { + fpr.StoreFromRegister(d); + js.revertFprLoad = d; + } + fpr.BindToRegister(d, !single); BitSet32 registersInUse = CallerSavedRegistersInUse(); if (update && js.memcheck) registersInUse[RSCRATCH2] = true; diff --git a/Source/Core/Core/PowerPC/JitCommon/JitBase.h b/Source/Core/Core/PowerPC/JitCommon/JitBase.h index 5a526f8f48..cb79f3f511 100644 --- a/Source/Core/Core/PowerPC/JitCommon/JitBase.h +++ b/Source/Core/Core/PowerPC/JitCommon/JitBase.h @@ -79,6 +79,10 @@ protected: // so just fixup that branch instead of testing for a DSI again. bool fixupExceptionHandler; Gen::FixupBranch exceptionHandler; + // If these are set, we've stored the old value of a register which will be loaded in revertLoad, + // which lets us revert it on the exception path. 
+ int revertGprLoad; + int revertFprLoad; bool firstFPInstructionFound; bool isLastInstruction; diff --git a/Source/Core/Core/PowerPC/JitCommon/Jit_Util.cpp b/Source/Core/Core/PowerPC/JitCommon/Jit_Util.cpp index fef3e90677..1209e2bd46 100644 --- a/Source/Core/Core/PowerPC/JitCommon/Jit_Util.cpp +++ b/Source/Core/Core/PowerPC/JitCommon/Jit_Util.cpp @@ -302,10 +302,7 @@ FixupBranch EmuCodeBlock::CheckIfSafeAddress(OpArg reg_value, X64Reg reg_addr, B void EmuCodeBlock::SafeLoadToReg(X64Reg reg_value, const Gen::OpArg & opAddress, int accessSize, s32 offset, BitSet32 registersInUse, bool signExtend, int flags) { - if (!jit->js.memcheck) - { - registersInUse[reg_value] = false; - } + registersInUse[reg_value] = false; if (SConfig::GetInstance().m_LocalCoreStartupParameter.bFastmem && !opAddress.IsImm() && !(flags & (SAFE_LOADSTORE_NO_SWAP | SAFE_LOADSTORE_NO_FASTMEM)) diff --git a/Source/Core/Core/PowerPC/JitCommon/TrampolineCache.cpp b/Source/Core/Core/PowerPC/JitCommon/TrampolineCache.cpp index b91a0f13ca..f5bbea78dc 100644 --- a/Source/Core/Core/PowerPC/JitCommon/TrampolineCache.cpp +++ b/Source/Core/Core/PowerPC/JitCommon/TrampolineCache.cpp @@ -42,39 +42,58 @@ const u8* TrampolineCache::GenerateReadTrampoline(const InstructionInfo &info, B const u8* trampoline = GetCodePtr(); X64Reg addrReg = (X64Reg)info.scaledReg; X64Reg dataReg = (X64Reg)info.regOperandReg; - registersInUse[addrReg] = true; - registersInUse[dataReg] = false; + int stack_offset = 0; + bool push_param1 = registersInUse[ABI_PARAM1]; - ABI_PushRegistersAndAdjustStack(registersInUse, 0); + if (push_param1) + { + PUSH(ABI_PARAM1); + stack_offset = 8; + registersInUse[ABI_PARAM1] = 0; + } int dataRegSize = info.operandSize == 8 ? 
64 : 32; - MOVTwo(dataRegSize, ABI_PARAM1, addrReg, info.displacement, ABI_PARAM2, dataReg); + if (addrReg != ABI_PARAM1 && info.displacement) + LEA(32, ABI_PARAM1, MDisp(addrReg, info.displacement)); + else if (addrReg != ABI_PARAM1) + MOV(32, R(ABI_PARAM1), R(addrReg)); + else if (info.displacement) + ADD(32, R(ABI_PARAM1), Imm32(info.displacement)); + + ABI_PushRegistersAndAdjustStack(registersInUse, stack_offset); switch (info.operandSize) { case 8: - CALL((void *)&Memory::Read_U64_Val); + CALL((void *)&Memory::Read_U64); break; case 4: - CALL((void *)&Memory::Read_U32_Val); + CALL((void *)&Memory::Read_U32); break; case 2: - CALL(info.signExtend ? (void *)&Memory::Read_S16_Val : (void *)&Memory::Read_U16_Val); + CALL((void *)&Memory::Read_U16); break; case 1: - CALL(info.signExtend ? (void *)&Memory::Read_S8_Val : (void *)&Memory::Read_U8_Val); + CALL((void *)&Memory::Read_U8); break; } - if (dataReg != ABI_RETURN) - MOV(dataRegSize, R(dataReg), R(ABI_RETURN)); + ABI_PopRegistersAndAdjustStack(registersInUse, stack_offset); + + if (push_param1) + POP(ABI_PARAM1); - ABI_PopRegistersAndAdjustStack(registersInUse, 0); if (exceptionHandler) { TEST(32, PPCSTATE(Exceptions), Imm32(EXCEPTION_DSI)); J_CC(CC_NZ, exceptionHandler); } + + if (info.signExtend) + MOVSX(dataRegSize, info.operandSize * 8, dataReg, R(ABI_RETURN)); + else if (dataReg != ABI_RETURN || info.operandSize < 4) + MOVZX(dataRegSize, info.operandSize * 8, dataReg, R(ABI_RETURN)); + JMP(returnPtr, true); return trampoline; } From 16e756cb39068e0431a2841419598a5c00c595bf Mon Sep 17 00:00:00 2001 From: Fiora Date: Fri, 2 Jan 2015 19:22:39 -0800 Subject: [PATCH 14/19] MMU: fix TLB behavior on setting C bit We shouldn't be updating the TLB when setting the C bit. Bug reported by tueidj. 
--- Source/Core/Core/HW/MemmapFunctions.cpp | 25 +++++++++++++++++-------- 1 file changed, 17 insertions(+), 8 deletions(-) diff --git a/Source/Core/Core/HW/MemmapFunctions.cpp b/Source/Core/Core/HW/MemmapFunctions.cpp index 8fbe067a9f..8f41d35059 100644 --- a/Source/Core/Core/HW/MemmapFunctions.cpp +++ b/Source/Core/Core/HW/MemmapFunctions.cpp @@ -607,8 +607,14 @@ void SDRUpdated() PowerPC::ppcState.pagetable_hashmask = ((xx<<10)|0x3ff); } +enum TLBLookupResult +{ + TLB_FOUND, + TLB_NOTFOUND, + TLB_UPDATE_C +}; -static __forceinline u32 LookupTLBPageAddress(const XCheckTLBFlag _Flag, const u32 vpa, u32 *paddr) +static __forceinline TLBLookupResult LookupTLBPageAddress(const XCheckTLBFlag _Flag, const u32 vpa, u32 *paddr) { int tag = vpa >> HW_PAGE_INDEX_SHIFT; PowerPC::tlb_entry *tlbe = &PowerPC::ppcState.tlb[_Flag == FLAG_OPCODE][tag & HW_PAGE_INDEX_MASK]; @@ -623,7 +629,7 @@ static __forceinline u32 LookupTLBPageAddress(const XCheckTLBFlag _Flag, const u { PTE2.C = 1; tlbe->pte[0] = PTE2.Hex; - return 0; + return TLB_UPDATE_C; } } @@ -632,7 +638,7 @@ static __forceinline u32 LookupTLBPageAddress(const XCheckTLBFlag _Flag, const u *paddr = tlbe->paddr[0] | (vpa & 0xfff); - return 1; + return TLB_FOUND; } if (tlbe->tag[1] == tag) { @@ -645,7 +651,7 @@ static __forceinline u32 LookupTLBPageAddress(const XCheckTLBFlag _Flag, const u { PTE2.C = 1; tlbe->pte[1] = PTE2.Hex; - return 0; + return TLB_UPDATE_C; } } @@ -654,9 +660,9 @@ static __forceinline u32 LookupTLBPageAddress(const XCheckTLBFlag _Flag, const u *paddr = tlbe->paddr[1] | (vpa & 0xfff); - return 1; + return TLB_FOUND; } - return 0; + return TLB_NOTFOUND; } static __forceinline void UpdateTLBEntry(const XCheckTLBFlag _Flag, UPTE2 PTE2, const u32 vpa) @@ -690,7 +696,8 @@ static __forceinline u32 TranslatePageAddress(const u32 _Address, const XCheckTL // This catches 99%+ of lookups in practice, so the actual page table entry code below doesn't benefit // much from optimization. 
u32 translatedAddress = 0; - if (LookupTLBPageAddress(_Flag, _Address, &translatedAddress)) + TLBLookupResult res = LookupTLBPageAddress(_Flag, _Address, &translatedAddress); + if (res == TLB_FOUND) return translatedAddress; u32 sr = PowerPC::ppcState.sr[EA_SR(_Address)]; @@ -744,7 +751,9 @@ static __forceinline u32 TranslatePageAddress(const u32 _Address, const XCheckTL if (_Flag != FLAG_NO_EXCEPTION) *(u32*)&base_mem[(pteg_addr + 4)] = bswap(PTE2.Hex); - UpdateTLBEntry(_Flag, PTE2, _Address); + // We already updated the TLB entry if this was caused by a C bit. + if (res != TLB_UPDATE_C) + UpdateTLBEntry(_Flag, PTE2, _Address); return (PTE2.RPN << 12) | offset; } From 1ee83e332e1063d709c638fb2bbfc72b9afdd377 Mon Sep 17 00:00:00 2001 From: Fiora Date: Fri, 2 Jan 2015 20:34:47 -0800 Subject: [PATCH 15/19] MMU: optimize PTE lookup code Pull out calculation of PTE1 instead of comparing the separate parts. --- Source/Core/Core/HW/MemmapFunctions.cpp | 54 +++++++++++-------------- 1 file changed, 23 insertions(+), 31 deletions(-) diff --git a/Source/Core/Core/HW/MemmapFunctions.cpp b/Source/Core/Core/HW/MemmapFunctions.cpp index 8f41d35059..68c9717462 100644 --- a/Source/Core/Core/HW/MemmapFunctions.cpp +++ b/Source/Core/Core/HW/MemmapFunctions.cpp @@ -713,52 +713,44 @@ static __forceinline u32 TranslatePageAddress(const u32 _Address, const XCheckTL // hash function no 1 "xor" .360 u32 hash = (VSID ^ page_index); + u32 pte1 = bswap((VSID << 7) | api | PTE1_V); for (int hash_func = 0; hash_func < 2; hash_func++) { + // hash function no 2 "not" .360 if (hash_func == 1) { - // hash function no 2 "not" .360 hash = ~hash; + pte1 |= PTE1_H << 24; } u32 pteg_addr = ((hash & PowerPC::ppcState.pagetable_hashmask) << 6) | PowerPC::ppcState.pagetable_base; - for (int i = 0; i < 8; i++) + for (int i = 0; i < 8; i++, pteg_addr += 8) { - u32 pte = bswap(*(u32*)&base_mem[pteg_addr]); - bool pteh = (pte & PTE1_H) == 0; - - if (hash_func == 1) - pteh = !pteh; - - if ((pte & PTE1_V) && 
pteh) + if (pte1 == *(u32*)&base_mem[pteg_addr]) { - if (VSID == PTE1_VSID(pte) && (api == PTE1_API(pte))) + UPTE2 PTE2; + PTE2.Hex = bswap((*(u32*)&base_mem[(pteg_addr + 4)])); + + // set the access bits + switch (_Flag) { - UPTE2 PTE2; - PTE2.Hex = bswap((*(u32*)&base_mem[(pteg_addr + 4)])); - - // set the access bits - switch (_Flag) - { - case FLAG_NO_EXCEPTION: break; - case FLAG_READ: PTE2.R = 1; break; - case FLAG_WRITE: PTE2.R = 1; PTE2.C = 1; break; - case FLAG_OPCODE: PTE2.R = 1; break; - } - - if (_Flag != FLAG_NO_EXCEPTION) - *(u32*)&base_mem[(pteg_addr + 4)] = bswap(PTE2.Hex); - - // We already updated the TLB entry if this was caused by a C bit. - if (res != TLB_UPDATE_C) - UpdateTLBEntry(_Flag, PTE2, _Address); - - return (PTE2.RPN << 12) | offset; + case FLAG_NO_EXCEPTION: break; + case FLAG_READ: PTE2.R = 1; break; + case FLAG_WRITE: PTE2.R = 1; PTE2.C = 1; break; + case FLAG_OPCODE: PTE2.R = 1; break; } + + if (_Flag != FLAG_NO_EXCEPTION) + *(u32*)&base_mem[(pteg_addr + 4)] = bswap(PTE2.Hex); + + // We already updated the TLB entry if this was caused by a C bit. + if (res != TLB_UPDATE_C) + UpdateTLBEntry(_Flag, PTE2, _Address); + + return (PTE2.RPN << 12) | offset; } - pteg_addr += 8; } } return 0; From 3d2492627f5545c5a28c7f48e1f7cd3e7e1ebea2 Mon Sep 17 00:00:00 2001 From: Fiora Date: Fri, 2 Jan 2015 23:33:07 -0800 Subject: [PATCH 16/19] Fastmem: increase the size of trampoline cache in MMU mode, check space Fastmem in MMU mode generates way more trampolines than normal, so we need a bunch more space too, lest we run out of room. 
--- Source/Core/Core/PowerPC/Jit64/Jit.cpp | 9 +++++---- Source/Core/Core/PowerPC/Jit64IL/JitIL.cpp | 2 +- Source/Core/Core/PowerPC/JitCommon/Jit_Util.h | 4 ++++ Source/Core/Core/PowerPC/JitCommon/TrampolineCache.cpp | 4 ++-- Source/Core/Core/PowerPC/JitCommon/TrampolineCache.h | 2 +- 5 files changed, 13 insertions(+), 8 deletions(-) diff --git a/Source/Core/Core/PowerPC/Jit64/Jit.cpp b/Source/Core/Core/PowerPC/Jit64/Jit.cpp index e10256f491..539bb1b84b 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit.cpp +++ b/Source/Core/Core/PowerPC/Jit64/Jit.cpp @@ -183,7 +183,7 @@ void Jit64::Init() gpr.SetEmitter(this); fpr.SetEmitter(this); - trampolines.Init(); + trampolines.Init(js.memcheck ? TRAMPOLINE_CODE_SIZE_MMU : TRAMPOLINE_CODE_SIZE); AllocCodeSpace(CODE_SIZE); // BLR optimization has the same consequences as block linking, as well as @@ -494,9 +494,10 @@ void Jit64::Jit(u32 em_address) { if (GetSpaceLeft() < 0x10000 || farcode.GetSpaceLeft() < 0x10000 || - blocks.IsFull() || - SConfig::GetInstance().m_LocalCoreStartupParameter.bJITNoBlockCache || - m_clear_cache_asap) + trampolines.GetSpaceLeft() < 0x10000 || + blocks.IsFull() || + SConfig::GetInstance().m_LocalCoreStartupParameter.bJITNoBlockCache || + m_clear_cache_asap) { ClearCache(); } diff --git a/Source/Core/Core/PowerPC/Jit64IL/JitIL.cpp b/Source/Core/Core/PowerPC/Jit64IL/JitIL.cpp index 45f910eca0..b30515c101 100644 --- a/Source/Core/Core/PowerPC/Jit64IL/JitIL.cpp +++ b/Source/Core/Core/PowerPC/Jit64IL/JitIL.cpp @@ -249,7 +249,7 @@ void JitIL::Init() jo.accurateSinglePrecision = false; js.memcheck = SConfig::GetInstance().m_LocalCoreStartupParameter.bMMU; - trampolines.Init(); + trampolines.Init(js.memcheck ? 
TRAMPOLINE_CODE_SIZE_MMU : TRAMPOLINE_CODE_SIZE); AllocCodeSpace(CODE_SIZE); blocks.Init(); asm_routines.Init(nullptr); diff --git a/Source/Core/Core/PowerPC/JitCommon/Jit_Util.h b/Source/Core/Core/PowerPC/JitCommon/Jit_Util.h index 6681808aa7..9c11937b47 100644 --- a/Source/Core/Core/PowerPC/JitCommon/Jit_Util.h +++ b/Source/Core/Core/PowerPC/JitCommon/Jit_Util.h @@ -42,6 +42,10 @@ static const int CODE_SIZE = 1024 * 1024 * 32; static const int FARCODE_SIZE = 1024 * 1024 * 8; static const int FARCODE_SIZE_MMU = 1024 * 1024 * 48; +// same for the trampoline code cache, because fastmem results in far more backpatches in MMU mode +static const int TRAMPOLINE_CODE_SIZE = 1024 * 1024 * 8; +static const int TRAMPOLINE_CODE_SIZE_MMU = 1024 * 1024 * 32; + // Like XCodeBlock but has some utilities for memory access. class EmuCodeBlock : public Gen::X64CodeBlock { diff --git a/Source/Core/Core/PowerPC/JitCommon/TrampolineCache.cpp b/Source/Core/Core/PowerPC/JitCommon/TrampolineCache.cpp index f5bbea78dc..63a436511d 100644 --- a/Source/Core/Core/PowerPC/JitCommon/TrampolineCache.cpp +++ b/Source/Core/Core/PowerPC/JitCommon/TrampolineCache.cpp @@ -19,9 +19,9 @@ using namespace Gen; -void TrampolineCache::Init() +void TrampolineCache::Init(int size) { - AllocCodeSpace(8 * 1024 * 1024); + AllocCodeSpace(size); } void TrampolineCache::ClearCodeSpace() diff --git a/Source/Core/Core/PowerPC/JitCommon/TrampolineCache.h b/Source/Core/Core/PowerPC/JitCommon/TrampolineCache.h index 9b35950b66..305ab2389a 100644 --- a/Source/Core/Core/PowerPC/JitCommon/TrampolineCache.h +++ b/Source/Core/Core/PowerPC/JitCommon/TrampolineCache.h @@ -17,7 +17,7 @@ const int BACKPATCH_SIZE = 5; class TrampolineCache : public Gen::X64CodeBlock { public: - void Init(); + void Init(int size); void Shutdown(); const u8* GenerateReadTrampoline(const InstructionInfo &info, BitSet32 registersInUse, u8* exceptionHandler, u8* returnPtr); From 821db9798c62d14261116d79e17ec3ba09e64706 Mon Sep 17 00:00:00 2001 From: 
Fiora Date: Sat, 3 Jan 2015 10:22:36 -0800 Subject: [PATCH 17/19] Memmap: clean up function argument names. To be more consistent overall. --- Source/Core/Core/HW/Memmap.cpp | 78 ++++++++++---------- Source/Core/Core/HW/Memmap.h | 28 ++++---- Source/Core/Core/HW/MemmapFunctions.cpp | 94 ++++++++++++------------- 3 files changed, 100 insertions(+), 100 deletions(-) diff --git a/Source/Core/Core/HW/Memmap.cpp b/Source/Core/Core/HW/Memmap.cpp index fab79ba1bf..4f91f558ff 100644 --- a/Source/Core/Core/HW/Memmap.cpp +++ b/Source/Core/Core/HW/Memmap.cpp @@ -188,9 +188,9 @@ bool AreMemoryBreakpointsActivated() #endif } -u32 Read_Instruction(const u32 em_address) +u32 Read_Instruction(const u32 address) { - UGeckoInstruction inst = ReadUnchecked_U32(em_address); + UGeckoInstruction inst = ReadUnchecked_U32(address); return inst.hex; } @@ -235,48 +235,48 @@ void Memset(const u32 _Address, const u8 _iValue, const u32 _iLength) } } -void ClearCacheLine(const u32 _Address) +void ClearCacheLine(const u32 address) { // FIXME: does this do the right thing if dcbz is run on hardware memory, e.g. // the FIFO? Do games even do that? Probably not, but we should try to be correct... 
for (u32 i = 0; i < 32; i += 8) - Write_U64(0, _Address + i); + Write_U64(0, address + i); } -void DMA_LCToMemory(const u32 _MemAddr, const u32 _CacheAddr, const u32 _iNumBlocks) +void DMA_LCToMemory(const u32 memAddr, const u32 cacheAddr, const u32 numBlocks) { - const u8* src = m_pL1Cache + (_CacheAddr & 0x3FFFF); - u8* dst = GetPointer(_MemAddr); + const u8* src = m_pL1Cache + (cacheAddr & 0x3FFFF); + u8* dst = GetPointer(memAddr); - if ((dst != nullptr) && (src != nullptr) && (_MemAddr & 3) == 0 && (_CacheAddr & 3) == 0) + if ((dst != nullptr) && (src != nullptr) && (memAddr & 3) == 0 && (cacheAddr & 3) == 0) { - memcpy(dst, src, 32 * _iNumBlocks); + memcpy(dst, src, 32 * numBlocks); } else { - for (u32 i = 0; i < 32 * _iNumBlocks; i++) + for (u32 i = 0; i < 32 * numBlocks; i++) { - u8 Temp = Read_U8(_CacheAddr + i); - Write_U8(Temp, _MemAddr + i); + u8 Temp = Read_U8(cacheAddr + i); + Write_U8(Temp, memAddr + i); } } } -void DMA_MemoryToLC(const u32 _CacheAddr, const u32 _MemAddr, const u32 _iNumBlocks) +void DMA_MemoryToLC(const u32 cacheAddr, const u32 memAddr, const u32 numBlocks) { - const u8* src = GetPointer(_MemAddr); - u8* dst = m_pL1Cache + (_CacheAddr & 0x3FFFF); + const u8* src = GetPointer(memAddr); + u8* dst = m_pL1Cache + (cacheAddr & 0x3FFFF); - if ((dst != nullptr) && (src != nullptr) && (_MemAddr & 3) == 0 && (_CacheAddr & 3) == 0) + if ((dst != nullptr) && (src != nullptr) && (memAddr & 3) == 0 && (cacheAddr & 3) == 0) { - memcpy(dst, src, 32 * _iNumBlocks); + memcpy(dst, src, 32 * numBlocks); } else { - for (u32 i = 0; i < 32 * _iNumBlocks; i++) + for (u32 i = 0; i < 32 * numBlocks; i++) { - u8 Temp = Read_U8(_MemAddr + i); - Write_U8(Temp, _CacheAddr + i); + u8 Temp = Read_U8(memAddr + i); + Write_U8(Temp, cacheAddr + i); } } } @@ -301,16 +301,16 @@ std::string GetString(u32 em_address, size_t size) // GetPointer must always return an address in the bottom 32 bits of address space, so that 64-bit // programs don't have problems directly 
addressing any part of memory. // TODO re-think with respect to other BAT setups... -u8* GetPointer(const u32 _Address) +u8* GetPointer(const u32 address) { - switch (_Address >> 28) + switch (address >> 28) { case 0x0: case 0x8: - if ((_Address & 0xfffffff) < REALRAM_SIZE) - return m_pRAM + (_Address & RAM_MASK); + if ((address & 0xfffffff) < REALRAM_SIZE) + return m_pRAM + (address & RAM_MASK); case 0xc: - switch (_Address >> 24) + switch (address >> 24) { case 0xcc: case 0xcd: @@ -320,8 +320,8 @@ u8* GetPointer(const u32 _Address) break; default: - if ((_Address & 0xfffffff) < REALRAM_SIZE) - return m_pRAM + (_Address & RAM_MASK); + if ((address & 0xfffffff) < REALRAM_SIZE) + return m_pRAM + (address & RAM_MASK); } case 0x1: @@ -329,53 +329,53 @@ u8* GetPointer(const u32 _Address) case 0xd: if (SConfig::GetInstance().m_LocalCoreStartupParameter.bWii) { - if ((_Address & 0xfffffff) < EXRAM_SIZE) - return m_pEXRAM + (_Address & EXRAM_MASK); + if ((address & 0xfffffff) < EXRAM_SIZE) + return m_pEXRAM + (address & EXRAM_MASK); } else break; case 0xe: - if (_Address < (0xE0000000 + L1_CACHE_SIZE)) - return m_pL1Cache + (_Address & L1_CACHE_MASK); + if (address < (0xE0000000 + L1_CACHE_SIZE)) + return m_pL1Cache + (address & L1_CACHE_MASK); else break; default: if (bFakeVMEM) - return m_pFakeVMEM + (_Address & FAKEVMEM_MASK); + return m_pFakeVMEM + (address & FAKEVMEM_MASK); } - ERROR_LOG(MEMMAP, "Unknown Pointer %#8x PC %#8x LR %#8x", _Address, PC, LR); + ERROR_LOG(MEMMAP, "Unknown Pointer %#8x PC %#8x LR %#8x", address, PC, LR); return nullptr; } -bool IsRAMAddress(const u32 addr, bool allow_locked_cache, bool allow_fake_vmem) +bool IsRAMAddress(const u32 address, bool allow_locked_cache, bool allow_fake_vmem) { - switch ((addr >> 24) & 0xFC) + switch ((address >> 24) & 0xFC) { case 0x00: case 0x80: case 0xC0: - if ((addr & 0x1FFFFFFF) < RAM_SIZE) + if ((address & 0x1FFFFFFF) < RAM_SIZE) return true; else return false; case 0x10: case 0x90: case 0xD0: - if 
(SConfig::GetInstance().m_LocalCoreStartupParameter.bWii && (addr & 0x0FFFFFFF) < EXRAM_SIZE) + if (SConfig::GetInstance().m_LocalCoreStartupParameter.bWii && (address & 0x0FFFFFFF) < EXRAM_SIZE) return true; else return false; case 0xE0: - if (allow_locked_cache && addr - 0xE0000000 < L1_CACHE_SIZE) + if (allow_locked_cache && address - 0xE0000000 < L1_CACHE_SIZE) return true; else return false; case 0x7C: - if (allow_fake_vmem && bFakeVMEM && addr >= 0x7E000000) + if (allow_fake_vmem && bFakeVMEM && address >= 0x7E000000) return true; else return false; diff --git a/Source/Core/Core/HW/Memmap.h b/Source/Core/Core/HW/Memmap.h index 603f1dc10b..f7915367cf 100644 --- a/Source/Core/Core/HW/Memmap.h +++ b/Source/Core/Core/HW/Memmap.h @@ -74,19 +74,19 @@ void Clear(); bool AreMemoryBreakpointsActivated(); // ONLY for use by GUI -u8 ReadUnchecked_U8(const u32 _Address); -u32 ReadUnchecked_U32(const u32 _Address); +u8 ReadUnchecked_U8(const u32 address); +u32 ReadUnchecked_U32(const u32 address); -void WriteUnchecked_U8(const u8 _Data, const u32 _Address); -void WriteUnchecked_U32(const u32 _Data, const u32 _Address); +void WriteUnchecked_U8(const u8 var, const u32 address); +void WriteUnchecked_U32(const u32 var, const u32 address); -bool IsRAMAddress(const u32 addr, bool allow_locked_cache = false, bool allow_fake_vmem = false); +bool IsRAMAddress(const u32 address, bool allow_locked_cache = false, bool allow_fake_vmem = false); // used by interpreter to read instructions, uses iCache -u32 Read_Opcode(const u32 _Address); +u32 Read_Opcode(const u32 address); // this is used by Debugger a lot. // For now, just reads from memory! 
-u32 Read_Instruction(const u32 _Address); +u32 Read_Instruction(const u32 address); // For use by emulator @@ -118,13 +118,13 @@ void Write_F64(const double var, const u32 address); std::string GetString(u32 em_address, size_t size = 0); -u8* GetPointer(const u32 _Address); -void DMA_LCToMemory(const u32 _iMemAddr, const u32 _iCacheAddr, const u32 _iNumBlocks); -void DMA_MemoryToLC(const u32 _iCacheAddr, const u32 _iMemAddr, const u32 _iNumBlocks); +u8* GetPointer(const u32 address); +void DMA_LCToMemory(const u32 memAddr, const u32 cacheAddr, const u32 numBlocks); +void DMA_MemoryToLC(const u32 cacheAddr, const u32 memAddr, const u32 numBlocks); void CopyFromEmu(void* data, u32 address, size_t size); void CopyToEmu(u32 address, const void* data, size_t size); -void Memset(const u32 _Address, const u8 _Data, const u32 _iLength); -void ClearCacheLine(const u32 _Address); // Zeroes 32 bytes; address should be 32-byte-aligned +void Memset(const u32 address, const u8 var, const u32 length); +void ClearCacheLine(const u32 address); // Zeroes 32 bytes; address should be 32-byte-aligned // TLB functions void SDRUpdated(); @@ -135,8 +135,8 @@ enum XCheckTLBFlag FLAG_WRITE, FLAG_OPCODE, }; -template u32 TranslateAddress(const u32 _Address); -void InvalidateTLBEntry(u32 _Address); +template u32 TranslateAddress(const u32 address); +void InvalidateTLBEntry(u32 address); extern u32 pagetable_base; extern u32 pagetable_hashmask; } diff --git a/Source/Core/Core/HW/MemmapFunctions.cpp b/Source/Core/Core/HW/MemmapFunctions.cpp index 68c9717462..d52bb6f5ad 100644 --- a/Source/Core/Core/HW/MemmapFunctions.cpp +++ b/Source/Core/Core/HW/MemmapFunctions.cpp @@ -284,9 +284,9 @@ __forceinline void WriteToHardware(u32 em_address, const T data) static void GenerateISIException(u32 effective_address); -u32 Read_Opcode(u32 _Address) +u32 Read_Opcode(u32 address) { - if (_Address == 0x00000000) + if (address == 0x00000000) { // FIXME use assert? 
PanicAlert("Program tried to read an opcode from [00000000]. It has crashed."); @@ -294,22 +294,22 @@ u32 Read_Opcode(u32 _Address) } if (SConfig::GetInstance().m_LocalCoreStartupParameter.bMMU && - (_Address & ADDR_MASK_MEM1)) + (address & ADDR_MASK_MEM1)) { // TODO: Check for MSR instruction address translation flag before translating - u32 tlb_addr = TranslateAddress(_Address); + u32 tlb_addr = TranslateAddress(address); if (tlb_addr == 0) { - GenerateISIException(_Address); + GenerateISIException(address); return 0; } else { - _Address = tlb_addr; + address = tlb_addr; } } - return PowerPC::ppcState.iCache.ReadInstruction(_Address); + return PowerPC::ppcState.iCache.ReadInstruction(address); } static __forceinline void Memcheck(u32 address, u32 var, bool write, int size) @@ -553,21 +553,21 @@ union UPTE2 u32 Hex; }; -static void GenerateDSIException(u32 _EffectiveAddress, bool _bWrite) +static void GenerateDSIException(u32 effectiveAddress, bool write) { // DSI exceptions are only supported in MMU mode. if (!SConfig::GetInstance().m_LocalCoreStartupParameter.bMMU) { - PanicAlertT("Invalid %s to 0x%08x, PC = 0x%08x ", _bWrite ? "Write to" : "Read from", _EffectiveAddress, PC); + PanicAlertT("Invalid %s to 0x%08x, PC = 0x%08x ", write ? 
"Write to" : "Read from", effectiveAddress, PC); return; } - if (_bWrite) + if (write) PowerPC::ppcState.spr[SPR_DSISR] = PPC_EXC_DSISR_PAGE | PPC_EXC_DSISR_STORE; else PowerPC::ppcState.spr[SPR_DSISR] = PPC_EXC_DSISR_PAGE; - PowerPC::ppcState.spr[SPR_DAR] = _EffectiveAddress; + PowerPC::ppcState.spr[SPR_DAR] = effectiveAddress; Common::AtomicOr(PowerPC::ppcState.Exceptions, EXCEPTION_DSI); } @@ -614,14 +614,14 @@ enum TLBLookupResult TLB_UPDATE_C }; -static __forceinline TLBLookupResult LookupTLBPageAddress(const XCheckTLBFlag _Flag, const u32 vpa, u32 *paddr) +static __forceinline TLBLookupResult LookupTLBPageAddress(const XCheckTLBFlag flag, const u32 vpa, u32 *paddr) { int tag = vpa >> HW_PAGE_INDEX_SHIFT; - PowerPC::tlb_entry *tlbe = &PowerPC::ppcState.tlb[_Flag == FLAG_OPCODE][tag & HW_PAGE_INDEX_MASK]; + PowerPC::tlb_entry *tlbe = &PowerPC::ppcState.tlb[flag == FLAG_OPCODE][tag & HW_PAGE_INDEX_MASK]; if (tlbe->tag[0] == tag) { // Check if C bit requires updating - if (_Flag == FLAG_WRITE) + if (flag == FLAG_WRITE) { UPTE2 PTE2; PTE2.Hex = tlbe->pte[0]; @@ -633,7 +633,7 @@ static __forceinline TLBLookupResult LookupTLBPageAddress(const XCheckTLBFlag _F } } - if (_Flag != FLAG_NO_EXCEPTION) + if (flag != FLAG_NO_EXCEPTION) tlbe->recent = 0; *paddr = tlbe->paddr[0] | (vpa & 0xfff); @@ -643,7 +643,7 @@ static __forceinline TLBLookupResult LookupTLBPageAddress(const XCheckTLBFlag _F if (tlbe->tag[1] == tag) { // Check if C bit requires updating - if (_Flag == FLAG_WRITE) + if (flag == FLAG_WRITE) { UPTE2 PTE2; PTE2.Hex = tlbe->pte[1]; @@ -655,7 +655,7 @@ static __forceinline TLBLookupResult LookupTLBPageAddress(const XCheckTLBFlag _F } } - if (_Flag != FLAG_NO_EXCEPTION) + if (flag != FLAG_NO_EXCEPTION) tlbe->recent = 1; *paddr = tlbe->paddr[1] | (vpa & 0xfff); @@ -665,13 +665,13 @@ static __forceinline TLBLookupResult LookupTLBPageAddress(const XCheckTLBFlag _F return TLB_NOTFOUND; } -static __forceinline void UpdateTLBEntry(const XCheckTLBFlag _Flag, 
UPTE2 PTE2, const u32 vpa) +static __forceinline void UpdateTLBEntry(const XCheckTLBFlag flag, UPTE2 PTE2, const u32 address) { - if (_Flag == FLAG_NO_EXCEPTION) + if (flag == FLAG_NO_EXCEPTION) return; - int tag = vpa >> HW_PAGE_INDEX_SHIFT; - PowerPC::tlb_entry *tlbe = &PowerPC::ppcState.tlb[_Flag == FLAG_OPCODE][tag & HW_PAGE_INDEX_MASK]; + int tag = address >> HW_PAGE_INDEX_SHIFT; + PowerPC::tlb_entry *tlbe = &PowerPC::ppcState.tlb[flag == FLAG_OPCODE][tag & HW_PAGE_INDEX_MASK]; int index = tlbe->recent == 0 && tlbe->tag[0] != TLB_TAG_INVALID; tlbe->recent = index; tlbe->paddr[index] = PTE2.RPN << HW_PAGE_INDEX_SHIFT; @@ -679,33 +679,33 @@ static __forceinline void UpdateTLBEntry(const XCheckTLBFlag _Flag, UPTE2 PTE2, tlbe->tag[index] = tag; } -void InvalidateTLBEntry(u32 vpa) +void InvalidateTLBEntry(u32 address) { - PowerPC::tlb_entry *tlbe = &PowerPC::ppcState.tlb[0][(vpa >> HW_PAGE_INDEX_SHIFT) & HW_PAGE_INDEX_MASK]; + PowerPC::tlb_entry *tlbe = &PowerPC::ppcState.tlb[0][(address >> HW_PAGE_INDEX_SHIFT) & HW_PAGE_INDEX_MASK]; tlbe->tag[0] = TLB_TAG_INVALID; tlbe->tag[1] = TLB_TAG_INVALID; - PowerPC::tlb_entry *tlbe_i = &PowerPC::ppcState.tlb[1][(vpa >> HW_PAGE_INDEX_SHIFT) & HW_PAGE_INDEX_MASK]; + PowerPC::tlb_entry *tlbe_i = &PowerPC::ppcState.tlb[1][(address >> HW_PAGE_INDEX_SHIFT) & HW_PAGE_INDEX_MASK]; tlbe_i->tag[0] = TLB_TAG_INVALID; tlbe_i->tag[1] = TLB_TAG_INVALID; } // Page Address Translation -static __forceinline u32 TranslatePageAddress(const u32 _Address, const XCheckTLBFlag _Flag) +static __forceinline u32 TranslatePageAddress(const u32 address, const XCheckTLBFlag flag) { // TLB cache // This catches 99%+ of lookups in practice, so the actual page table entry code below doesn't benefit // much from optimization. 
u32 translatedAddress = 0; - TLBLookupResult res = LookupTLBPageAddress(_Flag, _Address, &translatedAddress); + TLBLookupResult res = LookupTLBPageAddress(flag , address, &translatedAddress); if (res == TLB_FOUND) return translatedAddress; - u32 sr = PowerPC::ppcState.sr[EA_SR(_Address)]; + u32 sr = PowerPC::ppcState.sr[EA_SR(address)]; - u32 offset = EA_Offset(_Address); // 12 bit - u32 page_index = EA_PageIndex(_Address); // 16 bit + u32 offset = EA_Offset(address); // 12 bit + u32 page_index = EA_PageIndex(address); // 16 bit u32 VSID = SR_VSID(sr); // 24 bit - u32 api = EA_API(_Address); // 6 bit (part of page_index) + u32 api = EA_API(address); // 6 bit (part of page_index) // Direct access to the fastmem Arena // FIXME: is this the best idea for clean code? @@ -734,7 +734,7 @@ static __forceinline u32 TranslatePageAddress(const u32 _Address, const XCheckTL PTE2.Hex = bswap((*(u32*)&base_mem[(pteg_addr + 4)])); // set the access bits - switch (_Flag) + switch (flag) { case FLAG_NO_EXCEPTION: break; case FLAG_READ: PTE2.R = 1; break; @@ -742,12 +742,12 @@ static __forceinline u32 TranslatePageAddress(const u32 _Address, const XCheckTL case FLAG_OPCODE: PTE2.R = 1; break; } - if (_Flag != FLAG_NO_EXCEPTION) + if (flag != FLAG_NO_EXCEPTION) *(u32*)&base_mem[(pteg_addr + 4)] = bswap(PTE2.Hex); // We already updated the TLB entry if this was caused by a C bit. 
if (res != TLB_UPDATE_C) - UpdateTLBEntry(_Flag, PTE2, _Address); + UpdateTLBEntry(flag, PTE2, address); return (PTE2.RPN << 12) | offset; } @@ -793,7 +793,7 @@ static inline bool CheckAddrBats(const u32 addr, u32* result, u32 batu, u32 spr) } // Block Address Translation -static u32 TranslateBlockAddress(const u32 addr, const XCheckTLBFlag _Flag) +static u32 TranslateBlockAddress(const u32 address, const XCheckTLBFlag flag) { u32 result = 0; UReg_MSR& m_MSR = ((UReg_MSR&)PowerPC::ppcState.msr); @@ -802,22 +802,22 @@ static u32 TranslateBlockAddress(const u32 addr, const XCheckTLBFlag _Flag) // Check for enhanced mode (secondary BAT enable) using 8 BATs bool enhanced_bats = SConfig::GetInstance().m_LocalCoreStartupParameter.bWii && HID4.SBE; - if (_Flag != FLAG_OPCODE) + if (flag != FLAG_OPCODE) { - if (!CheckAddrBats(addr, &result, batu, SPR_DBAT0U) && enhanced_bats) - CheckAddrBats(addr, &result, batu, SPR_DBAT4U); + if (!CheckAddrBats(address, &result, batu, SPR_DBAT0U) && enhanced_bats) + CheckAddrBats(address, &result, batu, SPR_DBAT4U); } else { - if (!CheckAddrBats(addr, &result, batu, SPR_IBAT0U) && enhanced_bats) - CheckAddrBats(addr, &result, batu, SPR_IBAT4U); + if (!CheckAddrBats(address, &result, batu, SPR_IBAT0U) && enhanced_bats) + CheckAddrBats(address, &result, batu, SPR_IBAT4U); } return result; } // Translate effective address using BAT or PAT. Returns 0 if the address cannot be translated. -template -u32 TranslateAddress(const u32 _Address) +template +u32 TranslateAddress(const u32 address) { // Check MSR[IR] bit before translating instruction addresses. Rogue Leader clears IR and DR?? //if ((_Flag == FLAG_OPCODE) && !(MSR & (1 << (31 - 26)))) return _Address; @@ -829,15 +829,15 @@ u32 TranslateAddress(const u32 _Address) // so only do it where it's really needed. 
if (SConfig::GetInstance().m_LocalCoreStartupParameter.bBAT) { - u32 tlb_addr = TranslateBlockAddress(_Address, _Flag); + u32 tlb_addr = TranslateBlockAddress(address, flag); if (tlb_addr) return tlb_addr; } - return TranslatePageAddress(_Address, _Flag); + return TranslatePageAddress(address, flag); } -template u32 TranslateAddress(const u32 _Address); -template u32 TranslateAddress(const u32 _Address); -template u32 TranslateAddress(const u32 _Address); -template u32 TranslateAddress(const u32 _Address); +template u32 TranslateAddress(const u32 address); +template u32 TranslateAddress(const u32 address); +template u32 TranslateAddress(const u32 address); +template u32 TranslateAddress(const u32 address); } // namespace From b058bbd2237837fc05943d3c9b0e60233937f704 Mon Sep 17 00:00:00 2001 From: Fiora Date: Sat, 3 Jan 2015 10:52:55 -0800 Subject: [PATCH 18/19] JIT: move mfcr code to JitAsmCommon It's like 80+ instructions, so inlining it on every use of mfcr is probably not the best for the icache. 
--- Source/Core/Core/PowerPC/Jit64/JitAsm.cpp | 2 + .../PowerPC/Jit64/Jit_SystemRegisters.cpp | 35 ++--------------- .../Core/PowerPC/JitCommon/JitAsmCommon.cpp | 38 +++++++++++++++++++ .../Core/PowerPC/JitCommon/JitAsmCommon.h | 2 + 4 files changed, 46 insertions(+), 31 deletions(-) diff --git a/Source/Core/Core/PowerPC/Jit64/JitAsm.cpp b/Source/Core/Core/PowerPC/Jit64/JitAsm.cpp index 28c2b3fa97..68555e3824 100644 --- a/Source/Core/Core/PowerPC/Jit64/JitAsm.cpp +++ b/Source/Core/Core/PowerPC/Jit64/JitAsm.cpp @@ -226,6 +226,8 @@ void Jit64AsmRoutineManager::GenerateCommon() GenFrsqrte(); fres = AlignCode4(); GenFres(); + mfcr = AlignCode4(); + GenMfcr(); GenQuantizedLoads(); GenQuantizedStores(); diff --git a/Source/Core/Core/PowerPC/Jit64/Jit_SystemRegisters.cpp b/Source/Core/Core/PowerPC/Jit64/Jit_SystemRegisters.cpp index b5b91e9fb7..1b3772ff55 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit_SystemRegisters.cpp +++ b/Source/Core/Core/PowerPC/Jit64/Jit_SystemRegisters.cpp @@ -406,39 +406,12 @@ void Jit64::mfcr(UGeckoInstruction inst) { INSTRUCTION_START JITDISABLE(bJITSystemRegistersOff); - // USES_CR int d = inst.RD; + gpr.FlushLockX(RSCRATCH_EXTRA); + CALL((void *)asm_routines.mfcr); + gpr.Lock(d); gpr.BindToRegister(d, false, true); - XOR(32, gpr.R(d), gpr.R(d)); - - X64Reg cr_val = RSCRATCH2; - // we only need to zero the high bits of RSCRATCH once - XOR(32, R(RSCRATCH), R(RSCRATCH)); - for (int i = 0; i < 8; i++) - { - static const u8 m_flagTable[8] = {0x0,0x1,0x8,0x9,0x0,0x1,0x8,0x9}; - if (i != 0) - SHL(32, gpr.R(d), Imm8(4)); - - MOV(64, R(cr_val), PPCSTATE(cr_val[i])); - - // EQ: Bits 31-0 == 0; set flag bit 1 - TEST(32, R(cr_val), R(cr_val)); - SETcc(CC_Z, R(RSCRATCH)); - LEA(32, gpr.RX(d), MComplex(gpr.RX(d), RSCRATCH, SCALE_2, 0)); - - // GT: Value > 0; set flag bit 2 - TEST(64, R(cr_val), R(cr_val)); - SETcc(CC_G, R(RSCRATCH)); - LEA(32, gpr.RX(d), MComplex(gpr.RX(d), RSCRATCH, SCALE_4, 0)); - - // SO: Bit 61 set; set flag bit 0 - // LT: Bit 62 set; 
set flag bit 3 - SHR(64, R(cr_val), Imm8(61)); - MOVZX(32, 8, RSCRATCH, MDisp(cr_val, (u32)(u64)m_flagTable)); - OR(32, gpr.R(d), R(RSCRATCH)); - } - + MOV(32, gpr.R(d), R(RSCRATCH)); gpr.UnlockAll(); gpr.UnlockAllX(); } diff --git a/Source/Core/Core/PowerPC/JitCommon/JitAsmCommon.cpp b/Source/Core/Core/PowerPC/JitCommon/JitAsmCommon.cpp index 8f106b287b..eecbb4c956 100644 --- a/Source/Core/Core/PowerPC/JitCommon/JitAsmCommon.cpp +++ b/Source/Core/Core/PowerPC/JitCommon/JitAsmCommon.cpp @@ -151,6 +151,44 @@ void CommonAsmRoutines::GenFres() RET(); } +void CommonAsmRoutines::GenMfcr() +{ + // Input: none + // Output: RSCRATCH + // This function clobbers all three RSCRATCH. + X64Reg dst = RSCRATCH; + X64Reg tmp = RSCRATCH2; + X64Reg cr_val = RSCRATCH_EXTRA; + XOR(32, R(dst), R(dst)); + // we only need to zero the high bits of tmp once + XOR(32, R(tmp), R(tmp)); + for (int i = 0; i < 8; i++) + { + static const u32 m_flagTable[8] = { 0x0, 0x1, 0x8, 0x9, 0x0, 0x1, 0x8, 0x9 }; + if (i != 0) + SHL(32, R(dst), Imm8(4)); + + MOV(64, R(cr_val), PPCSTATE(cr_val[i])); + + // EQ: Bits 31-0 == 0; set flag bit 1 + TEST(32, R(cr_val), R(cr_val)); + // FIXME: is there a better way to do this without the partial register merging? 
+ SETcc(CC_Z, R(tmp)); + LEA(32, dst, MComplex(dst, tmp, SCALE_2, 0)); + + // GT: Value > 0; set flag bit 2 + TEST(64, R(cr_val), R(cr_val)); + SETcc(CC_G, R(tmp)); + LEA(32, dst, MComplex(dst, tmp, SCALE_4, 0)); + + // SO: Bit 61 set; set flag bit 0 + // LT: Bit 62 set; set flag bit 3 + SHR(64, R(cr_val), Imm8(61)); + OR(32, R(dst), MScaled(cr_val, SCALE_4, (u32)(u64)m_flagTable)); + } + RET(); +} + // Safe + Fast Quantizers, originally from JITIL by magumagu static const u8 GC_ALIGNED16(pbswapShuffle1x4[16]) = { 3, 2, 1, 0, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }; diff --git a/Source/Core/Core/PowerPC/JitCommon/JitAsmCommon.h b/Source/Core/Core/PowerPC/JitCommon/JitAsmCommon.h index 847d318230..34a7232a45 100644 --- a/Source/Core/Core/PowerPC/JitCommon/JitAsmCommon.h +++ b/Source/Core/Core/PowerPC/JitCommon/JitAsmCommon.h @@ -25,6 +25,7 @@ public: const u8 *frsqrte; const u8 *fres; + const u8 *mfcr; // In: array index: GQR to use. // In: ECX: Address to read from. @@ -58,4 +59,5 @@ public: void GenFifoWrite(int size); void GenFrsqrte(); void GenFres(); + void GenMfcr(); }; From e85f0ff179d737169c2617a1d81d92b3839bee18 Mon Sep 17 00:00:00 2001 From: Fiora Date: Sun, 4 Jan 2015 21:15:50 -0800 Subject: [PATCH 19/19] MMU: fix problems with blocks that cross vmem page boundaries In rare cases, this can result in a violation of the JIT block cache constraint that blocks must end in the same place. This can cause instability and lockups due to blocks not being invalidated properly. 
--- Source/Core/Core/PowerPC/PPCAnalyst.cpp | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/Source/Core/Core/PowerPC/PPCAnalyst.cpp b/Source/Core/Core/PowerPC/PPCAnalyst.cpp index 5c6dd390c8..bc1da5a230 100644 --- a/Source/Core/Core/PowerPC/PPCAnalyst.cpp +++ b/Source/Core/Core/PowerPC/PPCAnalyst.cpp @@ -670,6 +670,15 @@ u32 PPCAnalyzer::Analyze(u32 address, CodeBlock *block, CodeBuffer *buffer, u32 if (inst.hex != 0) { + // Slight hack: the JIT block cache currently assumes all blocks end at the same place, + // but broken blocks due to page faults break this assumption. Avoid this by just ending + // all virtual memory instruction blocks at page boundaries. + // FIXME: improve the JIT block cache so we don't need to do this. + if (virtualAddr && i > 0 && (address & 0xfff) == 0) + { + break; + } + num_inst++; memset(&code[i], 0, sizeof(CodeOp)); GekkoOPInfo *opinfo = GetOpInfo(inst);