From c56526d5f8ddfb8e4895133bc013c216efe86d60 Mon Sep 17 00:00:00 2001 From: JosJuice Date: Sun, 4 Jul 2021 20:47:04 +0200 Subject: [PATCH 1/5] PowerPC: Keep track of write-through/cache-inhibited One of the following commits will add emulation of a quirk that only happens when writing to memory which is mapped as write-through or cache-inhibited, so let's keep track of which memory is mapped in this way. --- Source/Core/Core/PowerPC/MMU.cpp | 85 ++++++++++++++++++++++---------- Source/Core/Core/PowerPC/MMU.h | 6 ++- 2 files changed, 62 insertions(+), 29 deletions(-) diff --git a/Source/Core/Core/PowerPC/MMU.cpp b/Source/Core/Core/PowerPC/MMU.cpp index 3b49381d76..5a8d5e8051 100644 --- a/Source/Core/Core/PowerPC/MMU.cpp +++ b/Source/Core/Core/PowerPC/MMU.cpp @@ -106,6 +106,7 @@ struct TranslateAddressResult PAGE_FAULT } result; u32 address; + bool wi; // Set to true if the view of memory is either write-through or cache-inhibited bool Success() const { return result <= PAGE_TABLE_TRANSLATED; } }; template @@ -1015,7 +1016,8 @@ u32 IsOptimizableMMIOAccess(u32 address, u32 access_size) // Translate address // If we also optimize for TLB mappings, we'd have to clear the // JitCache on each TLB invalidation. - if (!TranslateBatAddess(dbat_table, &address)) + bool wi = false; + if (!TranslateBatAddess(dbat_table, &address, &wi)) return 0; // Check whether the address is an aligned address of an MMIO register. @@ -1037,7 +1039,8 @@ bool IsOptimizableGatherPipeWrite(u32 address) // Translate address, only check BAT mapping. // If we also optimize for TLB mappings, we'd have to clear the // JitCache on each TLB invalidation. - if (!TranslateBatAddess(dbat_table, &address)) + bool wi = false; + if (!TranslateBatAddess(dbat_table, &address, &wi)) return false; // Check whether the translated address equals the address in WPAR. 
@@ -1206,18 +1209,20 @@ enum class TLBLookupResult UpdateC }; -static TLBLookupResult LookupTLBPageAddress(const XCheckTLBFlag flag, const u32 vpa, u32* paddr) +static TLBLookupResult LookupTLBPageAddress(const XCheckTLBFlag flag, const u32 vpa, u32* paddr, + bool* wi) { const u32 tag = vpa >> HW_PAGE_INDEX_SHIFT; TLBEntry& tlbe = ppcState.tlb[IsOpcodeFlag(flag)][tag & HW_PAGE_INDEX_MASK]; if (tlbe.tag[0] == tag) { + UPTE2 PTE2; + PTE2.Hex = tlbe.pte[0]; + // Check if C bit requires updating if (flag == XCheckTLBFlag::Write) { - UPTE2 PTE2; - PTE2.Hex = tlbe.pte[0]; if (PTE2.C == 0) { PTE2.C = 1; @@ -1230,16 +1235,18 @@ static TLBLookupResult LookupTLBPageAddress(const XCheckTLBFlag flag, const u32 tlbe.recent = 0; *paddr = tlbe.paddr[0] | (vpa & 0xfff); + *wi = (PTE2.WIMG & 0b1100) != 0; return TLBLookupResult::Found; } if (tlbe.tag[1] == tag) { + UPTE2 PTE2; + PTE2.Hex = tlbe.pte[1]; + // Check if C bit requires updating if (flag == XCheckTLBFlag::Write) { - UPTE2 PTE2; - PTE2.Hex = tlbe.pte[1]; if (PTE2.C == 0) { PTE2.C = 1; @@ -1252,6 +1259,7 @@ static TLBLookupResult LookupTLBPageAddress(const XCheckTLBFlag flag, const u32 tlbe.recent = 1; *paddr = tlbe.paddr[1] | (vpa & 0xfff); + *wi = (PTE2.WIMG & 0b1100) != 0; return TLBLookupResult::Found; } @@ -1286,14 +1294,14 @@ void InvalidateTLBEntry(u32 address) } // Page Address Translation -static TranslateAddressResult TranslatePageAddress(const u32 address, const XCheckTLBFlag flag) +static TranslateAddressResult TranslatePageAddress(const u32 address, const XCheckTLBFlag flag, + bool* wi) { // TLB cache // This catches 99%+ of lookups in practice, so the actual page table entry code below doesn't - // benefit - // much from optimization. + // benefit much from optimization. 
u32 translatedAddress = 0; - TLBLookupResult res = LookupTLBPageAddress(flag, address, &translatedAddress); + TLBLookupResult res = LookupTLBPageAddress(flag, address, &translatedAddress, wi); if (res == TLBLookupResult::Found) - return TranslateAddressResult{TranslateAddressResult::PAGE_TABLE_TRANSLATED, translatedAddress}; + return TranslateAddressResult{TranslateAddressResult::PAGE_TABLE_TRANSLATED, translatedAddress, *wi}; @@ -1368,6 +1376,8 @@ static TranslateAddressResult TranslatePageAddress(const u32 address, const XChe if (res != TLBLookupResult::UpdateC) UpdateTLBEntry(flag, PTE2, address); - return TranslateAddressResult{TranslateAddressResult::PAGE_TABLE_TRANSLATED, - (PTE2.RPN << 12) | offset}; + *wi = (PTE2.WIMG & 0b1100) != 0; + + return TranslateAddressResult{TranslateAddressResult::PAGE_TABLE_TRANSLATED, + (PTE2.RPN << 12) | offset, *wi}; } @@ -1379,7 +1389,7 @@ static void UpdateBATs(BatTable& bat_table, u32 base_spr) { // TODO: Separate BATs for MSR.PR==0 and MSR.PR==1 - // TODO: Handle PP/WIMG settings. + // TODO: Handle PP settings. // TODO: Check how hardware reacts to overlapping BATs (including // BATs which should cause a DSI). // TODO: Check how hardware reacts to invalid BATs (bad mask etc). @@ -1424,19 +1434,38 @@ static void UpdateBATs(BatTable& bat_table, u32 base_spr) u32 physical_address = (batl.BRPN | j) << BAT_INDEX_SHIFT; u32 virtual_address = (batu.BEPI | j) << BAT_INDEX_SHIFT; - // The bottom bit is whether the translation is valid; the second - // bit from the bottom is whether we can use the fastmem arena. 
+ // BAT_MAPPED_BIT is whether the translation is valid + // BAT_PHYSICAL_BIT is whether we can use the fastmem arena + // BAT_WI_BIT is whether either W or I (of WIMG) is set u32 valid_bit = BAT_MAPPED_BIT; - if (Memory::m_pFakeVMEM && (physical_address & 0xFE000000) == 0x7E000000) - valid_bit |= BAT_PHYSICAL_BIT; - else if (physical_address < Memory::GetRamSizeReal()) - valid_bit |= BAT_PHYSICAL_BIT; - else if (Memory::m_pEXRAM && physical_address >> 28 == 0x1 && - (physical_address & 0x0FFFFFFF) < Memory::GetExRamSizeReal()) - valid_bit |= BAT_PHYSICAL_BIT; - else if (physical_address >> 28 == 0xE && - physical_address < 0xE0000000 + Memory::GetL1CacheSize()) - valid_bit |= BAT_PHYSICAL_BIT; + + const bool wi = (batl.WIMG & 0b1100) != 0; + if (wi) + valid_bit |= BAT_WI_BIT; + + // Enable fastmem mappings for cached memory. There are quirks related to uncached memory + // that fastmem doesn't emulate properly (though no normal games are known to rely on them). + if (!wi) + { + if (Memory::m_pFakeVMEM && (physical_address & 0xFE000000) == 0x7E000000) + { + valid_bit |= BAT_PHYSICAL_BIT; + } + else if (physical_address < Memory::GetRamSizeReal()) + { + valid_bit |= BAT_PHYSICAL_BIT; + } + else if (Memory::m_pEXRAM && physical_address >> 28 == 0x1 && + (physical_address & 0x0FFFFFFF) < Memory::GetExRamSizeReal()) + { + valid_bit |= BAT_PHYSICAL_BIT; + } + else if (physical_address >> 28 == 0xE && + physical_address < 0xE0000000 + Memory::GetL1CacheSize()) + { + valid_bit |= BAT_PHYSICAL_BIT; + } + } // Fastmem doesn't support memchecks, so disable it for all overlapping virtual pages. if (PowerPC::memchecks.OverlapsMemcheck(virtual_address, BAT_PAGE_SIZE)) @@ -1511,10 +1540,12 @@ void IBATUpdated() template static TranslateAddressResult TranslateAddress(u32 address) { - if (TranslateBatAddess(IsOpcodeFlag(flag) ? 
ibat_table : dbat_table, &address)) - return TranslateAddressResult{TranslateAddressResult::BAT_TRANSLATED, address}; + bool wi = false; - return TranslatePageAddress(address, flag); + if (TranslateBatAddess(IsOpcodeFlag(flag) ? ibat_table : dbat_table, &address, &wi)) + return TranslateAddressResult{TranslateAddressResult::BAT_TRANSLATED, address, wi}; + + return TranslatePageAddress(address, flag, &wi); } std::optional GetTranslatedAddress(u32 address) diff --git a/Source/Core/Core/PowerPC/MMU.h b/Source/Core/Core/PowerPC/MMU.h index 9191cea2ab..484daa5b1b 100644 --- a/Source/Core/Core/PowerPC/MMU.h +++ b/Source/Core/Core/PowerPC/MMU.h @@ -199,16 +199,18 @@ constexpr int BAT_INDEX_SHIFT = 17; constexpr u32 BAT_PAGE_SIZE = 1 << BAT_INDEX_SHIFT; constexpr u32 BAT_MAPPED_BIT = 0x1; constexpr u32 BAT_PHYSICAL_BIT = 0x2; -constexpr u32 BAT_RESULT_MASK = UINT32_C(~0x3); +constexpr u32 BAT_WI_BIT = 0x4; +constexpr u32 BAT_RESULT_MASK = UINT32_C(~0x7); using BatTable = std::array; // 128 KB extern BatTable ibat_table; extern BatTable dbat_table; -inline bool TranslateBatAddess(const BatTable& bat_table, u32* address) +inline bool TranslateBatAddess(const BatTable& bat_table, u32* address, bool* wi) { u32 bat_result = bat_table[*address >> BAT_INDEX_SHIFT]; if ((bat_result & BAT_MAPPED_BIT) == 0) return false; *address = (bat_result & BAT_RESULT_MASK) | (*address & (BAT_PAGE_SIZE - 1)); + *wi = (bat_result & BAT_WI_BIT) != 0; return true; } From ecbce0a2040a5fa405d497a2df3695f3997017ab Mon Sep 17 00:00:00 2001 From: JosJuice Date: Mon, 26 Jul 2021 14:59:20 +0200 Subject: [PATCH 2/5] PowerPC: Pass on full 32-bit register contents for 8/16-bit writes --- .../Interpreter/Interpreter_LoadStore.cpp | 20 +-- Source/Core/Core/PowerPC/MMU.cpp | 164 ++++++++++-------- Source/Core/Core/PowerPC/MMU.h | 14 +- 3 files changed, 105 insertions(+), 93 deletions(-) diff --git a/Source/Core/Core/PowerPC/Interpreter/Interpreter_LoadStore.cpp 
b/Source/Core/Core/PowerPC/Interpreter/Interpreter_LoadStore.cpp index af2ac6eb30..62be247227 100644 --- a/Source/Core/Core/PowerPC/Interpreter/Interpreter_LoadStore.cpp +++ b/Source/Core/Core/PowerPC/Interpreter/Interpreter_LoadStore.cpp @@ -323,14 +323,14 @@ void Interpreter::lwzu(UGeckoInstruction inst) void Interpreter::stb(UGeckoInstruction inst) { - PowerPC::Write_U8((u8)rGPR[inst.RS], Helper_Get_EA(PowerPC::ppcState, inst)); + PowerPC::Write_U8(rGPR[inst.RS], Helper_Get_EA(PowerPC::ppcState, inst)); } void Interpreter::stbu(UGeckoInstruction inst) { const u32 address = Helper_Get_EA_U(PowerPC::ppcState, inst); - PowerPC::Write_U8((u8)rGPR[inst.RS], address); + PowerPC::Write_U8(rGPR[inst.RS], address); if (!(PowerPC::ppcState.Exceptions & EXCEPTION_DSI)) { rGPR[inst.RA] = address; @@ -399,14 +399,14 @@ void Interpreter::stfsu(UGeckoInstruction inst) void Interpreter::sth(UGeckoInstruction inst) { - PowerPC::Write_U16((u16)rGPR[inst.RS], Helper_Get_EA(PowerPC::ppcState, inst)); + PowerPC::Write_U16(rGPR[inst.RS], Helper_Get_EA(PowerPC::ppcState, inst)); } void Interpreter::sthu(UGeckoInstruction inst) { const u32 address = Helper_Get_EA_U(PowerPC::ppcState, inst); - PowerPC::Write_U16((u16)rGPR[inst.RS], address); + PowerPC::Write_U16(rGPR[inst.RS], address); if (!(PowerPC::ppcState.Exceptions & EXCEPTION_DSI)) { rGPR[inst.RA] = address; @@ -731,7 +731,7 @@ void Interpreter::stbux(UGeckoInstruction inst) { const u32 address = Helper_Get_EA_UX(PowerPC::ppcState, inst); - PowerPC::Write_U8((u8)rGPR[inst.RS], address); + PowerPC::Write_U8(rGPR[inst.RS], address); if (!(PowerPC::ppcState.Exceptions & EXCEPTION_DSI)) { rGPR[inst.RA] = address; @@ -740,7 +740,7 @@ void Interpreter::stbux(UGeckoInstruction inst) void Interpreter::stbx(UGeckoInstruction inst) { - PowerPC::Write_U8((u8)rGPR[inst.RS], Helper_Get_EA_X(PowerPC::ppcState, inst)); + PowerPC::Write_U8(rGPR[inst.RS], Helper_Get_EA_X(PowerPC::ppcState, inst)); } void Interpreter::stfdux(UGeckoInstruction 
inst) @@ -819,14 +819,14 @@ void Interpreter::stfsx(UGeckoInstruction inst) void Interpreter::sthbrx(UGeckoInstruction inst) { - PowerPC::Write_U16(Common::swap16((u16)rGPR[inst.RS]), Helper_Get_EA_X(PowerPC::ppcState, inst)); + PowerPC::Write_U16_Swap(rGPR[inst.RS], Helper_Get_EA_X(PowerPC::ppcState, inst)); } void Interpreter::sthux(UGeckoInstruction inst) { const u32 address = Helper_Get_EA_UX(PowerPC::ppcState, inst); - PowerPC::Write_U16((u16)rGPR[inst.RS], address); + PowerPC::Write_U16(rGPR[inst.RS], address); if (!(PowerPC::ppcState.Exceptions & EXCEPTION_DSI)) { rGPR[inst.RA] = address; @@ -835,7 +835,7 @@ void Interpreter::sthux(UGeckoInstruction inst) void Interpreter::sthx(UGeckoInstruction inst) { - PowerPC::Write_U16((u16)rGPR[inst.RS], Helper_Get_EA_X(PowerPC::ppcState, inst)); + PowerPC::Write_U16(rGPR[inst.RS], Helper_Get_EA_X(PowerPC::ppcState, inst)); } // lswi - bizarro string instruction @@ -968,7 +968,7 @@ void Interpreter::stwbrx(UGeckoInstruction inst) { const u32 address = Helper_Get_EA_X(PowerPC::ppcState, inst); - PowerPC::Write_U32(Common::swap32(rGPR[inst.RS]), address); + PowerPC::Write_U32_Swap(rGPR[inst.RS], address); } // The following two instructions are for SMP communications. 
On a single diff --git a/Source/Core/Core/PowerPC/MMU.cpp b/Source/Core/Core/PowerPC/MMU.cpp index 5a8d5e8051..2b53dde377 100644 --- a/Source/Core/Core/PowerPC/MMU.cpp +++ b/Source/Core/Core/PowerPC/MMU.cpp @@ -8,6 +8,7 @@ #include #include +#include "Common/Assert.h" #include "Common/BitUtils.h" #include "Common/CommonTypes.h" @@ -256,9 +257,26 @@ static T ReadFromHardware(u32 em_address) return 0; } -template -static void WriteToHardware(u32 em_address, const T data) +template +static void WriteToHardware(u32 em_address, const u32 data, const u32 size) { + DEBUG_ASSERT(size <= 4); + + const u32 em_address_start_page = em_address & ~(HW_PAGE_SIZE - 1); + const u32 em_address_end_page = (em_address + size - 1) & ~(HW_PAGE_SIZE - 1); + if (em_address_start_page != em_address_end_page) + { + // The write crosses a page boundary. Break it up into two writes. + // TODO: floats on non-word-aligned boundaries should technically cause alignment exceptions. + // Note that "word" means 32-bit, so paired singles or doubles might still be 32-bit aligned! + const u32 first_half_size = em_address_end_page - em_address; + const u32 second_half_size = size - first_half_size; + WriteToHardware( + em_address, Common::RotateRight(data, second_half_size * 8), first_half_size); + WriteToHardware(em_address_end_page, data, second_half_size); + return; + } + if (!never_translate && MSR.DR) { auto translated_addr = TranslateAddress(em_address); @@ -268,51 +286,24 @@ static void WriteToHardware(u32 em_address, const T data) GenerateDSIException(em_address, true); return; } - if ((em_address & (sizeof(T) - 1)) && - (em_address & (HW_PAGE_SIZE - 1)) > HW_PAGE_SIZE - sizeof(T)) - { - // This could be unaligned down to the byte level... hopefully this is rare, so doing it this - // way isn't too terrible. - // TODO: floats on non-word-aligned boundaries should technically cause alignment exceptions. 
- // Note that "word" means 32-bit, so paired singles or doubles might still be 32-bit aligned! - u32 em_address_next_page = (em_address + sizeof(T) - 1) & ~(HW_PAGE_SIZE - 1); - auto addr_next_page = TranslateAddress(em_address_next_page); - if (!addr_next_page.Success()) - { - if (flag == XCheckTLBFlag::Write) - GenerateDSIException(em_address_next_page, true); - return; - } - T val = bswap(data); - u32 addr_translated = translated_addr.address; - for (size_t i = 0; i < sizeof(T); i++, addr_translated++) - { - if (em_address + i == em_address_next_page) - addr_translated = addr_next_page.address; - WriteToHardware(addr_translated, static_cast(val >> (i * 8))); - } - return; - } em_address = translated_addr.address; } - // TODO: Make sure these are safe for unaligned addresses. + const u32 swapped_data = Common::swap32(Common::RotateRight(data, size * 8)); if (Memory::m_pRAM && (em_address & 0xF8000000) == 0x00000000) { // Handle RAM; the masking intentionally discards bits (essentially creating // mirrors of memory). // TODO: Only the first GetRamSizeReal() is supposed to be backed by actual memory. 
- const T swapped_data = bswap(data); - std::memcpy(&Memory::m_pRAM[em_address & Memory::GetRamMask()], &swapped_data, sizeof(T)); + std::memcpy(&Memory::m_pRAM[em_address & Memory::GetRamMask()], &swapped_data, size); return; } if (Memory::m_pEXRAM && (em_address >> 28) == 0x1 && (em_address & 0x0FFFFFFF) < Memory::GetExRamSizeReal()) { - const T swapped_data = bswap(data); - std::memcpy(&Memory::m_pEXRAM[em_address & 0x0FFFFFFF], &swapped_data, sizeof(T)); + std::memcpy(&Memory::m_pEXRAM[em_address & 0x0FFFFFFF], &swapped_data, size); return; } @@ -320,8 +311,7 @@ static void WriteToHardware(u32 em_address, const T data) if (Memory::m_pL1Cache && (em_address >> 28 == 0xE) && (em_address < (0xE0000000 + Memory::GetL1CacheSize()))) { - const T swapped_data = bswap(data); - std::memcpy(&Memory::m_pL1Cache[em_address & 0x0FFFFFFF], &swapped_data, sizeof(T)); + std::memcpy(&Memory::m_pL1Cache[em_address & 0x0FFFFFFF], &swapped_data, size); return; } @@ -330,9 +320,7 @@ static void WriteToHardware(u32 em_address, const T data) // [0x7E000000, 0x80000000). if (Memory::m_pFakeVMEM && ((em_address & 0xFE000000) == 0x7E000000)) { - const T swapped_data = bswap(data); - std::memcpy(&Memory::m_pFakeVMEM[em_address & Memory::GetFakeVMemMask()], &swapped_data, - sizeof(T)); + std::memcpy(&Memory::m_pFakeVMEM[em_address & Memory::GetFakeVMemMask()], &swapped_data, size); return; } @@ -341,19 +329,24 @@ static void WriteToHardware(u32 em_address, const T data) // Pac-Man World 3 in particular is affected by this. if (flag == XCheckTLBFlag::Write && (em_address & 0xFFFFF000) == 0x0C008000) { - switch (sizeof(T)) + switch (size) { case 1: - GPFifo::Write8((u8)data); + GPFifo::Write8(static_cast(data)); return; case 2: - GPFifo::Write16((u16)data); + GPFifo::Write16(static_cast(data)); return; case 4: - GPFifo::Write32((u32)data); + GPFifo::Write32(data); return; - case 8: - GPFifo::Write64((u64)data); + default: + // Some kind of misaligned write. 
TODO: Does this match how the actual hardware handles it? + for (size_t i = size * 8; i > 0;) + { + i -= 8; + GPFifo::Write8(static_cast(data >> i)); + } return; } } @@ -362,12 +355,28 @@ static void WriteToHardware(u32 em_address, const T data) { if (em_address < 0x0c000000) { - EFB_Write((u32)data, em_address); + EFB_Write(data, em_address); return; } - else + + switch (size) { - Memory::mmio_mapping->Write(em_address, data); + case 1: + Memory::mmio_mapping->Write(em_address, static_cast(data)); + return; + case 2: + Memory::mmio_mapping->Write(em_address, static_cast(data)); + return; + case 4: + Memory::mmio_mapping->Write(em_address, data); + return; + default: + // Some kind of misaligned write. TODO: Does this match how the actual hardware handles it? + for (size_t i = size * 8; i > 0; em_address++) + { + i -= 8; + Memory::mmio_mapping->Write(em_address, static_cast(data >> i)); + } return; } } @@ -608,42 +617,40 @@ u32 Read_U16_ZX(const u32 address) return Read_U16(address); } -void Write_U8(const u8 var, const u32 address) +void Write_U8(const u32 var, const u32 address) { Memcheck(address, var, true, 1); - WriteToHardware(address, var); + WriteToHardware(address, var, 1); } -void Write_U16(const u16 var, const u32 address) +void Write_U16(const u32 var, const u32 address) { Memcheck(address, var, true, 2); - WriteToHardware(address, var); + WriteToHardware(address, var, 2); } -void Write_U16_Swap(const u16 var, const u32 address) +void Write_U16_Swap(const u32 var, const u32 address) { - Memcheck(address, var, true, 2); - Write_U16(Common::swap16(var), address); + Write_U16((var & 0xFFFF0000) | Common::swap16(static_cast(var)), address); } void Write_U32(const u32 var, const u32 address) { Memcheck(address, var, true, 4); - WriteToHardware(address, var); + WriteToHardware(address, var, 4); } void Write_U32_Swap(const u32 var, const u32 address) { - Memcheck(address, var, true, 4); Write_U32(Common::swap32(var), address); } void Write_U64(const u64 var, 
const u32 address) { Memcheck(address, (u32)var, true, 8); - WriteToHardware(address, var); + WriteToHardware(address, static_cast(var >> 32), 4); + WriteToHardware(address + sizeof(u32), static_cast(var), 4); } void Write_U64_Swap(const u64 var, const u32 address) { - Memcheck(address, (u32)var, true, 8); Write_U64(Common::swap64(var), address); } @@ -688,24 +695,25 @@ double HostRead_F64(const u32 address) return Common::BitCast(integral); } -void HostWrite_U8(const u8 var, const u32 address) +void HostWrite_U8(const u32 var, const u32 address) { - WriteToHardware(address, var); + WriteToHardware(address, var, 1); } -void HostWrite_U16(const u16 var, const u32 address) +void HostWrite_U16(const u32 var, const u32 address) { - WriteToHardware(address, var); + WriteToHardware(address, var, 2); } void HostWrite_U32(const u32 var, const u32 address) { - WriteToHardware(address, var); + WriteToHardware(address, var, 4); } void HostWrite_U64(const u64 var, const u32 address) { - WriteToHardware(address, var); + WriteToHardware(address, static_cast(var >> 32), 4); + WriteToHardware(address + sizeof(u32), static_cast(var), 4); } void HostWrite_F32(const float var, const u32 address) @@ -722,8 +730,8 @@ void HostWrite_F64(const double var, const u32 address) HostWrite_U64(integral, address); } -template -static TryWriteResult HostTryWriteUX(const T var, const u32 address, RequestedAddressSpace space) +static TryWriteResult HostTryWriteUX(const u32 var, const u32 address, const u32 size, + RequestedAddressSpace space) { if (!HostIsRAMAddress(address, space)) return TryWriteResult(); @@ -731,15 +739,15 @@ static TryWriteResult HostTryWriteUX(const T var, const u32 address, RequestedAd switch (space) { case RequestedAddressSpace::Effective: - WriteToHardware(address, var); + WriteToHardware(address, var, size); return TryWriteResult(!!MSR.DR); case RequestedAddressSpace::Physical: - WriteToHardware(address, var); + WriteToHardware(address, var, size); return 
TryWriteResult(false); case RequestedAddressSpace::Virtual: if (!MSR.DR) return TryWriteResult(); - WriteToHardware(address, var); + WriteToHardware(address, var, size); return TryWriteResult(true); } @@ -747,24 +755,28 @@ static TryWriteResult HostTryWriteUX(const T var, const u32 address, RequestedAd return TryWriteResult(); } -TryWriteResult HostTryWriteU8(const u8 var, const u32 address, RequestedAddressSpace space) +TryWriteResult HostTryWriteU8(const u32 var, const u32 address, RequestedAddressSpace space) { - return HostTryWriteUX(var, address, space); + return HostTryWriteUX(var, address, 1, space); } -TryWriteResult HostTryWriteU16(const u16 var, const u32 address, RequestedAddressSpace space) +TryWriteResult HostTryWriteU16(const u32 var, const u32 address, RequestedAddressSpace space) { - return HostTryWriteUX(var, address, space); + return HostTryWriteUX(var, address, 2, space); } TryWriteResult HostTryWriteU32(const u32 var, const u32 address, RequestedAddressSpace space) { - return HostTryWriteUX(var, address, space); + return HostTryWriteUX(var, address, 4, space); } TryWriteResult HostTryWriteU64(const u64 var, const u32 address, RequestedAddressSpace space) { - return HostTryWriteUX(var, address, space); + const TryWriteResult result = HostTryWriteUX(static_cast(var >> 32), address, 4, space); + if (!result) + return result; + + return HostTryWriteUX(static_cast(var), address + 4, 4, space); } TryWriteResult HostTryWriteF32(const float var, const u32 address, RequestedAddressSpace space) @@ -1001,8 +1013,8 @@ void ClearCacheLine(u32 address) // TODO: This isn't precisely correct for non-RAM regions, but the difference // is unlikely to matter. 
- for (u32 i = 0; i < 32; i += 8) - WriteToHardware(address + i, 0); + for (u32 i = 0; i < 32; i += 4) + WriteToHardware(address + i, 0, 4); } u32 IsOptimizableMMIOAccess(u32 address, u32 access_size) diff --git a/Source/Core/Core/PowerPC/MMU.h b/Source/Core/Core/PowerPC/MMU.h index 484daa5b1b..9e2a3a0196 100644 --- a/Source/Core/Core/PowerPC/MMU.h +++ b/Source/Core/Core/PowerPC/MMU.h @@ -86,8 +86,8 @@ HostTryReadString(u32 address, size_t size = 0, // Writes a value to emulated memory using the currently active MMU settings. // If the write fails (eg. address does not correspond to a mapped address in the current address // space), a PanicAlert will be shown to the user. -void HostWrite_U8(u8 var, u32 address); -void HostWrite_U16(u16 var, u32 address); +void HostWrite_U8(u32 var, u32 address); +void HostWrite_U16(u32 var, u32 address); void HostWrite_U32(u32 var, u32 address); void HostWrite_U64(u64 var, u32 address); void HostWrite_F32(float var, u32 address); @@ -111,9 +111,9 @@ struct TryWriteResult // If the write succeeds, the returned TryWriteResult contains information on whether the given // address had to be translated or not. Unlike the HostWrite functions, this does not raise a // user-visible alert on failure. 
-TryWriteResult HostTryWriteU8(u8 var, const u32 address, +TryWriteResult HostTryWriteU8(u32 var, const u32 address, RequestedAddressSpace space = RequestedAddressSpace::Effective); -TryWriteResult HostTryWriteU16(u16 var, const u32 address, +TryWriteResult HostTryWriteU16(u32 var, const u32 address, RequestedAddressSpace space = RequestedAddressSpace::Effective); TryWriteResult HostTryWriteU32(u32 var, const u32 address, RequestedAddressSpace space = RequestedAddressSpace::Effective); @@ -158,12 +158,12 @@ double Read_F64(u32 address); u32 Read_U8_ZX(u32 address); u32 Read_U16_ZX(u32 address); -void Write_U8(u8 var, u32 address); -void Write_U16(u16 var, u32 address); +void Write_U8(u32 var, u32 address); +void Write_U16(u32 var, u32 address); void Write_U32(u32 var, u32 address); void Write_U64(u64 var, u32 address); -void Write_U16_Swap(u16 var, u32 address); +void Write_U16_Swap(u32 var, u32 address); void Write_U32_Swap(u32 var, u32 address); void Write_U64_Swap(u64 var, u32 address); From 12629beff8d9e1eb9a23f1f5ac05d6b56abf946f Mon Sep 17 00:00:00 2001 From: JosJuice Date: Mon, 26 Jul 2021 17:34:00 +0200 Subject: [PATCH 3/5] JitArm64: Call swap variants of memory write functions Write_U16_Swap leaves the upper 16 bits alone. Reimplementing this correctly in the JIT would require more than one instruction, so let's just call Write_U16_Swap instead, like Jit64 does. 
--- .../Core/Core/PowerPC/JitArm64/JitArm64_BackPatch.cpp | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/Source/Core/Core/PowerPC/JitArm64/JitArm64_BackPatch.cpp b/Source/Core/Core/PowerPC/JitArm64/JitArm64_BackPatch.cpp index b45bbfc35a..51fa36b7b0 100644 --- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_BackPatch.cpp +++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_BackPatch.cpp @@ -197,18 +197,16 @@ void JitArm64::EmitBackpatchRoutine(u32 flags, bool fastmem, bool do_farcode, AR } else if (flags & BackPatchInfo::FLAG_STORE) { - ARM64Reg temp = ARM64Reg::W0; - temp = ByteswapBeforeStore(this, temp, RS, flags, false); - if (temp != ARM64Reg::W0) - MOV(ARM64Reg::W0, temp); + const bool reverse = (flags & BackPatchInfo::FLAG_REVERSE) != 0; if (flags & BackPatchInfo::FLAG_SIZE_32) - MOVP2R(ARM64Reg::X8, &PowerPC::Write_U32); + MOVP2R(ARM64Reg::X8, reverse ? &PowerPC::Write_U32_Swap : &PowerPC::Write_U32); else if (flags & BackPatchInfo::FLAG_SIZE_16) - MOVP2R(ARM64Reg::X8, &PowerPC::Write_U16); + MOVP2R(ARM64Reg::X8, reverse ? &PowerPC::Write_U16_Swap : &PowerPC::Write_U16); else MOVP2R(ARM64Reg::X8, &PowerPC::Write_U8); + MOV(ARM64Reg::W0, RS); BLR(ARM64Reg::X8); } else if (flags & BackPatchInfo::FLAG_ZERO_256) From 543ed8a97c60cd198d7cb765b8a43992a43fb502 Mon Sep 17 00:00:00 2001 From: JosJuice Date: Mon, 26 Jul 2021 16:51:15 +0200 Subject: [PATCH 4/5] PowerPC: Implement broken masking for uncached unaligned writes This implements the behavior described in https://bugs.dolphin-emu.org/issues/12565. Thank you to eigenform, delroth, phire, marcan, segher, and Extrems for all helping in one way or another with the efforts to reverse engineer this behavior, and to Rylie for reporting the issue. 
--- Source/Core/Core/PowerPC/MMU.cpp | 92 ++++++++++++++++++++------------ 1 file changed, 57 insertions(+), 35 deletions(-) diff --git a/Source/Core/Core/PowerPC/MMU.cpp b/Source/Core/Core/PowerPC/MMU.cpp index 2b53dde377..50bc648726 100644 --- a/Source/Core/Core/PowerPC/MMU.cpp +++ b/Source/Core/Core/PowerPC/MMU.cpp @@ -11,6 +11,7 @@ #include "Common/Assert.h" #include "Common/BitUtils.h" #include "Common/CommonTypes.h" +#include "Common/Logging/Log.h" #include "Core/ConfigManager.h" #include "Core/HW/CPU.h" @@ -277,6 +278,8 @@ static void WriteToHardware(u32 em_address, const u32 data, const u32 size) return; } + bool wi = false; + if (!never_translate && MSR.DR) { auto translated_addr = TranslateAddress(em_address); @@ -287,41 +290,7 @@ static void WriteToHardware(u32 em_address, const u32 data, const u32 size) return; } em_address = translated_addr.address; - } - - const u32 swapped_data = Common::swap32(Common::RotateRight(data, size * 8)); - - if (Memory::m_pRAM && (em_address & 0xF8000000) == 0x00000000) - { - // Handle RAM; the masking intentionally discards bits (essentially creating - // mirrors of memory). - // TODO: Only the first GetRamSizeReal() is supposed to be backed by actual memory. - std::memcpy(&Memory::m_pRAM[em_address & Memory::GetRamMask()], &swapped_data, size); - return; - } - - if (Memory::m_pEXRAM && (em_address >> 28) == 0x1 && - (em_address & 0x0FFFFFFF) < Memory::GetExRamSizeReal()) - { - std::memcpy(&Memory::m_pEXRAM[em_address & 0x0FFFFFFF], &swapped_data, size); - return; - } - - // Locked L1 technically doesn't have a fixed address, but games all use 0xE0000000. 
- if (Memory::m_pL1Cache && (em_address >> 28 == 0xE) && - (em_address < (0xE0000000 + Memory::GetL1CacheSize()))) - { - std::memcpy(&Memory::m_pL1Cache[em_address & 0x0FFFFFFF], &swapped_data, size); - return; - } - - // In Fake-VMEM mode, we need to map the memory somewhere into - // physical memory for BAT translation to work; we currently use - // [0x7E000000, 0x80000000). - if (Memory::m_pFakeVMEM && ((em_address & 0xFE000000) == 0x7E000000)) - { - std::memcpy(&Memory::m_pFakeVMEM[em_address & Memory::GetFakeVMemMask()], &swapped_data, size); - return; + wi = translated_addr.wi; } // Check for a gather pipe write. @@ -381,6 +350,59 @@ static void WriteToHardware(u32 em_address, const u32 data, const u32 size) } } + const u32 swapped_data = Common::swap32(Common::RotateRight(data, size * 8)); + + // Locked L1 technically doesn't have a fixed address, but games all use 0xE0000000. + if (Memory::m_pL1Cache && (em_address >> 28 == 0xE) && + (em_address < (0xE0000000 + Memory::GetL1CacheSize()))) + { + std::memcpy(&Memory::m_pL1Cache[em_address & 0x0FFFFFFF], &swapped_data, size); + return; + } + + if (wi && (size < 4 || (em_address & 0x3))) + { + // When a write to memory is performed in hardware, 64 bits of data are sent to the memory + // controller along with a mask. This mask is encoded using just two bits of data - one for + // the upper 32 bits and one for the lower 32 bits - which leads to some odd data duplication + // behavior for write-through/cache-inhibited writes with a start address or end address that + // isn't 32-bit aligned. See https://bugs.dolphin-emu.org/issues/12565 for details. 
+ + const u32 rotated_data = Common::RotateRight(data, ((em_address & 0x3) + size) * 8); + + for (u32 addr = em_address & ~0x7; addr < em_address + size; addr += 8) + { + WriteToHardware(addr, rotated_data, 4); + WriteToHardware(addr + 4, rotated_data, 4); + } + + return; + } + + if (Memory::m_pRAM && (em_address & 0xF8000000) == 0x00000000) + { + // Handle RAM; the masking intentionally discards bits (essentially creating + // mirrors of memory). + std::memcpy(&Memory::m_pRAM[em_address & Memory::GetRamMask()], &swapped_data, size); + return; + } + + if (Memory::m_pEXRAM && (em_address >> 28) == 0x1 && + (em_address & 0x0FFFFFFF) < Memory::GetExRamSizeReal()) + { + std::memcpy(&Memory::m_pEXRAM[em_address & 0x0FFFFFFF], &swapped_data, size); + return; + } + + // In Fake-VMEM mode, we need to map the memory somewhere into + // physical memory for BAT translation to work; we currently use + // [0x7E000000, 0x80000000). + if (Memory::m_pFakeVMEM && ((em_address & 0xFE000000) == 0x7E000000)) + { + std::memcpy(&Memory::m_pFakeVMEM[em_address & Memory::GetFakeVMemMask()], &swapped_data, size); + return; + } + PanicAlertFmt("Unable to resolve write address {:x} PC {:x}", em_address, PC); } // ===================== From f333c0949fb95a9d3e0ad49f9f1852c9ad6068f3 Mon Sep 17 00:00:00 2001 From: JosJuice Date: Mon, 26 Jul 2021 17:48:47 +0200 Subject: [PATCH 5/5] PowerPC: Implement PI interrupt for uncached unaligned writes --- Source/Core/Core/PowerPC/MMU.cpp | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/Source/Core/Core/PowerPC/MMU.cpp b/Source/Core/Core/PowerPC/MMU.cpp index 50bc648726..3c70a5319c 100644 --- a/Source/Core/Core/PowerPC/MMU.cpp +++ b/Source/Core/Core/PowerPC/MMU.cpp @@ -18,6 +18,7 @@ #include "Core/HW/GPFifo.h" #include "Core/HW/MMIO.h" #include "Core/HW/Memmap.h" +#include "Core/HW/ProcessorInterface.h" #include "Core/PowerPC/JitInterface.h" #include "Core/PowerPC/PowerPC.h" @@ -368,6 +369,11 @@ static void WriteToHardware(u32 em_address, const u32 
data, const u32 size) // behavior for write-through/cache-inhibited writes with a start address or end address that // isn't 32-bit aligned. See https://bugs.dolphin-emu.org/issues/12565 for details. + // TODO: This interrupt is supposed to have associated cause and address registers + // TODO: This should trigger the hwtest's interrupt handling, but it does not seem to + // (https://github.com/dolphin-emu/hwtests/pull/42) + ProcessorInterface::SetInterrupt(ProcessorInterface::INT_CAUSE_PI); + const u32 rotated_data = Common::RotateRight(data, ((em_address & 0x3) + size) * 8); for (u32 addr = em_address & ~0x7; addr < em_address + size; addr += 8)