From a2f711c04852558bdd025de3367655ff5db4d045 Mon Sep 17 00:00:00 2001 From: DesperateProgrammer Date: Thu, 18 Jan 2024 09:39:33 +0100 Subject: [PATCH 001/306] Added CP15 Data and Instruction Cache Lockdown Register --- src/ARM.h | 1 + src/CP15.cpp | 23 +++++++++++++++++++++++ 2 files changed, 24 insertions(+) diff --git a/src/ARM.h b/src/ARM.h index 1e0b71b8..c6711333 100644 --- a/src/ARM.h +++ b/src/ARM.h @@ -322,6 +322,7 @@ public: u32 RNGSeed; u32 DTCMSetting, ITCMSetting; + u32 DCacheLockDown, ICacheLockDown; // for aarch64 JIT they need to go up here // to be addressable by a 12-bit immediate diff --git a/src/CP15.cpp b/src/CP15.cpp index 58137fdd..ebebf975 100644 --- a/src/CP15.cpp +++ b/src/CP15.cpp @@ -55,6 +55,9 @@ void ARMv5::CP15Reset() DTCMBase = 0xFFFFFFFF; DTCMMask = 0; + ICacheLockDown = 0; + DCacheLockDown = 0; + memset(ICache, 0, 0x2000); ICacheInvalidateAll(); memset(ICacheCount, 0, 64); @@ -628,6 +631,22 @@ void ARMv5::CP15Write(u32 id, u32 val) //printf("flush data cache SI\n"); return; + case 0x900: + // Cache Lockdown - Format B + // Bit 31: Lock bit + // Bit 0..Way-1: locked ways + // The Cache is 4 way associative + // But all bits are r/w + DCacheLockDown = val ; + return; + case 0x901: + // Cache Lockdown - Format B + // Bit 31: Lock bit + // Bit 0..Way-1: locked ways + // The Cache is 4 way associative + // But all bits are r/w + ICacheLockDown = val; + return; case 0x910: DTCMSetting = val & 0xFFFFF03E; @@ -751,6 +770,10 @@ u32 ARMv5::CP15Read(u32 id) const case 0x671: return PU_Region[(id >> 4) & 0xF]; + case 0x900: + return DCacheLockDown; + case 0x901: + return ICacheLockDown; case 0x910: return DTCMSetting; From 716b4af81548614200cdfa560079c9c3b7a8e896 Mon Sep 17 00:00:00 2001 From: DesperateProgrammer Date: Thu, 18 Jan 2024 14:35:03 +0100 Subject: [PATCH 002/306] Added ICacheLockDown and DCacheLockDown to CP15 savestate --- src/CP15.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/CP15.cpp b/src/CP15.cpp index 
ebebf975..b4bc5050 100644 --- a/src/CP15.cpp +++ b/src/CP15.cpp @@ -87,6 +87,9 @@ void ARMv5::CP15DoSavestate(Savestate* file) file->VarArray(ITCM, ITCMPhysicalSize); file->VarArray(DTCM, DTCMPhysicalSize); + file->Var32(&DCacheLockDown); + file->Var32(&ICacheLockDown); + file->Var32(&PU_CodeCacheable); file->Var32(&PU_DataCacheable); file->Var32(&PU_DataCacheWrite); From a4f8e6fe29ac24f15ecd1dc96382fb38244e45eb Mon Sep 17 00:00:00 2001 From: DesperateProgrammer Date: Thu, 18 Jan 2024 14:39:08 +0100 Subject: [PATCH 003/306] Updated savestate version --- src/Savestate.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Savestate.h b/src/Savestate.h index 2e1400a0..52c7e294 100644 --- a/src/Savestate.h +++ b/src/Savestate.h @@ -24,7 +24,7 @@ #include #include "types.h" -#define SAVESTATE_MAJOR 12 +#define SAVESTATE_MAJOR 13 #define SAVESTATE_MINOR 1 namespace melonDS From 1019afee92778a718301b77b6c8af75123e6c8be Mon Sep 17 00:00:00 2001 From: DesperateProgrammer Date: Fri, 19 Jan 2024 08:30:38 +0100 Subject: [PATCH 004/306] Cleaned up magic numbers and simplified (not yet used) ICache functions Marked reading CP15 Cache Dirty Bit as not present --- src/CP15.cpp | 101 ++++++++++++++++++++------------------------- src/MemConstants.h | 14 +++++++ 2 files changed, 58 insertions(+), 57 deletions(-) diff --git a/src/CP15.cpp b/src/CP15.cpp index b4bc5050..916f1631 100644 --- a/src/CP15.cpp +++ b/src/CP15.cpp @@ -58,9 +58,9 @@ void ARMv5::CP15Reset() ICacheLockDown = 0; DCacheLockDown = 0; - memset(ICache, 0, 0x2000); + memset(ICache, 0, ICACHE_SIZE); ICacheInvalidateAll(); - memset(ICacheCount, 0, 64); + memset(ICacheCount, 0, ICACHE_LINESPERSET); PU_CodeCacheable = 0; PU_DataCacheable = 0; @@ -340,42 +340,26 @@ u32 ARMv5::RandomLineIndex() void ARMv5::ICacheLookup(u32 addr) { - u32 tag = addr & 0xFFFFF800; - u32 id = (addr >> 5) & 0x3F; + u32 tag = addr & ~(ICACHE_LINESPERSET * ICACHE_LINELENGTH - 1); + u32 id = (addr >> ICACHE_LINELENGTH_LOG2) & 
(ICACHE_LINESPERSET-1); - id <<= 2; - if (ICacheTags[id+0] == tag) + id <<= ICACHE_SETS_LOG2; + for (int set=0;set>2]; - ICacheCount[id>>2] = (line+1) & 0x3; + line = ICacheCount[id>>ICACHE_SETS_LOG2]; + ICacheCount[id>>ICACHE_SETS_LOG2] = (line+1) & (ICACHE_SETS-1); } else { @@ -384,16 +368,16 @@ void ARMv5::ICacheLookup(u32 addr) line += id; - addr &= ~0x1F; - u8* ptr = &ICache[line << 5]; + addr &= ~(ICACHE_LINELENGTH-1); + u8* ptr = &ICache[line << ICACHE_LINELENGTH_LOG2]; if (CodeMem.Mem) { - memcpy(ptr, &CodeMem.Mem[addr & CodeMem.Mask], 32); + memcpy(ptr, &CodeMem.Mem[addr & CodeMem.Mask], ICACHE_LINELENGTH); } else { - for (int i = 0; i < 32; i+=4) + for (int i = 0; i < ICACHE_LINELENGTH; i+=sizeof(u32)) *(u32*)&ptr[i] = NDS.ARM9Read32(addr+i); } @@ -407,35 +391,29 @@ void ARMv5::ICacheLookup(u32 addr) void ARMv5::ICacheInvalidateByAddr(u32 addr) { - u32 tag = addr & 0xFFFFF800; - u32 id = (addr >> 5) & 0x3F; + u32 tag = addr & ~(ICACHE_LINESPERSET * ICACHE_LINELENGTH - 1); + u32 id = (addr >> ICACHE_LINELENGTH_LOG2) & (ICACHE_LINESPERSET-1); - id <<= 2; - if (ICacheTags[id+0] == tag) + id <<= ICACHE_SETS_LOG2; + for (int set=0;set> 4) & 0xF]; + case 0x7A6: + // read Cache Dirty Bit (optional) + // it is not present on the NDS/DSi + return 0; + case 0x900: return DCacheLockDown; case 0x901: diff --git a/src/MemConstants.h b/src/MemConstants.h index e9aa6b2b..ccd1ea00 100644 --- a/src/MemConstants.h +++ b/src/MemConstants.h @@ -34,6 +34,20 @@ constexpr u32 ITCMPhysicalSize = 0x8000; constexpr u32 DTCMPhysicalSize = 0x4000; constexpr u32 ARM7BIOSCRC32 = 0x1280f0d5; constexpr u32 ARM9BIOSCRC32 = 0x2ab23573; + +constexpr u32 ICACHE_SIZE_LOG2 = 13; +constexpr u32 ICACHE_SIZE = 1 << ICACHE_SIZE_LOG2; +constexpr u32 ICACHE_SETS_LOG2 = 2; +constexpr u32 ICACHE_SETS = 1 << ICACHE_SETS_LOG2; +constexpr u32 ICACHE_LINELENGTH_ENCODED = 2; +constexpr u32 ICACHE_LINELENGTH_LOG2 = ICACHE_LINELENGTH_ENCODED + 3; +constexpr u32 ICACHE_LINELENGTH = 8 * (1 << 
ICACHE_LINELENGTH_ENCODED); +constexpr u32 ICACHE_LINESPERSET = ICACHE_SIZE / (ICACHE_SETS * ICACHE_LINELENGTH); + +constexpr u32 CP15_CACHE_CR_ROUNDROBIN = (1 < 14); +constexpr u32 CP15_CACHE_CR_ICACHEENABLE = (1 < 12); +constexpr u32 CP15_CACHE_CR_DCACHEENABLE = (1 < 2); +constexpr u32 CP15_CACHE_CR_WRITEBUFFERENABLE = (1 < 3); } #endif // MELONDS_MEMCONSTANTS_H \ No newline at end of file From 434c234098dcefaeeebeefc7c699f0541b02d1fe Mon Sep 17 00:00:00 2001 From: DesperateProgrammer Date: Fri, 19 Jan 2024 10:00:02 +0100 Subject: [PATCH 005/306] Enable instruction cache routines. Fixed typos in constants. --- src/ARM.h | 1 + src/CP15.cpp | 18 +++++++++++++++++- src/MemConstants.h | 9 +++++---- 3 files changed, 23 insertions(+), 5 deletions(-) diff --git a/src/ARM.h b/src/ARM.h index c6711333..64921654 100644 --- a/src/ARM.h +++ b/src/ARM.h @@ -313,6 +313,7 @@ public: void ICacheLookup(u32 addr); void ICacheInvalidateByAddr(u32 addr); void ICacheInvalidateAll(); + bool IsAddressICachable(u32 addr); void CP15Write(u32 id, u32 val); u32 CP15Read(u32 id) const; diff --git a/src/CP15.cpp b/src/CP15.cpp index 916f1631..0573f974 100644 --- a/src/CP15.cpp +++ b/src/CP15.cpp @@ -417,6 +417,11 @@ void ARMv5::ICacheInvalidateAll() ICacheTags[i] = 1; } +bool ARMv5::IsAddressICachable(u32 addr) +{ + return PU_Map[addr >> 12] & 0x40 ; +} + void ARMv5::CP15Write(u32 id, u32 val) { @@ -430,7 +435,7 @@ void ARMv5::CP15Write(u32 id, u32 val) val &= 0x000FF085; CP15Control &= ~0x000FF085; CP15Control |= val; - //printf("CP15Control = %08X (%08X->%08X)\n", CP15Control, old, val); + //Log(LogLevel::Debug, "CP15Control = %08X (%08X->%08X)\n", CP15Control, old, val); UpdateDTCMSetting(); UpdateITCMSetting(); if ((old & 0x1005) != (val & 0x1005)) @@ -800,6 +805,7 @@ u32 ARMv5::CodeRead32(u32 addr, bool branch) } CodeCycles = RegionCodeCycles; +#if 0 if (CodeCycles == 0xFF) // cached memory. 
hax { if (branch || !(addr & 0x1F)) @@ -809,6 +815,16 @@ u32 ARMv5::CodeRead32(u32 addr, bool branch) //return *(u32*)&CurICacheLine[addr & 0x1C]; } +#else + if (CP15Control & CP15_CACHE_CR_ICACHEENABLE) + { + if (IsAddressICachable(addr)) + { + ICacheLookup(addr); + return *(u32*)&CurICacheLine[addr & (ICACHE_LINELENGTH - 4)]; + } + } +#endif if (CodeMem.Mem) return *(u32*)&CodeMem.Mem[addr & CodeMem.Mask]; diff --git a/src/MemConstants.h b/src/MemConstants.h index ccd1ea00..fbb37523 100644 --- a/src/MemConstants.h +++ b/src/MemConstants.h @@ -44,10 +44,11 @@ constexpr u32 ICACHE_LINELENGTH_LOG2 = ICACHE_LINELENGTH_ENCODED + 3; constexpr u32 ICACHE_LINELENGTH = 8 * (1 << ICACHE_LINELENGTH_ENCODED); constexpr u32 ICACHE_LINESPERSET = ICACHE_SIZE / (ICACHE_SETS * ICACHE_LINELENGTH); -constexpr u32 CP15_CACHE_CR_ROUNDROBIN = (1 < 14); -constexpr u32 CP15_CACHE_CR_ICACHEENABLE = (1 < 12); -constexpr u32 CP15_CACHE_CR_DCACHEENABLE = (1 < 2); -constexpr u32 CP15_CACHE_CR_WRITEBUFFERENABLE = (1 < 3); +constexpr u32 CP15_CR_MPUENABLE = (1 << 0); +constexpr u32 CP15_CACHE_CR_ROUNDROBIN = (1 << 14); +constexpr u32 CP15_CACHE_CR_ICACHEENABLE = (1 << 12); +constexpr u32 CP15_CACHE_CR_DCACHEENABLE = (1 << 2); +constexpr u32 CP15_CACHE_CR_WRITEBUFFERENABLE = (1 << 3); } #endif // MELONDS_MEMCONSTANTS_H \ No newline at end of file From d9fcc2ec2c47ea91a1e5542f7fdcf30c3536e398 Mon Sep 17 00:00:00 2001 From: DesperateProgrammer Date: Fri, 19 Jan 2024 10:24:02 +0100 Subject: [PATCH 006/306] Replaced more CP15 magic values with named constants --- src/CP15.cpp | 22 +++++++++++----------- src/MemConstants.h | 5 +++++ 2 files changed, 16 insertions(+), 11 deletions(-) diff --git a/src/CP15.cpp b/src/CP15.cpp index 0573f974..781f1253 100644 --- a/src/CP15.cpp +++ b/src/CP15.cpp @@ -114,7 +114,7 @@ void ARMv5::UpdateDTCMSetting() u32 newDTCMMask; u32 newDTCMSize; - if (CP15Control & (1<<16)) + if (CP15Control & CP15_TCM_CR_DTCM_ENABLE) { newDTCMSize = 0x200 << ((DTCMSetting >> 1) & 
0x1F); if (newDTCMSize < 0x1000) newDTCMSize = 0x1000; @@ -138,7 +138,7 @@ void ARMv5::UpdateDTCMSetting() void ARMv5::UpdateITCMSetting() { - if (CP15Control & (1<<18)) + if (CP15Control & CP15_TCM_CR_ITCM_ENABLE) { ITCMSize = 0x200 << ((ITCMSetting >> 1) & 0x1F); #ifdef JIT_ENABLED @@ -156,7 +156,7 @@ void ARMv5::UpdateITCMSetting() // (not to the region range/enabled status) void ARMv5::UpdatePURegion(u32 n) { - if (!(CP15Control & (1<<0))) + if (!(CP15Control & CP15_CR_MPUENABLE)) return; u32 coderw = (PU_CodeRW >> (4*n)) & 0xF; @@ -170,12 +170,12 @@ void ARMv5::UpdatePURegion(u32 n) // 1/0: goes to memory and cache // 1/1: goes to cache - if (CP15Control & (1<<12)) + if (CP15Control & CP15_CACHE_CR_ICACHEENABLE) codecache = (PU_CodeCacheable >> n) & 0x1; else codecache = 0; - if (CP15Control & (1<<2)) + if (CP15Control & CP15_CACHE_CR_DCACHEENABLE) { datacache = (PU_DataCacheable >> n) & 0x1; datawrite = (PU_DataCacheWrite >> n) & 0x1; @@ -263,13 +263,13 @@ void ARMv5::UpdatePURegion(u32 n) void ARMv5::UpdatePURegions(bool update_all) { - if (!(CP15Control & (1<<0))) + if (!(CP15Control & CP15_CR_MPUENABLE)) { // PU disabled u8 mask = 0x07; - if (CP15Control & (1<<2)) mask |= 0x30; - if (CP15Control & (1<<12)) mask |= 0x40; + if (CP15Control & CP15_CACHE_CR_DCACHEENABLE) mask |= 0x30; + if (CP15Control & CP15_CACHE_CR_ICACHEENABLE) mask |= 0x40; memset(PU_UserMap, mask, 0x100000); memset(PU_PrivMap, mask, 0x100000); @@ -442,9 +442,9 @@ void ARMv5::CP15Write(u32 id, u32 val) { UpdatePURegions((old & 0x1) != (val & 0x1)); } - if (val & (1<<7)) Log(LogLevel::Warn, "!!!! ARM9 BIG ENDIAN MODE. VERY BAD. SHIT GONNA ASPLODE NOW\n"); - if (val & (1<<13)) ExceptionBase = 0xFFFF0000; - else ExceptionBase = 0x00000000; + if (val & CP15_CR_BIGENDIAN) Log(LogLevel::Warn, "!!!! ARM9 BIG ENDIAN MODE. VERY BAD. 
SHIT GONNA ASPLODE NOW\n"); + if (val & CP15_CR_HIGHEXCEPTIONBASE) ExceptionBase = 0xFFFF0000; + else ExceptionBase = 0x00000000; } return; diff --git a/src/MemConstants.h b/src/MemConstants.h index fbb37523..f06edd96 100644 --- a/src/MemConstants.h +++ b/src/MemConstants.h @@ -45,10 +45,15 @@ constexpr u32 ICACHE_LINELENGTH = 8 * (1 << ICACHE_LINELENGTH_ENCODED); constexpr u32 ICACHE_LINESPERSET = ICACHE_SIZE / (ICACHE_SETS * ICACHE_LINELENGTH); constexpr u32 CP15_CR_MPUENABLE = (1 << 0); +constexpr u32 CP15_CR_BIGENDIAN = (1 << 7); +constexpr u32 CP15_CR_HIGHEXCEPTIONBASE = (1 << 13); constexpr u32 CP15_CACHE_CR_ROUNDROBIN = (1 << 14); constexpr u32 CP15_CACHE_CR_ICACHEENABLE = (1 << 12); constexpr u32 CP15_CACHE_CR_DCACHEENABLE = (1 << 2); constexpr u32 CP15_CACHE_CR_WRITEBUFFERENABLE = (1 << 3); +constexpr u32 CP15_TCM_CR_DTCM_ENABLE = (1 << 16); +constexpr u32 CP15_TCM_CR_ITCM_ENABLE = (1 << 18); + } #endif // MELONDS_MEMCONSTANTS_H \ No newline at end of file From b6b0197dd3e1da1df3a14e0446772885f271a8ff Mon Sep 17 00:00:00 2001 From: DesperateProgrammer Date: Fri, 19 Jan 2024 10:38:07 +0100 Subject: [PATCH 007/306] Implemented "weird" instruction cache invalidation by Set/Way --- src/ARM.h | 2 ++ src/CP15.cpp | 22 +++++++++++++++++++++- 2 files changed, 23 insertions(+), 1 deletion(-) diff --git a/src/ARM.h b/src/ARM.h index 64921654..e9611b81 100644 --- a/src/ARM.h +++ b/src/ARM.h @@ -312,6 +312,8 @@ public: void ICacheLookup(u32 addr); void ICacheInvalidateByAddr(u32 addr); + void ICacheInvalidateBySetAndWay(u8 cacheSet, u8 cacheLine); + void ICacheInvalidateAll(); bool IsAddressICachable(u32 addr); diff --git a/src/CP15.cpp b/src/CP15.cpp index 781f1253..22285cf7 100644 --- a/src/CP15.cpp +++ b/src/CP15.cpp @@ -408,6 +408,21 @@ void ARMv5::ICacheInvalidateByAddr(u32 addr) } } +void ARMv5::ICacheInvalidateBySetAndWay(u8 cacheSet, u8 cacheLine) +{ + if (cacheSet >= ICACHE_SETS) + return; + if (cacheLine >= ICACHE_LINESPERSET) + return; + + u32 idx = 
(cacheLine << ICACHE_SETS_LOG2) + cacheSet; + // TODO: is this a valid magic number? + // it should indicate that no address is loaded here, instead + // a tag of 1 indicates that addr 0x00000800.. 0x0000FBF (depending on id) ist stored at this set. + ICacheTags[idx] = 1; +} + + void ARMv5::ICacheInvalidateAll() { // TODO: is this a valid magic number? @@ -598,7 +613,12 @@ void ARMv5::CP15Write(u32 id, u32 val) //Halt(255); return; case 0x752: - Log(LogLevel::Warn, "CP15: ICACHE INVALIDATE WEIRD. %08X\n", val); + { + // Cache invalidat by line number and set number + u8 cacheSet = val >> (32 - ICACHE_SETS_LOG2) & (ICACHE_SETS -1); + u8 cacheLine = (val >> ICACHE_LINELENGTH_LOG2) & (ICACHE_LINESPERSET -1); + ICacheInvalidateBySetAndWay(cacheSet, cacheLine); + } //Halt(255); return; From 0a07661b5744587196009fd6a2515c66817fe53d Mon Sep 17 00:00:00 2001 From: DesperateProgrammer Date: Wed, 24 Jan 2024 09:51:56 +0100 Subject: [PATCH 008/306] Implemented DCache and several CP15 registers --- src/ARM.h | 30 +++- src/CP15.cpp | 355 +++++++++++++++++++++++++++++++++++++++++---- src/MemConstants.h | 13 ++ 3 files changed, 363 insertions(+), 35 deletions(-) diff --git a/src/ARM.h b/src/ARM.h index e9611b81..99ef4baf 100644 --- a/src/ARM.h +++ b/src/ARM.h @@ -311,11 +311,25 @@ public: u32 RandomLineIndex(); void ICacheLookup(u32 addr); + bool IsAddressICachable(u32 addr); + + void ICacheInvalidateAll(); void ICacheInvalidateByAddr(u32 addr); void ICacheInvalidateBySetAndWay(u8 cacheSet, u8 cacheLine); - void ICacheInvalidateAll(); - bool IsAddressICachable(u32 addr); + + void DCacheLookup(u32 addr); + bool IsAddressDCachable(u32 addr); + + void DCacheInvalidateAll(); + void DCacheInvalidateByAddr(u32 addr); + void DCacheInvalidateBySetAndWay(u8 cacheSet, u8 cacheLine); + + void DCacheClearAll(); + void DCacheClearByAddr(u32 addr); + void DCacheClearByASetAndWay(u8 cacheSet, u8 cacheLine); + + void CP15Write(u32 id, u32 val); u32 CP15Read(u32 id) const; @@ -326,6 +340,7 @@ 
public: u32 DTCMSetting, ITCMSetting; u32 DCacheLockDown, ICacheLockDown; + u32 CacheDebugRegisterIndex; // for aarch64 JIT they need to go up here // to be addressable by a 12-bit immediate @@ -336,9 +351,13 @@ public: u8 ITCM[ITCMPhysicalSize]; u8* DTCM; - u8 ICache[0x2000]; - u32 ICacheTags[64*4]; - u8 ICacheCount[64]; + u8 ICache[ICACHE_SIZE]; + u32 ICacheTags[ICACHE_LINESPERSET*ICACHE_SETS]; + u8 ICacheCount; + + u8 DCache[DCACHE_SIZE]; + u32 DCacheTags[DCACHE_LINESPERSET*DCACHE_SETS]; + u8 DCacheCount; u32 PU_CodeCacheable; u32 PU_DataCacheable; @@ -361,6 +380,7 @@ public: u8 MemTimings[0x100000][4]; u8* CurICacheLine; + u8* CurDCacheLine; bool (*GetMemRegion)(u32 addr, bool write, MemRegion* region); diff --git a/src/CP15.cpp b/src/CP15.cpp index 22285cf7..ea3acddf 100644 --- a/src/CP15.cpp +++ b/src/CP15.cpp @@ -57,10 +57,15 @@ void ARMv5::CP15Reset() ICacheLockDown = 0; DCacheLockDown = 0; + CacheDebugRegisterIndex = 0; memset(ICache, 0, ICACHE_SIZE); ICacheInvalidateAll(); - memset(ICacheCount, 0, ICACHE_LINESPERSET); + ICacheCount = 0; + + memset(DCache, 0, DCACHE_SIZE); + DCacheInvalidateAll(); + DCacheCount = 0; PU_CodeCacheable = 0; PU_DataCacheable = 0; @@ -87,8 +92,17 @@ void ARMv5::CP15DoSavestate(Savestate* file) file->VarArray(ITCM, ITCMPhysicalSize); file->VarArray(DTCM, DTCMPhysicalSize); + file->VarArray(ICache, sizeof(ICache)); + file->VarArray(ICacheTags, sizeof(ICacheTags)); + file->Var8(&ICacheCount); + + file->VarArray(DCache, sizeof(DCache)); + file->VarArray(DCacheTags, sizeof(DCacheTags)); + file->Var8(&DCacheCount); + file->Var32(&DCacheLockDown); file->Var32(&ICacheLockDown); + file->Var32(&CacheDebugRegisterIndex); file->Var32(&PU_CodeCacheable); file->Var32(&PU_DataCacheable); @@ -340,13 +354,13 @@ u32 ARMv5::RandomLineIndex() void ARMv5::ICacheLookup(u32 addr) { - u32 tag = addr & ~(ICACHE_LINESPERSET * ICACHE_LINELENGTH - 1); + u32 tag = (addr & ~(ICACHE_LINELENGTH - 1)) | CACHE_FLAG_VALID; u32 id = (addr >> 
ICACHE_LINELENGTH_LOG2) & (ICACHE_LINESPERSET-1); id <<= ICACHE_SETS_LOG2; for (int set=0;set>ICACHE_SETS_LOG2]; - ICacheCount[id>>ICACHE_SETS_LOG2] = (line+1) & (ICACHE_SETS-1); + line = ICacheCount; + ICacheCount = (line+1) & (ICACHE_SETS-1); } else { @@ -381,7 +395,7 @@ void ARMv5::ICacheLookup(u32 addr) *(u32*)&ptr[i] = NDS.ARM9Read32(addr+i); } - ICacheTags[line] = tag; + ICacheTags[line] = addr | (line & (ICACHE_SETS-1)) | CACHE_FLAG_VALID; // ouch :/ //printf("cache miss %08X: %d/%d\n", addr, NDS::ARM9MemTimings[addr >> 14][2], NDS::ARM9MemTimings[addr >> 14][3]); @@ -391,18 +405,15 @@ void ARMv5::ICacheLookup(u32 addr) void ARMv5::ICacheInvalidateByAddr(u32 addr) { - u32 tag = addr & ~(ICACHE_LINESPERSET * ICACHE_LINELENGTH - 1); + u32 tag = (addr & ~(ICACHE_LINELENGTH - 1)) | CACHE_FLAG_VALID; u32 id = (addr >> ICACHE_LINELENGTH_LOG2) & (ICACHE_LINESPERSET-1); id <<= ICACHE_SETS_LOG2; for (int set=0;set> 12] & 0x40 ; } +void ARMv5::DCacheLookup(u32 addr) +{ + //Log(LogLevel::Debug,"DCache load @ %08x\n", addr); + addr &= ~3; + u32 tag = (addr & ~(DCACHE_LINELENGTH - 1)) | CACHE_FLAG_VALID; + u32 id = (addr >> DCACHE_LINELENGTH_LOG2) & (DCACHE_LINESPERSET-1); + + id <<= DCACHE_SETS_LOG2; + for (int set=0;set %08lx\n", addr, ((u32 *)CurDCacheLine)[(addr & (DCACHE_LINELENGTH-1)) >> 2]); + return; + } + } + + // cache miss + u32 line; + if (CP15Control & CP15_CACHE_CR_ROUNDROBIN) + { + line = DCacheCount; + DCacheCount = (line+1) & (DCACHE_SETS-1); + } + else + { + line = RandomLineIndex(); + } + + line += id; + + addr &= ~(DCACHE_LINELENGTH-1); + u8* ptr = &DCache[line << DCACHE_LINELENGTH_LOG2]; + + //Log(LogLevel::Debug,"DCache miss, load @ %08x\n", addr); + for (int i = 0; i < DCACHE_LINELENGTH; i+=sizeof(u32)) + { + //DataRead32S(addr+i, (u32*)&ptr[i]); + if (addr+i < ITCMSize) + { + *((u32*)&ptr[i]) = *(u32*)&ITCM[addr & (ITCMPhysicalSize - 1)]; + } else + if (((addr+i) & DTCMMask) == DTCMBase) + { + *((u32*)&ptr[i]) = *(u32*)&DTCM[addr & 
(DTCMPhysicalSize - 1)]; + } else + { + *((u32*)&ptr[i]) = BusRead32(addr+i); + + } + //Log(LogLevel::Debug,"DCache store @ %08x: %08x\n", addr+i, *(u32*)&ptr[i]); + } + + DCacheTags[line] = addr | (line & (DCACHE_SETS-1)) | CACHE_FLAG_VALID; + + // ouch :/ + //printf("cache miss %08X: %d/%d\n", addr, NDS::ARM9MemTimings[addr >> 14][2], NDS::ARM9MemTimings[addr >> 14][3]); + DataCycles = (NDS.ARM9MemTimings[addr >> 14][2] + (NDS.ARM9MemTimings[addr >> 14][3] * 7)) << NDS.ARM9ClockShift; + CurDCacheLine = ptr; +} + +void ARMv5::DCacheInvalidateByAddr(u32 addr) +{ + u32 tag = (addr & ~(DCACHE_LINELENGTH - 1)) | CACHE_FLAG_VALID; + u32 id = (addr >> DCACHE_LINELENGTH_LOG2) & (DCACHE_LINESPERSET-1); + + id <<= DCACHE_SETS_LOG2; + for (int set=0;set= DCACHE_SETS) + return; + if (cacheLine >= DCACHE_LINESPERSET) + return; + + u32 idx = (cacheLine << DCACHE_SETS_LOG2) + cacheSet; + DCacheTags[idx] &= ~CACHE_FLAG_VALID; ; +} + + +void ARMv5::DCacheInvalidateAll() +{ + for (int i = 0; i < DCACHE_SIZE / DCACHE_LINELENGTH; i++) + DCacheTags[i] &= ~CACHE_FLAG_VALID; ; +} + +void ARMv5::DCacheClearAll() +{ + // TODO: right now any write to cached data goes straight to the + // underlying memory and invalidates the cache line. +} + +void ARMv5::DCacheClearByAddr(u32 addr) +{ + // TODO: right now any write to cached data goes straight to the + // underlying memory and invalidates the cache line. +} + +void ARMv5::DCacheClearByASetAndWay(u8 cacheSet, u8 cacheLine) +{ + // TODO: right now any write to cached data goes straight to the + // underlying memory and invalidates the cache line. 
+} + +bool ARMv5::IsAddressDCachable(u32 addr) +{ + return PU_Map[addr >> 12] & 0x10 ; +} void ARMv5::CP15Write(u32 id, u32 val) { @@ -623,23 +748,75 @@ void ARMv5::CP15Write(u32 id, u32 val) return; - case 0x761: + case 0x760: + DCacheInvalidateAll(); //printf("inval data cache %08X\n", val); return; - case 0x762: + case 0x761: + DCacheInvalidateByAddr(val); //printf("inval data cache SI\n"); return; + case 0x762: + { + // Cache invalidat by line number and set number + u8 cacheSet = val >> (32 - DCACHE_SETS_LOG2) & (DCACHE_SETS -1); + u8 cacheLine = (val >> DCACHE_LINELENGTH_LOG2) & (DCACHE_LINESPERSET -1); + DCacheInvalidateBySetAndWay(cacheSet, cacheLine); + } + return; + case 0x770: + // invalidate both caches + ICacheInvalidateAll(); + DCacheInvalidateAll(); + break; + + case 0x7A0: + //Log(LogLevel::Debug,"clean data cache\n"); + DCacheClearAll(); + return; case 0x7A1: - //printf("flush data cache %08X\n", val); + //Log(LogLevel::Debug,"clean data cache MVA\n"); + DCacheClearByAddr(val); return; case 0x7A2: - //printf("flush data cache SI\n"); + //Log(LogLevel::Debug,"clean data cache SET/WAY\n"); + { + // Cache invalidat by line number and set number + u8 cacheSet = val >> (32 - DCACHE_SETS_LOG2) & (DCACHE_SETS -1); + u8 cacheLine = (val >> DCACHE_LINELENGTH_LOG2) & (DCACHE_LINESPERSET -1); + DCacheClearByASetAndWay(cacheSet, cacheLine); + } return; case 0x7A3: // Test and clean (optional) // Is not present on the NDS/DSi - return; + return; + + case 0x7D1: + Log(LogLevel::Debug,"Prefetch instruction cache MVA\n"); + break; + + case 0x7E0: + //Log(LogLevel::Debug,"clean & invalidate data cache\n"); + DCacheClearAll(); + DCacheInvalidateAll(); + return; + case 0x7E1: + //Log(LogLevel::Debug,"clean & invalidate data cache MVA\n"); + DCacheClearByAddr(val); + DCacheInvalidateByAddr(val); + return; + case 0x7E2: + //Log(LogLevel::Debug,"clean & invalidate data cache SET/WAY\n"); + { + // Cache invalidat by line number and set number + u8 cacheSet = val >> (32 - 
DCACHE_SETS_LOG2) & (DCACHE_SETS -1); + u8 cacheLine = (val >> DCACHE_LINELENGTH_LOG2) & (DCACHE_LINESPERSET -1); + DCacheClearByASetAndWay(cacheSet, cacheLine); + DCacheInvalidateBySetAndWay(cacheSet, cacheLine); + } + return; case 0x900: // Cache Lockdown - Format B @@ -648,6 +825,7 @@ void ARMv5::CP15Write(u32 id, u32 val) // The Cache is 4 way associative // But all bits are r/w DCacheLockDown = val ; + Log(LogLevel::Debug,"ICacheLockDown\n"); return; case 0x901: // Cache Lockdown - Format B @@ -656,6 +834,7 @@ void ARMv5::CP15Write(u32 id, u32 val) // The Cache is 4 way associative // But all bits are r/w ICacheLockDown = val; + Log(LogLevel::Debug,"ICacheLockDown\n"); return; case 0x910: @@ -669,16 +848,27 @@ void ARMv5::CP15Write(u32 id, u32 val) return; case 0xF00: - //printf("cache debug index register %08X\n", val); + CacheDebugRegisterIndex = val; return; case 0xF10: - //printf("cache debug instruction tag %08X\n", val); - return; + // instruction cache Tag register + { + uint8_t segment = (CacheDebugRegisterIndex >> (32-ICACHE_SETS_LOG2)) & (ICACHE_SETS-1); + uint8_t wordAddress = (CacheDebugRegisterIndex & (ICACHE_LINELENGTH-1)) >> 2; + uint8_t index = (CacheDebugRegisterIndex >> ICACHE_LINELENGTH_LOG2) & (ICACHE_LINESPERSET-1); + ICacheTags[(index << ICACHE_SETS_LOG2) + segment] = val; + } case 0xF20: - //printf("cache debug data tag %08X\n", val); - return; + // data cache Tag register + { + uint8_t segment = (CacheDebugRegisterIndex >> (32-DCACHE_SETS_LOG2)) & (DCACHE_SETS-1); + uint8_t wordAddress = (CacheDebugRegisterIndex & (DCACHE_LINELENGTH-1)) >> 2; + uint8_t index = (CacheDebugRegisterIndex >> DCACHE_LINELENGTH_LOG2) & (DCACHE_LINESPERSET-1); + DCacheTags[(index << DCACHE_SETS_LOG2) + segment] = val; + } + case 0xF30: //printf("cache debug instruction cache %08X\n", val); @@ -794,6 +984,27 @@ u32 ARMv5::CP15Read(u32 id) const return DTCMSetting; case 0x911: return ITCMSetting; + + case 0xF00: + return CacheDebugRegisterIndex; + case 0xF10: + 
// instruction cache Tag register + { + uint8_t segment = (CacheDebugRegisterIndex >> (32-ICACHE_SETS_LOG2)) & (ICACHE_SETS-1); + uint8_t wordAddress = (CacheDebugRegisterIndex & (ICACHE_LINELENGTH-1)) >> 2; + uint8_t index = (CacheDebugRegisterIndex >> ICACHE_LINELENGTH_LOG2) & (ICACHE_LINESPERSET-1); + Log(LogLevel::Debug, "Read ICache Tag %08lx -> %08lx\n", CacheDebugRegisterIndex, ICacheTags[(index << ICACHE_SETS_LOG2) + segment]); + return ICacheTags[(index << ICACHE_SETS_LOG2) + segment]; + } + case 0xF20: + // data cache Tag register + { + uint8_t segment = (CacheDebugRegisterIndex >> (32-DCACHE_SETS_LOG2)) & (DCACHE_SETS-1); + uint8_t wordAddress = (CacheDebugRegisterIndex & (DCACHE_LINELENGTH-1)) >> 2; + uint8_t index = (CacheDebugRegisterIndex >> DCACHE_LINELENGTH_LOG2) & (DCACHE_LINESPERSET-1); + Log(LogLevel::Debug, "Read DCache Tag %08lx (%u, %02x, %u) -> %08lx\n", CacheDebugRegisterIndex, segment, index, wordAddress, DCacheTags[(index << DCACHE_SETS_LOG2) + segment]); + return DCacheTags[(index << DCACHE_SETS_LOG2) + segment]; + } } if ((id & 0xF00) == 0xF00) // test/debug shit? 
@@ -860,6 +1071,19 @@ void ARMv5::DataRead8(u32 addr, u32* val) return; } +#if 1 + if (CP15Control & CP15_CACHE_CR_DCACHEENABLE) + { + if (PU_Map[addr >> 12] & 0x10) + { + DCacheLookup(addr & ~3); + DataCycles = 1; + *val = CurDCacheLine[addr & (DCACHE_LINELENGTH - 1)]; + return; + } + } +#endif + DataRegion = addr; if (addr < ITCMSize) @@ -887,6 +1111,19 @@ void ARMv5::DataRead16(u32 addr, u32* val) return; } +#if 1 + if (CP15Control & CP15_CACHE_CR_DCACHEENABLE) + { + if (PU_Map[addr >> 12] & 0x10) + { + DCacheLookup(addr & ~3); + DataCycles = 1; + *val = *(u16 *)&CurDCacheLine[addr & (DCACHE_LINELENGTH - 2)]; + return; + } + } +#endif + DataRegion = addr; addr &= ~1; @@ -916,6 +1153,19 @@ void ARMv5::DataRead32(u32 addr, u32* val) return; } +#if 1 + if (CP15Control & CP15_CACHE_CR_DCACHEENABLE) + { + if (PU_Map[addr >> 12] & 0x10) + { + DCacheLookup(addr & ~3); + DataCycles = 1; + *val = *(u32 *)&CurDCacheLine[addr & (DCACHE_LINELENGTH - 4)]; + return; + } + } +#endif + DataRegion = addr; addr &= ~3; @@ -941,6 +1191,19 @@ void ARMv5::DataRead32S(u32 addr, u32* val) { addr &= ~3; +#if 1 + if (CP15Control & CP15_CACHE_CR_DCACHEENABLE) + { + if (PU_Map[addr >> 12] & 0x10) + { + DCacheLookup(addr & ~3); + DataCycles = 1; + *val = *(u32 *)&CurDCacheLine[addr & (DCACHE_LINELENGTH - 4)]; + return; + } + } +#endif + if (addr < ITCMSize) { DataCycles += 1; @@ -966,6 +1229,14 @@ void ARMv5::DataWrite8(u32 addr, u8 val) return; } + if (CP15Control & CP15_CACHE_CR_DCACHEENABLE) + { + if (PU_Map[addr >> 12] & 0x10) + { + DCacheInvalidateByAddr(addr); + } + } + DataRegion = addr; if (addr < ITCMSize) @@ -994,6 +1265,14 @@ void ARMv5::DataWrite16(u32 addr, u16 val) return; } + if (CP15Control & CP15_CACHE_CR_DCACHEENABLE) + { + if (PU_Map[addr >> 12] & 0x10) + { + DCacheInvalidateByAddr(addr); + } + } + DataRegion = addr; addr &= ~1; @@ -1024,6 +1303,14 @@ void ARMv5::DataWrite32(u32 addr, u32 val) return; } + if (CP15Control & CP15_CACHE_CR_DCACHEENABLE) + { + if (PU_Map[addr 
>> 12] & 0x10) + { + DCacheInvalidateByAddr(addr); + } + } + DataRegion = addr; addr &= ~3; @@ -1050,6 +1337,14 @@ void ARMv5::DataWrite32S(u32 addr, u32 val) { addr &= ~3; + if (CP15Control & CP15_CACHE_CR_DCACHEENABLE) + { + if (PU_Map[addr >> 12] & 0x10) + { + DCacheInvalidateByAddr(addr); + } + } + if (addr < ITCMSize) { DataCycles += 1; diff --git a/src/MemConstants.h b/src/MemConstants.h index f06edd96..6b378f7a 100644 --- a/src/MemConstants.h +++ b/src/MemConstants.h @@ -44,6 +44,19 @@ constexpr u32 ICACHE_LINELENGTH_LOG2 = ICACHE_LINELENGTH_ENCODED + 3; constexpr u32 ICACHE_LINELENGTH = 8 * (1 << ICACHE_LINELENGTH_ENCODED); constexpr u32 ICACHE_LINESPERSET = ICACHE_SIZE / (ICACHE_SETS * ICACHE_LINELENGTH); +constexpr u32 DCACHE_SIZE_LOG2 = 12; +constexpr u32 DCACHE_SIZE = 1 << DCACHE_SIZE_LOG2; +constexpr u32 DCACHE_SETS_LOG2 = 2; +constexpr u32 DCACHE_SETS = 1 << DCACHE_SETS_LOG2; +constexpr u32 DCACHE_LINELENGTH_ENCODED = 2; +constexpr u32 DCACHE_LINELENGTH_LOG2 = DCACHE_LINELENGTH_ENCODED + 3; +constexpr u32 DCACHE_LINELENGTH = 8 * (1 << DCACHE_LINELENGTH_ENCODED); +constexpr u32 DCACHE_LINESPERSET = DCACHE_SIZE / (DCACHE_SETS * DCACHE_LINELENGTH); + +constexpr u32 CACHE_FLAG_VALID = (1 << 4); +constexpr u32 CACHE_FLAG_DIRTY_LOWERHALF = (1 << 2); +constexpr u32 CACHE_FLAG_DIRTY_UPPERHALF = (1 << 3); + constexpr u32 CP15_CR_MPUENABLE = (1 << 0); constexpr u32 CP15_CR_BIGENDIAN = (1 << 7); constexpr u32 CP15_CR_HIGHEXCEPTIONBASE = (1 << 13); From a46f972c2179513060fc845faf2cbca418dc2133 Mon Sep 17 00:00:00 2001 From: DesperateProgrammer Date: Wed, 24 Jan 2024 10:37:17 +0100 Subject: [PATCH 009/306] Added Cache Data Write and Read via CP15 --- src/CP15.cpp | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/src/CP15.cpp b/src/CP15.cpp index ea3acddf..3de89caa 100644 --- a/src/CP15.cpp +++ b/src/CP15.cpp @@ -872,10 +872,22 @@ void ARMv5::CP15Write(u32 id, u32 val) case 0xF30: //printf("cache debug instruction cache %08X\n", val); + 
{ + uint8_t segment = (CacheDebugRegisterIndex >> (32-ICACHE_SETS_LOG2)) & (ICACHE_SETS-1); + uint8_t wordAddress = (CacheDebugRegisterIndex & (ICACHE_LINELENGTH-1)) >> 2; + uint8_t index = (CacheDebugRegisterIndex >> ICACHE_LINELENGTH_LOG2) & (ICACHE_LINESPERSET-1); + *(u32 *)&ICache[(((index << ICACHE_SETS_LOG2) + segment) << ICACHE_LINELENGTH_LOG2) + wordAddress*4] = val; + } return; case 0xF40: //printf("cache debug data cache %08X\n", val); + { + uint8_t segment = (CacheDebugRegisterIndex >> (32-DCACHE_SETS_LOG2)) & (DCACHE_SETS-1); + uint8_t wordAddress = (CacheDebugRegisterIndex & (DCACHE_LINELENGTH-1)) >> 2; + uint8_t index = (CacheDebugRegisterIndex >> DCACHE_LINELENGTH_LOG2) & (DCACHE_LINESPERSET-1); + *(u32 *)&DCache[((index << DCACHE_SETS_LOG2) + segment) << DCACHE_LINELENGTH_LOG2 + wordAddress*4] = val; + } return; } @@ -1005,6 +1017,20 @@ u32 ARMv5::CP15Read(u32 id) const Log(LogLevel::Debug, "Read DCache Tag %08lx (%u, %02x, %u) -> %08lx\n", CacheDebugRegisterIndex, segment, index, wordAddress, DCacheTags[(index << DCACHE_SETS_LOG2) + segment]); return DCacheTags[(index << DCACHE_SETS_LOG2) + segment]; } + case 0xF30: + { + uint8_t segment = (CacheDebugRegisterIndex >> (32-ICACHE_SETS_LOG2)) & (ICACHE_SETS-1); + uint8_t wordAddress = (CacheDebugRegisterIndex & (ICACHE_LINELENGTH-1)) >> 2; + uint8_t index = (CacheDebugRegisterIndex >> ICACHE_LINELENGTH_LOG2) & (ICACHE_LINESPERSET-1); + return *(u32 *)&ICache[(((index << ICACHE_SETS_LOG2) + segment) << ICACHE_LINELENGTH_LOG2) + wordAddress*4]; + } + case 0xF40: + { + uint8_t segment = (CacheDebugRegisterIndex >> (32-DCACHE_SETS_LOG2)) & (DCACHE_SETS-1); + uint8_t wordAddress = (CacheDebugRegisterIndex & (DCACHE_LINELENGTH-1)) >> 2; + uint8_t index = (CacheDebugRegisterIndex >> DCACHE_LINELENGTH_LOG2) & (DCACHE_LINESPERSET-1); + return *(u32 *)&DCache[(((index << DCACHE_SETS_LOG2) + segment) << DCACHE_LINELENGTH_LOG2) + wordAddress*4]; + } } if ((id & 0xF00) == 0xF00) // test/debug shit? 
From f67e93918cf51972f40ca9a2614b33b95d38ff0f Mon Sep 17 00:00:00 2001 From: DesperateProgrammer Date: Wed, 24 Jan 2024 11:02:34 +0100 Subject: [PATCH 010/306] Fixed data cache using only 1 cycle on miss. --- src/CP15.cpp | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/src/CP15.cpp b/src/CP15.cpp index 3de89caa..edd720cf 100644 --- a/src/CP15.cpp +++ b/src/CP15.cpp @@ -456,6 +456,8 @@ void ARMv5::DCacheLookup(u32 addr) { DataCycles = 1; CurDCacheLine = &DCache[(id+set) << DCACHE_LINELENGTH_LOG2]; + DataCycles = 1; + //Log(LogLevel::Debug,"DCache hit @ %08x -> %08lx\n", addr, ((u32 *)CurDCacheLine)[(addr & (DCACHE_LINELENGTH-1)) >> 2]); return; } @@ -492,7 +494,6 @@ void ARMv5::DCacheLookup(u32 addr) } else { *((u32*)&ptr[i]) = BusRead32(addr+i); - } //Log(LogLevel::Debug,"DCache store @ %08x: %08x\n", addr+i, *(u32*)&ptr[i]); } @@ -1103,7 +1104,6 @@ void ARMv5::DataRead8(u32 addr, u32* val) if (PU_Map[addr >> 12] & 0x10) { DCacheLookup(addr & ~3); - DataCycles = 1; *val = CurDCacheLine[addr & (DCACHE_LINELENGTH - 1)]; return; } @@ -1143,7 +1143,6 @@ void ARMv5::DataRead16(u32 addr, u32* val) if (PU_Map[addr >> 12] & 0x10) { DCacheLookup(addr & ~3); - DataCycles = 1; *val = *(u16 *)&CurDCacheLine[addr & (DCACHE_LINELENGTH - 2)]; return; } @@ -1185,7 +1184,6 @@ void ARMv5::DataRead32(u32 addr, u32* val) if (PU_Map[addr >> 12] & 0x10) { DCacheLookup(addr & ~3); - DataCycles = 1; *val = *(u32 *)&CurDCacheLine[addr & (DCACHE_LINELENGTH - 4)]; return; } @@ -1223,7 +1221,6 @@ void ARMv5::DataRead32S(u32 addr, u32* val) if (PU_Map[addr >> 12] & 0x10) { DCacheLookup(addr & ~3); - DataCycles = 1; *val = *(u32 *)&CurDCacheLine[addr & (DCACHE_LINELENGTH - 4)]; return; } From b23cb819bbca8612bab0f764f73b9394e61af973 Mon Sep 17 00:00:00 2001 From: DesperateProgrammer Date: Wed, 24 Jan 2024 11:57:57 +0100 Subject: [PATCH 011/306] Changed write to cached data from invalidating to updating the cached data to get nearer to hw timings --- src/ARM.h | 3 ++ 
src/CP15.cpp | 85 +++++++++++++++++++++++++++++++++++++++++------- wfcsettings.bin | Bin 0 -> 2304 bytes 3 files changed, 77 insertions(+), 11 deletions(-) create mode 100644 wfcsettings.bin diff --git a/src/ARM.h b/src/ARM.h index 99ef4baf..f7a6b98b 100644 --- a/src/ARM.h +++ b/src/ARM.h @@ -319,6 +319,9 @@ public: void DCacheLookup(u32 addr); + void DCacheWrite32(u32 addr, u32 val); + void DCacheWrite16(u32 addr, u16 val); + void DCacheWrite8(u32 addr, u8 val); bool IsAddressDCachable(u32 addr); void DCacheInvalidateAll(); diff --git a/src/CP15.cpp b/src/CP15.cpp index edd720cf..b3afb00b 100644 --- a/src/CP15.cpp +++ b/src/CP15.cpp @@ -454,7 +454,6 @@ void ARMv5::DCacheLookup(u32 addr) { if ((DCacheTags[id+set] & ~0x0F) == tag) { - DataCycles = 1; CurDCacheLine = &DCache[(id+set) << DCACHE_LINELENGTH_LOG2]; DataCycles = 1; @@ -506,6 +505,66 @@ void ARMv5::DCacheLookup(u32 addr) CurDCacheLine = ptr; } +void ARMv5::DCacheWrite32(u32 addr, u32 val) +{ + u32 tag = (addr & ~(DCACHE_LINELENGTH - 1)) | CACHE_FLAG_VALID; + u32 id = (addr >> DCACHE_LINELENGTH_LOG2) & (DCACHE_LINESPERSET-1); + + id <<= DCACHE_SETS_LOG2; + for (int set=0;set %08lx\n", addr, ((u32 *)CurDCacheLine)[(addr & (DCACHE_LINELENGTH-1)) >> 2]); + return; + } + } +} + +void ARMv5::DCacheWrite16(u32 addr, u16 val) +{ + u32 tag = (addr & ~(DCACHE_LINELENGTH - 1)) | CACHE_FLAG_VALID; + u32 id = (addr >> DCACHE_LINELENGTH_LOG2) & (DCACHE_LINESPERSET-1); + + id <<= DCACHE_SETS_LOG2; + for (int set=0;set %08lx\n", addr, ((u32 *)CurDCacheLine)[(addr & (DCACHE_LINELENGTH-1)) >> 2]); + return; + } + } +} + +void ARMv5::DCacheWrite8(u32 addr, u8 val) +{ + u32 tag = (addr & ~(DCACHE_LINELENGTH - 1)) | CACHE_FLAG_VALID; + u32 id = (addr >> DCACHE_LINELENGTH_LOG2) & (DCACHE_LINESPERSET-1); + + id <<= DCACHE_SETS_LOG2; + for (int set=0;set %08lx\n", addr, ((u32 *)CurDCacheLine)[(addr & (DCACHE_LINELENGTH-1)) >> 2]); + return; + } + } +} + void ARMv5::DCacheInvalidateByAddr(u32 addr) { u32 tag = (addr & 
~(DCACHE_LINELENGTH - 1)) | CACHE_FLAG_VALID; @@ -1098,6 +1157,8 @@ void ARMv5::DataRead8(u32 addr, u32* val) return; } + DataRegion = addr; + #if 1 if (CP15Control & CP15_CACHE_CR_DCACHEENABLE) { @@ -1110,8 +1171,6 @@ void ARMv5::DataRead8(u32 addr, u32* val) } #endif - DataRegion = addr; - if (addr < ITCMSize) { DataCycles = 1; @@ -1137,6 +1196,8 @@ void ARMv5::DataRead16(u32 addr, u32* val) return; } + DataRegion = addr; + #if 1 if (CP15Control & CP15_CACHE_CR_DCACHEENABLE) { @@ -1149,8 +1210,6 @@ void ARMv5::DataRead16(u32 addr, u32* val) } #endif - DataRegion = addr; - addr &= ~1; if (addr < ITCMSize) @@ -1178,6 +1237,8 @@ void ARMv5::DataRead32(u32 addr, u32* val) return; } + DataRegion = addr; + #if 1 if (CP15Control & CP15_CACHE_CR_DCACHEENABLE) { @@ -1190,8 +1251,6 @@ void ARMv5::DataRead32(u32 addr, u32* val) } #endif - DataRegion = addr; - addr &= ~3; if (addr < ITCMSize) @@ -1256,7 +1315,8 @@ void ARMv5::DataWrite8(u32 addr, u8 val) { if (PU_Map[addr >> 12] & 0x10) { - DCacheInvalidateByAddr(addr); + DCacheWrite8(addr, val); + //DCacheInvalidateByAddr(addr); } } @@ -1292,7 +1352,8 @@ void ARMv5::DataWrite16(u32 addr, u16 val) { if (PU_Map[addr >> 12] & 0x10) { - DCacheInvalidateByAddr(addr); + DCacheWrite16(addr, val); +// DCacheInvalidateByAddr(addr); } } @@ -1330,7 +1391,8 @@ void ARMv5::DataWrite32(u32 addr, u32 val) { if (PU_Map[addr >> 12] & 0x10) { - DCacheInvalidateByAddr(addr); + DCacheWrite32(addr, val); +// DCacheInvalidateByAddr(addr); } } @@ -1364,7 +1426,8 @@ void ARMv5::DataWrite32S(u32 addr, u32 val) { if (PU_Map[addr >> 12] & 0x10) { - DCacheInvalidateByAddr(addr); + DCacheWrite32(addr, val); +// DCacheInvalidateByAddr(addr); } } diff --git a/wfcsettings.bin b/wfcsettings.bin new file mode 100644 index 0000000000000000000000000000000000000000..af3146aeb6423c6cff0a4d369941eb72c52e62d7 GIT binary patch literal 2304 zcmezWe-w;{z-S1JhQMeDjKmONpb+Gy=H%x&22f}U-Lx};ZSpv2Zan{jrB=^(PP*AV buonG?T94-cg1ZCjEV|i_>VLSkr#}DyyOIjR literal 0 HcmV?d00001 
From 71b5c829aa0b8f2b14796ce144dc2fc5cb348caf Mon Sep 17 00:00:00 2001 From: DesperateProgrammer Date: Wed, 24 Jan 2024 14:53:55 +0100 Subject: [PATCH 012/306] Fixed unaligned access to data cache --- src/CP15.cpp | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/src/CP15.cpp b/src/CP15.cpp index b3afb00b..0f5c32df 100644 --- a/src/CP15.cpp +++ b/src/CP15.cpp @@ -507,6 +507,8 @@ void ARMv5::DCacheLookup(u32 addr) void ARMv5::DCacheWrite32(u32 addr, u32 val) { + addr &= ~3; + u32 tag = (addr & ~(DCACHE_LINELENGTH - 1)) | CACHE_FLAG_VALID; u32 id = (addr >> DCACHE_LINELENGTH_LOG2) & (DCACHE_LINESPERSET-1); @@ -519,7 +521,7 @@ void ARMv5::DCacheWrite32(u32 addr, u32 val) *(u32 *)&CurDCacheLine[addr & (ICACHE_LINELENGTH-1)] = val; DataCycles = 1; - //Log(LogLevel::Debug,"DCache hit @ %08x -> %08lx\n", addr, ((u32 *)CurDCacheLine)[(addr & (DCACHE_LINELENGTH-1)) >> 2]); + //Log(LogLevel::Debug,"DCache write32 hit @ %08x -> %08lx\n", addr, ((u32 *)CurDCacheLine)[(addr & (DCACHE_LINELENGTH-1)) >> 2]); return; } } @@ -527,6 +529,8 @@ void ARMv5::DCacheWrite32(u32 addr, u32 val) void ARMv5::DCacheWrite16(u32 addr, u16 val) { + addr &= ~1; + u32 tag = (addr & ~(DCACHE_LINELENGTH - 1)) | CACHE_FLAG_VALID; u32 id = (addr >> DCACHE_LINELENGTH_LOG2) & (DCACHE_LINESPERSET-1); @@ -539,7 +543,7 @@ void ARMv5::DCacheWrite16(u32 addr, u16 val) *(u16 *)&CurDCacheLine[addr & (ICACHE_LINELENGTH-1)] = val; DataCycles = 1; - //Log(LogLevel::Debug,"DCache hit @ %08x -> %08lx\n", addr, ((u32 *)CurDCacheLine)[(addr & (DCACHE_LINELENGTH-1)) >> 2]); + //Log(LogLevel::Debug,"DCache write16 hit @ %08x -> %04x\n", addr, ((u16 *)CurDCacheLine)[(addr & (DCACHE_LINELENGTH-1)) >> 2]); return; } } @@ -559,7 +563,7 @@ void ARMv5::DCacheWrite8(u32 addr, u8 val) *(u8 *)&CurDCacheLine[addr & (ICACHE_LINELENGTH-1)] = val; DataCycles = 1; - //Log(LogLevel::Debug,"DCache hit @ %08x -> %08lx\n", addr, ((u32 *)CurDCacheLine)[(addr & (DCACHE_LINELENGTH-1)) >> 2]); + 
//Log(LogLevel::Debug,"DCache write hit8 @ %08x -> %02x\n", addr, ((u8 *)CurDCacheLine)[(addr & (DCACHE_LINELENGTH-1)) >> 2]); return; } } @@ -1153,6 +1157,7 @@ void ARMv5::DataRead8(u32 addr, u32* val) { if (!(PU_Map[addr>>12] & 0x01)) { + Log(LogLevel::Debug, "data8 abort @ %08lx\n", addr); DataAbort(); return; } @@ -1192,6 +1197,7 @@ void ARMv5::DataRead16(u32 addr, u32* val) { if (!(PU_Map[addr>>12] & 0x01)) { + Log(LogLevel::Debug, "data16 abort @ %08lx\n", addr); DataAbort(); return; } @@ -1233,6 +1239,7 @@ void ARMv5::DataRead32(u32 addr, u32* val) { if (!(PU_Map[addr>>12] & 0x01)) { + Log(LogLevel::Debug, "data32 abort @ %08lx\n", addr); DataAbort(); return; } From 7b8327d3a484851628d412ecd55f79528df7d69e Mon Sep 17 00:00:00 2001 From: DesperateProgrammer Date: Thu, 25 Jan 2024 09:05:51 +0100 Subject: [PATCH 013/306] Disabled Caches, when JIT is enabled --- src/CP15.cpp | 159 +++++++++++++++++++++++++++++++-------------------- src/NDS.h | 4 ++ 2 files changed, 101 insertions(+), 62 deletions(-) diff --git a/src/CP15.cpp b/src/CP15.cpp index 0f5c32df..ef7f6347 100644 --- a/src/CP15.cpp +++ b/src/CP15.cpp @@ -1106,7 +1106,7 @@ u32 ARMv5::CP15Read(u32 id) const // TCM are handled here. -// TODO: later on, handle PU, and maybe caches +// TODO: later on, handle PU u32 ARMv5::CodeRead32(u32 addr, bool branch) { @@ -1126,27 +1126,30 @@ u32 ARMv5::CodeRead32(u32 addr, bool branch) } CodeCycles = RegionCodeCycles; -#if 0 - if (CodeCycles == 0xFF) // cached memory. 
hax +#ifdef JIT_ENABLED + if (!NDS.IsJITEnabled()) +#endif { - if (branch || !(addr & 0x1F)) - CodeCycles = kCodeCacheTiming;//ICacheLookup(addr); - else - CodeCycles = 1; - - //return *(u32*)&CurICacheLine[addr & 0x1C]; - } -#else - if (CP15Control & CP15_CACHE_CR_ICACHEENABLE) - { - if (IsAddressICachable(addr)) + if (CP15Control & CP15_CACHE_CR_ICACHEENABLE) { - ICacheLookup(addr); - return *(u32*)&CurICacheLine[addr & (ICACHE_LINELENGTH - 4)]; + if (IsAddressICachable(addr)) + { + ICacheLookup(addr); + return *(u32*)&CurICacheLine[addr & (ICACHE_LINELENGTH - 4)]; + } + } + } else + { + if (CodeCycles == 0xFF) // cached memory. hax + { + if (branch || !(addr & 0x1F)) + CodeCycles = kCodeCacheTiming;//ICacheLookup(addr); + else + CodeCycles = 1; + + //return *(u32*)&CurICacheLine[addr & 0x1C]; } } -#endif - if (CodeMem.Mem) return *(u32*)&CodeMem.Mem[addr & CodeMem.Mask]; return BusRead32(addr); @@ -1164,17 +1167,20 @@ void ARMv5::DataRead8(u32 addr, u32* val) DataRegion = addr; -#if 1 - if (CP15Control & CP15_CACHE_CR_DCACHEENABLE) +#ifdef JIT_ENABLED + if (!NDS.IsJITEnabled()) +#endif { - if (PU_Map[addr >> 12] & 0x10) + if (CP15Control & CP15_CACHE_CR_DCACHEENABLE) { - DCacheLookup(addr & ~3); - *val = CurDCacheLine[addr & (DCACHE_LINELENGTH - 1)]; - return; + if (PU_Map[addr >> 12] & 0x10) + { + DCacheLookup(addr & ~3); + *val = CurDCacheLine[addr & (DCACHE_LINELENGTH - 1)]; + return; + } } } -#endif if (addr < ITCMSize) { @@ -1204,17 +1210,20 @@ void ARMv5::DataRead16(u32 addr, u32* val) DataRegion = addr; -#if 1 - if (CP15Control & CP15_CACHE_CR_DCACHEENABLE) +#ifdef JIT_ENABLED + if (!NDS.IsJITEnabled()) +#endif { - if (PU_Map[addr >> 12] & 0x10) + if (CP15Control & CP15_CACHE_CR_DCACHEENABLE) { - DCacheLookup(addr & ~3); - *val = *(u16 *)&CurDCacheLine[addr & (DCACHE_LINELENGTH - 2)]; - return; + if (PU_Map[addr >> 12] & 0x10) + { + DCacheLookup(addr & ~3); + *val = *(u16 *)&CurDCacheLine[addr & (DCACHE_LINELENGTH - 2)]; + return; + } } } -#endif addr &= 
~1; @@ -1246,17 +1255,20 @@ void ARMv5::DataRead32(u32 addr, u32* val) DataRegion = addr; -#if 1 - if (CP15Control & CP15_CACHE_CR_DCACHEENABLE) +#ifdef JIT_ENABLED + if (!NDS.IsJITEnabled()) +#endif { - if (PU_Map[addr >> 12] & 0x10) + if (CP15Control & CP15_CACHE_CR_DCACHEENABLE) { - DCacheLookup(addr & ~3); - *val = *(u32 *)&CurDCacheLine[addr & (DCACHE_LINELENGTH - 4)]; - return; + if (PU_Map[addr >> 12] & 0x10) + { + DCacheLookup(addr & ~3); + *val = *(u32 *)&CurDCacheLine[addr & (DCACHE_LINELENGTH - 4)]; + return; + } } } -#endif addr &= ~3; @@ -1281,17 +1293,20 @@ void ARMv5::DataRead32S(u32 addr, u32* val) { addr &= ~3; -#if 1 - if (CP15Control & CP15_CACHE_CR_DCACHEENABLE) +#ifdef JIT_ENABLED + if (!NDS.IsJITEnabled()) +#endif { - if (PU_Map[addr >> 12] & 0x10) + if (CP15Control & CP15_CACHE_CR_DCACHEENABLE) { - DCacheLookup(addr & ~3); - *val = *(u32 *)&CurDCacheLine[addr & (DCACHE_LINELENGTH - 4)]; - return; + if (PU_Map[addr >> 12] & 0x10) + { + DCacheLookup(addr & ~3); + *val = *(u32 *)&CurDCacheLine[addr & (DCACHE_LINELENGTH - 4)]; + return; + } } } -#endif if (addr < ITCMSize) { @@ -1318,12 +1333,17 @@ void ARMv5::DataWrite8(u32 addr, u8 val) return; } - if (CP15Control & CP15_CACHE_CR_DCACHEENABLE) +#ifdef JIT_ENABLED + if (!NDS.IsJITEnabled()) +#endif { - if (PU_Map[addr >> 12] & 0x10) + if (CP15Control & CP15_CACHE_CR_DCACHEENABLE) { - DCacheWrite8(addr, val); - //DCacheInvalidateByAddr(addr); + if (PU_Map[addr >> 12] & 0x10) + { + DCacheWrite8(addr, val); + //DCacheInvalidateByAddr(addr); + } } } @@ -1355,12 +1375,17 @@ void ARMv5::DataWrite16(u32 addr, u16 val) return; } - if (CP15Control & CP15_CACHE_CR_DCACHEENABLE) +#ifdef JIT_ENABLED + if (!NDS.IsJITEnabled()) +#endif { - if (PU_Map[addr >> 12] & 0x10) + if (CP15Control & CP15_CACHE_CR_DCACHEENABLE) { - DCacheWrite16(addr, val); -// DCacheInvalidateByAddr(addr); + if (PU_Map[addr >> 12] & 0x10) + { + DCacheWrite16(addr, val); + // DCacheInvalidateByAddr(addr); + } } } @@ -1394,12 +1419,17 @@ 
void ARMv5::DataWrite32(u32 addr, u32 val) return; } - if (CP15Control & CP15_CACHE_CR_DCACHEENABLE) +#ifdef JIT_ENABLED + if (!NDS.IsJITEnabled()) +#endif { - if (PU_Map[addr >> 12] & 0x10) + if (CP15Control & CP15_CACHE_CR_DCACHEENABLE) { - DCacheWrite32(addr, val); -// DCacheInvalidateByAddr(addr); + if (PU_Map[addr >> 12] & 0x10) + { + DCacheWrite32(addr, val); + // DCacheInvalidateByAddr(addr); + } } } @@ -1429,12 +1459,17 @@ void ARMv5::DataWrite32S(u32 addr, u32 val) { addr &= ~3; - if (CP15Control & CP15_CACHE_CR_DCACHEENABLE) +#ifdef JIT_ENABLED + if (!NDS.IsJITEnabled()) +#endif { - if (PU_Map[addr >> 12] & 0x10) + if (CP15Control & CP15_CACHE_CR_DCACHEENABLE) { - DCacheWrite32(addr, val); -// DCacheInvalidateByAddr(addr); + if (PU_Map[addr >> 12] & 0x10) + { + DCacheWrite32(addr, val); + // DCacheInvalidateByAddr(addr); + } } } diff --git a/src/NDS.h b/src/NDS.h index f9df2d69..cfb8e3b5 100644 --- a/src/NDS.h +++ b/src/NDS.h @@ -302,6 +302,10 @@ public: // TODO: Encapsulate the rest of these members melonDS::GPU GPU; melonDS::AREngine AREngine; +#ifdef JIT_ENABLED + bool IsJITEnabled(){return EnableJIT;}; +#endif + const u32 ARM7WRAMSize = 0x10000; u8* ARM7WRAM; From 9d2e5159473b86670c790e2d8914197c4113d80f Mon Sep 17 00:00:00 2001 From: DesperateProgrammer Date: Thu, 25 Jan 2024 10:08:57 +0100 Subject: [PATCH 014/306] Implemented CacheLockDown --- src/CP15.cpp | 40 ++++++++++++++++++++++++++++++++++++++++ src/MemConstants.h | 2 ++ 2 files changed, 42 insertions(+) diff --git a/src/CP15.cpp b/src/CP15.cpp index ef7f6347..fef3eac1 100644 --- a/src/CP15.cpp +++ b/src/CP15.cpp @@ -380,6 +380,24 @@ void ARMv5::ICacheLookup(u32 addr) line = RandomLineIndex(); } + if (ICacheLockDown) + { + if (ICacheLockDown & CACHE_LOCKUP_L) + { + // load into locked up cache + // into the selected set + line = ICacheLockDown & (ICACHE_SETS-1); + } else + { + u8 minSet = ICacheLockDown & (ICACHE_SETS-1); + if (minSet) + { + // part of the cache is locked up and only the 
cachelines + line = (line % (ICACHE_SETS - minSet)) + minSet; + } + } + } + line += id; addr &= ~(ICACHE_LINELENGTH-1); @@ -474,6 +492,24 @@ void ARMv5::DCacheLookup(u32 addr) line = RandomLineIndex(); } + if (DCacheLockDown) + { + if (DCacheLockDown & CACHE_LOCKUP_L) + { + // load into locked up cache + // into the selected set + line = DCacheLockDown & (DCACHE_SETS-1); + } else + { + u8 minSet = DCacheLockDown & (DCACHE_SETS-1); + if (minSet) + { + // part of the cache is locked up and only the cachelines + line = (line % (DCACHE_SETS - minSet)) + minSet; + } + } + } + line += id; addr &= ~(DCACHE_LINELENGTH-1); @@ -856,6 +892,10 @@ void ARMv5::CP15Write(u32 id, u32 val) // Test and clean (optional) // Is not present on the NDS/DSi return; + case 0x7A4: + // Drain Write Buffer: Stall until all write back completed + // TODO when write back was implemented instead of write through + return; case 0x7D1: Log(LogLevel::Debug,"Prefetch instruction cache MVA\n"); diff --git a/src/MemConstants.h b/src/MemConstants.h index 6b378f7a..af6f89f7 100644 --- a/src/MemConstants.h +++ b/src/MemConstants.h @@ -57,6 +57,8 @@ constexpr u32 CACHE_FLAG_VALID = (1 << 4); constexpr u32 CACHE_FLAG_DIRTY_LOWERHALF = (1 << 2); constexpr u32 CACHE_FLAG_DIRTY_UPPERHALF = (1 << 3); +constexpr u32 CACHE_LOCKUP_L = (1 << 31); + constexpr u32 CP15_CR_MPUENABLE = (1 << 0); constexpr u32 CP15_CR_BIGENDIAN = (1 << 7); constexpr u32 CP15_CR_HIGHEXCEPTIONBASE = (1 << 13); From bf0767b4f379809bdcca79d82ac90ced1158809b Mon Sep 17 00:00:00 2001 From: DesperateProgrammer Date: Thu, 25 Jan 2024 10:48:04 +0100 Subject: [PATCH 015/306] Added CP15 prefetch routine --- src/CP15.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/CP15.cpp b/src/CP15.cpp index fef3eac1..c2f8720c 100644 --- a/src/CP15.cpp +++ b/src/CP15.cpp @@ -899,6 +899,9 @@ void ARMv5::CP15Write(u32 id, u32 val) case 0x7D1: Log(LogLevel::Debug,"Prefetch instruction cache MVA\n"); + // we force a fill by looking up the value from 
cache + // if it wasn't cached yet, it will be loaded into cache + ICacheLookup(val & ~0x03); break; case 0x7E0: From 3c94802704639c16e111ca0d66c8f39ed902fe26 Mon Sep 17 00:00:00 2001 From: DesperateProgrammer Date: Thu, 25 Jan 2024 11:05:58 +0100 Subject: [PATCH 016/306] Removed magic number from Cache Fill timings. --- src/CP15.cpp | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/CP15.cpp b/src/CP15.cpp index c2f8720c..8f3d7138 100644 --- a/src/CP15.cpp +++ b/src/CP15.cpp @@ -417,7 +417,8 @@ void ARMv5::ICacheLookup(u32 addr) // ouch :/ //printf("cache miss %08X: %d/%d\n", addr, NDS::ARM9MemTimings[addr >> 14][2], NDS::ARM9MemTimings[addr >> 14][3]); - CodeCycles = (NDS.ARM9MemTimings[addr >> 14][2] + (NDS.ARM9MemTimings[addr >> 14][3] * 7)) << NDS.ARM9ClockShift; + // first N32 remaining S32 + CodeCycles = (NDS.ARM9MemTimings[addr >> 14][2] + (NDS.ARM9MemTimings[addr >> 14][3] * ((DCACHE_LINELENGTH / 4) - 1))) << NDS.ARM9ClockShift; CurICacheLine = ptr; } @@ -537,7 +538,8 @@ void ARMv5::DCacheLookup(u32 addr) // ouch :/ //printf("cache miss %08X: %d/%d\n", addr, NDS::ARM9MemTimings[addr >> 14][2], NDS::ARM9MemTimings[addr >> 14][3]); - DataCycles = (NDS.ARM9MemTimings[addr >> 14][2] + (NDS.ARM9MemTimings[addr >> 14][3] * 7)) << NDS.ARM9ClockShift; + // first N32 remaining S32 + DataCycles = (NDS.ARM9MemTimings[addr >> 14][2] + (NDS.ARM9MemTimings[addr >> 14][3] * ((DCACHE_LINELENGTH / 4) - 1))) << NDS.ARM9ClockShift; CurDCacheLine = ptr; } From 8a0ad8ac3f99211e9d6f162856b2eece76137ad4 Mon Sep 17 00:00:00 2001 From: DesperateProgrammer Date: Thu, 25 Jan 2024 11:44:26 +0100 Subject: [PATCH 017/306] Added permission checks to CP15 Cache register write operations --- src/CP15.cpp | 123 +++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 123 insertions(+) diff --git a/src/CP15.cpp b/src/CP15.cpp index 8f3d7138..94494e62 100644 --- a/src/CP15.cpp +++ b/src/CP15.cpp @@ -832,14 +832,31 @@ void ARMv5::CP15Write(u32 id, u32 val) 
case 0x750: + // Can be executed in user and priv mode ICacheInvalidateAll(); //Halt(255); return; case 0x751: + // requires priv mode or causes UNKNOWN INSTRUCTION exception + if (PU_Map != PU_PrivMap) + { + if (CPSR & 0x20) // THUMB + return ARMInterpreter::T_UNK(this); + else + return ARMInterpreter::A_UNK(this); + } ICacheInvalidateByAddr(val); //Halt(255); return; case 0x752: + // requires priv mode or causes UNKNOWN INSTRUCTION exception + if (PU_Map != PU_PrivMap) + { + if (CPSR & 0x20) // THUMB + return ARMInterpreter::T_UNK(this); + else + return ARMInterpreter::A_UNK(this); + } else { // Cache invalidat by line number and set number u8 cacheSet = val >> (32 - ICACHE_SETS_LOG2) & (ICACHE_SETS -1); @@ -851,14 +868,38 @@ void ARMv5::CP15Write(u32 id, u32 val) case 0x760: + // requires priv mode or causes UNKNOWN INSTRUCTION exception + if (PU_Map != PU_PrivMap) + { + if (CPSR & 0x20) // THUMB + return ARMInterpreter::T_UNK(this); + else + return ARMInterpreter::A_UNK(this); + } DCacheInvalidateAll(); //printf("inval data cache %08X\n", val); return; case 0x761: + // requires priv mode or causes UNKNOWN INSTRUCTION exception + if (PU_Map != PU_PrivMap) + { + if (CPSR & 0x20) // THUMB + return ARMInterpreter::T_UNK(this); + else + return ARMInterpreter::A_UNK(this); + } DCacheInvalidateByAddr(val); //printf("inval data cache SI\n"); return; case 0x762: + // requires priv mode or causes UNKNOWN INSTRUCTION exception + if (PU_Map != PU_PrivMap) + { + if (CPSR & 0x20) // THUMB + return ARMInterpreter::T_UNK(this); + else + return ARMInterpreter::A_UNK(this); + } else { // Cache invalidat by line number and set number u8 cacheSet = val >> (32 - DCACHE_SETS_LOG2) & (DCACHE_SETS -1); @@ -869,20 +910,45 @@ void ARMv5::CP15Write(u32 id, u32 val) case 0x770: // invalidate both caches + // can be called from user and privileged ICacheInvalidateAll(); DCacheInvalidateAll(); break; case 0x7A0: + // requires priv mode or causes UNKNOWN INSTRUCTION exception + if (PU_Map != 
PU_PrivMap) + { + if (CPSR & 0x20) // THUMB + return ARMInterpreter::T_UNK(this); + else + return ARMInterpreter::A_UNK(this); + } //Log(LogLevel::Debug,"clean data cache\n"); DCacheClearAll(); return; case 0x7A1: + // requires priv mode or causes UNKNOWN INSTRUCTION exception + if (PU_Map != PU_PrivMap) + { + if (CPSR & 0x20) // THUMB + return ARMInterpreter::T_UNK(this); + else + return ARMInterpreter::A_UNK(this); + } //Log(LogLevel::Debug,"clean data cache MVA\n"); DCacheClearByAddr(val); return; case 0x7A2: //Log(LogLevel::Debug,"clean data cache SET/WAY\n"); + // requires priv mode or causes UNKNOWN INSTRUCTION exception + if (PU_Map != PU_PrivMap) + { + if (CPSR & 0x20) // THUMB + return ARMInterpreter::T_UNK(this); + else + return ARMInterpreter::A_UNK(this); + } else { // Cache invalidat by line number and set number u8 cacheSet = val >> (32 - DCACHE_SETS_LOG2) & (DCACHE_SETS -1); @@ -891,16 +957,33 @@ void ARMv5::CP15Write(u32 id, u32 val) } return; case 0x7A3: + // requires priv mode or causes UNKNOWN INSTRUCTION exception + if (PU_Map != PU_PrivMap) + { + if (CPSR & 0x20) // THUMB + return ARMInterpreter::T_UNK(this); + else + return ARMInterpreter::A_UNK(this); + } // Test and clean (optional) // Is not present on the NDS/DSi return; case 0x7A4: + // Can be used in user and privileged mode // Drain Write Buffer: Stall until all write back completed // TODO when write back was implemented instead of write through return; case 0x7D1: Log(LogLevel::Debug,"Prefetch instruction cache MVA\n"); + // requires priv mode or causes UNKNOWN INSTRUCTION exception + if (PU_Map != PU_PrivMap) + { + if (CPSR & 0x20) // THUMB + return ARMInterpreter::T_UNK(this); + else + return ARMInterpreter::A_UNK(this); + } // we force a fill by looking up the value from cache // if it wasn't cached yet, it will be loaded into cache ICacheLookup(val & ~0x03); @@ -908,16 +991,40 @@ void ARMv5::CP15Write(u32 id, u32 val) case 0x7E0: //Log(LogLevel::Debug,"clean & invalidate data 
cache\n"); + // requires priv mode or causes UNKNOWN INSTRUCTION exception + if (PU_Map != PU_PrivMap) + { + if (CPSR & 0x20) // THUMB + return ARMInterpreter::T_UNK(this); + else + return ARMInterpreter::A_UNK(this); + } DCacheClearAll(); DCacheInvalidateAll(); return; case 0x7E1: //Log(LogLevel::Debug,"clean & invalidate data cache MVA\n"); + // requires priv mode or causes UNKNOWN INSTRUCTION exception + if (PU_Map != PU_PrivMap) + { + if (CPSR & 0x20) // THUMB + return ARMInterpreter::T_UNK(this); + else + return ARMInterpreter::A_UNK(this); + } DCacheClearByAddr(val); DCacheInvalidateByAddr(val); return; case 0x7E2: //Log(LogLevel::Debug,"clean & invalidate data cache SET/WAY\n"); + // requires priv mode or causes UNKNOWN INSTRUCTION exception + if (PU_Map != PU_PrivMap) + { + if (CPSR & 0x20) // THUMB + return ARMInterpreter::T_UNK(this); + else + return ARMInterpreter::A_UNK(this); + } else { // Cache invalidat by line number and set number u8 cacheSet = val >> (32 - DCACHE_SETS_LOG2) & (DCACHE_SETS -1); @@ -928,6 +1035,14 @@ void ARMv5::CP15Write(u32 id, u32 val) return; case 0x900: + // requires priv mode or causes UNKNOWN INSTRUCTION exception + if (PU_Map != PU_PrivMap) + { + if (CPSR & 0x20) // THUMB + return ARMInterpreter::T_UNK(this); + else + return ARMInterpreter::A_UNK(this); + } // Cache Lockdown - Format B // Bit 31: Lock bit // Bit 0..Way-1: locked ways @@ -937,6 +1052,14 @@ void ARMv5::CP15Write(u32 id, u32 val) Log(LogLevel::Debug,"ICacheLockDown\n"); return; case 0x901: + // requires priv mode or causes UNKNOWN INSTRUCTION exception + if (PU_Map != PU_PrivMap) + { + if (CPSR & 0x20) // THUMB + return ARMInterpreter::T_UNK(this); + else + return ARMInterpreter::A_UNK(this); + } // Cache Lockdown - Format B // Bit 31: Lock bit // Bit 0..Way-1: locked ways From 6959d6f2b0707e7ef6660c2f0fb22949875259ca Mon Sep 17 00:00:00 2001 From: DesperateProgrammer Date: Thu, 25 Jan 2024 11:52:37 +0100 Subject: [PATCH 018/306] Added privilege checks for 
reading & writing CP15 cache registers --- src/CP15.cpp | 67 ++++++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 63 insertions(+), 4 deletions(-) diff --git a/src/CP15.cpp b/src/CP15.cpp index 94494e62..d6ddcad8 100644 --- a/src/CP15.cpp +++ b/src/CP15.cpp @@ -1080,11 +1080,25 @@ void ARMv5::CP15Write(u32 id, u32 val) return; case 0xF00: - CacheDebugRegisterIndex = val; + if (PU_Map != PU_PrivMap) + { + if (CPSR & 0x20) // THUMB + return ARMInterpreter::T_UNK(this); + else + return ARMInterpreter::A_UNK(this); + } else + CacheDebugRegisterIndex = val; return; case 0xF10: // instruction cache Tag register + if (PU_Map != PU_PrivMap) + { + if (CPSR & 0x20) // THUMB + return ARMInterpreter::T_UNK(this); + else + return ARMInterpreter::A_UNK(this); + } else { uint8_t segment = (CacheDebugRegisterIndex >> (32-ICACHE_SETS_LOG2)) & (ICACHE_SETS-1); uint8_t wordAddress = (CacheDebugRegisterIndex & (ICACHE_LINELENGTH-1)) >> 2; @@ -1094,6 +1108,13 @@ void ARMv5::CP15Write(u32 id, u32 val) case 0xF20: // data cache Tag register + if (PU_Map != PU_PrivMap) + { + if (CPSR & 0x20) // THUMB + return ARMInterpreter::T_UNK(this); + else + return ARMInterpreter::A_UNK(this); + } else { uint8_t segment = (CacheDebugRegisterIndex >> (32-DCACHE_SETS_LOG2)) & (DCACHE_SETS-1); uint8_t wordAddress = (CacheDebugRegisterIndex & (DCACHE_LINELENGTH-1)) >> 2; @@ -1104,6 +1125,13 @@ void ARMv5::CP15Write(u32 id, u32 val) case 0xF30: //printf("cache debug instruction cache %08X\n", val); + if (PU_Map != PU_PrivMap) + { + if (CPSR & 0x20) // THUMB + return ARMInterpreter::T_UNK(this); + else + return ARMInterpreter::A_UNK(this); + } else { uint8_t segment = (CacheDebugRegisterIndex >> (32-ICACHE_SETS_LOG2)) & (ICACHE_SETS-1); uint8_t wordAddress = (CacheDebugRegisterIndex & (ICACHE_LINELENGTH-1)) >> 2; @@ -1114,6 +1142,13 @@ void ARMv5::CP15Write(u32 id, u32 val) case 0xF40: //printf("cache debug data cache %08X\n", val); + if (PU_Map != PU_PrivMap) + { + if (CPSR & 0x20) // 
THUMB + return ARMInterpreter::T_UNK(this); + else + return ARMInterpreter::A_UNK(this); + } else { uint8_t segment = (CacheDebugRegisterIndex >> (32-DCACHE_SETS_LOG2)) & (DCACHE_SETS-1); uint8_t wordAddress = (CacheDebugRegisterIndex & (DCACHE_LINELENGTH-1)) >> 2; @@ -1220,9 +1255,17 @@ u32 ARMv5::CP15Read(u32 id) const return 0; case 0x900: - return DCacheLockDown; + if (PU_Map != PU_PrivMap) + { + return 0; + } else + return DCacheLockDown; case 0x901: - return ICacheLockDown; + if (PU_Map != PU_PrivMap) + { + return 0; + } else + return ICacheLockDown; case 0x910: return DTCMSetting; @@ -1230,9 +1273,17 @@ u32 ARMv5::CP15Read(u32 id) const return ITCMSetting; case 0xF00: - return CacheDebugRegisterIndex; + if (PU_Map != PU_PrivMap) + { + return 0; + } else + return CacheDebugRegisterIndex; case 0xF10: // instruction cache Tag register + if (PU_Map != PU_PrivMap) + { + return 0; + } else { uint8_t segment = (CacheDebugRegisterIndex >> (32-ICACHE_SETS_LOG2)) & (ICACHE_SETS-1); uint8_t wordAddress = (CacheDebugRegisterIndex & (ICACHE_LINELENGTH-1)) >> 2; @@ -1242,6 +1293,10 @@ u32 ARMv5::CP15Read(u32 id) const } case 0xF20: // data cache Tag register + if (PU_Map != PU_PrivMap) + { + return 0; + } else { uint8_t segment = (CacheDebugRegisterIndex >> (32-DCACHE_SETS_LOG2)) & (DCACHE_SETS-1); uint8_t wordAddress = (CacheDebugRegisterIndex & (DCACHE_LINELENGTH-1)) >> 2; @@ -1250,6 +1305,10 @@ u32 ARMv5::CP15Read(u32 id) const return DCacheTags[(index << DCACHE_SETS_LOG2) + segment]; } case 0xF30: + if (PU_Map != PU_PrivMap) + { + return 0; + } else { uint8_t segment = (CacheDebugRegisterIndex >> (32-ICACHE_SETS_LOG2)) & (ICACHE_SETS-1); uint8_t wordAddress = (CacheDebugRegisterIndex & (ICACHE_LINELENGTH-1)) >> 2; From cd60c13ea16ec60c5ba89a2a5d2b8ac412b78ddd Mon Sep 17 00:00:00 2001 From: DesperateProgrammer Date: Thu, 25 Jan 2024 12:09:17 +0100 Subject: [PATCH 019/306] Replaced Magic numbers in CP15 cache type register --- src/CP15.cpp | 4 +++- 1 file changed, 3 
insertions(+), 1 deletion(-) diff --git a/src/CP15.cpp b/src/CP15.cpp index d6ddcad8..58105aa0 100644 --- a/src/CP15.cpp +++ b/src/CP15.cpp @@ -1181,7 +1181,9 @@ u32 ARMv5::CP15Read(u32 id) const return 0x41059461; case 0x001: // cache type - return 0x0F0D2112; + return CACHE_TR_LOCKUP_TYPE_B | CACHE_TR_NONUNIFIED + | (DCACHE_LINELENGTH_ENCODED << 12) | (DCACHE_SETS_LOG2 << 15) | ((DCACHE_SIZE_LOG2 - 9) << 18) + | (ICACHE_LINELENGTH_ENCODED << 0) | (ICACHE_SETS_LOG2 << 3) | ((ICACHE_SIZE_LOG2 - 9) << 6); case 0x002: // TCM size return (6 << 6) | (5 << 18); From a0f4eb691ba4a2308ca0545b4b8fa75b68ded92e Mon Sep 17 00:00:00 2001 From: DesperateProgrammer Date: Thu, 25 Jan 2024 12:12:31 +0100 Subject: [PATCH 020/306] Fixed Typo and missing file on the cache type constants --- src/CP15.cpp | 2 +- src/MemConstants.h | 3 +++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/src/CP15.cpp b/src/CP15.cpp index 58105aa0..2a3936d9 100644 --- a/src/CP15.cpp +++ b/src/CP15.cpp @@ -1181,7 +1181,7 @@ u32 ARMv5::CP15Read(u32 id) const return 0x41059461; case 0x001: // cache type - return CACHE_TR_LOCKUP_TYPE_B | CACHE_TR_NONUNIFIED + return CACHE_TR_LOCKDOWN_TYPE_B | CACHE_TR_NONUNIFIED | (DCACHE_LINELENGTH_ENCODED << 12) | (DCACHE_SETS_LOG2 << 15) | ((DCACHE_SIZE_LOG2 - 9) << 18) | (ICACHE_LINELENGTH_ENCODED << 0) | (ICACHE_SETS_LOG2 << 3) | ((ICACHE_SIZE_LOG2 - 9) << 6); diff --git a/src/MemConstants.h b/src/MemConstants.h index af6f89f7..332b9b18 100644 --- a/src/MemConstants.h +++ b/src/MemConstants.h @@ -57,6 +57,9 @@ constexpr u32 CACHE_FLAG_VALID = (1 << 4); constexpr u32 CACHE_FLAG_DIRTY_LOWERHALF = (1 << 2); constexpr u32 CACHE_FLAG_DIRTY_UPPERHALF = (1 << 3); +constexpr u32 CACHE_TR_LOCKDOWN_TYPE_B = (7 << 25); +constexpr u32 CACHE_TR_NONUNIFIED = (1 << 24); + constexpr u32 CACHE_LOCKUP_L = (1 << 31); constexpr u32 CP15_CR_MPUENABLE = (1 << 0); From caa90dd5acc26bcb074a15677cde388b86d9ccf7 Mon Sep 17 00:00:00 2001 From: DesperateProgrammer Date: Fri, 26 Jan 
2024 09:21:08 +0100 Subject: [PATCH 021/306] Changed DCache Random Cache-Line selection to a double Galoise LFSR --- src/ARM.h | 2 ++ src/CP15.cpp | 18 +++++++++++++++++- 2 files changed, 19 insertions(+), 1 deletion(-) diff --git a/src/ARM.h b/src/ARM.h index f7a6b98b..2eaf940d 100644 --- a/src/ARM.h +++ b/src/ARM.h @@ -318,6 +318,7 @@ public: void ICacheInvalidateBySetAndWay(u8 cacheSet, u8 cacheLine); + u8 DCacheRandom(); void DCacheLookup(u32 addr); void DCacheWrite32(u32 addr, u32 val); void DCacheWrite16(u32 addr, u16 val); @@ -361,6 +362,7 @@ public: u8 DCache[DCACHE_SIZE]; u32 DCacheTags[DCACHE_LINESPERSET*DCACHE_SETS]; u8 DCacheCount; + u32 DCacheLFSRStates; u32 PU_CodeCacheable; u32 PU_DataCacheable; diff --git a/src/CP15.cpp b/src/CP15.cpp index 2a3936d9..848859e9 100644 --- a/src/CP15.cpp +++ b/src/CP15.cpp @@ -67,6 +67,10 @@ void ARMv5::CP15Reset() DCacheInvalidateAll(); DCacheCount = 0; + // make sure that both half words are not the same otherwise the random of the DCache set selection only produces + // '00' and '11' + DCacheLFSRStates = 0xDEADBEEF; + PU_CodeCacheable = 0; PU_DataCacheable = 0; PU_DataCacheWrite = 0; @@ -99,6 +103,7 @@ void ARMv5::CP15DoSavestate(Savestate* file) file->VarArray(DCache, sizeof(DCache)); file->VarArray(DCacheTags, sizeof(DCacheTags)); file->Var8(&DCacheCount); + file->Var32(&DCacheLFSRStates); file->Var32(&DCacheLockDown); file->Var32(&ICacheLockDown); @@ -352,6 +357,17 @@ u32 ARMv5::RandomLineIndex() return (RNGSeed >> 17) & 0x3; } +u8 ARMv5::DCacheRandom() +{ + // The random value, which line to select is derived from two LFSR of the + // same polynomial with different initial states, so that they reproduce + // the same 2047 bit sequence but with a random different starting point + u32 lowLFSRBits = DCacheLFSRStates & 0x00010001; + DCacheLFSRStates = (DCacheLFSRStates & ~0x00010001) >> 1; + DCacheLFSRStates ^= lowLFSRBits * 0x5E5 ; + return (lowLFSRBits | (lowLFSRBits >> 15)) & 3; +} + void ARMv5::ICacheLookup(u32 
addr) { u32 tag = (addr & ~(ICACHE_LINELENGTH - 1)) | CACHE_FLAG_VALID; @@ -490,7 +506,7 @@ void ARMv5::DCacheLookup(u32 addr) } else { - line = RandomLineIndex(); + line = DCacheRandom(); } if (DCacheLockDown) From 1dc15a0d075b455f9906f6b9e0bcf7b943964134 Mon Sep 17 00:00:00 2001 From: DesperateProgrammer Date: Fri, 26 Jan 2024 10:09:10 +0100 Subject: [PATCH 022/306] Simplified set selection adjustment for the cache lock down --- src/CP15.cpp | 14 +++----------- 1 file changed, 3 insertions(+), 11 deletions(-) diff --git a/src/CP15.cpp b/src/CP15.cpp index 848859e9..a59af2ab 100644 --- a/src/CP15.cpp +++ b/src/CP15.cpp @@ -406,11 +406,7 @@ void ARMv5::ICacheLookup(u32 addr) } else { u8 minSet = ICacheLockDown & (ICACHE_SETS-1); - if (minSet) - { - // part of the cache is locked up and only the cachelines - line = (line % (ICACHE_SETS - minSet)) + minSet; - } + line = line | minSet; } } @@ -518,12 +514,8 @@ void ARMv5::DCacheLookup(u32 addr) line = DCacheLockDown & (DCACHE_SETS-1); } else { - u8 minSet = DCacheLockDown & (DCACHE_SETS-1); - if (minSet) - { - // part of the cache is locked up and only the cachelines - line = (line % (DCACHE_SETS - minSet)) + minSet; - } + u8 minSet = ICacheLockDown & (DCACHE_SETS-1); + line = line | minSet; } } From 06ea3f68ec483dfdc8ac81737373e2486bdb13d8 Mon Sep 17 00:00:00 2001 From: DesperateProgrammer Date: Wed, 31 Jan 2024 12:20:41 +0100 Subject: [PATCH 023/306] Cleaned up and fastened up --- src/ARM.h | 27 ++++--- src/CP15.cpp | 203 ++++++++++++++++++++++++++------------------------- 2 files changed, 118 insertions(+), 112 deletions(-) diff --git a/src/ARM.h b/src/ARM.h index 2eaf940d..eedd997b 100644 --- a/src/ARM.h +++ b/src/ARM.h @@ -310,28 +310,27 @@ public: u32 RandomLineIndex(); - void ICacheLookup(u32 addr); - bool IsAddressICachable(u32 addr); + u32 ICacheLookup(const u32 addr); + inline bool IsAddressICachable(const u32 addr) const; void ICacheInvalidateAll(); - void ICacheInvalidateByAddr(u32 addr); - void 
ICacheInvalidateBySetAndWay(u8 cacheSet, u8 cacheLine); + void ICacheInvalidateByAddr(const u32 addr); + void ICacheInvalidateBySetAndWay(const u8 cacheSet, const u8 cacheLine); - u8 DCacheRandom(); - void DCacheLookup(u32 addr); - void DCacheWrite32(u32 addr, u32 val); - void DCacheWrite16(u32 addr, u16 val); - void DCacheWrite8(u32 addr, u8 val); - bool IsAddressDCachable(u32 addr); + u32 DCacheLookup(const u32 addr); + void DCacheWrite32(const u32 addr, const u32 val); + void DCacheWrite16(const u32 addr, const u16 val); + void DCacheWrite8(const u32 addr, const u8 val); + inline bool IsAddressDCachable(const u32 addr) const; void DCacheInvalidateAll(); - void DCacheInvalidateByAddr(u32 addr); - void DCacheInvalidateBySetAndWay(u8 cacheSet, u8 cacheLine); + void DCacheInvalidateByAddr(const u32 addr); + void DCacheInvalidateBySetAndWay(const u8 cacheSet, const u8 cacheLine); void DCacheClearAll(); - void DCacheClearByAddr(u32 addr); - void DCacheClearByASetAndWay(u8 cacheSet, u8 cacheLine); + void DCacheClearByAddr(const u32 addr); + void DCacheClearByASetAndWay(const u8 cacheSet, const u8 cacheLine); diff --git a/src/CP15.cpp b/src/CP15.cpp index a59af2ab..2dcce5c1 100644 --- a/src/CP15.cpp +++ b/src/CP15.cpp @@ -357,35 +357,33 @@ u32 ARMv5::RandomLineIndex() return (RNGSeed >> 17) & 0x3; } -u8 ARMv5::DCacheRandom() -{ - // The random value, which line to select is derived from two LFSR of the - // same polynomial with different initial states, so that they reproduce - // the same 2047 bit sequence but with a random different starting point - u32 lowLFSRBits = DCacheLFSRStates & 0x00010001; - DCacheLFSRStates = (DCacheLFSRStates & ~0x00010001) >> 1; - DCacheLFSRStates ^= lowLFSRBits * 0x5E5 ; - return (lowLFSRBits | (lowLFSRBits >> 15)) & 3; -} - -void ARMv5::ICacheLookup(u32 addr) +u32 ARMv5::ICacheLookup(const u32 addr) { - u32 tag = (addr & ~(ICACHE_LINELENGTH - 1)) | CACHE_FLAG_VALID; - u32 id = (addr >> ICACHE_LINELENGTH_LOG2) & (ICACHE_LINESPERSET-1); + 
const u32 tag = (addr & ~(ICACHE_LINELENGTH - 1)); + const u32 id = ((addr >> ICACHE_LINELENGTH_LOG2) & (ICACHE_LINESPERSET-1)) << ICACHE_SETS_LOG2; - id <<= ICACHE_SETS_LOG2; for (int set=0;set> 2]; } } // cache miss u32 line; +#if 0 + // caclulate in which cacheline the data is to be filled + // The code below is doing the same as the if-less below + // It increases performance by reducing banches. + // The code is kept here for readability. + // + // NOTE: If you need to update either part, you need + // to update the other too to keep them in sync! + // + if (CP15Control & CP15_CACHE_CR_ROUNDROBIN) { line = ICacheCount; @@ -410,36 +408,44 @@ void ARMv5::ICacheLookup(u32 addr) } } - line += id; +#else + // Do the same as above but instead of using if-else + // utilize the && and || operators to skip parts of the operations + // With the order of comparison we can put the most likely path + // checked first - addr &= ~(ICACHE_LINELENGTH-1); - u8* ptr = &ICache[line << ICACHE_LINELENGTH_LOG2]; + bool doLockDown = (ICacheLockDown & CACHE_LOCKUP_L); + bool roundRobin = CP15Control & CP15_CACHE_CR_ROUNDROBIN; + (!roundRobin && (line = RandomLineIndex())) || (roundRobin && (ICacheCount = line = ((ICacheCount+1) & (ICACHE_SETS-1)))) ; + (!doLockDown && (line = (line | ICacheLockDown & (ICACHE_SETS-1))+id)) || (doLockDown && (line = (ICacheLockDown & (ICACHE_SETS-1))+id)); +#endif + + u32* ptr = (u32 *)&ICache[line << ICACHE_LINELENGTH_LOG2]; if (CodeMem.Mem) { - memcpy(ptr, &CodeMem.Mem[addr & CodeMem.Mask], ICACHE_LINELENGTH); + memcpy(ptr, &CodeMem.Mem[tag & CodeMem.Mask], ICACHE_LINELENGTH); } else { for (int i = 0; i < ICACHE_LINELENGTH; i+=sizeof(u32)) - *(u32*)&ptr[i] = NDS.ARM9Read32(addr+i); + ptr[i >> 2] = NDS.ARM9Read32(tag+i); } - ICacheTags[line] = addr | (line & (ICACHE_SETS-1)) | CACHE_FLAG_VALID; + ICacheTags[line] = tag | (line & (ICACHE_SETS-1)) | CACHE_FLAG_VALID; // ouch :/ //printf("cache miss %08X: %d/%d\n", addr, NDS::ARM9MemTimings[addr >> 
14][2], NDS::ARM9MemTimings[addr >> 14][3]); // first N32 remaining S32 - CodeCycles = (NDS.ARM9MemTimings[addr >> 14][2] + (NDS.ARM9MemTimings[addr >> 14][3] * ((DCACHE_LINELENGTH / 4) - 1))) << NDS.ARM9ClockShift; - CurICacheLine = ptr; + CodeCycles = (NDS.ARM9MemTimings[tag >> 14][2] + (NDS.ARM9MemTimings[tag >> 14][3] * ((DCACHE_LINELENGTH / 4) - 1))) << NDS.ARM9ClockShift; + return ptr[(addr & (ICACHE_LINELENGTH-1)) >> 2]; } -void ARMv5::ICacheInvalidateByAddr(u32 addr) +void ARMv5::ICacheInvalidateByAddr(const u32 addr) { - u32 tag = (addr & ~(ICACHE_LINELENGTH - 1)) | CACHE_FLAG_VALID; - u32 id = (addr >> ICACHE_LINELENGTH_LOG2) & (ICACHE_LINESPERSET-1); + const u32 tag = (addr & ~(ICACHE_LINELENGTH - 1)) | CACHE_FLAG_VALID; + const u32 id = ((addr >> ICACHE_LINELENGTH_LOG2) & (ICACHE_LINESPERSET-1)) << ICACHE_SETS_LOG2; - id <<= ICACHE_SETS_LOG2; for (int set=0;set= ICACHE_SETS) return; @@ -468,33 +474,39 @@ void ARMv5::ICacheInvalidateAll() ICacheTags[i] &= ~CACHE_FLAG_VALID; ; } -bool ARMv5::IsAddressICachable(u32 addr) +bool ARMv5::IsAddressICachable(const u32 addr) const { return PU_Map[addr >> 12] & 0x40 ; } -void ARMv5::DCacheLookup(u32 addr) +u32 ARMv5::DCacheLookup(const u32 addr) { //Log(LogLevel::Debug,"DCache load @ %08x\n", addr); - addr &= ~3; - u32 tag = (addr & ~(DCACHE_LINELENGTH - 1)) | CACHE_FLAG_VALID; - u32 id = (addr >> DCACHE_LINELENGTH_LOG2) & (DCACHE_LINESPERSET-1); + const u32 tag = (addr & ~(DCACHE_LINELENGTH - 1)) ; + const u32 id = ((addr >> DCACHE_LINELENGTH_LOG2) & (DCACHE_LINESPERSET-1)) << DCACHE_SETS_LOG2; - id <<= DCACHE_SETS_LOG2; for (int set=0;set %08lx\n", addr, ((u32 *)CurDCacheLine)[(addr & (DCACHE_LINELENGTH-1)) >> 2]); - return; + u32 *cacheLine = (u32 *)&DCache[(id+set) << DCACHE_LINELENGTH_LOG2]; + return cacheLine[(addr & (ICACHE_LINELENGTH -1)) >> 2]; } } // cache miss u32 line; +#if 0 + // caclulate in which cacheline the data is to be filled + // The code below is doing the same as the if-less below + // It 
increases performance by reducing banches. + // The code is kept here for readability. + // + // NOTE: If you need to update either part, you need + // to update the other too to keep them in sync! + // + if (CP15Control & CP15_CACHE_CR_ROUNDROBIN) { line = DCacheCount; @@ -505,66 +517,71 @@ void ARMv5::DCacheLookup(u32 addr) line = DCacheRandom(); } + // Update the selected set depending on the DCache LockDown register if (DCacheLockDown) { if (DCacheLockDown & CACHE_LOCKUP_L) { // load into locked up cache // into the selected set - line = DCacheLockDown & (DCACHE_SETS-1); + line = (DCacheLockDown & (DCACHE_SETS-1)) + id; } else { u8 minSet = ICacheLockDown & (DCACHE_SETS-1); - line = line | minSet; + line = (line | minSet) + id; } } +#else + // Do the same as above but instead of using if-else + // utilize the && and || operators to skip parts of the operations + // With the order of comparison we can put the most likely path + // checked first - line += id; + bool doLockDown = (DCacheLockDown & CACHE_LOCKUP_L); + bool roundRobin = CP15Control & CP15_CACHE_CR_ROUNDROBIN; + (!roundRobin && (line = RandomLineIndex())) || (roundRobin && (DCacheCount = line = ((DCacheCount+1) & (DCACHE_SETS-1)))); + (!doLockDown && (line = (line | DCacheLockDown & (DCACHE_SETS-1))+id)) || (doLockDown && (line = (DCacheLockDown & (DCACHE_SETS-1))+id)); +#endif - addr &= ~(DCACHE_LINELENGTH-1); - u8* ptr = &DCache[line << DCACHE_LINELENGTH_LOG2]; + u32* ptr = (u32 *)&DCache[line << DCACHE_LINELENGTH_LOG2]; - //Log(LogLevel::Debug,"DCache miss, load @ %08x\n", addr); + //Log(LogLevel::Debug,"DCache miss, load @ %08x\n", tag); for (int i = 0; i < DCACHE_LINELENGTH; i+=sizeof(u32)) { - //DataRead32S(addr+i, (u32*)&ptr[i]); - if (addr+i < ITCMSize) + if (tag+i < ITCMSize) { - *((u32*)&ptr[i]) = *(u32*)&ITCM[addr & (ITCMPhysicalSize - 1)]; + ptr[i >> 2] = *(u32*)&ITCM[(tag+i) & (ITCMPhysicalSize - 1)]; } else - if (((addr+i) & DTCMMask) == DTCMBase) + if (((tag+i) & DTCMMask) == DTCMBase) 
{ - *((u32*)&ptr[i]) = *(u32*)&DTCM[addr & (DTCMPhysicalSize - 1)]; + ptr[i >> 2] = *(u32*)&DTCM[(tag+i) & (DTCMPhysicalSize - 1)]; } else { - *((u32*)&ptr[i]) = BusRead32(addr+i); + ptr[i >> 2] = BusRead32(tag+i); } - //Log(LogLevel::Debug,"DCache store @ %08x: %08x\n", addr+i, *(u32*)&ptr[i]); + //Log(LogLevel::Debug,"DCache store @ %08x: %08x\n", tag+i, *(u32*)&ptr[i]); } - DCacheTags[line] = addr | (line & (DCACHE_SETS-1)) | CACHE_FLAG_VALID; + DCacheTags[line] = tag | (line & (DCACHE_SETS-1)) | CACHE_FLAG_VALID; // ouch :/ //printf("cache miss %08X: %d/%d\n", addr, NDS::ARM9MemTimings[addr >> 14][2], NDS::ARM9MemTimings[addr >> 14][3]); // first N32 remaining S32 - DataCycles = (NDS.ARM9MemTimings[addr >> 14][2] + (NDS.ARM9MemTimings[addr >> 14][3] * ((DCACHE_LINELENGTH / 4) - 1))) << NDS.ARM9ClockShift; - CurDCacheLine = ptr; + DataCycles = (NDS.ARM9MemTimings[tag >> 14][2] + (NDS.ARM9MemTimings[tag >> 14][3] * ((DCACHE_LINELENGTH / 4) - 1))) << NDS.ARM9ClockShift; + return ptr[(addr & (DCACHE_LINELENGTH-1)) >> 2]; } -void ARMv5::DCacheWrite32(u32 addr, u32 val) +void ARMv5::DCacheWrite32(const u32 addr, const u32 val) { - addr &= ~3; + const u32 tag = (addr & ~(DCACHE_LINELENGTH - 1)) | CACHE_FLAG_VALID; + const u32 id = ((addr >> DCACHE_LINELENGTH_LOG2) & (DCACHE_LINESPERSET-1)) << DCACHE_SETS_LOG2; - u32 tag = (addr & ~(DCACHE_LINELENGTH - 1)) | CACHE_FLAG_VALID; - u32 id = (addr >> DCACHE_LINELENGTH_LOG2) & (DCACHE_LINESPERSET-1); - - id <<= DCACHE_SETS_LOG2; for (int set=0;set> 2] = val; DataCycles = 1; //Log(LogLevel::Debug,"DCache write32 hit @ %08x -> %08lx\n", addr, ((u32 *)CurDCacheLine)[(addr & (DCACHE_LINELENGTH-1)) >> 2]); @@ -573,20 +590,17 @@ void ARMv5::DCacheWrite32(u32 addr, u32 val) } } -void ARMv5::DCacheWrite16(u32 addr, u16 val) +void ARMv5::DCacheWrite16(const u32 addr, const u16 val) { - addr &= ~1; + const u32 tag = (addr & ~(DCACHE_LINELENGTH - 1)) | CACHE_FLAG_VALID; + const u32 id = ((addr >> DCACHE_LINELENGTH_LOG2) & 
(DCACHE_LINESPERSET-1)) << DCACHE_SETS_LOG2; - u32 tag = (addr & ~(DCACHE_LINELENGTH - 1)) | CACHE_FLAG_VALID; - u32 id = (addr >> DCACHE_LINELENGTH_LOG2) & (DCACHE_LINESPERSET-1); - - id <<= DCACHE_SETS_LOG2; for (int set=0;set> 1] = val; DataCycles = 1; //Log(LogLevel::Debug,"DCache write16 hit @ %08x -> %04x\n", addr, ((u16 *)CurDCacheLine)[(addr & (DCACHE_LINELENGTH-1)) >> 2]); @@ -595,18 +609,17 @@ void ARMv5::DCacheWrite16(u32 addr, u16 val) } } -void ARMv5::DCacheWrite8(u32 addr, u8 val) +void ARMv5::DCacheWrite8(const u32 addr, const u8 val) { - u32 tag = (addr & ~(DCACHE_LINELENGTH - 1)) | CACHE_FLAG_VALID; - u32 id = (addr >> DCACHE_LINELENGTH_LOG2) & (DCACHE_LINESPERSET-1); + const u32 tag = (addr & ~(DCACHE_LINELENGTH - 1)) | CACHE_FLAG_VALID; + const u32 id = ((addr >> DCACHE_LINELENGTH_LOG2) & (DCACHE_LINESPERSET-1)) << DCACHE_SETS_LOG2;; - id <<= DCACHE_SETS_LOG2; for (int set=0;set %02x\n", addr, ((u8 *)CurDCacheLine)[(addr & (DCACHE_LINELENGTH-1)) >> 2]); @@ -615,12 +628,11 @@ void ARMv5::DCacheWrite8(u32 addr, u8 val) } } -void ARMv5::DCacheInvalidateByAddr(u32 addr) +void ARMv5::DCacheInvalidateByAddr(const u32 addr) { - u32 tag = (addr & ~(DCACHE_LINELENGTH - 1)) | CACHE_FLAG_VALID; - u32 id = (addr >> DCACHE_LINELENGTH_LOG2) & (DCACHE_LINESPERSET-1); + const u32 tag = (addr & ~(DCACHE_LINELENGTH - 1)) | CACHE_FLAG_VALID; + const u32 id = ((addr >> DCACHE_LINELENGTH_LOG2) & (DCACHE_LINESPERSET-1)) << DCACHE_SETS_LOG2; - id <<= DCACHE_SETS_LOG2; for (int set=0;set= DCACHE_SETS) return; @@ -656,19 +668,19 @@ void ARMv5::DCacheClearAll() // underlying memory and invalidates the cache line. } -void ARMv5::DCacheClearByAddr(u32 addr) +void ARMv5::DCacheClearByAddr(const u32 addr) { // TODO: right now any write to cached data goes straight to the // underlying memory and invalidates the cache line. 
} -void ARMv5::DCacheClearByASetAndWay(u8 cacheSet, u8 cacheLine) +void ARMv5::DCacheClearByASetAndWay(const u8 cacheSet, const u8 cacheLine) { // TODO: right now any write to cached data goes straight to the // underlying memory and invalidates the cache line. } -bool ARMv5::IsAddressDCachable(u32 addr) +bool ARMv5::IsAddressDCachable(const u32 addr) const { return PU_Map[addr >> 12] & 0x10 ; } @@ -1371,8 +1383,7 @@ u32 ARMv5::CodeRead32(u32 addr, bool branch) { if (IsAddressICachable(addr)) { - ICacheLookup(addr); - return *(u32*)&CurICacheLine[addr & (ICACHE_LINELENGTH - 4)]; + return ICacheLookup(addr); } } } else @@ -1412,8 +1423,7 @@ void ARMv5::DataRead8(u32 addr, u32* val) { if (PU_Map[addr >> 12] & 0x10) { - DCacheLookup(addr & ~3); - *val = CurDCacheLine[addr & (DCACHE_LINELENGTH - 1)]; + *val = (DCacheLookup(addr) >> (8* (addr & 3))) & 0xff; return; } } @@ -1455,8 +1465,7 @@ void ARMv5::DataRead16(u32 addr, u32* val) { if (PU_Map[addr >> 12] & 0x10) { - DCacheLookup(addr & ~3); - *val = *(u16 *)&CurDCacheLine[addr & (DCACHE_LINELENGTH - 2)]; + *val = (DCacheLookup(addr) >> (8* (addr & 2))) & 0xffff; return; } } @@ -1500,8 +1509,7 @@ void ARMv5::DataRead32(u32 addr, u32* val) { if (PU_Map[addr >> 12] & 0x10) { - DCacheLookup(addr & ~3); - *val = *(u32 *)&CurDCacheLine[addr & (DCACHE_LINELENGTH - 4)]; + *val = DCacheLookup(addr); return; } } @@ -1538,8 +1546,7 @@ void ARMv5::DataRead32S(u32 addr, u32* val) { if (PU_Map[addr >> 12] & 0x10) { - DCacheLookup(addr & ~3); - *val = *(u32 *)&CurDCacheLine[addr & (DCACHE_LINELENGTH - 4)]; + *val = DCacheLookup(addr); return; } } From c8204a8c63c261937fbf39e9faf3af335bb5f5a0 Mon Sep 17 00:00:00 2001 From: DesperateProgrammer Date: Thu, 1 Feb 2024 08:36:52 +0100 Subject: [PATCH 024/306] Further clean up, removal of magic numbers from CP15.cpp Split the cp15 constants into CP15_Constants.h instead from MemConstants. 
--- src/ARM.cpp | 40 ++---- src/ARM.h | 60 ++++---- src/CP15.cpp | 328 ++++++++++++++++++------------------------- src/CP15_Constants.h | 131 +++++++++++++++++ src/MemConstants.h | 38 ----- 5 files changed, 311 insertions(+), 286 deletions(-) create mode 100644 src/CP15_Constants.h diff --git a/src/ARM.cpp b/src/ARM.cpp index c2f6a6c2..b8961a8f 100644 --- a/src/ARM.cpp +++ b/src/ARM.cpp @@ -1152,69 +1152,57 @@ u32 ARMv5::ReadMem(u32 addr, int size) } #endif -void ARMv4::DataRead8(u32 addr, u32* val) +void ARMv4::DataRead8(const u32 addr, u32* val) { *val = BusRead8(addr); DataRegion = addr; DataCycles = NDS.ARM7MemTimings[addr >> 15][0]; } -void ARMv4::DataRead16(u32 addr, u32* val) +void ARMv4::DataRead16(const u32 addr, u32* val) { - addr &= ~1; - - *val = BusRead16(addr); + *val = BusRead16(addr & ~1); DataRegion = addr; DataCycles = NDS.ARM7MemTimings[addr >> 15][0]; } -void ARMv4::DataRead32(u32 addr, u32* val) +void ARMv4::DataRead32(const u32 addr, u32* val) { - addr &= ~3; - - *val = BusRead32(addr); + *val = BusRead32(addr & ~3); DataRegion = addr; DataCycles = NDS.ARM7MemTimings[addr >> 15][2]; } -void ARMv4::DataRead32S(u32 addr, u32* val) +void ARMv4::DataRead32S(const u32 addr, u32* val) { - addr &= ~3; - - *val = BusRead32(addr); + *val = BusRead32(addr & ~3); DataCycles += NDS.ARM7MemTimings[addr >> 15][3]; } -void ARMv4::DataWrite8(u32 addr, u8 val) +void ARMv4::DataWrite8(const u32 addr, const u8 val) { BusWrite8(addr, val); DataRegion = addr; DataCycles = NDS.ARM7MemTimings[addr >> 15][0]; } -void ARMv4::DataWrite16(u32 addr, u16 val) +void ARMv4::DataWrite16(const u32 addr, const u16 val) { - addr &= ~1; - - BusWrite16(addr, val); + BusWrite16(addr & ~1, val); DataRegion = addr; DataCycles = NDS.ARM7MemTimings[addr >> 15][0]; } -void ARMv4::DataWrite32(u32 addr, u32 val) +void ARMv4::DataWrite32(const u32 addr, const u32 val) { - addr &= ~3; - - BusWrite32(addr, val); + BusWrite32(addr & ~3, val); DataRegion = addr; DataCycles = 
NDS.ARM7MemTimings[addr >> 15][2]; } -void ARMv4::DataWrite32S(u32 addr, u32 val) +void ARMv4::DataWrite32S(const u32 addr, const u32 val) { - addr &= ~3; - - BusWrite32(addr, val); + BusWrite32(addr & ~3, val); DataCycles += NDS.ARM7MemTimings[addr >> 15][3]; } diff --git a/src/ARM.h b/src/ARM.h index eedd997b..df06e84e 100644 --- a/src/ARM.h +++ b/src/ARM.h @@ -25,6 +25,7 @@ #include "types.h" #include "MemRegion.h" #include "MemConstants.h" +#include "CP15_Constants.h" #ifdef GDBSTUB_ENABLED #include "debug/GdbStub.h" @@ -128,14 +129,14 @@ public: void SetupCodeMem(u32 addr); - virtual void DataRead8(u32 addr, u32* val) = 0; - virtual void DataRead16(u32 addr, u32* val) = 0; - virtual void DataRead32(u32 addr, u32* val) = 0; - virtual void DataRead32S(u32 addr, u32* val) = 0; - virtual void DataWrite8(u32 addr, u8 val) = 0; - virtual void DataWrite16(u32 addr, u16 val) = 0; - virtual void DataWrite32(u32 addr, u32 val) = 0; - virtual void DataWrite32S(u32 addr, u32 val) = 0; + virtual void DataRead8(const u32 addr, u32* val) = 0; + virtual void DataRead16(const u32 addr, u32* val) = 0; + virtual void DataRead32(const u32 addr, u32* val) = 0; + virtual void DataRead32S(const u32 addr, u32* val) = 0; + virtual void DataWrite8(const u32 addr, const u8 val) = 0; + virtual void DataWrite16(const u32 addr, const u16 val) = 0; + virtual void DataWrite32(const u32 addr, const u32 val) = 0; + virtual void DataWrite32S(const u32 addr, const u32 val) = 0; virtual void AddCycles_C() = 0; virtual void AddCycles_CI(s32 numI) = 0; @@ -247,16 +248,16 @@ public: #endif // all code accesses are forced nonseq 32bit - u32 CodeRead32(u32 addr, bool branch); + u32 CodeRead32(const u32 addr, const bool branch); - void DataRead8(u32 addr, u32* val) override; - void DataRead16(u32 addr, u32* val) override; - void DataRead32(u32 addr, u32* val) override; - void DataRead32S(u32 addr, u32* val) override; - void DataWrite8(u32 addr, u8 val) override; - void DataWrite16(u32 addr, u16 val) 
override; - void DataWrite32(u32 addr, u32 val) override; - void DataWrite32S(u32 addr, u32 val) override; + void DataRead8(const u32 addr, u32* val) override; + void DataRead16(const u32 addr, u32* val) override; + void DataRead32(const u32 addr, u32* val) override; + void DataRead32S(const u32 addr, u32* val) override; + void DataWrite8(const u32 addr, const u8 val) override; + void DataWrite16(const u32 addr, const u16 val) override; + void DataWrite32(const u32 addr, const u32 val) override; + void DataWrite32S(const u32 addr, const u32 val) override; void AddCycles_C() override { @@ -317,7 +318,6 @@ public: void ICacheInvalidateByAddr(const u32 addr); void ICacheInvalidateBySetAndWay(const u8 cacheSet, const u8 cacheLine); - u32 DCacheLookup(const u32 addr); void DCacheWrite32(const u32 addr, const u32 val); void DCacheWrite16(const u32 addr, const u16 val); @@ -361,7 +361,6 @@ public: u8 DCache[DCACHE_SIZE]; u32 DCacheTags[DCACHE_LINESPERSET*DCACHE_SETS]; u8 DCacheCount; - u32 DCacheLFSRStates; u32 PU_CodeCacheable; u32 PU_DataCacheable; @@ -383,9 +382,6 @@ public: // code/16N/32N/32S u8 MemTimings[0x100000][4]; - u8* CurICacheLine; - u8* CurDCacheLine; - bool (*GetMemRegion)(u32 addr, bool write, MemRegion* region); #ifdef GDBSTUB_ENABLED @@ -416,24 +412,24 @@ public: void ExecuteJIT() override; #endif - u16 CodeRead16(u32 addr) + u16 CodeRead16(const u32 addr) { return BusRead16(addr); } - u32 CodeRead32(u32 addr) + u32 CodeRead32(const u32 addr) { return BusRead32(addr); } - void DataRead8(u32 addr, u32* val) override; - void DataRead16(u32 addr, u32* val) override; - void DataRead32(u32 addr, u32* val) override; - void DataRead32S(u32 addr, u32* val) override; - void DataWrite8(u32 addr, u8 val) override; - void DataWrite16(u32 addr, u16 val) override; - void DataWrite32(u32 addr, u32 val) override; - void DataWrite32S(u32 addr, u32 val) override; + void DataRead8(const u32 addr, u32* val) override; + void DataRead16(const u32 addr, u32* val) override; + 
void DataRead32(const u32 addr, u32* val) override; + void DataRead32S(const u32 addr, u32* val) override; + void DataWrite8(const u32 addr, const u8 val) override; + void DataWrite16(const u32 addr, const u16 val) override; + void DataWrite32(const u32 addr, const u32 val) override; + void DataWrite32S(const u32 addr, const u32 val) override; void AddCycles_C() override; void AddCycles_CI(s32 num) override; void AddCycles_CDI() override; diff --git a/src/CP15.cpp b/src/CP15.cpp index 2dcce5c1..714e591d 100644 --- a/src/CP15.cpp +++ b/src/CP15.cpp @@ -24,6 +24,7 @@ #include "Platform.h" #include "ARMJIT_Memory.h" #include "ARMJIT.h" +#include "CP15_Constants.h" namespace melonDS { @@ -67,10 +68,6 @@ void ARMv5::CP15Reset() DCacheInvalidateAll(); DCacheCount = 0; - // make sure that both half words are not the same otherwise the random of the DCache set selection only produces - // '00' and '11' - DCacheLFSRStates = 0xDEADBEEF; - PU_CodeCacheable = 0; PU_DataCacheable = 0; PU_DataCacheWrite = 0; @@ -78,10 +75,9 @@ void ARMv5::CP15Reset() PU_CodeRW = 0; PU_DataRW = 0; - memset(PU_Region, 0, 8*sizeof(u32)); + memset(PU_Region, 0, CP15_REGION_COUNT*sizeof(u32)); UpdatePURegions(true); - CurICacheLine = NULL; } void ARMv5::CP15DoSavestate(Savestate* file) @@ -103,7 +99,6 @@ void ARMv5::CP15DoSavestate(Savestate* file) file->VarArray(DCache, sizeof(DCache)); file->VarArray(DCacheTags, sizeof(DCacheTags)); file->Var8(&DCacheCount); - file->Var32(&DCacheLFSRStates); file->Var32(&DCacheLockDown); file->Var32(&ICacheLockDown); @@ -116,7 +111,7 @@ void ARMv5::CP15DoSavestate(Savestate* file) file->Var32(&PU_CodeRW); file->Var32(&PU_DataRW); - file->VarArray(PU_Region, 8*sizeof(u32)); + file->VarArray(PU_Region, CP15_REGION_COUNT*sizeof(u32)); if (!file->Saving) { @@ -178,8 +173,11 @@ void ARMv5::UpdatePURegion(u32 n) if (!(CP15Control & CP15_CR_MPUENABLE)) return; - u32 coderw = (PU_CodeRW >> (4*n)) & 0xF; - u32 datarw = (PU_DataRW >> (4*n)) & 0xF; + if (n >= 
CP15_REGION_COUNT) + return; + + u32 coderw = (PU_CodeRW >> (CP15_REGIONACCESS_BITS_PER_REGION * n)) & CP15_REGIONACCESS_REGIONMASK; + u32 datarw = (PU_DataRW >> (CP15_REGIONACCESS_BITS_PER_REGION * n)) & CP15_REGIONACCESS_REGIONMASK; u32 codecache, datacache, datawrite; @@ -211,60 +209,60 @@ void ARMv5::UpdatePURegion(u32 n) return; } - u32 start = rgn >> 12; - u32 sz = 2 << ((rgn >> 1) & 0x1F); - u32 end = start + (sz >> 12); + u32 start = (rgn & CP15_REGION_BASE_MASK) >> CP15_MAP_ENTRYSIZE_LOG2; + u32 sz = 2 << ((rgn & CP15_REGION_SIZE_MASK) >> 1); + u32 end = start + (sz >> CP15_MAP_ENTRYSIZE_LOG2); // TODO: check alignment of start - u8 usermask = 0; - u8 privmask = 0; + u8 usermask = CP15_MAP_NOACCESS; + u8 privmask = CP15_MAP_NOACCESS; switch (datarw) { case 0: break; - case 1: privmask |= 0x03; break; - case 2: privmask |= 0x03; usermask |= 0x01; break; - case 3: privmask |= 0x03; usermask |= 0x03; break; - case 5: privmask |= 0x01; break; - case 6: privmask |= 0x01; usermask |= 0x01; break; + case 1: privmask |= CP15_MAP_READABLE | CP15_MAP_WRITEABLE; break; + case 2: privmask |= CP15_MAP_READABLE | CP15_MAP_WRITEABLE; usermask |= CP15_MAP_READABLE; break; + case 3: privmask |= CP15_MAP_READABLE | CP15_MAP_WRITEABLE; usermask |= CP15_MAP_READABLE | CP15_MAP_WRITEABLE; break; + case 5: privmask |= CP15_MAP_READABLE; break; + case 6: privmask |= CP15_MAP_READABLE; usermask |= CP15_MAP_READABLE; break; default: Log(LogLevel::Warn, "!! 
BAD DATARW VALUE %d\n", datarw&0xF); } switch (coderw) { case 0: break; - case 1: privmask |= 0x04; break; - case 2: privmask |= 0x04; usermask |= 0x04; break; - case 3: privmask |= 0x04; usermask |= 0x04; break; - case 5: privmask |= 0x04; break; - case 6: privmask |= 0x04; usermask |= 0x04; break; + case 1: privmask |= CP15_MAP_EXECUTABLE; break; + case 2: privmask |= CP15_MAP_EXECUTABLE; usermask |= CP15_MAP_EXECUTABLE; break; + case 3: privmask |= CP15_MAP_EXECUTABLE; usermask |= CP15_MAP_EXECUTABLE; break; + case 5: privmask |= CP15_MAP_EXECUTABLE; break; + case 6: privmask |= CP15_MAP_EXECUTABLE; usermask |= CP15_MAP_EXECUTABLE; break; default: Log(LogLevel::Warn, "!! BAD CODERW VALUE %d\n", datarw&0xF); } if (datacache & 0x1) { - privmask |= 0x10; - usermask |= 0x10; + privmask |= CP15_MAP_DCACHEABLE; + usermask |= CP15_MAP_DCACHEABLE; if (datawrite & 0x1) { - privmask |= 0x20; - usermask |= 0x20; + privmask |= CP15_MAP_DCACHEWRITEBACK; + usermask |= CP15_MAP_DCACHEWRITEBACK; } } if (codecache & 0x1) { - privmask |= 0x40; - usermask |= 0x40; + privmask |= CP15_MAP_ICACHEABLE; + usermask |= CP15_MAP_ICACHEABLE; } Log( LogLevel::Debug, "PU region %d: %08X-%08X, user=%02X priv=%02X, %08X/%08X\n", n, - start << 12, - end << 12, + start << CP15_MAP_ENTRYSIZE_LOG2, + end << CP15_MAP_ENTRYSIZE_LOG2, usermask, privmask, PU_DataRW, @@ -286,9 +284,9 @@ void ARMv5::UpdatePURegions(bool update_all) { // PU disabled - u8 mask = 0x07; - if (CP15Control & CP15_CACHE_CR_DCACHEENABLE) mask |= 0x30; - if (CP15Control & CP15_CACHE_CR_ICACHEENABLE) mask |= 0x40; + u8 mask = CP15_MAP_READABLE | CP15_MAP_WRITEABLE | CP15_MAP_EXECUTABLE; + if (CP15Control & CP15_CACHE_CR_DCACHEENABLE) mask |= CP15_MAP_DCACHEABLE | CP15_MAP_DCACHEWRITEBACK ; + if (CP15Control & CP15_CACHE_CR_ICACHEENABLE) mask |= CP15_MAP_ICACHEABLE; memset(PU_UserMap, mask, 0x100000); memset(PU_PrivMap, mask, 0x100000); @@ -299,11 +297,11 @@ void ARMv5::UpdatePURegions(bool update_all) if (update_all) { - 
memset(PU_UserMap, 0, 0x100000); - memset(PU_PrivMap, 0, 0x100000); + memset(PU_UserMap, CP15_MAP_NOACCESS, 0x100000); + memset(PU_PrivMap, CP15_MAP_NOACCESS, 0x100000); } - for (int n = 0; n < 8; n++) + for (int n = 0; n < CP15_REGION_COUNT; n++) { UpdatePURegion(n); } @@ -322,7 +320,7 @@ void ARMv5::UpdateRegionTimings(u32 addrstart, u32 addrend) u8 pu = PU_Map[i]; u8* bustimings = NDS.ARM9MemTimings[i >> 2]; - if (pu & 0x40) + if (pu & CP15_MAP_ICACHEABLE) { MemTimings[i][0] = 0xFF;//kCodeCacheTiming; } @@ -331,7 +329,7 @@ void ARMv5::UpdateRegionTimings(u32 addrstart, u32 addrend) MemTimings[i][0] = bustimings[2] << NDS.ARM9ClockShift; } - if (pu & 0x10) + if (pu & CP15_MAP_DCACHEABLE) { MemTimings[i][1] = kDataCacheTiming; MemTimings[i][2] = kDataCacheTiming; @@ -364,7 +362,7 @@ u32 ARMv5::ICacheLookup(const u32 addr) for (int set=0;set> 12] & 0x40 ; + return PU_Map[addr >> CP15_MAP_ENTRYSIZE_LOG2] & CP15_MAP_ICACHEABLE ; } u32 ARMv5::DCacheLookup(const u32 addr) @@ -487,7 +485,7 @@ u32 ARMv5::DCacheLookup(const u32 addr) for (int set=0;set> 2] = val; DataCycles = 1; - - //Log(LogLevel::Debug,"DCache write32 hit @ %08x -> %08lx\n", addr, ((u32 *)CurDCacheLine)[(addr & (DCACHE_LINELENGTH-1)) >> 2]); return; } } @@ -597,13 +593,11 @@ void ARMv5::DCacheWrite16(const u32 addr, const u16 val) for (int set=0;set> 1] = val; DataCycles = 1; - - //Log(LogLevel::Debug,"DCache write16 hit @ %08x -> %04x\n", addr, ((u16 *)CurDCacheLine)[(addr & (DCACHE_LINELENGTH-1)) >> 2]); return; } } @@ -616,13 +610,11 @@ void ARMv5::DCacheWrite8(const u32 addr, const u8 val) for (int set=0;set %02x\n", addr, ((u8 *)CurDCacheLine)[(addr & (DCACHE_LINELENGTH-1)) >> 2]); return; } } @@ -635,7 +627,7 @@ void ARMv5::DCacheInvalidateByAddr(const u32 addr) for (int set=0;set> 12] & 0x10 ; + return PU_Map[addr >> CP15_MAP_ENTRYSIZE_LOG2] & CP15_MAP_DCACHEABLE ; } void ARMv5::CP15Write(u32 id, u32 val) @@ -700,13 +692,14 @@ void ARMv5::CP15Write(u32 id, u32 val) //Log(LogLevel::Debug, 
"CP15Control = %08X (%08X->%08X)\n", CP15Control, old, val); UpdateDTCMSetting(); UpdateITCMSetting(); - if ((old & 0x1005) != (val & 0x1005)) + u32 changedBits = old^val; + if (changedBits & (CP15_CR_MPUENABLE | CP15_CACHE_CR_ICACHEENABLE| CP15_CACHE_CR_DCACHEENABLE)) { - UpdatePURegions((old & 0x1) != (val & 0x1)); + UpdatePURegions(changedBits & CP15_CR_MPUENABLE); } if (val & CP15_CR_BIGENDIAN) Log(LogLevel::Warn, "!!!! ARM9 BIG ENDIAN MODE. VERY BAD. SHIT GONNA ASPLODE NOW\n"); - if (val & CP15_CR_HIGHEXCEPTIONBASE) ExceptionBase = 0xFFFF0000; - else ExceptionBase = 0x00000000; + if (val & CP15_CR_HIGHEXCEPTIONBASE) ExceptionBase = CP15_EXCEPTIONBASE_HIGH; + else ExceptionBase = CP15_EXCEPTIONBASE_LOW; } return; @@ -715,7 +708,7 @@ void ARMv5::CP15Write(u32 id, u32 val) { u32 diff = PU_DataCacheable ^ val; PU_DataCacheable = val; - for (u32 i = 0; i < 8; i++) + for (u32 i = 0; i < CP15_REGION_COUNT; i++) { if (diff & (1<> 4) & 0xF] = val; + PU_Region[(id >> CP15_REGIONACCESS_BITS_PER_REGION) & CP15_REGIONACCESS_REGIONMASK] = val; - std::snprintf(log_output, - sizeof(log_output), + Log(LogLevel::Debug, "PU: region %d = %08X : %s, %08X-%08X\n", - (id >> 4) & 0xF, + (id >> CP15_REGIONACCESS_BITS_PER_REGION) & CP15_REGIONACCESS_REGIONMASK, val, val & 1 ? "enabled" : "disabled", - val & 0xFFFFF000, - (val & 0xFFFFF000) + (2 << ((val & 0x3E) >> 1)) + val & CP15_REGION_BASE_MASK, + (val & CP15_REGION_BASE_MASK) + (2 << ((val & CP15_REGION_SIZE_MASK) >> 1)) ); - Log(LogLevel::Debug, "%s", log_output); - // Some implementations of Log imply a newline, so we build up the line before printing it - // TODO: smarter region update for this? 
UpdatePURegions(true); return; @@ -854,7 +842,6 @@ void ARMv5::CP15Write(u32 id, u32 val) case 0x750: // Can be executed in user and priv mode ICacheInvalidateAll(); - //Halt(255); return; case 0x751: // requires priv mode or causes UNKNOWN INSTRUCTION exception @@ -1179,11 +1166,7 @@ void ARMv5::CP15Write(u32 id, u32 val) } - if ((id & 0xF00) == 0xF00) // test/debug shit? - return; - - if ((id & 0xF00) != 0x700) - Log(LogLevel::Debug, "unknown CP15 write op %03X %08X\n", id, val); + Log(LogLevel::Debug, "unknown CP15 write op %03X %08X\n", id, val); } u32 ARMv5::CP15Read(u32 id) const @@ -1198,7 +1181,7 @@ u32 ARMv5::CP15Read(u32 id) const case 0x005: case 0x006: case 0x007: - return 0x41059461; + return CP15_MAINID_IMPLEMENTOR_ARM | CP15_MAINID_VARIANT_0 | CP15_MAINID_ARCH_v5TE | CP15_MAINID_IMPLEMENTATION_946 | CP15_MAINID_REVISION_1; case 0x001: // cache type return CACHE_TR_LOCKDOWN_TYPE_B | CACHE_TR_NONUNIFIED @@ -1269,7 +1252,7 @@ u32 ARMv5::CP15Read(u32 id) const case 0x661: case 0x670: case 0x671: - return PU_Region[(id >> 4) & 0xF]; + return PU_Region[(id >> CP15_REGIONACCESS_BITS_PER_REGION) & 0xF]; case 0x7A6: // read Cache Dirty Bit (optional) @@ -1346,9 +1329,6 @@ u32 ARMv5::CP15Read(u32 id) const } } - if ((id & 0xF00) == 0xF00) // test/debug shit? - return 0; - Log(LogLevel::Debug, "unknown CP15 read op %03X\n", id); return 0; } @@ -1357,24 +1337,9 @@ u32 ARMv5::CP15Read(u32 id) const // TCM are handled here. 
// TODO: later on, handle PU -u32 ARMv5::CodeRead32(u32 addr, bool branch) +u32 ARMv5::CodeRead32(const u32 addr, bool const branch) { - /*if (branch || (!(addr & 0xFFF))) - { - if (!(PU_Map[addr>>12] & 0x04)) - { - PrefetchAbort(); - return 0; - } - }*/ - if (addr < ITCMSize) - { - CodeCycles = 1; - return *(u32*)&ITCM[addr & (ITCMPhysicalSize - 1)]; - } - - CodeCycles = RegionCodeCycles; #ifdef JIT_ENABLED if (!NDS.IsJITEnabled()) #endif @@ -1386,27 +1351,33 @@ u32 ARMv5::CodeRead32(u32 addr, bool branch) return ICacheLookup(addr); } } - } else - { - if (CodeCycles == 0xFF) // cached memory. hax - { - if (branch || !(addr & 0x1F)) - CodeCycles = kCodeCacheTiming;//ICacheLookup(addr); - else - CodeCycles = 1; + } - //return *(u32*)&CurICacheLine[addr & 0x1C]; - } + if (addr < ITCMSize) + { + CodeCycles = 1; + return *(u32*)&ITCM[addr & (ITCMPhysicalSize - 1)]; } + + CodeCycles = RegionCodeCycles; + + if (CodeCycles == 0xFF) // cached memory. hax + { + if (branch || !(addr & (ICACHE_LINELENGTH-1))) + CodeCycles = kCodeCacheTiming;//ICacheLookup(addr); + else + CodeCycles = 1; + } + if (CodeMem.Mem) return *(u32*)&CodeMem.Mem[addr & CodeMem.Mask]; return BusRead32(addr); } -void ARMv5::DataRead8(u32 addr, u32* val) +void ARMv5::DataRead8(const u32 addr, u32* val) { - if (!(PU_Map[addr>>12] & 0x01)) + if (!(PU_Map[addr>>CP15_MAP_ENTRYSIZE_LOG2] & CP15_MAP_READABLE)) { Log(LogLevel::Debug, "data8 abort @ %08lx\n", addr); DataAbort(); @@ -1421,9 +1392,9 @@ void ARMv5::DataRead8(u32 addr, u32* val) { if (CP15Control & CP15_CACHE_CR_DCACHEENABLE) { - if (PU_Map[addr >> 12] & 0x10) + if (IsAddressDCachable(addr)) { - *val = (DCacheLookup(addr) >> (8* (addr & 3))) & 0xff; + *val = (DCacheLookup(addr) >> (8 * (addr & 3))) & 0xff; return; } } @@ -1443,12 +1414,12 @@ void ARMv5::DataRead8(u32 addr, u32* val) } *val = BusRead8(addr); - DataCycles = MemTimings[addr >> 12][1]; + DataCycles = MemTimings[addr >> BUSCYCLES_MAP_GRANULARITY_LOG2][BUSCYCLES_S16]; } -void 
ARMv5::DataRead16(u32 addr, u32* val) +void ARMv5::DataRead16(const u32 addr, u32* val) { - if (!(PU_Map[addr>>12] & 0x01)) + if (!(PU_Map[addr>>CP15_MAP_ENTRYSIZE_LOG2] & CP15_MAP_READABLE)) { Log(LogLevel::Debug, "data16 abort @ %08lx\n", addr); DataAbort(); @@ -1463,7 +1434,7 @@ void ARMv5::DataRead16(u32 addr, u32* val) { if (CP15Control & CP15_CACHE_CR_DCACHEENABLE) { - if (PU_Map[addr >> 12] & 0x10) + if (IsAddressDCachable(addr)) { *val = (DCacheLookup(addr) >> (8* (addr & 2))) & 0xffff; return; @@ -1471,28 +1442,26 @@ void ARMv5::DataRead16(u32 addr, u32* val) } } - addr &= ~1; - if (addr < ITCMSize) { DataCycles = 1; - *val = *(u16*)&ITCM[addr & (ITCMPhysicalSize - 1)]; + *val = *(u16*)&ITCM[addr & (ITCMPhysicalSize - 2)]; return; } if ((addr & DTCMMask) == DTCMBase) { DataCycles = 1; - *val = *(u16*)&DTCM[addr & (DTCMPhysicalSize - 1)]; + *val = *(u16*)&DTCM[addr & (DTCMPhysicalSize - 2)]; return; } - *val = BusRead16(addr); - DataCycles = MemTimings[addr >> 12][1]; + *val = BusRead16(addr & ~1); + DataCycles = MemTimings[addr >> BUSCYCLES_MAP_GRANULARITY_LOG2][BUSCYCLES_S16]; } -void ARMv5::DataRead32(u32 addr, u32* val) +void ARMv5::DataRead32(const u32 addr, u32* val) { - if (!(PU_Map[addr>>12] & 0x01)) + if (!(PU_Map[addr>>CP15_MAP_ENTRYSIZE_LOG2] & CP15_MAP_READABLE)) { Log(LogLevel::Debug, "data32 abort @ %08lx\n", addr); DataAbort(); @@ -1507,7 +1476,7 @@ void ARMv5::DataRead32(u32 addr, u32* val) { if (CP15Control & CP15_CACHE_CR_DCACHEENABLE) { - if (PU_Map[addr >> 12] & 0x10) + if (IsAddressDCachable(addr)) { *val = DCacheLookup(addr); return; @@ -1515,36 +1484,32 @@ void ARMv5::DataRead32(u32 addr, u32* val) } } - addr &= ~3; - if (addr < ITCMSize) { DataCycles = 1; - *val = *(u32*)&ITCM[addr & (ITCMPhysicalSize - 1)]; + *val = *(u32*)&ITCM[addr & (ITCMPhysicalSize - 4)]; /* size-4 clears address bits 1:0 (word-aligned offset) */ return; } if ((addr & DTCMMask) == DTCMBase) { DataCycles = 1; - *val = *(u32*)&DTCM[addr & (DTCMPhysicalSize - 1)]; + *val = *(u32*)&DTCM[addr & (DTCMPhysicalSize - 4)]; 
return; } - *val = BusRead32(addr); - DataCycles = MemTimings[addr >> 12][2]; + *val = BusRead32(addr & ~0x03); + DataCycles = MemTimings[addr >> BUSCYCLES_MAP_GRANULARITY_LOG2][BUSCYCLES_N32]; } -void ARMv5::DataRead32S(u32 addr, u32* val) +void ARMv5::DataRead32S(const u32 addr, u32* val) { - addr &= ~3; - #ifdef JIT_ENABLED if (!NDS.IsJITEnabled()) #endif { if (CP15Control & CP15_CACHE_CR_DCACHEENABLE) { - if (PU_Map[addr >> 12] & 0x10) + if (IsAddressDCachable(addr)) { *val = DCacheLookup(addr); return; @@ -1555,23 +1520,23 @@ void ARMv5::DataRead32S(u32 addr, u32* val) if (addr < ITCMSize) { DataCycles += 1; - *val = *(u32*)&ITCM[addr & (ITCMPhysicalSize - 1)]; + *val = *(u32*)&ITCM[addr & (ITCMPhysicalSize - 4)]; /* size-4 clears address bits 1:0 (word-aligned offset) */ return; } if ((addr & DTCMMask) == DTCMBase) { DataCycles += 1; - *val = *(u32*)&DTCM[addr & (DTCMPhysicalSize - 1)]; + *val = *(u32*)&DTCM[addr & (DTCMPhysicalSize - 4)]; return; } - *val = BusRead32(addr); - DataCycles += MemTimings[addr >> 12][3]; + *val = BusRead32(addr & ~0x03); + DataCycles += MemTimings[addr >> BUSCYCLES_MAP_GRANULARITY_LOG2][BUSCYCLES_S32]; } -void ARMv5::DataWrite8(u32 addr, u8 val) +void ARMv5::DataWrite8(const u32 addr, const u8 val) { - if (!(PU_Map[addr>>12] & 0x02)) + if (!(PU_Map[addr>>CP15_MAP_ENTRYSIZE_LOG2] & CP15_MAP_WRITEABLE)) { DataAbort(); return; @@ -1583,10 +1548,9 @@ void ARMv5::DataWrite8(u32 addr, u8 val) { if (CP15Control & CP15_CACHE_CR_DCACHEENABLE) { - if (PU_Map[addr >> 12] & 0x10) + if (IsAddressDCachable(addr)) { DCacheWrite8(addr, val); - //DCacheInvalidateByAddr(addr); } } } @@ -1608,12 +1572,12 @@ void ARMv5::DataWrite8(u32 addr, u8 val) } BusWrite8(addr, val); - DataCycles = MemTimings[addr >> 12][1]; + DataCycles = MemTimings[addr >> BUSCYCLES_MAP_GRANULARITY_LOG2][BUSCYCLES_S16]; } -void ARMv5::DataWrite16(u32 addr, u16 val) +void ARMv5::DataWrite16(const u32 addr, const u16 val) { - if (!(PU_Map[addr>>12] & 0x02)) + if (!(PU_Map[addr>>CP15_MAP_ENTRYSIZE_LOG2] & CP15_MAP_WRITEABLE)) { 
DataAbort(); return; @@ -1625,39 +1589,36 @@ void ARMv5::DataWrite16(u32 addr, u16 val) { if (CP15Control & CP15_CACHE_CR_DCACHEENABLE) { - if (PU_Map[addr >> 12] & 0x10) + if (IsAddressDCachable(addr)) { DCacheWrite16(addr, val); - // DCacheInvalidateByAddr(addr); } } } DataRegion = addr; - addr &= ~1; - if (addr < ITCMSize) { DataCycles = 1; - *(u16*)&ITCM[addr & (ITCMPhysicalSize - 1)] = val; + *(u16*)&ITCM[addr & (ITCMPhysicalSize - 2)] = val; NDS.JIT.CheckAndInvalidate<0, ARMJIT_Memory::memregion_ITCM>(addr); return; } if ((addr & DTCMMask) == DTCMBase) { DataCycles = 1; - *(u16*)&DTCM[addr & (DTCMPhysicalSize - 1)] = val; + *(u16*)&DTCM[addr & (DTCMPhysicalSize - 2)] = val; return; } - BusWrite16(addr, val); - DataCycles = MemTimings[addr >> 12][1]; + BusWrite16(addr & ~1, val); + DataCycles = MemTimings[addr >> BUSCYCLES_MAP_GRANULARITY_LOG2][BUSCYCLES_S16]; } -void ARMv5::DataWrite32(u32 addr, u32 val) +void ARMv5::DataWrite32(const u32 addr, const u32 val) { - if (!(PU_Map[addr>>12] & 0x02)) + if (!(PU_Map[addr>>CP15_MAP_ENTRYSIZE_LOG2] & CP15_MAP_WRITEABLE)) { DataAbort(); return; @@ -1669,50 +1630,44 @@ void ARMv5::DataWrite32(u32 addr, u32 val) { if (CP15Control & CP15_CACHE_CR_DCACHEENABLE) { - if (PU_Map[addr >> 12] & 0x10) + if (IsAddressDCachable(addr)) { DCacheWrite32(addr, val); - // DCacheInvalidateByAddr(addr); } } } DataRegion = addr; - addr &= ~3; - if (addr < ITCMSize) { DataCycles = 1; - *(u32*)&ITCM[addr & (ITCMPhysicalSize - 1)] = val; + *(u32*)&ITCM[addr & (ITCMPhysicalSize - 4)] = val; /* size-4 clears address bits 1:0 (word-aligned offset) */ NDS.JIT.CheckAndInvalidate<0, ARMJIT_Memory::memregion_ITCM>(addr); return; } if ((addr & DTCMMask) == DTCMBase) { DataCycles = 1; - *(u32*)&DTCM[addr & (DTCMPhysicalSize - 1)] = val; + *(u32*)&DTCM[addr & (DTCMPhysicalSize - 4)] = val; return; } - BusWrite32(addr, val); - DataCycles = MemTimings[addr >> 12][2]; + BusWrite32(addr & ~3, val); + DataCycles = MemTimings[addr >> BUSCYCLES_MAP_GRANULARITY_LOG2][BUSCYCLES_N32]; } -void ARMv5::DataWrite32S(u32 
addr, u32 val) +void ARMv5::DataWrite32S(const u32 addr, const u32 val) { - addr &= ~3; - #ifdef JIT_ENABLED if (!NDS.IsJITEnabled()) #endif { if (CP15Control & CP15_CACHE_CR_DCACHEENABLE) { - if (PU_Map[addr >> 12] & 0x10) + if (IsAddressDCachable(addr)) { DCacheWrite32(addr, val); - // DCacheInvalidateByAddr(addr); } } } @@ -1720,7 +1675,7 @@ void ARMv5::DataWrite32S(u32 addr, u32 val) if (addr < ITCMSize) { DataCycles += 1; - *(u32*)&ITCM[addr & (ITCMPhysicalSize - 1)] = val; + *(u32*)&ITCM[addr & (ITCMPhysicalSize - 4)] = val; /* size-4 clears address bits 1:0 (word-aligned offset) */ #ifdef JIT_ENABLED NDS.JIT.CheckAndInvalidate<0, ARMJIT_Memory::memregion_ITCM>(addr); #endif @@ -1729,23 +1684,16 @@ void ARMv5::DataWrite32S(u32 addr, u32 val) if ((addr & DTCMMask) == DTCMBase) { DataCycles += 1; - *(u32*)&DTCM[addr & (DTCMPhysicalSize - 1)] = val; + *(u32*)&DTCM[addr & (DTCMPhysicalSize - 4)] = val; return; } - BusWrite32(addr, val); - DataCycles += MemTimings[addr >> 12][3]; + BusWrite32(addr & ~3, val); + DataCycles += MemTimings[addr >> BUSCYCLES_MAP_GRANULARITY_LOG2][BUSCYCLES_S32]; } void ARMv5::GetCodeMemRegion(u32 addr, MemRegion* region) { - /*if (addr < ITCMSize) - { - region->Mem = ITCM; - region->Mask = 0x7FFF; - return; - }*/ - NDS.ARM9GetMemRegion(addr, false, &CodeMem); } diff --git a/src/CP15_Constants.h b/src/CP15_Constants.h new file mode 100644 index 00000000..80fadf31 --- /dev/null +++ b/src/CP15_Constants.h @@ -0,0 +1,131 @@ +/* + Copyright 2016-2023 melonDS team + + This file is part of melonDS. + + melonDS is free software: you can redistribute it and/or modify it under + the terms of the GNU General Public License as published by the Free + Software Foundation, either version 3 of the License, or (at your option) + any later version. + + melonDS is distributed in the hope that it will be useful, but WITHOUT ANY + WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ + You should have received a copy of the GNU General Public License along + with melonDS. If not, see http://www.gnu.org/licenses/. +*/ + +#ifndef MELONDS_CP15CONSTANTS_H +#define MELONDS_CP15CONSTANTS_H + +#include "types.h" + +namespace melonDS +{ + +/* ICACHE Layout constants */ +constexpr u32 ICACHE_SIZE_LOG2 = 13; +constexpr u32 ICACHE_SIZE = 1 << ICACHE_SIZE_LOG2; +constexpr u32 ICACHE_SETS_LOG2 = 2; +constexpr u32 ICACHE_SETS = 1 << ICACHE_SETS_LOG2; +constexpr u32 ICACHE_LINELENGTH_ENCODED = 2; +constexpr u32 ICACHE_LINELENGTH_LOG2 = ICACHE_LINELENGTH_ENCODED + 3; +constexpr u32 ICACHE_LINELENGTH = 8 * (1 << ICACHE_LINELENGTH_ENCODED); +constexpr u32 ICACHE_LINESPERSET = ICACHE_SIZE / (ICACHE_SETS * ICACHE_LINELENGTH); + +/* DCACHE Layout constants */ +constexpr u32 DCACHE_SIZE_LOG2 = 12; +constexpr u32 DCACHE_SIZE = 1 << DCACHE_SIZE_LOG2; +constexpr u32 DCACHE_SETS_LOG2 = 2; +constexpr u32 DCACHE_SETS = 1 << DCACHE_SETS_LOG2; +constexpr u32 DCACHE_LINELENGTH_ENCODED = 2; +constexpr u32 DCACHE_LINELENGTH_LOG2 = DCACHE_LINELENGTH_ENCODED + 3; +constexpr u32 DCACHE_LINELENGTH = 8 * (1 << DCACHE_LINELENGTH_ENCODED); +constexpr u32 DCACHE_LINESPERSET = DCACHE_SIZE / (DCACHE_SETS * DCACHE_LINELENGTH); + +/* CP15 Cache Data TAGs */ +constexpr u32 CACHE_FLAG_VALID = (1 << 4); +constexpr u32 CACHE_FLAG_DIRTY_LOWERHALF = (1 << 2); +constexpr u32 CACHE_FLAG_DIRTY_UPPERHALF = (1 << 3); +constexpr u32 CACHE_FLAG_DIRTY_MASK = (3 << 2); +constexpr u32 CACHE_FLAG_SET_MASK = (3 << 0); + +/* CP15 Cache Type Register */ +constexpr u32 CACHE_TR_LOCKDOWN_TYPE_B = (7 << 25); +constexpr u32 CACHE_TR_NONUNIFIED = (1 << 24); + +/* CP15 I/DCache LockDown registers */ +constexpr u32 CACHE_LOCKUP_L = (1 << 31); + +/* CP15 Main ID register */ +constexpr u32 CP15_MAINID_IMPLEMENTOR_ARM = (0x41 << 24); +constexpr u32 CP15_MAINID_IMPLEMENTOR_DEC = (0x44 << 24); +constexpr u32 CP15_MAINID_IMPLEMENTOR_MOTOROLA = (0x4D << 24); +constexpr u32 CP15_MAINID_IMPLEMENTOR_MARVELL = (0x56 << 24); 
+constexpr u32 CP15_MAINID_IMPLEMENTOR_INTEL = (0x69 << 24); +constexpr u32 CP15_MAINID_VARIANT_0 = (0 << 20); +constexpr u32 CP15_MAINID_ARCH_v4 = (1 << 16); +constexpr u32 CP15_MAINID_ARCH_v4T = (2 << 16); +constexpr u32 CP15_MAINID_ARCH_v5 = (3 << 16); +constexpr u32 CP15_MAINID_ARCH_v5T = (4 << 16); +constexpr u32 CP15_MAINID_ARCH_v5TE = (5 << 16); +constexpr u32 CP15_MAINID_ARCH_v5TEJ = (6 << 16); +constexpr u32 CP15_MAINID_ARCH_v6 = (7 << 16); +constexpr u32 CP15_MAINID_IMPLEMENTATION_946 = (0x946 << 4); +constexpr u32 CP15_MAINID_REVISION_0 = (0 << 0); +constexpr u32 CP15_MAINID_REVISION_1 = (1 << 0); + +/* CP15 Control Register */ +constexpr u32 CP15_CR_MPUENABLE = (1 << 0); +constexpr u32 CP15_CR_BIGENDIAN = (1 << 7); +constexpr u32 CP15_CR_HIGHEXCEPTIONBASE = (1 << 13); + +/* CP15 Internal Exception base value */ +constexpr u32 CP15_EXCEPTIONBASE_HIGH = 0xFFFF0000; +constexpr u32 CP15_EXCEPTIONBASE_LOW = 0x00000000; + +/* CP15 Cache and Write Buffer Control Register */ +constexpr u32 CP15_CACHE_CR_ROUNDROBIN = (1 << 14); +constexpr u32 CP15_CACHE_CR_ICACHEENABLE = (1 << 12); +constexpr u32 CP15_CACHE_CR_DCACHEENABLE = (1 << 2); +constexpr u32 CP15_CACHE_CR_WRITEBUFFERENABLE = (1 << 3); + +/* CP15 TCM Control Register */ +constexpr u32 CP15_TCM_CR_DTCM_ENABLE = (1 << 16); +constexpr u32 CP15_TCM_CR_ITCM_ENABLE = (1 << 18); + +/* CP15 Region Base and Size Register */ +constexpr u32 CP15_REGION_COUNT = 8; +constexpr u32 CP15_REGION_ENABLE = (1 << 0); +constexpr u32 CP15_REGION_SIZE_MASK = (0x1F << 1); +constexpr u32 CP15_REGION_BASE_GRANULARITY_LOG2 = 12; +constexpr u32 CP15_REGION_BASE_GRANULARITY = (1 << CP15_REGION_BASE_GRANULARITY_LOG2); +constexpr u32 CP15_REGION_BASE_MASK = ~(CP15_REGION_BASE_GRANULARITY-1); /* 0xFFFFF000: derived from the granularity itself, not its log2 */ + +/* CP15 Region access mask registers */ +constexpr u32 CP15_REGIONACCESS_BITS_PER_REGION = 4; +constexpr u32 CP15_REGIONACCESS_REGIONMASK = (1 << CP15_REGIONACCESS_BITS_PER_REGION) - 1; + +/* Flags in the melonDS internal PU_PrivMap and 
PU_UserMap */ +constexpr u32 CP15_MAP_NOACCESS = 0x00; +constexpr u32 CP15_MAP_READABLE = 0x01; +constexpr u32 CP15_MAP_WRITEABLE = 0x02; +constexpr u32 CP15_MAP_EXECUTABLE = 0x04; +constexpr u32 CP15_MAP_DCACHEABLE = 0x10; +constexpr u32 CP15_MAP_DCACHEWRITEBACK = 0x20; +constexpr u32 CP15_MAP_ICACHEABLE = 0x40; + +constexpr u32 CP15_MAP_ENTRYSIZE_LOG2 = CP15_REGION_BASE_GRANULARITY_LOG2; +constexpr u32 CP15_MAP_ENTRYSIZE = (1 << CP15_MAP_ENTRYSIZE_LOG2); + +/* Internal Timing Constants */ +constexpr u32 BUSCYCLES_N16 = 0; +constexpr u32 BUSCYCLES_S16 = 1; +constexpr u32 BUSCYCLES_N32 = 2; +constexpr u32 BUSCYCLES_S32 = 3; + +constexpr u32 BUSCYCLES_MAP_GRANULARITY_LOG2 = CP15_REGION_BASE_GRANULARITY_LOG2; +} + +#endif // MELONDS_CP15CONSTANTS_H \ No newline at end of file diff --git a/src/MemConstants.h b/src/MemConstants.h index 332b9b18..e9aa6b2b 100644 --- a/src/MemConstants.h +++ b/src/MemConstants.h @@ -34,44 +34,6 @@ constexpr u32 ITCMPhysicalSize = 0x8000; constexpr u32 DTCMPhysicalSize = 0x4000; constexpr u32 ARM7BIOSCRC32 = 0x1280f0d5; constexpr u32 ARM9BIOSCRC32 = 0x2ab23573; - -constexpr u32 ICACHE_SIZE_LOG2 = 13; -constexpr u32 ICACHE_SIZE = 1 << ICACHE_SIZE_LOG2; -constexpr u32 ICACHE_SETS_LOG2 = 2; -constexpr u32 ICACHE_SETS = 1 << ICACHE_SETS_LOG2; -constexpr u32 ICACHE_LINELENGTH_ENCODED = 2; -constexpr u32 ICACHE_LINELENGTH_LOG2 = ICACHE_LINELENGTH_ENCODED + 3; -constexpr u32 ICACHE_LINELENGTH = 8 * (1 << ICACHE_LINELENGTH_ENCODED); -constexpr u32 ICACHE_LINESPERSET = ICACHE_SIZE / (ICACHE_SETS * ICACHE_LINELENGTH); - -constexpr u32 DCACHE_SIZE_LOG2 = 12; -constexpr u32 DCACHE_SIZE = 1 << DCACHE_SIZE_LOG2; -constexpr u32 DCACHE_SETS_LOG2 = 2; -constexpr u32 DCACHE_SETS = 1 << DCACHE_SETS_LOG2; -constexpr u32 DCACHE_LINELENGTH_ENCODED = 2; -constexpr u32 DCACHE_LINELENGTH_LOG2 = DCACHE_LINELENGTH_ENCODED + 3; -constexpr u32 DCACHE_LINELENGTH = 8 * (1 << DCACHE_LINELENGTH_ENCODED); -constexpr u32 DCACHE_LINESPERSET = DCACHE_SIZE / (DCACHE_SETS * 
DCACHE_LINELENGTH); - -constexpr u32 CACHE_FLAG_VALID = (1 << 4); -constexpr u32 CACHE_FLAG_DIRTY_LOWERHALF = (1 << 2); -constexpr u32 CACHE_FLAG_DIRTY_UPPERHALF = (1 << 3); - -constexpr u32 CACHE_TR_LOCKDOWN_TYPE_B = (7 << 25); -constexpr u32 CACHE_TR_NONUNIFIED = (1 << 24); - -constexpr u32 CACHE_LOCKUP_L = (1 << 31); - -constexpr u32 CP15_CR_MPUENABLE = (1 << 0); -constexpr u32 CP15_CR_BIGENDIAN = (1 << 7); -constexpr u32 CP15_CR_HIGHEXCEPTIONBASE = (1 << 13); -constexpr u32 CP15_CACHE_CR_ROUNDROBIN = (1 << 14); -constexpr u32 CP15_CACHE_CR_ICACHEENABLE = (1 << 12); -constexpr u32 CP15_CACHE_CR_DCACHEENABLE = (1 << 2); -constexpr u32 CP15_CACHE_CR_WRITEBUFFERENABLE = (1 << 3); -constexpr u32 CP15_TCM_CR_DTCM_ENABLE = (1 << 16); -constexpr u32 CP15_TCM_CR_ITCM_ENABLE = (1 << 18); - } #endif // MELONDS_MEMCONSTANTS_H \ No newline at end of file From 1a9179b8d03ba46287b92460de77a756b5a16a71 Mon Sep 17 00:00:00 2001 From: DesperateProgrammer Date: Thu, 1 Feb 2024 08:49:04 +0100 Subject: [PATCH 025/306] Removed unneccessary wfcsettings file --- wfcsettings.bin | Bin 2304 -> 0 bytes 1 file changed, 0 insertions(+), 0 deletions(-) delete mode 100644 wfcsettings.bin diff --git a/wfcsettings.bin b/wfcsettings.bin deleted file mode 100644 index af3146aeb6423c6cff0a4d369941eb72c52e62d7..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 2304 zcmezWe-w;{z-S1JhQMeDjKmONpb+Gy=H%x&22f}U-Lx};ZSpv2Zan{jrB=^(PP*AV buonG?T94-cg1ZCjEV|i_>VLSkr#}DyyOIjR From 81c943411654d49b0ab9220a7f03a18d88f9ccb5 Mon Sep 17 00:00:00 2001 From: DesperateProgrammer Date: Thu, 1 Feb 2024 12:01:16 +0100 Subject: [PATCH 026/306] Added CP15 Trace Process ID --- src/ARM.h | 3 +-- src/CP15.cpp | 22 ++++++++++++++++++++++ 2 files changed, 23 insertions(+), 2 deletions(-) diff --git a/src/ARM.h b/src/ARM.h index df06e84e..e56534ad 100644 --- a/src/ARM.h +++ b/src/ARM.h @@ -332,8 +332,6 @@ public: void DCacheClearByAddr(const u32 addr); void DCacheClearByASetAndWay(const 
u8 cacheSet, const u8 cacheLine); - - void CP15Write(u32 id, u32 val); u32 CP15Read(u32 id) const; @@ -344,6 +342,7 @@ public: u32 DTCMSetting, ITCMSetting; u32 DCacheLockDown, ICacheLockDown; u32 CacheDebugRegisterIndex; + u32 CP15TraceProcessId; // for aarch64 JIT they need to go up here // to be addressable by a 12-bit immediate diff --git a/src/CP15.cpp b/src/CP15.cpp index 714e591d..caa8de34 100644 --- a/src/CP15.cpp +++ b/src/CP15.cpp @@ -40,6 +40,16 @@ const int kDataCacheTiming = 3;//2; const int kCodeCacheTiming = 3;//5; +/* CP15 Reset sets the default values within each registers and + memories of the CP15. + This includes the Settings for + DTCM + ITCM + Caches + Regions + Process Trace +*/ + void ARMv5::CP15Reset() { CP15Control = 0x2078; // dunno @@ -68,6 +78,8 @@ void ARMv5::CP15Reset() DCacheInvalidateAll(); DCacheCount = 0; + CP15TraceProcessId = 0; + PU_CodeCacheable = 0; PU_DataCacheable = 0; PU_DataCacheWrite = 0; @@ -103,6 +115,7 @@ void ARMv5::CP15DoSavestate(Savestate* file) file->Var32(&DCacheLockDown); file->Var32(&ICacheLockDown); file->Var32(&CacheDebugRegisterIndex); + file->Var32(&CP15TraceProcessId); file->Var32(&PU_CodeCacheable); file->Var32(&PU_DataCacheable); @@ -1086,6 +1099,11 @@ void ARMv5::CP15Write(u32 id, u32 val) UpdateITCMSetting(); return; + case 0xD01: + case 0xD11: + CP15TraceProcessId = val; + return; + case 0xF00: if (PU_Map != PU_PrivMap) { @@ -1277,6 +1295,10 @@ u32 ARMv5::CP15Read(u32 id) const case 0x911: return ITCMSetting; + case 0xD01: // See arm946E-S Rev 1 technical Reference Manual, Chapter 2.3.13 */ + case 0xD11: // backwards compatible read/write of the same register + return CP15TraceProcessId; + case 0xF00: if (PU_Map != PU_PrivMap) { From 9fa814b68e20bedb667f8938f9cbfce12285be26 Mon Sep 17 00:00:00 2001 From: DesperateProgrammer Date: Thu, 1 Feb 2024 13:15:03 +0100 Subject: [PATCH 027/306] Added check of op1 in MCR/MRC Fixed Cache Debug registers were accessible, when op1 != 3 in MCR/MRC instructions 
Added BIST Test Status register and its cache linefill disable bits --- src/ARM.h | 1 + src/ARMInterpreter.cpp | 12 ++++---- src/CP15.cpp | 66 ++++++++++++++++++++++++++++++++++++++---- src/CP15_Constants.h | 4 +++ 4 files changed, 71 insertions(+), 12 deletions(-) diff --git a/src/ARM.h b/src/ARM.h index e56534ad..abb0e686 100644 --- a/src/ARM.h +++ b/src/ARM.h @@ -343,6 +343,7 @@ public: u32 DCacheLockDown, ICacheLockDown; u32 CacheDebugRegisterIndex; u32 CP15TraceProcessId; + u32 CP15BISTTestStateRegister; // for aarch64 JIT they need to go up here // to be addressable by a 12-bit immediate diff --git a/src/ARMInterpreter.cpp b/src/ARMInterpreter.cpp index ff73e230..316db0a2 100644 --- a/src/ARMInterpreter.cpp +++ b/src/ARMInterpreter.cpp @@ -212,14 +212,14 @@ void A_MCR(ARM* cpu) return A_UNK(cpu); u32 cp = (cpu->CurInstr >> 8) & 0xF; - //u32 op = (cpu->CurInstr >> 21) & 0x7; + u32 op = (cpu->CurInstr >> 21) & 0x7; u32 cn = (cpu->CurInstr >> 16) & 0xF; u32 cm = cpu->CurInstr & 0xF; u32 cpinfo = (cpu->CurInstr >> 5) & 0x7; if (cpu->Num==0 && cp==15) { - ((ARMv5*)cpu)->CP15Write((cn<<8)|(cm<<4)|cpinfo, cpu->R[(cpu->CurInstr>>12)&0xF]); + ((ARMv5*)cpu)->CP15Write((cn<<8)|(cm<<4)|cpinfo|(op<<12), cpu->R[(cpu->CurInstr>>12)&0xF]); } else if (cpu->Num==1 && cp==14) { @@ -227,7 +227,7 @@ void A_MCR(ARM* cpu) } else { - Log(LogLevel::Warn, "bad MCR opcode p%d,%d,%d,%d on ARM%d\n", cp, cn, cm, cpinfo, cpu->Num?7:9); + Log(LogLevel::Warn, "bad MCR opcode p%d, %d, reg, c%d, c%d, %d on ARM%d\n", cp, op, cn, cm, cpinfo, cpu->Num?7:9); return A_UNK(cpu); // TODO: check what kind of exception it really is } @@ -240,14 +240,14 @@ void A_MRC(ARM* cpu) return A_UNK(cpu); u32 cp = (cpu->CurInstr >> 8) & 0xF; - //u32 op = (cpu->CurInstr >> 21) & 0x7; + u32 op = (cpu->CurInstr >> 21) & 0x7; u32 cn = (cpu->CurInstr >> 16) & 0xF; u32 cm = cpu->CurInstr & 0xF; u32 cpinfo = (cpu->CurInstr >> 5) & 0x7; if (cpu->Num==0 && cp==15) { - cpu->R[(cpu->CurInstr>>12)&0xF] = 
((ARMv5*)cpu)->CP15Read((cn<<8)|(cm<<4)|cpinfo); + cpu->R[(cpu->CurInstr>>12)&0xF] = ((ARMv5*)cpu)->CP15Read((cn<<8)|(cm<<4)|cpinfo|(op<<12)); } else if (cpu->Num==1 && cp==14) { @@ -255,7 +255,7 @@ void A_MRC(ARM* cpu) } else { - Log(LogLevel::Warn, "bad MRC opcode p%d,%d,%d,%d on ARM%d\n", cp, cn, cm, cpinfo, cpu->Num?7:9); + Log(LogLevel::Warn, "bad MRC opcode p%d, %d, reg, c%d, c%d, %d on ARM%d\n", cp, op, cn, cm, cpinfo, cpu->Num?7:9); return A_UNK(cpu); // TODO: check what kind of exception it really is } diff --git a/src/CP15.cpp b/src/CP15.cpp index caa8de34..f6f11a7a 100644 --- a/src/CP15.cpp +++ b/src/CP15.cpp @@ -69,6 +69,7 @@ void ARMv5::CP15Reset() ICacheLockDown = 0; DCacheLockDown = 0; CacheDebugRegisterIndex = 0; + CP15BISTTestStateRegister = 0; memset(ICache, 0, ICACHE_SIZE); ICacheInvalidateAll(); @@ -116,6 +117,7 @@ void ARMv5::CP15DoSavestate(Savestate* file) file->Var32(&ICacheLockDown); file->Var32(&CacheDebugRegisterIndex); file->Var32(&CP15TraceProcessId); + file->Var32(&CP15BISTTestStateRegister); file->Var32(&PU_CodeCacheable); file->Var32(&PU_DataCacheable); @@ -384,6 +386,21 @@ u32 ARMv5::ICacheLookup(const u32 addr) } // cache miss + + // We do not fill the cacheline if it is disabled in the + // BIST test State register (See arm946e-s Rev 1 technical manual, 2.3.15 "Register 15, test State Register") + if (CP15BISTTestStateRegister & CP15_BIST_TR_DISABLE_ICACHE_LINEFILL) + { + CodeCycles = NDS.ARM9MemTimings[tag >> 14][2]; + if (CodeMem.Mem) + { + return *(u32*)&CodeMem.Mem[(addr & CodeMem.Mask) & ~3]; + } else + { + return NDS.ARM9Read32(addr & ~3); + } + } + u32 line; #if 0 // caclulate in which cacheline the data is to be filled @@ -507,6 +524,25 @@ u32 ARMv5::DCacheLookup(const u32 addr) } // cache miss + + // We do not fill the cacheline if it is disabled in the + // BIST test State register (See arm946e-s Rev 1 technical manual, 2.3.15 "Register 15, test State Register") + if (CP15BISTTestStateRegister & 
CP15_BIST_TR_DISABLE_DCACHE_LINEFILL) + { + DataCycles = NDS.ARM9MemTimings[tag >> 14][2]; + if (addr < ITCMSize) + { + return *(u32*)&ITCM[addr & (ITCMPhysicalSize - 3)]; + } else + if ((addr & DTCMMask) == DTCMBase) + { + return *(u32*)&DTCM[addr & (DTCMPhysicalSize - 3)]; + } else + { + return BusRead32(addr & ~3); + } + } + u32 line; #if 0 // caclulate in which cacheline the data is to be filled @@ -694,7 +730,7 @@ void ARMv5::CP15Write(u32 id, u32 val) { //if(id!=0x704)printf("CP15 write op %03X %08X %08X\n", id, val, R[15]); - switch (id) + switch (id & 0xFFF) { case 0x100: { @@ -1112,7 +1148,20 @@ void ARMv5::CP15Write(u32 id, u32 val) else return ARMInterpreter::A_UNK(this); } else - CacheDebugRegisterIndex = val; + { + if (((id >> 12) & 0x0f) == 0x03) + CacheDebugRegisterIndex = val; + else if (((id >> 12) & 0x0f) == 0x00) + CP15BISTTestStateRegister = val; + else + { + if (CPSR & 0x20) // THUMB + return ARMInterpreter::T_UNK(this); + else + return ARMInterpreter::A_UNK(this); + } + + } return; case 0xF10: @@ -1184,14 +1233,14 @@ void ARMv5::CP15Write(u32 id, u32 val) } - Log(LogLevel::Debug, "unknown CP15 write op %03X %08X\n", id, val); + Log(LogLevel::Debug, "unknown CP15 write op %04X %08X\n", id, val); } u32 ARMv5::CP15Read(u32 id) const { //printf("CP15 read op %03X %08X\n", id, NDS::ARM9->R[15]); - switch (id) + switch (id & 0xFFF) { case 0x000: // CPU ID case 0x003: @@ -1304,7 +1353,12 @@ u32 ARMv5::CP15Read(u32 id) const { return 0; } else - return CacheDebugRegisterIndex; + { + if (((id >> 12) & 0x0f) == 0x03) + return CacheDebugRegisterIndex; + if (((id >> 12) & 0x0f) == 0x00) + return CP15BISTTestStateRegister; + } case 0xF10: // instruction cache Tag register if (PU_Map != PU_PrivMap) @@ -1351,7 +1405,7 @@ u32 ARMv5::CP15Read(u32 id) const } } - Log(LogLevel::Debug, "unknown CP15 read op %03X\n", id); + Log(LogLevel::Debug, "unknown CP15 read op %04X\n", id); return 0; } diff --git a/src/CP15_Constants.h b/src/CP15_Constants.h index 
80fadf31..0d786afa 100644 --- a/src/CP15_Constants.h +++ b/src/CP15_Constants.h @@ -91,6 +91,10 @@ constexpr u32 CP15_CACHE_CR_ICACHEENABLE = (1 << 12); constexpr u32 CP15_CACHE_CR_DCACHEENABLE = (1 << 2); constexpr u32 CP15_CACHE_CR_WRITEBUFFERENABLE = (1 << 3); +/* CP15 BIST Test State register */ +constexpr u32 CP15_BIST_TR_DISABLE_ICACHE_LINEFILL = (1 << 9); +constexpr u32 CP15_BIST_TR_DISABLE_DCACHE_LINEFILL = (1 << 10); + /* CP15 TCM Control Register */ constexpr u32 CP15_TCM_CR_DTCM_ENABLE = (1 << 16); constexpr u32 CP15_TCM_CR_ITCM_ENABLE = (1 << 18); From c0075404fd32429820cce7f7c0e73b4936a2e13d Mon Sep 17 00:00:00 2001 From: DesperateProgrammer Date: Thu, 1 Feb 2024 13:25:07 +0100 Subject: [PATCH 028/306] Included the I/DCache Streaming disable bits in cache lookup --- src/CP15.cpp | 32 ++++++++++++++++++++++++++++++++ src/CP15_Constants.h | 2 ++ 2 files changed, 34 insertions(+) diff --git a/src/CP15.cpp b/src/CP15.cpp index f6f11a7a..5861f15f 100644 --- a/src/CP15.cpp +++ b/src/CP15.cpp @@ -381,6 +381,20 @@ u32 ARMv5::ICacheLookup(const u32 addr) { CodeCycles = 1; u32 *cacheLine = (u32 *)&ICache[(id+set) << ICACHE_LINELENGTH_LOG2]; + if (CP15BISTTestStateRegister & CP15_BIST_TR_DISABLE_ICACHE_STREAMING) + { + // Disabled ICACHE Streaming: + // retreive the data from memory, even if the data was cached + // See arm946e-s Rev 1 technical manual, 2.3.15 "Register 15, test State Register") + CodeCycles = NDS.ARM9MemTimings[tag >> 14][2]; + if (CodeMem.Mem) + { + return *(u32*)&CodeMem.Mem[(addr & CodeMem.Mask) & ~3]; + } else + { + return NDS.ARM9Read32(addr & ~3); + } + } return cacheLine[(addr & (ICACHE_LINELENGTH -1)) >> 2]; } } @@ -519,6 +533,24 @@ u32 ARMv5::DCacheLookup(const u32 addr) { DataCycles = 1; u32 *cacheLine = (u32 *)&DCache[(id+set) << DCACHE_LINELENGTH_LOG2]; + if (CP15BISTTestStateRegister & CP15_BIST_TR_DISABLE_DCACHE_STREAMING) + { + // Disabled DCACHE Streaming: + // retreive the data from memory, even if the data was cached + // See 
arm946e-s Rev 1 technical manual, 2.3.15 "Register 15, test State Register") + DataCycles = NDS.ARM9MemTimings[tag >> 14][2]; + if (addr < ITCMSize) + { + return *(u32*)&ITCM[addr & (ITCMPhysicalSize - 3)]; + } else + if ((addr & DTCMMask) == DTCMBase) + { + return *(u32*)&DTCM[addr & (DTCMPhysicalSize - 3)]; + } else + { + return BusRead32(addr & ~3); + } + } return cacheLine[(addr & (ICACHE_LINELENGTH -1)) >> 2]; } } diff --git a/src/CP15_Constants.h b/src/CP15_Constants.h index 0d786afa..b148372d 100644 --- a/src/CP15_Constants.h +++ b/src/CP15_Constants.h @@ -92,6 +92,8 @@ constexpr u32 CP15_CACHE_CR_DCACHEENABLE = (1 << 2); constexpr u32 CP15_CACHE_CR_WRITEBUFFERENABLE = (1 << 3); /* CP15 BIST Test State register */ +constexpr u32 CP15_BIST_TR_DISABLE_ICACHE_STREAMING = (1 << 11); +constexpr u32 CP15_BIST_TR_DISABLE_DCACHE_STREAMING = (1 << 12); constexpr u32 CP15_BIST_TR_DISABLE_ICACHE_LINEFILL = (1 << 9); constexpr u32 CP15_BIST_TR_DISABLE_DCACHE_LINEFILL = (1 << 10); From 02d6fbacf6d75757ae32fba86e51a3d464487d35 Mon Sep 17 00:00:00 2001 From: DesperateProgrammer Date: Fri, 2 Feb 2024 14:43:23 +0100 Subject: [PATCH 029/306] Added several doxygen-style comments for documentation --- src/ARM.h | 219 ++++++++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 206 insertions(+), 13 deletions(-) diff --git a/src/ARM.h b/src/ARM.h index abb0e686..2f9f4507 100644 --- a/src/ARM.h +++ b/src/ARM.h @@ -311,39 +311,232 @@ public: u32 RandomLineIndex(); + /** + * @brief Perform an instruction cache lookup handle + * @details + * A cache lookup is performed, if not disabled in + * @ref CP15BISTTestStateRegister, a hit will returned the + * cached data, otherwise it returns the result of an memory + * access instead. 
+ * If the cache lookup results in a cachemiss and linefill is + * not disabled in @ref CP15BISTTestStateRegister, it will + * fetch all data to fill the entire cacheline directly + * from the ITCM or bus + * @param [in] addr Address of the memory to be retrieved from + * cache. The address is internally aligned to a word boundary + * @return Value of the word at addr + */ u32 ICacheLookup(const u32 addr); + + /** + * @brief Check if an address is within an instruction cachable + * region + * @details + * Checks the address by looking up the PU_map flags for + * the address and returns the status of the instruction + * cache enable flag + * + * @param [in] addr Address. May be unaligned. + * @retval true If the address points to a region, that is + * enabled for instruction fetches to be cached. + */ inline bool IsAddressICachable(const u32 addr) const; + /** + * @brief Invalidates all instruction cache lines + * @details + * Clears the @ref CACHE_FLAG_VALID of each cache line in the + * instruction cache. All other flags and values are kept. + * @par Returns + * Nothing + */ void ICacheInvalidateAll(); + + /** + * @brief Invalidates the instruction cacheline containing + * the data of an address. + * @details + * Searches the cacheline containing the data of an address, and + * if found clears the @ref CACHE_FLAG_VALID of this cache line. + * Nothing is done if the address is not present in the cache. + * @param [in] addr Memory address of the data in the cache line + * @par Returns + * Nothing + */ void ICacheInvalidateByAddr(const u32 addr); + + /** + * @brief Invalidates an instruction cache line + * @details + * Clears the @ref CACHE_FLAG_VALID of the cacheline given by + * set and index within the set. Nothing is done if the cache + * line does not exist.
+ * @param [in] cacheSet index of the internal cache set from + * 0 to @ref ICACHE_SETS - 1 + * @param [in] cacheLine index of the line within the cache set + * from 0 to @ref ICACHE_LINESPERSET - 1 + * @par Returns + * Nothing + */ void ICacheInvalidateBySetAndWay(const u8 cacheSet, const u8 cacheLine); + /** + * @brief Perform a data cache lookup + * @details + * A cache lookup is performed; if not disabled in + * @ref CP15BISTTestStateRegister, a hit will return the + * cached data, otherwise it returns the result of a memory + * access instead. + * If the cache lookup results in a cachemiss and linefill is + * not disabled in @ref CP15BISTTestStateRegister, it will + * fetch all data to fill the entire cacheline directly + * from the ITCM, DTCM or bus + * @param [in] addr Address of the memory to be retrieved from + * cache. The address is internally aligned to a word boundary + * @return Value of the word at addr + */ u32 DCacheLookup(const u32 addr); + + /** + * @brief Updates a word in the data cache if present + * @param [in] addr Memory address which is written + * @param [in] val Word value to be written + * @par Returns + * Nothing + */ void DCacheWrite32(const u32 addr, const u32 val); + + /** + * @brief Updates a half-word in the data cache if present + * @param [in] addr Memory address which is written + * @param [in] val Half-Word value to be written + * @par Returns + * Nothing + */ void DCacheWrite16(const u32 addr, const u16 val); + + /** + * @brief Updates a byte in the data cache if present + * @param [in] addr Memory address which is written + * @param [in] val Byte value to be written + * @par Returns + * Nothing + */ void DCacheWrite8(const u32 addr, const u8 val); + + /** + * @brief Check if an address is within a data cachable region + * @details + * Checks the address by looking up the PU_map flags for + * the address and returns the status of the data cache enable + * flag + * + * @param [in] addr Address. May be unaligned.
+ * @retval true If the address points to a region, that is + * enabled for data accesses to be cached. + */ inline bool IsAddressDCachable(const u32 addr) const; + /** + * @brief Invalidates all data cache lines + * (the whole data cache). + * @details + * Clears the @ref CACHE_FLAG_VALID of each cache line in the + * data cache. + * All other flags and values are kept. + * @par Returns + * Nothing + */ void DCacheInvalidateAll(); + + /** + * @brief Invalidates the data cacheline containing the data of + * an address. + * @details + * Searches the cacheline containing the data of an address, and + * if found clears the @ref CACHE_FLAG_VALID of this cache line. + * Nothing is done if the address is not present in the cache. + * @par Returns + * Nothing + */ void DCacheInvalidateByAddr(const u32 addr); + + /** + * @brief Invalidates a data cache line + * @details + * Clears the @ref CACHE_FLAG_VALID of the cacheline given by + * set and index within the set. Nothing is done if the cache + * line does not exist. + * @param [in] cacheSet index of the internal cache set from + * 0 to @ref DCACHE_SETS - 1 + * @param [in] cacheLine index of the line within the cache set + * from 0 to @ref DCACHE_LINESPERSET - 1 + * @par Returns + * Nothing + */ void DCacheInvalidateBySetAndWay(const u8 cacheSet, const u8 cacheLine); + /** + * @brief Cleans the entire data cache + * @details + * In melonDS, the data cache is instantly cleaned on writes, the + * @ref CACHE_FLAG_DIRTY_LOWERHALF and @ref CACHE_FLAG_DIRTY_UPPERHALF are + * not set. + * If they are implemented at a later time, the cache content has to be + * written to memory, the dirty bit cleared. The call should require + * as many cycles as needed for this write operation.
+ * @par Returns + * Nothing + */ void DCacheClearAll(); + + /** + * @brief Cleans a data cache line + * @details + * In melonDS, the data cache is instantly cleaned on writes, the + * @ref CACHE_FLAG_DIRTY_LOWERHALF and @ref CACHE_FLAG_DIRTY_UPPERHALF are + * not set. + * If they are implemented at a later time, the cache content has to be + * written to memory, the dirty bit cleared. The call should require + * as much cycles as needed for this write operation. + * @param [in] addr Memory address of the data in the cache line + * @par Returns + * Nothing + */ void DCacheClearByAddr(const u32 addr); + + /** + * @brief Cleans a data cache line + * @details + * In melonDS, the data cache is instantly cleaned on writes, the + * @ref CACHE_FLAG_DIRTY_LOWERHALF and @ref CACHE_FLAG_DIRTY_UPPERHALF are + * not set. + * If they are implemented at a later time, the cache content has to be + * written to memory, the dirty bit cleared. The call should require + * as much cycles as needed for this write operation. + * @param [in] cacheSet index of the internal cache set from + * 0 to @ref DCACHE_SETS - 1 + * @param [in] cacheLine index of the line within the cache set + * from 0 to @ref DCACHE_LINESPERSET - 1 + * @par Returns + * Nothing + */ void DCacheClearByASetAndWay(const u8 cacheSet, const u8 cacheLine); void CP15Write(u32 id, u32 val); u32 CP15Read(u32 id) const; - u32 CP15Control; + u32 CP15Control; //! CP15 Register 1: Control Register - u32 RNGSeed; + u32 RNGSeed; //! Global cache line fill seed. Used for pseudo random replacement strategy with the instruction and data cache - u32 DTCMSetting, ITCMSetting; - u32 DCacheLockDown, ICacheLockDown; - u32 CacheDebugRegisterIndex; - u32 CP15TraceProcessId; - u32 CP15BISTTestStateRegister; + u32 DTCMSetting; + u32 ITCMSetting; + u32 DCacheLockDown; //! CP15: Data Cache Lockdown Register + u32 ICacheLockDown; //! CP15: Instruction Cache Lockdown Register + u32 CacheDebugRegisterIndex; //! 
CP15: Cache Debug Index Register + u32 CP15TraceProcessId; //! CP15: Trace Process Id Register + u32 CP15BISTTestStateRegister; //! CP15: BIST Test State Register // for aarch64 JIT they need to go up here // to be addressable by a 12-bit immediate @@ -354,13 +547,13 @@ public: u8 ITCM[ITCMPhysicalSize]; u8* DTCM; - u8 ICache[ICACHE_SIZE]; - u32 ICacheTags[ICACHE_LINESPERSET*ICACHE_SETS]; - u8 ICacheCount; + u8 ICache[ICACHE_SIZE]; //! Instruction Cache Content organized in @ref ICACHE_LINESPERSET times @ref ICACHE_SETS times @ref ICACHE_LINELENGTH bytes + u32 ICacheTags[ICACHE_LINESPERSET*ICACHE_SETS]; //! Instruction Cache Tags organized in @ref ICACHE_LINESPERSET times @ref ICACHE_SETS Tags + u8 ICacheCount; //! Global instruction line fill counter. Used for round-robin replacement strategy with the instruction cache - u8 DCache[DCACHE_SIZE]; - u32 DCacheTags[DCACHE_LINESPERSET*DCACHE_SETS]; - u8 DCacheCount; + u8 DCache[DCACHE_SIZE]; //! Data Cache Content organized in @ref DCACHE_LINESPERSET times @ref DCACHE_SETS times @ref DCACHE_LINELENGTH bytes + u32 DCacheTags[DCACHE_LINESPERSET*DCACHE_SETS]; //! Data Cache Tags organized in @ref DCACHE_LINESPERSET times @ref DCACHE_SETS Tags + u8 DCacheCount; //! Global data line fill counter. 
Used for round-robin replacement strategy with the instruction cache u32 PU_CodeCacheable; u32 PU_DataCacheable; From f9a831e446b377b2a059dfa105904922272f60a6 Mon Sep 17 00:00:00 2001 From: DesperateProgrammer Date: Sat, 3 Feb 2024 16:20:40 +0100 Subject: [PATCH 030/306] Removed Thumb Check on CP15 Access restriction as MCR/MRC are not present in thumb --- src/CP15.cpp | 105 +++++++++++---------------------------------------- 1 file changed, 21 insertions(+), 84 deletions(-) diff --git a/src/CP15.cpp b/src/CP15.cpp index 5861f15f..5f84543b 100644 --- a/src/CP15.cpp +++ b/src/CP15.cpp @@ -928,10 +928,7 @@ void ARMv5::CP15Write(u32 id, u32 val) // requires priv mode or causes UNKNOWN INSTRUCTION exception if (PU_Map != PU_PrivMap) { - if (CPSR & 0x20) // THUMB - return ARMInterpreter::T_UNK(this); - else - return ARMInterpreter::A_UNK(this); + return ARMInterpreter::A_UNK(this); } ICacheInvalidateByAddr(val); //Halt(255); @@ -940,10 +937,7 @@ void ARMv5::CP15Write(u32 id, u32 val) // requires priv mode or causes UNKNOWN INSTRUCTION exception if (PU_Map != PU_PrivMap) { - if (CPSR & 0x20) // THUMB - return ARMInterpreter::T_UNK(this); - else - return ARMInterpreter::A_UNK(this); + return ARMInterpreter::A_UNK(this); } else { // Cache invalidat by line number and set number @@ -959,10 +953,7 @@ void ARMv5::CP15Write(u32 id, u32 val) // requires priv mode or causes UNKNOWN INSTRUCTION exception if (PU_Map != PU_PrivMap) { - if (CPSR & 0x20) // THUMB - return ARMInterpreter::T_UNK(this); - else - return ARMInterpreter::A_UNK(this); + return ARMInterpreter::A_UNK(this); } DCacheInvalidateAll(); //printf("inval data cache %08X\n", val); @@ -971,10 +962,7 @@ void ARMv5::CP15Write(u32 id, u32 val) // requires priv mode or causes UNKNOWN INSTRUCTION exception if (PU_Map != PU_PrivMap) { - if (CPSR & 0x20) // THUMB - return ARMInterpreter::T_UNK(this); - else - return ARMInterpreter::A_UNK(this); + return ARMInterpreter::A_UNK(this); } DCacheInvalidateByAddr(val); 
//printf("inval data cache SI\n"); @@ -983,10 +971,7 @@ void ARMv5::CP15Write(u32 id, u32 val) // requires priv mode or causes UNKNOWN INSTRUCTION exception if (PU_Map != PU_PrivMap) { - if (CPSR & 0x20) // THUMB - return ARMInterpreter::T_UNK(this); - else - return ARMInterpreter::A_UNK(this); + return ARMInterpreter::A_UNK(this); } else { // Cache invalidat by line number and set number @@ -1007,10 +992,7 @@ void ARMv5::CP15Write(u32 id, u32 val) // requires priv mode or causes UNKNOWN INSTRUCTION exception if (PU_Map != PU_PrivMap) { - if (CPSR & 0x20) // THUMB - return ARMInterpreter::T_UNK(this); - else - return ARMInterpreter::A_UNK(this); + return ARMInterpreter::A_UNK(this); } //Log(LogLevel::Debug,"clean data cache\n"); DCacheClearAll(); @@ -1019,10 +1001,7 @@ void ARMv5::CP15Write(u32 id, u32 val) // requires priv mode or causes UNKNOWN INSTRUCTION exception if (PU_Map != PU_PrivMap) { - if (CPSR & 0x20) // THUMB - return ARMInterpreter::T_UNK(this); - else - return ARMInterpreter::A_UNK(this); + return ARMInterpreter::A_UNK(this); } //Log(LogLevel::Debug,"clean data cache MVA\n"); DCacheClearByAddr(val); @@ -1032,10 +1011,7 @@ void ARMv5::CP15Write(u32 id, u32 val) // requires priv mode or causes UNKNOWN INSTRUCTION exception if (PU_Map != PU_PrivMap) { - if (CPSR & 0x20) // THUMB - return ARMInterpreter::T_UNK(this); - else - return ARMInterpreter::A_UNK(this); + return ARMInterpreter::A_UNK(this); } else { // Cache invalidat by line number and set number @@ -1048,10 +1024,7 @@ void ARMv5::CP15Write(u32 id, u32 val) // requires priv mode or causes UNKNOWN INSTRUCTION exception if (PU_Map != PU_PrivMap) { - if (CPSR & 0x20) // THUMB - return ARMInterpreter::T_UNK(this); - else - return ARMInterpreter::A_UNK(this); + return ARMInterpreter::A_UNK(this); } // Test and clean (optional) // Is not present on the NDS/DSi @@ -1067,10 +1040,7 @@ void ARMv5::CP15Write(u32 id, u32 val) // requires priv mode or causes UNKNOWN INSTRUCTION exception if (PU_Map != 
PU_PrivMap) { - if (CPSR & 0x20) // THUMB - return ARMInterpreter::T_UNK(this); - else - return ARMInterpreter::A_UNK(this); + return ARMInterpreter::A_UNK(this); } // we force a fill by looking up the value from cache // if it wasn't cached yet, it will be loaded into cache @@ -1082,10 +1052,7 @@ void ARMv5::CP15Write(u32 id, u32 val) // requires priv mode or causes UNKNOWN INSTRUCTION exception if (PU_Map != PU_PrivMap) { - if (CPSR & 0x20) // THUMB - return ARMInterpreter::T_UNK(this); - else - return ARMInterpreter::A_UNK(this); + return ARMInterpreter::A_UNK(this); } DCacheClearAll(); DCacheInvalidateAll(); @@ -1095,10 +1062,7 @@ void ARMv5::CP15Write(u32 id, u32 val) // requires priv mode or causes UNKNOWN INSTRUCTION exception if (PU_Map != PU_PrivMap) { - if (CPSR & 0x20) // THUMB - return ARMInterpreter::T_UNK(this); - else - return ARMInterpreter::A_UNK(this); + return ARMInterpreter::A_UNK(this); } DCacheClearByAddr(val); DCacheInvalidateByAddr(val); @@ -1108,10 +1072,7 @@ void ARMv5::CP15Write(u32 id, u32 val) // requires priv mode or causes UNKNOWN INSTRUCTION exception if (PU_Map != PU_PrivMap) { - if (CPSR & 0x20) // THUMB - return ARMInterpreter::T_UNK(this); - else - return ARMInterpreter::A_UNK(this); + return ARMInterpreter::A_UNK(this); } else { // Cache invalidat by line number and set number @@ -1126,10 +1087,7 @@ void ARMv5::CP15Write(u32 id, u32 val) // requires priv mode or causes UNKNOWN INSTRUCTION exception if (PU_Map != PU_PrivMap) { - if (CPSR & 0x20) // THUMB - return ARMInterpreter::T_UNK(this); - else - return ARMInterpreter::A_UNK(this); + return ARMInterpreter::A_UNK(this); } // Cache Lockdown - Format B // Bit 31: Lock bit @@ -1143,10 +1101,7 @@ void ARMv5::CP15Write(u32 id, u32 val) // requires priv mode or causes UNKNOWN INSTRUCTION exception if (PU_Map != PU_PrivMap) { - if (CPSR & 0x20) // THUMB - return ARMInterpreter::T_UNK(this); - else - return ARMInterpreter::A_UNK(this); + return ARMInterpreter::A_UNK(this); } // Cache 
Lockdown - Format B // Bit 31: Lock bit @@ -1175,10 +1130,7 @@ void ARMv5::CP15Write(u32 id, u32 val) case 0xF00: if (PU_Map != PU_PrivMap) { - if (CPSR & 0x20) // THUMB - return ARMInterpreter::T_UNK(this); - else - return ARMInterpreter::A_UNK(this); + return ARMInterpreter::A_UNK(this); } else { if (((id >> 12) & 0x0f) == 0x03) @@ -1187,10 +1139,7 @@ void ARMv5::CP15Write(u32 id, u32 val) CP15BISTTestStateRegister = val; else { - if (CPSR & 0x20) // THUMB - return ARMInterpreter::T_UNK(this); - else - return ARMInterpreter::A_UNK(this); + return ARMInterpreter::A_UNK(this); } } @@ -1200,10 +1149,7 @@ void ARMv5::CP15Write(u32 id, u32 val) // instruction cache Tag register if (PU_Map != PU_PrivMap) { - if (CPSR & 0x20) // THUMB - return ARMInterpreter::T_UNK(this); - else - return ARMInterpreter::A_UNK(this); + return ARMInterpreter::A_UNK(this); } else { uint8_t segment = (CacheDebugRegisterIndex >> (32-ICACHE_SETS_LOG2)) & (ICACHE_SETS-1); @@ -1216,10 +1162,7 @@ void ARMv5::CP15Write(u32 id, u32 val) // data cache Tag register if (PU_Map != PU_PrivMap) { - if (CPSR & 0x20) // THUMB - return ARMInterpreter::T_UNK(this); - else - return ARMInterpreter::A_UNK(this); + return ARMInterpreter::A_UNK(this); } else { uint8_t segment = (CacheDebugRegisterIndex >> (32-DCACHE_SETS_LOG2)) & (DCACHE_SETS-1); @@ -1233,10 +1176,7 @@ void ARMv5::CP15Write(u32 id, u32 val) //printf("cache debug instruction cache %08X\n", val); if (PU_Map != PU_PrivMap) { - if (CPSR & 0x20) // THUMB - return ARMInterpreter::T_UNK(this); - else - return ARMInterpreter::A_UNK(this); + return ARMInterpreter::A_UNK(this); } else { uint8_t segment = (CacheDebugRegisterIndex >> (32-ICACHE_SETS_LOG2)) & (ICACHE_SETS-1); @@ -1250,10 +1190,7 @@ void ARMv5::CP15Write(u32 id, u32 val) //printf("cache debug data cache %08X\n", val); if (PU_Map != PU_PrivMap) { - if (CPSR & 0x20) // THUMB - return ARMInterpreter::T_UNK(this); - else - return ARMInterpreter::A_UNK(this); + return ARMInterpreter::A_UNK(this); 
} else { uint8_t segment = (CacheDebugRegisterIndex >> (32-DCACHE_SETS_LOG2)) & (DCACHE_SETS-1); From 2a385b52773a6a19f309f02760731ff2fa50ee18 Mon Sep 17 00:00:00 2001 From: DesperateProgrammer Date: Sun, 4 Feb 2024 12:31:57 +0100 Subject: [PATCH 031/306] Cleaned up some more magic numbers Fixed a bug causing overlapping protection regions priority not taken into account, when access permission or cachability bits were changed only on the least priority overlap --- src/ARM.h | 12 +-- src/CP15.cpp | 223 ++++++++++++++++++++++++++++++------------- src/CP15_Constants.h | 34 ++++--- 3 files changed, 184 insertions(+), 85 deletions(-) diff --git a/src/ARM.h b/src/ARM.h index 2f9f4507..61a087fd 100644 --- a/src/ARM.h +++ b/src/ARM.h @@ -555,14 +555,14 @@ public: u32 DCacheTags[DCACHE_LINESPERSET*DCACHE_SETS]; //! Data Cache Tags organized in @ref DCACHE_LINESPERSET times @ref DCACHE_SETS Tags u8 DCacheCount; //! Global data line fill counter. Used for round-robin replacement strategy with the instruction cache - u32 PU_CodeCacheable; - u32 PU_DataCacheable; - u32 PU_DataCacheWrite; + u32 PU_CodeCacheable; //! CP15 Register 2 Opcode 1: Code Cachable Bits + u32 PU_DataCacheable; //! CP15 Register 2 Opcode 0: Data Cachable Bits + u32 PU_DataCacheWrite; //! CP15 Register 3 Opcode 0: WriteBuffer Control Register - u32 PU_CodeRW; - u32 PU_DataRW; + u32 PU_CodeRW; //! CP15 Register 5 Opcode 3: Code Access Permission register + u32 PU_DataRW; //! CP15 Register 5 Opcode 2: Data Access Permission register - u32 PU_Region[8]; + u32 PU_Region[8]; //! 
CP15 Register 6 Opcode 0..7: Protection Region Base and Size Register // 0=dataR 1=dataW 2=codeR 4=datacache 5=datawrite 6=codecache u8 PU_PrivMap[0x100000]; diff --git a/src/CP15.cpp b/src/CP15.cpp index 5f84543b..fa0a426d 100644 --- a/src/CP15.cpp +++ b/src/CP15.cpp @@ -512,6 +512,7 @@ void ARMv5::ICacheInvalidateBySetAndWay(const u8 cacheSet, const u8 cacheLine) void ARMv5::ICacheInvalidateAll() { + #pragma GCC ivdep for (int i = 0; i < ICACHE_SIZE / ICACHE_LINELENGTH; i++) ICacheTags[i] &= ~CACHE_FLAG_VALID; ; } @@ -731,6 +732,7 @@ void ARMv5::DCacheInvalidateBySetAndWay(const u8 cacheSet, const u8 cacheLine) void ARMv5::DCacheInvalidateAll() { + #pragma GCC ivdep for (int i = 0; i < DCACHE_SIZE / DCACHE_LINELENGTH; i++) DCacheTags[i] &= ~CACHE_FLAG_VALID; ; } @@ -767,13 +769,11 @@ void ARMv5::CP15Write(u32 id, u32 val) case 0x100: { u32 old = CP15Control; - val &= 0x000FF085; - CP15Control &= ~0x000FF085; - CP15Control |= val; + CP15Control = (CP15Control & ~CP15_CR_CHANGEABLE_MASK) | (val & CP15_CR_CHANGEABLE_MASK); //Log(LogLevel::Debug, "CP15Control = %08X (%08X->%08X)\n", CP15Control, old, val); UpdateDTCMSetting(); UpdateITCMSetting(); - u32 changedBits = old^val; + u32 changedBits = old ^ CP15Control; if (changedBits & (CP15_CR_MPUENABLE | CP15_CACHE_CR_ICACHEENABLE| CP15_CACHE_CR_DCACHEENABLE)) { UpdatePURegions(changedBits & CP15_CR_MPUENABLE); @@ -789,10 +789,25 @@ void ARMv5::CP15Write(u32 id, u32 val) { u32 diff = PU_DataCacheable ^ val; PU_DataCacheable = val; - for (u32 i = 0; i < CP15_REGION_COUNT; i++) - { - if (diff & (1<> (i * 2) & 3) << (i * CP15_REGIONACCESS_BITS_PER_REGION); + + #if 0 + // This code just updates the PU_Map entries of the given region + // this works fine, if the regions do not overlap + // If overlapping and the least priority region access permission + // would change, this results in wrong map entries. On HW the changed + // access permissions would not be applied because of a higher priority + // region overwriting them. 
+ // + // Writing to the data permission bits is sparse, so we + // should just take the long but correct update via all regions + // so the permission priority is correct + + u32 diff = old ^ PU_DataRW; + for (u32 i = 0; i < CP15_REGION_COUNT; i++) + { + if (diff & (CP15_REGIONACCESS_REGIONMASK<<(i*CP15_REGIONACCESS_BITS_PER_REGION))) UpdatePURegion(i); + } + #else + UpdatePURegions(true); + #endif } return; @@ -844,19 +901,30 @@ void ARMv5::CP15Write(u32 id, u32 val) { u32 old = PU_CodeRW; PU_CodeRW = 0; - PU_CodeRW |= (val & 0x0003); - PU_CodeRW |= ((val & 0x000C) << 2); - PU_CodeRW |= ((val & 0x0030) << 4); - PU_CodeRW |= ((val & 0x00C0) << 6); - PU_CodeRW |= ((val & 0x0300) << 8); - PU_CodeRW |= ((val & 0x0C00) << 10); - PU_CodeRW |= ((val & 0x3000) << 12); - PU_CodeRW |= ((val & 0xC000) << 14); - u32 diff = old ^ PU_CodeRW; - for (u32 i = 0; i < CP15_REGION_COUNT; i++) - { - if (diff & (CP15_REGIONACCESS_REGIONMASK<<(i*CP15_REGIONACCESS_BITS_PER_REGION))) UpdatePURegion(i); - } + #pragma GCC ivdep + #pragma GCC unroll 8 + for (int i=0;i> (i * 2) & 3) << (i * CP15_REGIONACCESS_BITS_PER_REGION); + + #if 0 + // This code just updates the PU_Map entries of the given region + // this works fine, if the regions do not overlap + // If overlapping and the least priority region access permission + // would change, this results in wrong map entries, because it + // would on HW be overridden by the higher priority region + // + // Writing to the data permission bits is sparse, so we + // should just take the long but correct update via all regions + // so the permission priority is correct + + u32 diff = old ^ PU_CodeRW; + for (u32 i = 0; i < CP15_REGION_COUNT; i++) + { + if (diff & (CP15_REGIONACCESS_REGIONMASK<<(i*CP15_REGIONACCESS_BITS_PER_REGION))) UpdatePURegion(i); + } + #else + UpdatePURegions(true); + #endif } return; @@ -864,10 +932,23 @@ void ARMv5::CP15Write(u32 id, u32 val) { u32 diff = PU_DataRW ^ val; PU_DataRW = val; - for (u32 i = 0; i < 
CP15_REGION_COUNT; i++) - { - if (diff & (CP15_REGIONACCESS_REGIONMASK<<(i*CP15_REGIONACCESS_BITS_PER_REGION))) UpdatePURegion(i); - } + #if 0 + // This code just updates the PU_Map entries of the given region + // this works fine, if the regions do not overlap + // If overlapping and the least priority region access permission + // would change, this results in wrong map entries, because it + // would on HW be overridden by the higher priority region + // + // Writing to the data permission bits is sparse, so we + // should just take the long but correct update via all regions + // so the permission priority is correct + for (u32 i = 0; i < CP15_REGION_COUNT; i++) + { + if (diff & (CP15_REGIONACCESS_REGIONMASK<<(i*CP15_REGIONACCESS_BITS_PER_REGION))) UpdatePURegion(i); + } + #else + UpdatePURegions(true); + #endif } return; @@ -875,10 +956,23 @@ void ARMv5::CP15Write(u32 id, u32 val) { u32 diff = PU_CodeRW ^ val; PU_CodeRW = val; - for (u32 i = 0; i < CP15_REGION_COUNT; i++) - { - if (diff & (CP15_REGIONACCESS_REGIONMASK<<(i*CP15_REGIONACCESS_BITS_PER_REGION))) UpdatePURegion(i); - } + #if 0 + // This code just updates the PU_Map entries of the given region + // this works fine, if the regions do not overlap + // If overlapping and the least priority region access permission + // would change, this results in wrong map entries, because it + // would on HW be overridden by the higher priority region + // + // Writing to the data permission bits is sparse, so we + // should just take the long but correct update via all regions + // so the permission priority is correct + for (u32 i = 0; i < CP15_REGION_COUNT; i++) + { + if (diff & (CP15_REGIONACCESS_REGIONMASK<<(i*CP15_REGIONACCESS_BITS_PER_REGION))) UpdatePURegion(i); + } + #else + UpdatePURegions(true); + #endif } return; @@ -1242,28 +1336,25 @@ u32 ARMv5::CP15Read(u32 id) const case 0x500: { + // this format has 2 bits per region, but we store 4 per region + // so we reduce and consoldate the bits + // 0x502 
returns all 4 bits per region u32 ret = 0; - ret |= (PU_DataRW & 0x00000003); - ret |= ((PU_DataRW & 0x00000030) >> 2); - ret |= ((PU_DataRW & 0x00000300) >> 4); - ret |= ((PU_DataRW & 0x00003000) >> 6); - ret |= ((PU_DataRW & 0x00030000) >> 8); - ret |= ((PU_DataRW & 0x00300000) >> 10); - ret |= ((PU_DataRW & 0x03000000) >> 12); - ret |= ((PU_DataRW & 0x30000000) >> 14); + #pragma GCC ivdep + #pragma GCC unroll 8 + for (int i=0;i> (i * CP15_REGIONACCESS_BITS_PER_REGION) & CP15_REGIONACCESS_REGIONMASK) << (i*2); return ret; } case 0x501: { + // this format has 2 bits per region, but we store 4 per region + // so we reduce and consoldate the bits + // 0x503 returns all 4 bits per region u32 ret = 0; - ret |= (PU_CodeRW & 0x00000003); - ret |= ((PU_CodeRW & 0x00000030) >> 2); - ret |= ((PU_CodeRW & 0x00000300) >> 4); - ret |= ((PU_CodeRW & 0x00003000) >> 6); - ret |= ((PU_CodeRW & 0x00030000) >> 8); - ret |= ((PU_CodeRW & 0x00300000) >> 10); - ret |= ((PU_CodeRW & 0x03000000) >> 12); - ret |= ((PU_CodeRW & 0x30000000) >> 14); + #pragma GCC unroll 8 + for (int i=0;i> (i * CP15_REGIONACCESS_BITS_PER_REGION) & CP15_REGIONACCESS_REGIONMASK) << (i*2); return ret; } case 0x502: diff --git a/src/CP15_Constants.h b/src/CP15_Constants.h index b148372d..98ddf84d 100644 --- a/src/CP15_Constants.h +++ b/src/CP15_Constants.h @@ -76,31 +76,39 @@ constexpr u32 CP15_MAINID_IMPLEMENTATION_946 = (0x946 << 4); constexpr u32 CP15_MAINID_REVISION_0 = (0 << 0); constexpr u32 CP15_MAINID_REVISION_1 = (1 << 0); -/* CP15 Control Register */ -constexpr u32 CP15_CR_MPUENABLE = (1 << 0); -constexpr u32 CP15_CR_BIGENDIAN = (1 << 7); -constexpr u32 CP15_CR_HIGHEXCEPTIONBASE = (1 << 13); - -/* CP15 Internal Exception base value */ -constexpr u32 CP15_EXCEPTIONBASE_HIGH = 0xFFFF0000; -constexpr u32 CP15_EXCEPTIONBASE_LOW = 0x00000000; - /* CP15 Cache and Write Buffer Conrol Register */ constexpr u32 CP15_CACHE_CR_ROUNDROBIN = (1 << 14); constexpr u32 CP15_CACHE_CR_ICACHEENABLE = (1 << 12); 
constexpr u32 CP15_CACHE_CR_DCACHEENABLE = (1 << 2); constexpr u32 CP15_CACHE_CR_WRITEBUFFERENABLE = (1 << 3); +/* CP15 TCM Control Register */ +constexpr u32 CP15_TCM_CR_DTCM_ENABLE = (1 << 16); +constexpr u32 CP15_TCM_CR_ITCM_ENABLE = (1 << 18); +constexpr u32 CP15_TCM_CR_DTCM_LOADMODE = (1 << 17); +constexpr u32 CP15_TCM_CR_ITCM_LOADMODE = (1 << 19); + +/* CP15 Control Register */ +constexpr u32 CP15_CR_MPUENABLE = (1 << 0); +constexpr u32 CP15_CR_BIGENDIAN = (1 << 7); +constexpr u32 CP15_CR_HIGHEXCEPTIONBASE = (1 << 13); +constexpr u32 CP15_CR_DISABLE_THUMBBIT = (1 << 15); +constexpr u32 CP15_CR_CHANGEABLE_MASK = CP15_CR_MPUENABLE | CP15_CR_BIGENDIAN | CP15_CACHE_CR_DCACHEENABLE + | CP15_CACHE_CR_ICACHEENABLE | CP15_CR_HIGHEXCEPTIONBASE + | CP15_TCM_CR_DTCM_ENABLE | CP15_TCM_CR_ITCM_ENABLE + | CP15_TCM_CR_DTCM_LOADMODE | CP15_TCM_CR_ITCM_LOADMODE + | CP15_CACHE_CR_ROUNDROBIN | CP15_CR_DISABLE_THUMBBIT; + +/* CP15 Internal Exception base value */ +constexpr u32 CP15_EXCEPTIONBASE_HIGH = 0xFFFF0000; +constexpr u32 CP15_EXCEPTIONBASE_LOW = 0x00000000; + /* CP15 BIST Test State register */ constexpr u32 CP15_BIST_TR_DISABLE_ICACHE_STREAMING = (1 << 11); constexpr u32 CP15_BIST_TR_DISABLE_DCACHE_STREAMING = (1 << 12); constexpr u32 CP15_BIST_TR_DISABLE_ICACHE_LINEFILL = (1 << 9); constexpr u32 CP15_BIST_TR_DISABLE_DCACHE_LINEFILL = (1 << 10); -/* CP15 TCM Control Register */ -constexpr u32 CP15_TCM_CR_DTCM_ENABLE = (1 << 16); -constexpr u32 CP15_TCM_CR_ITCM_ENABLE = (1 << 18); - /* CP15 Region Base and Size Register */ constexpr u32 CP15_REGION_COUNT = 8; constexpr u32 CP15_REGION_ENABLE = (1 << 0); From b1637e25a46fb77d75f061332cd35fb7fd0387e5 Mon Sep 17 00:00:00 2001 From: DesperateProgrammer Date: Sun, 4 Feb 2024 12:48:06 +0100 Subject: [PATCH 032/306] Added more function documenting comments Added const properties to the CP15Write/Read functions --- src/ARM.h | 32 ++++++++++++++++++++++++++++++-- src/CP15.cpp | 4 ++-- 2 files changed, 32 insertions(+), 4 
deletions(-) diff --git a/src/ARM.h b/src/ARM.h index 61a087fd..3207628e 100644 --- a/src/ARM.h +++ b/src/ARM.h @@ -523,8 +523,36 @@ public: */ void DCacheClearByASetAndWay(const u8 cacheSet, const u8 cacheLine); - void CP15Write(u32 id, u32 val); - u32 CP15Read(u32 id) const; + /** + * @brief Handles MCR operations writing to cp15 registers + * @details + * This function updates the internal state of the emulator when + * a cp15 register is written, or triggers the corresponding action + * like flushing caches. + * + * @param [in] id the operation id to be performed, consisting of + * (from lower to higher nibble) opcode2, intermediate register, + * register and opcode1. Most write operations just take the first 3 + * into account. + * param [in] val value to be written to the cp15 register + * @par Returns + * Nothing + */ + void CP15Write(const u32 id, const u32 val); + + /** + * @brief handles MRC operations reading from cp15 registers + * @details + * This function accumulates the regsiter states from the internal + * emulator state. It does not modify the internal state of the + * emulator or cp15. + * @param [in] id the operation id to be performed, consisting of + * (from lower to higher nibble) opcode2, intermediate register, + * register and opcode1. Most read operations just take the first 3 + * into account. + * @return Value of the cp15 register + */ + u32 CP15Read(const u32 id) const; u32 CP15Control; //! 
CP15 Register 1: Control Register diff --git a/src/CP15.cpp b/src/CP15.cpp index fa0a426d..78cb8992 100644 --- a/src/CP15.cpp +++ b/src/CP15.cpp @@ -760,7 +760,7 @@ bool ARMv5::IsAddressDCachable(const u32 addr) const return PU_Map[addr >> CP15_MAP_ENTRYSIZE_LOG2] & CP15_MAP_DCACHEABLE ; } -void ARMv5::CP15Write(u32 id, u32 val) +void ARMv5::CP15Write(const u32 id, const u32 val) { //if(id!=0x704)printf("CP15 write op %03X %08X %08X\n", id, val, R[15]); @@ -1299,7 +1299,7 @@ void ARMv5::CP15Write(u32 id, u32 val) Log(LogLevel::Debug, "unknown CP15 write op %04X %08X\n", id, val); } -u32 ARMv5::CP15Read(u32 id) const +u32 ARMv5::CP15Read(const u32 id) const { //printf("CP15 read op %03X %08X\n", id, NDS::ARM9->R[15]); From d5a351aefec6a013f8f235b8aa45849a42f14e55 Mon Sep 17 00:00:00 2001 From: DesperateProgrammer Date: Sun, 4 Feb 2024 13:56:03 +0100 Subject: [PATCH 033/306] Added more documenting comments --- src/ARM.h | 60 ++++++++++++++++++++++++++++++++++++------------------- 1 file changed, 39 insertions(+), 21 deletions(-) diff --git a/src/ARM.h b/src/ARM.h index 3207628e..2e900851 100644 --- a/src/ARM.h +++ b/src/ARM.h @@ -303,7 +303,18 @@ public: void CP15Reset(); void CP15DoSavestate(Savestate* file); + /** + * @brief Caclulates the internal state from @ref DTCMSettings + * @par Returns + * Nothing + */ void UpdateDTCMSetting(); + + /** + * @brief Caclulates the internal state from @ref ITCMSettings + * @par Returns + * Nothing + */ void UpdateITCMSetting(); void UpdatePURegion(u32 n); @@ -558,22 +569,23 @@ public: u32 RNGSeed; //! Global cache line fill seed. Used for pseudo random replacement strategy with the instruction and data cache - u32 DTCMSetting; - u32 ITCMSetting; - u32 DCacheLockDown; //! CP15: Data Cache Lockdown Register - u32 ICacheLockDown; //! CP15: Instruction Cache Lockdown Register + u32 DTCMSetting; //! CP15 Register 9 Intermediate 1 Opcode2 0: Data Tightly-Coupled Memory register + u32 ITCMSetting; //! 
CP15 Register 9 Intermediate 1 Opcode2 1: Instruction Tightly-Coupled Memory register + u32 DCacheLockDown; //! CP15 Register 9 Intermediate 0 Opcode2 0: Data Cache Lockdown Register + u32 ICacheLockDown; //! CP15 Register 9 Intermediate 0 Opcode2 1: Instruction Cache Lockdown Register u32 CacheDebugRegisterIndex; //! CP15: Cache Debug Index Register u32 CP15TraceProcessId; //! CP15: Trace Process Id Register u32 CP15BISTTestStateRegister; //! CP15: BIST Test State Register // for aarch64 JIT they need to go up here // to be addressable by a 12-bit immediate - u32 ITCMSize; - u32 DTCMBase, DTCMMask; - s32 RegionCodeCycles; + u32 ITCMSize; //! Internal: Size of the memory ITCM is mapped to. @ref ITCM data repeats every @ref ITCMPhysicalSize withhin + u32 DTCMBase; //! Internal: DTCMBase Address. The DTCM can be accessed if the address & ~ @ref DTCMMask is equal to thhis base address + u32 DTCMMask; //! Internal: DTCM Address Mask used in conjunction with @ref DTCMBase to check for DTCM access + s32 RegionCodeCycles; //! Internal: Cached amount of cycles to fetch instruction from the current code region. - u8 ITCM[ITCMPhysicalSize]; - u8* DTCM; + u8 ITCM[ITCMPhysicalSize]; //! Content of the ITCM + u8* DTCM; //! Content of the DTCM u8 ICache[ICACHE_SIZE]; //! Instruction Cache Content organized in @ref ICACHE_LINESPERSET times @ref ICACHE_SETS times @ref ICACHE_LINELENGTH bytes u32 ICacheTags[ICACHE_LINESPERSET*ICACHE_SETS]; //! Instruction Cache Tags organized in @ref ICACHE_LINESPERSET times @ref ICACHE_SETS Tags @@ -583,22 +595,28 @@ public: u32 DCacheTags[DCACHE_LINESPERSET*DCACHE_SETS]; //! Data Cache Tags organized in @ref DCACHE_LINESPERSET times @ref DCACHE_SETS Tags u8 DCacheCount; //! Global data line fill counter. Used for round-robin replacement strategy with the instruction cache - u32 PU_CodeCacheable; //! CP15 Register 2 Opcode 1: Code Cachable Bits - u32 PU_DataCacheable; //! CP15 Register 2 Opcode 0: Data Cachable Bits - u32 PU_DataCacheWrite; //! 
CP15 Register 3 Opcode 0: WriteBuffer Control Register + u32 PU_CodeCacheable; //! CP15 Register 2 Opcode2 1: Code Cachable Bits + u32 PU_DataCacheable; //! CP15 Register 2 Opcode2 0: Data Cachable Bits + u32 PU_DataCacheWrite; //! CP15 Register 3 Opcode2 0: WriteBuffer Control Register - u32 PU_CodeRW; //! CP15 Register 5 Opcode 3: Code Access Permission register - u32 PU_DataRW; //! CP15 Register 5 Opcode 2: Data Access Permission register + u32 PU_CodeRW; //! CP15 Register 5 Opcode2 3: Code Access Permission register + u32 PU_DataRW; //! CP15 Register 5 Opcode2 2: Data Access Permission register - u32 PU_Region[8]; //! CP15 Register 6 Opcode 0..7: Protection Region Base and Size Register + u32 PU_Region[8]; //! CP15 Register 6 Opcode2 0..7: Protection Region Base and Size Register // 0=dataR 1=dataW 2=codeR 4=datacache 5=datawrite 6=codecache - u8 PU_PrivMap[0x100000]; - u8 PU_UserMap[0x100000]; - - // games operate under system mode, generally - //#define PU_Map PU_PrivMap - u8* PU_Map; + u8 PU_PrivMap[0x100000]; /** + * Memory mapping flags for Privileged Modes + * Bits: + * 0 - CP15_MAP_READABLE + * 1 - CP15_MAP_WRITEABLE + * 2 - CP15_MAP_EXECUTABLE + * 4 - CP15_MAP_DCACHEABLE + * 5 - CP15_MAP_DCACHEWRITEBACK + * 6 - CP15_MAP_ICACHEABLE + */ + u8 PU_UserMap[0x100000]; //! Memory mapping flags for User Mode + u8* PU_Map; //! 
Current valid Region Mapping (is either @ref PU_PrivMap or PU_UserMap) // code/16N/32N/32S u8 MemTimings[0x100000][4]; From 4b20c1bc0f3d333dae0c19cd3c2335c552f96d66 Mon Sep 17 00:00:00 2001 From: DesperateProgrammer Date: Mon, 5 Feb 2024 12:30:39 +0100 Subject: [PATCH 034/306] Added more documenting comments Replaced mogic values with named constants Added const specifier to some argument and subsequent calls --- src/ARM.h | 55 ++++++++++++++++++++----- src/CP15.cpp | 96 +++++++++++++++++++++----------------------- src/CP15_Constants.h | 25 +++++++++++- src/DSi.cpp | 2 +- src/DSi.h | 2 +- src/NDS.cpp | 2 +- src/NDS.h | 2 +- 7 files changed, 119 insertions(+), 65 deletions(-) diff --git a/src/ARM.h b/src/ARM.h index 2e900851..6b32983d 100644 --- a/src/ARM.h +++ b/src/ARM.h @@ -298,27 +298,64 @@ public: // Cycles += numC + numD; } - void GetCodeMemRegion(u32 addr, MemRegion* region); + void GetCodeMemRegion(const u32 addr, MemRegion* region); + /** + * @brief Resets the state of all CP15 registers and variables + * to power up state. + * @par Returns + * Nothing + */ void CP15Reset(); + + /** + * @brief handles read and write operations to a save-state + * file. + * @param [in] file Savestate file + * @par Returns + * Nothing + */ void CP15DoSavestate(Savestate* file); /** - * @brief Caclulates the internal state from @ref DTCMSettings + * @brief Calculates the internal state from @ref DTCMSettings * @par Returns * Nothing */ void UpdateDTCMSetting(); /** - * @brief Caclulates the internal state from @ref ITCMSettings + * @brief Calculates the internal state from @ref ITCMSettings * @par Returns * Nothing */ void UpdateITCMSetting(); - void UpdatePURegion(u32 n); - void UpdatePURegions(bool update_all); + /** + * @brief Calculates the internal state from the + * region protection bits of a specific region number + * @details + * This function updates the PU_####Map array in all + * parts that are occupied by this region. 
Updating a single + * region does not take into account the priority of the + * regions. + * @param [in] n index of the region from 0 to @ref CP15_REGION_COUNT - 1 + * @par Returns + * Nothing + */ + void UpdatePURegion(const u32 n); + + /** + * @brief Calculates the internal state from all region + * protection bits. + * @details + * This function updates the internal state in order from the + * least to the most priotized regions, so that the + * priority of the regions match the internal state + * @par Returns + * Nothing + */ + void UpdatePURegions(const bool update_all); u32 RandomLineIndex(); @@ -602,10 +639,10 @@ public: u32 PU_CodeRW; //! CP15 Register 5 Opcode2 3: Code Access Permission register u32 PU_DataRW; //! CP15 Register 5 Opcode2 2: Data Access Permission register - u32 PU_Region[8]; //! CP15 Register 6 Opcode2 0..7: Protection Region Base and Size Register + u32 PU_Region[CP15_REGION_COUNT]; //! CP15 Register 6 Opcode2 0..7: Protection Region Base and Size Register // 0=dataR 1=dataW 2=codeR 4=datacache 5=datawrite 6=codecache - u8 PU_PrivMap[0x100000]; /** + u8 PU_PrivMap[CP15_MAP_ENTRYCOUNT]; /** * Memory mapping flags for Privileged Modes * Bits: * 0 - CP15_MAP_READABLE @@ -615,11 +652,11 @@ public: * 5 - CP15_MAP_DCACHEWRITEBACK * 6 - CP15_MAP_ICACHEABLE */ - u8 PU_UserMap[0x100000]; //! Memory mapping flags for User Mode + u8 PU_UserMap[CP15_MAP_ENTRYCOUNT]; //! Memory mapping flags for User Mode u8* PU_Map; //! Current valid Region Mapping (is either @ref PU_PrivMap or PU_UserMap) // code/16N/32N/32S - u8 MemTimings[0x100000][4]; + u8 MemTimings[CP15_MAP_ENTRYCOUNT][4]; bool (*GetMemRegion)(u32 addr, bool write, MemRegion* region); diff --git a/src/CP15.cpp b/src/CP15.cpp index 78cb8992..576f9f8d 100644 --- a/src/CP15.cpp +++ b/src/CP15.cpp @@ -40,36 +40,32 @@ const int kDataCacheTiming = 3;//2; const int kCodeCacheTiming = 3;//5; -/* CP15 Reset sets the default values within each registers and - memories of the CP15. 
- This includes the Settings for - DTCM - ITCM - Caches - Regions - Process Trace -*/ - void ARMv5::CP15Reset() { CP15Control = 0x2078; // dunno RNGSeed = 44203; + // Memory Regions Protection + PU_CodeRW = 0; + PU_DataRW = 0; + + memset(PU_Region, 0, CP15_REGION_COUNT*sizeof(*PU_Region)); + + // TCM-Settings DTCMSetting = 0; ITCMSetting = 0; memset(ITCM, 0, ITCMPhysicalSize); memset(DTCM, 0, DTCMPhysicalSize); - ITCMSize = 0; - DTCMBase = 0xFFFFFFFF; - DTCMMask = 0; + // Cache Settings + PU_CodeCacheable = 0; + PU_DataCacheable = 0; + PU_DataCacheWrite = 0; ICacheLockDown = 0; DCacheLockDown = 0; - CacheDebugRegisterIndex = 0; - CP15BISTTestStateRegister = 0; memset(ICache, 0, ICACHE_SIZE); ICacheInvalidateAll(); @@ -79,18 +75,15 @@ void ARMv5::CP15Reset() DCacheInvalidateAll(); DCacheCount = 0; + // Debug / Misc Registers + CacheDebugRegisterIndex = 0; + CP15BISTTestStateRegister = 0; CP15TraceProcessId = 0; - PU_CodeCacheable = 0; - PU_DataCacheable = 0; - PU_DataCacheWrite = 0; - - PU_CodeRW = 0; - PU_DataRW = 0; - - memset(PU_Region, 0, CP15_REGION_COUNT*sizeof(u32)); + // And now Update the internal state + UpdateDTCMSetting(); + UpdateITCMSetting(); UpdatePURegions(true); - } void ARMv5::CP15DoSavestate(Savestate* file) @@ -145,13 +138,16 @@ void ARMv5::UpdateDTCMSetting() if (CP15Control & CP15_TCM_CR_DTCM_ENABLE) { - newDTCMSize = 0x200 << ((DTCMSetting >> 1) & 0x1F); - if (newDTCMSize < 0x1000) newDTCMSize = 0x1000; - newDTCMMask = 0xFFFFF000 & ~(newDTCMSize-1); + newDTCMSize = CP15_DTCM_SIZE_BASE << ((DTCMSetting & CP15_DTCM_SIZE_MASK) >> CP15_DTCM_SIZE_POS); + if (newDTCMSize < (CP15_DTCM_SIZE_BASE << CP15_DTCM_SIZE_MIN)) + newDTCMSize = CP15_DTCM_SIZE_BASE << CP15_DTCM_SIZE_MIN; + + newDTCMMask = CP15_DTCM_BASE_MASK & ~(newDTCMSize-1); newDTCMBase = DTCMSetting & newDTCMMask; } else { + // DTCM Disabled newDTCMSize = 0; newDTCMBase = 0xFFFFFFFF; newDTCMMask = 0; @@ -169,7 +165,7 @@ void ARMv5::UpdateITCMSetting() { if (CP15Control & 
CP15_TCM_CR_ITCM_ENABLE) { - ITCMSize = 0x200 << ((ITCMSetting >> 1) & 0x1F); + ITCMSize = CP15_ITCM_SIZE_BASE << ((ITCMSetting & CP15_ITCM_SIZE_MASK) >> CP15_ITCM_SIZE_POS); #ifdef JIT_ENABLED FastBlockLookupSize = 0; #endif @@ -183,7 +179,7 @@ void ARMv5::UpdateITCMSetting() // covers updates to a specific PU region's cache/etc settings // (not to the region range/enabled status) -void ARMv5::UpdatePURegion(u32 n) +void ARMv5::UpdatePURegion(const u32 n) { if (!(CP15Control & CP15_CR_MPUENABLE)) return; @@ -194,7 +190,7 @@ void ARMv5::UpdatePURegion(u32 n) u32 coderw = (PU_CodeRW >> (CP15_REGIONACCESS_BITS_PER_REGION * n)) & CP15_REGIONACCESS_REGIONMASK; u32 datarw = (PU_DataRW >> (CP15_REGIONACCESS_BITS_PER_REGION * n)) & CP15_REGIONACCESS_REGIONMASK; - u32 codecache, datacache, datawrite; + bool codecache, datacache, datawrite; // datacache/datawrite // 0/0: goes to memory @@ -205,7 +201,7 @@ void ARMv5::UpdatePURegion(u32 n) if (CP15Control & CP15_CACHE_CR_ICACHEENABLE) codecache = (PU_CodeCacheable >> n) & 0x1; else - codecache = 0; + codecache = false; if (CP15Control & CP15_CACHE_CR_DCACHEENABLE) { @@ -214,12 +210,12 @@ void ARMv5::UpdatePURegion(u32 n) } else { - datacache = 0; - datawrite = 0; + datacache = false; + datawrite = false; } u32 rgn = PU_Region[n]; - if (!(rgn & (1<<0))) + if (!(rgn & CP15_REGION_ENABLE)) { return; } @@ -240,7 +236,7 @@ void ARMv5::UpdatePURegion(u32 n) case 3: privmask |= CP15_MAP_READABLE | CP15_MAP_WRITEABLE; usermask |= CP15_MAP_READABLE | CP15_MAP_WRITEABLE; break; case 5: privmask |= CP15_MAP_READABLE; break; case 6: privmask |= CP15_MAP_READABLE; usermask |= CP15_MAP_READABLE; break; - default: Log(LogLevel::Warn, "!! BAD DATARW VALUE %d\n", datarw&0xF); + default: Log(LogLevel::Warn, "!! 
BAD DATARW VALUE %d\n", datarw & ((1 << CP15_REGIONACCESS_BITS_PER_REGION)-1)); } switch (coderw) @@ -251,22 +247,22 @@ void ARMv5::UpdatePURegion(u32 n) case 3: privmask |= CP15_MAP_EXECUTABLE; usermask |= CP15_MAP_EXECUTABLE; break; case 5: privmask |= CP15_MAP_EXECUTABLE; break; case 6: privmask |= CP15_MAP_EXECUTABLE; usermask |= CP15_MAP_EXECUTABLE; break; - default: Log(LogLevel::Warn, "!! BAD CODERW VALUE %d\n", datarw&0xF); + default: Log(LogLevel::Warn, "!! BAD CODERW VALUE %d\n", datarw & ((1 << CP15_REGIONACCESS_BITS_PER_REGION)-1)); } - if (datacache & 0x1) + if (datacache) { privmask |= CP15_MAP_DCACHEABLE; usermask |= CP15_MAP_DCACHEABLE; - if (datawrite & 0x1) + if (datawrite) { privmask |= CP15_MAP_DCACHEWRITEBACK; usermask |= CP15_MAP_DCACHEWRITEBACK; } } - if (codecache & 0x1) + if (codecache) { privmask |= CP15_MAP_ICACHEABLE; usermask |= CP15_MAP_ICACHEABLE; @@ -293,7 +289,7 @@ void ARMv5::UpdatePURegion(u32 n) UpdateRegionTimings(start, end); } -void ARMv5::UpdatePURegions(bool update_all) +void ARMv5::UpdatePURegions(const bool update_all) { if (!(CP15Control & CP15_CR_MPUENABLE)) { @@ -303,17 +299,17 @@ void ARMv5::UpdatePURegions(bool update_all) if (CP15Control & CP15_CACHE_CR_DCACHEENABLE) mask |= CP15_MAP_DCACHEABLE | CP15_MAP_DCACHEWRITEBACK ; if (CP15Control & CP15_CACHE_CR_ICACHEENABLE) mask |= CP15_MAP_ICACHEABLE; - memset(PU_UserMap, mask, 0x100000); - memset(PU_PrivMap, mask, 0x100000); + memset(PU_UserMap, mask, CP15_MAP_ENTRYCOUNT); + memset(PU_PrivMap, mask, CP15_MAP_ENTRYCOUNT); - UpdateRegionTimings(0x00000, 0x100000); + UpdateRegionTimings(0x00000, CP15_MAP_ENTRYCOUNT); return; } if (update_all) { - memset(PU_UserMap, CP15_MAP_NOACCESS, 0x100000); - memset(PU_PrivMap, CP15_MAP_NOACCESS, 0x100000); + memset(PU_UserMap, CP15_MAP_NOACCESS, CP15_MAP_ENTRYCOUNT); + memset(PU_PrivMap, CP15_MAP_NOACCESS, CP15_MAP_ENTRYCOUNT); } for (int n = 0; n < CP15_REGION_COUNT; n++) @@ -323,7 +319,7 @@ void ARMv5::UpdatePURegions(bool 
update_all) // TODO: this is way unoptimized // should be okay unless the game keeps changing shit, tho - if (update_all) UpdateRegionTimings(0x00000, 0x100000); + if (update_all) UpdateRegionTimings(0x00000, CP15_MAP_ENTRYCOUNT); // TODO: throw exception if the region we're running in has become non-executable, I guess } @@ -1207,12 +1203,12 @@ void ARMv5::CP15Write(const u32 id, const u32 val) return; case 0x910: - DTCMSetting = val & 0xFFFFF03E; + DTCMSetting = val & (CP15_DTCM_BASE_MASK | CP15_DTCM_SIZE_MASK); UpdateDTCMSetting(); return; case 0x911: - ITCMSetting = val & 0x0000003E; + ITCMSetting = val & (CP15_ITCM_BASE_MASK | CP15_ITCM_SIZE_MASK); UpdateITCMSetting(); return; @@ -1319,7 +1315,7 @@ u32 ARMv5::CP15Read(const u32 id) const | (ICACHE_LINELENGTH_ENCODED << 0) | (ICACHE_SETS_LOG2 << 3) | ((ICACHE_SIZE_LOG2 - 9) << 6); case 0x002: // TCM size - return (6 << 6) | (5 << 18); + return CP15_TCMSIZE_ITCM_32KB | CP15_TCMSIZE_DTCM_16KB; case 0x100: // control reg @@ -1828,7 +1824,7 @@ void ARMv5::DataWrite32S(const u32 addr, const u32 val) DataCycles += MemTimings[addr >> BUSCYCLES_MAP_GRANULARITY_LOG2][BUSCYCLES_S32]; } -void ARMv5::GetCodeMemRegion(u32 addr, MemRegion* region) +void ARMv5::GetCodeMemRegion(const u32 addr, MemRegion* region) { NDS.ARM9GetMemRegion(addr, false, &CodeMem); } diff --git a/src/CP15_Constants.h b/src/CP15_Constants.h index 98ddf84d..3b3cbf9a 100644 --- a/src/CP15_Constants.h +++ b/src/CP15_Constants.h @@ -76,6 +76,10 @@ constexpr u32 CP15_MAINID_IMPLEMENTATION_946 = (0x946 << 4); constexpr u32 CP15_MAINID_REVISION_0 = (0 << 0); constexpr u32 CP15_MAINID_REVISION_1 = (1 << 0); +/* CP15 TCM Size Register */ +constexpr u32 CP15_TCMSIZE_DTCM_16KB = (5 << 18); +constexpr u32 CP15_TCMSIZE_ITCM_32KB = (6 << 6); + /* CP15 Cache and Write Buffer Conrol Register */ constexpr u32 CP15_CACHE_CR_ROUNDROBIN = (1 << 14); constexpr u32 CP15_CACHE_CR_ICACHEENABLE = (1 << 12); @@ -85,8 +89,24 @@ constexpr u32 CP15_CACHE_CR_WRITEBUFFERENABLE = 
(1 << 3); /* CP15 TCM Control Register */ constexpr u32 CP15_TCM_CR_DTCM_ENABLE = (1 << 16); constexpr u32 CP15_TCM_CR_ITCM_ENABLE = (1 << 18); -constexpr u32 CP15_TCM_CR_DTCM_LOADMODE = (1 << 17); -constexpr u32 CP15_TCM_CR_ITCM_LOADMODE = (1 << 19); +constexpr u32 CP15_TCM_CR_DTCM_LOADMODE = (1 << 17); // TODO +constexpr u32 CP15_TCM_CR_ITCM_LOADMODE = (1 << 19); // TODO + +/* CP15 DTCM Settings Register */ +constexpr u32 CP15_DTCM_SIZE_BASE = 0x200; +constexpr u32 CP15_DTCM_SIZE_MASK = 0x3E; +constexpr u32 CP15_DTCM_SIZE_POS = 1; +constexpr u32 CP15_DTCM_SIZE_MIN = 0b00011; +constexpr u32 CP15_DTCM_SIZE_MAX = 0b10111; +constexpr u32 CP15_DTCM_BASE_MASK = 0xFFFFF000; + +/* CP15 ITCM Settings Register */ +constexpr u32 CP15_ITCM_SIZE_BASE = 0x200; +constexpr u32 CP15_ITCM_SIZE_MASK = 0x3E; +constexpr u32 CP15_ITCM_SIZE_POS = 1; +constexpr u32 CP15_ITCM_SIZE_MIN = 0b00011; +constexpr u32 CP15_ITCM_SIZE_MAX = 0b10111; +constexpr u32 CP15_ITCM_BASE_MASK = 0x00000000; /* CP15 Control Register */ constexpr u32 CP15_CR_MPUENABLE = (1 << 0); @@ -132,6 +152,7 @@ constexpr u32 CP15_MAP_ICACHEABLE = 0x40; constexpr u32 CP15_MAP_ENTRYSIZE_LOG2 = CP15_REGION_BASE_GRANULARITY_LOG2; constexpr u32 CP15_MAP_ENTRYSIZE = (1 << CP15_MAP_ENTRYSIZE_LOG2); +constexpr u32 CP15_MAP_ENTRYCOUNT = 1 << (32 - CP15_MAP_ENTRYSIZE_LOG2); /* Internal Timing Constants */ constexpr u32 BUSCYCLES_N16 = 0; diff --git a/src/DSi.cpp b/src/DSi.cpp index c929c6d2..fbc72ed3 100644 --- a/src/DSi.cpp +++ b/src/DSi.cpp @@ -1705,7 +1705,7 @@ void DSi::ARM9Write32(u32 addr, u32 val) return NDS::ARM9Write32(addr, val); } -bool DSi::ARM9GetMemRegion(u32 addr, bool write, MemRegion* region) +bool DSi::ARM9GetMemRegion(const u32 addr, const bool write, MemRegion* region) { assert(ConsoleType == 1); switch (addr & 0xFF000000) diff --git a/src/DSi.h b/src/DSi.h index 1d010e0f..0da0596c 100644 --- a/src/DSi.h +++ b/src/DSi.h @@ -104,7 +104,7 @@ public: void ARM9Write16(u32 addr, u16 val) override; void 
ARM9Write32(u32 addr, u32 val) override; - bool ARM9GetMemRegion(u32 addr, bool write, MemRegion* region) override; + bool ARM9GetMemRegion(const u32 addr, const bool write, MemRegion* region) override; u8 ARM7Read8(u32 addr) override; u16 ARM7Read16(u32 addr) override; diff --git a/src/NDS.cpp b/src/NDS.cpp index 1f9597ce..b5cf7b4f 100644 --- a/src/NDS.cpp +++ b/src/NDS.cpp @@ -2227,7 +2227,7 @@ void NDS::ARM9Write32(u32 addr, u32 val) //Log(LogLevel::Warn, "unknown arm9 write32 %08X %08X | %08X\n", addr, val, ARM9.R[15]); } -bool NDS::ARM9GetMemRegion(u32 addr, bool write, MemRegion* region) +bool NDS::ARM9GetMemRegion(const u32 addr, const bool write, MemRegion* region) { switch (addr & 0xFF000000) { diff --git a/src/NDS.h b/src/NDS.h index cfb8e3b5..65dc2764 100644 --- a/src/NDS.h +++ b/src/NDS.h @@ -442,7 +442,7 @@ public: // TODO: Encapsulate the rest of these members virtual void ARM9Write16(u32 addr, u16 val); virtual void ARM9Write32(u32 addr, u32 val); - virtual bool ARM9GetMemRegion(u32 addr, bool write, MemRegion* region); + virtual bool ARM9GetMemRegion(const u32 addr, const bool write, MemRegion* region); virtual u8 ARM7Read8(u32 addr); virtual u16 ARM7Read16(u32 addr); From 129a3e0535bee095ed8ff660613b69aec4d05ed7 Mon Sep 17 00:00:00 2001 From: DesperateProgrammer Date: Tue, 6 Feb 2024 08:19:05 +0100 Subject: [PATCH 035/306] Added write-back ability in addition to write-through for the data cache --- src/ARM.h | 24 +++-- src/CP15.cpp | 205 +++++++++++++++++++++++++++++++++++++------ src/CP15_Constants.h | 4 + 3 files changed, 195 insertions(+), 38 deletions(-) diff --git a/src/ARM.h b/src/ARM.h index 6b32983d..9f0cbf64 100644 --- a/src/ARM.h +++ b/src/ARM.h @@ -449,28 +449,34 @@ public: * @brief Updates a word in the data cache if present * @param [in] addr Memory address which is written * @param [in] val Word value to be written - * @par Returns - * Nothing + * @retval true, if the data was written into the cache and + * does not need to be updated 
until cache is + * cleaned + * false, to write through */ - void DCacheWrite32(const u32 addr, const u32 val); + bool DCacheWrite32(const u32 addr, const u32 val); /** * @brief Updates a word in the data cache if present * @param [in] addr Memory address which is written * @param [in] val Half-Word value to be written - * @par Returns - * Nothing + * @retval true, if the data was written into the cache and + * does not need to be updated until cache is + * cleaned + * false, to write through */ - void DCacheWrite16(const u32 addr, const u16 val); + bool DCacheWrite16(const u32 addr, const u16 val); /** * @brief Updates a word in the data cache if present * @param [in] addr Memory address which is written * @param [in] val Byte value to be written - * @par Returns - * Nothing + * @retval true, if the data was written into the cache and + * does not need to be updated until cache is + * cleaned + * false, to write through */ - void DCacheWrite8(const u32 addr, const u8 val); + bool DCacheWrite8(const u32 addr, const u8 val); /** * @brief Check if an address is within a data cachable region diff --git a/src/CP15.cpp b/src/CP15.cpp index 576f9f8d..22be6a52 100644 --- a/src/CP15.cpp +++ b/src/CP15.cpp @@ -548,6 +548,7 @@ u32 ARMv5::DCacheLookup(const u32 addr) return BusRead32(addr & ~3); } } + //Log(LogLevel::Debug, "DCache hit at %08lx returned %08x from set %i, line %i\n", addr, cacheLine[(addr & (ICACHE_LINELENGTH -1)) >> 2], set, id>>2); return cacheLine[(addr & (ICACHE_LINELENGTH -1)) >> 2]; } } @@ -621,6 +622,11 @@ u32 ARMv5::DCacheLookup(const u32 addr) u32* ptr = (u32 *)&DCache[line << DCACHE_LINELENGTH_LOG2]; + DataCycles = 0; + // Before we fill the cacheline, we need to write back dirty content + // Datacycles will be incremented by the required cycles to do so + DCacheClearByASetAndWay(line & (DCACHE_SETS-1), line >> DCACHE_SETS_LOG2); + //Log(LogLevel::Debug,"DCache miss, load @ %08x\n", tag); for (int i = 0; i < DCACHE_LINELENGTH; i+=sizeof(u32)) { @@ 
-635,7 +641,13 @@ u32 ARMv5::DCacheLookup(const u32 addr) { ptr[i >> 2] = BusRead32(tag+i); } - //Log(LogLevel::Debug,"DCache store @ %08x: %08x\n", tag+i, *(u32*)&ptr[i]); + //Log(LogLevel::Debug,"DCache store @ %08x: %08x in set %i, line %i\n", tag+i, *(u32*)&ptr[i >> 2], line & 3, line >> 2); + if ((addr == 0x02007ea0) && ((*(u32*)&ptr[i]) == 0x1239 )) + { + // Halt(1); + // exit(1); + } + } DCacheTags[line] = tag | (line & (DCACHE_SETS-1)) | CACHE_FLAG_VALID; @@ -643,59 +655,107 @@ u32 ARMv5::DCacheLookup(const u32 addr) // ouch :/ //printf("cache miss %08X: %d/%d\n", addr, NDS::ARM9MemTimings[addr >> 14][2], NDS::ARM9MemTimings[addr >> 14][3]); // first N32 remaining S32 - DataCycles = (NDS.ARM9MemTimings[tag >> 14][2] + (NDS.ARM9MemTimings[tag >> 14][3] * ((DCACHE_LINELENGTH / 4) - 1))) << NDS.ARM9ClockShift; + DataCycles += (NDS.ARM9MemTimings[tag >> 14][2] + (NDS.ARM9MemTimings[tag >> 14][3] * ((DCACHE_LINELENGTH / 4) - 1))) << NDS.ARM9ClockShift; return ptr[(addr & (DCACHE_LINELENGTH-1)) >> 2]; } -void ARMv5::DCacheWrite32(const u32 addr, const u32 val) +bool ARMv5::DCacheWrite32(const u32 addr, const u32 val) { const u32 tag = (addr & ~(DCACHE_LINELENGTH - 1)) | CACHE_FLAG_VALID; const u32 id = ((addr >> DCACHE_LINELENGTH_LOG2) & (DCACHE_LINESPERSET-1)) << DCACHE_SETS_LOG2; + //Log(LogLevel::Debug, "Cache write 32: %08lx <= %08lx\n", addr, val); + for (int set=0;set> 2] = val; + cacheLine[(addr & (DCACHE_LINELENGTH-1)) >> 2] = val; DataCycles = 1; - return; + //if (PU_Map[addr >> CP15_MAP_ENTRYSIZE_LOG2] & CP15_MAP_DCACHEWRITEBACK) + { + if (addr & (DCACHE_LINELENGTH / 2)) + { + DCacheTags[id+set] |= CACHE_FLAG_DIRTY_UPPERHALF ; + } + else + { + DCacheTags[id+set] |= CACHE_FLAG_DIRTY_LOWERHALF ; + } + // just mark dirty and abort the data write through the bus + return true; + } + return false; } } + return false; } -void ARMv5::DCacheWrite16(const u32 addr, const u16 val) +bool ARMv5::DCacheWrite16(const u32 addr, const u16 val) { const u32 tag = (addr & 
~(DCACHE_LINELENGTH - 1)) | CACHE_FLAG_VALID; const u32 id = ((addr >> DCACHE_LINELENGTH_LOG2) & (DCACHE_LINESPERSET-1)) << DCACHE_SETS_LOG2; + //Log(LogLevel::Debug, "Cache write 16: %08lx <= %04x\n", addr, val); for (int set=0;set> 1] = val; + cacheLine[(addr & (DCACHE_LINELENGTH-1)) >> 1] = val; DataCycles = 1; - return; + //if (PU_Map[addr >> CP15_MAP_ENTRYSIZE_LOG2] & CP15_MAP_DCACHEWRITEBACK) + { + if (addr & (DCACHE_LINELENGTH / 2)) + { + DCacheTags[id+set] |= CACHE_FLAG_DIRTY_UPPERHALF ; + } + else + { + DCacheTags[id+set] |= CACHE_FLAG_DIRTY_LOWERHALF ; + } + // just mark dirtyand abort the data write through the bus + return true; + } + return false; } } + return false; } -void ARMv5::DCacheWrite8(const u32 addr, const u8 val) +bool ARMv5::DCacheWrite8(const u32 addr, const u8 val) { const u32 tag = (addr & ~(DCACHE_LINELENGTH - 1)) | CACHE_FLAG_VALID; const u32 id = ((addr >> DCACHE_LINELENGTH_LOG2) & (DCACHE_LINESPERSET-1)) << DCACHE_SETS_LOG2;; + //Log(LogLevel::Debug, "Cache write 8: %08lx <= %02x\n", addr, val); + for (int set=0;set> CP15_MAP_ENTRYSIZE_LOG2] & CP15_MAP_DCACHEWRITEBACK) + { + if (addr & (DCACHE_LINELENGTH / 2)) + { + DCacheTags[id+set] |= CACHE_FLAG_DIRTY_UPPERHALF ; + } + else + { + DCacheTags[id+set] |= CACHE_FLAG_DIRTY_LOWERHALF ; + } + + // just mark dirty and abort the data write through the bus + return true; + } + return false; } - } + } + return false; } void ARMv5::DCacheInvalidateByAddr(const u32 addr) @@ -735,20 +795,84 @@ void ARMv5::DCacheInvalidateAll() void ARMv5::DCacheClearAll() { - // TODO: right now any write to cached data goes straight to the - // underlying memory and invalidates the cache line. 
+ for (int set = 0;set> DCACHE_LINELENGTH_LOG2) & (DCACHE_LINESPERSET-1)) << DCACHE_SETS_LOG2; + + for (int set=0;set> DCACHE_SETS_LOG2); + return; + } + } } void ARMv5::DCacheClearByASetAndWay(const u8 cacheSet, const u8 cacheLine) { - // TODO: right now any write to cached data goes straight to the - // underlying memory and invalidates the cache line. + const u32 index = cacheSet | (cacheLine << DCACHE_SETS_LOG2); + + // Only write back if valid + if (!(DCacheTags[index] & CACHE_FLAG_VALID)) + return ; + + const u32 tag = DCacheTags[index] & ~CACHE_FLAG_MASK; + u32* ptr = (u32 *)&DCache[index << DCACHE_LINELENGTH_LOG2]; + + if (DCacheTags[index] & CACHE_FLAG_DIRTY_LOWERHALF) + { + //Log(LogLevel::Debug, "Writing back %i / %i, lower half -> %08lx\n", cacheSet, cacheLine, tag); +#if 1 + for (int i = 0; i < DCACHE_LINELENGTH / 2; i+=sizeof(u32)) + { + //Log(LogLevel::Debug, " WB Value %08x\n", ptr[i >> 2]); + if (tag+i < ITCMSize) + { + *(u32*)&ITCM[(tag+i) & (ITCMPhysicalSize - 1)] = ptr[i >> 2] ; + NDS.JIT.CheckAndInvalidate<0, ARMJIT_Memory::memregion_ITCM>(tag+i); + } else + if (((tag+i) & DTCMMask) == DTCMBase) + { + *(u32*)&DTCM[(tag+i) & (DTCMPhysicalSize - 1)] = ptr[i >> 2]; + } else + { + BusWrite32(tag+i, ptr[i >> 2]); + } + } +#endif + DataCycles += (NDS.ARM9MemTimings[tag >> 14][2] + (NDS.ARM9MemTimings[tag >> 14][3] * ((DCACHE_LINELENGTH / 8) - 1))) << NDS.ARM9ClockShift; + } + if (DCacheTags[index] & CACHE_FLAG_DIRTY_UPPERHALF) + { + //Log(LogLevel::Debug, "Writing back %i / %i, upper half-> %08lx\n", cacheSet, cacheLine, tag); +#if 1 + for (int i = DCACHE_LINELENGTH / 2; i < DCACHE_LINELENGTH; i+=sizeof(u32)) + { + //Log(LogLevel::Debug, " WB Value %08x\n", ptr[i >> 2]); + if (tag+i < ITCMSize) + { + *(u32*)&ITCM[(tag+i) & (ITCMPhysicalSize - 1)] = ptr[i >> 2] ; + NDS.JIT.CheckAndInvalidate<0, ARMJIT_Memory::memregion_ITCM>(tag+i); + } else + if (((tag+i) & DTCMMask) == DTCMBase) + { + *(u32*)&DTCM[(tag+i) & (DTCMPhysicalSize - 1)] = ptr[i >> 2]; + } 
else + { + BusWrite32(tag+i, ptr[i >> 2]); + } + } +#endif + DataCycles += (NDS.ARM9MemTimings[tag >> 14][2] + (NDS.ARM9MemTimings[tag >> 14][3] * ((DCACHE_LINELENGTH / 8) - 1))) << NDS.ARM9ClockShift; + } + DCacheTags[index] &= ~(CACHE_FLAG_DIRTY_LOWERHALF | CACHE_FLAG_DIRTY_UPPERHALF); } bool ARMv5::IsAddressDCachable(const u32 addr) const @@ -780,7 +904,6 @@ void ARMv5::CP15Write(const u32 id, const u32 val) } return; - case 0x200: // data cacheable { u32 diff = PU_DataCacheable ^ val; @@ -1674,6 +1797,8 @@ void ARMv5::DataWrite8(const u32 addr, const u8 val) return; } + DataRegion = addr; + #ifdef JIT_ENABLED if (!NDS.IsJITEnabled()) #endif @@ -1682,13 +1807,17 @@ void ARMv5::DataWrite8(const u32 addr, const u8 val) { if (IsAddressDCachable(addr)) { - DCacheWrite8(addr, val); + if (DCacheWrite8(addr, val)) + return; +// DCacheInvalidateByAddr(addr); +// DCacheLookup(addr); +// DCacheWrite8(addr, val); +// DCacheClearByAddr(addr); +// return; } } } - DataRegion = addr; - if (addr < ITCMSize) { DataCycles = 1; @@ -1715,6 +1844,8 @@ void ARMv5::DataWrite16(const u32 addr, const u16 val) return; } + DataRegion = addr; + #ifdef JIT_ENABLED if (!NDS.IsJITEnabled()) #endif @@ -1723,13 +1854,17 @@ void ARMv5::DataWrite16(const u32 addr, const u16 val) { if (IsAddressDCachable(addr)) { - DCacheWrite16(addr, val); + if (DCacheWrite16(addr, val)) + return; +// DCacheInvalidateByAddr(addr); +// DCacheLookup(addr); +// DCacheWrite16(addr, val); +// DCacheClearByAddr(addr); +// return; } } } - DataRegion = addr; - if (addr < ITCMSize) { DataCycles = 1; @@ -1756,6 +1891,8 @@ void ARMv5::DataWrite32(const u32 addr, const u32 val) return; } + DataRegion = addr; + #ifdef JIT_ENABLED if (!NDS.IsJITEnabled()) #endif @@ -1764,13 +1901,17 @@ void ARMv5::DataWrite32(const u32 addr, const u32 val) { if (IsAddressDCachable(addr)) { - DCacheWrite32(addr, val); + if (DCacheWrite32(addr, val)) + return; +// DCacheInvalidateByAddr(addr); +// DCacheLookup(addr); +// DCacheWrite32(addr, 
val); +// DCacheClearByAddr(addr); +// return; } } } - DataRegion = addr; - if (addr < ITCMSize) { DataCycles = 1; @@ -1799,7 +1940,13 @@ void ARMv5::DataWrite32S(const u32 addr, const u32 val) { if (IsAddressDCachable(addr)) { - DCacheWrite32(addr, val); + if (DCacheWrite32(addr, val)) + return; +// DCacheInvalidateByAddr(addr); +// DCacheLookup(addr); +// DCacheWrite32(addr, val); +// DCacheClearByAddr(addr); +// return; } } } diff --git a/src/CP15_Constants.h b/src/CP15_Constants.h index 3b3cbf9a..de8a71f4 100644 --- a/src/CP15_Constants.h +++ b/src/CP15_Constants.h @@ -50,6 +50,7 @@ constexpr u32 CACHE_FLAG_DIRTY_LOWERHALF = (1 << 2); constexpr u32 CACHE_FLAG_DIRTY_UPPERHALF = (1 << 3); constexpr u32 CACHE_FLAG_DIRTY_MASK = (3 << 2); constexpr u32 CACHE_FLAG_SET_MASK = (3 << 0); +constexpr u32 CACHE_FLAG_MASK = 0x1F; /* CP15 Cache Type Register */ constexpr u32 CACHE_TR_LOCKDOWN_TYPE_B = (7 << 25); @@ -118,6 +119,9 @@ constexpr u32 CP15_CR_CHANGEABLE_MASK = CP15_CR_MPUENABLE | CP15_CR_BIGENDIAN | | CP15_TCM_CR_DTCM_ENABLE | CP15_TCM_CR_ITCM_ENABLE | CP15_TCM_CR_DTCM_LOADMODE | CP15_TCM_CR_ITCM_LOADMODE | CP15_CACHE_CR_ROUNDROBIN | CP15_CR_DISABLE_THUMBBIT; + /* Note: ARM946E-S Technical reference manual, Chapter 6.5.2 "You cannot directly enable or disable the write buffer" + CP15_CACHE_CR_WRITEBUFFERENABLE is always set on the cp15 + */ /* CP15 Internal Exception base value */ constexpr u32 CP15_EXCEPTIONBASE_HIGH = 0xFFFF0000; From a8306f2aa002506acfd97c9b6ed891d239a04f91 Mon Sep 17 00:00:00 2001 From: DesperateProgrammer Date: Tue, 6 Feb 2024 08:29:01 +0100 Subject: [PATCH 036/306] Removed some debug remains Updated documenting comments for the DCacheClear* methods --- src/ARM.h | 36 ++++++++++++++++++------------------ src/CP15.cpp | 32 +------------------------------- 2 files changed, 19 insertions(+), 49 deletions(-) diff --git a/src/ARM.h b/src/ARM.h index 9f0cbf64..b1ba3c29 100644 --- a/src/ARM.h +++ b/src/ARM.h @@ -533,12 +533,12 @@ public: /** * 
@brief Cleans the entire data cache * @details - * In melonDS, the data cache is instantly cleaned on writes, the - * @ref CACHE_FLAG_DIRTY_LOWERHALF and @ref CACHE_FLAG_DIRTY_UPPERHALF are - * not set. - * If they are implemented at a later time, the cache content has to be - * written to memory, the dirty bit cleared. The call should require - * as much cycles as needed for this write operation. + * If write-back is enabled in conjunction with the data cache + * the dirty flags in tags are set if the corresponding cache + * line is written to. + * A clean will write the parts of the cache line back + * that is marked dirty and adds the required cycles to the + * @ref DataCyces member. * @par Returns * Nothing */ @@ -547,12 +547,12 @@ public: /** * @brief Cleans a data cache line * @details - * In melonDS, the data cache is instantly cleaned on writes, the - * @ref CACHE_FLAG_DIRTY_LOWERHALF and @ref CACHE_FLAG_DIRTY_UPPERHALF are - * not set. - * If they are implemented at a later time, the cache content has to be - * written to memory, the dirty bit cleared. The call should require - * as much cycles as needed for this write operation. + * If write-back is enabled in conjunction with the data cache + * the dirty flags in tags are set if the corresponding cache + * line is written to. + * A clean will write the parts of the cache line back + * that is marked dirty and adds the required cycles to the + * @ref DataCyces member. * @param [in] addr Memory address of the data in the cache line * @par Returns * Nothing @@ -562,12 +562,12 @@ public: /** * @brief Cleans a data cache line * @details - * In melonDS, the data cache is instantly cleaned on writes, the - * @ref CACHE_FLAG_DIRTY_LOWERHALF and @ref CACHE_FLAG_DIRTY_UPPERHALF are - * not set. - * If they are implemented at a later time, the cache content has to be - * written to memory, the dirty bit cleared. The call should require - * as much cycles as needed for this write operation. 
+ * If write-back is enabled in conjunction with the data cache + * the dirty flags in tags are set if the corresponding cache + * line is written to. + * A clean will write the parts of the cache line back + * that is marked dirty and adds the required cycles to the + * @ref DataCyces member. * @param [in] cacheSet index of the internal cache set from * 0 to @ref DCACHE_SETS - 1 * @param [in] cacheLine index of the line within the cache set diff --git a/src/CP15.cpp b/src/CP15.cpp index 22be6a52..66d26b03 100644 --- a/src/CP15.cpp +++ b/src/CP15.cpp @@ -641,13 +641,7 @@ u32 ARMv5::DCacheLookup(const u32 addr) { ptr[i >> 2] = BusRead32(tag+i); } - //Log(LogLevel::Debug,"DCache store @ %08x: %08x in set %i, line %i\n", tag+i, *(u32*)&ptr[i >> 2], line & 3, line >> 2); - if ((addr == 0x02007ea0) && ((*(u32*)&ptr[i]) == 0x1239 )) - { - // Halt(1); - // exit(1); - } - + //Log(LogLevel::Debug,"DCache store @ %08x: %08x in set %i, line %i\n", tag+i, *(u32*)&ptr[i >> 2], line & 3, line >> 2); } DCacheTags[line] = tag | (line & (DCACHE_SETS-1)) | CACHE_FLAG_VALID; @@ -829,7 +823,6 @@ void ARMv5::DCacheClearByASetAndWay(const u8 cacheSet, const u8 cacheLine) if (DCacheTags[index] & CACHE_FLAG_DIRTY_LOWERHALF) { //Log(LogLevel::Debug, "Writing back %i / %i, lower half -> %08lx\n", cacheSet, cacheLine, tag); -#if 1 for (int i = 0; i < DCACHE_LINELENGTH / 2; i+=sizeof(u32)) { //Log(LogLevel::Debug, " WB Value %08x\n", ptr[i >> 2]); @@ -846,13 +839,11 @@ void ARMv5::DCacheClearByASetAndWay(const u8 cacheSet, const u8 cacheLine) BusWrite32(tag+i, ptr[i >> 2]); } } -#endif DataCycles += (NDS.ARM9MemTimings[tag >> 14][2] + (NDS.ARM9MemTimings[tag >> 14][3] * ((DCACHE_LINELENGTH / 8) - 1))) << NDS.ARM9ClockShift; } if (DCacheTags[index] & CACHE_FLAG_DIRTY_UPPERHALF) { //Log(LogLevel::Debug, "Writing back %i / %i, upper half-> %08lx\n", cacheSet, cacheLine, tag); -#if 1 for (int i = DCACHE_LINELENGTH / 2; i < DCACHE_LINELENGTH; i+=sizeof(u32)) { //Log(LogLevel::Debug, " WB Value 
%08x\n", ptr[i >> 2]); @@ -869,7 +860,6 @@ void ARMv5::DCacheClearByASetAndWay(const u8 cacheSet, const u8 cacheLine) BusWrite32(tag+i, ptr[i >> 2]); } } -#endif DataCycles += (NDS.ARM9MemTimings[tag >> 14][2] + (NDS.ARM9MemTimings[tag >> 14][3] * ((DCACHE_LINELENGTH / 8) - 1))) << NDS.ARM9ClockShift; } DCacheTags[index] &= ~(CACHE_FLAG_DIRTY_LOWERHALF | CACHE_FLAG_DIRTY_UPPERHALF); @@ -1809,11 +1799,6 @@ void ARMv5::DataWrite8(const u32 addr, const u8 val) { if (DCacheWrite8(addr, val)) return; -// DCacheInvalidateByAddr(addr); -// DCacheLookup(addr); -// DCacheWrite8(addr, val); -// DCacheClearByAddr(addr); -// return; } } } @@ -1856,11 +1841,6 @@ void ARMv5::DataWrite16(const u32 addr, const u16 val) { if (DCacheWrite16(addr, val)) return; -// DCacheInvalidateByAddr(addr); -// DCacheLookup(addr); -// DCacheWrite16(addr, val); -// DCacheClearByAddr(addr); -// return; } } } @@ -1903,11 +1883,6 @@ void ARMv5::DataWrite32(const u32 addr, const u32 val) { if (DCacheWrite32(addr, val)) return; -// DCacheInvalidateByAddr(addr); -// DCacheLookup(addr); -// DCacheWrite32(addr, val); -// DCacheClearByAddr(addr); -// return; } } } @@ -1942,11 +1917,6 @@ void ARMv5::DataWrite32S(const u32 addr, const u32 val) { if (DCacheWrite32(addr, val)) return; -// DCacheInvalidateByAddr(addr); -// DCacheLookup(addr); -// DCacheWrite32(addr, val); -// DCacheClearByAddr(addr); -// return; } } } From 4164687bd29acbc0baed7adaa8c1dcfa81842bfa Mon Sep 17 00:00:00 2001 From: DesperateProgrammer Date: Tue, 6 Feb 2024 10:18:53 +0100 Subject: [PATCH 037/306] Fixed an issue with caclulating DTCM/ITCM masks after addr was declared constant Encapsuled the cache features in #if to disable the features via compile flags --- src/CP15.cpp | 405 +++++++++++++++++++++++++++------------------------ 1 file changed, 218 insertions(+), 187 deletions(-) diff --git a/src/CP15.cpp b/src/CP15.cpp index 66d26b03..c25470bf 100644 --- a/src/CP15.cpp +++ b/src/CP15.cpp @@ -623,10 +623,11 @@ u32 
ARMv5::DCacheLookup(const u32 addr) u32* ptr = (u32 *)&DCache[line << DCACHE_LINELENGTH_LOG2]; DataCycles = 0; - // Before we fill the cacheline, we need to write back dirty content - // Datacycles will be incremented by the required cycles to do so - DCacheClearByASetAndWay(line & (DCACHE_SETS-1), line >> DCACHE_SETS_LOG2); - + #if !DISABLE_CACHEWRITEBACK + // Before we fill the cacheline, we need to write back dirty content + // Datacycles will be incremented by the required cycles to do so + DCacheClearByASetAndWay(line & (DCACHE_SETS-1), line >> DCACHE_SETS_LOG2); + #endif //Log(LogLevel::Debug,"DCache miss, load @ %08x\n", tag); for (int i = 0; i < DCACHE_LINELENGTH; i+=sizeof(u32)) { @@ -667,19 +668,21 @@ bool ARMv5::DCacheWrite32(const u32 addr, const u32 val) u32 *cacheLine = (u32 *)&DCache[(id+set) << DCACHE_LINELENGTH_LOG2]; cacheLine[(addr & (DCACHE_LINELENGTH-1)) >> 2] = val; DataCycles = 1; - //if (PU_Map[addr >> CP15_MAP_ENTRYSIZE_LOG2] & CP15_MAP_DCACHEWRITEBACK) - { - if (addr & (DCACHE_LINELENGTH / 2)) + #if !DISABLE_CACHEWRITEBACK + if (PU_Map[addr >> CP15_MAP_ENTRYSIZE_LOG2] & CP15_MAP_DCACHEWRITEBACK) { - DCacheTags[id+set] |= CACHE_FLAG_DIRTY_UPPERHALF ; + if (addr & (DCACHE_LINELENGTH / 2)) + { + DCacheTags[id+set] |= CACHE_FLAG_DIRTY_UPPERHALF ; + } + else + { + DCacheTags[id+set] |= CACHE_FLAG_DIRTY_LOWERHALF ; + } + // just mark dirty and abort the data write through the bus + return true; } - else - { - DCacheTags[id+set] |= CACHE_FLAG_DIRTY_LOWERHALF ; - } - // just mark dirty and abort the data write through the bus - return true; - } + #endif return false; } } @@ -699,19 +702,21 @@ bool ARMv5::DCacheWrite16(const u32 addr, const u16 val) u16 *cacheLine = (u16 *)&DCache[(id+set) << DCACHE_LINELENGTH_LOG2]; cacheLine[(addr & (DCACHE_LINELENGTH-1)) >> 1] = val; DataCycles = 1; - //if (PU_Map[addr >> CP15_MAP_ENTRYSIZE_LOG2] & CP15_MAP_DCACHEWRITEBACK) - { - if (addr & (DCACHE_LINELENGTH / 2)) + #if !DISABLE_CACHEWRITEBACK + if (PU_Map[addr 
>> CP15_MAP_ENTRYSIZE_LOG2] & CP15_MAP_DCACHEWRITEBACK) { - DCacheTags[id+set] |= CACHE_FLAG_DIRTY_UPPERHALF ; + if (addr & (DCACHE_LINELENGTH / 2)) + { + DCacheTags[id+set] |= CACHE_FLAG_DIRTY_UPPERHALF ; + } + else + { + DCacheTags[id+set] |= CACHE_FLAG_DIRTY_LOWERHALF ; + } + // just mark dirtyand abort the data write through the bus + return true; } - else - { - DCacheTags[id+set] |= CACHE_FLAG_DIRTY_LOWERHALF ; - } - // just mark dirtyand abort the data write through the bus - return true; - } + #endif return false; } } @@ -732,20 +737,22 @@ bool ARMv5::DCacheWrite8(const u32 addr, const u8 val) u8 *cacheLine = &DCache[(id+set) << DCACHE_LINELENGTH_LOG2]; cacheLine[addr & (DCACHE_LINELENGTH-1)] = val; DataCycles = 1; - //if (PU_Map[addr >> CP15_MAP_ENTRYSIZE_LOG2] & CP15_MAP_DCACHEWRITEBACK) - { - if (addr & (DCACHE_LINELENGTH / 2)) + #if !DISABLE_CACHEWRITEBACK + if (PU_Map[addr >> CP15_MAP_ENTRYSIZE_LOG2] & CP15_MAP_DCACHEWRITEBACK) { - DCacheTags[id+set] |= CACHE_FLAG_DIRTY_UPPERHALF ; + if (addr & (DCACHE_LINELENGTH / 2)) + { + DCacheTags[id+set] |= CACHE_FLAG_DIRTY_UPPERHALF ; + } + else + { + DCacheTags[id+set] |= CACHE_FLAG_DIRTY_LOWERHALF ; + } + + // just mark dirty and abort the data write through the bus + return true; } - else - { - DCacheTags[id+set] |= CACHE_FLAG_DIRTY_LOWERHALF ; - } - - // just mark dirty and abort the data write through the bus - return true; - } + #endif return false; } } @@ -789,80 +796,86 @@ void ARMv5::DCacheInvalidateAll() void ARMv5::DCacheClearAll() { - for (int set = 0;set> DCACHE_LINELENGTH_LOG2) & (DCACHE_LINESPERSET-1)) << DCACHE_SETS_LOG2; + #if !DISABLE_CACHEWRITEBACK + const u32 tag = (addr & ~(DCACHE_LINELENGTH - 1)) | CACHE_FLAG_VALID; + const u32 id = ((addr >> DCACHE_LINELENGTH_LOG2) & (DCACHE_LINESPERSET-1)) << DCACHE_SETS_LOG2; - for (int set=0;set> DCACHE_SETS_LOG2); - return; + if ((DCacheTags[id+set] & ~(CACHE_FLAG_DIRTY_MASK | CACHE_FLAG_SET_MASK)) == tag) + { + DCacheClearByASetAndWay(set, id >> 
DCACHE_SETS_LOG2); + return; + } } - } + #endif } void ARMv5::DCacheClearByASetAndWay(const u8 cacheSet, const u8 cacheLine) { - const u32 index = cacheSet | (cacheLine << DCACHE_SETS_LOG2); + #if !DISABLE_CACHEWRITEBACK + const u32 index = cacheSet | (cacheLine << DCACHE_SETS_LOG2); - // Only write back if valid - if (!(DCacheTags[index] & CACHE_FLAG_VALID)) - return ; + // Only write back if valid + if (!(DCacheTags[index] & CACHE_FLAG_VALID)) + return ; - const u32 tag = DCacheTags[index] & ~CACHE_FLAG_MASK; - u32* ptr = (u32 *)&DCache[index << DCACHE_LINELENGTH_LOG2]; + const u32 tag = DCacheTags[index] & ~CACHE_FLAG_MASK; + u32* ptr = (u32 *)&DCache[index << DCACHE_LINELENGTH_LOG2]; - if (DCacheTags[index] & CACHE_FLAG_DIRTY_LOWERHALF) - { - //Log(LogLevel::Debug, "Writing back %i / %i, lower half -> %08lx\n", cacheSet, cacheLine, tag); - for (int i = 0; i < DCACHE_LINELENGTH / 2; i+=sizeof(u32)) + if (DCacheTags[index] & CACHE_FLAG_DIRTY_LOWERHALF) { - //Log(LogLevel::Debug, " WB Value %08x\n", ptr[i >> 2]); - if (tag+i < ITCMSize) + //Log(LogLevel::Debug, "Writing back %i / %i, lower half -> %08lx\n", cacheSet, cacheLine, tag); + for (int i = 0; i < DCACHE_LINELENGTH / 2; i+=sizeof(u32)) { - *(u32*)&ITCM[(tag+i) & (ITCMPhysicalSize - 1)] = ptr[i >> 2] ; - NDS.JIT.CheckAndInvalidate<0, ARMJIT_Memory::memregion_ITCM>(tag+i); - } else - if (((tag+i) & DTCMMask) == DTCMBase) - { - *(u32*)&DTCM[(tag+i) & (DTCMPhysicalSize - 1)] = ptr[i >> 2]; - } else - { - BusWrite32(tag+i, ptr[i >> 2]); + //Log(LogLevel::Debug, " WB Value %08x\n", ptr[i >> 2]); + if (tag+i < ITCMSize) + { + *(u32*)&ITCM[(tag+i) & (ITCMPhysicalSize - 1)] = ptr[i >> 2] ; + NDS.JIT.CheckAndInvalidate<0, ARMJIT_Memory::memregion_ITCM>(tag+i); + } else + if (((tag+i) & DTCMMask) == DTCMBase) + { + *(u32*)&DTCM[(tag+i) & (DTCMPhysicalSize - 1)] = ptr[i >> 2]; + } else + { + BusWrite32(tag+i, ptr[i >> 2]); + } } + DataCycles += (NDS.ARM9MemTimings[tag >> 14][2] + (NDS.ARM9MemTimings[tag >> 14][3] * 
((DCACHE_LINELENGTH / 8) - 1))) << NDS.ARM9ClockShift; } - DataCycles += (NDS.ARM9MemTimings[tag >> 14][2] + (NDS.ARM9MemTimings[tag >> 14][3] * ((DCACHE_LINELENGTH / 8) - 1))) << NDS.ARM9ClockShift; - } - if (DCacheTags[index] & CACHE_FLAG_DIRTY_UPPERHALF) - { - //Log(LogLevel::Debug, "Writing back %i / %i, upper half-> %08lx\n", cacheSet, cacheLine, tag); - for (int i = DCACHE_LINELENGTH / 2; i < DCACHE_LINELENGTH; i+=sizeof(u32)) + if (DCacheTags[index] & CACHE_FLAG_DIRTY_UPPERHALF) { - //Log(LogLevel::Debug, " WB Value %08x\n", ptr[i >> 2]); - if (tag+i < ITCMSize) + //Log(LogLevel::Debug, "Writing back %i / %i, upper half-> %08lx\n", cacheSet, cacheLine, tag); + for (int i = DCACHE_LINELENGTH / 2; i < DCACHE_LINELENGTH; i+=sizeof(u32)) { - *(u32*)&ITCM[(tag+i) & (ITCMPhysicalSize - 1)] = ptr[i >> 2] ; - NDS.JIT.CheckAndInvalidate<0, ARMJIT_Memory::memregion_ITCM>(tag+i); - } else - if (((tag+i) & DTCMMask) == DTCMBase) - { - *(u32*)&DTCM[(tag+i) & (DTCMPhysicalSize - 1)] = ptr[i >> 2]; - } else - { - BusWrite32(tag+i, ptr[i >> 2]); + //Log(LogLevel::Debug, " WB Value %08x\n", ptr[i >> 2]); + if (tag+i < ITCMSize) + { + *(u32*)&ITCM[(tag+i) & (ITCMPhysicalSize - 1)] = ptr[i >> 2] ; + NDS.JIT.CheckAndInvalidate<0, ARMJIT_Memory::memregion_ITCM>(tag+i); + } else + if (((tag+i) & DTCMMask) == DTCMBase) + { + *(u32*)&DTCM[(tag+i) & (DTCMPhysicalSize - 1)] = ptr[i >> 2]; + } else + { + BusWrite32(tag+i, ptr[i >> 2]); + } } + DataCycles += (NDS.ARM9MemTimings[tag >> 14][2] + (NDS.ARM9MemTimings[tag >> 14][3] * ((DCACHE_LINELENGTH / 8) - 1))) << NDS.ARM9ClockShift; } - DataCycles += (NDS.ARM9MemTimings[tag >> 14][2] + (NDS.ARM9MemTimings[tag >> 14][3] * ((DCACHE_LINELENGTH / 8) - 1))) << NDS.ARM9ClockShift; - } - DCacheTags[index] &= ~(CACHE_FLAG_DIRTY_LOWERHALF | CACHE_FLAG_DIRTY_UPPERHALF); + DCacheTags[index] &= ~(CACHE_FLAG_DIRTY_LOWERHALF | CACHE_FLAG_DIRTY_UPPERHALF); + #endif } bool ARMv5::IsAddressDCachable(const u32 addr) const @@ -898,7 +911,7 @@ void 
ARMv5::CP15Write(const u32 id, const u32 val) { u32 diff = PU_DataCacheable ^ val; PU_DataCacheable = val; - #if 0 + #if 1 // This code just updates the PU_Map entries of the given region // this works fine, if the regions do not overlap // If overlapping and the least priority region cachable bit @@ -924,7 +937,7 @@ void ARMv5::CP15Write(const u32 id, const u32 val) { u32 diff = PU_CodeCacheable ^ val; PU_CodeCacheable = val; - #if 0 + #if 1 // This code just updates the PU_Map entries of the given region // this works fine, if the regions do not overlap // If overlapping and the least priority region cachable bit @@ -951,7 +964,7 @@ void ARMv5::CP15Write(const u32 id, const u32 val) { u32 diff = PU_DataCacheWrite ^ val; PU_DataCacheWrite = val; - #if 0 + #if 1 // This code just updates the PU_Map entries of the given region // this works fine, if the regions do not overlap // If overlapping and the least priority region write buffer @@ -983,7 +996,7 @@ void ARMv5::CP15Write(const u32 id, const u32 val) for (int i=0;i> (i * 2) & 3) << (i * CP15_REGIONACCESS_BITS_PER_REGION); - #if 0 + #if 1 // This code just updates the PU_Map entries of the given region // this works fine, if the regions do not overlap // If overlapping and the least priority region access permission @@ -1015,7 +1028,7 @@ void ARMv5::CP15Write(const u32 id, const u32 val) for (int i=0;i> (i * 2) & 3) << (i * CP15_REGIONACCESS_BITS_PER_REGION); - #if 0 + #if 1 // This code just updates the PU_Map entries of the given region // this works fine, if the regions do not overlap // If overlapping and the least priority region access permission @@ -1041,7 +1054,7 @@ void ARMv5::CP15Write(const u32 id, const u32 val) { u32 diff = PU_DataRW ^ val; PU_DataRW = val; - #if 0 + #if 1 // This code just updates the PU_Map entries of the given region // this works fine, if the regions do not overlap // If overlapping and the least priority region access permission @@ -1065,7 +1078,7 @@ void ARMv5::CP15Write(const 
u32 id, const u32 val) { u32 diff = PU_CodeRW ^ val; PU_CodeRW = val; - #if 0 + #if 1 // This code just updates the PU_Map entries of the given region // this works fine, if the regions do not overlap // If overlapping and the least priority region access permission @@ -1399,7 +1412,7 @@ void ARMv5::CP15Write(const u32 id, const u32 val) uint8_t segment = (CacheDebugRegisterIndex >> (32-DCACHE_SETS_LOG2)) & (DCACHE_SETS-1); uint8_t wordAddress = (CacheDebugRegisterIndex & (DCACHE_LINELENGTH-1)) >> 2; uint8_t index = (CacheDebugRegisterIndex >> DCACHE_LINELENGTH_LOG2) & (DCACHE_LINESPERSET-1); - *(u32 *)&DCache[((index << DCACHE_SETS_LOG2) + segment) << DCACHE_LINELENGTH_LOG2 + wordAddress*4] = val; + *(u32 *)&DCache[(((index << DCACHE_SETS_LOG2) + segment) << DCACHE_LINELENGTH_LOG2) + wordAddress*4] = val; } return; @@ -1488,7 +1501,7 @@ u32 ARMv5::CP15Read(const u32 id) const case 0x661: case 0x670: case 0x671: - return PU_Region[(id >> CP15_REGIONACCESS_BITS_PER_REGION) & 0xF]; + return PU_Region[(id >> 4) & 0xF]; case 0x7A6: // read Cache Dirty Bit (optional) @@ -1585,18 +1598,20 @@ u32 ARMv5::CP15Read(const u32 id) const u32 ARMv5::CodeRead32(const u32 addr, bool const branch) { -#ifdef JIT_ENABLED - if (!NDS.IsJITEnabled()) -#endif - { - if (CP15Control & CP15_CACHE_CR_ICACHEENABLE) + #if !DISABLE_ICACHE + #ifdef JIT_ENABLED + if (!NDS.IsJITEnabled()) + #endif { - if (IsAddressICachable(addr)) + if (CP15Control & CP15_CACHE_CR_ICACHEENABLE) { - return ICacheLookup(addr); + if (IsAddressICachable(addr)) + { + return ICacheLookup(addr); + } } } - } + #endif if (addr < ITCMSize) { @@ -1631,19 +1646,21 @@ void ARMv5::DataRead8(const u32 addr, u32* val) DataRegion = addr; -#ifdef JIT_ENABLED - if (!NDS.IsJITEnabled()) -#endif - { - if (CP15Control & CP15_CACHE_CR_DCACHEENABLE) + #if !DISABLE_DCACHE + #ifdef JIT_ENABLED + if (!NDS.IsJITEnabled()) + #endif { - if (IsAddressDCachable(addr)) + if (CP15Control & CP15_CACHE_CR_DCACHEENABLE) { - *val = (DCacheLookup(addr) 
>> (8 * (addr & 3))) & 0xff; - return; + if (IsAddressDCachable(addr)) + { + *val = (DCacheLookup(addr) >> (8 * (addr & 3))) & 0xff; + return; + } } } - } + #endif if (addr < ITCMSize) { @@ -1673,19 +1690,21 @@ void ARMv5::DataRead16(const u32 addr, u32* val) DataRegion = addr; -#ifdef JIT_ENABLED - if (!NDS.IsJITEnabled()) -#endif - { - if (CP15Control & CP15_CACHE_CR_DCACHEENABLE) + #if !DISABLE_DCACHE + #ifdef JIT_ENABLED + if (!NDS.IsJITEnabled()) + #endif { - if (IsAddressDCachable(addr)) + if (CP15Control & CP15_CACHE_CR_DCACHEENABLE) { - *val = (DCacheLookup(addr) >> (8* (addr & 2))) & 0xffff; - return; + if (IsAddressDCachable(addr)) + { + *val = (DCacheLookup(addr) >> (8* (addr & 2))) & 0xffff; + return; + } } } - } + #endif if (addr < ITCMSize) { @@ -1715,30 +1734,32 @@ void ARMv5::DataRead32(const u32 addr, u32* val) DataRegion = addr; -#ifdef JIT_ENABLED - if (!NDS.IsJITEnabled()) -#endif - { - if (CP15Control & CP15_CACHE_CR_DCACHEENABLE) + #if !DISABLE_DCACHE + #ifdef JIT_ENABLED + if (!NDS.IsJITEnabled()) + #endif { - if (IsAddressDCachable(addr)) + if (CP15Control & CP15_CACHE_CR_DCACHEENABLE) { - *val = DCacheLookup(addr); - return; + if (IsAddressDCachable(addr)) + { + *val = DCacheLookup(addr); + return; + } } } - } + #endif if (addr < ITCMSize) { DataCycles = 1; - *val = *(u32*)&ITCM[addr & (ITCMPhysicalSize - 3)]; + *val = *(u32*)&ITCM[addr & (ITCMPhysicalSize - 4)]; return; } if ((addr & DTCMMask) == DTCMBase) { DataCycles = 1; - *val = *(u32*)&DTCM[addr & (DTCMPhysicalSize - 3)]; + *val = *(u32*)&DTCM[addr & (DTCMPhysicalSize - 4)]; return; } @@ -1748,30 +1769,32 @@ void ARMv5::DataRead32(const u32 addr, u32* val) void ARMv5::DataRead32S(const u32 addr, u32* val) { -#ifdef JIT_ENABLED - if (!NDS.IsJITEnabled()) -#endif - { - if (CP15Control & CP15_CACHE_CR_DCACHEENABLE) + #if !DISABLE_DCACHE + #ifdef JIT_ENABLED + if (!NDS.IsJITEnabled()) + #endif { - if (IsAddressDCachable(addr)) + if (CP15Control & CP15_CACHE_CR_DCACHEENABLE) { - *val = 
DCacheLookup(addr); - return; + if (IsAddressDCachable(addr)) + { + *val = DCacheLookup(addr); + return; + } } } - } + #endif if (addr < ITCMSize) { DataCycles += 1; - *val = *(u32*)&ITCM[addr & (ITCMPhysicalSize - 3)]; + *val = *(u32*)&ITCM[addr & (ITCMPhysicalSize - 4)]; return; } if ((addr & DTCMMask) == DTCMBase) { DataCycles += 1; - *val = *(u32*)&DTCM[addr & (DTCMPhysicalSize - 3)]; + *val = *(u32*)&DTCM[addr & (DTCMPhysicalSize - 4)]; return; } @@ -1789,19 +1812,21 @@ void ARMv5::DataWrite8(const u32 addr, const u8 val) DataRegion = addr; -#ifdef JIT_ENABLED - if (!NDS.IsJITEnabled()) -#endif - { - if (CP15Control & CP15_CACHE_CR_DCACHEENABLE) + #if !DISABLE_DCACHE + #ifdef JIT_ENABLED + if (!NDS.IsJITEnabled()) + #endif { - if (IsAddressDCachable(addr)) + if (CP15Control & CP15_CACHE_CR_DCACHEENABLE) { - if (DCacheWrite8(addr, val)) - return; + if (IsAddressDCachable(addr)) + { + if (DCacheWrite8(addr, val)) + return; + } } } - } + #endif if (addr < ITCMSize) { @@ -1831,19 +1856,21 @@ void ARMv5::DataWrite16(const u32 addr, const u16 val) DataRegion = addr; -#ifdef JIT_ENABLED - if (!NDS.IsJITEnabled()) -#endif - { - if (CP15Control & CP15_CACHE_CR_DCACHEENABLE) + #if !DISABLE_DCACHE + #ifdef JIT_ENABLED + if (!NDS.IsJITEnabled()) + #endif { - if (IsAddressDCachable(addr)) + if (CP15Control & CP15_CACHE_CR_DCACHEENABLE) { - if (DCacheWrite16(addr, val)) - return; + if (IsAddressDCachable(addr)) + { + if (DCacheWrite16(addr, val)) + return; + } } } - } + #endif if (addr < ITCMSize) { @@ -1873,31 +1900,33 @@ void ARMv5::DataWrite32(const u32 addr, const u32 val) DataRegion = addr; -#ifdef JIT_ENABLED - if (!NDS.IsJITEnabled()) -#endif - { - if (CP15Control & CP15_CACHE_CR_DCACHEENABLE) + #if !DISABLE_DCACHE + #ifdef JIT_ENABLED + if (!NDS.IsJITEnabled()) + #endif { - if (IsAddressDCachable(addr)) + if (CP15Control & CP15_CACHE_CR_DCACHEENABLE) { - if (DCacheWrite32(addr, val)) - return; + if (IsAddressDCachable(addr)) + { + if (DCacheWrite32(addr, val)) + 
return; + } } } - } + #endif if (addr < ITCMSize) { DataCycles = 1; - *(u32*)&ITCM[addr & (ITCMPhysicalSize - 3)] = val; + *(u32*)&ITCM[addr & (ITCMPhysicalSize - 4)] = val; NDS.JIT.CheckAndInvalidate<0, ARMJIT_Memory::memregion_ITCM>(addr); return; } if ((addr & DTCMMask) == DTCMBase) { DataCycles = 1; - *(u32*)&DTCM[addr & (DTCMPhysicalSize - 3)] = val; + *(u32*)&DTCM[addr & (DTCMPhysicalSize - 4)] = val; return; } @@ -1907,24 +1936,26 @@ void ARMv5::DataWrite32(const u32 addr, const u32 val) void ARMv5::DataWrite32S(const u32 addr, const u32 val) { -#ifdef JIT_ENABLED - if (!NDS.IsJITEnabled()) -#endif - { - if (CP15Control & CP15_CACHE_CR_DCACHEENABLE) + #if !DISABLE_DCACHE + #ifdef JIT_ENABLED + if (!NDS.IsJITEnabled()) + #endif { - if (IsAddressDCachable(addr)) + if (CP15Control & CP15_CACHE_CR_DCACHEENABLE) { - if (DCacheWrite32(addr, val)) - return; + if (IsAddressDCachable(addr)) + { + if (DCacheWrite32(addr, val)) + return; + } } } - } + #endif if (addr < ITCMSize) { DataCycles += 1; - *(u32*)&ITCM[addr & (ITCMPhysicalSize - 3)] = val; + *(u32*)&ITCM[addr & (ITCMPhysicalSize - 4)] = val; #ifdef JIT_ENABLED NDS.JIT.CheckAndInvalidate<0, ARMJIT_Memory::memregion_ITCM>(addr); #endif @@ -1933,7 +1964,7 @@ void ARMv5::DataWrite32S(const u32 addr, const u32 val) if ((addr & DTCMMask) == DTCMBase) { DataCycles += 1; - *(u32*)&DTCM[addr & (DTCMPhysicalSize - 3)] = val; + *(u32*)&DTCM[addr & (DTCMPhysicalSize - 4)] = val; return; } From 01bb5f1fe2796a73d1aa3e5995a53d22d997a4c5 Mon Sep 17 00:00:00 2001 From: DesperateProgrammer Date: Tue, 6 Feb 2024 11:17:23 +0100 Subject: [PATCH 038/306] Enabled Overlapping region fix again --- src/CP15.cpp | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/src/CP15.cpp b/src/CP15.cpp index c25470bf..f2fb9b75 100644 --- a/src/CP15.cpp +++ b/src/CP15.cpp @@ -911,7 +911,7 @@ void ARMv5::CP15Write(const u32 id, const u32 val) { u32 diff = PU_DataCacheable ^ val; PU_DataCacheable = val; - #if 1 + #if 0 // 
This code just updates the PU_Map entries of the given region // this works fine, if the regions do not overlap // If overlapping and the least priority region cachable bit @@ -937,7 +937,7 @@ void ARMv5::CP15Write(const u32 id, const u32 val) { u32 diff = PU_CodeCacheable ^ val; PU_CodeCacheable = val; - #if 1 + #if 0 // This code just updates the PU_Map entries of the given region // this works fine, if the regions do not overlap // If overlapping and the least priority region cachable bit @@ -964,7 +964,7 @@ void ARMv5::CP15Write(const u32 id, const u32 val) { u32 diff = PU_DataCacheWrite ^ val; PU_DataCacheWrite = val; - #if 1 + #if 0 // This code just updates the PU_Map entries of the given region // this works fine, if the regions do not overlap // If overlapping and the least priority region write buffer @@ -996,7 +996,7 @@ void ARMv5::CP15Write(const u32 id, const u32 val) for (int i=0;i> (i * 2) & 3) << (i * CP15_REGIONACCESS_BITS_PER_REGION); - #if 1 + #if 0 // This code just updates the PU_Map entries of the given region // this works fine, if the regions do not overlap // If overlapping and the least priority region access permission @@ -1028,7 +1028,7 @@ void ARMv5::CP15Write(const u32 id, const u32 val) for (int i=0;i> (i * 2) & 3) << (i * CP15_REGIONACCESS_BITS_PER_REGION); - #if 1 + #if 0 // This code just updates the PU_Map entries of the given region // this works fine, if the regions do not overlap // If overlapping and the least priority region access permission @@ -1054,7 +1054,7 @@ void ARMv5::CP15Write(const u32 id, const u32 val) { u32 diff = PU_DataRW ^ val; PU_DataRW = val; - #if 1 + #if 0 // This code just updates the PU_Map entries of the given region // this works fine, if the regions do not overlap // If overlapping and the least priority region access permission @@ -1078,7 +1078,7 @@ void ARMv5::CP15Write(const u32 id, const u32 val) { u32 diff = PU_CodeRW ^ val; PU_CodeRW = val; - #if 1 + #if 0 // This code just updates the PU_Map 
entries of the given region // this works fine, if the regions do not overlap // If overlapping and the least priority region access permission From b2d196cd64bcec42ed10836f587fa2d7c4bca7cc Mon Sep 17 00:00:00 2001 From: DesperateProgrammer Date: Wed, 7 Feb 2024 08:10:27 +0100 Subject: [PATCH 039/306] Formatting corrections Removed premature optimization and replaced them with [[(un)likely]] --- src/CP15.cpp | 139 ++++++++++++++++++--------------------------------- 1 file changed, 50 insertions(+), 89 deletions(-) diff --git a/src/CP15.cpp b/src/CP15.cpp index f2fb9b75..be0c0839 100644 --- a/src/CP15.cpp +++ b/src/CP15.cpp @@ -296,7 +296,7 @@ void ARMv5::UpdatePURegions(const bool update_all) // PU disabled u8 mask = CP15_MAP_READABLE | CP15_MAP_WRITEABLE | CP15_MAP_EXECUTABLE; - if (CP15Control & CP15_CACHE_CR_DCACHEENABLE) mask |= CP15_MAP_DCACHEABLE | CP15_MAP_DCACHEWRITEBACK ; + if (CP15Control & CP15_CACHE_CR_DCACHEENABLE) mask |= CP15_MAP_DCACHEABLE | CP15_MAP_DCACHEWRITEBACK; if (CP15Control & CP15_CACHE_CR_ICACHEENABLE) mask |= CP15_MAP_ICACHEABLE; memset(PU_UserMap, mask, CP15_MAP_ENTRYCOUNT); @@ -371,13 +371,13 @@ u32 ARMv5::ICacheLookup(const u32 addr) const u32 tag = (addr & ~(ICACHE_LINELENGTH - 1)); const u32 id = ((addr >> ICACHE_LINELENGTH_LOG2) & (ICACHE_LINESPERSET-1)) << ICACHE_SETS_LOG2; - for (int set=0;set> 14][2]; if (CodeMem.Mem) @@ -412,17 +412,8 @@ u32 ARMv5::ICacheLookup(const u32 addr) } u32 line; -#if 0 - // caclulate in which cacheline the data is to be filled - // The code below is doing the same as the if-less below - // It increases performance by reducing banches. - // The code is kept here for readability. - // - // NOTE: If you need to update either part, you need - // to update the other too to keep them in sync! 
- // - if (CP15Control & CP15_CACHE_CR_ROUNDROBIN) + if (CP15Control & CP15_CACHE_CR_ROUNDROBIN) [[likely]] { line = ICacheCount; ICacheCount = (line+1) & (ICACHE_SETS-1); @@ -434,7 +425,7 @@ u32 ARMv5::ICacheLookup(const u32 addr) if (ICacheLockDown) { - if (ICacheLockDown & CACHE_LOCKUP_L) + if (ICacheLockDown & CACHE_LOCKUP_L) [[unlikely]] { // load into locked up cache // into the selected set @@ -446,17 +437,7 @@ u32 ARMv5::ICacheLookup(const u32 addr) } } -#else - // Do the same as above but instead of using if-else - // utilize the && and || operators to skip parts of the operations - // With the order of comparison we can put the most likely path - // checked first - - bool doLockDown = (ICacheLockDown & CACHE_LOCKUP_L); - bool roundRobin = CP15Control & CP15_CACHE_CR_ROUNDROBIN; - (!roundRobin && (line = RandomLineIndex())) || (roundRobin && (ICacheCount = line = ((ICacheCount+1) & (ICACHE_SETS-1)))) ; - (!doLockDown && (line = (line | ICacheLockDown & (ICACHE_SETS-1))+id)) || (doLockDown && (line = (ICacheLockDown & (ICACHE_SETS-1))+id)); -#endif + line += id; u32* ptr = (u32 *)&ICache[line << ICACHE_LINELENGTH_LOG2]; @@ -484,11 +465,11 @@ void ARMv5::ICacheInvalidateByAddr(const u32 addr) const u32 tag = (addr & ~(ICACHE_LINELENGTH - 1)) | CACHE_FLAG_VALID; const u32 id = ((addr >> ICACHE_LINELENGTH_LOG2) & (ICACHE_LINESPERSET-1)) << ICACHE_SETS_LOG2; - for (int set=0;set> CP15_MAP_ENTRYSIZE_LOG2] & CP15_MAP_ICACHEABLE ; + return PU_Map[addr >> CP15_MAP_ENTRYSIZE_LOG2] & CP15_MAP_ICACHEABLE; } u32 ARMv5::DCacheLookup(const u32 addr) { //Log(LogLevel::Debug,"DCache load @ %08x\n", addr); - const u32 tag = (addr & ~(DCACHE_LINELENGTH - 1)) ; + const u32 tag = (addr & ~(DCACHE_LINELENGTH - 1)); const u32 id = ((addr >> DCACHE_LINELENGTH_LOG2) & (DCACHE_LINESPERSET-1)) << DCACHE_SETS_LOG2; - for (int set=0;set> 14][2]; if (addr < ITCMSize) @@ -574,51 +555,31 @@ u32 ARMv5::DCacheLookup(const u32 addr) } u32 line; -#if 0 - // caclulate in which cacheline the 
data is to be filled - // The code below is doing the same as the if-less below - // It increases performance by reducing banches. - // The code is kept here for readability. - // - // NOTE: If you need to update either part, you need - // to update the other too to keep them in sync! - // - if (CP15Control & CP15_CACHE_CR_ROUNDROBIN) + if (CP15Control & CP15_CACHE_CR_ROUNDROBIN) [[likely]] { line = DCacheCount; DCacheCount = (line+1) & (DCACHE_SETS-1); } else { - line = DCacheRandom(); + line = RandomLineIndex(); } - // Update the selected set depending on the DCache LockDown register if (DCacheLockDown) { - if (DCacheLockDown & CACHE_LOCKUP_L) + if (DCacheLockDown & CACHE_LOCKUP_L) [[unlikely]] { // load into locked up cache // into the selected set - line = (DCacheLockDown & (DCACHE_SETS-1)) + id; + line = DCacheLockDown & (DCACHE_SETS-1); } else { - u8 minSet = ICacheLockDown & (DCACHE_SETS-1); - line = (line | minSet) + id; + u8 minSet = DCacheLockDown & (DCACHE_SETS-1); + line = line | minSet; } - } -#else - // Do the same as above but instead of using if-else - // utilize the && and || operators to skip parts of the operations - // With the order of comparison we can put the most likely path - // checked first - - bool doLockDown = (DCacheLockDown & CACHE_LOCKUP_L); - bool roundRobin = CP15Control & CP15_CACHE_CR_ROUNDROBIN; - (!roundRobin && (line = RandomLineIndex())) || (roundRobin && (DCacheCount = line = ((DCacheCount+1) & (DCACHE_SETS-1)))); - (!doLockDown && (line = (line | DCacheLockDown & (DCACHE_SETS-1))+id)) || (doLockDown && (line = (DCacheLockDown & (DCACHE_SETS-1))+id)); -#endif + } + line += id; u32* ptr = (u32 *)&DCache[line << DCACHE_LINELENGTH_LOG2]; @@ -661,7 +622,7 @@ bool ARMv5::DCacheWrite32(const u32 addr, const u32 val) //Log(LogLevel::Debug, "Cache write 32: %08lx <= %08lx\n", addr, val); - for (int set=0;set> DCACHE_LINELENGTH_LOG2) & (DCACHE_LINESPERSET-1)) << DCACHE_SETS_LOG2; //Log(LogLevel::Debug, "Cache write 16: %08lx <= 
%04x\n", addr, val); - for (int set=0;set> DCACHE_LINELENGTH_LOG2) & (DCACHE_LINESPERSET-1)) << DCACHE_SETS_LOG2;; + const u32 id = ((addr >> DCACHE_LINELENGTH_LOG2) & (DCACHE_LINESPERSET-1)) << DCACHE_SETS_LOG2; //Log(LogLevel::Debug, "Cache write 8: %08lx <= %02x\n", addr, val); - for (int set=0;set> DCACHE_LINELENGTH_LOG2) & (DCACHE_LINESPERSET-1)) << DCACHE_SETS_LOG2; - for (int set=0;set> DCACHE_LINELENGTH_LOG2) & (DCACHE_LINESPERSET-1)) << DCACHE_SETS_LOG2; - for (int set=0;set> 2]); if (tag+i < ITCMSize) { - *(u32*)&ITCM[(tag+i) & (ITCMPhysicalSize - 1)] = ptr[i >> 2] ; + *(u32*)&ITCM[(tag+i) & (ITCMPhysicalSize - 1)] = ptr[i >> 2]; NDS.JIT.CheckAndInvalidate<0, ARMJIT_Memory::memregion_ITCM>(tag+i); } else if (((tag+i) & DTCMMask) == DTCMBase) @@ -861,7 +822,7 @@ void ARMv5::DCacheClearByASetAndWay(const u8 cacheSet, const u8 cacheLine) //Log(LogLevel::Debug, " WB Value %08x\n", ptr[i >> 2]); if (tag+i < ITCMSize) { - *(u32*)&ITCM[(tag+i) & (ITCMPhysicalSize - 1)] = ptr[i >> 2] ; + *(u32*)&ITCM[(tag+i) & (ITCMPhysicalSize - 1)] = ptr[i >> 2]; NDS.JIT.CheckAndInvalidate<0, ARMJIT_Memory::memregion_ITCM>(tag+i); } else if (((tag+i) & DTCMMask) == DTCMBase) @@ -880,7 +841,7 @@ void ARMv5::DCacheClearByASetAndWay(const u8 cacheSet, const u8 cacheLine) bool ARMv5::IsAddressDCachable(const u32 addr) const { - return PU_Map[addr >> CP15_MAP_ENTRYSIZE_LOG2] & CP15_MAP_DCACHEABLE ; + return PU_Map[addr >> CP15_MAP_ENTRYSIZE_LOG2] & CP15_MAP_DCACHEABLE; } void ARMv5::CP15Write(const u32 id, const u32 val) @@ -993,7 +954,7 @@ void ARMv5::CP15Write(const u32 id, const u32 val) PU_DataRW = 0; #pragma GCC ivdep #pragma GCC unroll 8 - for (int i=0;i> (i * 2) & 3) << (i * CP15_REGIONACCESS_BITS_PER_REGION); #if 0 @@ -1025,7 +986,7 @@ void ARMv5::CP15Write(const u32 id, const u32 val) PU_CodeRW = 0; #pragma GCC ivdep #pragma GCC unroll 8 - for (int i=0;i> (i * 2) & 3) << (i * CP15_REGIONACCESS_BITS_PER_REGION); #if 0 @@ -1310,7 +1271,7 @@ void ARMv5::CP15Write(const u32 id, 
const u32 val) // Bit 0..Way-1: locked ways // The Cache is 4 way associative // But all bits are r/w - DCacheLockDown = val ; + DCacheLockDown = val; Log(LogLevel::Debug,"ICacheLockDown\n"); return; case 0x901: @@ -1464,7 +1425,7 @@ u32 ARMv5::CP15Read(const u32 id) const u32 ret = 0; #pragma GCC ivdep #pragma GCC unroll 8 - for (int i=0;i> (i * CP15_REGIONACCESS_BITS_PER_REGION) & CP15_REGIONACCESS_REGIONMASK) << (i*2); return ret; } @@ -1475,7 +1436,7 @@ u32 ARMv5::CP15Read(const u32 id) const // 0x503 returns all 4 bits per region u32 ret = 0; #pragma GCC unroll 8 - for (int i=0;i> (i * CP15_REGIONACCESS_BITS_PER_REGION) & CP15_REGIONACCESS_REGIONMASK) << (i*2); return ret; } From 065573f316c1c2003837d81e645f6050e0fbd006 Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Fri, 31 May 2024 18:09:45 -0400 Subject: [PATCH 040/306] fix writebacks overwriting registers swapped with spsr fixes gbarunner3 --- src/ARMInterpreter_LoadStore.cpp | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/src/ARMInterpreter_LoadStore.cpp b/src/ARMInterpreter_LoadStore.cpp index 91acaacc..e21d7757 100644 --- a/src/ARMInterpreter_LoadStore.cpp +++ b/src/ARMInterpreter_LoadStore.cpp @@ -429,10 +429,10 @@ void A_LDM(ARM* cpu) if (!preinc) base += 4; } } - + + u32 pc; if (cpu->CurInstr & (1<<15)) { - u32 pc; if (preinc) base += 4; if (first) cpu->DataRead32 (base, &pc); else cpu->DataRead32S(base, &pc); @@ -440,8 +440,6 @@ void A_LDM(ARM* cpu) if (cpu->Num == 1) pc &= ~0x1; - - cpu->JumpTo(pc, cpu->CurInstr & (1<<22)); } if ((cpu->CurInstr & (1<<22)) && !(cpu->CurInstr & (1<<15))) @@ -466,6 +464,9 @@ void A_LDM(ARM* cpu) cpu->R[baseid] = wbbase; } + if (cpu->CurInstr & (1<<15)) + cpu->JumpTo(pc, cpu->CurInstr & (1<<22)); + cpu->AddCycles_CDI(); } From 960f063eaa8c298600198916f91811d34114e249 Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Sun, 2 Jun 2024 00:11:01 -0400 Subject: [PATCH 
041/306] improve data aborts for ldm --- src/ARMInterpreter_LoadStore.cpp | 73 +++++++++++++++++++++++--------- 1 file changed, 52 insertions(+), 21 deletions(-) diff --git a/src/ARMInterpreter_LoadStore.cpp b/src/ARMInterpreter_LoadStore.cpp index e21d7757..dfdb98c2 100644 --- a/src/ARMInterpreter_LoadStore.cpp +++ b/src/ARMInterpreter_LoadStore.cpp @@ -397,6 +397,7 @@ void A_LDM(ARM* cpu) u32 wbbase; u32 preinc = (cpu->CurInstr & (1<<24)); bool first = true; + int abortreg = 16; if (!(cpu->CurInstr & (1<<23))) { @@ -415,10 +416,32 @@ void A_LDM(ARM* cpu) preinc = !preinc; } + // check for data aborts + if (cpu->Num == 0) + { + u32 tmpbase = base; + for (int i = 0; i < 16; i++) + { + if (cpu->CurInstr & (1<PU_Map[tmpbase>>12] & 0x01) + { + if (!preinc) tmpbase += 4; + } + else + { + abortreg = i; + break; + } + } + } + } + if ((cpu->CurInstr & (1<<22)) && !(cpu->CurInstr & (1<<15))) cpu->UpdateMode(cpu->CPSR, (cpu->CPSR&~0x1F)|0x10, true); - for (int i = 0; i < 15; i++) + for (int i = 0; i < std::min(15, abortreg); i++) { if (cpu->CurInstr & (1<CurInstr & (1<<15)) + if ((cpu->CurInstr & (1<<15)) && (abortreg == 16)) { if (preinc) base += 4; if (first) cpu->DataRead32 (base, &pc); @@ -445,27 +468,35 @@ void A_LDM(ARM* cpu) if ((cpu->CurInstr & (1<<22)) && !(cpu->CurInstr & (1<<15))) cpu->UpdateMode((cpu->CPSR&~0x1F)|0x10, cpu->CPSR, true); - if (cpu->CurInstr & (1<<21)) + // if it's 16 then there was no data abort + if (abortreg == 16) { - // post writeback - if (cpu->CurInstr & (1<<23)) - wbbase = base; - - if (cpu->CurInstr & (1 << baseid)) + if (cpu->CurInstr & (1<<21)) { - if (cpu->Num == 0) - { - u32 rlist = cpu->CurInstr & 0xFFFF; - if ((!(rlist & ~(1 << baseid))) || (rlist & ~((2 << baseid) - 1))) - cpu->R[baseid] = wbbase; - } - } - else - cpu->R[baseid] = wbbase; - } + // post writeback + if (cpu->CurInstr & (1<<23)) + wbbase = base; - if (cpu->CurInstr & (1<<15)) - cpu->JumpTo(pc, cpu->CurInstr & (1<<22)); + if (cpu->CurInstr & (1 << baseid)) + { + if 
(cpu->Num == 0) + { + u32 rlist = cpu->CurInstr & 0xFFFF; + if ((!(rlist & ~(1 << baseid))) || (rlist & ~((2 << baseid) - 1))) + cpu->R[baseid] = wbbase; + } + } + else + cpu->R[baseid] = wbbase; + } + + if (cpu->CurInstr & (1<<15)) + cpu->JumpTo(pc, cpu->CurInstr & (1<<22)); + } + else + { + ((ARMv5*)cpu)->DataAbort(); + } cpu->AddCycles_CDI(); } From 63d4b787334f2d2f41e220c88044e933a24b0266 Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Sun, 2 Jun 2024 10:13:50 -0400 Subject: [PATCH 042/306] improve implementation --- src/ARM.cpp | 6 ++-- src/ARM.h | 12 +++---- src/ARMInterpreter_LoadStore.cpp | 56 +++++++++++--------------------- src/CP15.cpp | 22 +++++++++---- 4 files changed, 44 insertions(+), 52 deletions(-) diff --git a/src/ARM.cpp b/src/ARM.cpp index c2f6a6c2..c96cb65d 100644 --- a/src/ARM.cpp +++ b/src/ARM.cpp @@ -1168,21 +1168,23 @@ void ARMv4::DataRead16(u32 addr, u32* val) DataCycles = NDS.ARM7MemTimings[addr >> 15][0]; } -void ARMv4::DataRead32(u32 addr, u32* val) +bool ARMv4::DataRead32(u32 addr, u32* val) { addr &= ~3; *val = BusRead32(addr); DataRegion = addr; DataCycles = NDS.ARM7MemTimings[addr >> 15][2]; + return true; } -void ARMv4::DataRead32S(u32 addr, u32* val) +bool ARMv4::DataRead32S(u32 addr, u32* val) { addr &= ~3; *val = BusRead32(addr); DataCycles += NDS.ARM7MemTimings[addr >> 15][3]; + return true; } void ARMv4::DataWrite8(u32 addr, u8 val) diff --git a/src/ARM.h b/src/ARM.h index 1e0b71b8..56a6306e 100644 --- a/src/ARM.h +++ b/src/ARM.h @@ -130,8 +130,8 @@ public: virtual void DataRead8(u32 addr, u32* val) = 0; virtual void DataRead16(u32 addr, u32* val) = 0; - virtual void DataRead32(u32 addr, u32* val) = 0; - virtual void DataRead32S(u32 addr, u32* val) = 0; + virtual bool DataRead32(u32 addr, u32* val) = 0; + virtual bool DataRead32S(u32 addr, u32* val) = 0; virtual void DataWrite8(u32 addr, u8 val) = 0; virtual void DataWrite16(u32 addr, u16 val) = 0; virtual void DataWrite32(u32 addr, u32 
val) = 0; @@ -251,8 +251,8 @@ public: void DataRead8(u32 addr, u32* val) override; void DataRead16(u32 addr, u32* val) override; - void DataRead32(u32 addr, u32* val) override; - void DataRead32S(u32 addr, u32* val) override; + bool DataRead32(u32 addr, u32* val) override; + bool DataRead32S(u32 addr, u32* val) override; void DataWrite8(u32 addr, u8 val) override; void DataWrite16(u32 addr, u16 val) override; void DataWrite32(u32 addr, u32 val) override; @@ -400,8 +400,8 @@ public: void DataRead8(u32 addr, u32* val) override; void DataRead16(u32 addr, u32* val) override; - void DataRead32(u32 addr, u32* val) override; - void DataRead32S(u32 addr, u32* val) override; + bool DataRead32(u32 addr, u32* val) override; + bool DataRead32S(u32 addr, u32* val) override; void DataWrite8(u32 addr, u8 val) override; void DataWrite16(u32 addr, u16 val) override; void DataWrite32(u32 addr, u32 val) override; diff --git a/src/ARMInterpreter_LoadStore.cpp b/src/ARMInterpreter_LoadStore.cpp index dfdb98c2..806b4c3e 100644 --- a/src/ARMInterpreter_LoadStore.cpp +++ b/src/ARMInterpreter_LoadStore.cpp @@ -395,12 +395,14 @@ void A_LDM(ARM* cpu) u32 baseid = (cpu->CurInstr >> 16) & 0xF; u32 base = cpu->R[baseid]; u32 wbbase; + u32 oldbase = base; u32 preinc = (cpu->CurInstr & (1<<24)); bool first = true; - int abortreg = 16; + bool dataabort = false; - if (!(cpu->CurInstr & (1<<23))) + if (!(cpu->CurInstr & (1<<23))) // decrement { + // decrement is actually an increment starting from the end address for (int i = 0; i < 16; i++) { if (cpu->CurInstr & (1<Num == 0) - { - u32 tmpbase = base; - for (int i = 0; i < 16; i++) - { - if (cpu->CurInstr & (1<PU_Map[tmpbase>>12] & 0x01) - { - if (!preinc) tmpbase += 4; - } - else - { - abortreg = i; - break; - } - } - } - } - + // switch to user mode regs if ((cpu->CurInstr & (1<<22)) && !(cpu->CurInstr & (1<<15))) cpu->UpdateMode(cpu->CPSR, (cpu->CPSR&~0x1F)|0x10, true); - for (int i = 0; i < std::min(15, abortreg); i++) + for (int i = 0; i < 15; 
i++) { if (cpu->CurInstr & (1<DataRead32 (base, &cpu->R[i]); - else cpu->DataRead32S(base, &cpu->R[i]); + if (first) {if (!cpu->DataRead32 (base, &cpu->R[i])) {dataabort = true; goto abortjump;}} + else if (!cpu->DataRead32S(base, &cpu->R[i])) {dataabort = true; goto abortjump;} first = false; if (!preinc) base += 4; } } u32 pc; - if ((cpu->CurInstr & (1<<15)) && (abortreg == 16)) + if ((cpu->CurInstr & (1<<15))) { if (preinc) base += 4; - if (first) cpu->DataRead32 (base, &pc); - else cpu->DataRead32S(base, &pc); + if (first) {if (!cpu->DataRead32 (base, &pc)) dataabort = true;} + else if (!cpu->DataRead32S(base, &pc)) dataabort = true; if (!preinc) base += 4; if (cpu->Num == 1) pc &= ~0x1; } + abortjump: + + // switch back to previous regs if ((cpu->CurInstr & (1<<22)) && !(cpu->CurInstr & (1<<15))) cpu->UpdateMode((cpu->CPSR&~0x1F)|0x10, cpu->CPSR, true); - // if it's 16 then there was no data abort - if (abortreg == 16) + if (!dataabort) { + // writeback to base if (cpu->CurInstr & (1<<21)) { // post writeback @@ -489,14 +473,12 @@ void A_LDM(ARM* cpu) else cpu->R[baseid] = wbbase; } - + + // jump if pc got written if (cpu->CurInstr & (1<<15)) cpu->JumpTo(pc, cpu->CurInstr & (1<<22)); } - else - { - ((ARMv5*)cpu)->DataAbort(); - } + else cpu->R[baseid] = oldbase; // restore original value of base in case the reg got written to cpu->AddCycles_CDI(); } diff --git a/src/CP15.cpp b/src/CP15.cpp index 5e5b35ea..fa55853d 100644 --- a/src/CP15.cpp +++ b/src/CP15.cpp @@ -863,12 +863,12 @@ void ARMv5::DataRead16(u32 addr, u32* val) DataCycles = MemTimings[addr >> 12][1]; } -void ARMv5::DataRead32(u32 addr, u32* val) +bool ARMv5::DataRead32(u32 addr, u32* val) { if (!(PU_Map[addr>>12] & 0x01)) { DataAbort(); - return; + return false; } DataRegion = addr; @@ -879,38 +879,46 @@ void ARMv5::DataRead32(u32 addr, u32* val) { DataCycles = 1; *val = *(u32*)&ITCM[addr & (ITCMPhysicalSize - 1)]; - return; + return true; } if ((addr & DTCMMask) == DTCMBase) { DataCycles = 1; *val 
= *(u32*)&DTCM[addr & (DTCMPhysicalSize - 1)]; - return; + return true; } *val = BusRead32(addr); DataCycles = MemTimings[addr >> 12][2]; + return true; } -void ARMv5::DataRead32S(u32 addr, u32* val) +bool ARMv5::DataRead32S(u32 addr, u32* val) { + if (!(PU_Map[addr>>12] & 0x01)) + { + DataAbort(); + return false; + } + addr &= ~3; if (addr < ITCMSize) { DataCycles += 1; *val = *(u32*)&ITCM[addr & (ITCMPhysicalSize - 1)]; - return; + return true; } if ((addr & DTCMMask) == DTCMBase) { DataCycles += 1; *val = *(u32*)&DTCM[addr & (DTCMPhysicalSize - 1)]; - return; + return true; } *val = BusRead32(addr); DataCycles += MemTimings[addr >> 12][3]; + return true; } void ARMv5::DataWrite8(u32 addr, u8 val) From b5c1ee33fbaf4ff428c3b5f2b2e5d71c37a70041 Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Sun, 2 Jun 2024 10:33:29 -0400 Subject: [PATCH 043/306] implement stm --- src/ARM.cpp | 6 ++++-- src/ARM.h | 12 ++++++------ src/ARMInterpreter_LoadStore.cpp | 9 +++++---- src/CP15.cpp | 22 +++++++++++++++------- 4 files changed, 30 insertions(+), 19 deletions(-) diff --git a/src/ARM.cpp b/src/ARM.cpp index c96cb65d..acf1b6e4 100644 --- a/src/ARM.cpp +++ b/src/ARM.cpp @@ -1203,21 +1203,23 @@ void ARMv4::DataWrite16(u32 addr, u16 val) DataCycles = NDS.ARM7MemTimings[addr >> 15][0]; } -void ARMv4::DataWrite32(u32 addr, u32 val) +bool ARMv4::DataWrite32(u32 addr, u32 val) { addr &= ~3; BusWrite32(addr, val); DataRegion = addr; DataCycles = NDS.ARM7MemTimings[addr >> 15][2]; + return true; } -void ARMv4::DataWrite32S(u32 addr, u32 val) +bool ARMv4::DataWrite32S(u32 addr, u32 val) { addr &= ~3; BusWrite32(addr, val); DataCycles += NDS.ARM7MemTimings[addr >> 15][3]; + return true; } diff --git a/src/ARM.h b/src/ARM.h index 56a6306e..67087433 100644 --- a/src/ARM.h +++ b/src/ARM.h @@ -134,8 +134,8 @@ public: virtual bool DataRead32S(u32 addr, u32* val) = 0; virtual void DataWrite8(u32 addr, u8 val) = 0; virtual void DataWrite16(u32 addr, u16 
val) = 0; - virtual void DataWrite32(u32 addr, u32 val) = 0; - virtual void DataWrite32S(u32 addr, u32 val) = 0; + virtual bool DataWrite32(u32 addr, u32 val) = 0; + virtual bool DataWrite32S(u32 addr, u32 val) = 0; virtual void AddCycles_C() = 0; virtual void AddCycles_CI(s32 numI) = 0; @@ -255,8 +255,8 @@ public: bool DataRead32S(u32 addr, u32* val) override; void DataWrite8(u32 addr, u8 val) override; void DataWrite16(u32 addr, u16 val) override; - void DataWrite32(u32 addr, u32 val) override; - void DataWrite32S(u32 addr, u32 val) override; + bool DataWrite32(u32 addr, u32 val) override; + bool DataWrite32S(u32 addr, u32 val) override; void AddCycles_C() override { @@ -404,8 +404,8 @@ public: bool DataRead32S(u32 addr, u32* val) override; void DataWrite8(u32 addr, u8 val) override; void DataWrite16(u32 addr, u16 val) override; - void DataWrite32(u32 addr, u32 val) override; - void DataWrite32S(u32 addr, u32 val) override; + bool DataWrite32(u32 addr, u32 val) override; + bool DataWrite32S(u32 addr, u32 val) override; void AddCycles_C() override; void AddCycles_CI(s32 num) override; void AddCycles_CDI() override; diff --git a/src/ARMInterpreter_LoadStore.cpp b/src/ARMInterpreter_LoadStore.cpp index 806b4c3e..5c6b4c42 100644 --- a/src/ARMInterpreter_LoadStore.cpp +++ b/src/ARMInterpreter_LoadStore.cpp @@ -490,6 +490,7 @@ void A_STM(ARM* cpu) u32 oldbase = base; u32 preinc = (cpu->CurInstr & (1<<24)); bool first = true; + bool dataabort = false; if (!(cpu->CurInstr & (1<<23))) { @@ -526,12 +527,12 @@ void A_STM(ARM* cpu) if (i == baseid && !isbanked) { if ((cpu->Num == 0) || (!(cpu->CurInstr & ((1<DataWrite32(base, oldbase) : cpu->DataWrite32S(base, oldbase); + {if (!(first ? cpu->DataWrite32(base, oldbase) : cpu->DataWrite32S(base, oldbase))) {dataabort = true; break;}} else - first ? cpu->DataWrite32(base, base) : cpu->DataWrite32S(base, base); // checkme + if (!(first ? 
cpu->DataWrite32(base, base) : cpu->DataWrite32S(base, base))) {dataabort = true; break;} // checkme } else - first ? cpu->DataWrite32(base, cpu->R[i]) : cpu->DataWrite32S(base, cpu->R[i]); + if (!(first ? cpu->DataWrite32(base, cpu->R[i]) : cpu->DataWrite32S(base, cpu->R[i]))) {dataabort = true; break;} first = false; @@ -542,7 +543,7 @@ void A_STM(ARM* cpu) if (cpu->CurInstr & (1<<22)) cpu->UpdateMode((cpu->CPSR&~0x1F)|0x10, cpu->CPSR, true); - if ((cpu->CurInstr & (1<<23)) && (cpu->CurInstr & (1<<21))) + if ((cpu->CurInstr & (1<<23)) && (cpu->CurInstr & (1<<21)) && !dataabort) cpu->R[baseid] = base; cpu->AddCycles_CD(); diff --git a/src/CP15.cpp b/src/CP15.cpp index fa55853d..b2ab9f91 100644 --- a/src/CP15.cpp +++ b/src/CP15.cpp @@ -979,12 +979,12 @@ void ARMv5::DataWrite16(u32 addr, u16 val) DataCycles = MemTimings[addr >> 12][1]; } -void ARMv5::DataWrite32(u32 addr, u32 val) +bool ARMv5::DataWrite32(u32 addr, u32 val) { if (!(PU_Map[addr>>12] & 0x02)) { DataAbort(); - return; + return false; } DataRegion = addr; @@ -996,21 +996,28 @@ void ARMv5::DataWrite32(u32 addr, u32 val) DataCycles = 1; *(u32*)&ITCM[addr & (ITCMPhysicalSize - 1)] = val; NDS.JIT.CheckAndInvalidate<0, ARMJIT_Memory::memregion_ITCM>(addr); - return; + return true; } if ((addr & DTCMMask) == DTCMBase) { DataCycles = 1; *(u32*)&DTCM[addr & (DTCMPhysicalSize - 1)] = val; - return; + return true; } BusWrite32(addr, val); DataCycles = MemTimings[addr >> 12][2]; + return true; } -void ARMv5::DataWrite32S(u32 addr, u32 val) +bool ARMv5::DataWrite32S(u32 addr, u32 val) { + if (!(PU_Map[addr>>12] & 0x02)) + { + DataAbort(); + return false; + } + addr &= ~3; if (addr < ITCMSize) @@ -1020,17 +1027,18 @@ void ARMv5::DataWrite32S(u32 addr, u32 val) #ifdef JIT_ENABLED NDS.JIT.CheckAndInvalidate<0, ARMJIT_Memory::memregion_ITCM>(addr); #endif - return; + return true; } if ((addr & DTCMMask) == DTCMBase) { DataCycles += 1; *(u32*)&DTCM[addr & (DTCMPhysicalSize - 1)] = val; - return; + return true; } 
BusWrite32(addr, val); DataCycles += MemTimings[addr >> 12][3]; + return true; } void ARMv5::GetCodeMemRegion(u32 addr, MemRegion* region) From 5e760a15361bb20b1d2a659caa74242e8a157344 Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Sun, 2 Jun 2024 19:34:29 -0400 Subject: [PATCH 044/306] slightly cleaner code --- src/ARMInterpreter_LoadStore.cpp | 33 +++++++++++++++++++++++--------- 1 file changed, 24 insertions(+), 9 deletions(-) diff --git a/src/ARMInterpreter_LoadStore.cpp b/src/ARMInterpreter_LoadStore.cpp index 5c6b4c42..afcca05d 100644 --- a/src/ARMInterpreter_LoadStore.cpp +++ b/src/ARMInterpreter_LoadStore.cpp @@ -427,8 +427,13 @@ void A_LDM(ARM* cpu) if (cpu->CurInstr & (1<DataRead32 (base, &cpu->R[i])) {dataabort = true; goto abortjump;}} - else if (!cpu->DataRead32S(base, &cpu->R[i])) {dataabort = true; goto abortjump;} + if (!(first ? cpu->DataRead32 (base, &cpu->R[i]) + : cpu->DataRead32S(base, &cpu->R[i]))) + { + dataabort = true; + goto abortjump; + } + first = false; if (!preinc) base += 4; } @@ -438,8 +443,12 @@ void A_LDM(ARM* cpu) if ((cpu->CurInstr & (1<<15))) { if (preinc) base += 4; - if (first) {if (!cpu->DataRead32 (base, &pc)) dataabort = true;} - else if (!cpu->DataRead32S(base, &pc)) dataabort = true; + if (!(first ? cpu->DataRead32 (base, &pc) + : cpu->DataRead32S(base, &pc))) + { + dataabort = true; + } + if (!preinc) base += 4; if (cpu->Num == 1) @@ -524,15 +533,21 @@ void A_STM(ARM* cpu) { if (preinc) base += 4; + u32 val; if (i == baseid && !isbanked) { if ((cpu->Num == 0) || (!(cpu->CurInstr & ((1<DataWrite32(base, oldbase) : cpu->DataWrite32S(base, oldbase))) {dataabort = true; break;}} - else - if (!(first ? cpu->DataWrite32(base, base) : cpu->DataWrite32S(base, base))) {dataabort = true; break;} // checkme + val = oldbase; + else val = base; + } + else val = cpu->R[i]; + + if (!(first ? 
cpu->DataWrite32 (base, val) + : cpu->DataWrite32S(base, val))) + { + dataabort = true; + break; } - else - if (!(first ? cpu->DataWrite32(base, cpu->R[i]) : cpu->DataWrite32S(base, cpu->R[i]))) {dataabort = true; break;} first = false; From c2a57b79a03adaead1044fccb67988ddfe0d5b67 Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Sun, 2 Jun 2024 22:41:01 -0400 Subject: [PATCH 045/306] fix stmd(a/b) writeback --- src/ARMInterpreter_LoadStore.cpp | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/ARMInterpreter_LoadStore.cpp b/src/ARMInterpreter_LoadStore.cpp index afcca05d..96766288 100644 --- a/src/ARMInterpreter_LoadStore.cpp +++ b/src/ARMInterpreter_LoadStore.cpp @@ -558,8 +558,12 @@ void A_STM(ARM* cpu) if (cpu->CurInstr & (1<<22)) cpu->UpdateMode((cpu->CPSR&~0x1F)|0x10, cpu->CPSR, true); - if ((cpu->CurInstr & (1<<23)) && (cpu->CurInstr & (1<<21)) && !dataabort) - cpu->R[baseid] = base; + if (!dataabort) + { + if ((cpu->CurInstr & (1<<23)) && (cpu->CurInstr & (1<<21))) + cpu->R[baseid] = base; + } + else cpu->R[baseid] = oldbase; cpu->AddCycles_CD(); } From 1e8194e367517b6c08b0bc4ae38971843973c656 Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Tue, 4 Jun 2024 19:06:54 -0400 Subject: [PATCH 046/306] fix ldr and str --- src/ARMInterpreter_LoadStore.cpp | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/src/ARMInterpreter_LoadStore.cpp b/src/ARMInterpreter_LoadStore.cpp index 96766288..fe9bfd0c 100644 --- a/src/ARMInterpreter_LoadStore.cpp +++ b/src/ARMInterpreter_LoadStore.cpp @@ -65,9 +65,10 @@ namespace melonDS::ARMInterpreter u32 storeval = cpu->R[(cpu->CurInstr>>12) & 0xF]; \ if (((cpu->CurInstr>>12) & 0xF) == 0xF) \ storeval += 4; \ - cpu->DataWrite32(offset, storeval); \ - if (cpu->CurInstr & (1<<21)) cpu->R[(cpu->CurInstr>>16) & 0xF] = offset; \ - cpu->AddCycles_CD(); + bool dataabort = 
!cpu->DataWrite32(offset, storeval); \ + cpu->AddCycles_CD(); \ + if (dataabort) return; \ + if (cpu->CurInstr & (1<<21)) cpu->R[(cpu->CurInstr>>16) & 0xF] = offset; // TODO: user mode (bit21) #define A_STR_POST \ @@ -75,9 +76,10 @@ namespace melonDS::ARMInterpreter u32 storeval = cpu->R[(cpu->CurInstr>>12) & 0xF]; \ if (((cpu->CurInstr>>12) & 0xF) == 0xF) \ storeval += 4; \ - cpu->DataWrite32(addr, storeval); \ - cpu->R[(cpu->CurInstr>>16) & 0xF] += offset; \ - cpu->AddCycles_CD(); + bool dataabort = !cpu->DataWrite32(addr, storeval); \ + cpu->AddCycles_CD(); \ + if (dataabort) return; \ + cpu->R[(cpu->CurInstr>>16) & 0xF] += offset; #define A_STRB \ offset += cpu->R[(cpu->CurInstr>>16) & 0xF]; \ @@ -94,10 +96,11 @@ namespace melonDS::ARMInterpreter #define A_LDR \ offset += cpu->R[(cpu->CurInstr>>16) & 0xF]; \ - u32 val; cpu->DataRead32(offset, &val); \ + u32 val; bool dataabort = !cpu->DataRead32(offset, &val); \ + cpu->AddCycles_CDI(); \ + if (dataabort) return; \ val = ROR(val, ((offset&0x3)<<3)); \ if (cpu->CurInstr & (1<<21)) cpu->R[(cpu->CurInstr>>16) & 0xF] = offset; \ - cpu->AddCycles_CDI(); \ if (((cpu->CurInstr>>12) & 0xF) == 15) \ { \ if (cpu->Num==1) val &= ~0x1; \ @@ -111,10 +114,11 @@ namespace melonDS::ARMInterpreter // TODO: user mode #define A_LDR_POST \ u32 addr = cpu->R[(cpu->CurInstr>>16) & 0xF]; \ - u32 val; cpu->DataRead32(addr, &val); \ + u32 val; bool dataabort = !cpu->DataRead32(addr, &val); \ + cpu->AddCycles_CDI(); \ + if (dataabort) return; \ val = ROR(val, ((addr&0x3)<<3)); \ cpu->R[(cpu->CurInstr>>16) & 0xF] += offset; \ - cpu->AddCycles_CDI(); \ if (((cpu->CurInstr>>12) & 0xF) == 15) \ { \ if (cpu->Num==1) val &= ~0x1; \ From 317a8c61e592e310e738381f2fccfdc81521cc27 Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Tue, 4 Jun 2024 21:22:39 -0400 Subject: [PATCH 047/306] data abort handling for (almost) all (arm) instructions full list: strb, ldrb, strh, ldrd, strd, ldrh, ldrsb, ldrsh --- 
src/ARM.cpp | 12 ++-- src/ARM.h | 24 +++---- src/ARMInterpreter_LoadStore.cpp | 110 +++++++++++++++++-------------- src/CP15.cpp | 36 +++++----- 4 files changed, 102 insertions(+), 80 deletions(-) diff --git a/src/ARM.cpp b/src/ARM.cpp index acf1b6e4..cf45a564 100644 --- a/src/ARM.cpp +++ b/src/ARM.cpp @@ -1152,20 +1152,22 @@ u32 ARMv5::ReadMem(u32 addr, int size) } #endif -void ARMv4::DataRead8(u32 addr, u32* val) +bool ARMv4::DataRead8(u32 addr, u32* val) { *val = BusRead8(addr); DataRegion = addr; DataCycles = NDS.ARM7MemTimings[addr >> 15][0]; + return true; } -void ARMv4::DataRead16(u32 addr, u32* val) +bool ARMv4::DataRead16(u32 addr, u32* val) { addr &= ~1; *val = BusRead16(addr); DataRegion = addr; DataCycles = NDS.ARM7MemTimings[addr >> 15][0]; + return true; } bool ARMv4::DataRead32(u32 addr, u32* val) @@ -1187,20 +1189,22 @@ bool ARMv4::DataRead32S(u32 addr, u32* val) return true; } -void ARMv4::DataWrite8(u32 addr, u8 val) +bool ARMv4::DataWrite8(u32 addr, u8 val) { BusWrite8(addr, val); DataRegion = addr; DataCycles = NDS.ARM7MemTimings[addr >> 15][0]; + return true; } -void ARMv4::DataWrite16(u32 addr, u16 val) +bool ARMv4::DataWrite16(u32 addr, u16 val) { addr &= ~1; BusWrite16(addr, val); DataRegion = addr; DataCycles = NDS.ARM7MemTimings[addr >> 15][0]; + return true; } bool ARMv4::DataWrite32(u32 addr, u32 val) diff --git a/src/ARM.h b/src/ARM.h index 67087433..f2277253 100644 --- a/src/ARM.h +++ b/src/ARM.h @@ -128,12 +128,12 @@ public: void SetupCodeMem(u32 addr); - virtual void DataRead8(u32 addr, u32* val) = 0; - virtual void DataRead16(u32 addr, u32* val) = 0; + virtual bool DataRead8(u32 addr, u32* val) = 0; + virtual bool DataRead16(u32 addr, u32* val) = 0; virtual bool DataRead32(u32 addr, u32* val) = 0; virtual bool DataRead32S(u32 addr, u32* val) = 0; - virtual void DataWrite8(u32 addr, u8 val) = 0; - virtual void DataWrite16(u32 addr, u16 val) = 0; + virtual bool DataWrite8(u32 addr, u8 val) = 0; + virtual bool DataWrite16(u32 addr, u16 
val) = 0; virtual bool DataWrite32(u32 addr, u32 val) = 0; virtual bool DataWrite32S(u32 addr, u32 val) = 0; @@ -249,12 +249,12 @@ public: // all code accesses are forced nonseq 32bit u32 CodeRead32(u32 addr, bool branch); - void DataRead8(u32 addr, u32* val) override; - void DataRead16(u32 addr, u32* val) override; + bool DataRead8(u32 addr, u32* val) override; + bool DataRead16(u32 addr, u32* val) override; bool DataRead32(u32 addr, u32* val) override; bool DataRead32S(u32 addr, u32* val) override; - void DataWrite8(u32 addr, u8 val) override; - void DataWrite16(u32 addr, u16 val) override; + bool DataWrite8(u32 addr, u8 val) override; + bool DataWrite16(u32 addr, u16 val) override; bool DataWrite32(u32 addr, u32 val) override; bool DataWrite32S(u32 addr, u32 val) override; @@ -398,12 +398,12 @@ public: return BusRead32(addr); } - void DataRead8(u32 addr, u32* val) override; - void DataRead16(u32 addr, u32* val) override; + bool DataRead8(u32 addr, u32* val) override; + bool DataRead16(u32 addr, u32* val) override; bool DataRead32(u32 addr, u32* val) override; bool DataRead32S(u32 addr, u32* val) override; - void DataWrite8(u32 addr, u8 val) override; - void DataWrite16(u32 addr, u16 val) override; + bool DataWrite8(u32 addr, u8 val) override; + bool DataWrite16(u32 addr, u16 val) override; bool DataWrite32(u32 addr, u32 val) override; bool DataWrite32S(u32 addr, u32 val) override; void AddCycles_C() override; diff --git a/src/ARMInterpreter_LoadStore.cpp b/src/ARMInterpreter_LoadStore.cpp index fe9bfd0c..67e09a7b 100644 --- a/src/ARMInterpreter_LoadStore.cpp +++ b/src/ARMInterpreter_LoadStore.cpp @@ -83,16 +83,18 @@ namespace melonDS::ARMInterpreter #define A_STRB \ offset += cpu->R[(cpu->CurInstr>>16) & 0xF]; \ - cpu->DataWrite8(offset, cpu->R[(cpu->CurInstr>>12) & 0xF]); \ - if (cpu->CurInstr & (1<<21)) cpu->R[(cpu->CurInstr>>16) & 0xF] = offset; \ - cpu->AddCycles_CD(); + bool dataabort = !cpu->DataWrite8(offset, cpu->R[(cpu->CurInstr>>12) & 0xF]); \ + 
cpu->AddCycles_CD(); \ + if (dataabort) return; \ + if (cpu->CurInstr & (1<<21)) cpu->R[(cpu->CurInstr>>16) & 0xF] = offset; // TODO: user mode (bit21) #define A_STRB_POST \ u32 addr = cpu->R[(cpu->CurInstr>>16) & 0xF]; \ - cpu->DataWrite8(addr, cpu->R[(cpu->CurInstr>>12) & 0xF]); \ - cpu->R[(cpu->CurInstr>>16) & 0xF] += offset; \ - cpu->AddCycles_CD(); + bool dataabort = !cpu->DataWrite8(addr, cpu->R[(cpu->CurInstr>>12) & 0xF]); \ + cpu->AddCycles_CD(); \ + if (dataabort) return; \ + cpu->R[(cpu->CurInstr>>16) & 0xF] += offset; #define A_LDR \ offset += cpu->R[(cpu->CurInstr>>16) & 0xF]; \ @@ -131,18 +133,20 @@ namespace melonDS::ARMInterpreter #define A_LDRB \ offset += cpu->R[(cpu->CurInstr>>16) & 0xF]; \ - u32 val; cpu->DataRead8(offset, &val); \ - if (cpu->CurInstr & (1<<21)) cpu->R[(cpu->CurInstr>>16) & 0xF] = offset; \ + u32 val; bool dataabort = !cpu->DataRead8(offset, &val); \ cpu->AddCycles_CDI(); \ + if (dataabort) return; \ + if (cpu->CurInstr & (1<<21)) cpu->R[(cpu->CurInstr>>16) & 0xF] = offset; \ cpu->R[(cpu->CurInstr>>12) & 0xF] = val; \ if (((cpu->CurInstr>>12) & 0xF) == 15) printf("!! LDRB PC %08X\n", cpu->R[15]); \ // TODO: user mode #define A_LDRB_POST \ u32 addr = cpu->R[(cpu->CurInstr>>16) & 0xF]; \ - u32 val; cpu->DataRead8(addr, &val); \ - cpu->R[(cpu->CurInstr>>16) & 0xF] += offset; \ + u32 val; bool dataabort = !cpu->DataRead8(addr, &val); \ cpu->AddCycles_CDI(); \ + if (dataabort) return; \ + cpu->R[(cpu->CurInstr>>16) & 0xF] += offset; \ cpu->R[(cpu->CurInstr>>12) & 0xF] = val; \ if (((cpu->CurInstr>>12) & 0xF) == 15) printf("!! 
LDRB PC %08X\n", cpu->R[15]); \ @@ -229,103 +233,113 @@ A_IMPLEMENT_WB_LDRSTR(LDRB) #define A_STRH \ offset += cpu->R[(cpu->CurInstr>>16) & 0xF]; \ - cpu->DataWrite16(offset, cpu->R[(cpu->CurInstr>>12) & 0xF]); \ - if (cpu->CurInstr & (1<<21)) cpu->R[(cpu->CurInstr>>16) & 0xF] = offset; \ - cpu->AddCycles_CD(); + bool dataabort = !cpu->DataWrite16(offset, cpu->R[(cpu->CurInstr>>12) & 0xF]); \ + cpu->AddCycles_CD(); \ + if (dataabort) return; \ + if (cpu->CurInstr & (1<<21)) cpu->R[(cpu->CurInstr>>16) & 0xF] = offset; #define A_STRH_POST \ u32 addr = cpu->R[(cpu->CurInstr>>16) & 0xF]; \ - cpu->DataWrite16(addr, cpu->R[(cpu->CurInstr>>12) & 0xF]); \ - cpu->R[(cpu->CurInstr>>16) & 0xF] += offset; \ - cpu->AddCycles_CD(); + bool dataabort = !cpu->DataWrite16(addr, cpu->R[(cpu->CurInstr>>12) & 0xF]); \ + cpu->AddCycles_CD(); \ + if (dataabort) return; \ + cpu->R[(cpu->CurInstr>>16) & 0xF] += offset; // TODO: CHECK LDRD/STRD TIMINGS!! #define A_LDRD \ if (cpu->Num != 0) return; \ offset += cpu->R[(cpu->CurInstr>>16) & 0xF]; \ - if (cpu->CurInstr & (1<<21)) cpu->R[(cpu->CurInstr>>16) & 0xF] = offset; \ u32 r = (cpu->CurInstr>>12) & 0xF; \ if (r&1) { r--; printf("!! MISALIGNED LDRD %d\n", r+1); } \ - cpu->DataRead32 (offset , &cpu->R[r ]); \ - cpu->DataRead32S(offset+4, &cpu->R[r+1]); \ - cpu->AddCycles_CDI(); + if (!cpu->DataRead32 (offset , &cpu->R[r ])) {cpu->AddCycles_CDI(); return;} \ + if (!cpu->DataRead32S(offset+4, &cpu->R[r+1])) {cpu->AddCycles_CDI(); return;} \ + cpu->AddCycles_CDI(); \ + if (cpu->CurInstr & (1<<21)) cpu->R[(cpu->CurInstr>>16) & 0xF] = offset; #define A_LDRD_POST \ if (cpu->Num != 0) return; \ u32 addr = cpu->R[(cpu->CurInstr>>16) & 0xF]; \ - cpu->R[(cpu->CurInstr>>16) & 0xF] += offset; \ u32 r = (cpu->CurInstr>>12) & 0xF; \ if (r&1) { r--; printf("!! 
MISALIGNED LDRD_POST %d\n", r+1); } \ - cpu->DataRead32 (addr , &cpu->R[r ]); \ - cpu->DataRead32S(addr+4, &cpu->R[r+1]); \ - cpu->AddCycles_CDI(); + if (!cpu->DataRead32 (addr , &cpu->R[r ])) {cpu->AddCycles_CDI(); return;} \ + if (!cpu->DataRead32S(addr+4, &cpu->R[r+1])) {cpu->AddCycles_CDI(); return;} \ + cpu->AddCycles_CDI(); \ + cpu->R[(cpu->CurInstr>>16) & 0xF] += offset; #define A_STRD \ if (cpu->Num != 0) return; \ offset += cpu->R[(cpu->CurInstr>>16) & 0xF]; \ - if (cpu->CurInstr & (1<<21)) cpu->R[(cpu->CurInstr>>16) & 0xF] = offset; \ u32 r = (cpu->CurInstr>>12) & 0xF; \ if (r&1) { r--; printf("!! MISALIGNED STRD %d\n", r+1); } \ - cpu->DataWrite32 (offset , cpu->R[r ]); \ - cpu->DataWrite32S(offset+4, cpu->R[r+1]); \ - cpu->AddCycles_CD(); + bool dataabort = !cpu->DataWrite32(offset, cpu->R[r ]); /* yes, this data abort behavior is on purpose */ \ + dataabort |= !cpu->DataWrite32S (offset+4, cpu->R[r+1]); /* no, i dont understand it either */ \ + cpu->AddCycles_CD(); \ + if (dataabort) return; \ + if (cpu->CurInstr & (1<<21)) cpu->R[(cpu->CurInstr>>16) & 0xF] = offset; #define A_STRD_POST \ if (cpu->Num != 0) return; \ u32 addr = cpu->R[(cpu->CurInstr>>16) & 0xF]; \ - cpu->R[(cpu->CurInstr>>16) & 0xF] += offset; \ u32 r = (cpu->CurInstr>>12) & 0xF; \ if (r&1) { r--; printf("!! 
MISALIGNED STRD_POST %d\n", r+1); } \ - cpu->DataWrite32 (addr , cpu->R[r ]); \ - cpu->DataWrite32S(addr+4, cpu->R[r+1]); \ - cpu->AddCycles_CD(); + bool dataabort = !cpu->DataWrite32(addr, cpu->R[r ]); \ + dataabort |= !cpu->DataWrite32S (addr+4, cpu->R[r+1]); \ + cpu->AddCycles_CD(); \ + if (dataabort) return; \ + cpu->R[(cpu->CurInstr>>16) & 0xF] += offset; #define A_LDRH \ offset += cpu->R[(cpu->CurInstr>>16) & 0xF]; \ - if (cpu->CurInstr & (1<<21)) cpu->R[(cpu->CurInstr>>16) & 0xF] = offset; \ - cpu->DataRead16(offset, &cpu->R[(cpu->CurInstr>>12) & 0xF]); \ + bool dataabort = !cpu->DataRead16(offset, &cpu->R[(cpu->CurInstr>>12) & 0xF]); \ cpu->AddCycles_CDI(); \ + if (dataabort) return; \ if (((cpu->CurInstr>>12) & 0xF) == 15) printf("!! LDRH PC %08X\n", cpu->R[15]); \ + if (cpu->CurInstr & (1<<21)) cpu->R[(cpu->CurInstr>>16) & 0xF] = offset; #define A_LDRH_POST \ u32 addr = cpu->R[(cpu->CurInstr>>16) & 0xF]; \ - cpu->R[(cpu->CurInstr>>16) & 0xF] += offset; \ - cpu->DataRead16(addr, &cpu->R[(cpu->CurInstr>>12) & 0xF]); \ + bool dataabort = !cpu->DataRead16(addr, &cpu->R[(cpu->CurInstr>>12) & 0xF]); \ cpu->AddCycles_CDI(); \ + if (dataabort) return; \ if (((cpu->CurInstr>>12) & 0xF) == 15) printf("!! LDRH PC %08X\n", cpu->R[15]); \ + cpu->R[(cpu->CurInstr>>16) & 0xF] += offset; #define A_LDRSB \ offset += cpu->R[(cpu->CurInstr>>16) & 0xF]; \ - if (cpu->CurInstr & (1<<21)) cpu->R[(cpu->CurInstr>>16) & 0xF] = offset; \ - cpu->DataRead8(offset, &cpu->R[(cpu->CurInstr>>12) & 0xF]); \ - cpu->R[(cpu->CurInstr>>12) & 0xF] = (s32)(s8)cpu->R[(cpu->CurInstr>>12) & 0xF]; \ + bool dataabort = !cpu->DataRead8(offset, &cpu->R[(cpu->CurInstr>>12) & 0xF]); \ cpu->AddCycles_CDI(); \ + if (dataabort) return; \ + cpu->R[(cpu->CurInstr>>12) & 0xF] = (s32)(s8)cpu->R[(cpu->CurInstr>>12) & 0xF]; \ if (((cpu->CurInstr>>12) & 0xF) == 15) printf("!! 
LDRSB PC %08X\n", cpu->R[15]); \ + if (cpu->CurInstr & (1<<21)) cpu->R[(cpu->CurInstr>>16) & 0xF] = offset; #define A_LDRSB_POST \ u32 addr = cpu->R[(cpu->CurInstr>>16) & 0xF]; \ - cpu->R[(cpu->CurInstr>>16) & 0xF] += offset; \ - cpu->DataRead8(addr, &cpu->R[(cpu->CurInstr>>12) & 0xF]); \ - cpu->R[(cpu->CurInstr>>12) & 0xF] = (s32)(s8)cpu->R[(cpu->CurInstr>>12) & 0xF]; \ + bool dataabort = !cpu->DataRead8(addr, &cpu->R[(cpu->CurInstr>>12) & 0xF]); \ cpu->AddCycles_CDI(); \ + if (dataabort) return; \ + cpu->R[(cpu->CurInstr>>12) & 0xF] = (s32)(s8)cpu->R[(cpu->CurInstr>>12) & 0xF]; \ if (((cpu->CurInstr>>12) & 0xF) == 15) printf("!! LDRSB PC %08X\n", cpu->R[15]); \ + cpu->R[(cpu->CurInstr>>16) & 0xF] += offset; #define A_LDRSH \ offset += cpu->R[(cpu->CurInstr>>16) & 0xF]; \ - if (cpu->CurInstr & (1<<21)) cpu->R[(cpu->CurInstr>>16) & 0xF] = offset; \ - cpu->DataRead16(offset, &cpu->R[(cpu->CurInstr>>12) & 0xF]); \ - cpu->R[(cpu->CurInstr>>12) & 0xF] = (s32)(s16)cpu->R[(cpu->CurInstr>>12) & 0xF]; \ + bool dataabort = !cpu->DataRead16(offset, &cpu->R[(cpu->CurInstr>>12) & 0xF]); \ cpu->AddCycles_CDI(); \ + if (dataabort) return; \ + cpu->R[(cpu->CurInstr>>12) & 0xF] = (s32)(s16)cpu->R[(cpu->CurInstr>>12) & 0xF]; \ if (((cpu->CurInstr>>12) & 0xF) == 15) printf("!! LDRSH PC %08X\n", cpu->R[15]); \ + if (cpu->CurInstr & (1<<21)) cpu->R[(cpu->CurInstr>>16) & 0xF] = offset; #define A_LDRSH_POST \ u32 addr = cpu->R[(cpu->CurInstr>>16) & 0xF]; \ - cpu->R[(cpu->CurInstr>>16) & 0xF] += offset; \ - cpu->DataRead16(addr, &cpu->R[(cpu->CurInstr>>12) & 0xF]); \ - cpu->R[(cpu->CurInstr>>12) & 0xF] = (s32)(s16)cpu->R[(cpu->CurInstr>>12) & 0xF]; \ + bool dataabort = !cpu->DataRead16(addr, &cpu->R[(cpu->CurInstr>>12) & 0xF]); \ cpu->AddCycles_CDI(); \ + if (dataabort) return; \ + cpu->R[(cpu->CurInstr>>12) & 0xF] = (s32)(s16)cpu->R[(cpu->CurInstr>>12) & 0xF]; \ if (((cpu->CurInstr>>12) & 0xF) == 15) printf("!! 
LDRSH PC %08X\n", cpu->R[15]); \ + cpu->R[(cpu->CurInstr>>16) & 0xF] += offset; #define A_IMPLEMENT_HD_LDRSTR(x) \ diff --git a/src/CP15.cpp b/src/CP15.cpp index b2ab9f91..857c5c90 100644 --- a/src/CP15.cpp +++ b/src/CP15.cpp @@ -807,12 +807,12 @@ u32 ARMv5::CodeRead32(u32 addr, bool branch) } -void ARMv5::DataRead8(u32 addr, u32* val) +bool ARMv5::DataRead8(u32 addr, u32* val) { if (!(PU_Map[addr>>12] & 0x01)) { DataAbort(); - return; + return false; } DataRegion = addr; @@ -821,25 +821,26 @@ void ARMv5::DataRead8(u32 addr, u32* val) { DataCycles = 1; *val = *(u8*)&ITCM[addr & (ITCMPhysicalSize - 1)]; - return; + return true; } if ((addr & DTCMMask) == DTCMBase) { DataCycles = 1; *val = *(u8*)&DTCM[addr & (DTCMPhysicalSize - 1)]; - return; + return true; } *val = BusRead8(addr); DataCycles = MemTimings[addr >> 12][1]; + return true; } -void ARMv5::DataRead16(u32 addr, u32* val) +bool ARMv5::DataRead16(u32 addr, u32* val) { if (!(PU_Map[addr>>12] & 0x01)) { DataAbort(); - return; + return false; } DataRegion = addr; @@ -850,17 +851,18 @@ void ARMv5::DataRead16(u32 addr, u32* val) { DataCycles = 1; *val = *(u16*)&ITCM[addr & (ITCMPhysicalSize - 1)]; - return; + return true; } if ((addr & DTCMMask) == DTCMBase) { DataCycles = 1; *val = *(u16*)&DTCM[addr & (DTCMPhysicalSize - 1)]; - return; + return true; } *val = BusRead16(addr); DataCycles = MemTimings[addr >> 12][1]; + return true; } bool ARMv5::DataRead32(u32 addr, u32* val) @@ -921,12 +923,12 @@ bool ARMv5::DataRead32S(u32 addr, u32* val) return true; } -void ARMv5::DataWrite8(u32 addr, u8 val) +bool ARMv5::DataWrite8(u32 addr, u8 val) { if (!(PU_Map[addr>>12] & 0x02)) { DataAbort(); - return; + return false; } DataRegion = addr; @@ -936,25 +938,26 @@ void ARMv5::DataWrite8(u32 addr, u8 val) DataCycles = 1; *(u8*)&ITCM[addr & (ITCMPhysicalSize - 1)] = val; NDS.JIT.CheckAndInvalidate<0, ARMJIT_Memory::memregion_ITCM>(addr); - return; + return true; } if ((addr & DTCMMask) == DTCMBase) { DataCycles = 1; 
*(u8*)&DTCM[addr & (DTCMPhysicalSize - 1)] = val; - return; + return true; } BusWrite8(addr, val); DataCycles = MemTimings[addr >> 12][1]; + return true; } -void ARMv5::DataWrite16(u32 addr, u16 val) +bool ARMv5::DataWrite16(u32 addr, u16 val) { if (!(PU_Map[addr>>12] & 0x02)) { DataAbort(); - return; + return false; } DataRegion = addr; @@ -966,17 +969,18 @@ void ARMv5::DataWrite16(u32 addr, u16 val) DataCycles = 1; *(u16*)&ITCM[addr & (ITCMPhysicalSize - 1)] = val; NDS.JIT.CheckAndInvalidate<0, ARMJIT_Memory::memregion_ITCM>(addr); - return; + return true; } if ((addr & DTCMMask) == DTCMBase) { DataCycles = 1; *(u16*)&DTCM[addr & (DTCMPhysicalSize - 1)] = val; - return; + return true; } BusWrite16(addr, val); DataCycles = MemTimings[addr >> 12][1]; + return true; } bool ARMv5::DataWrite32(u32 addr, u32 val) From 1871c48849949d8700271e57c7fef7a85216d45e Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Wed, 5 Jun 2024 10:28:51 -0400 Subject: [PATCH 048/306] fix double data aborts with strd --- src/ARM.cpp | 2 +- src/ARM.h | 6 +++--- src/ARMInterpreter_LoadStore.cpp | 4 ++-- src/CP15.cpp | 4 ++-- 4 files changed, 8 insertions(+), 8 deletions(-) diff --git a/src/ARM.cpp b/src/ARM.cpp index cf45a564..0d2976d2 100644 --- a/src/ARM.cpp +++ b/src/ARM.cpp @@ -1217,7 +1217,7 @@ bool ARMv4::DataWrite32(u32 addr, u32 val) return true; } -bool ARMv4::DataWrite32S(u32 addr, u32 val) +bool ARMv4::DataWrite32S(u32 addr, u32 val, bool dataabort) { addr &= ~3; diff --git a/src/ARM.h b/src/ARM.h index f2277253..1f68567c 100644 --- a/src/ARM.h +++ b/src/ARM.h @@ -135,7 +135,7 @@ public: virtual bool DataWrite8(u32 addr, u8 val) = 0; virtual bool DataWrite16(u32 addr, u16 val) = 0; virtual bool DataWrite32(u32 addr, u32 val) = 0; - virtual bool DataWrite32S(u32 addr, u32 val) = 0; + virtual bool DataWrite32S(u32 addr, u32 val, bool dataabort = false) = 0; virtual void AddCycles_C() = 0; virtual void AddCycles_CI(s32 numI) = 0; @@ -256,7 +256,7 
@@ public: bool DataWrite8(u32 addr, u8 val) override; bool DataWrite16(u32 addr, u16 val) override; bool DataWrite32(u32 addr, u32 val) override; - bool DataWrite32S(u32 addr, u32 val) override; + bool DataWrite32S(u32 addr, u32 val, bool dataabort = false) override; void AddCycles_C() override { @@ -405,7 +405,7 @@ public: bool DataWrite8(u32 addr, u8 val) override; bool DataWrite16(u32 addr, u16 val) override; bool DataWrite32(u32 addr, u32 val) override; - bool DataWrite32S(u32 addr, u32 val) override; + bool DataWrite32S(u32 addr, u32 val, bool dataabort = false) override; void AddCycles_C() override; void AddCycles_CI(s32 num) override; void AddCycles_CDI() override; diff --git a/src/ARMInterpreter_LoadStore.cpp b/src/ARMInterpreter_LoadStore.cpp index 67e09a7b..d28aed0f 100644 --- a/src/ARMInterpreter_LoadStore.cpp +++ b/src/ARMInterpreter_LoadStore.cpp @@ -273,7 +273,7 @@ A_IMPLEMENT_WB_LDRSTR(LDRB) u32 r = (cpu->CurInstr>>12) & 0xF; \ if (r&1) { r--; printf("!! MISALIGNED STRD %d\n", r+1); } \ bool dataabort = !cpu->DataWrite32(offset, cpu->R[r ]); /* yes, this data abort behavior is on purpose */ \ - dataabort |= !cpu->DataWrite32S (offset+4, cpu->R[r+1]); /* no, i dont understand it either */ \ + dataabort |= !cpu->DataWrite32S (offset+4, cpu->R[r+1], dataabort); /* no, i dont understand it either */ \ cpu->AddCycles_CD(); \ if (dataabort) return; \ if (cpu->CurInstr & (1<<21)) cpu->R[(cpu->CurInstr>>16) & 0xF] = offset; @@ -284,7 +284,7 @@ A_IMPLEMENT_WB_LDRSTR(LDRB) u32 r = (cpu->CurInstr>>12) & 0xF; \ if (r&1) { r--; printf("!! 
MISALIGNED STRD_POST %d\n", r+1); } \ bool dataabort = !cpu->DataWrite32(addr, cpu->R[r ]); \ - dataabort |= !cpu->DataWrite32S (addr+4, cpu->R[r+1]); \ + dataabort |= !cpu->DataWrite32S (addr+4, cpu->R[r+1], dataabort); \ cpu->AddCycles_CD(); \ if (dataabort) return; \ cpu->R[(cpu->CurInstr>>16) & 0xF] += offset; diff --git a/src/CP15.cpp b/src/CP15.cpp index 857c5c90..34c8addf 100644 --- a/src/CP15.cpp +++ b/src/CP15.cpp @@ -1014,11 +1014,11 @@ bool ARMv5::DataWrite32(u32 addr, u32 val) return true; } -bool ARMv5::DataWrite32S(u32 addr, u32 val) +bool ARMv5::DataWrite32S(u32 addr, u32 val, bool dataabort) { if (!(PU_Map[addr>>12] & 0x02)) { - DataAbort(); + if (!dataabort) DataAbort(); return false; } From 7c3108e20f0e8ec5391df2c09bd5af99464f361f Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Wed, 5 Jun 2024 14:31:44 -0400 Subject: [PATCH 049/306] handle swp instruction aborts --- src/ARMInterpreter_LoadStore.cpp | 34 ++++++++++++++++++++------------ 1 file changed, 21 insertions(+), 13 deletions(-) diff --git a/src/ARMInterpreter_LoadStore.cpp b/src/ARMInterpreter_LoadStore.cpp index d28aed0f..c8544a67 100644 --- a/src/ARMInterpreter_LoadStore.cpp +++ b/src/ARMInterpreter_LoadStore.cpp @@ -382,13 +382,16 @@ void A_SWP(ARM* cpu) u32 rm = cpu->R[cpu->CurInstr & 0xF]; u32 val; - cpu->DataRead32(base, &val); - cpu->R[(cpu->CurInstr >> 12) & 0xF] = ROR(val, 8*(base&0x3)); - - u32 numD = cpu->DataCycles; - cpu->DataWrite32(base, rm); - cpu->DataCycles += numD; - + if (cpu->DataRead32(base, &val)) + { + u32 numD = cpu->DataCycles; + if (cpu->DataWrite32(base, rm)) + { + // rd only gets updated if both read and write succeed + cpu->R[(cpu->CurInstr >> 12) & 0xF] = ROR(val, 8*(base&0x3)); + } + cpu->DataCycles += numD; + } cpu->AddCycles_CDI(); } @@ -397,12 +400,17 @@ void A_SWPB(ARM* cpu) u32 base = cpu->R[(cpu->CurInstr >> 16) & 0xF]; u32 rm = cpu->R[cpu->CurInstr & 0xF] & 0xFF; - cpu->DataRead8(base, &cpu->R[(cpu->CurInstr >> 
12) & 0xF]); - - u32 numD = cpu->DataCycles; - cpu->DataWrite8(base, rm); - cpu->DataCycles += numD; - + u32 val; + if (cpu->DataRead8(base, &val)) + { + u32 numD = cpu->DataCycles; + if (cpu->DataWrite8(base, rm)) + { + // rd only gets updated if both read and write succeed + cpu->R[(cpu->CurInstr >> 12) & 0xF] = val; + } + cpu->DataCycles += numD; + } cpu->AddCycles_CDI(); } From 13ae96b4e3540696bc7de5aeb5a4ee5f5999380a Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Wed, 5 Jun 2024 14:32:12 -0400 Subject: [PATCH 050/306] simple thumb instructions (untested but probably right) --- src/ARMInterpreter_LoadStore.cpp | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/src/ARMInterpreter_LoadStore.cpp b/src/ARMInterpreter_LoadStore.cpp index c8544a67..c518adfb 100644 --- a/src/ARMInterpreter_LoadStore.cpp +++ b/src/ARMInterpreter_LoadStore.cpp @@ -631,8 +631,8 @@ void T_LDR_REG(ARM* cpu) u32 addr = cpu->R[(cpu->CurInstr >> 3) & 0x7] + cpu->R[(cpu->CurInstr >> 6) & 0x7]; u32 val; - cpu->DataRead32(addr, &val); - cpu->R[cpu->CurInstr & 0x7] = ROR(val, 8*(addr&0x3)); + if (cpu->DataRead32(addr, &val)) + cpu->R[cpu->CurInstr & 0x7] = ROR(val, 8*(addr&0x3)); cpu->AddCycles_CDI(); } @@ -657,8 +657,8 @@ void T_STRH_REG(ARM* cpu) void T_LDRSB_REG(ARM* cpu) { u32 addr = cpu->R[(cpu->CurInstr >> 3) & 0x7] + cpu->R[(cpu->CurInstr >> 6) & 0x7]; - cpu->DataRead8(addr, &cpu->R[cpu->CurInstr & 0x7]); - cpu->R[cpu->CurInstr & 0x7] = (s32)(s8)cpu->R[cpu->CurInstr & 0x7]; + if (cpu->DataRead8(addr, &cpu->R[cpu->CurInstr & 0x7])) + cpu->R[cpu->CurInstr & 0x7] = (s32)(s8)cpu->R[cpu->CurInstr & 0x7]; cpu->AddCycles_CDI(); } @@ -674,8 +674,8 @@ void T_LDRH_REG(ARM* cpu) void T_LDRSH_REG(ARM* cpu) { u32 addr = cpu->R[(cpu->CurInstr >> 3) & 0x7] + cpu->R[(cpu->CurInstr >> 6) & 0x7]; - cpu->DataRead16(addr, &cpu->R[cpu->CurInstr & 0x7]); - cpu->R[cpu->CurInstr & 0x7] = (s32)(s16)cpu->R[cpu->CurInstr & 0x7]; + if 
(cpu->DataRead16(addr, &cpu->R[cpu->CurInstr & 0x7])) + cpu->R[cpu->CurInstr & 0x7] = (s32)(s16)cpu->R[cpu->CurInstr & 0x7]; cpu->AddCycles_CDI(); } @@ -696,8 +696,8 @@ void T_LDR_IMM(ARM* cpu) offset += cpu->R[(cpu->CurInstr >> 3) & 0x7]; u32 val; - cpu->DataRead32(offset, &val); - cpu->R[cpu->CurInstr & 0x7] = ROR(val, 8*(offset&0x3)); + if (cpu->DataRead32(offset, &val)) + cpu->R[cpu->CurInstr & 0x7] = ROR(val, 8*(offset&0x3)); cpu->AddCycles_CDI(); } From d6cd18945561d7e1edecb01de5368d687372307c Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Thu, 6 Jun 2024 18:58:43 -0400 Subject: [PATCH 051/306] rework data abort handling for ldm/stm; implement thumb stmia+push --- src/ARMInterpreter_LoadStore.cpp | 106 +++++++++++++++++++------------ 1 file changed, 65 insertions(+), 41 deletions(-) diff --git a/src/ARMInterpreter_LoadStore.cpp b/src/ARMInterpreter_LoadStore.cpp index c518adfb..b615e9e1 100644 --- a/src/ARMInterpreter_LoadStore.cpp +++ b/src/ARMInterpreter_LoadStore.cpp @@ -424,7 +424,6 @@ void A_LDM(ARM* cpu) u32 oldbase = base; u32 preinc = (cpu->CurInstr & (1<<24)); bool first = true; - bool dataabort = false; if (!(cpu->CurInstr & (1<<23))) // decrement { @@ -456,8 +455,7 @@ void A_LDM(ARM* cpu) if (!(first ? cpu->DataRead32 (base, &cpu->R[i]) : cpu->DataRead32S(base, &cpu->R[i]))) { - dataabort = true; - goto abortjump; + goto dataabort; } first = false; @@ -472,7 +470,7 @@ void A_LDM(ARM* cpu) if (!(first ? 
cpu->DataRead32 (base, &pc) : cpu->DataRead32S(base, &pc))) { - dataabort = true; + goto dataabort; } if (!preinc) base += 4; @@ -481,39 +479,46 @@ void A_LDM(ARM* cpu) pc &= ~0x1; } - abortjump: - // switch back to previous regs if ((cpu->CurInstr & (1<<22)) && !(cpu->CurInstr & (1<<15))) cpu->UpdateMode((cpu->CPSR&~0x1F)|0x10, cpu->CPSR, true); - if (!dataabort) + // writeback to base + if (cpu->CurInstr & (1<<21)) { - // writeback to base - if (cpu->CurInstr & (1<<21)) - { - // post writeback - if (cpu->CurInstr & (1<<23)) - wbbase = base; + // post writeback + if (cpu->CurInstr & (1<<23)) + wbbase = base; - if (cpu->CurInstr & (1 << baseid)) + if (cpu->CurInstr & (1 << baseid)) + { + if (cpu->Num == 0) { - if (cpu->Num == 0) - { - u32 rlist = cpu->CurInstr & 0xFFFF; - if ((!(rlist & ~(1 << baseid))) || (rlist & ~((2 << baseid) - 1))) - cpu->R[baseid] = wbbase; - } + u32 rlist = cpu->CurInstr & 0xFFFF; + if ((!(rlist & ~(1 << baseid))) || (rlist & ~((2 << baseid) - 1))) + cpu->R[baseid] = wbbase; } - else - cpu->R[baseid] = wbbase; } - - // jump if pc got written - if (cpu->CurInstr & (1<<15)) - cpu->JumpTo(pc, cpu->CurInstr & (1<<22)); + else + cpu->R[baseid] = wbbase; + } + + // jump if pc got written + if (cpu->CurInstr & (1<<15)) + cpu->JumpTo(pc, cpu->CurInstr & (1<<22)); + + // jump here if a data abort occurred; writeback is ignored, and any jumps were aborted + if (false) + { + dataabort: + + // switch back to original set of regs + if ((cpu->CurInstr & (1<<22)) && !(cpu->CurInstr & (1<<15))) + cpu->UpdateMode((cpu->CPSR&~0x1F)|0x10, cpu->CPSR, true); + + // restore original value of base in case the reg got written to + cpu->R[baseid] = oldbase; } - else cpu->R[baseid] = oldbase; // restore original value of base in case the reg got written to cpu->AddCycles_CDI(); } @@ -525,7 +530,6 @@ void A_STM(ARM* cpu) u32 oldbase = base; u32 preinc = (cpu->CurInstr & (1<<24)); bool first = true; - bool dataabort = false; if (!(cpu->CurInstr & (1<<23))) { @@ -571,8 
+575,7 @@ void A_STM(ARM* cpu) if (!(first ? cpu->DataWrite32 (base, val) : cpu->DataWrite32S(base, val))) { - dataabort = true; - break; + goto dataabort; } first = false; @@ -584,12 +587,20 @@ void A_STM(ARM* cpu) if (cpu->CurInstr & (1<<22)) cpu->UpdateMode((cpu->CPSR&~0x1F)|0x10, cpu->CPSR, true); - if (!dataabort) + if ((cpu->CurInstr & (1<<23)) && (cpu->CurInstr & (1<<21))) + cpu->R[baseid] = base; + + // jump here if a data abort occurred + if (false) { - if ((cpu->CurInstr & (1<<23)) && (cpu->CurInstr & (1<<21))) - cpu->R[baseid] = base; + dataabort: + + if (cpu->CurInstr & (1<<22)) + cpu->UpdateMode((cpu->CPSR&~0x1F)|0x10, cpu->CPSR, true); + + // restore original value of base + cpu->R[baseid] = oldbase; } - else cpu->R[baseid] = oldbase; cpu->AddCycles_CD(); } @@ -774,14 +785,17 @@ void T_PUSH(ARM* cpu) u32 base = cpu->R[13]; base -= (nregs<<2); - cpu->R[13] = base; + u32 wbbase = base; for (int i = 0; i < 8; i++) { if (cpu->CurInstr & (1<DataWrite32 (base, cpu->R[i]); - else cpu->DataWrite32S(base, cpu->R[i]); + if (!(first ? cpu->DataWrite32 (base, cpu->R[i]) + : cpu->DataWrite32S(base, cpu->R[i]))) + { + goto dataabort; + } first = false; base += 4; } @@ -789,10 +803,16 @@ void T_PUSH(ARM* cpu) if (cpu->CurInstr & (1<<8)) { - if (first) cpu->DataWrite32 (base, cpu->R[14]); - else cpu->DataWrite32S(base, cpu->R[14]); + if (!(first ? cpu->DataWrite32 (base, cpu->R[14]) + : cpu->DataWrite32S(base, cpu->R[14]))) + { + goto dataabort; + } } + cpu->R[13] = wbbase; + + dataabort: cpu->AddCycles_CD(); } @@ -835,8 +855,11 @@ void T_STMIA(ARM* cpu) { if (cpu->CurInstr & (1<DataWrite32 (base, cpu->R[i]); - else cpu->DataWrite32S(base, cpu->R[i]); + if (!(first ? 
cpu->DataWrite32 (base, cpu->R[i]) + : cpu->DataWrite32S(base, cpu->R[i]))) + { + goto dataabort; + } first = false; base += 4; } @@ -844,6 +867,7 @@ void T_STMIA(ARM* cpu) // TODO: check "Rb included in Rlist" case cpu->R[(cpu->CurInstr >> 8) & 0x7] = base; + dataabort: cpu->AddCycles_CD(); } From 8bc7e4591c4851a90b6f245b3c51fa2f13785a32 Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Thu, 6 Jun 2024 19:05:28 -0400 Subject: [PATCH 052/306] thumb ldmia/pop data aborts --- src/ARMInterpreter_LoadStore.cpp | 24 ++++++++++++++++++------ 1 file changed, 18 insertions(+), 6 deletions(-) diff --git a/src/ARMInterpreter_LoadStore.cpp b/src/ARMInterpreter_LoadStore.cpp index b615e9e1..144ecec5 100644 --- a/src/ARMInterpreter_LoadStore.cpp +++ b/src/ARMInterpreter_LoadStore.cpp @@ -825,8 +825,11 @@ void T_POP(ARM* cpu) { if (cpu->CurInstr & (1<DataRead32 (base, &cpu->R[i]); - else cpu->DataRead32S(base, &cpu->R[i]); + if (!(first ? cpu->DataRead32 (base, &cpu->R[i]) + : cpu->DataRead32S(base, &cpu->R[i]))) + { + goto dataabort; + } first = false; base += 4; } @@ -835,14 +838,19 @@ void T_POP(ARM* cpu) if (cpu->CurInstr & (1<<8)) { u32 pc; - if (first) cpu->DataRead32 (base, &pc); - else cpu->DataRead32S(base, &pc); + if (!(first ? cpu->DataRead32 (base, &pc) + : cpu->DataRead32S(base, &pc))) + { + goto dataabort; + } if (cpu->Num==1) pc |= 0x1; cpu->JumpTo(pc); base += 4; } cpu->R[13] = base; + + dataabort: cpu->AddCycles_CDI(); } @@ -880,8 +888,11 @@ void T_LDMIA(ARM* cpu) { if (cpu->CurInstr & (1<DataRead32 (base, &cpu->R[i]); - else cpu->DataRead32S(base, &cpu->R[i]); + if (!(first ? 
cpu->DataRead32 (base, &cpu->R[i]) + : cpu->DataRead32S(base, &cpu->R[i]))) + { + goto dataabort; + } first = false; base += 4; } @@ -890,6 +901,7 @@ void T_LDMIA(ARM* cpu) if (!(cpu->CurInstr & (1<<((cpu->CurInstr >> 8) & 0x7)))) cpu->R[(cpu->CurInstr >> 8) & 0x7] = base; + dataabort: cpu->AddCycles_CDI(); } From bd3611b51d9e6ccc41f305607c0cb824df106734 Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Fri, 7 Jun 2024 20:43:02 -0400 Subject: [PATCH 053/306] unaligned registers with strd/ldrd raise an exception --- src/ARMInterpreter_LoadStore.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/ARMInterpreter_LoadStore.cpp b/src/ARMInterpreter_LoadStore.cpp index 144ecec5..9782140b 100644 --- a/src/ARMInterpreter_LoadStore.cpp +++ b/src/ARMInterpreter_LoadStore.cpp @@ -251,7 +251,7 @@ A_IMPLEMENT_WB_LDRSTR(LDRB) if (cpu->Num != 0) return; \ offset += cpu->R[(cpu->CurInstr>>16) & 0xF]; \ u32 r = (cpu->CurInstr>>12) & 0xF; \ - if (r&1) { r--; printf("!! MISALIGNED LDRD %d\n", r+1); } \ + if (r&1) { A_UNK(cpu); return; } \ if (!cpu->DataRead32 (offset , &cpu->R[r ])) {cpu->AddCycles_CDI(); return;} \ if (!cpu->DataRead32S(offset+4, &cpu->R[r+1])) {cpu->AddCycles_CDI(); return;} \ cpu->AddCycles_CDI(); \ @@ -261,7 +261,7 @@ A_IMPLEMENT_WB_LDRSTR(LDRB) if (cpu->Num != 0) return; \ u32 addr = cpu->R[(cpu->CurInstr>>16) & 0xF]; \ u32 r = (cpu->CurInstr>>12) & 0xF; \ - if (r&1) { r--; printf("!! MISALIGNED LDRD_POST %d\n", r+1); } \ + if (r&1) { A_UNK(cpu); return; } \ if (!cpu->DataRead32 (addr , &cpu->R[r ])) {cpu->AddCycles_CDI(); return;} \ if (!cpu->DataRead32S(addr+4, &cpu->R[r+1])) {cpu->AddCycles_CDI(); return;} \ cpu->AddCycles_CDI(); \ @@ -271,7 +271,7 @@ A_IMPLEMENT_WB_LDRSTR(LDRB) if (cpu->Num != 0) return; \ offset += cpu->R[(cpu->CurInstr>>16) & 0xF]; \ u32 r = (cpu->CurInstr>>12) & 0xF; \ - if (r&1) { r--; printf("!! 
MISALIGNED STRD %d\n", r+1); } \ + if (r&1) { A_UNK(cpu); return; } \ bool dataabort = !cpu->DataWrite32(offset, cpu->R[r ]); /* yes, this data abort behavior is on purpose */ \ dataabort |= !cpu->DataWrite32S (offset+4, cpu->R[r+1], dataabort); /* no, i dont understand it either */ \ cpu->AddCycles_CD(); \ @@ -282,7 +282,7 @@ A_IMPLEMENT_WB_LDRSTR(LDRB) if (cpu->Num != 0) return; \ u32 addr = cpu->R[(cpu->CurInstr>>16) & 0xF]; \ u32 r = (cpu->CurInstr>>12) & 0xF; \ - if (r&1) { r--; printf("!! MISALIGNED STRD_POST %d\n", r+1); } \ + if (r&1) { A_UNK(cpu); return; } \ bool dataabort = !cpu->DataWrite32(addr, cpu->R[r ]); \ dataabort |= !cpu->DataWrite32S (addr+4, cpu->R[r+1], dataabort); \ cpu->AddCycles_CD(); \ From 2b0ed459e125af0df9665937115d0bfff6d7aaf8 Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Fri, 7 Jun 2024 23:46:49 -0400 Subject: [PATCH 054/306] fully implement r15 stores being +12 of addr --- src/ARMInterpreter_LoadStore.cpp | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/src/ARMInterpreter_LoadStore.cpp b/src/ARMInterpreter_LoadStore.cpp index 9782140b..cfa8e3d4 100644 --- a/src/ARMInterpreter_LoadStore.cpp +++ b/src/ARMInterpreter_LoadStore.cpp @@ -251,7 +251,7 @@ A_IMPLEMENT_WB_LDRSTR(LDRB) if (cpu->Num != 0) return; \ offset += cpu->R[(cpu->CurInstr>>16) & 0xF]; \ u32 r = (cpu->CurInstr>>12) & 0xF; \ - if (r&1) { A_UNK(cpu); return; } \ + if (r&1) { A_UNK(cpu); return; } /* checkme */ \ if (!cpu->DataRead32 (offset , &cpu->R[r ])) {cpu->AddCycles_CDI(); return;} \ if (!cpu->DataRead32S(offset+4, &cpu->R[r+1])) {cpu->AddCycles_CDI(); return;} \ cpu->AddCycles_CDI(); \ @@ -261,7 +261,7 @@ A_IMPLEMENT_WB_LDRSTR(LDRB) if (cpu->Num != 0) return; \ u32 addr = cpu->R[(cpu->CurInstr>>16) & 0xF]; \ u32 r = (cpu->CurInstr>>12) & 0xF; \ - if (r&1) { A_UNK(cpu); return; } \ + if (r&1) { A_UNK(cpu); return; } /* checkme */ \ if (!cpu->DataRead32 (addr , &cpu->R[r ])) 
{cpu->AddCycles_CDI(); return;} \ if (!cpu->DataRead32S(addr+4, &cpu->R[r+1])) {cpu->AddCycles_CDI(); return;} \ cpu->AddCycles_CDI(); \ @@ -271,9 +271,10 @@ A_IMPLEMENT_WB_LDRSTR(LDRB) if (cpu->Num != 0) return; \ offset += cpu->R[(cpu->CurInstr>>16) & 0xF]; \ u32 r = (cpu->CurInstr>>12) & 0xF; \ - if (r&1) { A_UNK(cpu); return; } \ - bool dataabort = !cpu->DataWrite32(offset, cpu->R[r ]); /* yes, this data abort behavior is on purpose */ \ - dataabort |= !cpu->DataWrite32S (offset+4, cpu->R[r+1], dataabort); /* no, i dont understand it either */ \ + if (r&1) { A_UNK(cpu); return; } /* checkme */ \ + bool dataabort = !cpu->DataWrite32(offset, cpu->R[r]); /* yes, this data abort behavior is on purpose */ \ + u32 storeval = cpu->R[r+1]; if (r == 14) storeval+=4; \ + dataabort |= !cpu->DataWrite32S (offset+4, storeval, dataabort); /* no, i dont understand it either */ \ cpu->AddCycles_CD(); \ if (dataabort) return; \ if (cpu->CurInstr & (1<<21)) cpu->R[(cpu->CurInstr>>16) & 0xF] = offset; @@ -282,9 +283,10 @@ A_IMPLEMENT_WB_LDRSTR(LDRB) if (cpu->Num != 0) return; \ u32 addr = cpu->R[(cpu->CurInstr>>16) & 0xF]; \ u32 r = (cpu->CurInstr>>12) & 0xF; \ - if (r&1) { A_UNK(cpu); return; } \ - bool dataabort = !cpu->DataWrite32(addr, cpu->R[r ]); \ - dataabort |= !cpu->DataWrite32S (addr+4, cpu->R[r+1], dataabort); \ + if (r&1) { A_UNK(cpu); return; } /* checkme */ \ + bool dataabort = !cpu->DataWrite32(addr, cpu->R[r]); \ + u32 storeval = cpu->R[r+1]; if (r == 14) storeval+=4; \ + dataabort |= !cpu->DataWrite32S (addr+4, storeval, dataabort); \ cpu->AddCycles_CD(); \ if (dataabort) return; \ cpu->R[(cpu->CurInstr>>16) & 0xF] += offset; @@ -380,6 +382,7 @@ void A_SWP(ARM* cpu) { u32 base = cpu->R[(cpu->CurInstr >> 16) & 0xF]; u32 rm = cpu->R[cpu->CurInstr & 0xF]; + if ((cpu->CurInstr & 0xF) == 15) rm += 4; u32 val; if (cpu->DataRead32(base, &val)) @@ -399,6 +402,7 @@ void A_SWPB(ARM* cpu) { u32 base = cpu->R[(cpu->CurInstr >> 16) & 0xF]; u32 rm = cpu->R[cpu->CurInstr & 0xF] 
& 0xFF; + if ((cpu->CurInstr & 0xF) == 15) rm += 4; u32 val; if (cpu->DataRead8(base, &val)) @@ -572,6 +576,8 @@ void A_STM(ARM* cpu) } else val = cpu->R[i]; + if (i == 15) val+=4; + if (!(first ? cpu->DataWrite32 (base, val) : cpu->DataWrite32S(base, val))) { From 73507621f5b1460191eaf0242978859b5aad9c45 Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Fri, 7 Jun 2024 23:50:31 -0400 Subject: [PATCH 055/306] idk why it took me two tries to get these instructions to work properly --- src/ARMInterpreter_LoadStore.cpp | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/src/ARMInterpreter_LoadStore.cpp b/src/ARMInterpreter_LoadStore.cpp index cfa8e3d4..1d8595ab 100644 --- a/src/ARMInterpreter_LoadStore.cpp +++ b/src/ARMInterpreter_LoadStore.cpp @@ -83,7 +83,9 @@ namespace melonDS::ARMInterpreter #define A_STRB \ offset += cpu->R[(cpu->CurInstr>>16) & 0xF]; \ - bool dataabort = !cpu->DataWrite8(offset, cpu->R[(cpu->CurInstr>>12) & 0xF]); \ + u32 storeval = cpu->R[(cpu->CurInstr>>12) & 0xF]; \ + if (((cpu->CurInstr>>12) & 0xF) == 15) storeval+=4; \ + bool dataabort = !cpu->DataWrite8(offset, storeval); \ cpu->AddCycles_CD(); \ if (dataabort) return; \ if (cpu->CurInstr & (1<<21)) cpu->R[(cpu->CurInstr>>16) & 0xF] = offset; @@ -91,7 +93,9 @@ namespace melonDS::ARMInterpreter // TODO: user mode (bit21) #define A_STRB_POST \ u32 addr = cpu->R[(cpu->CurInstr>>16) & 0xF]; \ - bool dataabort = !cpu->DataWrite8(addr, cpu->R[(cpu->CurInstr>>12) & 0xF]); \ + u32 storeval = cpu->R[(cpu->CurInstr>>12) & 0xF]; \ + if (((cpu->CurInstr>>12) & 0xF) == 15) storeval+=4; \ + bool dataabort = !cpu->DataWrite8(addr, storeval); \ cpu->AddCycles_CD(); \ if (dataabort) return; \ cpu->R[(cpu->CurInstr>>16) & 0xF] += offset; @@ -233,14 +237,18 @@ A_IMPLEMENT_WB_LDRSTR(LDRB) #define A_STRH \ offset += cpu->R[(cpu->CurInstr>>16) & 0xF]; \ - bool dataabort = !cpu->DataWrite16(offset, cpu->R[(cpu->CurInstr>>12) & 0xF]); \ + u32 
storeval = cpu->R[(cpu->CurInstr>>12) & 0xF]; \ + if (((cpu->CurInstr>>12) & 0xF) == 15) storeval+=4; \ + bool dataabort = !cpu->DataWrite16(offset, storeval); \ cpu->AddCycles_CD(); \ if (dataabort) return; \ if (cpu->CurInstr & (1<<21)) cpu->R[(cpu->CurInstr>>16) & 0xF] = offset; #define A_STRH_POST \ u32 addr = cpu->R[(cpu->CurInstr>>16) & 0xF]; \ - bool dataabort = !cpu->DataWrite16(addr, cpu->R[(cpu->CurInstr>>12) & 0xF]); \ + u32 storeval = cpu->R[(cpu->CurInstr>>12) & 0xF]; \ + if (((cpu->CurInstr>>12) & 0xF) == 15) storeval+=4; \ + bool dataabort = !cpu->DataWrite16(addr, storeval); \ cpu->AddCycles_CD(); \ if (dataabort) return; \ cpu->R[(cpu->CurInstr>>16) & 0xF] += offset; From 0c887202e7622d2474945fee2e23a059261f1efa Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Sat, 8 Jun 2024 10:40:23 -0400 Subject: [PATCH 056/306] fix some more instructions? --- src/ARMInterpreter_LoadStore.cpp | 64 +++++++++++++++++++------------- 1 file changed, 38 insertions(+), 26 deletions(-) diff --git a/src/ARMInterpreter_LoadStore.cpp b/src/ARMInterpreter_LoadStore.cpp index 1d8595ab..aa4a90eb 100644 --- a/src/ARMInterpreter_LoadStore.cpp +++ b/src/ARMInterpreter_LoadStore.cpp @@ -141,8 +141,8 @@ namespace melonDS::ARMInterpreter cpu->AddCycles_CDI(); \ if (dataabort) return; \ if (cpu->CurInstr & (1<<21)) cpu->R[(cpu->CurInstr>>16) & 0xF] = offset; \ - cpu->R[(cpu->CurInstr>>12) & 0xF] = val; \ - if (((cpu->CurInstr>>12) & 0xF) == 15) printf("!! LDRB PC %08X\n", cpu->R[15]); \ + if (((cpu->CurInstr>>12) & 0xF) == 15) cpu->JumpTo(val); \ + else cpu->R[(cpu->CurInstr>>12) & 0xF] = val; \ // TODO: user mode #define A_LDRB_POST \ @@ -151,8 +151,8 @@ namespace melonDS::ARMInterpreter cpu->AddCycles_CDI(); \ if (dataabort) return; \ cpu->R[(cpu->CurInstr>>16) & 0xF] += offset; \ - cpu->R[(cpu->CurInstr>>12) & 0xF] = val; \ - if (((cpu->CurInstr>>12) & 0xF) == 15) printf("!! 
LDRB PC %08X\n", cpu->R[15]); \ + if (((cpu->CurInstr>>12) & 0xF) == 15) cpu->JumpTo(val); \ + else cpu->R[(cpu->CurInstr>>12) & 0xF] = val; \ @@ -261,7 +261,9 @@ A_IMPLEMENT_WB_LDRSTR(LDRB) u32 r = (cpu->CurInstr>>12) & 0xF; \ if (r&1) { A_UNK(cpu); return; } /* checkme */ \ if (!cpu->DataRead32 (offset , &cpu->R[r ])) {cpu->AddCycles_CDI(); return;} \ - if (!cpu->DataRead32S(offset+4, &cpu->R[r+1])) {cpu->AddCycles_CDI(); return;} \ + u32 val; if (!cpu->DataRead32S(offset+4, &val)) {cpu->AddCycles_CDI(); return;} \ + if (r == 14) A_UNK(cpu); /* hang??? */ \ + else cpu->R[r+1] = val; \ cpu->AddCycles_CDI(); \ if (cpu->CurInstr & (1<<21)) cpu->R[(cpu->CurInstr>>16) & 0xF] = offset; @@ -271,7 +273,9 @@ A_IMPLEMENT_WB_LDRSTR(LDRB) u32 r = (cpu->CurInstr>>12) & 0xF; \ if (r&1) { A_UNK(cpu); return; } /* checkme */ \ if (!cpu->DataRead32 (addr , &cpu->R[r ])) {cpu->AddCycles_CDI(); return;} \ - if (!cpu->DataRead32S(addr+4, &cpu->R[r+1])) {cpu->AddCycles_CDI(); return;} \ + u32 val; if (!cpu->DataRead32S(addr+4, &val)) {cpu->AddCycles_CDI(); return;} \ + if (r == 14) A_UNK(cpu); /*hang??? */ \ + else cpu->R[r+1] = val; \ cpu->AddCycles_CDI(); \ cpu->R[(cpu->CurInstr>>16) & 0xF] += offset; @@ -301,54 +305,60 @@ A_IMPLEMENT_WB_LDRSTR(LDRB) #define A_LDRH \ offset += cpu->R[(cpu->CurInstr>>16) & 0xF]; \ - bool dataabort = !cpu->DataRead16(offset, &cpu->R[(cpu->CurInstr>>12) & 0xF]); \ + u32 val; bool dataabort = !cpu->DataRead16(offset, &val); \ cpu->AddCycles_CDI(); \ if (dataabort) return; \ - if (((cpu->CurInstr>>12) & 0xF) == 15) printf("!! 
LDRH PC %08X\n", cpu->R[15]); \ + if (((cpu->CurInstr>>12) & 0xF) == 15) cpu->JumpTo(val); \ + else cpu->R[(cpu->CurInstr>>12) & 0xF] = val; \ if (cpu->CurInstr & (1<<21)) cpu->R[(cpu->CurInstr>>16) & 0xF] = offset; #define A_LDRH_POST \ u32 addr = cpu->R[(cpu->CurInstr>>16) & 0xF]; \ - bool dataabort = !cpu->DataRead16(addr, &cpu->R[(cpu->CurInstr>>12) & 0xF]); \ + u32 val; bool dataabort = !cpu->DataRead16(addr, &val); \ cpu->AddCycles_CDI(); \ if (dataabort) return; \ - if (((cpu->CurInstr>>12) & 0xF) == 15) printf("!! LDRH PC %08X\n", cpu->R[15]); \ + if (((cpu->CurInstr>>12) & 0xF) == 15) cpu->JumpTo(val); \ + else cpu->R[(cpu->CurInstr>>12) & 0xF] = val; \ cpu->R[(cpu->CurInstr>>16) & 0xF] += offset; #define A_LDRSB \ offset += cpu->R[(cpu->CurInstr>>16) & 0xF]; \ - bool dataabort = !cpu->DataRead8(offset, &cpu->R[(cpu->CurInstr>>12) & 0xF]); \ + u32 val; bool dataabort = !cpu->DataRead8(offset, &val); \ cpu->AddCycles_CDI(); \ if (dataabort) return; \ - cpu->R[(cpu->CurInstr>>12) & 0xF] = (s32)(s8)cpu->R[(cpu->CurInstr>>12) & 0xF]; \ - if (((cpu->CurInstr>>12) & 0xF) == 15) printf("!! LDRSB PC %08X\n", cpu->R[15]); \ + val = (s32)(s8)val; \ + if (((cpu->CurInstr>>12) & 0xF) == 15) A_UNK(cpu); /* hang??? */ \ + else cpu->R[(cpu->CurInstr>>12) & 0xF] = val; \ if (cpu->CurInstr & (1<<21)) cpu->R[(cpu->CurInstr>>16) & 0xF] = offset; #define A_LDRSB_POST \ u32 addr = cpu->R[(cpu->CurInstr>>16) & 0xF]; \ - bool dataabort = !cpu->DataRead8(addr, &cpu->R[(cpu->CurInstr>>12) & 0xF]); \ + u32 val; bool dataabort = !cpu->DataRead8(addr, &val); \ cpu->AddCycles_CDI(); \ if (dataabort) return; \ - cpu->R[(cpu->CurInstr>>12) & 0xF] = (s32)(s8)cpu->R[(cpu->CurInstr>>12) & 0xF]; \ - if (((cpu->CurInstr>>12) & 0xF) == 15) printf("!! LDRSB PC %08X\n", cpu->R[15]); \ + val = (s32)(s8)val; \ + if (((cpu->CurInstr>>12) & 0xF) == 15) A_UNK(cpu); /* hang??? 
*/ \ + else cpu->R[(cpu->CurInstr>>12) & 0xF] = val; \ cpu->R[(cpu->CurInstr>>16) & 0xF] += offset; #define A_LDRSH \ offset += cpu->R[(cpu->CurInstr>>16) & 0xF]; \ - bool dataabort = !cpu->DataRead16(offset, &cpu->R[(cpu->CurInstr>>12) & 0xF]); \ + u32 val; bool dataabort = !cpu->DataRead16(offset, &val); \ cpu->AddCycles_CDI(); \ if (dataabort) return; \ - cpu->R[(cpu->CurInstr>>12) & 0xF] = (s32)(s16)cpu->R[(cpu->CurInstr>>12) & 0xF]; \ - if (((cpu->CurInstr>>12) & 0xF) == 15) printf("!! LDRSH PC %08X\n", cpu->R[15]); \ + val = (s32)(s16)val; \ + if (((cpu->CurInstr>>12) & 0xF) == 15) A_UNK(cpu); \ + else cpu->R[(cpu->CurInstr>>12) & 0xF] = val; \ if (cpu->CurInstr & (1<<21)) cpu->R[(cpu->CurInstr>>16) & 0xF] = offset; #define A_LDRSH_POST \ u32 addr = cpu->R[(cpu->CurInstr>>16) & 0xF]; \ - bool dataabort = !cpu->DataRead16(addr, &cpu->R[(cpu->CurInstr>>12) & 0xF]); \ + u32 val; bool dataabort = !cpu->DataRead16(addr, &val); \ cpu->AddCycles_CDI(); \ if (dataabort) return; \ - cpu->R[(cpu->CurInstr>>12) & 0xF] = (s32)(s16)cpu->R[(cpu->CurInstr>>12) & 0xF]; \ - if (((cpu->CurInstr>>12) & 0xF) == 15) printf("!! 
LDRSH PC %08X\n", cpu->R[15]); \ + val = (s32)(s16)val; \ + if (((cpu->CurInstr>>12) & 0xF) == 15) A_UNK(cpu); \ + else cpu->R[(cpu->CurInstr>>12) & 0xF] = val; \ cpu->R[(cpu->CurInstr>>16) & 0xF] += offset; @@ -398,8 +408,9 @@ void A_SWP(ARM* cpu) u32 numD = cpu->DataCycles; if (cpu->DataWrite32(base, rm)) { - // rd only gets updated if both read and write succeed - cpu->R[(cpu->CurInstr >> 12) & 0xF] = ROR(val, 8*(base&0x3)); + // rd only gets updated if both read and write succeed, and if rd isn't r15 + u32 rd = (cpu->CurInstr >> 12) & 0xF; + if (rd != 15) cpu->R[rd] = ROR(val, 8*(base&0x3)); } cpu->DataCycles += numD; } @@ -418,8 +429,9 @@ void A_SWPB(ARM* cpu) u32 numD = cpu->DataCycles; if (cpu->DataWrite8(base, rm)) { - // rd only gets updated if both read and write succeed - cpu->R[(cpu->CurInstr >> 12) & 0xF] = val; + // rd only gets updated if both read and write succeed, and if rd isn't r15 + u32 rd = (cpu->CurInstr >> 12) & 0xF; + if (rd != 15) cpu->R[rd] = val; } cpu->DataCycles += numD; } From 8191f92bb639a51f21ca2680113574a1970a9ccc Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Sat, 8 Jun 2024 10:42:19 -0400 Subject: [PATCH 057/306] mcr is also affected --- src/ARMInterpreter.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/ARMInterpreter.cpp b/src/ARMInterpreter.cpp index ff73e230..6da76b16 100644 --- a/src/ARMInterpreter.cpp +++ b/src/ARMInterpreter.cpp @@ -216,10 +216,12 @@ void A_MCR(ARM* cpu) u32 cn = (cpu->CurInstr >> 16) & 0xF; u32 cm = cpu->CurInstr & 0xF; u32 cpinfo = (cpu->CurInstr >> 5) & 0x7; + u32 val = cpu->R[(cpu->CurInstr>>12)&0xF]; + if (((cpu->CurInstr>>12) & 0xF) == 15) val += 4; if (cpu->Num==0 && cp==15) { - ((ARMv5*)cpu)->CP15Write((cn<<8)|(cm<<4)|cpinfo, cpu->R[(cpu->CurInstr>>12)&0xF]); + ((ARMv5*)cpu)->CP15Write((cn<<8)|(cm<<4)|cpinfo, val); } else if (cpu->Num==1 && cp==14) { From 5f97dfc1ab0d21726acbb425081bd4579a510d1c Mon Sep 17 00:00:00 2001 From: 
Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Sat, 8 Jun 2024 10:53:22 -0400 Subject: [PATCH 058/306] fix bits fixed to 0 for pu region sizing being set --- src/CP15.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/CP15.cpp b/src/CP15.cpp index 34c8addf..7b11696b 100644 --- a/src/CP15.cpp +++ b/src/CP15.cpp @@ -579,7 +579,7 @@ void ARMv5::CP15Write(u32 id, u32 val) case 0x670: case 0x671: char log_output[1024]; - PU_Region[(id >> 4) & 0xF] = val; + PU_Region[(id >> 4) & 0xF] = val & ~(0x3F<<6); std::snprintf(log_output, sizeof(log_output), From 3699768ac9657426da6b012ce93714e3823a24b8 Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Sat, 8 Jun 2024 13:53:12 -0400 Subject: [PATCH 059/306] most cpsr bits can't actually be updated (or at least can't be read?) --- src/ARMInterpreter.cpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/ARMInterpreter.cpp b/src/ARMInterpreter.cpp index 6da76b16..d6c3a488 100644 --- a/src/ARMInterpreter.cpp +++ b/src/ARMInterpreter.cpp @@ -101,9 +101,9 @@ void A_MSR_IMM(ARM* cpu) u32 mask = 0; if (cpu->CurInstr & (1<<16)) mask |= 0x000000FF; - if (cpu->CurInstr & (1<<17)) mask |= 0x0000FF00; - if (cpu->CurInstr & (1<<18)) mask |= 0x00FF0000; - if (cpu->CurInstr & (1<<19)) mask |= 0xFF000000; + //if (cpu->CurInstr & (1<<17)) mask |= 0x0000FF00; + //if (cpu->CurInstr & (1<<18)) mask |= 0x00FF0000; + if (cpu->CurInstr & (1<<19)) mask |= (cpu->Num ? 
0xF0000000 /* checkme */ : 0xF8000000); if (!(cpu->CurInstr & (1<<22))) mask &= 0xFFFFFFDF; @@ -154,9 +154,9 @@ void A_MSR_REG(ARM* cpu) u32 mask = 0; if (cpu->CurInstr & (1<<16)) mask |= 0x000000FF; - if (cpu->CurInstr & (1<<17)) mask |= 0x0000FF00; - if (cpu->CurInstr & (1<<18)) mask |= 0x00FF0000; - if (cpu->CurInstr & (1<<19)) mask |= 0xFF000000; + //if (cpu->CurInstr & (1<<17)) mask |= 0x0000FF00; + //if (cpu->CurInstr & (1<<18)) mask |= 0x00FF0000; + if (cpu->CurInstr & (1<<19)) mask |= (cpu->Num ? 0xF0000000 /* checkme */ : 0xF8000000); if (!(cpu->CurInstr & (1<<22))) mask &= 0xFFFFFFDF; From 659763f903b3517e7bbb6f76c2eadf1232cf93eb Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Sat, 8 Jun 2024 16:15:02 -0400 Subject: [PATCH 060/306] clarification --- src/ARMInterpreter_LoadStore.cpp | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/src/ARMInterpreter_LoadStore.cpp b/src/ARMInterpreter_LoadStore.cpp index aa4a90eb..19136cce 100644 --- a/src/ARMInterpreter_LoadStore.cpp +++ b/src/ARMInterpreter_LoadStore.cpp @@ -117,7 +117,7 @@ namespace melonDS::ARMInterpreter cpu->R[(cpu->CurInstr>>12) & 0xF] = val; \ } -// TODO: user mode +// TODO: user mode (note: ldrt w/ rd = 15 may be an undef instr) #define A_LDR_POST \ u32 addr = cpu->R[(cpu->CurInstr>>16) & 0xF]; \ u32 val; bool dataabort = !cpu->DataRead32(addr, &val); \ @@ -144,7 +144,7 @@ namespace melonDS::ARMInterpreter if (((cpu->CurInstr>>12) & 0xF) == 15) cpu->JumpTo(val); \ else cpu->R[(cpu->CurInstr>>12) & 0xF] = val; \ -// TODO: user mode +// TODO: user mode (note: ldrbt w/ rd = 15 may be an undef instr) #define A_LDRB_POST \ u32 addr = cpu->R[(cpu->CurInstr>>16) & 0xF]; \ u32 val; bool dataabort = !cpu->DataRead8(addr, &val); \ @@ -262,7 +262,7 @@ A_IMPLEMENT_WB_LDRSTR(LDRB) if (r&1) { A_UNK(cpu); return; } /* checkme */ \ if (!cpu->DataRead32 (offset , &cpu->R[r ])) {cpu->AddCycles_CDI(); return;} \ u32 val; if 
(!cpu->DataRead32S(offset+4, &val)) {cpu->AddCycles_CDI(); return;} \ - if (r == 14) A_UNK(cpu); /* hang??? */ \ + if (r == 14) A_UNK(cpu); /* checkme */ \ else cpu->R[r+1] = val; \ cpu->AddCycles_CDI(); \ if (cpu->CurInstr & (1<<21)) cpu->R[(cpu->CurInstr>>16) & 0xF] = offset; @@ -274,7 +274,7 @@ A_IMPLEMENT_WB_LDRSTR(LDRB) if (r&1) { A_UNK(cpu); return; } /* checkme */ \ if (!cpu->DataRead32 (addr , &cpu->R[r ])) {cpu->AddCycles_CDI(); return;} \ u32 val; if (!cpu->DataRead32S(addr+4, &val)) {cpu->AddCycles_CDI(); return;} \ - if (r == 14) A_UNK(cpu); /*hang??? */ \ + if (r == 14) A_UNK(cpu); /* checkme */ \ else cpu->R[r+1] = val; \ cpu->AddCycles_CDI(); \ cpu->R[(cpu->CurInstr>>16) & 0xF] += offset; @@ -327,7 +327,7 @@ A_IMPLEMENT_WB_LDRSTR(LDRB) cpu->AddCycles_CDI(); \ if (dataabort) return; \ val = (s32)(s8)val; \ - if (((cpu->CurInstr>>12) & 0xF) == 15) A_UNK(cpu); /* hang??? */ \ + if (((cpu->CurInstr>>12) & 0xF) == 15) A_UNK(cpu); \ else cpu->R[(cpu->CurInstr>>12) & 0xF] = val; \ if (cpu->CurInstr & (1<<21)) cpu->R[(cpu->CurInstr>>16) & 0xF] = offset; @@ -337,7 +337,7 @@ A_IMPLEMENT_WB_LDRSTR(LDRB) cpu->AddCycles_CDI(); \ if (dataabort) return; \ val = (s32)(s8)val; \ - if (((cpu->CurInstr>>12) & 0xF) == 15) A_UNK(cpu); /* hang??? 
*/ \ + if (((cpu->CurInstr>>12) & 0xF) == 15) A_UNK(cpu); \ else cpu->R[(cpu->CurInstr>>12) & 0xF] = val; \ cpu->R[(cpu->CurInstr>>16) & 0xF] += offset; @@ -347,7 +347,7 @@ A_IMPLEMENT_WB_LDRSTR(LDRB) cpu->AddCycles_CDI(); \ if (dataabort) return; \ val = (s32)(s16)val; \ - if (((cpu->CurInstr>>12) & 0xF) == 15) A_UNK(cpu); \ + if (((cpu->CurInstr>>12) & 0xF) == 15) A_UNK(cpu); /* checkme */ \ else cpu->R[(cpu->CurInstr>>12) & 0xF] = val; \ if (cpu->CurInstr & (1<<21)) cpu->R[(cpu->CurInstr>>16) & 0xF] = offset; @@ -357,7 +357,7 @@ A_IMPLEMENT_WB_LDRSTR(LDRB) cpu->AddCycles_CDI(); \ if (dataabort) return; \ val = (s32)(s16)val; \ - if (((cpu->CurInstr>>12) & 0xF) == 15) A_UNK(cpu); \ + if (((cpu->CurInstr>>12) & 0xF) == 15) A_UNK(cpu); /* checkme */ \ else cpu->R[(cpu->CurInstr>>12) & 0xF] = val; \ cpu->R[(cpu->CurInstr>>16) & 0xF] += offset; From 849d4e51acd9b934ee3184e521acf1441d75c66d Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Sat, 8 Jun 2024 22:12:44 -0400 Subject: [PATCH 061/306] imma be real, i have no idea what is going on here --- src/ARM.cpp | 38 ++++++++++++++++++++++++++++++++ src/ARM.h | 13 ++++++++--- src/ARMInterpreter_LoadStore.cpp | 24 ++++++++++---------- 3 files changed, 60 insertions(+), 15 deletions(-) diff --git a/src/ARM.cpp b/src/ARM.cpp index 0d2976d2..a9c2d124 100644 --- a/src/ARM.cpp +++ b/src/ARM.cpp @@ -177,6 +177,8 @@ void ARM::Reset() ExceptionBase = Num ? 
0x00000000 : 0xFFFF0000; + BuggyJump = 0; + CodeMem.Mem = NULL; #ifdef JIT_ENABLED @@ -284,6 +286,32 @@ void ARM::SetupCodeMem(u32 addr) } } +void ARMv5::BuggedJumpTo32(const u32 addr) +{ + if (BuggyJump == 1) + { + BuggyJump = 2; + JumpTo(addr); + } + else + { + JumpTo(addr & ~0x1); + } +} + +void ARMv5::BuggedJumpTo(const u32 addr) +{ + if ((BuggyJump == 0) && (addr & 0x3)) + { + BuggyJump = 1; + PrefetchAbort(); // checkme + } + else + { + JumpTo(addr); + } +} + void ARMv5::JumpTo(u32 addr, bool restorecpsr) { if (restorecpsr) @@ -352,6 +380,16 @@ void ARMv5::JumpTo(u32 addr, bool restorecpsr) NDS.MonitorARM9Jump(addr); } +void ARMv4::BuggedJumpTo32(const u32 addr) +{ + JumpTo(addr); // todo +} + +void ARMv4::BuggedJumpTo(const u32 addr) +{ + JumpTo(addr); // todo +} + void ARMv4::JumpTo(u32 addr, bool restorecpsr) { if (restorecpsr) diff --git a/src/ARM.h b/src/ARM.h index 1f68567c..9cda0be1 100644 --- a/src/ARM.h +++ b/src/ARM.h @@ -64,7 +64,9 @@ public: virtual void DoSavestate(Savestate* file); virtual void FillPipeline() = 0; - + + virtual void BuggedJumpTo32(const u32 addr) = 0; + virtual void BuggedJumpTo(const u32 addr) = 0; virtual void JumpTo(u32 addr, bool restorecpsr = false) = 0; void RestoreCPSR(); @@ -173,6 +175,7 @@ public: u32 R_UND[3]; u32 CurInstr; u32 NextInstr[2]; + u32 BuggyJump; u32 ExceptionBase; @@ -235,7 +238,9 @@ public: void UpdateRegionTimings(u32 addrstart, u32 addrend); void FillPipeline() override; - + + void BuggedJumpTo32(const u32 addr) override; + void BuggedJumpTo(const u32 addr) override; void JumpTo(u32 addr, bool restorecpsr = false) override; void PrefetchAbort(); @@ -380,7 +385,9 @@ public: ARMv4(melonDS::NDS& nds, std::optional gdb, bool jit); void FillPipeline() override; - + + void BuggedJumpTo32(const u32 addr) override; + void BuggedJumpTo(const u32 addr) override; void JumpTo(u32 addr, bool restorecpsr = false) override; void Execute() override; diff --git a/src/ARMInterpreter_LoadStore.cpp 
b/src/ARMInterpreter_LoadStore.cpp index 19136cce..b5a3ee63 100644 --- a/src/ARMInterpreter_LoadStore.cpp +++ b/src/ARMInterpreter_LoadStore.cpp @@ -141,8 +141,8 @@ namespace melonDS::ARMInterpreter cpu->AddCycles_CDI(); \ if (dataabort) return; \ if (cpu->CurInstr & (1<<21)) cpu->R[(cpu->CurInstr>>16) & 0xF] = offset; \ - if (((cpu->CurInstr>>12) & 0xF) == 15) cpu->JumpTo(val); \ - else cpu->R[(cpu->CurInstr>>12) & 0xF] = val; \ + if (((cpu->CurInstr>>12) & 0xF) == 15) cpu->BuggedJumpTo(val); \ + else cpu->R[(cpu->CurInstr>>12) & 0xF] = val; // TODO: user mode (note: ldrbt w/ rd = 15 may be an undef instr) #define A_LDRB_POST \ @@ -151,8 +151,8 @@ namespace melonDS::ARMInterpreter cpu->AddCycles_CDI(); \ if (dataabort) return; \ cpu->R[(cpu->CurInstr>>16) & 0xF] += offset; \ - if (((cpu->CurInstr>>12) & 0xF) == 15) cpu->JumpTo(val); \ - else cpu->R[(cpu->CurInstr>>12) & 0xF] = val; \ + if (((cpu->CurInstr>>12) & 0xF) == 15) cpu->BuggedJumpTo(val); \ + else cpu->R[(cpu->CurInstr>>12) & 0xF] = val; @@ -262,7 +262,7 @@ A_IMPLEMENT_WB_LDRSTR(LDRB) if (r&1) { A_UNK(cpu); return; } /* checkme */ \ if (!cpu->DataRead32 (offset , &cpu->R[r ])) {cpu->AddCycles_CDI(); return;} \ u32 val; if (!cpu->DataRead32S(offset+4, &val)) {cpu->AddCycles_CDI(); return;} \ - if (r == 14) A_UNK(cpu); /* checkme */ \ + if (r == 14) cpu->BuggedJumpTo32(val); \ else cpu->R[r+1] = val; \ cpu->AddCycles_CDI(); \ if (cpu->CurInstr & (1<<21)) cpu->R[(cpu->CurInstr>>16) & 0xF] = offset; @@ -274,7 +274,7 @@ A_IMPLEMENT_WB_LDRSTR(LDRB) if (r&1) { A_UNK(cpu); return; } /* checkme */ \ if (!cpu->DataRead32 (addr , &cpu->R[r ])) {cpu->AddCycles_CDI(); return;} \ u32 val; if (!cpu->DataRead32S(addr+4, &val)) {cpu->AddCycles_CDI(); return;} \ - if (r == 14) A_UNK(cpu); /* checkme */ \ + if (r == 14) cpu->BuggedJumpTo32(val); \ else cpu->R[r+1] = val; \ cpu->AddCycles_CDI(); \ cpu->R[(cpu->CurInstr>>16) & 0xF] += offset; @@ -308,7 +308,7 @@ A_IMPLEMENT_WB_LDRSTR(LDRB) u32 val; bool dataabort = 
!cpu->DataRead16(offset, &val); \ cpu->AddCycles_CDI(); \ if (dataabort) return; \ - if (((cpu->CurInstr>>12) & 0xF) == 15) cpu->JumpTo(val); \ + if (((cpu->CurInstr>>12) & 0xF) == 15) cpu->BuggedJumpTo(val); \ else cpu->R[(cpu->CurInstr>>12) & 0xF] = val; \ if (cpu->CurInstr & (1<<21)) cpu->R[(cpu->CurInstr>>16) & 0xF] = offset; @@ -317,7 +317,7 @@ A_IMPLEMENT_WB_LDRSTR(LDRB) u32 val; bool dataabort = !cpu->DataRead16(addr, &val); \ cpu->AddCycles_CDI(); \ if (dataabort) return; \ - if (((cpu->CurInstr>>12) & 0xF) == 15) cpu->JumpTo(val); \ + if (((cpu->CurInstr>>12) & 0xF) == 15) cpu->BuggedJumpTo(val); \ else cpu->R[(cpu->CurInstr>>12) & 0xF] = val; \ cpu->R[(cpu->CurInstr>>16) & 0xF] += offset; @@ -327,7 +327,7 @@ A_IMPLEMENT_WB_LDRSTR(LDRB) cpu->AddCycles_CDI(); \ if (dataabort) return; \ val = (s32)(s8)val; \ - if (((cpu->CurInstr>>12) & 0xF) == 15) A_UNK(cpu); \ + if (((cpu->CurInstr>>12) & 0xF) == 15) cpu->BuggedJumpTo(val); \ else cpu->R[(cpu->CurInstr>>12) & 0xF] = val; \ if (cpu->CurInstr & (1<<21)) cpu->R[(cpu->CurInstr>>16) & 0xF] = offset; @@ -337,7 +337,7 @@ A_IMPLEMENT_WB_LDRSTR(LDRB) cpu->AddCycles_CDI(); \ if (dataabort) return; \ val = (s32)(s8)val; \ - if (((cpu->CurInstr>>12) & 0xF) == 15) A_UNK(cpu); \ + if (((cpu->CurInstr>>12) & 0xF) == 15) cpu->BuggedJumpTo(val); \ else cpu->R[(cpu->CurInstr>>12) & 0xF] = val; \ cpu->R[(cpu->CurInstr>>16) & 0xF] += offset; @@ -347,7 +347,7 @@ A_IMPLEMENT_WB_LDRSTR(LDRB) cpu->AddCycles_CDI(); \ if (dataabort) return; \ val = (s32)(s16)val; \ - if (((cpu->CurInstr>>12) & 0xF) == 15) A_UNK(cpu); /* checkme */ \ + if (((cpu->CurInstr>>12) & 0xF) == 15) cpu->BuggedJumpTo(val); \ else cpu->R[(cpu->CurInstr>>12) & 0xF] = val; \ if (cpu->CurInstr & (1<<21)) cpu->R[(cpu->CurInstr>>16) & 0xF] = offset; @@ -357,7 +357,7 @@ A_IMPLEMENT_WB_LDRSTR(LDRB) cpu->AddCycles_CDI(); \ if (dataabort) return; \ val = (s32)(s16)val; \ - if (((cpu->CurInstr>>12) & 0xF) == 15) A_UNK(cpu); /* checkme */ \ + if (((cpu->CurInstr>>12) & 
0xF) == 15) cpu->BuggedJumpTo(val); \ else cpu->R[(cpu->CurInstr>>12) & 0xF] = val; \ cpu->R[(cpu->CurInstr>>16) & 0xF] += offset; From b846c6f100b53fc9b546e5a0acf870734e5f1e07 Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Sat, 8 Jun 2024 22:17:07 -0400 Subject: [PATCH 062/306] remove out of date comments --- src/ARMInterpreter_LoadStore.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/ARMInterpreter_LoadStore.cpp b/src/ARMInterpreter_LoadStore.cpp index b5a3ee63..2f2a7912 100644 --- a/src/ARMInterpreter_LoadStore.cpp +++ b/src/ARMInterpreter_LoadStore.cpp @@ -117,7 +117,7 @@ namespace melonDS::ARMInterpreter cpu->R[(cpu->CurInstr>>12) & 0xF] = val; \ } -// TODO: user mode (note: ldrt w/ rd = 15 may be an undef instr) +// TODO: user mode #define A_LDR_POST \ u32 addr = cpu->R[(cpu->CurInstr>>16) & 0xF]; \ u32 val; bool dataabort = !cpu->DataRead32(addr, &val); \ @@ -144,7 +144,7 @@ namespace melonDS::ARMInterpreter if (((cpu->CurInstr>>12) & 0xF) == 15) cpu->BuggedJumpTo(val); \ else cpu->R[(cpu->CurInstr>>12) & 0xF] = val; -// TODO: user mode (note: ldrbt w/ rd = 15 may be an undef instr) +// TODO: user mode #define A_LDRB_POST \ u32 addr = cpu->R[(cpu->CurInstr>>16) & 0xF]; \ u32 val; bool dataabort = !cpu->DataRead8(addr, &val); \ From be60c68aeb66e918686c5a9d0d5729af79c5cf6c Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Sun, 9 Jun 2024 07:25:42 -0400 Subject: [PATCH 063/306] more weirdness --- src/ARM.cpp | 38 +++++++++++++++++++++++++++++--------- 1 file changed, 29 insertions(+), 9 deletions(-) diff --git a/src/ARM.cpp b/src/ARM.cpp index a9c2d124..6ec14682 100644 --- a/src/ARM.cpp +++ b/src/ARM.cpp @@ -288,11 +288,19 @@ void ARM::SetupCodeMem(u32 addr) void ARMv5::BuggedJumpTo32(const u32 addr) { + // ldrd to pc + // behavior seems to be related to if a bugged 8/16 bit write has prefetch aborted (does any p.abort work?) 
+ // switching to thumb mode only seems to work the first time after one of the above aborts? + // writing to pc seems to fail entirely if an abort hasn't occured and thumb interworking is in v4 mode if (BuggyJump == 1) { BuggyJump = 2; JumpTo(addr); } + else if ((BuggyJump == 0) && (CP15Control & (1<<15))) + { + return; // checkme + } else { JumpTo(addr & ~0x1); @@ -301,15 +309,27 @@ void ARMv5::BuggedJumpTo32(const u32 addr) void ARMv5::BuggedJumpTo(const u32 addr) { - if ((BuggyJump == 0) && (addr & 0x3)) + // 16 and 8 bit loads (signed instructions included) to pc + // if they're misaligned they'll prefetch abort + // but they can only prefetch abort once, every time afterwards will succeed (more testing needed) + // if the lsb is set they will try to switch to thumb state, though it'll fail if they haven't prefetch aborted yet + // they work as expected if thumb interwork is set to v4 mode + if (BuggyJump == 0) { - BuggyJump = 1; - PrefetchAbort(); // checkme - } - else - { - JumpTo(addr); + if (CP15Control & (1<<15)) + { + JumpTo(addr & ~1); + return; + } + else if (addr & 0x3) + { + if (addr & 0x1) CPSR |= 0x20; + BuggyJump = 1; + PrefetchAbort(); + return; + } } + JumpTo(addr); } void ARMv5::JumpTo(u32 addr, bool restorecpsr) @@ -382,12 +402,12 @@ void ARMv5::JumpTo(u32 addr, bool restorecpsr) void ARMv4::BuggedJumpTo32(const u32 addr) { - JumpTo(addr); // todo + JumpTo(addr & ~1); // todo } void ARMv4::BuggedJumpTo(const u32 addr) { - JumpTo(addr); // todo + JumpTo(addr & ~1); // todo } void ARMv4::JumpTo(u32 addr, bool restorecpsr) From b90d5c23200d5fda12c11a9e06de98b426c06332 Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Sun, 9 Jun 2024 12:18:31 -0400 Subject: [PATCH 064/306] what the actual F*** is going on --- src/ARM.cpp | 41 ++++++++++++++++++----------------------- 1 file changed, 18 insertions(+), 23 deletions(-) diff --git a/src/ARM.cpp b/src/ARM.cpp index 6ec14682..94f2debf 100644 --- a/src/ARM.cpp +++ 
b/src/ARM.cpp @@ -290,20 +290,20 @@ void ARMv5::BuggedJumpTo32(const u32 addr) { // ldrd to pc // behavior seems to be related to if a bugged 8/16 bit write has prefetch aborted (does any p.abort work?) - // switching to thumb mode only seems to work the first time after one of the above aborts? - // writing to pc seems to fail entirely if an abort hasn't occured and thumb interworking is in v4 mode + // switching to thumb mode only seems to work the first time an ldrd pc is executed after one of the above aborts? + // also it can restore cpsr but only if the PU is disabled (?????????????????????????????????????) if (BuggyJump == 1) { BuggyJump = 2; - JumpTo(addr); - } - else if ((BuggyJump == 0) && (CP15Control & (1<<15))) - { - return; // checkme + + if (CP15Control & (1<<15)) + JumpTo(addr & ~0x1, !(CP15Control & 1)); + else + JumpTo(addr, !(CP15Control & 1)); } else { - JumpTo(addr & ~0x1); + JumpTo(addr & ~0x1, !(CP15Control & 1)); } } @@ -313,23 +313,18 @@ void ARMv5::BuggedJumpTo(const u32 addr) // if they're misaligned they'll prefetch abort // but they can only prefetch abort once, every time afterwards will succeed (more testing needed) // if the lsb is set they will try to switch to thumb state, though it'll fail if they haven't prefetch aborted yet - // they work as expected if thumb interwork is set to v4 mode - if (BuggyJump == 0) + if ((BuggyJump == 0) && (addr & 0x3)) { - if (CP15Control & (1<<15)) - { - JumpTo(addr & ~1); - return; - } - else if (addr & 0x3) - { - if (addr & 0x1) CPSR |= 0x20; - BuggyJump = 1; - PrefetchAbort(); - return; - } + if (addr & 0x1) CPSR |= 0x20; + BuggyJump = 1; + PrefetchAbort(); + return; } - JumpTo(addr); + + if (CP15Control & (1<<15)) + JumpTo(addr & ~0x1); + else + JumpTo(addr); } void ARMv5::JumpTo(u32 addr, bool restorecpsr) From ae0824fdd35e282bf6a6b5787f6585f21eda1ae7 Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Sun, 9 Jun 2024 19:10:43 -0400 Subject: [PATCH 065/306] 
it all makes sense now... --- src/ARM.cpp | 71 +++++++++++--------------------- src/ARM.h | 9 ++-- src/ARMInterpreter_LoadStore.cpp | 20 ++++----- 3 files changed, 36 insertions(+), 64 deletions(-) diff --git a/src/ARM.cpp b/src/ARM.cpp index 94f2debf..7bfb95a2 100644 --- a/src/ARM.cpp +++ b/src/ARM.cpp @@ -286,47 +286,6 @@ void ARM::SetupCodeMem(u32 addr) } } -void ARMv5::BuggedJumpTo32(const u32 addr) -{ - // ldrd to pc - // behavior seems to be related to if a bugged 8/16 bit write has prefetch aborted (does any p.abort work?) - // switching to thumb mode only seems to work the first time an ldrd pc is executed after one of the above aborts? - // also it can restore cpsr but only if the PU is disabled (?????????????????????????????????????) - if (BuggyJump == 1) - { - BuggyJump = 2; - - if (CP15Control & (1<<15)) - JumpTo(addr & ~0x1, !(CP15Control & 1)); - else - JumpTo(addr, !(CP15Control & 1)); - } - else - { - JumpTo(addr & ~0x1, !(CP15Control & 1)); - } -} - -void ARMv5::BuggedJumpTo(const u32 addr) -{ - // 16 and 8 bit loads (signed instructions included) to pc - // if they're misaligned they'll prefetch abort - // but they can only prefetch abort once, every time afterwards will succeed (more testing needed) - // if the lsb is set they will try to switch to thumb state, though it'll fail if they haven't prefetch aborted yet - if ((BuggyJump == 0) && (addr & 0x3)) - { - if (addr & 0x1) CPSR |= 0x20; - BuggyJump = 1; - PrefetchAbort(); - return; - } - - if (CP15Control & (1<<15)) - JumpTo(addr & ~0x1); - else - JumpTo(addr); -} - void ARMv5::JumpTo(u32 addr, bool restorecpsr) { if (restorecpsr) @@ -395,14 +354,25 @@ void ARMv5::JumpTo(u32 addr, bool restorecpsr) NDS.MonitorARM9Jump(addr); } -void ARMv4::BuggedJumpTo32(const u32 addr) +void ARMv5::JumpTo8_16Bit(const u32 addr) { - JumpTo(addr & ~1); // todo -} - -void ARMv4::BuggedJumpTo(const u32 addr) -{ - JumpTo(addr & ~1); // todo + // 8 and 16 loads (signed included) to pc + if (!(CP15Control & 0x1)) + 
{ + // if the pu is disabled it behaves like a normal jump + JumpTo((CP15Control & (1<<15)) ? (addr & ~0x1) : addr); + } + else + { + if (addr & 0x3) + { + // if the pu is enabled it will always prefetch abort if not word aligned + // although it will still attempt (and fail) to enter thumb mode if enabled + if ((addr & 0x1) && !(CP15Control & (1<<15))) CPSR |= 0x20; + PrefetchAbort(); + } + else JumpTo(addr); + } } void ARMv4::JumpTo(u32 addr, bool restorecpsr) @@ -449,6 +419,11 @@ void ARMv4::JumpTo(u32 addr, bool restorecpsr) } } +void ARMv4::JumpTo8_16Bit(const u32 addr) +{ + JumpTo(addr & ~1); // checkme? +} + void ARM::RestoreCPSR() { u32 oldcpsr = CPSR; diff --git a/src/ARM.h b/src/ARM.h index 9cda0be1..9b0511a3 100644 --- a/src/ARM.h +++ b/src/ARM.h @@ -65,9 +65,8 @@ public: virtual void FillPipeline() = 0; - virtual void BuggedJumpTo32(const u32 addr) = 0; - virtual void BuggedJumpTo(const u32 addr) = 0; virtual void JumpTo(u32 addr, bool restorecpsr = false) = 0; + virtual void JumpTo8_16Bit(u32 addr) = 0; void RestoreCPSR(); void Halt(u32 halt) @@ -239,9 +238,8 @@ public: void FillPipeline() override; - void BuggedJumpTo32(const u32 addr) override; - void BuggedJumpTo(const u32 addr) override; void JumpTo(u32 addr, bool restorecpsr = false) override; + void JumpTo8_16Bit(const u32 addr) override; void PrefetchAbort(); void DataAbort(); @@ -386,9 +384,8 @@ public: void FillPipeline() override; - void BuggedJumpTo32(const u32 addr) override; - void BuggedJumpTo(const u32 addr) override; void JumpTo(u32 addr, bool restorecpsr = false) override; + void JumpTo8_16Bit(const u32 addr) override; void Execute() override; #ifdef JIT_ENABLED diff --git a/src/ARMInterpreter_LoadStore.cpp b/src/ARMInterpreter_LoadStore.cpp index 2f2a7912..8c96967e 100644 --- a/src/ARMInterpreter_LoadStore.cpp +++ b/src/ARMInterpreter_LoadStore.cpp @@ -141,7 +141,7 @@ namespace melonDS::ARMInterpreter cpu->AddCycles_CDI(); \ if (dataabort) return; \ if (cpu->CurInstr & (1<<21)) 
cpu->R[(cpu->CurInstr>>16) & 0xF] = offset; \ - if (((cpu->CurInstr>>12) & 0xF) == 15) cpu->BuggedJumpTo(val); \ + if (((cpu->CurInstr>>12) & 0xF) == 15) cpu->JumpTo8_16Bit(val); \ else cpu->R[(cpu->CurInstr>>12) & 0xF] = val; // TODO: user mode @@ -151,7 +151,7 @@ namespace melonDS::ARMInterpreter cpu->AddCycles_CDI(); \ if (dataabort) return; \ cpu->R[(cpu->CurInstr>>16) & 0xF] += offset; \ - if (((cpu->CurInstr>>12) & 0xF) == 15) cpu->BuggedJumpTo(val); \ + if (((cpu->CurInstr>>12) & 0xF) == 15) cpu->JumpTo8_16Bit(val); \ else cpu->R[(cpu->CurInstr>>12) & 0xF] = val; @@ -262,7 +262,7 @@ A_IMPLEMENT_WB_LDRSTR(LDRB) if (r&1) { A_UNK(cpu); return; } /* checkme */ \ if (!cpu->DataRead32 (offset , &cpu->R[r ])) {cpu->AddCycles_CDI(); return;} \ u32 val; if (!cpu->DataRead32S(offset+4, &val)) {cpu->AddCycles_CDI(); return;} \ - if (r == 14) cpu->BuggedJumpTo32(val); \ + if (r == 14) cpu->JumpTo(((((ARMv5*)cpu)->CP15Control & (1<<15)) ? (val & ~0x1) : val), true); /* restores cpsr for some reason? */ \ else cpu->R[r+1] = val; \ cpu->AddCycles_CDI(); \ if (cpu->CurInstr & (1<<21)) cpu->R[(cpu->CurInstr>>16) & 0xF] = offset; @@ -274,7 +274,7 @@ A_IMPLEMENT_WB_LDRSTR(LDRB) if (r&1) { A_UNK(cpu); return; } /* checkme */ \ if (!cpu->DataRead32 (addr , &cpu->R[r ])) {cpu->AddCycles_CDI(); return;} \ u32 val; if (!cpu->DataRead32S(addr+4, &val)) {cpu->AddCycles_CDI(); return;} \ - if (r == 14) cpu->BuggedJumpTo32(val); \ + if (r == 14) cpu->JumpTo(((((ARMv5*)cpu)->CP15Control & (1<<15)) ? (val & ~0x1) : val), true); /* restores cpsr for some reason? 
*/ \ else cpu->R[r+1] = val; \ cpu->AddCycles_CDI(); \ cpu->R[(cpu->CurInstr>>16) & 0xF] += offset; @@ -308,7 +308,7 @@ A_IMPLEMENT_WB_LDRSTR(LDRB) u32 val; bool dataabort = !cpu->DataRead16(offset, &val); \ cpu->AddCycles_CDI(); \ if (dataabort) return; \ - if (((cpu->CurInstr>>12) & 0xF) == 15) cpu->BuggedJumpTo(val); \ + if (((cpu->CurInstr>>12) & 0xF) == 15) cpu->JumpTo8_16Bit(val); \ else cpu->R[(cpu->CurInstr>>12) & 0xF] = val; \ if (cpu->CurInstr & (1<<21)) cpu->R[(cpu->CurInstr>>16) & 0xF] = offset; @@ -317,7 +317,7 @@ A_IMPLEMENT_WB_LDRSTR(LDRB) u32 val; bool dataabort = !cpu->DataRead16(addr, &val); \ cpu->AddCycles_CDI(); \ if (dataabort) return; \ - if (((cpu->CurInstr>>12) & 0xF) == 15) cpu->BuggedJumpTo(val); \ + if (((cpu->CurInstr>>12) & 0xF) == 15) cpu->JumpTo8_16Bit(val); \ else cpu->R[(cpu->CurInstr>>12) & 0xF] = val; \ cpu->R[(cpu->CurInstr>>16) & 0xF] += offset; @@ -327,7 +327,7 @@ A_IMPLEMENT_WB_LDRSTR(LDRB) cpu->AddCycles_CDI(); \ if (dataabort) return; \ val = (s32)(s8)val; \ - if (((cpu->CurInstr>>12) & 0xF) == 15) cpu->BuggedJumpTo(val); \ + if (((cpu->CurInstr>>12) & 0xF) == 15) cpu->JumpTo8_16Bit(val); \ else cpu->R[(cpu->CurInstr>>12) & 0xF] = val; \ if (cpu->CurInstr & (1<<21)) cpu->R[(cpu->CurInstr>>16) & 0xF] = offset; @@ -337,7 +337,7 @@ A_IMPLEMENT_WB_LDRSTR(LDRB) cpu->AddCycles_CDI(); \ if (dataabort) return; \ val = (s32)(s8)val; \ - if (((cpu->CurInstr>>12) & 0xF) == 15) cpu->BuggedJumpTo(val); \ + if (((cpu->CurInstr>>12) & 0xF) == 15) cpu->JumpTo8_16Bit(val); \ else cpu->R[(cpu->CurInstr>>12) & 0xF] = val; \ cpu->R[(cpu->CurInstr>>16) & 0xF] += offset; @@ -347,7 +347,7 @@ A_IMPLEMENT_WB_LDRSTR(LDRB) cpu->AddCycles_CDI(); \ if (dataabort) return; \ val = (s32)(s16)val; \ - if (((cpu->CurInstr>>12) & 0xF) == 15) cpu->BuggedJumpTo(val); \ + if (((cpu->CurInstr>>12) & 0xF) == 15) cpu->JumpTo8_16Bit(val); \ else cpu->R[(cpu->CurInstr>>12) & 0xF] = val; \ if (cpu->CurInstr & (1<<21)) cpu->R[(cpu->CurInstr>>16) & 0xF] = offset; @@ 
-357,7 +357,7 @@ A_IMPLEMENT_WB_LDRSTR(LDRB) cpu->AddCycles_CDI(); \ if (dataabort) return; \ val = (s32)(s16)val; \ - if (((cpu->CurInstr>>12) & 0xF) == 15) cpu->BuggedJumpTo(val); \ + if (((cpu->CurInstr>>12) & 0xF) == 15) cpu->JumpTo8_16Bit(val); \ else cpu->R[(cpu->CurInstr>>12) & 0xF] = val; \ cpu->R[(cpu->CurInstr>>16) & 0xF] += offset; From ca04710debd4c4a62fa5333c496b22ceadd0cf8c Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Sun, 9 Jun 2024 22:31:10 -0400 Subject: [PATCH 066/306] ldrd is just ldm --- src/ARMInterpreter_LoadStore.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/ARMInterpreter_LoadStore.cpp b/src/ARMInterpreter_LoadStore.cpp index 8c96967e..1f43868f 100644 --- a/src/ARMInterpreter_LoadStore.cpp +++ b/src/ARMInterpreter_LoadStore.cpp @@ -262,7 +262,7 @@ A_IMPLEMENT_WB_LDRSTR(LDRB) if (r&1) { A_UNK(cpu); return; } /* checkme */ \ if (!cpu->DataRead32 (offset , &cpu->R[r ])) {cpu->AddCycles_CDI(); return;} \ u32 val; if (!cpu->DataRead32S(offset+4, &val)) {cpu->AddCycles_CDI(); return;} \ - if (r == 14) cpu->JumpTo(((((ARMv5*)cpu)->CP15Control & (1<<15)) ? (val & ~0x1) : val), true); /* restores cpsr for some reason? */ \ + if (r == 14) cpu->JumpTo(((((ARMv5*)cpu)->CP15Control & (1<<15)) ? (val & ~0x1) : val), cpu->CurInstr & (1<<22)); /* restores cpsr due to shared ldm dna */ \ else cpu->R[r+1] = val; \ cpu->AddCycles_CDI(); \ if (cpu->CurInstr & (1<<21)) cpu->R[(cpu->CurInstr>>16) & 0xF] = offset; @@ -274,7 +274,7 @@ A_IMPLEMENT_WB_LDRSTR(LDRB) if (r&1) { A_UNK(cpu); return; } /* checkme */ \ if (!cpu->DataRead32 (addr , &cpu->R[r ])) {cpu->AddCycles_CDI(); return;} \ u32 val; if (!cpu->DataRead32S(addr+4, &val)) {cpu->AddCycles_CDI(); return;} \ - if (r == 14) cpu->JumpTo(((((ARMv5*)cpu)->CP15Control & (1<<15)) ? (val & ~0x1) : val), true); /* restores cpsr for some reason? */ \ + if (r == 14) cpu->JumpTo(((((ARMv5*)cpu)->CP15Control & (1<<15)) ? 
(val & ~0x1) : val), cpu->CurInstr & (1<<22)); /* restores cpsr due to shared ldm dna */ \ else cpu->R[r+1] = val; \ cpu->AddCycles_CDI(); \ cpu->R[(cpu->CurInstr>>16) & 0xF] += offset; From 3ddccde5b907fa7e379bd5296322858a74bc67ee Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Mon, 10 Jun 2024 13:10:42 -0400 Subject: [PATCH 067/306] verified also remove no longer needed variable --- src/ARM.cpp | 2 -- src/ARM.h | 7 +++---- src/ARMInterpreter_LoadStore.cpp | 8 ++++---- 3 files changed, 7 insertions(+), 10 deletions(-) diff --git a/src/ARM.cpp b/src/ARM.cpp index 7bfb95a2..906a243e 100644 --- a/src/ARM.cpp +++ b/src/ARM.cpp @@ -177,8 +177,6 @@ void ARM::Reset() ExceptionBase = Num ? 0x00000000 : 0xFFFF0000; - BuggyJump = 0; - CodeMem.Mem = NULL; #ifdef JIT_ENABLED diff --git a/src/ARM.h b/src/ARM.h index 9b0511a3..7c5bb671 100644 --- a/src/ARM.h +++ b/src/ARM.h @@ -64,7 +64,7 @@ public: virtual void DoSavestate(Savestate* file); virtual void FillPipeline() = 0; - + virtual void JumpTo(u32 addr, bool restorecpsr = false) = 0; virtual void JumpTo8_16Bit(u32 addr) = 0; void RestoreCPSR(); @@ -174,7 +174,6 @@ public: u32 R_UND[3]; u32 CurInstr; u32 NextInstr[2]; - u32 BuggyJump; u32 ExceptionBase; @@ -237,7 +236,7 @@ public: void UpdateRegionTimings(u32 addrstart, u32 addrend); void FillPipeline() override; - + void JumpTo(u32 addr, bool restorecpsr = false) override; void JumpTo8_16Bit(const u32 addr) override; @@ -383,7 +382,7 @@ public: ARMv4(melonDS::NDS& nds, std::optional gdb, bool jit); void FillPipeline() override; - + void JumpTo(u32 addr, bool restorecpsr = false) override; void JumpTo8_16Bit(const u32 addr) override; diff --git a/src/ARMInterpreter_LoadStore.cpp b/src/ARMInterpreter_LoadStore.cpp index 1f43868f..2e841549 100644 --- a/src/ARMInterpreter_LoadStore.cpp +++ b/src/ARMInterpreter_LoadStore.cpp @@ -259,7 +259,7 @@ A_IMPLEMENT_WB_LDRSTR(LDRB) if (cpu->Num != 0) return; \ offset += cpu->R[(cpu->CurInstr>>16) & 
0xF]; \ u32 r = (cpu->CurInstr>>12) & 0xF; \ - if (r&1) { A_UNK(cpu); return; } /* checkme */ \ + if (r&1) { A_UNK(cpu); return; } \ if (!cpu->DataRead32 (offset , &cpu->R[r ])) {cpu->AddCycles_CDI(); return;} \ u32 val; if (!cpu->DataRead32S(offset+4, &val)) {cpu->AddCycles_CDI(); return;} \ if (r == 14) cpu->JumpTo(((((ARMv5*)cpu)->CP15Control & (1<<15)) ? (val & ~0x1) : val), cpu->CurInstr & (1<<22)); /* restores cpsr due to shared ldm dna */ \ @@ -271,7 +271,7 @@ A_IMPLEMENT_WB_LDRSTR(LDRB) if (cpu->Num != 0) return; \ u32 addr = cpu->R[(cpu->CurInstr>>16) & 0xF]; \ u32 r = (cpu->CurInstr>>12) & 0xF; \ - if (r&1) { A_UNK(cpu); return; } /* checkme */ \ + if (r&1) { A_UNK(cpu); return; } \ if (!cpu->DataRead32 (addr , &cpu->R[r ])) {cpu->AddCycles_CDI(); return;} \ u32 val; if (!cpu->DataRead32S(addr+4, &val)) {cpu->AddCycles_CDI(); return;} \ if (r == 14) cpu->JumpTo(((((ARMv5*)cpu)->CP15Control & (1<<15)) ? (val & ~0x1) : val), cpu->CurInstr & (1<<22)); /* restores cpsr due to shared ldm dna */ \ @@ -283,7 +283,7 @@ A_IMPLEMENT_WB_LDRSTR(LDRB) if (cpu->Num != 0) return; \ offset += cpu->R[(cpu->CurInstr>>16) & 0xF]; \ u32 r = (cpu->CurInstr>>12) & 0xF; \ - if (r&1) { A_UNK(cpu); return; } /* checkme */ \ + if (r&1) { A_UNK(cpu); return; } \ bool dataabort = !cpu->DataWrite32(offset, cpu->R[r]); /* yes, this data abort behavior is on purpose */ \ u32 storeval = cpu->R[r+1]; if (r == 14) storeval+=4; \ dataabort |= !cpu->DataWrite32S (offset+4, storeval, dataabort); /* no, i dont understand it either */ \ @@ -295,7 +295,7 @@ A_IMPLEMENT_WB_LDRSTR(LDRB) if (cpu->Num != 0) return; \ u32 addr = cpu->R[(cpu->CurInstr>>16) & 0xF]; \ u32 r = (cpu->CurInstr>>12) & 0xF; \ - if (r&1) { A_UNK(cpu); return; } /* checkme */ \ + if (r&1) { A_UNK(cpu); return; } \ bool dataabort = !cpu->DataWrite32(addr, cpu->R[r]); \ u32 storeval = cpu->R[r+1]; if (r == 14) storeval+=4; \ dataabort |= !cpu->DataWrite32S (addr+4, storeval, dataabort); \ From 
048b0b8878f610a7e71b5e944614c498b12adb44 Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Mon, 10 Jun 2024 18:03:56 -0400 Subject: [PATCH 068/306] swp/swpb jumps work on the arm 7? --- src/ARMInterpreter_LoadStore.cpp | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/src/ARMInterpreter_LoadStore.cpp b/src/ARMInterpreter_LoadStore.cpp index 2e841549..7a33b8dd 100644 --- a/src/ARMInterpreter_LoadStore.cpp +++ b/src/ARMInterpreter_LoadStore.cpp @@ -262,7 +262,7 @@ A_IMPLEMENT_WB_LDRSTR(LDRB) if (r&1) { A_UNK(cpu); return; } \ if (!cpu->DataRead32 (offset , &cpu->R[r ])) {cpu->AddCycles_CDI(); return;} \ u32 val; if (!cpu->DataRead32S(offset+4, &val)) {cpu->AddCycles_CDI(); return;} \ - if (r == 14) cpu->JumpTo(((((ARMv5*)cpu)->CP15Control & (1<<15)) ? (val & ~0x1) : val), cpu->CurInstr & (1<<22)); /* restores cpsr due to shared ldm dna */ \ + if (r == 14) cpu->JumpTo(((((ARMv5*)cpu)->CP15Control & (1<<15)) ? (val & ~0x1) : val), cpu->CurInstr & (1<<22)); /* restores cpsr presumably due to shared dna with ldm */ \ else cpu->R[r+1] = val; \ cpu->AddCycles_CDI(); \ if (cpu->CurInstr & (1<<21)) cpu->R[(cpu->CurInstr>>16) & 0xF] = offset; @@ -274,7 +274,7 @@ A_IMPLEMENT_WB_LDRSTR(LDRB) if (r&1) { A_UNK(cpu); return; } \ if (!cpu->DataRead32 (addr , &cpu->R[r ])) {cpu->AddCycles_CDI(); return;} \ u32 val; if (!cpu->DataRead32S(addr+4, &val)) {cpu->AddCycles_CDI(); return;} \ - if (r == 14) cpu->JumpTo(((((ARMv5*)cpu)->CP15Control & (1<<15)) ? (val & ~0x1) : val), cpu->CurInstr & (1<<22)); /* restores cpsr due to shared ldm dna */ \ + if (r == 14) cpu->JumpTo(((((ARMv5*)cpu)->CP15Control & (1<<15)) ? 
(val & ~0x1) : val), cpu->CurInstr & (1<<22)); /* restores cpsr presumably due to shared dna with ldm */ \ else cpu->R[r+1] = val; \ cpu->AddCycles_CDI(); \ cpu->R[(cpu->CurInstr>>16) & 0xF] += offset; @@ -408,9 +408,10 @@ void A_SWP(ARM* cpu) u32 numD = cpu->DataCycles; if (cpu->DataWrite32(base, rm)) { - // rd only gets updated if both read and write succeed, and if rd isn't r15 + // rd only gets updated if both read and write succeed u32 rd = (cpu->CurInstr >> 12) & 0xF; if (rd != 15) cpu->R[rd] = ROR(val, 8*(base&0x3)); + else if (cpu->Num) cpu->JumpTo(ROR(val, 8*(base&0x3)) & ~1); // for some reason these jumps don't work on the arm 9? } cpu->DataCycles += numD; } @@ -429,9 +430,10 @@ void A_SWPB(ARM* cpu) u32 numD = cpu->DataCycles; if (cpu->DataWrite8(base, rm)) { - // rd only gets updated if both read and write succeed, and if rd isn't r15 + // rd only gets updated if both read and write succeed u32 rd = (cpu->CurInstr >> 12) & 0xF; if (rd != 15) cpu->R[rd] = val; + else if (cpu->Num) cpu->JumpTo(val & ~1); // for some reason these jumps don't work on the arm 9? } cpu->DataCycles += numD; } From 42218106b04adb257ecc165d0e9b79a1065e65ed Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Tue, 11 Jun 2024 10:09:40 -0400 Subject: [PATCH 069/306] verify writable msr bits --- src/ARMInterpreter.cpp | 12 ++++++------ src/ARMInterpreter_LoadStore.cpp | 4 ++-- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/src/ARMInterpreter.cpp b/src/ARMInterpreter.cpp index d6c3a488..5a09d210 100644 --- a/src/ARMInterpreter.cpp +++ b/src/ARMInterpreter.cpp @@ -101,9 +101,9 @@ void A_MSR_IMM(ARM* cpu) u32 mask = 0; if (cpu->CurInstr & (1<<16)) mask |= 0x000000FF; - //if (cpu->CurInstr & (1<<17)) mask |= 0x0000FF00; - //if (cpu->CurInstr & (1<<18)) mask |= 0x00FF0000; - if (cpu->CurInstr & (1<<19)) mask |= (cpu->Num ? 
0xF0000000 /* checkme */ : 0xF8000000); + //if (cpu->CurInstr & (1<<17)) mask |= 0x0000FF00; // unused by arm 7 & 9 + //if (cpu->CurInstr & (1<<18)) mask |= 0x00FF0000; // unused by arm 7 & 9 + if (cpu->CurInstr & (1<<19)) mask |= ((cpu->Num==1) ? 0xF0000000 : 0xF8000000); if (!(cpu->CurInstr & (1<<22))) mask &= 0xFFFFFFDF; @@ -154,9 +154,9 @@ void A_MSR_REG(ARM* cpu) u32 mask = 0; if (cpu->CurInstr & (1<<16)) mask |= 0x000000FF; - //if (cpu->CurInstr & (1<<17)) mask |= 0x0000FF00; - //if (cpu->CurInstr & (1<<18)) mask |= 0x00FF0000; - if (cpu->CurInstr & (1<<19)) mask |= (cpu->Num ? 0xF0000000 /* checkme */ : 0xF8000000); + //if (cpu->CurInstr & (1<<17)) mask |= 0x0000FF00; // unused by arm 7 & 9 + //if (cpu->CurInstr & (1<<18)) mask |= 0x00FF0000; // unused by arm 7 & 9 + if (cpu->CurInstr & (1<<19)) mask |= ((cpu->Num==1) ? 0xF0000000 : 0xF8000000); if (!(cpu->CurInstr & (1<<22))) mask &= 0xFFFFFFDF; diff --git a/src/ARMInterpreter_LoadStore.cpp b/src/ARMInterpreter_LoadStore.cpp index 7a33b8dd..4e705aed 100644 --- a/src/ARMInterpreter_LoadStore.cpp +++ b/src/ARMInterpreter_LoadStore.cpp @@ -411,7 +411,7 @@ void A_SWP(ARM* cpu) // rd only gets updated if both read and write succeed u32 rd = (cpu->CurInstr >> 12) & 0xF; if (rd != 15) cpu->R[rd] = ROR(val, 8*(base&0x3)); - else if (cpu->Num) cpu->JumpTo(ROR(val, 8*(base&0x3)) & ~1); // for some reason these jumps don't work on the arm 9? + else if (cpu->Num==1) cpu->JumpTo(ROR(val, 8*(base&0x3)) & ~1); // for some reason these jumps don't work on the arm 9? } cpu->DataCycles += numD; } @@ -433,7 +433,7 @@ void A_SWPB(ARM* cpu) // rd only gets updated if both read and write succeed u32 rd = (cpu->CurInstr >> 12) & 0xF; if (rd != 15) cpu->R[rd] = val; - else if (cpu->Num) cpu->JumpTo(val & ~1); // for some reason these jumps don't work on the arm 9? + else if (cpu->Num==1) cpu->JumpTo(val & ~1); // for some reason these jumps don't work on the arm 9? 
} cpu->DataCycles += numD; } From 5a174a2ce38c40b317231428418ca4f67e4218aa Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Fri, 14 Jun 2024 00:51:55 -0400 Subject: [PATCH 070/306] track interlock cycles for load instructions --- src/ARM.cpp | 3 + src/ARM.h | 31 +++++++ src/ARMInterpreter_LoadStore.cpp | 142 ++++++++++++++++++++++++------- 3 files changed, 143 insertions(+), 33 deletions(-) diff --git a/src/ARM.cpp b/src/ARM.cpp index 906a243e..bac57879 100644 --- a/src/ARM.cpp +++ b/src/ARM.cpp @@ -1314,6 +1314,9 @@ void ARMv4::AddCycles_CD() Cycles += numC + numD; } } + u64 ARMv5::Timestamp() { return NDS.ARM9Timestamp; } + + u64 ARMv4::Timestamp() { return NDS.ARM7Timestamp; } u8 ARMv5::BusRead8(u32 addr) { diff --git a/src/ARM.h b/src/ARM.h index 7c5bb671..9fb48930 100644 --- a/src/ARM.h +++ b/src/ARM.h @@ -143,6 +143,24 @@ public: virtual void AddCycles_CDI() = 0; virtual void AddCycles_CD() = 0; + inline void AddCycles_L(const u8 reg1) + { + Cycles += InterlockTimestamp[reg1]; + } + + inline void AddCycles_L(const u8 reg1, const u8 reg2) + { + Cycles += std::max(InterlockTimestamp[reg1], InterlockTimestamp[reg2]); + } + + // Must be called after all of an instruction's cycles are calculated!!! 
+ inline void SetCycles_L(const u8 reg, const u8 cycles, const u8 type) + { + InterlockTimestamp[reg] = cycles + Timestamp() + Cycles; + } + + virtual u64 Timestamp() = 0; + void CheckGdbIncoming(); u32 Num; @@ -179,6 +197,15 @@ public: MemRegion CodeMem; + enum InterlockType + { + ILT_Norm = 0, + ILT_Mul = 1, + }; + + u8 InterlockType[16]; + u64 InterlockTimestamp[16]; + #ifdef JIT_ENABLED u32 FastBlockLookupStart, FastBlockLookupSize; u64* FastBlockLookup; @@ -299,6 +326,8 @@ public: // Cycles += numC + numD; } + u64 Timestamp() override; + void GetCodeMemRegion(u32 addr, MemRegion* region); void CP15Reset(); @@ -413,6 +442,8 @@ public: void AddCycles_CI(s32 num) override; void AddCycles_CDI() override; void AddCycles_CD() override; + + u64 Timestamp() override; protected: u8 BusRead8(u32 addr) override; u16 BusRead16(u32 addr) override; diff --git a/src/ARMInterpreter_LoadStore.cpp b/src/ARMInterpreter_LoadStore.cpp index 4e705aed..4e93c749 100644 --- a/src/ARMInterpreter_LoadStore.cpp +++ b/src/ARMInterpreter_LoadStore.cpp @@ -115,6 +115,7 @@ namespace melonDS::ARMInterpreter else \ { \ cpu->R[(cpu->CurInstr>>12) & 0xF] = val; \ + cpu->SetCycles_L((cpu->CurInstr>>12) & 0xF, (offset & 3) ? 2 : 1, cpu->ILT_Norm); \ } // TODO: user mode @@ -133,6 +134,7 @@ namespace melonDS::ARMInterpreter else \ { \ cpu->R[(cpu->CurInstr>>12) & 0xF] = val; \ + cpu->SetCycles_L((cpu->CurInstr>>12) & 0xF, (addr & 3) ? 
2 : 1, cpu->ILT_Norm); \ } #define A_LDRB \ @@ -141,8 +143,13 @@ namespace melonDS::ARMInterpreter cpu->AddCycles_CDI(); \ if (dataabort) return; \ if (cpu->CurInstr & (1<<21)) cpu->R[(cpu->CurInstr>>16) & 0xF] = offset; \ - if (((cpu->CurInstr>>12) & 0xF) == 15) cpu->JumpTo8_16Bit(val); \ - else cpu->R[(cpu->CurInstr>>12) & 0xF] = val; + if (((cpu->CurInstr>>12) & 0xF) == 15) \ + cpu->JumpTo8_16Bit(val); \ + else \ + { \ + cpu->R[(cpu->CurInstr>>12) & 0xF] = val; \ + cpu->SetCycles_L((cpu->CurInstr>>12) & 0xF, 2, cpu->ILT_Norm); \ + } // TODO: user mode #define A_LDRB_POST \ @@ -151,8 +158,13 @@ namespace melonDS::ARMInterpreter cpu->AddCycles_CDI(); \ if (dataabort) return; \ cpu->R[(cpu->CurInstr>>16) & 0xF] += offset; \ - if (((cpu->CurInstr>>12) & 0xF) == 15) cpu->JumpTo8_16Bit(val); \ - else cpu->R[(cpu->CurInstr>>12) & 0xF] = val; + if (((cpu->CurInstr>>12) & 0xF) == 15) \ + cpu->JumpTo8_16Bit(val); \ + else \ + { \ + cpu->R[(cpu->CurInstr>>12) & 0xF] = val; \ + cpu->SetCycles_L((cpu->CurInstr>>12) & 0xF, 2, cpu->ILT_Norm); \ + } @@ -260,23 +272,35 @@ A_IMPLEMENT_WB_LDRSTR(LDRB) offset += cpu->R[(cpu->CurInstr>>16) & 0xF]; \ u32 r = (cpu->CurInstr>>12) & 0xF; \ if (r&1) { A_UNK(cpu); return; } \ - if (!cpu->DataRead32 (offset , &cpu->R[r ])) {cpu->AddCycles_CDI(); return;} \ - u32 val; if (!cpu->DataRead32S(offset+4, &val)) {cpu->AddCycles_CDI(); return;} \ - if (r == 14) cpu->JumpTo(((((ARMv5*)cpu)->CP15Control & (1<<15)) ? (val & ~0x1) : val), cpu->CurInstr & (1<<22)); /* restores cpsr presumably due to shared dna with ldm */ \ - else cpu->R[r+1] = val; \ + if (!cpu->DataRead32 (offset, &cpu->R[r])) {cpu->AddCycles_CDI(); return;} \ + u32 val; bool dataabort = !cpu->DataRead32S(offset+4, &val); \ cpu->AddCycles_CDI(); \ - if (cpu->CurInstr & (1<<21)) cpu->R[(cpu->CurInstr>>16) & 0xF] = offset; + if (dataabort) return; \ + if (r == 14) \ + cpu->JumpTo(((((ARMv5*)cpu)->CP15Control & (1<<15)) ? 
(val & ~0x1) : val), cpu->CurInstr & (1<<22)); /* restores cpsr presumably due to shared dna with ldm */ \ + else \ + { \ + cpu->R[r+1] = val; \ + cpu->SetCycles_L(r+1, 1, cpu->ILT_Norm); \ + } \ + if (cpu->CurInstr & (1<<21)) cpu->R[(cpu->CurInstr>>16) & 0xF] = offset; \ #define A_LDRD_POST \ if (cpu->Num != 0) return; \ u32 addr = cpu->R[(cpu->CurInstr>>16) & 0xF]; \ u32 r = (cpu->CurInstr>>12) & 0xF; \ if (r&1) { A_UNK(cpu); return; } \ - if (!cpu->DataRead32 (addr , &cpu->R[r ])) {cpu->AddCycles_CDI(); return;} \ - u32 val; if (!cpu->DataRead32S(addr+4, &val)) {cpu->AddCycles_CDI(); return;} \ - if (r == 14) cpu->JumpTo(((((ARMv5*)cpu)->CP15Control & (1<<15)) ? (val & ~0x1) : val), cpu->CurInstr & (1<<22)); /* restores cpsr presumably due to shared dna with ldm */ \ - else cpu->R[r+1] = val; \ + if (!cpu->DataRead32 (addr, &cpu->R[r])) {cpu->AddCycles_CDI(); return;} \ + u32 val; bool dataabort = !cpu->DataRead32S(addr+4, &val); \ cpu->AddCycles_CDI(); \ + if (dataabort) return; \ + if (r == 14) \ + cpu->JumpTo(((((ARMv5*)cpu)->CP15Control & (1<<15)) ? 
(val & ~0x1) : val), cpu->CurInstr & (1<<22)); /* restores cpsr presumably due to shared dna with ldm */ \ + else \ + { \ + cpu->R[r+1] = val; \ + cpu->SetCycles_L(r+1, 1, cpu->ILT_Norm); \ + } \ cpu->R[(cpu->CurInstr>>16) & 0xF] += offset; #define A_STRD \ @@ -308,8 +332,13 @@ A_IMPLEMENT_WB_LDRSTR(LDRB) u32 val; bool dataabort = !cpu->DataRead16(offset, &val); \ cpu->AddCycles_CDI(); \ if (dataabort) return; \ - if (((cpu->CurInstr>>12) & 0xF) == 15) cpu->JumpTo8_16Bit(val); \ - else cpu->R[(cpu->CurInstr>>12) & 0xF] = val; \ + if (((cpu->CurInstr>>12) & 0xF) == 15) \ + cpu->JumpTo8_16Bit(val); \ + else \ + { \ + cpu->R[(cpu->CurInstr>>12) & 0xF] = val; \ + cpu->SetCycles_L((cpu->CurInstr>>12) & 0xF, 2, cpu->ILT_Norm); \ + } \ if (cpu->CurInstr & (1<<21)) cpu->R[(cpu->CurInstr>>16) & 0xF] = offset; #define A_LDRH_POST \ @@ -317,8 +346,13 @@ A_IMPLEMENT_WB_LDRSTR(LDRB) u32 val; bool dataabort = !cpu->DataRead16(addr, &val); \ cpu->AddCycles_CDI(); \ if (dataabort) return; \ - if (((cpu->CurInstr>>12) & 0xF) == 15) cpu->JumpTo8_16Bit(val); \ - else cpu->R[(cpu->CurInstr>>12) & 0xF] = val; \ + if (((cpu->CurInstr>>12) & 0xF) == 15) \ + cpu->JumpTo8_16Bit(val); \ + else \ + { \ + cpu->R[(cpu->CurInstr>>12) & 0xF] = val; \ + cpu->SetCycles_L((cpu->CurInstr>>12) & 0xF, 2, cpu->ILT_Norm); \ + } \ cpu->R[(cpu->CurInstr>>16) & 0xF] += offset; #define A_LDRSB \ @@ -327,8 +361,13 @@ A_IMPLEMENT_WB_LDRSTR(LDRB) cpu->AddCycles_CDI(); \ if (dataabort) return; \ val = (s32)(s8)val; \ - if (((cpu->CurInstr>>12) & 0xF) == 15) cpu->JumpTo8_16Bit(val); \ - else cpu->R[(cpu->CurInstr>>12) & 0xF] = val; \ + if (((cpu->CurInstr>>12) & 0xF) == 15) \ + cpu->JumpTo8_16Bit(val); \ + else \ + { \ + cpu->R[(cpu->CurInstr>>12) & 0xF] = val; \ + cpu->SetCycles_L((cpu->CurInstr>>12) & 0xF, 2, cpu->ILT_Norm); \ + } \ if (cpu->CurInstr & (1<<21)) cpu->R[(cpu->CurInstr>>16) & 0xF] = offset; #define A_LDRSB_POST \ @@ -337,8 +376,13 @@ A_IMPLEMENT_WB_LDRSTR(LDRB) cpu->AddCycles_CDI(); \ if 
(dataabort) return; \ val = (s32)(s8)val; \ - if (((cpu->CurInstr>>12) & 0xF) == 15) cpu->JumpTo8_16Bit(val); \ - else cpu->R[(cpu->CurInstr>>12) & 0xF] = val; \ + if (((cpu->CurInstr>>12) & 0xF) == 15) \ + cpu->JumpTo8_16Bit(val); \ + else \ + { \ + cpu->R[(cpu->CurInstr>>12) & 0xF] = val; \ + cpu->SetCycles_L((cpu->CurInstr>>12) & 0xF, 2, cpu->ILT_Norm); \ + } \ cpu->R[(cpu->CurInstr>>16) & 0xF] += offset; #define A_LDRSH \ @@ -347,8 +391,13 @@ A_IMPLEMENT_WB_LDRSTR(LDRB) cpu->AddCycles_CDI(); \ if (dataabort) return; \ val = (s32)(s16)val; \ - if (((cpu->CurInstr>>12) & 0xF) == 15) cpu->JumpTo8_16Bit(val); \ - else cpu->R[(cpu->CurInstr>>12) & 0xF] = val; \ + if (((cpu->CurInstr>>12) & 0xF) == 15) \ + cpu->JumpTo8_16Bit(val); \ + else \ + { \ + cpu->R[(cpu->CurInstr>>12) & 0xF] = val; \ + cpu->SetCycles_L((cpu->CurInstr>>12) & 0xF, 2, cpu->ILT_Norm); \ + } \ if (cpu->CurInstr & (1<<21)) cpu->R[(cpu->CurInstr>>16) & 0xF] = offset; #define A_LDRSH_POST \ @@ -357,8 +406,13 @@ A_IMPLEMENT_WB_LDRSTR(LDRB) cpu->AddCycles_CDI(); \ if (dataabort) return; \ val = (s32)(s16)val; \ - if (((cpu->CurInstr>>12) & 0xF) == 15) cpu->JumpTo8_16Bit(val); \ - else cpu->R[(cpu->CurInstr>>12) & 0xF] = val; \ + if (((cpu->CurInstr>>12) & 0xF) == 15) \ + cpu->JumpTo8_16Bit(val); \ + else \ + { \ + cpu->R[(cpu->CurInstr>>12) & 0xF] = val; \ + cpu->SetCycles_L((cpu->CurInstr>>12) & 0xF, 2, cpu->ILT_Norm); \ + } \ cpu->R[(cpu->CurInstr>>16) & 0xF] += offset; @@ -408,14 +462,21 @@ void A_SWP(ARM* cpu) u32 numD = cpu->DataCycles; if (cpu->DataWrite32(base, rm)) { + cpu->AddCycles_CDI(); // rd only gets updated if both read and write succeed u32 rd = (cpu->CurInstr >> 12) & 0xF; - if (rd != 15) cpu->R[rd] = ROR(val, 8*(base&0x3)); - else if (cpu->Num==1) cpu->JumpTo(ROR(val, 8*(base&0x3)) & ~1); // for some reason these jumps don't work on the arm 9? 
+ if (rd != 15) + { + cpu->R[rd] = ROR(val, 8*(base&0x3)); + cpu->SetCycles_L(rd, 1, cpu->ILT_Norm); // TODO: it adds an extra interlock cycle when doing a misaligned load from a non-itcm address + } + else if (cpu->Num==1) // for some reason these jumps don't work on the arm 9? + cpu->JumpTo(ROR(val, 8*(base&0x3)) & ~1, cpu->ILT_Norm); } + else cpu->AddCycles_CDI(); cpu->DataCycles += numD; } - cpu->AddCycles_CDI(); + else cpu->AddCycles_CDI(); } void A_SWPB(ARM* cpu) @@ -430,14 +491,21 @@ void A_SWPB(ARM* cpu) u32 numD = cpu->DataCycles; if (cpu->DataWrite8(base, rm)) { + cpu->AddCycles_CDI(); // rd only gets updated if both read and write succeed u32 rd = (cpu->CurInstr >> 12) & 0xF; - if (rd != 15) cpu->R[rd] = val; - else if (cpu->Num==1) cpu->JumpTo(val & ~1); // for some reason these jumps don't work on the arm 9? + if (rd != 15) + { + cpu->R[rd] = val; + cpu->SetCycles_L(rd, 1, cpu->ILT_Norm); // TODO: it adds an extra interlock cycle when doing a load from a non-itcm address + } + else if (cpu->Num==1)// for some reason these jumps don't work on the arm 9? + cpu->JumpTo(val & ~1); } + else cpu->AddCycles_CDI(); cpu->DataCycles += numD; } - cpu->AddCycles_CDI(); + else cpu->AddCycles_CDI(); } @@ -450,6 +518,7 @@ void A_LDM(ARM* cpu) u32 oldbase = base; u32 preinc = (cpu->CurInstr & (1<<24)); bool first = true; + u8 lastreg = 0; // TODO: this doesn't support 0 reg LDMs (do those even work?) 
if (!(cpu->CurInstr & (1<<23))) // decrement { @@ -486,6 +555,7 @@ void A_LDM(ARM* cpu) first = false; if (!preinc) base += 4; + lastreg = i; } } @@ -498,12 +568,18 @@ void A_LDM(ARM* cpu) { goto dataabort; } + cpu->AddCycles_CDI(); if (!preinc) base += 4; if (cpu->Num == 1) pc &= ~0x1; } + else + { + cpu->AddCycles_CDI(); + cpu->SetCycles_L(lastreg, 1, cpu->ILT_Norm); // TODO: THIS DOESN'T APPLY WHEN LOADING FROM ITCM + } // switch back to previous regs if ((cpu->CurInstr & (1<<22)) && !(cpu->CurInstr & (1<<15))) @@ -537,6 +613,8 @@ void A_LDM(ARM* cpu) if (false) { dataabort: + cpu->AddCycles_CDI(); + // CHECKME: interlock shouldn't apply when it data aborts, right? // switch back to original set of regs if ((cpu->CurInstr & (1<<22)) && !(cpu->CurInstr & (1<<15))) @@ -545,8 +623,6 @@ void A_LDM(ARM* cpu) // restore original value of base in case the reg got written to cpu->R[baseid] = oldbase; } - - cpu->AddCycles_CDI(); } void A_STM(ARM* cpu) From aa1217af0a2953dbdd3ddbe6563c6787d8013f34 Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Fri, 14 Jun 2024 11:47:42 -0400 Subject: [PATCH 071/306] track interlock cycles for the ALU --- src/ARMInterpreter_ALU.cpp | 107 +++++++++++++++++++++---------------- 1 file changed, 61 insertions(+), 46 deletions(-) diff --git a/src/ARMInterpreter_ALU.cpp b/src/ARMInterpreter_ALU.cpp index 315d59d0..0331aa08 100644 --- a/src/ARMInterpreter_ALU.cpp +++ b/src/ARMInterpreter_ALU.cpp @@ -767,12 +767,6 @@ void A_MUL(ARM* cpu) u32 res = rm * rs; cpu->R[(cpu->CurInstr >> 16) & 0xF] = res; - if (cpu->CurInstr & (1<<20)) - { - cpu->SetNZ(res & 0x80000000, - !res); - if (cpu->Num==1) cpu->SetC(0); - } u32 cycles; if (cpu->Num == 0) @@ -786,6 +780,13 @@ void A_MUL(ARM* cpu) } cpu->AddCycles_CI(cycles); + if (cpu->CurInstr & (1<<20)) + { + cpu->SetNZ(res & 0x80000000, + !res); + if (cpu->Num==1) cpu->SetC(0); + } + else cpu->SetCycles_L((cpu->CurInstr >> 16) & 0xF, 1, cpu->ILT_Mul); // interlock cycles 
do not occur with S variants of multiply instructions } void A_MLA(ARM* cpu) @@ -797,12 +798,6 @@ void A_MLA(ARM* cpu) u32 res = (rm * rs) + rn; cpu->R[(cpu->CurInstr >> 16) & 0xF] = res; - if (cpu->CurInstr & (1<<20)) - { - cpu->SetNZ(res & 0x80000000, - !res); - if (cpu->Num==1) cpu->SetC(0); - } u32 cycles; if (cpu->Num == 0) @@ -816,6 +811,13 @@ void A_MLA(ARM* cpu) } cpu->AddCycles_CI(cycles); + if (cpu->CurInstr & (1<<20)) + { + cpu->SetNZ(res & 0x80000000, + !res); + if (cpu->Num==1) cpu->SetC(0); + } + else cpu->SetCycles_L((cpu->CurInstr >> 16) & 0xF, 1, cpu->ILT_Mul); // interlock cycles do not occur with S variants of multiply instructions } void A_UMULL(ARM* cpu) @@ -827,12 +829,6 @@ void A_UMULL(ARM* cpu) cpu->R[(cpu->CurInstr >> 12) & 0xF] = (u32)res; cpu->R[(cpu->CurInstr >> 16) & 0xF] = (u32)(res >> 32ULL); - if (cpu->CurInstr & (1<<20)) - { - cpu->SetNZ((u32)(res >> 63ULL), - !res); - if (cpu->Num==1) cpu->SetC(0); - } u32 cycles; if (cpu->Num == 0) @@ -846,6 +842,13 @@ void A_UMULL(ARM* cpu) } cpu->AddCycles_CI(cycles); + if (cpu->CurInstr & (1<<20)) + { + cpu->SetNZ((u32)(res >> 63ULL), + !res); + if (cpu->Num==1) cpu->SetC(0); + } + else cpu->SetCycles_L((cpu->CurInstr >> 12) & 0xF, 1, cpu->ILT_Mul); // interlock cycles do not occur with S variants of multiply instructions } void A_UMLAL(ARM* cpu) @@ -860,12 +863,6 @@ void A_UMLAL(ARM* cpu) cpu->R[(cpu->CurInstr >> 12) & 0xF] = (u32)res; cpu->R[(cpu->CurInstr >> 16) & 0xF] = (u32)(res >> 32ULL); - if (cpu->CurInstr & (1<<20)) - { - cpu->SetNZ((u32)(res >> 63ULL), - !res); - if (cpu->Num==1) cpu->SetC(0); - } u32 cycles; if (cpu->Num == 0) @@ -879,6 +876,13 @@ void A_UMLAL(ARM* cpu) } cpu->AddCycles_CI(cycles); + if (cpu->CurInstr & (1<<20)) + { + cpu->SetNZ((u32)(res >> 63ULL), + !res); + if (cpu->Num==1) cpu->SetC(0); + } + else cpu->SetCycles_L((cpu->CurInstr >> 12) & 0xF, 1, cpu->ILT_Mul); // interlock cycles do not occur with S variants of multiply instructions } void A_SMULL(ARM* cpu) @@ 
-890,12 +894,6 @@ void A_SMULL(ARM* cpu) cpu->R[(cpu->CurInstr >> 12) & 0xF] = (u32)res; cpu->R[(cpu->CurInstr >> 16) & 0xF] = (u32)(res >> 32ULL); - if (cpu->CurInstr & (1<<20)) - { - cpu->SetNZ((u32)(res >> 63ULL), - !res); - if (cpu->Num==1) cpu->SetC(0); - } u32 cycles; if (cpu->Num == 0) @@ -909,6 +907,13 @@ void A_SMULL(ARM* cpu) } cpu->AddCycles_CI(cycles); + if (cpu->CurInstr & (1<<20)) + { + cpu->SetNZ((u32)(res >> 63ULL), + !res); + if (cpu->Num==1) cpu->SetC(0); + } + else cpu->SetCycles_L((cpu->CurInstr >> 12) & 0xF, 1, cpu->ILT_Mul); // interlock cycles do not occur with S variants of multiply instructions } void A_SMLAL(ARM* cpu) @@ -923,12 +928,6 @@ void A_SMLAL(ARM* cpu) cpu->R[(cpu->CurInstr >> 12) & 0xF] = (u32)res; cpu->R[(cpu->CurInstr >> 16) & 0xF] = (u32)(res >> 32ULL); - if (cpu->CurInstr & (1<<20)) - { - cpu->SetNZ((u32)(res >> 63ULL), - !res); - if (cpu->Num==1) cpu->SetC(0); - } u32 cycles; if (cpu->Num == 0) @@ -940,8 +939,15 @@ void A_SMLAL(ARM* cpu) else if ((rs & 0xFF000000) == 0x00000000 || (rs & 0xFF000000) == 0xFF000000) cycles = 4; else cycles = 5; } - + cpu->AddCycles_CI(cycles); + if (cpu->CurInstr & (1<<20)) + { + cpu->SetNZ((u32)(res >> 63ULL), + !res); + if (cpu->Num==1) cpu->SetC(0); + } + else cpu->SetCycles_L((cpu->CurInstr >> 12) & 0xF, 1, cpu->ILT_Mul); // interlock cycles do not occur with S variants of multiply instructions } void A_SMLAxy(ARM* cpu) @@ -964,7 +970,8 @@ void A_SMLAxy(ARM* cpu) if (OverflowAdd(res_mul, rn)) cpu->CPSR |= 0x08000000; - cpu->AddCycles_C(); // TODO: interlock?? + cpu->AddCycles_C(); + cpu->SetCycles_L((cpu->CurInstr >> 16) & 0xF, 1, cpu->ILT_Norm); } void A_SMLAWy(ARM* cpu) @@ -985,7 +992,8 @@ void A_SMLAWy(ARM* cpu) if (OverflowAdd(res_mul, rn)) cpu->CPSR |= 0x08000000; - cpu->AddCycles_C(); // TODO: interlock?? 
+ cpu->AddCycles_C(); + cpu->SetCycles_L((cpu->CurInstr >> 16) & 0xF, 1, cpu->ILT_Norm); } void A_SMULxy(ARM* cpu) @@ -1003,7 +1011,8 @@ void A_SMULxy(ARM* cpu) u32 res = ((s16)rm * (s16)rs); cpu->R[(cpu->CurInstr >> 16) & 0xF] = res; - cpu->AddCycles_C(); // TODO: interlock?? + cpu->AddCycles_C(); + cpu->SetCycles_L((cpu->CurInstr >> 16) & 0xF, 1, cpu->ILT_Norm); } void A_SMULWy(ARM* cpu) @@ -1019,7 +1028,8 @@ void A_SMULWy(ARM* cpu) u32 res = ((s64)(s32)rm * (s16)rs) >> 16; cpu->R[(cpu->CurInstr >> 16) & 0xF] = res; - cpu->AddCycles_C(); // TODO: interlock?? + cpu->AddCycles_C(); + cpu->SetCycles_L((cpu->CurInstr >> 16) & 0xF, 1, cpu->ILT_Norm); } void A_SMLALxy(ARM* cpu) @@ -1042,7 +1052,8 @@ void A_SMLALxy(ARM* cpu) cpu->R[(cpu->CurInstr >> 12) & 0xF] = (u32)res; cpu->R[(cpu->CurInstr >> 16) & 0xF] = (u32)(res >> 32ULL); - cpu->AddCycles_CI(1); // TODO: interlock?? + cpu->AddCycles_CI(1); + cpu->SetCycles_L((cpu->CurInstr >> 12) & 0xF, 1, cpu->ILT_Norm); } @@ -1086,7 +1097,8 @@ void A_QADD(ARM* cpu) } cpu->R[(cpu->CurInstr >> 12) & 0xF] = res; - cpu->AddCycles_C(); // TODO: interlock?? + cpu->AddCycles_C(); + cpu->SetCycles_L((cpu->CurInstr >> 12) & 0xF, 1, cpu->ILT_Norm); } void A_QSUB(ARM* cpu) @@ -1104,7 +1116,8 @@ void A_QSUB(ARM* cpu) } cpu->R[(cpu->CurInstr >> 12) & 0xF] = res; - cpu->AddCycles_C(); // TODO: interlock?? + cpu->AddCycles_C(); + cpu->SetCycles_L((cpu->CurInstr >> 12) & 0xF, 1, cpu->ILT_Norm); } void A_QDADD(ARM* cpu) @@ -1130,7 +1143,8 @@ void A_QDADD(ARM* cpu) } cpu->R[(cpu->CurInstr >> 12) & 0xF] = res; - cpu->AddCycles_C(); // TODO: interlock?? + cpu->AddCycles_C(); + cpu->SetCycles_L((cpu->CurInstr >> 12) & 0xF, 1, cpu->ILT_Norm); } void A_QDSUB(ARM* cpu) @@ -1156,7 +1170,8 @@ void A_QDSUB(ARM* cpu) } cpu->R[(cpu->CurInstr >> 12) & 0xF] = res; - cpu->AddCycles_C(); // TODO: interlock?? 
+ cpu->AddCycles_C(); + cpu->SetCycles_L((cpu->CurInstr >> 12) & 0xF, 1, cpu->ILT_Norm); } From a973c0bf5bdc983ee6547f8533ea0ff3d5c750b4 Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Sat, 15 Jun 2024 16:07:36 -0400 Subject: [PATCH 072/306] initial implementation of interlock cycles --- src/ARM.cpp | 13 ++- src/ARM.h | 41 ++++++++-- src/ARMInterpreter.cpp | 4 +- src/ARMInterpreter_ALU.cpp | 134 +++++++++++++++---------------- src/ARMInterpreter_LoadStore.cpp | 112 +++++++++++++++----------- 5 files changed, 179 insertions(+), 125 deletions(-) diff --git a/src/ARM.cpp b/src/ARM.cpp index bac57879..899fe661 100644 --- a/src/ARM.cpp +++ b/src/ARM.cpp @@ -190,6 +190,8 @@ void ARM::Reset() BreakReq = false; #endif + memset(InterlockTimestamp, 0, sizeof(InterlockTimestamp)); + // zorp JumpTo(ExceptionBase); } @@ -1314,9 +1316,16 @@ void ARMv4::AddCycles_CD() Cycles += numC + numD; } } - u64 ARMv5::Timestamp() { return NDS.ARM9Timestamp; } - u64 ARMv4::Timestamp() { return NDS.ARM7Timestamp; } +u64& ARMv5::Timestamp() +{ + return NDS.ARM9Timestamp; +} + +u64& ARMv4::Timestamp() +{ + return NDS.ARM7Timestamp; +} u8 ARMv5::BusRead8(u32 addr) { diff --git a/src/ARM.h b/src/ARM.h index 9fb48930..ff857db9 100644 --- a/src/ARM.h +++ b/src/ARM.h @@ -30,6 +30,8 @@ #include "debug/GdbStub.h" #endif +#define INTERLOCK + namespace melonDS { inline u32 ROR(u32 x, u32 n) @@ -143,23 +145,46 @@ public: virtual void AddCycles_CDI() = 0; virtual void AddCycles_CD() = 0; - inline void AddCycles_L(const u8 reg1) + inline void AddCycles_L(const u32 delay, const u32 reg1) { - Cycles += InterlockTimestamp[reg1]; + if (InterlockTimestamp[reg1] > Timestamp() + delay); + Timestamp() = InterlockTimestamp[reg1]; + } + + inline void AddCycles_L(const u32 delay, const u32 reg1, const u32 reg2) + { + u64 cycles = std::max(InterlockTimestamp[reg1], InterlockTimestamp[reg2]); + if (cycles > Timestamp() + delay) + Timestamp() = cycles; + } + + inline void 
AddCycles_L(const u32 delay, const u32 reg1, const u32 reg2, const u32 reg3) + { + u64 cycles = std::max(InterlockTimestamp[reg1], std::max(InterlockTimestamp[reg2], InterlockTimestamp[reg3])); + if (cycles > Timestamp() + delay) + Timestamp() = cycles; } - inline void AddCycles_L(const u8 reg1, const u8 reg2) + // fetch the value of a register while handling any interlock cycles + inline u32 GetReg(const u32 reg, const u32 delay = 0) { - Cycles += std::max(InterlockTimestamp[reg1], InterlockTimestamp[reg2]); +#ifdef INTERLOCK + if (InterlockTimestamp[reg] > (Timestamp() + delay)) + Timestamp() = InterlockTimestamp[reg] - delay; +#endif + return R[reg]; } // Must be called after all of an instruction's cycles are calculated!!! - inline void SetCycles_L(const u8 reg, const u8 cycles, const u8 type) + inline void SetCycles_L(const u32 reg, const u32 cycles, const u32 type) { +#ifdef INTERLOCK InterlockTimestamp[reg] = cycles + Timestamp() + Cycles; + //InterlockType[reg] = type; +#endif } - virtual u64 Timestamp() = 0; + virtual u64& Timestamp() = 0; void CheckGdbIncoming(); @@ -326,7 +351,7 @@ public: // Cycles += numC + numD; } - u64 Timestamp() override; + u64& Timestamp() override; void GetCodeMemRegion(u32 addr, MemRegion* region); @@ -443,7 +468,7 @@ public: void AddCycles_CDI() override; void AddCycles_CD() override; - u64 Timestamp() override; + u64& Timestamp() override; protected: u8 BusRead8(u32 addr) override; u16 BusRead16(u32 addr) override; diff --git a/src/ARMInterpreter.cpp b/src/ARMInterpreter.cpp index 5a09d210..5621876a 100644 --- a/src/ARMInterpreter.cpp +++ b/src/ARMInterpreter.cpp @@ -163,7 +163,7 @@ void A_MSR_REG(ARM* cpu) if ((cpu->CPSR & 0x1F) == 0x10) mask &= 0xFFFFFF00; - u32 val = cpu->R[cpu->CurInstr & 0xF]; + u32 val = cpu->GetReg(cpu->CurInstr & 0xF, 1); // bit4 is forced to 1 val |= 0x00000010; @@ -216,7 +216,7 @@ void A_MCR(ARM* cpu) u32 cn = (cpu->CurInstr >> 16) & 0xF; u32 cm = cpu->CurInstr & 0xF; u32 cpinfo = (cpu->CurInstr >> 
5) & 0x7; - u32 val = cpu->R[(cpu->CurInstr>>12)&0xF]; + u32 val = cpu->GetReg((cpu->CurInstr>>12)&0xF); if (((cpu->CurInstr>>12) & 0xF) == 15) val += 4; if (cpu->Num==0 && cp==15) diff --git a/src/ARMInterpreter_ALU.cpp b/src/ARMInterpreter_ALU.cpp index 0331aa08..ac18872b 100644 --- a/src/ARMInterpreter_ALU.cpp +++ b/src/ARMInterpreter_ALU.cpp @@ -160,14 +160,14 @@ inline bool OverflowSbc(u32 a, u32 b, u32 carry) cpu->SetC(b & 0x80000000); #define A_CALC_OP2_REG_SHIFT_IMM(shiftop) \ - u32 b = cpu->R[cpu->CurInstr&0xF]; \ + u32 b = cpu->GetReg(cpu->CurInstr&0xF); \ u32 s = (cpu->CurInstr>>7)&0x1F; \ shiftop(b, s); #define A_CALC_OP2_REG_SHIFT_REG(shiftop) \ - u32 b = cpu->R[cpu->CurInstr&0xF]; \ + u32 b = cpu->GetReg(cpu->CurInstr&0xF); \ if ((cpu->CurInstr&0xF)==15) b += 4; \ - shiftop(b, (cpu->R[(cpu->CurInstr>>8)&0xF] & 0xFF)); + shiftop(b, (cpu->GetReg((cpu->CurInstr>>8)&0xF) & 0xFF)); #define A_IMPLEMENT_ALU_OP(x,s) \ @@ -313,7 +313,7 @@ void A_##x##_REG_ROR_REG(ARM* cpu) \ #define A_AND(c) \ - u32 a = cpu->R[(cpu->CurInstr>>16) & 0xF]; \ + u32 a = cpu->GetReg((cpu->CurInstr>>16) & 0xF); \ u32 res = a & b; \ if (c) cpu->AddCycles_CI(c); else cpu->AddCycles_C(); \ if (((cpu->CurInstr>>12) & 0xF) == 15) \ @@ -326,7 +326,7 @@ void A_##x##_REG_ROR_REG(ARM* cpu) \ } #define A_AND_S(c) \ - u32 a = cpu->R[(cpu->CurInstr>>16) & 0xF]; \ + u32 a = cpu->GetReg((cpu->CurInstr>>16) & 0xF); \ u32 res = a & b; \ cpu->SetNZ(res & 0x80000000, \ !res); \ @@ -344,7 +344,7 @@ A_IMPLEMENT_ALU_OP(AND,_S) #define A_EOR(c) \ - u32 a = cpu->R[(cpu->CurInstr>>16) & 0xF]; \ + u32 a = cpu->GetReg((cpu->CurInstr>>16) & 0xF); \ u32 res = a ^ b; \ if (c) cpu->AddCycles_CI(c); else cpu->AddCycles_C(); \ if (((cpu->CurInstr>>12) & 0xF) == 15) \ @@ -357,7 +357,7 @@ A_IMPLEMENT_ALU_OP(AND,_S) } #define A_EOR_S(c) \ - u32 a = cpu->R[(cpu->CurInstr>>16) & 0xF]; \ + u32 a = cpu->GetReg((cpu->CurInstr>>16) & 0xF); \ u32 res = a ^ b; \ cpu->SetNZ(res & 0x80000000, \ !res); \ @@ -375,7 +375,7 @@ 
A_IMPLEMENT_ALU_OP(EOR,_S) #define A_SUB(c) \ - u32 a = cpu->R[(cpu->CurInstr>>16) & 0xF]; \ + u32 a = cpu->GetReg((cpu->CurInstr>>16) & 0xF); \ u32 res = a - b; \ if (c) cpu->AddCycles_CI(c); else cpu->AddCycles_C(); \ if (((cpu->CurInstr>>12) & 0xF) == 15) \ @@ -388,7 +388,7 @@ A_IMPLEMENT_ALU_OP(EOR,_S) } #define A_SUB_S(c) \ - u32 a = cpu->R[(cpu->CurInstr>>16) & 0xF]; \ + u32 a = cpu->GetReg((cpu->CurInstr>>16) & 0xF); \ u32 res = a - b; \ cpu->SetNZCV(res & 0x80000000, \ !res, \ @@ -408,7 +408,7 @@ A_IMPLEMENT_ALU_OP(SUB,) #define A_RSB(c) \ - u32 a = cpu->R[(cpu->CurInstr>>16) & 0xF]; \ + u32 a = cpu->GetReg((cpu->CurInstr>>16) & 0xF); \ u32 res = b - a; \ if (c) cpu->AddCycles_CI(c); else cpu->AddCycles_C(); \ if (((cpu->CurInstr>>12) & 0xF) == 15) \ @@ -421,7 +421,7 @@ A_IMPLEMENT_ALU_OP(SUB,) } #define A_RSB_S(c) \ - u32 a = cpu->R[(cpu->CurInstr>>16) & 0xF]; \ + u32 a = cpu->GetReg((cpu->CurInstr>>16) & 0xF); \ u32 res = b - a; \ cpu->SetNZCV(res & 0x80000000, \ !res, \ @@ -441,7 +441,7 @@ A_IMPLEMENT_ALU_OP(RSB,) #define A_ADD(c) \ - u32 a = cpu->R[(cpu->CurInstr>>16) & 0xF]; \ + u32 a = cpu->GetReg((cpu->CurInstr>>16) & 0xF); \ u32 res = a + b; \ if (c) cpu->AddCycles_CI(c); else cpu->AddCycles_C(); \ if (((cpu->CurInstr>>12) & 0xF) == 15) \ @@ -454,7 +454,7 @@ A_IMPLEMENT_ALU_OP(RSB,) } #define A_ADD_S(c) \ - u32 a = cpu->R[(cpu->CurInstr>>16) & 0xF]; \ + u32 a = cpu->GetReg((cpu->CurInstr>>16) & 0xF); \ u32 res = a + b; \ cpu->SetNZCV(res & 0x80000000, \ !res, \ @@ -474,7 +474,7 @@ A_IMPLEMENT_ALU_OP(ADD,) #define A_ADC(c) \ - u32 a = cpu->R[(cpu->CurInstr>>16) & 0xF]; \ + u32 a = cpu->GetReg((cpu->CurInstr>>16) & 0xF); \ u32 res = a + b + (cpu->CPSR&0x20000000 ? 
1:0); \ if (c) cpu->AddCycles_CI(c); else cpu->AddCycles_C(); \ if (((cpu->CurInstr>>12) & 0xF) == 15) \ @@ -487,7 +487,7 @@ A_IMPLEMENT_ALU_OP(ADD,) } #define A_ADC_S(c) \ - u32 a = cpu->R[(cpu->CurInstr>>16) & 0xF]; \ + u32 a = cpu->GetReg((cpu->CurInstr>>16) & 0xF); \ u32 res_tmp = a + b; \ u32 carry = (cpu->CPSR&0x20000000 ? 1:0); \ u32 res = res_tmp + carry; \ @@ -509,7 +509,7 @@ A_IMPLEMENT_ALU_OP(ADC,) #define A_SBC(c) \ - u32 a = cpu->R[(cpu->CurInstr>>16) & 0xF]; \ + u32 a = cpu->GetReg((cpu->CurInstr>>16) & 0xF); \ u32 res = a - b - (cpu->CPSR&0x20000000 ? 0:1); \ if (c) cpu->AddCycles_CI(c); else cpu->AddCycles_C(); \ if (((cpu->CurInstr>>12) & 0xF) == 15) \ @@ -522,7 +522,7 @@ A_IMPLEMENT_ALU_OP(ADC,) } #define A_SBC_S(c) \ - u32 a = cpu->R[(cpu->CurInstr>>16) & 0xF]; \ + u32 a = cpu->GetReg((cpu->CurInstr>>16) & 0xF); \ u32 res_tmp = a - b; \ u32 carry = (cpu->CPSR&0x20000000 ? 0:1); \ u32 res = res_tmp - carry; \ @@ -544,7 +544,7 @@ A_IMPLEMENT_ALU_OP(SBC,) #define A_RSC(c) \ - u32 a = cpu->R[(cpu->CurInstr>>16) & 0xF]; \ + u32 a = cpu->GetReg((cpu->CurInstr>>16) & 0xF); \ u32 res = b - a - (cpu->CPSR&0x20000000 ? 0:1); \ if (c) cpu->AddCycles_CI(c); else cpu->AddCycles_C(); \ if (((cpu->CurInstr>>12) & 0xF) == 15) \ @@ -557,7 +557,7 @@ A_IMPLEMENT_ALU_OP(SBC,) } #define A_RSC_S(c) \ - u32 a = cpu->R[(cpu->CurInstr>>16) & 0xF]; \ + u32 a = cpu->GetReg((cpu->CurInstr>>16) & 0xF); \ u32 res_tmp = b - a; \ u32 carry = (cpu->CPSR&0x20000000 ? 
0:1); \ u32 res = res_tmp - carry; \ @@ -579,7 +579,7 @@ A_IMPLEMENT_ALU_OP(RSC,) #define A_TST(c) \ - u32 a = cpu->R[(cpu->CurInstr>>16) & 0xF]; \ + u32 a = cpu->GetReg((cpu->CurInstr>>16) & 0xF); \ u32 res = a & b; \ cpu->SetNZ(res & 0x80000000, \ !res); \ @@ -589,7 +589,7 @@ A_IMPLEMENT_ALU_TEST(TST,_S) #define A_TEQ(c) \ - u32 a = cpu->R[(cpu->CurInstr>>16) & 0xF]; \ + u32 a = cpu->GetReg((cpu->CurInstr>>16) & 0xF); \ u32 res = a ^ b; \ cpu->SetNZ(res & 0x80000000, \ !res); \ @@ -599,7 +599,7 @@ A_IMPLEMENT_ALU_TEST(TEQ,_S) #define A_CMP(c) \ - u32 a = cpu->R[(cpu->CurInstr>>16) & 0xF]; \ + u32 a = cpu->GetReg((cpu->CurInstr>>16) & 0xF); \ u32 res = a - b; \ cpu->SetNZCV(res & 0x80000000, \ !res, \ @@ -611,7 +611,7 @@ A_IMPLEMENT_ALU_TEST(CMP,) #define A_CMN(c) \ - u32 a = cpu->R[(cpu->CurInstr>>16) & 0xF]; \ + u32 a = cpu->GetReg((cpu->CurInstr>>16) & 0xF); \ u32 res = a + b; \ cpu->SetNZCV(res & 0x80000000, \ !res, \ @@ -623,7 +623,7 @@ A_IMPLEMENT_ALU_TEST(CMN,) #define A_ORR(c) \ - u32 a = cpu->R[(cpu->CurInstr>>16) & 0xF]; \ + u32 a = cpu->GetReg((cpu->CurInstr>>16) & 0xF); \ u32 res = a | b; \ if (c) cpu->AddCycles_CI(c); else cpu->AddCycles_C(); \ if (((cpu->CurInstr>>12) & 0xF) == 15) \ @@ -636,7 +636,7 @@ A_IMPLEMENT_ALU_TEST(CMN,) } #define A_ORR_S(c) \ - u32 a = cpu->R[(cpu->CurInstr>>16) & 0xF]; \ + u32 a = cpu->GetReg((cpu->CurInstr>>16) & 0xF); \ u32 res = a | b; \ cpu->SetNZ(res & 0x80000000, \ !res); \ @@ -699,7 +699,7 @@ void A_MOV_REG_LSL_IMM_DBG(ARM* cpu) #define A_BIC(c) \ - u32 a = cpu->R[(cpu->CurInstr>>16) & 0xF]; \ + u32 a = cpu->GetReg((cpu->CurInstr>>16) & 0xF); \ u32 res = a & ~b; \ if (c) cpu->AddCycles_CI(c); else cpu->AddCycles_C(); \ if (((cpu->CurInstr>>12) & 0xF) == 15) \ @@ -712,7 +712,7 @@ void A_MOV_REG_LSL_IMM_DBG(ARM* cpu) } #define A_BIC_S(c) \ - u32 a = cpu->R[(cpu->CurInstr>>16) & 0xF]; \ + u32 a = cpu->GetReg((cpu->CurInstr>>16) & 0xF); \ u32 res = a & ~b; \ cpu->SetNZ(res & 0x80000000, \ !res); \ @@ -761,8 +761,8 @@ 
A_IMPLEMENT_ALU_OP(MVN,_S) void A_MUL(ARM* cpu) { - u32 rm = cpu->R[cpu->CurInstr & 0xF]; - u32 rs = cpu->R[(cpu->CurInstr >> 8) & 0xF]; + u32 rm = cpu->GetReg(cpu->CurInstr & 0xF); + u32 rs = cpu->GetReg((cpu->CurInstr >> 8) & 0xF); u32 res = rm * rs; @@ -791,9 +791,9 @@ void A_MUL(ARM* cpu) void A_MLA(ARM* cpu) { - u32 rm = cpu->R[cpu->CurInstr & 0xF]; - u32 rs = cpu->R[(cpu->CurInstr >> 8) & 0xF]; - u32 rn = cpu->R[(cpu->CurInstr >> 12) & 0xF]; + u32 rm = cpu->GetReg(cpu->CurInstr & 0xF); + u32 rs = cpu->GetReg((cpu->CurInstr >> 8) & 0xF); + u32 rn = cpu->GetReg((cpu->CurInstr >> 12) & 0xF); u32 res = (rm * rs) + rn; @@ -822,8 +822,8 @@ void A_MLA(ARM* cpu) void A_UMULL(ARM* cpu) { - u32 rm = cpu->R[cpu->CurInstr & 0xF]; - u32 rs = cpu->R[(cpu->CurInstr >> 8) & 0xF]; + u32 rm = cpu->GetReg(cpu->CurInstr & 0xF, 1); + u32 rs = cpu->GetReg((cpu->CurInstr >> 8) & 0xF, 1); u64 res = (u64)rm * (u64)rs; @@ -848,17 +848,17 @@ void A_UMULL(ARM* cpu) !res); if (cpu->Num==1) cpu->SetC(0); } - else cpu->SetCycles_L((cpu->CurInstr >> 12) & 0xF, 1, cpu->ILT_Mul); // interlock cycles do not occur with S variants of multiply instructions + else cpu->SetCycles_L((cpu->CurInstr >> 16) & 0xF, 1, cpu->ILT_Mul); // interlock cycles do not occur with S variants of multiply instructions } void A_UMLAL(ARM* cpu) { - u32 rm = cpu->R[cpu->CurInstr & 0xF]; - u32 rs = cpu->R[(cpu->CurInstr >> 8) & 0xF]; + u32 rm = cpu->GetReg(cpu->CurInstr & 0xF, 1); + u32 rs = cpu->GetReg((cpu->CurInstr >> 8) & 0xF, 1); u64 res = (u64)rm * (u64)rs; - u64 rd = (u64)cpu->R[(cpu->CurInstr >> 12) & 0xF] | ((u64)cpu->R[(cpu->CurInstr >> 16) & 0xF] << 32ULL); + u64 rd = (u64)cpu->R[(cpu->CurInstr >> 12) & 0xF] | ((u64)cpu->R[(cpu->CurInstr >> 16) & 0xF] << 32ULL); // CHECKME: INTERLOCK? 
res += rd; cpu->R[(cpu->CurInstr >> 12) & 0xF] = (u32)res; @@ -887,8 +887,8 @@ void A_UMLAL(ARM* cpu) void A_SMULL(ARM* cpu) { - u32 rm = cpu->R[cpu->CurInstr & 0xF]; - u32 rs = cpu->R[(cpu->CurInstr >> 8) & 0xF]; + u32 rm = cpu->GetReg(cpu->CurInstr & 0xF, 1); + u32 rs = cpu->GetReg((cpu->CurInstr >> 8) & 0xF, 1); s64 res = (s64)(s32)rm * (s64)(s32)rs; @@ -913,17 +913,17 @@ void A_SMULL(ARM* cpu) !res); if (cpu->Num==1) cpu->SetC(0); } - else cpu->SetCycles_L((cpu->CurInstr >> 12) & 0xF, 1, cpu->ILT_Mul); // interlock cycles do not occur with S variants of multiply instructions + else cpu->SetCycles_L((cpu->CurInstr >> 16) & 0xF, 1, cpu->ILT_Mul); // interlock cycles do not occur with S variants of multiply instructions } void A_SMLAL(ARM* cpu) { - u32 rm = cpu->R[cpu->CurInstr & 0xF]; - u32 rs = cpu->R[(cpu->CurInstr >> 8) & 0xF]; + u32 rm = cpu->GetReg(cpu->CurInstr & 0xF, 1); + u32 rs = cpu->GetReg((cpu->CurInstr >> 8) & 0xF, 1); s64 res = (s64)(s32)rm * (s64)(s32)rs; - s64 rd = (s64)((u64)cpu->R[(cpu->CurInstr >> 12) & 0xF] | ((u64)cpu->R[(cpu->CurInstr >> 16) & 0xF] << 32ULL)); + s64 rd = (s64)((u64)cpu->R[(cpu->CurInstr >> 12) & 0xF] | ((u64)cpu->R[(cpu->CurInstr >> 16) & 0xF] << 32ULL)); // CHECKME: INTERLOCK? 
res += rd; cpu->R[(cpu->CurInstr >> 12) & 0xF] = (u32)res; @@ -947,16 +947,16 @@ void A_SMLAL(ARM* cpu) !res); if (cpu->Num==1) cpu->SetC(0); } - else cpu->SetCycles_L((cpu->CurInstr >> 12) & 0xF, 1, cpu->ILT_Mul); // interlock cycles do not occur with S variants of multiply instructions + else cpu->SetCycles_L((cpu->CurInstr >> 16) & 0xF, 1, cpu->ILT_Mul); // interlock cycles do not occur with S variants of multiply instructions } void A_SMLAxy(ARM* cpu) { if (cpu->Num != 0) return; - u32 rm = cpu->R[cpu->CurInstr & 0xF]; - u32 rs = cpu->R[(cpu->CurInstr >> 8) & 0xF]; - u32 rn = cpu->R[(cpu->CurInstr >> 12) & 0xF]; + u32 rm = cpu->GetReg(cpu->CurInstr & 0xF, 1); + u32 rs = cpu->GetReg((cpu->CurInstr >> 8) & 0xF, 1); + u32 rn = cpu->GetReg((cpu->CurInstr >> 12) & 0xF, 1); if (cpu->CurInstr & (1<<5)) rm >>= 16; else rm &= 0xFFFF; @@ -978,9 +978,9 @@ void A_SMLAWy(ARM* cpu) { if (cpu->Num != 0) return; - u32 rm = cpu->R[cpu->CurInstr & 0xF]; - u32 rs = cpu->R[(cpu->CurInstr >> 8) & 0xF]; - u32 rn = cpu->R[(cpu->CurInstr >> 12) & 0xF]; + u32 rm = cpu->GetReg(cpu->CurInstr & 0xF, 1); + u32 rs = cpu->GetReg((cpu->CurInstr >> 8) & 0xF, 1); + u32 rn = cpu->GetReg((cpu->CurInstr >> 12) & 0xF, 1); if (cpu->CurInstr & (1<<6)) rs >>= 16; else rs &= 0xFFFF; @@ -1000,8 +1000,8 @@ void A_SMULxy(ARM* cpu) { if (cpu->Num != 0) return; - u32 rm = cpu->R[cpu->CurInstr & 0xF]; - u32 rs = cpu->R[(cpu->CurInstr >> 8) & 0xF]; + u32 rm = cpu->GetReg(cpu->CurInstr & 0xF, 1); + u32 rs = cpu->GetReg((cpu->CurInstr >> 8) & 0xF, 1); if (cpu->CurInstr & (1<<5)) rm >>= 16; else rm &= 0xFFFF; @@ -1019,8 +1019,8 @@ void A_SMULWy(ARM* cpu) { if (cpu->Num != 0) return; - u32 rm = cpu->R[cpu->CurInstr & 0xF]; - u32 rs = cpu->R[(cpu->CurInstr >> 8) & 0xF]; + u32 rm = cpu->GetReg(cpu->CurInstr & 0xF, 1); + u32 rs = cpu->GetReg((cpu->CurInstr >> 8) & 0xF, 1); if (cpu->CurInstr & (1<<6)) rs >>= 16; else rs &= 0xFFFF; @@ -1036,8 +1036,8 @@ void A_SMLALxy(ARM* cpu) { if (cpu->Num != 0) return; - u32 rm = 
cpu->R[cpu->CurInstr & 0xF]; - u32 rs = cpu->R[(cpu->CurInstr >> 8) & 0xF]; + u32 rm = cpu->GetReg(cpu->CurInstr & 0xF, 0); + u32 rs = cpu->GetReg((cpu->CurInstr >> 8) & 0xF, 0); // yeah this one actually doesn't need two interlock cycles to interlock if (cpu->CurInstr & (1<<5)) rm >>= 16; else rm &= 0xFFFF; @@ -1053,7 +1053,7 @@ void A_SMLALxy(ARM* cpu) cpu->R[(cpu->CurInstr >> 16) & 0xF] = (u32)(res >> 32ULL); cpu->AddCycles_CI(1); - cpu->SetCycles_L((cpu->CurInstr >> 12) & 0xF, 1, cpu->ILT_Norm); + cpu->SetCycles_L((cpu->CurInstr >> 16) & 0xF, 1, cpu->ILT_Norm); } @@ -1062,7 +1062,7 @@ void A_CLZ(ARM* cpu) { if (cpu->Num != 0) return A_UNK(cpu); - u32 val = cpu->R[cpu->CurInstr & 0xF]; + u32 val = cpu->GetReg(cpu->CurInstr & 0xF, 1); u32 res = 0; while ((val & 0xFF000000) == 0) @@ -1086,8 +1086,8 @@ void A_QADD(ARM* cpu) { if (cpu->Num != 0) return A_UNK(cpu); - u32 rm = cpu->R[cpu->CurInstr & 0xF]; - u32 rn = cpu->R[(cpu->CurInstr >> 16) & 0xF]; + u32 rm = cpu->GetReg(cpu->CurInstr & 0xF, 1); + u32 rn = cpu->GetReg((cpu->CurInstr >> 16) & 0xF, 1); u32 res = rm + rn; if (OverflowAdd(rm, rn)) @@ -1105,8 +1105,8 @@ void A_QSUB(ARM* cpu) { if (cpu->Num != 0) return A_UNK(cpu); - u32 rm = cpu->R[cpu->CurInstr & 0xF]; - u32 rn = cpu->R[(cpu->CurInstr >> 16) & 0xF]; + u32 rm = cpu->GetReg(cpu->CurInstr & 0xF, 1); + u32 rn = cpu->GetReg((cpu->CurInstr >> 16) & 0xF, 1); u32 res = rm - rn; if (OverflowSub(rm, rn)) @@ -1124,8 +1124,8 @@ void A_QDADD(ARM* cpu) { if (cpu->Num != 0) return A_UNK(cpu); - u32 rm = cpu->R[cpu->CurInstr & 0xF]; - u32 rn = cpu->R[(cpu->CurInstr >> 16) & 0xF]; + u32 rm = cpu->GetReg(cpu->CurInstr & 0xF, 1); + u32 rn = cpu->GetReg((cpu->CurInstr >> 16) & 0xF, 1); if (OverflowAdd(rn, rn)) { @@ -1151,8 +1151,8 @@ void A_QDSUB(ARM* cpu) { if (cpu->Num != 0) return A_UNK(cpu); - u32 rm = cpu->R[cpu->CurInstr & 0xF]; - u32 rn = cpu->R[(cpu->CurInstr >> 16) & 0xF]; + u32 rm = cpu->GetReg(cpu->CurInstr & 0xF, 1); + u32 rn = cpu->GetReg((cpu->CurInstr >> 
16) & 0xF, 1); if (OverflowAdd(rn, rn)) { diff --git a/src/ARMInterpreter_LoadStore.cpp b/src/ARMInterpreter_LoadStore.cpp index 4e93c749..a11e912d 100644 --- a/src/ARMInterpreter_LoadStore.cpp +++ b/src/ARMInterpreter_LoadStore.cpp @@ -53,7 +53,7 @@ namespace melonDS::ARMInterpreter if (!(cpu->CurInstr & (1<<23))) offset = -offset; #define A_WB_CALC_OFFSET_REG(shiftop) \ - u32 offset = cpu->R[cpu->CurInstr & 0xF]; \ + u32 offset = cpu->GetReg(cpu->CurInstr & 0xF); \ u32 shift = ((cpu->CurInstr>>7)&0x1F); \ shiftop(offset, shift); \ if (!(cpu->CurInstr & (1<<23))) offset = -offset; @@ -61,8 +61,8 @@ namespace melonDS::ARMInterpreter #define A_STR \ - offset += cpu->R[(cpu->CurInstr>>16) & 0xF]; \ - u32 storeval = cpu->R[(cpu->CurInstr>>12) & 0xF]; \ + offset += cpu->GetReg((cpu->CurInstr>>16) & 0xF); \ + u32 storeval = cpu->GetReg((cpu->CurInstr>>12) & 0xF); \ if (((cpu->CurInstr>>12) & 0xF) == 0xF) \ storeval += 4; \ bool dataabort = !cpu->DataWrite32(offset, storeval); \ @@ -72,8 +72,8 @@ namespace melonDS::ARMInterpreter // TODO: user mode (bit21) #define A_STR_POST \ - u32 addr = cpu->R[(cpu->CurInstr>>16) & 0xF]; \ - u32 storeval = cpu->R[(cpu->CurInstr>>12) & 0xF]; \ + u32 addr = cpu->GetReg((cpu->CurInstr>>16) & 0xF); \ + u32 storeval = cpu->GetReg((cpu->CurInstr>>12) & 0xF); \ if (((cpu->CurInstr>>12) & 0xF) == 0xF) \ storeval += 4; \ bool dataabort = !cpu->DataWrite32(addr, storeval); \ @@ -82,8 +82,8 @@ namespace melonDS::ARMInterpreter cpu->R[(cpu->CurInstr>>16) & 0xF] += offset; #define A_STRB \ - offset += cpu->R[(cpu->CurInstr>>16) & 0xF]; \ - u32 storeval = cpu->R[(cpu->CurInstr>>12) & 0xF]; \ + offset += cpu->GetReg((cpu->CurInstr>>16) & 0xF); \ + u32 storeval = cpu->GetReg((cpu->CurInstr>>12) & 0xF); \ if (((cpu->CurInstr>>12) & 0xF) == 15) storeval+=4; \ bool dataabort = !cpu->DataWrite8(offset, storeval); \ cpu->AddCycles_CD(); \ @@ -92,8 +92,8 @@ namespace melonDS::ARMInterpreter // TODO: user mode (bit21) #define A_STRB_POST \ - u32 addr = 
cpu->R[(cpu->CurInstr>>16) & 0xF]; \ - u32 storeval = cpu->R[(cpu->CurInstr>>12) & 0xF]; \ + u32 addr = cpu->GetReg((cpu->CurInstr>>16) & 0xF); \ + u32 storeval = cpu->GetReg((cpu->CurInstr>>12) & 0xF); \ if (((cpu->CurInstr>>12) & 0xF) == 15) storeval+=4; \ bool dataabort = !cpu->DataWrite8(addr, storeval); \ cpu->AddCycles_CD(); \ @@ -101,7 +101,7 @@ namespace melonDS::ARMInterpreter cpu->R[(cpu->CurInstr>>16) & 0xF] += offset; #define A_LDR \ - offset += cpu->R[(cpu->CurInstr>>16) & 0xF]; \ + offset += cpu->GetReg((cpu->CurInstr>>16) & 0xF); \ u32 val; bool dataabort = !cpu->DataRead32(offset, &val); \ cpu->AddCycles_CDI(); \ if (dataabort) return; \ @@ -120,7 +120,7 @@ namespace melonDS::ARMInterpreter // TODO: user mode #define A_LDR_POST \ - u32 addr = cpu->R[(cpu->CurInstr>>16) & 0xF]; \ + u32 addr = cpu->GetReg((cpu->CurInstr>>16) & 0xF); \ u32 val; bool dataabort = !cpu->DataRead32(addr, &val); \ cpu->AddCycles_CDI(); \ if (dataabort) return; \ @@ -138,7 +138,7 @@ namespace melonDS::ARMInterpreter } #define A_LDRB \ - offset += cpu->R[(cpu->CurInstr>>16) & 0xF]; \ + offset += cpu->GetReg((cpu->CurInstr>>16) & 0xF); \ u32 val; bool dataabort = !cpu->DataRead8(offset, &val); \ cpu->AddCycles_CDI(); \ if (dataabort) return; \ @@ -153,7 +153,7 @@ namespace melonDS::ARMInterpreter // TODO: user mode #define A_LDRB_POST \ - u32 addr = cpu->R[(cpu->CurInstr>>16) & 0xF]; \ + u32 addr = cpu->GetReg((cpu->CurInstr>>16) & 0xF); \ u32 val; bool dataabort = !cpu->DataRead8(addr, &val); \ cpu->AddCycles_CDI(); \ if (dataabort) return; \ @@ -242,14 +242,14 @@ A_IMPLEMENT_WB_LDRSTR(LDRB) if (!(cpu->CurInstr & (1<<23))) offset = -offset; #define A_HD_CALC_OFFSET_REG \ - u32 offset = cpu->R[cpu->CurInstr & 0xF]; \ + u32 offset = cpu->GetReg(cpu->CurInstr & 0xF); \ if (!(cpu->CurInstr & (1<<23))) offset = -offset; #define A_STRH \ - offset += cpu->R[(cpu->CurInstr>>16) & 0xF]; \ - u32 storeval = cpu->R[(cpu->CurInstr>>12) & 0xF]; \ + offset += cpu->GetReg((cpu->CurInstr>>16) 
& 0xF); \ + u32 storeval = cpu->GetReg((cpu->CurInstr>>12) & 0xF); \ if (((cpu->CurInstr>>12) & 0xF) == 15) storeval+=4; \ bool dataabort = !cpu->DataWrite16(offset, storeval); \ cpu->AddCycles_CD(); \ @@ -257,8 +257,8 @@ A_IMPLEMENT_WB_LDRSTR(LDRB) if (cpu->CurInstr & (1<<21)) cpu->R[(cpu->CurInstr>>16) & 0xF] = offset; #define A_STRH_POST \ - u32 addr = cpu->R[(cpu->CurInstr>>16) & 0xF]; \ - u32 storeval = cpu->R[(cpu->CurInstr>>12) & 0xF]; \ + u32 addr = cpu->GetReg((cpu->CurInstr>>16) & 0xF); \ + u32 storeval = cpu->GetReg((cpu->CurInstr>>12) & 0xF); \ if (((cpu->CurInstr>>12) & 0xF) == 15) storeval+=4; \ bool dataabort = !cpu->DataWrite16(addr, storeval); \ cpu->AddCycles_CD(); \ @@ -269,7 +269,7 @@ A_IMPLEMENT_WB_LDRSTR(LDRB) #define A_LDRD \ if (cpu->Num != 0) return; \ - offset += cpu->R[(cpu->CurInstr>>16) & 0xF]; \ + offset += cpu->GetReg((cpu->CurInstr>>16) & 0xF); \ u32 r = (cpu->CurInstr>>12) & 0xF; \ if (r&1) { A_UNK(cpu); return; } \ if (!cpu->DataRead32 (offset, &cpu->R[r])) {cpu->AddCycles_CDI(); return;} \ @@ -287,7 +287,7 @@ A_IMPLEMENT_WB_LDRSTR(LDRB) #define A_LDRD_POST \ if (cpu->Num != 0) return; \ - u32 addr = cpu->R[(cpu->CurInstr>>16) & 0xF]; \ + u32 addr = cpu->GetReg((cpu->CurInstr>>16) & 0xF); \ u32 r = (cpu->CurInstr>>12) & 0xF; \ if (r&1) { A_UNK(cpu); return; } \ if (!cpu->DataRead32 (addr, &cpu->R[r])) {cpu->AddCycles_CDI(); return;} \ @@ -305,11 +305,11 @@ A_IMPLEMENT_WB_LDRSTR(LDRB) #define A_STRD \ if (cpu->Num != 0) return; \ - offset += cpu->R[(cpu->CurInstr>>16) & 0xF]; \ + offset += cpu->GetReg((cpu->CurInstr>>16) & 0xF); \ u32 r = (cpu->CurInstr>>12) & 0xF; \ if (r&1) { A_UNK(cpu); return; } \ - bool dataabort = !cpu->DataWrite32(offset, cpu->R[r]); /* yes, this data abort behavior is on purpose */ \ - u32 storeval = cpu->R[r+1]; if (r == 14) storeval+=4; \ + bool dataabort = !cpu->DataWrite32(offset, cpu->GetReg(r)); /* yes, this data abort behavior is on purpose */ \ + u32 storeval = cpu->GetReg(r+1, cpu->DataCycles); if 
(r == 14) storeval+=4; \ dataabort |= !cpu->DataWrite32S (offset+4, storeval, dataabort); /* no, i dont understand it either */ \ cpu->AddCycles_CD(); \ if (dataabort) return; \ @@ -317,18 +317,18 @@ A_IMPLEMENT_WB_LDRSTR(LDRB) #define A_STRD_POST \ if (cpu->Num != 0) return; \ - u32 addr = cpu->R[(cpu->CurInstr>>16) & 0xF]; \ + u32 addr = cpu->GetReg((cpu->CurInstr>>16) & 0xF); \ u32 r = (cpu->CurInstr>>12) & 0xF; \ if (r&1) { A_UNK(cpu); return; } \ - bool dataabort = !cpu->DataWrite32(addr, cpu->R[r]); \ - u32 storeval = cpu->R[r+1]; if (r == 14) storeval+=4; \ + bool dataabort = !cpu->DataWrite32(addr, cpu->GetReg(r)); \ + u32 storeval = cpu->GetReg(r+1, cpu->DataCycles); if (r == 14) storeval+=4; \ dataabort |= !cpu->DataWrite32S (addr+4, storeval, dataabort); \ cpu->AddCycles_CD(); \ if (dataabort) return; \ cpu->R[(cpu->CurInstr>>16) & 0xF] += offset; #define A_LDRH \ - offset += cpu->R[(cpu->CurInstr>>16) & 0xF]; \ + offset += cpu->GetReg((cpu->CurInstr>>16) & 0xF); \ u32 val; bool dataabort = !cpu->DataRead16(offset, &val); \ cpu->AddCycles_CDI(); \ if (dataabort) return; \ @@ -342,7 +342,7 @@ A_IMPLEMENT_WB_LDRSTR(LDRB) if (cpu->CurInstr & (1<<21)) cpu->R[(cpu->CurInstr>>16) & 0xF] = offset; #define A_LDRH_POST \ - u32 addr = cpu->R[(cpu->CurInstr>>16) & 0xF]; \ + u32 addr = cpu->GetReg((cpu->CurInstr>>16) & 0xF); \ u32 val; bool dataabort = !cpu->DataRead16(addr, &val); \ cpu->AddCycles_CDI(); \ if (dataabort) return; \ @@ -356,7 +356,7 @@ A_IMPLEMENT_WB_LDRSTR(LDRB) cpu->R[(cpu->CurInstr>>16) & 0xF] += offset; #define A_LDRSB \ - offset += cpu->R[(cpu->CurInstr>>16) & 0xF]; \ + offset += cpu->GetReg((cpu->CurInstr>>16) & 0xF); \ u32 val; bool dataabort = !cpu->DataRead8(offset, &val); \ cpu->AddCycles_CDI(); \ if (dataabort) return; \ @@ -371,7 +371,7 @@ A_IMPLEMENT_WB_LDRSTR(LDRB) if (cpu->CurInstr & (1<<21)) cpu->R[(cpu->CurInstr>>16) & 0xF] = offset; #define A_LDRSB_POST \ - u32 addr = cpu->R[(cpu->CurInstr>>16) & 0xF]; \ + u32 addr = 
cpu->GetReg((cpu->CurInstr>>16) & 0xF); \ u32 val; bool dataabort = !cpu->DataRead8(addr, &val); \ cpu->AddCycles_CDI(); \ if (dataabort) return; \ @@ -386,7 +386,7 @@ A_IMPLEMENT_WB_LDRSTR(LDRB) cpu->R[(cpu->CurInstr>>16) & 0xF] += offset; #define A_LDRSH \ - offset += cpu->R[(cpu->CurInstr>>16) & 0xF]; \ + offset += cpu->GetReg((cpu->CurInstr>>16) & 0xF); \ u32 val; bool dataabort = !cpu->DataRead16(offset, &val); \ cpu->AddCycles_CDI(); \ if (dataabort) return; \ @@ -401,7 +401,7 @@ A_IMPLEMENT_WB_LDRSTR(LDRB) if (cpu->CurInstr & (1<<21)) cpu->R[(cpu->CurInstr>>16) & 0xF] = offset; #define A_LDRSH_POST \ - u32 addr = cpu->R[(cpu->CurInstr>>16) & 0xF]; \ + u32 addr = cpu->GetReg((cpu->CurInstr>>16) & 0xF); \ u32 val; bool dataabort = !cpu->DataRead16(addr, &val); \ cpu->AddCycles_CDI(); \ if (dataabort) return; \ @@ -452,8 +452,8 @@ A_IMPLEMENT_HD_LDRSTR(LDRSH) void A_SWP(ARM* cpu) { - u32 base = cpu->R[(cpu->CurInstr >> 16) & 0xF]; - u32 rm = cpu->R[cpu->CurInstr & 0xF]; + u32 rm = cpu->GetReg(cpu->CurInstr & 0xF, 1); + u32 base = cpu->GetReg((cpu->CurInstr >> 16) & 0xF); if ((cpu->CurInstr & 0xF) == 15) rm += 4; u32 val; @@ -468,9 +468,18 @@ void A_SWP(ARM* cpu) if (rd != 15) { cpu->R[rd] = ROR(val, 8*(base&0x3)); - cpu->SetCycles_L(rd, 1, cpu->ILT_Norm); // TODO: it adds an extra interlock cycle when doing a misaligned load from a non-itcm address + + u32 cycles; + if (base & 3) // add an extra interlock cycle when doing a misaligned load from a non-itcm address (checkme: does it matter whether you're executing from there?) + { + if (cpu->Num == 1) cycles = 2; // checkme + else cycles = ((base < ((ARMv5*)cpu)->ITCMSize) && ((cpu->R[15]-8) < ((ARMv5*)cpu)->ITCMSize)) ? 1 : 2; + } + else cycles = 1; + + cpu->SetCycles_L(rd, cycles, cpu->ILT_Norm); } - else if (cpu->Num==1) // for some reason these jumps don't work on the arm 9? + else if (cpu->Num == 1) // for some reason these jumps don't work on the arm 9? 
cpu->JumpTo(ROR(val, 8*(base&0x3)) & ~1, cpu->ILT_Norm); } else cpu->AddCycles_CDI(); @@ -481,8 +490,8 @@ void A_SWP(ARM* cpu) void A_SWPB(ARM* cpu) { - u32 base = cpu->R[(cpu->CurInstr >> 16) & 0xF]; - u32 rm = cpu->R[cpu->CurInstr & 0xF] & 0xFF; + u32 rm = cpu->GetReg(cpu->CurInstr & 0xF, 1) & 0xFF; + u32 base = cpu->GetReg((cpu->CurInstr >> 16) & 0xF); if ((cpu->CurInstr & 0xF) == 15) rm += 4; u32 val; @@ -497,9 +506,15 @@ void A_SWPB(ARM* cpu) if (rd != 15) { cpu->R[rd] = val; - cpu->SetCycles_L(rd, 1, cpu->ILT_Norm); // TODO: it adds an extra interlock cycle when doing a load from a non-itcm address + + // add an extra interlock cycle when doing a load from a non-itcm address (checkme: does it matter whether you're executing from there?) + u32 cycles; + if (cpu->Num == 1) cycles = 2; // checkme + else cycles = ((base < ((ARMv5*)cpu)->ITCMSize) && ((cpu->R[15]-8) < ((ARMv5*)cpu)->ITCMSize)) ? 1 : 2; + + cpu->SetCycles_L(rd, cycles, cpu->ILT_Norm); } - else if (cpu->Num==1)// for some reason these jumps don't work on the arm 9? + else if (cpu->Num == 1)// for some reason these jumps don't work on the arm 9? cpu->JumpTo(val & ~1); } else cpu->AddCycles_CDI(); @@ -513,12 +528,12 @@ void A_SWPB(ARM* cpu) void A_LDM(ARM* cpu) { u32 baseid = (cpu->CurInstr >> 16) & 0xF; - u32 base = cpu->R[baseid]; + u32 base = cpu->GetReg(baseid, 1); u32 wbbase; u32 oldbase = base; u32 preinc = (cpu->CurInstr & (1<<24)); bool first = true; - u8 lastreg = 0; // TODO: this doesn't support 0 reg LDMs (do those even work?) + u32 lastreg = 0; // TODO: this doesn't support 0 reg LDMs (do those even work?) 
if (!(cpu->CurInstr & (1<<23))) // decrement { @@ -554,8 +569,8 @@ void A_LDM(ARM* cpu) } first = false; - if (!preinc) base += 4; lastreg = i; + if (!preinc) base += 4; } } @@ -578,7 +593,12 @@ void A_LDM(ARM* cpu) else { cpu->AddCycles_CDI(); - cpu->SetCycles_L(lastreg, 1, cpu->ILT_Norm); // TODO: THIS DOESN'T APPLY WHEN LOADING FROM ITCM + + u32 lastbase = base; + if (!preinc) lastbase -= 4; + // no interlock occurs when loading from itcm (checkme: does it matter whether you're executing from there?) + if ((((ARMv5*)cpu)->ITCMSize < lastbase) && ((cpu->R[15]-8) > ((ARMv5*)cpu)->ITCMSize) && (cpu->CurInstr & (0x7FFF >> (15 - lastreg)))) + cpu->SetCycles_L(lastreg, 1, cpu->ILT_Norm); } // switch back to previous regs @@ -628,7 +648,7 @@ void A_LDM(ARM* cpu) void A_STM(ARM* cpu) { u32 baseid = (cpu->CurInstr >> 16) & 0xF; - u32 base = cpu->R[baseid]; + u32 base = cpu->GetReg(baseid, 1); u32 oldbase = base; u32 preinc = (cpu->CurInstr & (1<<24)); bool first = true; @@ -672,7 +692,7 @@ void A_STM(ARM* cpu) val = oldbase; else val = base; } - else val = cpu->R[i]; + else val = cpu->GetReg(i, 1+cpu->DataCycles); if (i == 15) val+=4; From 449557624d3577f26c39520c761ca69d6e297ce4 Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Sat, 15 Jun 2024 18:37:31 -0400 Subject: [PATCH 073/306] don't do interlocks for the arm7 --- src/ARM.h | 45 ++++++++++++++++++++++++++++++--------------- 1 file changed, 30 insertions(+), 15 deletions(-) diff --git a/src/ARM.h b/src/ARM.h index ff857db9..739c704f 100644 --- a/src/ARM.h +++ b/src/ARM.h @@ -166,23 +166,10 @@ public: } // fetch the value of a register while handling any interlock cycles - inline u32 GetReg(const u32 reg, const u32 delay = 0) - { -#ifdef INTERLOCK - if (InterlockTimestamp[reg] > (Timestamp() + delay)) - Timestamp() = InterlockTimestamp[reg] - delay; -#endif - return R[reg]; - } + virtual inline u32 GetReg(const u32 reg, const u32 delay = 0) = 0; // Must be called after all of an 
instruction's cycles are calculated!!! - inline void SetCycles_L(const u32 reg, const u32 cycles, const u32 type) - { -#ifdef INTERLOCK - InterlockTimestamp[reg] = cycles + Timestamp() + Cycles; - //InterlockType[reg] = type; -#endif - } + virtual inline void SetCycles_L(const u32 reg, const u32 cycles, const u32 type) = 0; virtual u64& Timestamp() = 0; @@ -351,6 +338,25 @@ public: // Cycles += numC + numD; } + // fetch the value of a register while handling any interlock cycles + inline u32 GetReg(const u32 reg, const u32 delay = 0) override + { +#ifdef INTERLOCK + if (InterlockTimestamp[reg] > (Timestamp() + delay)) + Timestamp() = InterlockTimestamp[reg] - delay; +#endif + return R[reg]; + } + + // Must be called after all of an instruction's cycles are calculated!!! + inline void SetCycles_L(const u32 reg, const u32 cycles, const u32 type) override + { +#ifdef INTERLOCK + InterlockTimestamp[reg] = cycles + Timestamp() + Cycles; + //InterlockType[reg] = type; +#endif + } + u64& Timestamp() override; void GetCodeMemRegion(u32 addr, MemRegion* region); @@ -468,6 +474,15 @@ public: void AddCycles_CDI() override; void AddCycles_CD() override; + // fetch the value of a register while handling any interlock cycles + inline u32 GetReg(const u32 reg, const u32 delay = 0) override + { + return R[reg]; + } + + // Must be called after all of an instruction's cycles are calculated!!! 
+ inline void SetCycles_L(const u32 reg, const u32 cycles, const u32 type) override{} + u64& Timestamp() override; protected: u8 BusRead8(u32 addr) override; From debaaa0425a921817fc1701bca8f645f8248fd76 Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Sat, 15 Jun 2024 18:47:56 -0400 Subject: [PATCH 074/306] fix performance regression for disabling interlock emulation path --- src/ARM.h | 26 +++++++++++++++++++------- 1 file changed, 19 insertions(+), 7 deletions(-) diff --git a/src/ARM.h b/src/ARM.h index 739c704f..e5d82ddf 100644 --- a/src/ARM.h +++ b/src/ARM.h @@ -145,6 +145,7 @@ public: virtual void AddCycles_CDI() = 0; virtual void AddCycles_CD() = 0; +/* inline void AddCycles_L(const u32 delay, const u32 reg1) { if (InterlockTimestamp[reg1] > Timestamp() + delay); @@ -163,13 +164,24 @@ public: u64 cycles = std::max(InterlockTimestamp[reg1], std::max(InterlockTimestamp[reg2], InterlockTimestamp[reg3])); if (cycles > Timestamp() + delay) Timestamp() = cycles; - } - + }*/ + +#ifdef INTERLOCK // fetch the value of a register while handling any interlock cycles virtual inline u32 GetReg(const u32 reg, const u32 delay = 0) = 0; // Must be called after all of an instruction's cycles are calculated!!! virtual inline void SetCycles_L(const u32 reg, const u32 cycles, const u32 type) = 0; +#else + // fetch the value of a register while handling any interlock cycles + inline u32 GetReg(const u32 reg, const u32 delay = 0) + { + return R[reg]; + } + + // Must be called after all of an instruction's cycles are calculated!!! 
+ inline void SetCycles_L(const u32 reg, const u32 cycles, const u32 type) {} +#endif virtual u64& Timestamp() = 0; @@ -337,25 +349,23 @@ public: //else // Cycles += numC + numD; } - + +#ifdef INTERLOCK // fetch the value of a register while handling any interlock cycles inline u32 GetReg(const u32 reg, const u32 delay = 0) override { -#ifdef INTERLOCK if (InterlockTimestamp[reg] > (Timestamp() + delay)) Timestamp() = InterlockTimestamp[reg] - delay; -#endif return R[reg]; } // Must be called after all of an instruction's cycles are calculated!!! inline void SetCycles_L(const u32 reg, const u32 cycles, const u32 type) override { -#ifdef INTERLOCK InterlockTimestamp[reg] = cycles + Timestamp() + Cycles; //InterlockType[reg] = type; -#endif } +#endif u64& Timestamp() override; @@ -474,6 +484,7 @@ public: void AddCycles_CDI() override; void AddCycles_CD() override; +#ifdef INTERLOCK // fetch the value of a register while handling any interlock cycles inline u32 GetReg(const u32 reg, const u32 delay = 0) override { @@ -482,6 +493,7 @@ public: // Must be called after all of an instruction's cycles are calculated!!! 
inline void SetCycles_L(const u32 reg, const u32 cycles, const u32 type) override{} +#endif u64& Timestamp() override; protected: From 5b37ca70d153a67a988a2f4c35ebd271157410fc Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Sun, 16 Jun 2024 20:44:55 -0400 Subject: [PATCH 075/306] implement correct/guess interlocks for remaining instructions --- src/ARMInterpreter_ALU.cpp | 130 +++++++++++++------------- src/ARMInterpreter_Branch.cpp | 16 ++-- src/ARMInterpreter_LoadStore.cpp | 155 +++++++++++++++++++------------ 3 files changed, 168 insertions(+), 133 deletions(-) diff --git a/src/ARMInterpreter_ALU.cpp b/src/ARMInterpreter_ALU.cpp index ac18872b..17afa833 100644 --- a/src/ARMInterpreter_ALU.cpp +++ b/src/ARMInterpreter_ALU.cpp @@ -1180,9 +1180,9 @@ void A_QDSUB(ARM* cpu) -void T_LSL_IMM(ARM* cpu) +void T_LSL_IMM(ARM* cpu) // verify interlock { - u32 op = cpu->R[(cpu->CurInstr >> 3) & 0x7]; + u32 op = cpu->GetReg((cpu->CurInstr >> 3) & 0x7); u32 s = (cpu->CurInstr >> 6) & 0x1F; LSL_IMM_S(op, s); cpu->R[cpu->CurInstr & 0x7] = op; @@ -1191,9 +1191,9 @@ void T_LSL_IMM(ARM* cpu) cpu->AddCycles_C(); } -void T_LSR_IMM(ARM* cpu) +void T_LSR_IMM(ARM* cpu) // verify interlock { - u32 op = cpu->R[(cpu->CurInstr >> 3) & 0x7]; + u32 op = cpu->GetReg((cpu->CurInstr >> 3) & 0x7); u32 s = (cpu->CurInstr >> 6) & 0x1F; LSR_IMM_S(op, s); cpu->R[cpu->CurInstr & 0x7] = op; @@ -1202,9 +1202,9 @@ void T_LSR_IMM(ARM* cpu) cpu->AddCycles_C(); } -void T_ASR_IMM(ARM* cpu) +void T_ASR_IMM(ARM* cpu) // verify interlock { - u32 op = cpu->R[(cpu->CurInstr >> 3) & 0x7]; + u32 op = cpu->GetReg((cpu->CurInstr >> 3) & 0x7); u32 s = (cpu->CurInstr >> 6) & 0x1F; ASR_IMM_S(op, s); cpu->R[cpu->CurInstr & 0x7] = op; @@ -1215,8 +1215,8 @@ void T_ASR_IMM(ARM* cpu) void T_ADD_REG_(ARM* cpu) { - u32 a = cpu->R[(cpu->CurInstr >> 3) & 0x7]; - u32 b = cpu->R[(cpu->CurInstr >> 6) & 0x7]; + u32 a = cpu->GetReg((cpu->CurInstr >> 3) & 0x7); + u32 b = 
cpu->GetReg((cpu->CurInstr >> 6) & 0x7); u32 res = a + b; cpu->R[cpu->CurInstr & 0x7] = res; cpu->SetNZCV(res & 0x80000000, @@ -1228,8 +1228,8 @@ void T_ADD_REG_(ARM* cpu) void T_SUB_REG_(ARM* cpu) { - u32 a = cpu->R[(cpu->CurInstr >> 3) & 0x7]; - u32 b = cpu->R[(cpu->CurInstr >> 6) & 0x7]; + u32 a = cpu->GetReg((cpu->CurInstr >> 3) & 0x7); + u32 b = cpu->GetReg((cpu->CurInstr >> 6) & 0x7); u32 res = a - b; cpu->R[cpu->CurInstr & 0x7] = res; cpu->SetNZCV(res & 0x80000000, @@ -1239,9 +1239,9 @@ void T_SUB_REG_(ARM* cpu) cpu->AddCycles_C(); } -void T_ADD_IMM_(ARM* cpu) +void T_ADD_IMM_(ARM* cpu) // verify interlock { - u32 a = cpu->R[(cpu->CurInstr >> 3) & 0x7]; + u32 a = cpu->GetReg((cpu->CurInstr >> 3) & 0x7); u32 b = (cpu->CurInstr >> 6) & 0x7; u32 res = a + b; cpu->R[cpu->CurInstr & 0x7] = res; @@ -1252,9 +1252,9 @@ void T_ADD_IMM_(ARM* cpu) cpu->AddCycles_C(); } -void T_SUB_IMM_(ARM* cpu) +void T_SUB_IMM_(ARM* cpu) // verify interlock { - u32 a = cpu->R[(cpu->CurInstr >> 3) & 0x7]; + u32 a = cpu->GetReg((cpu->CurInstr >> 3) & 0x7); u32 b = (cpu->CurInstr >> 6) & 0x7; u32 res = a - b; cpu->R[cpu->CurInstr & 0x7] = res; @@ -1265,7 +1265,7 @@ void T_SUB_IMM_(ARM* cpu) cpu->AddCycles_C(); } -void T_MOV_IMM(ARM* cpu) +void T_MOV_IMM(ARM* cpu) // verify interlock { u32 b = cpu->CurInstr & 0xFF; cpu->R[(cpu->CurInstr >> 8) & 0x7] = b; @@ -1274,7 +1274,7 @@ void T_MOV_IMM(ARM* cpu) cpu->AddCycles_C(); } -void T_CMP_IMM(ARM* cpu) +void T_CMP_IMM(ARM* cpu) // verify interlock { u32 a = cpu->R[(cpu->CurInstr >> 8) & 0x7]; u32 b = cpu->CurInstr & 0xFF; @@ -1286,9 +1286,9 @@ void T_CMP_IMM(ARM* cpu) cpu->AddCycles_C(); } -void T_ADD_IMM(ARM* cpu) +void T_ADD_IMM(ARM* cpu) // verify interlock { - u32 a = cpu->R[(cpu->CurInstr >> 8) & 0x7]; + u32 a = cpu->GetReg((cpu->CurInstr >> 8) & 0x7); u32 b = cpu->CurInstr & 0xFF; u32 res = a + b; cpu->R[(cpu->CurInstr >> 8) & 0x7] = res; @@ -1299,9 +1299,9 @@ void T_ADD_IMM(ARM* cpu) cpu->AddCycles_C(); } -void T_SUB_IMM(ARM* cpu) +void 
T_SUB_IMM(ARM* cpu) // verify interlock { - u32 a = cpu->R[(cpu->CurInstr >> 8) & 0x7]; + u32 a = cpu->GetReg((cpu->CurInstr >> 8) & 0x7); u32 b = cpu->CurInstr & 0xFF; u32 res = a - b; cpu->R[(cpu->CurInstr >> 8) & 0x7] = res; @@ -1315,8 +1315,8 @@ void T_SUB_IMM(ARM* cpu) void T_AND_REG(ARM* cpu) { - u32 a = cpu->R[cpu->CurInstr & 0x7]; - u32 b = cpu->R[(cpu->CurInstr >> 3) & 0x7]; + u32 a = cpu->GetReg(cpu->CurInstr & 0x7); + u32 b = cpu->GetReg((cpu->CurInstr >> 3) & 0x7); u32 res = a & b; cpu->R[cpu->CurInstr & 0x7] = res; cpu->SetNZ(res & 0x80000000, @@ -1326,8 +1326,8 @@ void T_AND_REG(ARM* cpu) void T_EOR_REG(ARM* cpu) { - u32 a = cpu->R[cpu->CurInstr & 0x7]; - u32 b = cpu->R[(cpu->CurInstr >> 3) & 0x7]; + u32 a = cpu->GetReg(cpu->CurInstr & 0x7); + u32 b = cpu->GetReg((cpu->CurInstr >> 3) & 0x7); u32 res = a ^ b; cpu->R[cpu->CurInstr & 0x7] = res; cpu->SetNZ(res & 0x80000000, @@ -1337,8 +1337,8 @@ void T_EOR_REG(ARM* cpu) void T_LSL_REG(ARM* cpu) { - u32 a = cpu->R[cpu->CurInstr & 0x7]; - u32 b = cpu->R[(cpu->CurInstr >> 3) & 0x7] & 0xFF; + u32 a = cpu->GetReg(cpu->CurInstr & 0x7, 1); + u32 b = cpu->GetReg((cpu->CurInstr >> 3) & 0x7) & 0xFF; LSL_REG_S(a, b); cpu->R[cpu->CurInstr & 0x7] = a; cpu->SetNZ(a & 0x80000000, @@ -1348,8 +1348,8 @@ void T_LSL_REG(ARM* cpu) void T_LSR_REG(ARM* cpu) { - u32 a = cpu->R[cpu->CurInstr & 0x7]; - u32 b = cpu->R[(cpu->CurInstr >> 3) & 0x7] & 0xFF; + u32 a = cpu->GetReg(cpu->CurInstr & 0x7, 1); + u32 b = cpu->GetReg((cpu->CurInstr >> 3) & 0x7) & 0xFF; LSR_REG_S(a, b); cpu->R[cpu->CurInstr & 0x7] = a; cpu->SetNZ(a & 0x80000000, @@ -1359,8 +1359,8 @@ void T_LSR_REG(ARM* cpu) void T_ASR_REG(ARM* cpu) { - u32 a = cpu->R[cpu->CurInstr & 0x7]; - u32 b = cpu->R[(cpu->CurInstr >> 3) & 0x7] & 0xFF; + u32 a = cpu->GetReg(cpu->CurInstr & 0x7, 1); + u32 b = cpu->GetReg((cpu->CurInstr >> 3) & 0x7) & 0xFF; ASR_REG_S(a, b); cpu->R[cpu->CurInstr & 0x7] = a; cpu->SetNZ(a & 0x80000000, @@ -1370,8 +1370,8 @@ void T_ASR_REG(ARM* cpu) void 
T_ADC_REG(ARM* cpu) { - u32 a = cpu->R[cpu->CurInstr & 0x7]; - u32 b = cpu->R[(cpu->CurInstr >> 3) & 0x7]; + u32 a = cpu->GetReg(cpu->CurInstr & 0x7); + u32 b = cpu->GetReg((cpu->CurInstr >> 3) & 0x7); u32 res_tmp = a + b; u32 carry = (cpu->CPSR&0x20000000 ? 1:0); u32 res = res_tmp + carry; @@ -1385,8 +1385,8 @@ void T_ADC_REG(ARM* cpu) void T_SBC_REG(ARM* cpu) { - u32 a = cpu->R[cpu->CurInstr & 0x7]; - u32 b = cpu->R[(cpu->CurInstr >> 3) & 0x7]; + u32 a = cpu->GetReg(cpu->CurInstr & 0x7); + u32 b = cpu->GetReg((cpu->CurInstr >> 3) & 0x7); u32 res_tmp = a - b; u32 carry = (cpu->CPSR&0x20000000 ? 0:1); u32 res = res_tmp - carry; @@ -1400,8 +1400,8 @@ void T_SBC_REG(ARM* cpu) void T_ROR_REG(ARM* cpu) { - u32 a = cpu->R[cpu->CurInstr & 0x7]; - u32 b = cpu->R[(cpu->CurInstr >> 3) & 0x7] & 0xFF; + u32 a = cpu->GetReg(cpu->CurInstr & 0x7, 1); + u32 b = cpu->GetReg((cpu->CurInstr >> 3) & 0x7) & 0xFF; ROR_REG_S(a, b); cpu->R[cpu->CurInstr & 0x7] = a; cpu->SetNZ(a & 0x80000000, @@ -1411,8 +1411,8 @@ void T_ROR_REG(ARM* cpu) void T_TST_REG(ARM* cpu) { - u32 a = cpu->R[cpu->CurInstr & 0x7]; - u32 b = cpu->R[(cpu->CurInstr >> 3) & 0x7]; + u32 a = cpu->GetReg(cpu->CurInstr & 0x7); + u32 b = cpu->GetReg((cpu->CurInstr >> 3) & 0x7); u32 res = a & b; cpu->SetNZ(res & 0x80000000, !res); @@ -1421,7 +1421,7 @@ void T_TST_REG(ARM* cpu) void T_NEG_REG(ARM* cpu) { - u32 b = cpu->R[(cpu->CurInstr >> 3) & 0x7]; + u32 b = cpu->GetReg((cpu->CurInstr >> 3) & 0x7); u32 res = -b; cpu->R[cpu->CurInstr & 0x7] = res; cpu->SetNZCV(res & 0x80000000, @@ -1433,8 +1433,8 @@ void T_NEG_REG(ARM* cpu) void T_CMP_REG(ARM* cpu) { - u32 a = cpu->R[cpu->CurInstr & 0x7]; - u32 b = cpu->R[(cpu->CurInstr >> 3) & 0x7]; + u32 a = cpu->GetReg(cpu->CurInstr & 0x7); + u32 b = cpu->GetReg((cpu->CurInstr >> 3) & 0x7); u32 res = a - b; cpu->SetNZCV(res & 0x80000000, !res, @@ -1445,8 +1445,8 @@ void T_CMP_REG(ARM* cpu) void T_CMN_REG(ARM* cpu) { - u32 a = cpu->R[cpu->CurInstr & 0x7]; - u32 b = cpu->R[(cpu->CurInstr >> 
3) & 0x7]; + u32 a = cpu->GetReg(cpu->CurInstr & 0x7); + u32 b = cpu->GetReg((cpu->CurInstr >> 3) & 0x7); u32 res = a + b; cpu->SetNZCV(res & 0x80000000, !res, @@ -1457,8 +1457,8 @@ void T_CMN_REG(ARM* cpu) void T_ORR_REG(ARM* cpu) { - u32 a = cpu->R[cpu->CurInstr & 0x7]; - u32 b = cpu->R[(cpu->CurInstr >> 3) & 0x7]; + u32 a = cpu->GetReg(cpu->CurInstr & 0x7); + u32 b = cpu->GetReg((cpu->CurInstr >> 3) & 0x7); u32 res = a | b; cpu->R[cpu->CurInstr & 0x7] = res; cpu->SetNZ(res & 0x80000000, @@ -1468,8 +1468,8 @@ void T_ORR_REG(ARM* cpu) void T_MUL_REG(ARM* cpu) { - u32 a = cpu->R[cpu->CurInstr & 0x7]; - u32 b = cpu->R[(cpu->CurInstr >> 3) & 0x7]; + u32 a = cpu->GetReg(cpu->CurInstr & 0x7); + u32 b = cpu->GetReg((cpu->CurInstr >> 3) & 0x7); u32 res = a * b; cpu->R[cpu->CurInstr & 0x7] = res; cpu->SetNZ(res & 0x80000000, @@ -1493,8 +1493,8 @@ void T_MUL_REG(ARM* cpu) void T_BIC_REG(ARM* cpu) { - u32 a = cpu->R[cpu->CurInstr & 0x7]; - u32 b = cpu->R[(cpu->CurInstr >> 3) & 0x7]; + u32 a = cpu->GetReg(cpu->CurInstr & 0x7); + u32 b = cpu->GetReg((cpu->CurInstr >> 3) & 0x7); u32 res = a & ~b; cpu->R[cpu->CurInstr & 0x7] = res; cpu->SetNZ(res & 0x80000000, @@ -1504,7 +1504,7 @@ void T_BIC_REG(ARM* cpu) void T_MVN_REG(ARM* cpu) { - u32 b = cpu->R[(cpu->CurInstr >> 3) & 0x7]; + u32 b = cpu->GetReg((cpu->CurInstr >> 3) & 0x7); u32 res = ~b; cpu->R[cpu->CurInstr & 0x7] = res; cpu->SetNZ(res & 0x80000000, @@ -1516,13 +1516,13 @@ void T_MVN_REG(ARM* cpu) // TODO: check those when MSBs and MSBd are cleared // GBAtek says it's not allowed, but it works atleast on the ARM9 -void T_ADD_HIREG(ARM* cpu) +void T_ADD_HIREG(ARM* cpu) // verify interlock { u32 rd = (cpu->CurInstr & 0x7) | ((cpu->CurInstr >> 4) & 0x8); u32 rs = (cpu->CurInstr >> 3) & 0xF; - u32 a = cpu->R[rd]; - u32 b = cpu->R[rs]; + u32 a = cpu->GetReg(rd); + u32 b = cpu->GetReg(rs); cpu->AddCycles_C(); @@ -1536,13 +1536,13 @@ void T_ADD_HIREG(ARM* cpu) } } -void T_CMP_HIREG(ARM* cpu) +void T_CMP_HIREG(ARM* cpu) // verify 
interlock { u32 rd = (cpu->CurInstr & 0x7) | ((cpu->CurInstr >> 4) & 0x8); u32 rs = (cpu->CurInstr >> 3) & 0xF; - u32 a = cpu->R[rd]; - u32 b = cpu->R[rs]; + u32 a = cpu->GetReg(rd); + u32 b = cpu->GetReg(rs); u32 res = a - b; cpu->SetNZCV(res & 0x80000000, @@ -1552,7 +1552,7 @@ void T_CMP_HIREG(ARM* cpu) cpu->AddCycles_C(); } -void T_MOV_HIREG(ARM* cpu) +void T_MOV_HIREG(ARM* cpu) // verify interlock { u32 rd = (cpu->CurInstr & 0x7) | ((cpu->CurInstr >> 4) & 0x8); u32 rs = (cpu->CurInstr >> 3) & 0xF; @@ -1561,11 +1561,11 @@ void T_MOV_HIREG(ARM* cpu) if (rd == 15) { - cpu->JumpTo(cpu->R[rs] | 1); + cpu->JumpTo(cpu->GetReg(rs) | 1); } else { - cpu->R[rd] = cpu->R[rs]; + cpu->R[rd] = cpu->GetReg(rs); } // nocash-style debugging hook @@ -1582,25 +1582,25 @@ void T_MOV_HIREG(ARM* cpu) } -void T_ADD_PCREL(ARM* cpu) +void T_ADD_PCREL(ARM* cpu) // verify interlock { - u32 val = cpu->R[15] & ~2; + u32 val = cpu->GetReg(15) & ~2; val += ((cpu->CurInstr & 0xFF) << 2); cpu->R[(cpu->CurInstr >> 8) & 0x7] = val; cpu->AddCycles_C(); } -void T_ADD_SPREL(ARM* cpu) +void T_ADD_SPREL(ARM* cpu) // verify interlock { - u32 val = cpu->R[13]; + u32 val = cpu->GetReg(13); val += ((cpu->CurInstr & 0xFF) << 2); cpu->R[(cpu->CurInstr >> 8) & 0x7] = val; cpu->AddCycles_C(); } -void T_ADD_SP(ARM* cpu) +void T_ADD_SP(ARM* cpu) // verify interlock { - u32 val = cpu->R[13]; + u32 val = cpu->GetReg(13); if (cpu->CurInstr & (1<<7)) val -= ((cpu->CurInstr & 0x7F) << 2); else diff --git a/src/ARMInterpreter_Branch.cpp b/src/ARMInterpreter_Branch.cpp index 015f5682..45f0440d 100644 --- a/src/ARMInterpreter_Branch.cpp +++ b/src/ARMInterpreter_Branch.cpp @@ -46,15 +46,15 @@ void A_BLX_IMM(ARM* cpu) cpu->JumpTo(cpu->R[15] + offset + 1); } -void A_BX(ARM* cpu) +void A_BX(ARM* cpu) // verify interlock { - cpu->JumpTo(cpu->R[cpu->CurInstr & 0xF]); + cpu->JumpTo(cpu->GetReg(cpu->CurInstr & 0xF)); } -void A_BLX_REG(ARM* cpu) +void A_BLX_REG(ARM* cpu) // verify interlock { u32 lr = cpu->R[15] - 4; - 
cpu->JumpTo(cpu->R[cpu->CurInstr & 0xF]); + cpu->JumpTo(cpu->GetReg(cpu->CurInstr & 0xF)); cpu->R[14] = lr; } @@ -71,12 +71,12 @@ void T_BCOND(ARM* cpu) cpu->AddCycles_C(); } -void T_BX(ARM* cpu) +void T_BX(ARM* cpu) // verify interlock { - cpu->JumpTo(cpu->R[(cpu->CurInstr >> 3) & 0xF]); + cpu->JumpTo(cpu->GetReg((cpu->CurInstr >> 3) & 0xF)); } -void T_BLX_REG(ARM* cpu) +void T_BLX_REG(ARM* cpu) // verify interlock { if (cpu->Num==1) { @@ -85,7 +85,7 @@ void T_BLX_REG(ARM* cpu) } u32 lr = cpu->R[15] - 1; - cpu->JumpTo(cpu->R[(cpu->CurInstr >> 3) & 0xF]); + cpu->JumpTo(cpu->GetReg((cpu->CurInstr >> 3) & 0xF)); cpu->R[14] = lr; } diff --git a/src/ARMInterpreter_LoadStore.cpp b/src/ARMInterpreter_LoadStore.cpp index a11e912d..c25896ea 100644 --- a/src/ARMInterpreter_LoadStore.cpp +++ b/src/ARMInterpreter_LoadStore.cpp @@ -469,15 +469,17 @@ void A_SWP(ARM* cpu) { cpu->R[rd] = ROR(val, 8*(base&0x3)); - u32 cycles; - if (base & 3) // add an extra interlock cycle when doing a misaligned load from a non-itcm address (checkme: does it matter whether you're executing from there?) + if (cpu->Num == 0) { - if (cpu->Num == 1) cycles = 2; // checkme - else cycles = ((base < ((ARMv5*)cpu)->ITCMSize) && ((cpu->R[15]-8) < ((ARMv5*)cpu)->ITCMSize)) ? 1 : 2; - } - else cycles = 1; + u32 cycles; + if (base & 3) // add an extra interlock cycle when doing a misaligned load from a non-itcm address (checkme: does it matter whether you're executing from there?) + { + cycles = ((base < ((ARMv5*)cpu)->ITCMSize) && ((cpu->R[15]-8) < ((ARMv5*)cpu)->ITCMSize)) ? 1 : 2; + } + else cycles = 1; - cpu->SetCycles_L(rd, cycles, cpu->ILT_Norm); + cpu->SetCycles_L(rd, cycles, cpu->ILT_Norm); + } } else if (cpu->Num == 1) // for some reason these jumps don't work on the arm 9? 
cpu->JumpTo(ROR(val, 8*(base&0x3)) & ~1, cpu->ILT_Norm); @@ -508,11 +510,8 @@ void A_SWPB(ARM* cpu) cpu->R[rd] = val; // add an extra interlock cycle when doing a load from a non-itcm address (checkme: does it matter whether you're executing from there?) - u32 cycles; - if (cpu->Num == 1) cycles = 2; // checkme - else cycles = ((base < ((ARMv5*)cpu)->ITCMSize) && ((cpu->R[15]-8) < ((ARMv5*)cpu)->ITCMSize)) ? 1 : 2; - - cpu->SetCycles_L(rd, cycles, cpu->ILT_Norm); + if (cpu->Num == 0) + cpu->SetCycles_L(rd, ((base < ((ARMv5*)cpu)->ITCMSize) && ((cpu->R[15]-8) < ((ARMv5*)cpu)->ITCMSize)) ? 1 : 2, cpu->ILT_Norm); } else if (cpu->Num == 1)// for some reason these jumps don't work on the arm 9? cpu->JumpTo(val & ~1); @@ -594,11 +593,14 @@ void A_LDM(ARM* cpu) { cpu->AddCycles_CDI(); - u32 lastbase = base; - if (!preinc) lastbase -= 4; - // no interlock occurs when loading from itcm (checkme: does it matter whether you're executing from there?) - if ((((ARMv5*)cpu)->ITCMSize < lastbase) && ((cpu->R[15]-8) > ((ARMv5*)cpu)->ITCMSize) && (cpu->CurInstr & (0x7FFF >> (15 - lastreg)))) - cpu->SetCycles_L(lastreg, 1, cpu->ILT_Norm); + if (cpu->Num == 0) + { + u32 lastbase = base; + if (!preinc) lastbase -= 4; + // no interlock occurs when loading from itcm (checkme: does it matter whether you're executing from there?) + if ((((ARMv5*)cpu)->ITCMSize < lastbase) && ((cpu->R[15]-8) > ((ARMv5*)cpu)->ITCMSize) && (cpu->CurInstr & (0x7FFF >> (15 - lastreg)))) + cpu->SetCycles_L(lastreg, 1, cpu->ILT_Norm); + } } // switch back to previous regs @@ -736,160 +738,170 @@ void A_STM(ARM* cpu) -void T_LDR_PCREL(ARM* cpu) +void T_LDR_PCREL(ARM* cpu) // verify interlock { - u32 addr = (cpu->R[15] & ~0x2) + ((cpu->CurInstr & 0xFF) << 2); + u32 addr = (cpu->GetReg(15) & ~0x2) + ((cpu->CurInstr & 0xFF) << 2); cpu->DataRead32(addr, &cpu->R[(cpu->CurInstr >> 8) & 0x7]); cpu->AddCycles_CDI(); + cpu->SetCycles_L(cpu->R[(cpu->CurInstr >> 8) & 0x7], 1, cpu->ILT_Norm); // checkme? ROR? 
} -void T_STR_REG(ARM* cpu) +void T_STR_REG(ARM* cpu) { - u32 addr = cpu->R[(cpu->CurInstr >> 3) & 0x7] + cpu->R[(cpu->CurInstr >> 6) & 0x7]; - cpu->DataWrite32(addr, cpu->R[cpu->CurInstr & 0x7]); + u32 addr = cpu->GetReg((cpu->CurInstr >> 3) & 0x7) + cpu->GetReg((cpu->CurInstr >> 6) & 0x7); + cpu->DataWrite32(addr, cpu->GetReg(cpu->CurInstr & 0x7, 1)); cpu->AddCycles_CD(); } void T_STRB_REG(ARM* cpu) { - u32 addr = cpu->R[(cpu->CurInstr >> 3) & 0x7] + cpu->R[(cpu->CurInstr >> 6) & 0x7]; - cpu->DataWrite8(addr, cpu->R[cpu->CurInstr & 0x7]); + u32 addr = cpu->GetReg((cpu->CurInstr >> 3) & 0x7) + cpu->GetReg((cpu->CurInstr >> 6) & 0x7); + cpu->DataWrite8(addr, cpu->GetReg(cpu->CurInstr & 0x7, 1)); cpu->AddCycles_CD(); } void T_LDR_REG(ARM* cpu) { - u32 addr = cpu->R[(cpu->CurInstr >> 3) & 0x7] + cpu->R[(cpu->CurInstr >> 6) & 0x7]; + u32 addr = cpu->GetReg((cpu->CurInstr >> 3) & 0x7) + cpu->GetReg((cpu->CurInstr >> 6) & 0x7); u32 val; if (cpu->DataRead32(addr, &val)) cpu->R[cpu->CurInstr & 0x7] = ROR(val, 8*(addr&0x3)); cpu->AddCycles_CDI(); + cpu->SetCycles_L(cpu->R[cpu->CurInstr & 0x7], (addr & 3) ? 
2 : 1, cpu->ILT_Norm); } void T_LDRB_REG(ARM* cpu) { - u32 addr = cpu->R[(cpu->CurInstr >> 3) & 0x7] + cpu->R[(cpu->CurInstr >> 6) & 0x7]; + u32 addr = cpu->GetReg((cpu->CurInstr >> 3) & 0x7) + cpu->GetReg((cpu->CurInstr >> 6) & 0x7); cpu->DataRead8(addr, &cpu->R[cpu->CurInstr & 0x7]); cpu->AddCycles_CDI(); + cpu->SetCycles_L(cpu->R[cpu->CurInstr & 0x7], 2, cpu->ILT_Norm); } void T_STRH_REG(ARM* cpu) { - u32 addr = cpu->R[(cpu->CurInstr >> 3) & 0x7] + cpu->R[(cpu->CurInstr >> 6) & 0x7]; - cpu->DataWrite16(addr, cpu->R[cpu->CurInstr & 0x7]); + u32 addr = cpu->GetReg((cpu->CurInstr >> 3) & 0x7) + cpu->GetReg((cpu->CurInstr >> 6) & 0x7); + cpu->DataWrite16(addr, cpu->GetReg(cpu->CurInstr & 0x7, 1)); cpu->AddCycles_CD(); } void T_LDRSB_REG(ARM* cpu) { - u32 addr = cpu->R[(cpu->CurInstr >> 3) & 0x7] + cpu->R[(cpu->CurInstr >> 6) & 0x7]; + u32 addr = cpu->GetReg((cpu->CurInstr >> 3) & 0x7) + cpu->GetReg((cpu->CurInstr >> 6) & 0x7); if (cpu->DataRead8(addr, &cpu->R[cpu->CurInstr & 0x7])) cpu->R[cpu->CurInstr & 0x7] = (s32)(s8)cpu->R[cpu->CurInstr & 0x7]; cpu->AddCycles_CDI(); + cpu->SetCycles_L(cpu->R[cpu->CurInstr & 0x7], 2, cpu->ILT_Norm); } void T_LDRH_REG(ARM* cpu) { - u32 addr = cpu->R[(cpu->CurInstr >> 3) & 0x7] + cpu->R[(cpu->CurInstr >> 6) & 0x7]; + u32 addr = cpu->GetReg((cpu->CurInstr >> 3) & 0x7) + cpu->GetReg((cpu->CurInstr >> 6) & 0x7); cpu->DataRead16(addr, &cpu->R[cpu->CurInstr & 0x7]); cpu->AddCycles_CDI(); + cpu->SetCycles_L(cpu->R[cpu->CurInstr & 0x7], 2, cpu->ILT_Norm); } void T_LDRSH_REG(ARM* cpu) { - u32 addr = cpu->R[(cpu->CurInstr >> 3) & 0x7] + cpu->R[(cpu->CurInstr >> 6) & 0x7]; + u32 addr = cpu->GetReg((cpu->CurInstr >> 3) & 0x7) + cpu->GetReg((cpu->CurInstr >> 6) & 0x7); if (cpu->DataRead16(addr, &cpu->R[cpu->CurInstr & 0x7])) cpu->R[cpu->CurInstr & 0x7] = (s32)(s16)cpu->R[cpu->CurInstr & 0x7]; cpu->AddCycles_CDI(); + cpu->SetCycles_L(cpu->R[cpu->CurInstr & 0x7], 2, cpu->ILT_Norm); } -void T_STR_IMM(ARM* cpu) +void T_STR_IMM(ARM* cpu) // verify 
interlock { u32 offset = (cpu->CurInstr >> 4) & 0x7C; - offset += cpu->R[(cpu->CurInstr >> 3) & 0x7]; + offset += cpu->GetReg((cpu->CurInstr >> 3) & 0x7); - cpu->DataWrite32(offset, cpu->R[cpu->CurInstr & 0x7]); + cpu->DataWrite32(offset, cpu->GetReg(cpu->CurInstr & 0x7, 1)); cpu->AddCycles_CD(); } -void T_LDR_IMM(ARM* cpu) +void T_LDR_IMM(ARM* cpu) // verify interlock { u32 offset = (cpu->CurInstr >> 4) & 0x7C; - offset += cpu->R[(cpu->CurInstr >> 3) & 0x7]; + offset += cpu->GetReg((cpu->CurInstr >> 3) & 0x7); u32 val; if (cpu->DataRead32(offset, &val)) cpu->R[cpu->CurInstr & 0x7] = ROR(val, 8*(offset&0x3)); cpu->AddCycles_CDI(); + cpu->SetCycles_L(cpu->R[cpu->CurInstr & 0x7], (offset & 3) ? 2 : 1, cpu->ILT_Norm); } -void T_STRB_IMM(ARM* cpu) +void T_STRB_IMM(ARM* cpu) // verify interlock { u32 offset = (cpu->CurInstr >> 6) & 0x1F; - offset += cpu->R[(cpu->CurInstr >> 3) & 0x7]; + offset += cpu->GetReg((cpu->CurInstr >> 3) & 0x7); - cpu->DataWrite8(offset, cpu->R[cpu->CurInstr & 0x7]); + cpu->DataWrite8(offset, cpu->GetReg(cpu->CurInstr & 0x7, 1)); cpu->AddCycles_CD(); } -void T_LDRB_IMM(ARM* cpu) +void T_LDRB_IMM(ARM* cpu) // verify interlock { u32 offset = (cpu->CurInstr >> 6) & 0x1F; - offset += cpu->R[(cpu->CurInstr >> 3) & 0x7]; + offset += cpu->GetReg((cpu->CurInstr >> 3) & 0x7); cpu->DataRead8(offset, &cpu->R[cpu->CurInstr & 0x7]); cpu->AddCycles_CDI(); + cpu->SetCycles_L(cpu->R[cpu->CurInstr & 0x7], 2, cpu->ILT_Norm); } -void T_STRH_IMM(ARM* cpu) +void T_STRH_IMM(ARM* cpu) // verify interlock { u32 offset = (cpu->CurInstr >> 5) & 0x3E; - offset += cpu->R[(cpu->CurInstr >> 3) & 0x7]; + offset += cpu->GetReg((cpu->CurInstr >> 3) & 0x7); - cpu->DataWrite16(offset, cpu->R[cpu->CurInstr & 0x7]); + cpu->DataWrite16(offset, cpu->GetReg(cpu->CurInstr & 0x7, 1)); cpu->AddCycles_CD(); } -void T_LDRH_IMM(ARM* cpu) +void T_LDRH_IMM(ARM* cpu) // verify interlock { u32 offset = (cpu->CurInstr >> 5) & 0x3E; - offset += cpu->R[(cpu->CurInstr >> 3) & 0x7]; + offset += 
cpu->GetReg((cpu->CurInstr >> 3) & 0x7); cpu->DataRead16(offset, &cpu->R[cpu->CurInstr & 0x7]); cpu->AddCycles_CDI(); + cpu->SetCycles_L(cpu->R[cpu->CurInstr & 0x7], 2, cpu->ILT_Norm); } -void T_STR_SPREL(ARM* cpu) +void T_STR_SPREL(ARM* cpu) // verify interlock { u32 offset = (cpu->CurInstr << 2) & 0x3FC; - offset += cpu->R[13]; + offset += cpu->GetReg(13); - cpu->DataWrite32(offset, cpu->R[(cpu->CurInstr >> 8) & 0x7]); + cpu->DataWrite32(offset, cpu->GetReg((cpu->CurInstr >> 8) & 0x7, 1)); cpu->AddCycles_CD(); } -void T_LDR_SPREL(ARM* cpu) +void T_LDR_SPREL(ARM* cpu) // verify interlock { u32 offset = (cpu->CurInstr << 2) & 0x3FC; - offset += cpu->R[13]; + offset += cpu->GetReg(13); cpu->DataRead32(offset, &cpu->R[(cpu->CurInstr >> 8) & 0x7]); cpu->AddCycles_CDI(); + cpu->SetCycles_L(cpu->R[(cpu->CurInstr >> 8) & 0x7], 1, cpu->ILT_Norm); // checkme? ROR? } @@ -907,7 +919,7 @@ void T_PUSH(ARM* cpu) if (cpu->CurInstr & (1<<8)) nregs++; - u32 base = cpu->R[13]; + u32 base = cpu->GetReg(13); base -= (nregs<<2); u32 wbbase = base; @@ -915,8 +927,8 @@ void T_PUSH(ARM* cpu) { if (cpu->CurInstr & (1<DataWrite32 (base, cpu->R[i]) - : cpu->DataWrite32S(base, cpu->R[i]))) + if (!(first ? cpu->DataWrite32 (base, cpu->GetReg(i, 1)) + : cpu->DataWrite32S(base, cpu->GetReg(i, 1)))) // verify interlock { goto dataabort; } @@ -940,10 +952,11 @@ void T_PUSH(ARM* cpu) cpu->AddCycles_CD(); } -void T_POP(ARM* cpu) +void T_POP(ARM* cpu) // verify interlock { - u32 base = cpu->R[13]; + u32 base = cpu->GetReg(13); bool first = true; + u32 lastreg = 0; for (int i = 0; i < 8; i++) { @@ -974,21 +987,30 @@ void T_POP(ARM* cpu) cpu->R[13] = base; + if (cpu->Num == 0) + { + u32 lastbase = base - 4; + // no interlock occurs when loading from itcm (checkme: does it matter whether you're executing from there?) 
+ if ((((ARMv5*)cpu)->ITCMSize < lastbase) && ((cpu->R[15]-8) > ((ARMv5*)cpu)->ITCMSize) && (cpu->CurInstr & (0x7FFF >> (15 - lastreg)))) + cpu->SetCycles_L(lastreg, 1, cpu->ILT_Norm); + } + return; + dataabort: cpu->AddCycles_CDI(); } void T_STMIA(ARM* cpu) { - u32 base = cpu->R[(cpu->CurInstr >> 8) & 0x7]; + u32 base = cpu->GetReg((cpu->CurInstr >> 8) & 0x7); bool first = true; for (int i = 0; i < 8; i++) { if (cpu->CurInstr & (1<DataWrite32 (base, cpu->R[i]) - : cpu->DataWrite32S(base, cpu->R[i]))) + if (!(first ? cpu->DataWrite32 (base, cpu->GetReg(i, 1)) + : cpu->DataWrite32S(base, cpu->GetReg(i, 1)))) { goto dataabort; } @@ -1005,8 +1027,9 @@ void T_STMIA(ARM* cpu) void T_LDMIA(ARM* cpu) { - u32 base = cpu->R[(cpu->CurInstr >> 8) & 0x7]; + u32 base = cpu->GetReg((cpu->CurInstr >> 8) & 0x7); bool first = true; + u32 lastreg = 0; for (int i = 0; i < 8; i++) { @@ -1019,11 +1042,23 @@ void T_LDMIA(ARM* cpu) } first = false; base += 4; + lastreg = i; } } if (!(cpu->CurInstr & (1<<((cpu->CurInstr >> 8) & 0x7)))) cpu->R[(cpu->CurInstr >> 8) & 0x7] = base; + + + cpu->AddCycles_CDI(); + if (cpu->Num == 0) + { + u32 lastbase = base - 4; + // no interlock occurs when loading from itcm (checkme: does it matter whether you're executing from there?) 
+ if ((((ARMv5*)cpu)->ITCMSize < lastbase) && ((cpu->R[15]-8) > ((ARMv5*)cpu)->ITCMSize) && (cpu->CurInstr & (0x7FFF >> (15 - lastreg)))) + cpu->SetCycles_L(lastreg, 1, cpu->ILT_Norm); + } + return; dataabort: cpu->AddCycles_CDI(); From f00f1f6ca482758fdbd53d93e8c80c04bf6caa93 Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Sun, 16 Jun 2024 20:50:42 -0400 Subject: [PATCH 076/306] im smart --- src/ARMInterpreter_LoadStore.cpp | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/src/ARMInterpreter_LoadStore.cpp b/src/ARMInterpreter_LoadStore.cpp index c25896ea..52a80983 100644 --- a/src/ARMInterpreter_LoadStore.cpp +++ b/src/ARMInterpreter_LoadStore.cpp @@ -744,7 +744,7 @@ void T_LDR_PCREL(ARM* cpu) // verify interlock cpu->DataRead32(addr, &cpu->R[(cpu->CurInstr >> 8) & 0x7]); cpu->AddCycles_CDI(); - cpu->SetCycles_L(cpu->R[(cpu->CurInstr >> 8) & 0x7], 1, cpu->ILT_Norm); // checkme? ROR? + cpu->SetCycles_L((cpu->CurInstr >> 8) & 0x7, 1, cpu->ILT_Norm); // checkme? ROR? } @@ -773,7 +773,7 @@ void T_LDR_REG(ARM* cpu) cpu->R[cpu->CurInstr & 0x7] = ROR(val, 8*(addr&0x3)); cpu->AddCycles_CDI(); - cpu->SetCycles_L(cpu->R[cpu->CurInstr & 0x7], (addr & 3) ? 2 : 1, cpu->ILT_Norm); + cpu->SetCycles_L(cpu->CurInstr & 0x7, (addr & 3) ? 
2 : 1, cpu->ILT_Norm); } void T_LDRB_REG(ARM* cpu) @@ -782,7 +782,7 @@ void T_LDRB_REG(ARM* cpu) cpu->DataRead8(addr, &cpu->R[cpu->CurInstr & 0x7]); cpu->AddCycles_CDI(); - cpu->SetCycles_L(cpu->R[cpu->CurInstr & 0x7], 2, cpu->ILT_Norm); + cpu->SetCycles_L(cpu->CurInstr & 0x7, 2, cpu->ILT_Norm); } @@ -801,7 +801,7 @@ void T_LDRSB_REG(ARM* cpu) cpu->R[cpu->CurInstr & 0x7] = (s32)(s8)cpu->R[cpu->CurInstr & 0x7]; cpu->AddCycles_CDI(); - cpu->SetCycles_L(cpu->R[cpu->CurInstr & 0x7], 2, cpu->ILT_Norm); + cpu->SetCycles_L(cpu->CurInstr & 0x7, 2, cpu->ILT_Norm); } void T_LDRH_REG(ARM* cpu) @@ -810,7 +810,7 @@ void T_LDRH_REG(ARM* cpu) cpu->DataRead16(addr, &cpu->R[cpu->CurInstr & 0x7]); cpu->AddCycles_CDI(); - cpu->SetCycles_L(cpu->R[cpu->CurInstr & 0x7], 2, cpu->ILT_Norm); + cpu->SetCycles_L(cpu->CurInstr & 0x7, 2, cpu->ILT_Norm); } void T_LDRSH_REG(ARM* cpu) @@ -820,7 +820,7 @@ void T_LDRSH_REG(ARM* cpu) cpu->R[cpu->CurInstr & 0x7] = (s32)(s16)cpu->R[cpu->CurInstr & 0x7]; cpu->AddCycles_CDI(); - cpu->SetCycles_L(cpu->R[cpu->CurInstr & 0x7], 2, cpu->ILT_Norm); + cpu->SetCycles_L(cpu->CurInstr & 0x7, 2, cpu->ILT_Norm); } @@ -842,7 +842,7 @@ void T_LDR_IMM(ARM* cpu) // verify interlock if (cpu->DataRead32(offset, &val)) cpu->R[cpu->CurInstr & 0x7] = ROR(val, 8*(offset&0x3)); cpu->AddCycles_CDI(); - cpu->SetCycles_L(cpu->R[cpu->CurInstr & 0x7], (offset & 3) ? 2 : 1, cpu->ILT_Norm); + cpu->SetCycles_L(cpu->CurInstr & 0x7, (offset & 3) ? 
2 : 1, cpu->ILT_Norm); } void T_STRB_IMM(ARM* cpu) // verify interlock @@ -861,7 +861,7 @@ void T_LDRB_IMM(ARM* cpu) // verify interlock cpu->DataRead8(offset, &cpu->R[cpu->CurInstr & 0x7]); cpu->AddCycles_CDI(); - cpu->SetCycles_L(cpu->R[cpu->CurInstr & 0x7], 2, cpu->ILT_Norm); + cpu->SetCycles_L(cpu->CurInstr & 0x7, 2, cpu->ILT_Norm); } @@ -881,7 +881,7 @@ void T_LDRH_IMM(ARM* cpu) // verify interlock cpu->DataRead16(offset, &cpu->R[cpu->CurInstr & 0x7]); cpu->AddCycles_CDI(); - cpu->SetCycles_L(cpu->R[cpu->CurInstr & 0x7], 2, cpu->ILT_Norm); + cpu->SetCycles_L(cpu->CurInstr & 0x7, 2, cpu->ILT_Norm); } @@ -901,7 +901,7 @@ void T_LDR_SPREL(ARM* cpu) // verify interlock cpu->DataRead32(offset, &cpu->R[(cpu->CurInstr >> 8) & 0x7]); cpu->AddCycles_CDI(); - cpu->SetCycles_L(cpu->R[(cpu->CurInstr >> 8) & 0x7], 1, cpu->ILT_Norm); // checkme? ROR? + cpu->SetCycles_L((cpu->CurInstr >> 8) & 0x7, 1, cpu->ILT_Norm); // checkme? ROR? } From a9e2c7e047eb62f56d121bf12b82703bb7da07d9 Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Sun, 16 Jun 2024 23:24:20 -0400 Subject: [PATCH 077/306] implement two regs i missed --- src/ARMInterpreter_ALU.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/ARMInterpreter_ALU.cpp b/src/ARMInterpreter_ALU.cpp index 17afa833..92c027f3 100644 --- a/src/ARMInterpreter_ALU.cpp +++ b/src/ARMInterpreter_ALU.cpp @@ -858,7 +858,7 @@ void A_UMLAL(ARM* cpu) u64 res = (u64)rm * (u64)rs; - u64 rd = (u64)cpu->R[(cpu->CurInstr >> 12) & 0xF] | ((u64)cpu->R[(cpu->CurInstr >> 16) & 0xF] << 32ULL); // CHECKME: INTERLOCK? 
+ u64 rd = (u64)cpu->GetReg((cpu->CurInstr >> 12) & 0xF, 1) | ((u64)cpu->GetReg((cpu->CurInstr >> 16) & 0xF) << 32ULL); res += rd; cpu->R[(cpu->CurInstr >> 12) & 0xF] = (u32)res; @@ -923,7 +923,7 @@ void A_SMLAL(ARM* cpu) s64 res = (s64)(s32)rm * (s64)(s32)rs; - s64 rd = (s64)((u64)cpu->R[(cpu->CurInstr >> 12) & 0xF] | ((u64)cpu->R[(cpu->CurInstr >> 16) & 0xF] << 32ULL)); // CHECKME: INTERLOCK? + s64 rd = (s64)((u64)cpu->GetReg((cpu->CurInstr >> 12) & 0xF, 1) | ((u64)cpu->GetReg((cpu->CurInstr >> 16) & 0xF) << 32ULL)); res += rd; cpu->R[(cpu->CurInstr >> 12) & 0xF] = (u32)res; From c5258d6377f72053be136254a92b19c96065167e Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Mon, 17 Jun 2024 18:07:53 -0400 Subject: [PATCH 078/306] verify interlocks for alu and load/store remove some checks for interlock that im pretty sure can't trigger --- src/ARMInterpreter_ALU.cpp | 38 ++++++++++++++++---------------- src/ARMInterpreter_LoadStore.cpp | 32 +++++++++++++-------------- 2 files changed, 35 insertions(+), 35 deletions(-) diff --git a/src/ARMInterpreter_ALU.cpp b/src/ARMInterpreter_ALU.cpp index 92c027f3..be0498e1 100644 --- a/src/ARMInterpreter_ALU.cpp +++ b/src/ARMInterpreter_ALU.cpp @@ -1180,7 +1180,7 @@ void A_QDSUB(ARM* cpu) -void T_LSL_IMM(ARM* cpu) // verify interlock +void T_LSL_IMM(ARM* cpu) { u32 op = cpu->GetReg((cpu->CurInstr >> 3) & 0x7); u32 s = (cpu->CurInstr >> 6) & 0x1F; @@ -1191,7 +1191,7 @@ void T_LSL_IMM(ARM* cpu) // verify interlock cpu->AddCycles_C(); } -void T_LSR_IMM(ARM* cpu) // verify interlock +void T_LSR_IMM(ARM* cpu) { u32 op = cpu->GetReg((cpu->CurInstr >> 3) & 0x7); u32 s = (cpu->CurInstr >> 6) & 0x1F; @@ -1202,7 +1202,7 @@ void T_LSR_IMM(ARM* cpu) // verify interlock cpu->AddCycles_C(); } -void T_ASR_IMM(ARM* cpu) // verify interlock +void T_ASR_IMM(ARM* cpu) { u32 op = cpu->GetReg((cpu->CurInstr >> 3) & 0x7); u32 s = (cpu->CurInstr >> 6) & 0x1F; @@ -1239,7 +1239,7 @@ void T_SUB_REG_(ARM* cpu) 
cpu->AddCycles_C(); } -void T_ADD_IMM_(ARM* cpu) // verify interlock +void T_ADD_IMM_(ARM* cpu) { u32 a = cpu->GetReg((cpu->CurInstr >> 3) & 0x7); u32 b = (cpu->CurInstr >> 6) & 0x7; @@ -1252,7 +1252,7 @@ void T_ADD_IMM_(ARM* cpu) // verify interlock cpu->AddCycles_C(); } -void T_SUB_IMM_(ARM* cpu) // verify interlock +void T_SUB_IMM_(ARM* cpu) { u32 a = cpu->GetReg((cpu->CurInstr >> 3) & 0x7); u32 b = (cpu->CurInstr >> 6) & 0x7; @@ -1265,7 +1265,7 @@ void T_SUB_IMM_(ARM* cpu) // verify interlock cpu->AddCycles_C(); } -void T_MOV_IMM(ARM* cpu) // verify interlock +void T_MOV_IMM(ARM* cpu) { u32 b = cpu->CurInstr & 0xFF; cpu->R[(cpu->CurInstr >> 8) & 0x7] = b; @@ -1274,9 +1274,9 @@ void T_MOV_IMM(ARM* cpu) // verify interlock cpu->AddCycles_C(); } -void T_CMP_IMM(ARM* cpu) // verify interlock +void T_CMP_IMM(ARM* cpu) { - u32 a = cpu->R[(cpu->CurInstr >> 8) & 0x7]; + u32 a = cpu->GetReg((cpu->CurInstr >> 8) & 0x7); u32 b = cpu->CurInstr & 0xFF; u32 res = a - b; cpu->SetNZCV(res & 0x80000000, @@ -1286,7 +1286,7 @@ void T_CMP_IMM(ARM* cpu) // verify interlock cpu->AddCycles_C(); } -void T_ADD_IMM(ARM* cpu) // verify interlock +void T_ADD_IMM(ARM* cpu) { u32 a = cpu->GetReg((cpu->CurInstr >> 8) & 0x7); u32 b = cpu->CurInstr & 0xFF; @@ -1299,7 +1299,7 @@ void T_ADD_IMM(ARM* cpu) // verify interlock cpu->AddCycles_C(); } -void T_SUB_IMM(ARM* cpu) // verify interlock +void T_SUB_IMM(ARM* cpu) { u32 a = cpu->GetReg((cpu->CurInstr >> 8) & 0x7); u32 b = cpu->CurInstr & 0xFF; @@ -1516,7 +1516,7 @@ void T_MVN_REG(ARM* cpu) // TODO: check those when MSBs and MSBd are cleared // GBAtek says it's not allowed, but it works atleast on the ARM9 -void T_ADD_HIREG(ARM* cpu) // verify interlock +void T_ADD_HIREG(ARM* cpu) { u32 rd = (cpu->CurInstr & 0x7) | ((cpu->CurInstr >> 4) & 0x8); u32 rs = (cpu->CurInstr >> 3) & 0xF; @@ -1536,7 +1536,7 @@ void T_ADD_HIREG(ARM* cpu) // verify interlock } } -void T_CMP_HIREG(ARM* cpu) // verify interlock +void T_CMP_HIREG(ARM* cpu) { u32 rd = 
(cpu->CurInstr & 0x7) | ((cpu->CurInstr >> 4) & 0x8); u32 rs = (cpu->CurInstr >> 3) & 0xF; @@ -1552,7 +1552,7 @@ void T_CMP_HIREG(ARM* cpu) // verify interlock cpu->AddCycles_C(); } -void T_MOV_HIREG(ARM* cpu) // verify interlock +void T_MOV_HIREG(ARM* cpu) { u32 rd = (cpu->CurInstr & 0x7) | ((cpu->CurInstr >> 4) & 0x8); u32 rs = (cpu->CurInstr >> 3) & 0xF; @@ -1582,25 +1582,25 @@ void T_MOV_HIREG(ARM* cpu) // verify interlock } -void T_ADD_PCREL(ARM* cpu) // verify interlock +void T_ADD_PCREL(ARM* cpu) // checkme: pc shouldn't be able to interlock? { - u32 val = cpu->GetReg(15) & ~2; + u32 val = cpu->R[15] & ~2; val += ((cpu->CurInstr & 0xFF) << 2); cpu->R[(cpu->CurInstr >> 8) & 0x7] = val; cpu->AddCycles_C(); } -void T_ADD_SPREL(ARM* cpu) // verify interlock +void T_ADD_SPREL(ARM* cpu) // checkme: sp shouldn't be able to interlock in thumb? { - u32 val = cpu->GetReg(13); + u32 val = cpu->R[13]; val += ((cpu->CurInstr & 0xFF) << 2); cpu->R[(cpu->CurInstr >> 8) & 0x7] = val; cpu->AddCycles_C(); } -void T_ADD_SP(ARM* cpu) // verify interlock +void T_ADD_SP(ARM* cpu) // checkme: sp shouldn't be able to interlock in thumb? { - u32 val = cpu->GetReg(13); + u32 val = cpu->R[13]; if (cpu->CurInstr & (1<<7)) val -= ((cpu->CurInstr & 0x7F) << 2); else diff --git a/src/ARMInterpreter_LoadStore.cpp b/src/ARMInterpreter_LoadStore.cpp index 52a80983..3fac1963 100644 --- a/src/ARMInterpreter_LoadStore.cpp +++ b/src/ARMInterpreter_LoadStore.cpp @@ -738,13 +738,13 @@ void A_STM(ARM* cpu) -void T_LDR_PCREL(ARM* cpu) // verify interlock +void T_LDR_PCREL(ARM* cpu) // checkme: can pc be interlocked? { - u32 addr = (cpu->GetReg(15) & ~0x2) + ((cpu->CurInstr & 0xFF) << 2); + u32 addr = (cpu->R[15] & ~0x2) + ((cpu->CurInstr & 0xFF) << 2); cpu->DataRead32(addr, &cpu->R[(cpu->CurInstr >> 8) & 0x7]); cpu->AddCycles_CDI(); - cpu->SetCycles_L((cpu->CurInstr >> 8) & 0x7, 1, cpu->ILT_Norm); // checkme? ROR? 
+ cpu->SetCycles_L((cpu->CurInstr >> 8) & 0x7, 1, cpu->ILT_Norm); // checkme: verify cycle count } @@ -824,7 +824,7 @@ void T_LDRSH_REG(ARM* cpu) } -void T_STR_IMM(ARM* cpu) // verify interlock +void T_STR_IMM(ARM* cpu) { u32 offset = (cpu->CurInstr >> 4) & 0x7C; offset += cpu->GetReg((cpu->CurInstr >> 3) & 0x7); @@ -833,7 +833,7 @@ void T_STR_IMM(ARM* cpu) // verify interlock cpu->AddCycles_CD(); } -void T_LDR_IMM(ARM* cpu) // verify interlock +void T_LDR_IMM(ARM* cpu) { u32 offset = (cpu->CurInstr >> 4) & 0x7C; offset += cpu->GetReg((cpu->CurInstr >> 3) & 0x7); @@ -845,7 +845,7 @@ void T_LDR_IMM(ARM* cpu) // verify interlock cpu->SetCycles_L(cpu->CurInstr & 0x7, (offset & 3) ? 2 : 1, cpu->ILT_Norm); } -void T_STRB_IMM(ARM* cpu) // verify interlock +void T_STRB_IMM(ARM* cpu) { u32 offset = (cpu->CurInstr >> 6) & 0x1F; offset += cpu->GetReg((cpu->CurInstr >> 3) & 0x7); @@ -854,7 +854,7 @@ void T_STRB_IMM(ARM* cpu) // verify interlock cpu->AddCycles_CD(); } -void T_LDRB_IMM(ARM* cpu) // verify interlock +void T_LDRB_IMM(ARM* cpu) { u32 offset = (cpu->CurInstr >> 6) & 0x1F; offset += cpu->GetReg((cpu->CurInstr >> 3) & 0x7); @@ -865,7 +865,7 @@ void T_LDRB_IMM(ARM* cpu) // verify interlock } -void T_STRH_IMM(ARM* cpu) // verify interlock +void T_STRH_IMM(ARM* cpu) { u32 offset = (cpu->CurInstr >> 5) & 0x3E; offset += cpu->GetReg((cpu->CurInstr >> 3) & 0x7); @@ -874,7 +874,7 @@ void T_STRH_IMM(ARM* cpu) // verify interlock cpu->AddCycles_CD(); } -void T_LDRH_IMM(ARM* cpu) // verify interlock +void T_LDRH_IMM(ARM* cpu) { u32 offset = (cpu->CurInstr >> 5) & 0x3E; offset += cpu->GetReg((cpu->CurInstr >> 3) & 0x7); @@ -885,23 +885,23 @@ void T_LDRH_IMM(ARM* cpu) // verify interlock } -void T_STR_SPREL(ARM* cpu) // verify interlock +void T_STR_SPREL(ARM* cpu) // checkme: can sp be interlocked in thumb mode? 
{ u32 offset = (cpu->CurInstr << 2) & 0x3FC; - offset += cpu->GetReg(13); + offset += cpu->R[13]; cpu->DataWrite32(offset, cpu->GetReg((cpu->CurInstr >> 8) & 0x7, 1)); cpu->AddCycles_CD(); } -void T_LDR_SPREL(ARM* cpu) // verify interlock +void T_LDR_SPREL(ARM* cpu) // checkme: can sp be interlocked in thumb mode? { u32 offset = (cpu->CurInstr << 2) & 0x3FC; - offset += cpu->GetReg(13); + offset += cpu->R[13]; cpu->DataRead32(offset, &cpu->R[(cpu->CurInstr >> 8) & 0x7]); cpu->AddCycles_CDI(); - cpu->SetCycles_L((cpu->CurInstr >> 8) & 0x7, 1, cpu->ILT_Norm); // checkme? ROR? + cpu->SetCycles_L((cpu->CurInstr >> 8) & 0x7, 1, cpu->ILT_Norm); // checkme: verify cycle count } @@ -952,9 +952,9 @@ void T_PUSH(ARM* cpu) cpu->AddCycles_CD(); } -void T_POP(ARM* cpu) // verify interlock +void T_POP(ARM* cpu) // checkme: can sp be interlocked in thumb mode? { - u32 base = cpu->GetReg(13); + u32 base = cpu->R[13]; bool first = true; u32 lastreg = 0; From e6ba4075b9e4d4598a8b566667c6c2ebcc984d33 Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Tue, 18 Jun 2024 11:12:05 -0400 Subject: [PATCH 079/306] correct interlocked reg for umlal --- src/ARMInterpreter_ALU.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ARMInterpreter_ALU.cpp b/src/ARMInterpreter_ALU.cpp index be0498e1..44ee84aa 100644 --- a/src/ARMInterpreter_ALU.cpp +++ b/src/ARMInterpreter_ALU.cpp @@ -882,7 +882,7 @@ void A_UMLAL(ARM* cpu) !res); if (cpu->Num==1) cpu->SetC(0); } - else cpu->SetCycles_L((cpu->CurInstr >> 12) & 0xF, 1, cpu->ILT_Mul); // interlock cycles do not occur with S variants of multiply instructions + else cpu->SetCycles_L((cpu->CurInstr >> 16) & 0xF, 1, cpu->ILT_Mul); // interlock cycles do not occur with S variants of multiply instructions } void A_SMULL(ARM* cpu) From f1b71fe5a9886a2840d747ebf9494c7bf27e2324 Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Mon, 24 Jun 2024 16:15:04 -0400 Subject: 
[PATCH 080/306] implement configurable vram bus width not implemented for direct boot --- src/DSi.cpp | 18 ++++++++++++++++++ src/DSi.h | 1 + 2 files changed, 19 insertions(+) diff --git a/src/DSi.cpp b/src/DSi.cpp index 306c5d1c..5b98957c 100644 --- a/src/DSi.cpp +++ b/src/DSi.cpp @@ -142,6 +142,7 @@ void DSi::Reset() SCFG_Clock9 = 0x0187; // CHECKME SCFG_Clock7 = 0x0187; SCFG_EXT[0] = 0x8307F100; + SetVRAMTimings(true); SCFG_EXT[1] = 0x93FFFB06; SCFG_MC = 0x0010 | (~((u32)(NDSCartSlot.GetCart() != nullptr))&1);//0x0011; SCFG_RST = 0; @@ -215,6 +216,7 @@ void DSi::DoSavestateExtra(Savestate* file) Set_SCFG_Clock9(SCFG_Clock9); Set_SCFG_MC(SCFG_MC); DSP.SetRstLine(SCFG_RST & 0x0001); + SetVRAMTimings(SCFG_EXT[0] & (1<<13)); MBK[0][8] = 0; MBK[1][8] = 0; @@ -693,6 +695,7 @@ void DSi::SoftReset() SCFG_Clock9 = 0x0187; // CHECKME SCFG_Clock7 = 0x0187; SCFG_EXT[0] = 0x8307F100; + SetVRAMTimings(true); SCFG_EXT[1] = 0x93FFFB06; SCFG_MC = 0x0010;//0x0011; // TODO: is this actually reset? @@ -1283,6 +1286,14 @@ void DSi::Set_SCFG_MC(u32 val) } } +void DSi::SetVRAMTimings(bool extrabuswidth) +{ + if (extrabuswidth) + SetARM9RegionTimings(0x06000, 0x07000, Mem9_VRAM, 32, 1, 1); // dsi vram + else + SetARM9RegionTimings(0x06000, 0x07000, Mem9_VRAM, 16, 1, 1); // ds vram +} + u8 DSi::ARM9Read8(u32 addr) { @@ -2521,11 +2532,18 @@ void DSi::ARM9IOWrite32(u32 addr, u32 val) u32 oldram = (SCFG_EXT[0] >> 14) & 0x3; u32 newram = (val >> 14) & 0x3; + u32 oldvram = (SCFG_EXT[0] & (1<<13)); + u32 newvram = (val & (1<<13)); + SCFG_EXT[0] &= ~0x8007F19F; SCFG_EXT[0] |= (val & 0x8007F19F); SCFG_EXT[1] &= ~0x0000F080; SCFG_EXT[1] |= (val & 0x0000F080); Log(LogLevel::Debug, "SCFG_EXT = %08X / %08X (val9 %08X)\n", SCFG_EXT[0], SCFG_EXT[1], val); + + if (oldvram != newvram) + SetVRAMTimings(newvram); + /*switch ((SCFG_EXT[0] >> 14) & 0x3) { case 0: diff --git a/src/DSi.h b/src/DSi.h index 1d010e0f..755e1f50 100644 --- a/src/DSi.h +++ b/src/DSi.h @@ -96,6 +96,7 @@ public: void MapNWRAM_B(u32 
num, u8 val); void MapNWRAM_C(u32 num, u8 val); void MapNWRAMRange(u32 cpu, u32 num, u32 val); + void SetVRAMTimings(bool extrabuswidth); u8 ARM9Read8(u32 addr) override; u16 ARM9Read16(u32 addr) override; From 3583d8222fbeaaa06ebe02f2bf430ccc402d0a32 Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Mon, 24 Jun 2024 16:17:04 -0400 Subject: [PATCH 081/306] disable interlock emulation, needs more research --- src/ARM.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ARM.h b/src/ARM.h index e5d82ddf..3ef0d439 100644 --- a/src/ARM.h +++ b/src/ARM.h @@ -30,7 +30,7 @@ #include "debug/GdbStub.h" #endif -#define INTERLOCK +//#define INTERLOCK namespace melonDS { From 109bbed3d0959b07c03c0bde36118f685497cb6f Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Mon, 24 Jun 2024 19:44:38 -0400 Subject: [PATCH 082/306] improve ldm timings I believe this also applies to other loads as well, but currently untested. --- src/ARM.cpp | 55 ++++++++++++++++++++++++++++++++++++++++++++++++++++ src/ARM.h | 13 +------------ src/CP15.cpp | 19 ++++++++++-------- 3 files changed, 67 insertions(+), 20 deletions(-) diff --git a/src/ARM.cpp b/src/ARM.cpp index 899fe661..cb72dad5 100644 --- a/src/ARM.cpp +++ b/src/ARM.cpp @@ -302,6 +302,10 @@ void ARMv5::JumpTo(u32 addr, bool restorecpsr) u32 oldregion = R[15] >> 24; u32 newregion = addr >> 24; + + if (addr < ITCMSize) CodeRegion = Mem9_ITCM; + else if ((addr & DTCMMask) == DTCMBase) CodeRegion = Mem9_DTCM; + else CodeRegion = NDS.ARM9Regions[addr >> 14]; RegionCodeCycles = MemTimings[addr >> 12][0]; @@ -1255,6 +1259,57 @@ bool ARMv4::DataWrite32S(u32 addr, u32 val, bool dataabort) } +void ARMv5::AddCycles_CDI() +{ + // LDR/LDM cycles. ARM9 seems to skip the internal cycle there. + s32 numC = (R[15] & 0x2) ? 
0 : CodeCycles; + s32 numD = DataCycles; + + // if a 32 bit bus, start 2 cycles early; else, start 4 cycles early + s32 early; + switch (DataRegion) + { + case 0: // background region; CHECKME + case Mem9_DTCM: + case Mem9_BIOS: + case Mem9_WRAM: + case Mem9_IO: + case Mem9_Pal: // CHECKME + default: + early = 2; + break; + + case Mem9_OAM: // CHECKME + case Mem9_GBAROM: + case Mem9_GBARAM: + early = 4; + break; + + case Mem9_MainRAM: + early = (CodeRegion == Mem9_MainRAM) ? 0 : 4; + break; + + case Mem9_VRAM: // the dsi can toggle the bus width of vram between 32 and 16 bit + early = (NDS.ConsoleType == 0 || !(((DSi&)NDS).SCFG_EXT[0] & (1<<13))) ? 4 : 2; + break; + + case Mem9_ITCM: // itcm data fetches cannot be done at the same time as a code fetch, it'll even incurr a 1 cycle penalty when executing from itcm + early = (CodeRegion == Mem9_ITCM) ? -1 : 0; + break; + } + + if (numD > early) + { + numC -= early; + if (numC < 0) numC = 0; + Cycles += numC + numD; + } + else + { + Cycles += numC; + } +} + void ARMv4::AddCycles_C() { // code only. this code fetch is sequential. diff --git a/src/ARM.h b/src/ARM.h index 3ef0d439..25a96ef2 100644 --- a/src/ARM.h +++ b/src/ARM.h @@ -325,18 +325,7 @@ public: Cycles += numC + numI; } - void AddCycles_CDI() override - { - // LDR/LDM cycles. ARM9 seems to skip the internal cycle there. - // TODO: ITCM data fetches shouldn't be parallelized, they say - s32 numC = (R[15] & 0x2) ? 
0 : CodeCycles; - s32 numD = DataCycles; - - //if (DataRegion != CodeRegion) - Cycles += std::max(numC + numD - 6, std::max(numC, numD)); - //else - // Cycles += numC + numD; - } + void AddCycles_CDI() override; void AddCycles_CD() override { diff --git a/src/CP15.cpp b/src/CP15.cpp index 7b11696b..319ac9c4 100644 --- a/src/CP15.cpp +++ b/src/CP15.cpp @@ -815,22 +815,23 @@ bool ARMv5::DataRead8(u32 addr, u32* val) return false; } - DataRegion = addr; - if (addr < ITCMSize) { + DataRegion = Mem9_ITCM; DataCycles = 1; *val = *(u8*)&ITCM[addr & (ITCMPhysicalSize - 1)]; return true; } if ((addr & DTCMMask) == DTCMBase) { + DataRegion = Mem9_DTCM; DataCycles = 1; *val = *(u8*)&DTCM[addr & (DTCMPhysicalSize - 1)]; return true; } - + *val = BusRead8(addr); + DataRegion = NDS.ARM9Regions[addr >> 14]; DataCycles = MemTimings[addr >> 12][1]; return true; } @@ -843,24 +844,25 @@ bool ARMv5::DataRead16(u32 addr, u32* val) return false; } - DataRegion = addr; - addr &= ~1; if (addr < ITCMSize) { + DataRegion = Mem9_ITCM; DataCycles = 1; *val = *(u16*)&ITCM[addr & (ITCMPhysicalSize - 1)]; return true; } if ((addr & DTCMMask) == DTCMBase) { + DataRegion = Mem9_DTCM; DataCycles = 1; *val = *(u16*)&DTCM[addr & (DTCMPhysicalSize - 1)]; return true; } - + *val = BusRead16(addr); + DataRegion = NDS.ARM9Regions[addr >> 14]; DataCycles = MemTimings[addr >> 12][1]; return true; } @@ -873,24 +875,25 @@ bool ARMv5::DataRead32(u32 addr, u32* val) return false; } - DataRegion = addr; - addr &= ~3; if (addr < ITCMSize) { + DataRegion = Mem9_ITCM; DataCycles = 1; *val = *(u32*)&ITCM[addr & (ITCMPhysicalSize - 1)]; return true; } if ((addr & DTCMMask) == DTCMBase) { + DataRegion = Mem9_DTCM; DataCycles = 1; *val = *(u32*)&DTCM[addr & (DTCMPhysicalSize - 1)]; return true; } *val = BusRead32(addr); + DataRegion = NDS.ARM9Regions[addr >> 14]; DataCycles = MemTimings[addr >> 12][2]; return true; } From dbe00e72ddfc0fc2b342c2afbaeda13294c7763b Mon Sep 17 00:00:00 2001 From: Jaklyy 
<102590697+Jaklyy@users.noreply.github.com> Date: Mon, 24 Jun 2024 22:50:04 -0400 Subject: [PATCH 083/306] improve stm timings need to verify if they apply to all store instructions --- src/ARM.cpp | 42 ++++++++++++++++++++++++++++++------------ src/ARM.h | 12 +----------- src/CP15.cpp | 15 +++++++++------ 3 files changed, 40 insertions(+), 29 deletions(-) diff --git a/src/ARM.cpp b/src/ARM.cpp index cb72dad5..907a4790 100644 --- a/src/ARM.cpp +++ b/src/ARM.cpp @@ -1259,6 +1259,31 @@ bool ARMv4::DataWrite32S(u32 addr, u32 val, bool dataabort) } +void ARMv5::AddCycles_CD() +{ + s32 numC = (R[15] & 0x2) ? 0 : CodeCycles; + s32 numD = DataCycles; + + s32 early; + if (DataRegion == Mem9_ITCM) + { + early = (CodeRegion == Mem9_ITCM) ? -1 : 0; + } + else if (DataRegion == Mem9_DTCM) + { + early = 2; + } + else if (DataRegion == Mem9_MainRAM) + { + early = (CodeRegion == Mem9_MainRAM) ? 0 : 18; // CHECKME: how early can main ram be? + } + else early = (DataRegion == CodeRegion) ? 4 : 6; + + s32 code = numC - early; + if (code < 0) code = 0; + Cycles += std::max(code + numD, numC); +} + void ARMv5::AddCycles_CDI() { // LDR/LDM cycles. ARM9 seems to skip the internal cycle there. @@ -1269,7 +1294,7 @@ void ARMv5::AddCycles_CDI() s32 early; switch (DataRegion) { - case 0: // background region; CHECKME + case 0: // background region; case Mem9_DTCM: case Mem9_BIOS: case Mem9_WRAM: @@ -1297,17 +1322,10 @@ void ARMv5::AddCycles_CDI() early = (CodeRegion == Mem9_ITCM) ? 
-1 : 0; break; } - - if (numD > early) - { - numC -= early; - if (numC < 0) numC = 0; - Cycles += numC + numD; - } - else - { - Cycles += numC; - } + + s32 code = numC - early; + if (code < 0) code = 0; + Cycles += std::max(code + numD, numC); } void ARMv4::AddCycles_C() diff --git a/src/ARM.h b/src/ARM.h index 25a96ef2..68eeb685 100644 --- a/src/ARM.h +++ b/src/ARM.h @@ -327,17 +327,7 @@ public: void AddCycles_CDI() override; - void AddCycles_CD() override - { - // TODO: ITCM data fetches shouldn't be parallelized, they say - s32 numC = (R[15] & 0x2) ? 0 : CodeCycles; - s32 numD = DataCycles; - - //if (DataRegion != CodeRegion) - Cycles += std::max(numC + numD - 6, std::max(numC, numD)); - //else - // Cycles += numC + numD; - } + void AddCycles_CD() override; #ifdef INTERLOCK // fetch the value of a register while handling any interlock cycles diff --git a/src/CP15.cpp b/src/CP15.cpp index 319ac9c4..06e01e83 100644 --- a/src/CP15.cpp +++ b/src/CP15.cpp @@ -934,10 +934,9 @@ bool ARMv5::DataWrite8(u32 addr, u8 val) return false; } - DataRegion = addr; - if (addr < ITCMSize) { + DataRegion = Mem9_ITCM; DataCycles = 1; *(u8*)&ITCM[addr & (ITCMPhysicalSize - 1)] = val; NDS.JIT.CheckAndInvalidate<0, ARMJIT_Memory::memregion_ITCM>(addr); @@ -945,12 +944,14 @@ bool ARMv5::DataWrite8(u32 addr, u8 val) } if ((addr & DTCMMask) == DTCMBase) { + DataRegion = Mem9_DTCM; DataCycles = 1; *(u8*)&DTCM[addr & (DTCMPhysicalSize - 1)] = val; return true; } BusWrite8(addr, val); + DataRegion = NDS.ARM9Regions[addr >> 14]; DataCycles = MemTimings[addr >> 12][1]; return true; } @@ -963,12 +964,11 @@ bool ARMv5::DataWrite16(u32 addr, u16 val) return false; } - DataRegion = addr; - addr &= ~1; if (addr < ITCMSize) { + DataRegion = Mem9_ITCM; DataCycles = 1; *(u16*)&ITCM[addr & (ITCMPhysicalSize - 1)] = val; NDS.JIT.CheckAndInvalidate<0, ARMJIT_Memory::memregion_ITCM>(addr); @@ -976,12 +976,14 @@ bool ARMv5::DataWrite16(u32 addr, u16 val) } if ((addr & DTCMMask) == DTCMBase) { + DataRegion 
= Mem9_DTCM; DataCycles = 1; *(u16*)&DTCM[addr & (DTCMPhysicalSize - 1)] = val; return true; } BusWrite16(addr, val); + DataRegion = NDS.ARM9Regions[addr >> 14]; DataCycles = MemTimings[addr >> 12][1]; return true; } @@ -994,12 +996,11 @@ bool ARMv5::DataWrite32(u32 addr, u32 val) return false; } - DataRegion = addr; - addr &= ~3; if (addr < ITCMSize) { + DataRegion = Mem9_ITCM; DataCycles = 1; *(u32*)&ITCM[addr & (ITCMPhysicalSize - 1)] = val; NDS.JIT.CheckAndInvalidate<0, ARMJIT_Memory::memregion_ITCM>(addr); @@ -1007,12 +1008,14 @@ bool ARMv5::DataWrite32(u32 addr, u32 val) } if ((addr & DTCMMask) == DTCMBase) { + DataRegion = Mem9_DTCM; DataCycles = 1; *(u32*)&DTCM[addr & (DTCMPhysicalSize - 1)] = val; return true; } BusWrite32(addr, val); + DataRegion = NDS.ARM9Regions[addr >> 14]; DataCycles = MemTimings[addr >> 12][2]; return true; } From 541e1e6388537790a0bd2d8a515be3de5c52956d Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Tue, 25 Jun 2024 09:08:11 -0400 Subject: [PATCH 084/306] proper timings for ldr/str --- src/ARM.cpp | 60 +++++++++++++++-- src/ARM.h | 21 ++++-- src/ARMInterpreter_LoadStore.cpp | 111 ++++++++++++++++--------------- 3 files changed, 126 insertions(+), 66 deletions(-) diff --git a/src/ARM.cpp b/src/ARM.cpp index 907a4790..644a58a2 100644 --- a/src/ARM.cpp +++ b/src/ARM.cpp @@ -1259,7 +1259,7 @@ bool ARMv4::DataWrite32S(u32 addr, u32 val, bool dataabort) } -void ARMv5::AddCycles_CD() +void ARMv5::AddCycles_CD_STR() { s32 numC = (R[15] & 0x2) ? 0 : CodeCycles; s32 numD = DataCycles; @@ -1267,7 +1267,7 @@ void ARMv5::AddCycles_CD() s32 early; if (DataRegion == Mem9_ITCM) { - early = (CodeRegion == Mem9_ITCM) ? -1 : 0; + early = (CodeRegion == Mem9_ITCM) ? 0 : 2; } else if (DataRegion == Mem9_DTCM) { @@ -1284,9 +1284,61 @@ void ARMv5::AddCycles_CD() Cycles += std::max(code + numD, numC); } -void ARMv5::AddCycles_CDI() +void ARMv5::AddCycles_CD_STM() { - // LDR/LDM cycles. 
ARM9 seems to skip the internal cycle there. + s32 numC = (R[15] & 0x2) ? 0 : CodeCycles; + s32 numD = DataCycles; + + s32 early; + if (DataRegion == Mem9_ITCM) + { + early = (CodeRegion == Mem9_ITCM) ? -1 : 0; // stm adds either: no penalty or benefit to itcm loads, or a 1 cycle penalty if executing from itcm. + } + else if (DataRegion == Mem9_DTCM) + { + early = 2; + } + else if (DataRegion == Mem9_MainRAM) + { + early = (CodeRegion == Mem9_MainRAM) ? 0 : 18; // CHECKME: how early can main ram be? + } + else early = (DataRegion == CodeRegion) ? 4 : 6; + + s32 code = numC - early; + if (code < 0) code = 0; + Cycles += std::max(code + numD, numC); +} + +void ARMv5::AddCycles_CDI_LDR() +{ + // LDR cycles. ARM9 seems to skip the internal cycle here. + s32 numC = (R[15] & 0x2) ? 0 : CodeCycles; + s32 numD = DataCycles; + + // if a 32 bit bus, start 2 cycles early; else, start 4 cycles early + s32 early; + if (DataRegion == Mem9_ITCM) + { + early = (CodeRegion == Mem9_ITCM) ? 0 : 2; + } + else if (DataRegion == Mem9_DTCM) + { + early = 2; + } + else if (DataRegion == Mem9_MainRAM) + { + early = (CodeRegion == Mem9_MainRAM) ? 0 : 6; + } + else early = 6; + + s32 code = numC - early; + if (code < 0) code = 0; + Cycles += std::max(code + numD, numC); +} + +void ARMv5::AddCycles_CDI_LDM() +{ + // LDM cycles. ARM9 seems to skip the internal cycle here. s32 numC = (R[15] & 0x2) ? 
0 : CodeCycles; s32 numD = DataCycles; diff --git a/src/ARM.h b/src/ARM.h index 68eeb685..38f60c6f 100644 --- a/src/ARM.h +++ b/src/ARM.h @@ -142,8 +142,10 @@ public: virtual void AddCycles_C() = 0; virtual void AddCycles_CI(s32 numI) = 0; - virtual void AddCycles_CDI() = 0; - virtual void AddCycles_CD() = 0; + virtual void AddCycles_CDI_LDR() = 0; + virtual void AddCycles_CDI_LDM() = 0; + virtual void AddCycles_CD_STR() = 0; + virtual void AddCycles_CD_STM() = 0; /* inline void AddCycles_L(const u32 delay, const u32 reg1) @@ -325,9 +327,10 @@ public: Cycles += numC + numI; } - void AddCycles_CDI() override; - - void AddCycles_CD() override; + void AddCycles_CDI_LDR() override; + void AddCycles_CDI_LDM() override; + void AddCycles_CD_STR() override; + void AddCycles_CD_STM() override; #ifdef INTERLOCK // fetch the value of a register while handling any interlock cycles @@ -460,8 +463,12 @@ public: bool DataWrite32S(u32 addr, u32 val, bool dataabort = false) override; void AddCycles_C() override; void AddCycles_CI(s32 num) override; - void AddCycles_CDI() override; - void AddCycles_CD() override; + void AddCycles_CDI(); + void AddCycles_CDI_LDR() override { AddCycles_CDI(); } + void AddCycles_CDI_LDM() override { AddCycles_CDI(); } + void AddCycles_CD(); + void AddCycles_CD_STR() override { AddCycles_CD(); } + void AddCycles_CD_STM() override { AddCycles_CD(); } #ifdef INTERLOCK // fetch the value of a register while handling any interlock cycles diff --git a/src/ARMInterpreter_LoadStore.cpp b/src/ARMInterpreter_LoadStore.cpp index 3fac1963..dd7f9762 100644 --- a/src/ARMInterpreter_LoadStore.cpp +++ b/src/ARMInterpreter_LoadStore.cpp @@ -66,7 +66,7 @@ namespace melonDS::ARMInterpreter if (((cpu->CurInstr>>12) & 0xF) == 0xF) \ storeval += 4; \ bool dataabort = !cpu->DataWrite32(offset, storeval); \ - cpu->AddCycles_CD(); \ + cpu->AddCycles_CD_STR(); \ if (dataabort) return; \ if (cpu->CurInstr & (1<<21)) cpu->R[(cpu->CurInstr>>16) & 0xF] = offset; @@ -77,7 +77,7 @@ 
namespace melonDS::ARMInterpreter if (((cpu->CurInstr>>12) & 0xF) == 0xF) \ storeval += 4; \ bool dataabort = !cpu->DataWrite32(addr, storeval); \ - cpu->AddCycles_CD(); \ + cpu->AddCycles_CD_STR(); \ if (dataabort) return; \ cpu->R[(cpu->CurInstr>>16) & 0xF] += offset; @@ -86,7 +86,7 @@ namespace melonDS::ARMInterpreter u32 storeval = cpu->GetReg((cpu->CurInstr>>12) & 0xF); \ if (((cpu->CurInstr>>12) & 0xF) == 15) storeval+=4; \ bool dataabort = !cpu->DataWrite8(offset, storeval); \ - cpu->AddCycles_CD(); \ + cpu->AddCycles_CD_STR(); \ if (dataabort) return; \ if (cpu->CurInstr & (1<<21)) cpu->R[(cpu->CurInstr>>16) & 0xF] = offset; @@ -96,14 +96,14 @@ namespace melonDS::ARMInterpreter u32 storeval = cpu->GetReg((cpu->CurInstr>>12) & 0xF); \ if (((cpu->CurInstr>>12) & 0xF) == 15) storeval+=4; \ bool dataabort = !cpu->DataWrite8(addr, storeval); \ - cpu->AddCycles_CD(); \ + cpu->AddCycles_CD_STR(); \ if (dataabort) return; \ cpu->R[(cpu->CurInstr>>16) & 0xF] += offset; #define A_LDR \ offset += cpu->GetReg((cpu->CurInstr>>16) & 0xF); \ u32 val; bool dataabort = !cpu->DataRead32(offset, &val); \ - cpu->AddCycles_CDI(); \ + cpu->AddCycles_CDI_LDR(); \ if (dataabort) return; \ val = ROR(val, ((offset&0x3)<<3)); \ if (cpu->CurInstr & (1<<21)) cpu->R[(cpu->CurInstr>>16) & 0xF] = offset; \ @@ -122,7 +122,7 @@ namespace melonDS::ARMInterpreter #define A_LDR_POST \ u32 addr = cpu->GetReg((cpu->CurInstr>>16) & 0xF); \ u32 val; bool dataabort = !cpu->DataRead32(addr, &val); \ - cpu->AddCycles_CDI(); \ + cpu->AddCycles_CDI_LDR(); \ if (dataabort) return; \ val = ROR(val, ((addr&0x3)<<3)); \ cpu->R[(cpu->CurInstr>>16) & 0xF] += offset; \ @@ -140,7 +140,7 @@ namespace melonDS::ARMInterpreter #define A_LDRB \ offset += cpu->GetReg((cpu->CurInstr>>16) & 0xF); \ u32 val; bool dataabort = !cpu->DataRead8(offset, &val); \ - cpu->AddCycles_CDI(); \ + cpu->AddCycles_CDI_LDR(); \ if (dataabort) return; \ if (cpu->CurInstr & (1<<21)) cpu->R[(cpu->CurInstr>>16) & 0xF] = offset; \ if 
(((cpu->CurInstr>>12) & 0xF) == 15) \ @@ -155,7 +155,7 @@ namespace melonDS::ARMInterpreter #define A_LDRB_POST \ u32 addr = cpu->GetReg((cpu->CurInstr>>16) & 0xF); \ u32 val; bool dataabort = !cpu->DataRead8(addr, &val); \ - cpu->AddCycles_CDI(); \ + cpu->AddCycles_CDI_LDR(); \ if (dataabort) return; \ cpu->R[(cpu->CurInstr>>16) & 0xF] += offset; \ if (((cpu->CurInstr>>12) & 0xF) == 15) \ @@ -252,7 +252,7 @@ A_IMPLEMENT_WB_LDRSTR(LDRB) u32 storeval = cpu->GetReg((cpu->CurInstr>>12) & 0xF); \ if (((cpu->CurInstr>>12) & 0xF) == 15) storeval+=4; \ bool dataabort = !cpu->DataWrite16(offset, storeval); \ - cpu->AddCycles_CD(); \ + cpu->AddCycles_CD_STR(); \ if (dataabort) return; \ if (cpu->CurInstr & (1<<21)) cpu->R[(cpu->CurInstr>>16) & 0xF] = offset; @@ -261,7 +261,7 @@ A_IMPLEMENT_WB_LDRSTR(LDRB) u32 storeval = cpu->GetReg((cpu->CurInstr>>12) & 0xF); \ if (((cpu->CurInstr>>12) & 0xF) == 15) storeval+=4; \ bool dataabort = !cpu->DataWrite16(addr, storeval); \ - cpu->AddCycles_CD(); \ + cpu->AddCycles_CD_STR(); \ if (dataabort) return; \ cpu->R[(cpu->CurInstr>>16) & 0xF] += offset; @@ -272,9 +272,9 @@ A_IMPLEMENT_WB_LDRSTR(LDRB) offset += cpu->GetReg((cpu->CurInstr>>16) & 0xF); \ u32 r = (cpu->CurInstr>>12) & 0xF; \ if (r&1) { A_UNK(cpu); return; } \ - if (!cpu->DataRead32 (offset, &cpu->R[r])) {cpu->AddCycles_CDI(); return;} \ + if (!cpu->DataRead32 (offset, &cpu->R[r])) {cpu->AddCycles_CDI_LDR(); return;} \ u32 val; bool dataabort = !cpu->DataRead32S(offset+4, &val); \ - cpu->AddCycles_CDI(); \ + cpu->AddCycles_CDI_LDM(); \ if (dataabort) return; \ if (r == 14) \ cpu->JumpTo(((((ARMv5*)cpu)->CP15Control & (1<<15)) ? 
(val & ~0x1) : val), cpu->CurInstr & (1<<22)); /* restores cpsr presumably due to shared dna with ldm */ \ @@ -290,9 +290,9 @@ A_IMPLEMENT_WB_LDRSTR(LDRB) u32 addr = cpu->GetReg((cpu->CurInstr>>16) & 0xF); \ u32 r = (cpu->CurInstr>>12) & 0xF; \ if (r&1) { A_UNK(cpu); return; } \ - if (!cpu->DataRead32 (addr, &cpu->R[r])) {cpu->AddCycles_CDI(); return;} \ + if (!cpu->DataRead32 (addr, &cpu->R[r])) {cpu->AddCycles_CDI_LDR(); return;} \ u32 val; bool dataabort = !cpu->DataRead32S(addr+4, &val); \ - cpu->AddCycles_CDI(); \ + cpu->AddCycles_CDI_LDM(); \ if (dataabort) return; \ if (r == 14) \ cpu->JumpTo(((((ARMv5*)cpu)->CP15Control & (1<<15)) ? (val & ~0x1) : val), cpu->CurInstr & (1<<22)); /* restores cpsr presumably due to shared dna with ldm */ \ @@ -311,7 +311,7 @@ A_IMPLEMENT_WB_LDRSTR(LDRB) bool dataabort = !cpu->DataWrite32(offset, cpu->GetReg(r)); /* yes, this data abort behavior is on purpose */ \ u32 storeval = cpu->GetReg(r+1, cpu->DataCycles); if (r == 14) storeval+=4; \ dataabort |= !cpu->DataWrite32S (offset+4, storeval, dataabort); /* no, i dont understand it either */ \ - cpu->AddCycles_CD(); \ + cpu->AddCycles_CD_STM(); \ if (dataabort) return; \ if (cpu->CurInstr & (1<<21)) cpu->R[(cpu->CurInstr>>16) & 0xF] = offset; @@ -323,14 +323,14 @@ A_IMPLEMENT_WB_LDRSTR(LDRB) bool dataabort = !cpu->DataWrite32(addr, cpu->GetReg(r)); \ u32 storeval = cpu->GetReg(r+1, cpu->DataCycles); if (r == 14) storeval+=4; \ dataabort |= !cpu->DataWrite32S (addr+4, storeval, dataabort); \ - cpu->AddCycles_CD(); \ + cpu->AddCycles_CD_STM(); \ if (dataabort) return; \ cpu->R[(cpu->CurInstr>>16) & 0xF] += offset; #define A_LDRH \ offset += cpu->GetReg((cpu->CurInstr>>16) & 0xF); \ u32 val; bool dataabort = !cpu->DataRead16(offset, &val); \ - cpu->AddCycles_CDI(); \ + cpu->AddCycles_CDI_LDR(); \ if (dataabort) return; \ if (((cpu->CurInstr>>12) & 0xF) == 15) \ cpu->JumpTo8_16Bit(val); \ @@ -344,7 +344,7 @@ A_IMPLEMENT_WB_LDRSTR(LDRB) #define A_LDRH_POST \ u32 addr = 
cpu->GetReg((cpu->CurInstr>>16) & 0xF); \ u32 val; bool dataabort = !cpu->DataRead16(addr, &val); \ - cpu->AddCycles_CDI(); \ + cpu->AddCycles_CDI_LDR(); \ if (dataabort) return; \ if (((cpu->CurInstr>>12) & 0xF) == 15) \ cpu->JumpTo8_16Bit(val); \ @@ -358,7 +358,7 @@ A_IMPLEMENT_WB_LDRSTR(LDRB) #define A_LDRSB \ offset += cpu->GetReg((cpu->CurInstr>>16) & 0xF); \ u32 val; bool dataabort = !cpu->DataRead8(offset, &val); \ - cpu->AddCycles_CDI(); \ + cpu->AddCycles_CDI_LDR(); \ if (dataabort) return; \ val = (s32)(s8)val; \ if (((cpu->CurInstr>>12) & 0xF) == 15) \ @@ -373,7 +373,7 @@ A_IMPLEMENT_WB_LDRSTR(LDRB) #define A_LDRSB_POST \ u32 addr = cpu->GetReg((cpu->CurInstr>>16) & 0xF); \ u32 val; bool dataabort = !cpu->DataRead8(addr, &val); \ - cpu->AddCycles_CDI(); \ + cpu->AddCycles_CDI_LDR(); \ if (dataabort) return; \ val = (s32)(s8)val; \ if (((cpu->CurInstr>>12) & 0xF) == 15) \ @@ -388,7 +388,7 @@ A_IMPLEMENT_WB_LDRSTR(LDRB) #define A_LDRSH \ offset += cpu->GetReg((cpu->CurInstr>>16) & 0xF); \ u32 val; bool dataabort = !cpu->DataRead16(offset, &val); \ - cpu->AddCycles_CDI(); \ + cpu->AddCycles_CDI_LDR(); \ if (dataabort) return; \ val = (s32)(s16)val; \ if (((cpu->CurInstr>>12) & 0xF) == 15) \ @@ -403,7 +403,7 @@ A_IMPLEMENT_WB_LDRSTR(LDRB) #define A_LDRSH_POST \ u32 addr = cpu->GetReg((cpu->CurInstr>>16) & 0xF); \ u32 val; bool dataabort = !cpu->DataRead16(addr, &val); \ - cpu->AddCycles_CDI(); \ + cpu->AddCycles_CDI_LDR(); \ if (dataabort) return; \ val = (s32)(s16)val; \ if (((cpu->CurInstr>>12) & 0xF) == 15) \ @@ -462,7 +462,7 @@ void A_SWP(ARM* cpu) u32 numD = cpu->DataCycles; if (cpu->DataWrite32(base, rm)) { - cpu->AddCycles_CDI(); + cpu->AddCycles_CDI_LDR(); // rd only gets updated if both read and write succeed u32 rd = (cpu->CurInstr >> 12) & 0xF; if (rd != 15) @@ -484,10 +484,10 @@ void A_SWP(ARM* cpu) else if (cpu->Num == 1) // for some reason these jumps don't work on the arm 9? 
cpu->JumpTo(ROR(val, 8*(base&0x3)) & ~1, cpu->ILT_Norm); } - else cpu->AddCycles_CDI(); + else cpu->AddCycles_CDI_LDR(); cpu->DataCycles += numD; } - else cpu->AddCycles_CDI(); + else cpu->AddCycles_CDI_LDR(); } void A_SWPB(ARM* cpu) @@ -502,7 +502,7 @@ void A_SWPB(ARM* cpu) u32 numD = cpu->DataCycles; if (cpu->DataWrite8(base, rm)) { - cpu->AddCycles_CDI(); + cpu->AddCycles_CDI_LDR(); // rd only gets updated if both read and write succeed u32 rd = (cpu->CurInstr >> 12) & 0xF; if (rd != 15) @@ -516,10 +516,10 @@ void A_SWPB(ARM* cpu) else if (cpu->Num == 1)// for some reason these jumps don't work on the arm 9? cpu->JumpTo(val & ~1); } - else cpu->AddCycles_CDI(); + else cpu->AddCycles_CDI_LDR(); cpu->DataCycles += numD; } - else cpu->AddCycles_CDI(); + else cpu->AddCycles_CDI_LDR(); } @@ -582,7 +582,7 @@ void A_LDM(ARM* cpu) { goto dataabort; } - cpu->AddCycles_CDI(); + cpu->AddCycles_CDI_LDM(); if (!preinc) base += 4; @@ -591,7 +591,7 @@ void A_LDM(ARM* cpu) } else { - cpu->AddCycles_CDI(); + cpu->AddCycles_CDI_LDM(); if (cpu->Num == 0) { @@ -635,7 +635,7 @@ void A_LDM(ARM* cpu) if (false) { dataabort: - cpu->AddCycles_CDI(); + cpu->AddCycles_CDI_LDM(); // CHECKME: interlock shouldn't apply when it data aborts, right? // switch back to original set of regs @@ -728,7 +728,7 @@ void A_STM(ARM* cpu) cpu->R[baseid] = oldbase; } - cpu->AddCycles_CD(); + cpu->AddCycles_CD_STM(); } @@ -743,7 +743,7 @@ void T_LDR_PCREL(ARM* cpu) // checkme: can pc be interlocked? 
u32 addr = (cpu->R[15] & ~0x2) + ((cpu->CurInstr & 0xFF) << 2); cpu->DataRead32(addr, &cpu->R[(cpu->CurInstr >> 8) & 0x7]); - cpu->AddCycles_CDI(); + cpu->AddCycles_CDI_LDR(); cpu->SetCycles_L((cpu->CurInstr >> 8) & 0x7, 1, cpu->ILT_Norm); // checkme: verify cycle count } @@ -753,7 +753,7 @@ void T_STR_REG(ARM* cpu) u32 addr = cpu->GetReg((cpu->CurInstr >> 3) & 0x7) + cpu->GetReg((cpu->CurInstr >> 6) & 0x7); cpu->DataWrite32(addr, cpu->GetReg(cpu->CurInstr & 0x7, 1)); - cpu->AddCycles_CD(); + cpu->AddCycles_CD_STR(); } void T_STRB_REG(ARM* cpu) @@ -761,7 +761,7 @@ void T_STRB_REG(ARM* cpu) u32 addr = cpu->GetReg((cpu->CurInstr >> 3) & 0x7) + cpu->GetReg((cpu->CurInstr >> 6) & 0x7); cpu->DataWrite8(addr, cpu->GetReg(cpu->CurInstr & 0x7, 1)); - cpu->AddCycles_CD(); + cpu->AddCycles_CD_STR(); } void T_LDR_REG(ARM* cpu) @@ -772,7 +772,7 @@ void T_LDR_REG(ARM* cpu) if (cpu->DataRead32(addr, &val)) cpu->R[cpu->CurInstr & 0x7] = ROR(val, 8*(addr&0x3)); - cpu->AddCycles_CDI(); + cpu->AddCycles_CDI_LDR(); cpu->SetCycles_L(cpu->CurInstr & 0x7, (addr & 3) ? 
2 : 1, cpu->ILT_Norm); } @@ -781,7 +781,7 @@ void T_LDRB_REG(ARM* cpu) u32 addr = cpu->GetReg((cpu->CurInstr >> 3) & 0x7) + cpu->GetReg((cpu->CurInstr >> 6) & 0x7); cpu->DataRead8(addr, &cpu->R[cpu->CurInstr & 0x7]); - cpu->AddCycles_CDI(); + cpu->AddCycles_CDI_LDR(); cpu->SetCycles_L(cpu->CurInstr & 0x7, 2, cpu->ILT_Norm); } @@ -791,7 +791,7 @@ void T_STRH_REG(ARM* cpu) u32 addr = cpu->GetReg((cpu->CurInstr >> 3) & 0x7) + cpu->GetReg((cpu->CurInstr >> 6) & 0x7); cpu->DataWrite16(addr, cpu->GetReg(cpu->CurInstr & 0x7, 1)); - cpu->AddCycles_CD(); + cpu->AddCycles_CD_STR(); } void T_LDRSB_REG(ARM* cpu) @@ -800,7 +800,7 @@ void T_LDRSB_REG(ARM* cpu) if (cpu->DataRead8(addr, &cpu->R[cpu->CurInstr & 0x7])) cpu->R[cpu->CurInstr & 0x7] = (s32)(s8)cpu->R[cpu->CurInstr & 0x7]; - cpu->AddCycles_CDI(); + cpu->AddCycles_CDI_LDR(); cpu->SetCycles_L(cpu->CurInstr & 0x7, 2, cpu->ILT_Norm); } @@ -809,7 +809,7 @@ void T_LDRH_REG(ARM* cpu) u32 addr = cpu->GetReg((cpu->CurInstr >> 3) & 0x7) + cpu->GetReg((cpu->CurInstr >> 6) & 0x7); cpu->DataRead16(addr, &cpu->R[cpu->CurInstr & 0x7]); - cpu->AddCycles_CDI(); + cpu->AddCycles_CDI_LDR(); cpu->SetCycles_L(cpu->CurInstr & 0x7, 2, cpu->ILT_Norm); } @@ -819,7 +819,7 @@ void T_LDRSH_REG(ARM* cpu) if (cpu->DataRead16(addr, &cpu->R[cpu->CurInstr & 0x7])) cpu->R[cpu->CurInstr & 0x7] = (s32)(s16)cpu->R[cpu->CurInstr & 0x7]; - cpu->AddCycles_CDI(); + cpu->AddCycles_CDI_LDR(); cpu->SetCycles_L(cpu->CurInstr & 0x7, 2, cpu->ILT_Norm); } @@ -830,7 +830,7 @@ void T_STR_IMM(ARM* cpu) offset += cpu->GetReg((cpu->CurInstr >> 3) & 0x7); cpu->DataWrite32(offset, cpu->GetReg(cpu->CurInstr & 0x7, 1)); - cpu->AddCycles_CD(); + cpu->AddCycles_CD_STR(); } void T_LDR_IMM(ARM* cpu) @@ -841,7 +841,7 @@ void T_LDR_IMM(ARM* cpu) u32 val; if (cpu->DataRead32(offset, &val)) cpu->R[cpu->CurInstr & 0x7] = ROR(val, 8*(offset&0x3)); - cpu->AddCycles_CDI(); + cpu->AddCycles_CDI_LDR(); cpu->SetCycles_L(cpu->CurInstr & 0x7, (offset & 3) ? 
2 : 1, cpu->ILT_Norm); } @@ -851,7 +851,7 @@ void T_STRB_IMM(ARM* cpu) offset += cpu->GetReg((cpu->CurInstr >> 3) & 0x7); cpu->DataWrite8(offset, cpu->GetReg(cpu->CurInstr & 0x7, 1)); - cpu->AddCycles_CD(); + cpu->AddCycles_CD_STR(); } void T_LDRB_IMM(ARM* cpu) @@ -860,7 +860,7 @@ void T_LDRB_IMM(ARM* cpu) offset += cpu->GetReg((cpu->CurInstr >> 3) & 0x7); cpu->DataRead8(offset, &cpu->R[cpu->CurInstr & 0x7]); - cpu->AddCycles_CDI(); + cpu->AddCycles_CDI_LDR(); cpu->SetCycles_L(cpu->CurInstr & 0x7, 2, cpu->ILT_Norm); } @@ -871,7 +871,7 @@ void T_STRH_IMM(ARM* cpu) offset += cpu->GetReg((cpu->CurInstr >> 3) & 0x7); cpu->DataWrite16(offset, cpu->GetReg(cpu->CurInstr & 0x7, 1)); - cpu->AddCycles_CD(); + cpu->AddCycles_CD_STR(); } void T_LDRH_IMM(ARM* cpu) @@ -880,7 +880,7 @@ void T_LDRH_IMM(ARM* cpu) offset += cpu->GetReg((cpu->CurInstr >> 3) & 0x7); cpu->DataRead16(offset, &cpu->R[cpu->CurInstr & 0x7]); - cpu->AddCycles_CDI(); + cpu->AddCycles_CDI_LDR(); cpu->SetCycles_L(cpu->CurInstr & 0x7, 2, cpu->ILT_Norm); } @@ -891,7 +891,7 @@ void T_STR_SPREL(ARM* cpu) // checkme: can sp be interlocked in thumb mode? offset += cpu->R[13]; cpu->DataWrite32(offset, cpu->GetReg((cpu->CurInstr >> 8) & 0x7, 1)); - cpu->AddCycles_CD(); + cpu->AddCycles_CD_STR(); } void T_LDR_SPREL(ARM* cpu) // checkme: can sp be interlocked in thumb mode? @@ -900,7 +900,7 @@ void T_LDR_SPREL(ARM* cpu) // checkme: can sp be interlocked in thumb mode? offset += cpu->R[13]; cpu->DataRead32(offset, &cpu->R[(cpu->CurInstr >> 8) & 0x7]); - cpu->AddCycles_CDI(); + cpu->AddCycles_CDI_LDR(); cpu->SetCycles_L((cpu->CurInstr >> 8) & 0x7, 1, cpu->ILT_Norm); // checkme: verify cycle count } @@ -949,7 +949,7 @@ void T_PUSH(ARM* cpu) cpu->R[13] = wbbase; dataabort: - cpu->AddCycles_CD(); + cpu->AddCycles_CD_STM(); } void T_POP(ARM* cpu) // checkme: can sp be interlocked in thumb mode? @@ -986,7 +986,8 @@ void T_POP(ARM* cpu) // checkme: can sp be interlocked in thumb mode? 
} cpu->R[13] = base; - + + cpu->AddCycles_CDI_LDM(); if (cpu->Num == 0) { u32 lastbase = base - 4; @@ -997,7 +998,7 @@ void T_POP(ARM* cpu) // checkme: can sp be interlocked in thumb mode? return; dataabort: - cpu->AddCycles_CDI(); + cpu->AddCycles_CDI_LDM(); } void T_STMIA(ARM* cpu) @@ -1022,7 +1023,7 @@ void T_STMIA(ARM* cpu) // TODO: check "Rb included in Rlist" case cpu->R[(cpu->CurInstr >> 8) & 0x7] = base; dataabort: - cpu->AddCycles_CD(); + cpu->AddCycles_CD_STM(); } void T_LDMIA(ARM* cpu) @@ -1050,7 +1051,7 @@ void T_LDMIA(ARM* cpu) cpu->R[(cpu->CurInstr >> 8) & 0x7] = base; - cpu->AddCycles_CDI(); + cpu->AddCycles_CDI_LDM(); if (cpu->Num == 0) { u32 lastbase = base - 4; @@ -1061,7 +1062,7 @@ void T_LDMIA(ARM* cpu) return; dataabort: - cpu->AddCycles_CDI(); + cpu->AddCycles_CDI_LDM(); } From c5b035a97314d10f7cfa54de8ead946acaf43dee Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Tue, 25 Jun 2024 11:20:01 -0400 Subject: [PATCH 085/306] SWP and SWPB use the same behavior as STR on the ARM9 --- src/ARM.h | 3 +++ src/ARMInterpreter_LoadStore.cpp | 12 ++++++------ 2 files changed, 9 insertions(+), 6 deletions(-) diff --git a/src/ARM.h b/src/ARM.h index 38f60c6f..cb47f287 100644 --- a/src/ARM.h +++ b/src/ARM.h @@ -144,6 +144,7 @@ public: virtual void AddCycles_CI(s32 numI) = 0; virtual void AddCycles_CDI_LDR() = 0; virtual void AddCycles_CDI_LDM() = 0; + virtual void AddCycles_CDI_SWP() = 0; virtual void AddCycles_CD_STR() = 0; virtual void AddCycles_CD_STM() = 0; @@ -329,6 +330,7 @@ public: void AddCycles_CDI_LDR() override; void AddCycles_CDI_LDM() override; + void AddCycles_CDI_SWP() override { AddCycles_CD_STR(); } // uses the same behavior as str void AddCycles_CD_STR() override; void AddCycles_CD_STM() override; @@ -466,6 +468,7 @@ public: void AddCycles_CDI(); void AddCycles_CDI_LDR() override { AddCycles_CDI(); } void AddCycles_CDI_LDM() override { AddCycles_CDI(); } + void AddCycles_CDI_SWP() override { 
AddCycles_CDI(); } // checkme? void AddCycles_CD(); void AddCycles_CD_STR() override { AddCycles_CD(); } void AddCycles_CD_STM() override { AddCycles_CD(); } diff --git a/src/ARMInterpreter_LoadStore.cpp b/src/ARMInterpreter_LoadStore.cpp index dd7f9762..d874fb9a 100644 --- a/src/ARMInterpreter_LoadStore.cpp +++ b/src/ARMInterpreter_LoadStore.cpp @@ -462,7 +462,7 @@ void A_SWP(ARM* cpu) u32 numD = cpu->DataCycles; if (cpu->DataWrite32(base, rm)) { - cpu->AddCycles_CDI_LDR(); + cpu->AddCycles_CDI_SWP(); // rd only gets updated if both read and write succeed u32 rd = (cpu->CurInstr >> 12) & 0xF; if (rd != 15) @@ -484,10 +484,10 @@ void A_SWP(ARM* cpu) else if (cpu->Num == 1) // for some reason these jumps don't work on the arm 9? cpu->JumpTo(ROR(val, 8*(base&0x3)) & ~1, cpu->ILT_Norm); } - else cpu->AddCycles_CDI_LDR(); + else cpu->AddCycles_CDI_SWP(); cpu->DataCycles += numD; } - else cpu->AddCycles_CDI_LDR(); + else cpu->AddCycles_CDI_SWP(); } void A_SWPB(ARM* cpu) @@ -502,7 +502,7 @@ void A_SWPB(ARM* cpu) u32 numD = cpu->DataCycles; if (cpu->DataWrite8(base, rm)) { - cpu->AddCycles_CDI_LDR(); + cpu->AddCycles_CDI_SWP(); // rd only gets updated if both read and write succeed u32 rd = (cpu->CurInstr >> 12) & 0xF; if (rd != 15) @@ -516,10 +516,10 @@ void A_SWPB(ARM* cpu) else if (cpu->Num == 1)// for some reason these jumps don't work on the arm 9? 
cpu->JumpTo(val & ~1); } - else cpu->AddCycles_CDI_LDR(); + else cpu->AddCycles_CDI_SWP(); cpu->DataCycles += numD; } - else cpu->AddCycles_CDI_LDR(); + else cpu->AddCycles_CDI_SWP(); } From 88e5584b5f018458b0a683aa33d6d6e367a1a8f0 Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Thu, 27 Jun 2024 13:02:38 -0400 Subject: [PATCH 086/306] fix clz r15 --- src/ARMInterpreter_ALU.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/ARMInterpreter_ALU.cpp b/src/ARMInterpreter_ALU.cpp index 44ee84aa..51b219d7 100644 --- a/src/ARMInterpreter_ALU.cpp +++ b/src/ARMInterpreter_ALU.cpp @@ -1078,7 +1078,8 @@ void A_CLZ(ARM* cpu) val |= 0x1; } - cpu->R[(cpu->CurInstr >> 12) & 0xF] = res; + if ((cpu->CurInstr >> 12) & 0xF == 15) cpu->JumpTo(res & ~1); + else cpu->R[(cpu->CurInstr >> 12) & 0xF] = res; cpu->AddCycles_C(); } From a549977eb0c4823a1a900f60383e4d003402a3af Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Thu, 4 Jul 2024 11:04:38 -0400 Subject: [PATCH 087/306] fix clz for realsies --- src/ARMInterpreter_ALU.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ARMInterpreter_ALU.cpp b/src/ARMInterpreter_ALU.cpp index 3e94d2af..e3208668 100644 --- a/src/ARMInterpreter_ALU.cpp +++ b/src/ARMInterpreter_ALU.cpp @@ -1078,7 +1078,7 @@ void A_CLZ(ARM* cpu) val |= 0x1; } - if ((cpu->CurInstr >> 12) & 0xF == 15) cpu->JumpTo(res & ~1); + if (((cpu->CurInstr >> 12) & 0xF) == 15) cpu->JumpTo(res & ~1); else cpu->R[(cpu->CurInstr >> 12) & 0xF] = res; cpu->AddCycles_C(); } From bd1665c1d3602a3d7f6b327ddb0ecf0982340047 Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Thu, 4 Jul 2024 11:15:37 -0400 Subject: [PATCH 088/306] minor timing tweaks --- src/ARM.cpp | 2 +- src/ARM.h | 7 ++++--- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/src/ARM.cpp b/src/ARM.cpp index 3ecae2c8..f667e0f6 100644 --- a/src/ARM.cpp +++ 
b/src/ARM.cpp @@ -647,7 +647,7 @@ void ARMv5::Execute() R[15] += 2; CurInstr = NextInstr[0]; NextInstr[0] = NextInstr[1]; - if (R[15] & 0x2) { NextInstr[1] >>= 16; CodeCycles = 0; } + if (R[15] & 0x2) { NextInstr[1] >>= 16; CodeCycles = 1; } else NextInstr[1] = CodeRead32(R[15], false); // actually execute diff --git a/src/ARM.h b/src/ARM.h index 8efb8fa6..20d11ad2 100644 --- a/src/ARM.h +++ b/src/ARM.h @@ -317,15 +317,16 @@ public: void AddCycles_C() override { // code only. always nonseq 32-bit for ARM9. - s32 numC = (R[15] & 0x2) ? 0 : CodeCycles; + s32 numC = CodeCycles; Cycles += numC; } void AddCycles_CI(s32 numI) override { // code+internal - s32 numC = (R[15] & 0x2) ? 0 : CodeCycles; - Cycles += numC + numI; + s32 numC = CodeCycles; + numI += 1; + Cycles += std::max(numC, numI); } void AddCycles_CDI_LDR() override; From ea429a1b8d04b53c0e0d9c33bdba8e613de0e88d Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Thu, 4 Jul 2024 12:58:58 -0400 Subject: [PATCH 089/306] improve interlock emulation add cycles to the instruction execution time rather than the timestamp directly. --- src/ARM.cpp | 9 +++++---- src/ARM.h | 10 ++++++---- 2 files changed, 11 insertions(+), 8 deletions(-) diff --git a/src/ARM.cpp b/src/ARM.cpp index f667e0f6..7d5a02c7 100644 --- a/src/ARM.cpp +++ b/src/ARM.cpp @@ -696,6 +696,7 @@ void ARMv5::Execute() NDS.ARM9Timestamp += Cycles; Cycles = 0; + CyclesILed = 0; } if (Halted == 2) @@ -1262,7 +1263,7 @@ bool ARMv4::DataWrite32S(u32 addr, u32 val, bool dataabort) void ARMv5::AddCycles_CD_STR() { s32 numC = (R[15] & 0x2) ? 0 : CodeCycles; - s32 numD = DataCycles; + s32 numD = DataCycles + CyclesILed; s32 early; if (DataRegion == Mem9_ITCM) @@ -1287,7 +1288,7 @@ void ARMv5::AddCycles_CD_STR() void ARMv5::AddCycles_CD_STM() { s32 numC = (R[15] & 0x2) ? 
0 : CodeCycles; - s32 numD = DataCycles; + s32 numD = DataCycles + CyclesILed; s32 early; if (DataRegion == Mem9_ITCM) @@ -1313,7 +1314,7 @@ void ARMv5::AddCycles_CDI_LDR() { // LDR cycles. ARM9 seems to skip the internal cycle here. s32 numC = (R[15] & 0x2) ? 0 : CodeCycles; - s32 numD = DataCycles; + s32 numD = DataCycles + CyclesILed; // if a 32 bit bus, start 2 cycles early; else, start 4 cycles early s32 early; @@ -1340,7 +1341,7 @@ void ARMv5::AddCycles_CDI_LDM() { // LDM cycles. ARM9 seems to skip the internal cycle here. s32 numC = (R[15] & 0x2) ? 0 : CodeCycles; - s32 numD = DataCycles; + s32 numD = DataCycles + CyclesILed; // if a 32 bit bus, start 2 cycles early; else, start 4 cycles early s32 early; diff --git a/src/ARM.h b/src/ARM.h index 20d11ad2..a76a6d09 100644 --- a/src/ARM.h +++ b/src/ARM.h @@ -30,7 +30,7 @@ #include "debug/GdbStub.h" #endif -//#define INTERLOCK +#define INTERLOCK namespace melonDS { @@ -318,14 +318,14 @@ public: { // code only. always nonseq 32-bit for ARM9. 
s32 numC = CodeCycles; - Cycles += numC; + Cycles += std::max(numC, CyclesILed + 1); } void AddCycles_CI(s32 numI) override { // code+internal s32 numC = CodeCycles; - numI += 1; + numI += 1 + CyclesILed; Cycles += std::max(numC, numI); } @@ -340,7 +340,7 @@ public: inline u32 GetReg(const u32 reg, const u32 delay = 0) override { if (InterlockTimestamp[reg] > (Timestamp() + delay)) - Timestamp() = InterlockTimestamp[reg] - delay; + CyclesILed = InterlockTimestamp[reg] - (Timestamp() + delay); return R[reg]; } @@ -417,6 +417,8 @@ public: bool (*GetMemRegion)(u32 addr, bool write, MemRegion* region); + s32 CyclesILed; + #ifdef GDBSTUB_ENABLED u32 ReadMem(u32 addr, int size) override; void WriteMem(u32 addr, int size, u32 v) override; From 0f02c0bbbad3828638ff22f49e57ae6c21e51e2e Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Sat, 6 Jul 2024 12:13:41 -0400 Subject: [PATCH 090/306] disable interlock emulation again again our understanding of how it works is just too incomplete to be worth implementing yet --- src/ARM.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ARM.h b/src/ARM.h index a76a6d09..25889329 100644 --- a/src/ARM.h +++ b/src/ARM.h @@ -30,7 +30,7 @@ #include "debug/GdbStub.h" #endif -#define INTERLOCK +//#define INTERLOCK namespace melonDS { From 383750692e95fb40f60cca0b3208af5414ad53eb Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Sat, 6 Jul 2024 12:38:39 -0400 Subject: [PATCH 091/306] doesn't really matter but idk it's more correct? --- src/ARM.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/ARM.cpp b/src/ARM.cpp index 7d5a02c7..2c56d505 100644 --- a/src/ARM.cpp +++ b/src/ARM.cpp @@ -1262,7 +1262,7 @@ bool ARMv4::DataWrite32S(u32 addr, u32 val, bool dataabort) void ARMv5::AddCycles_CD_STR() { - s32 numC = (R[15] & 0x2) ? 
0 : CodeCycles; + s32 numC = CodeCycles; s32 numD = DataCycles + CyclesILed; s32 early; @@ -1287,7 +1287,7 @@ void ARMv5::AddCycles_CD_STR() void ARMv5::AddCycles_CD_STM() { - s32 numC = (R[15] & 0x2) ? 0 : CodeCycles; + s32 numC = CodeCycles; s32 numD = DataCycles + CyclesILed; s32 early; @@ -1313,7 +1313,7 @@ void ARMv5::AddCycles_CD_STM() void ARMv5::AddCycles_CDI_LDR() { // LDR cycles. ARM9 seems to skip the internal cycle here. - s32 numC = (R[15] & 0x2) ? 0 : CodeCycles; + s32 numC = CodeCycles; s32 numD = DataCycles + CyclesILed; // if a 32 bit bus, start 2 cycles early; else, start 4 cycles early @@ -1340,7 +1340,7 @@ void ARMv5::AddCycles_CDI_LDR() void ARMv5::AddCycles_CDI_LDM() { // LDM cycles. ARM9 seems to skip the internal cycle here. - s32 numC = (R[15] & 0x2) ? 0 : CodeCycles; + s32 numC = CodeCycles; s32 numD = DataCycles + CyclesILed; // if a 32 bit bus, start 2 cycles early; else, start 4 cycles early From e2be0b4f93cfe310a53dfc034a7d4f047a02962e Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Sun, 7 Jul 2024 15:41:14 -0400 Subject: [PATCH 092/306] actually no it was not more correct undo previous commit because actually code cycles *do* matter --- src/ARM.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/ARM.cpp b/src/ARM.cpp index 2c56d505..7d5a02c7 100644 --- a/src/ARM.cpp +++ b/src/ARM.cpp @@ -1262,7 +1262,7 @@ bool ARMv4::DataWrite32S(u32 addr, u32 val, bool dataabort) void ARMv5::AddCycles_CD_STR() { - s32 numC = CodeCycles; + s32 numC = (R[15] & 0x2) ? 0 : CodeCycles; s32 numD = DataCycles + CyclesILed; s32 early; @@ -1287,7 +1287,7 @@ void ARMv5::AddCycles_CD_STR() void ARMv5::AddCycles_CD_STM() { - s32 numC = CodeCycles; + s32 numC = (R[15] & 0x2) ? 0 : CodeCycles; s32 numD = DataCycles + CyclesILed; s32 early; @@ -1313,7 +1313,7 @@ void ARMv5::AddCycles_CD_STM() void ARMv5::AddCycles_CDI_LDR() { // LDR cycles. ARM9 seems to skip the internal cycle here. 
- s32 numC = CodeCycles; + s32 numC = (R[15] & 0x2) ? 0 : CodeCycles; s32 numD = DataCycles + CyclesILed; // if a 32 bit bus, start 2 cycles early; else, start 4 cycles early @@ -1340,7 +1340,7 @@ void ARMv5::AddCycles_CDI_LDR() void ARMv5::AddCycles_CDI_LDM() { // LDM cycles. ARM9 seems to skip the internal cycle here. - s32 numC = CodeCycles; + s32 numC = (R[15] & 0x2) ? 0 : CodeCycles; s32 numD = DataCycles + CyclesILed; // if a 32 bit bus, start 2 cycles early; else, start 4 cycles early From 1fdac1d489089019b1d495ce948e40ab21ab98e8 Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Thu, 11 Jul 2024 16:18:55 -0400 Subject: [PATCH 093/306] ...why am i checking for dtcm? --- src/ARM.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/src/ARM.cpp b/src/ARM.cpp index 7d5a02c7..7e0e9228 100644 --- a/src/ARM.cpp +++ b/src/ARM.cpp @@ -304,7 +304,6 @@ void ARMv5::JumpTo(u32 addr, bool restorecpsr) u32 newregion = addr >> 24; if (addr < ITCMSize) CodeRegion = Mem9_ITCM; - else if ((addr & DTCMMask) == DTCMBase) CodeRegion = Mem9_DTCM; else CodeRegion = NDS.ARM9Regions[addr >> 14]; RegionCodeCycles = MemTimings[addr >> 12][0]; From 038ffa3a3598c03b156dc8626f46738fead16728 Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Thu, 11 Jul 2024 20:06:56 -0400 Subject: [PATCH 094/306] revert the *entire* interlock implemention too slow, not accurate enough. 
we need to do a *lot* more research into the specifics of how this works with all the various aspects of the cpu's timings before we can make a good implementation --- src/ARM.cpp | 21 +- src/ARM.h | 88 +------- src/ARMInterpreter.cpp | 4 +- src/ARMInterpreter_ALU.cpp | 351 +++++++++++++++---------------- src/ARMInterpreter_Branch.cpp | 16 +- src/ARMInterpreter_LoadStore.cpp | 344 ++++++++++-------------------- 6 files changed, 290 insertions(+), 534 deletions(-) diff --git a/src/ARM.cpp b/src/ARM.cpp index 7e0e9228..e1f93a58 100644 --- a/src/ARM.cpp +++ b/src/ARM.cpp @@ -190,8 +190,6 @@ void ARM::Reset() BreakReq = false; #endif - memset(InterlockTimestamp, 0, sizeof(InterlockTimestamp)); - // zorp JumpTo(ExceptionBase); } @@ -695,7 +693,6 @@ void ARMv5::Execute() NDS.ARM9Timestamp += Cycles; Cycles = 0; - CyclesILed = 0; } if (Halted == 2) @@ -1262,7 +1259,7 @@ bool ARMv4::DataWrite32S(u32 addr, u32 val, bool dataabort) void ARMv5::AddCycles_CD_STR() { s32 numC = (R[15] & 0x2) ? 0 : CodeCycles; - s32 numD = DataCycles + CyclesILed; + s32 numD = DataCycles; s32 early; if (DataRegion == Mem9_ITCM) @@ -1287,7 +1284,7 @@ void ARMv5::AddCycles_CD_STR() void ARMv5::AddCycles_CD_STM() { s32 numC = (R[15] & 0x2) ? 0 : CodeCycles; - s32 numD = DataCycles + CyclesILed; + s32 numD = DataCycles; s32 early; if (DataRegion == Mem9_ITCM) @@ -1313,7 +1310,7 @@ void ARMv5::AddCycles_CDI_LDR() { // LDR cycles. ARM9 seems to skip the internal cycle here. s32 numC = (R[15] & 0x2) ? 0 : CodeCycles; - s32 numD = DataCycles + CyclesILed; + s32 numD = DataCycles; // if a 32 bit bus, start 2 cycles early; else, start 4 cycles early s32 early; @@ -1340,7 +1337,7 @@ void ARMv5::AddCycles_CDI_LDM() { // LDM cycles. ARM9 seems to skip the internal cycle here. s32 numC = (R[15] & 0x2) ? 
0 : CodeCycles; - s32 numD = DataCycles + CyclesILed; + s32 numD = DataCycles; // if a 32 bit bus, start 2 cycles early; else, start 4 cycles early s32 early; @@ -1442,16 +1439,6 @@ void ARMv4::AddCycles_CD() } } -u64& ARMv5::Timestamp() -{ - return NDS.ARM9Timestamp; -} - -u64& ARMv4::Timestamp() -{ - return NDS.ARM7Timestamp; -} - u8 ARMv5::BusRead8(u32 addr) { return NDS.ARM9Read8(addr); diff --git a/src/ARM.h b/src/ARM.h index 25889329..dae5d96a 100644 --- a/src/ARM.h +++ b/src/ARM.h @@ -30,8 +30,6 @@ #include "debug/GdbStub.h" #endif -//#define INTERLOCK - namespace melonDS { inline u32 ROR(u32 x, u32 n) @@ -148,46 +146,6 @@ public: virtual void AddCycles_CD_STR() = 0; virtual void AddCycles_CD_STM() = 0; -/* - inline void AddCycles_L(const u32 delay, const u32 reg1) - { - if (InterlockTimestamp[reg1] > Timestamp() + delay); - Timestamp() = InterlockTimestamp[reg1]; - } - - inline void AddCycles_L(const u32 delay, const u32 reg1, const u32 reg2) - { - u64 cycles = std::max(InterlockTimestamp[reg1], InterlockTimestamp[reg2]); - if (cycles > Timestamp() + delay) - Timestamp() = cycles; - } - - inline void AddCycles_L(const u32 delay, const u32 reg1, const u32 reg2, const u32 reg3) - { - u64 cycles = std::max(InterlockTimestamp[reg1], std::max(InterlockTimestamp[reg2], InterlockTimestamp[reg3])); - if (cycles > Timestamp() + delay) - Timestamp() = cycles; - }*/ - -#ifdef INTERLOCK - // fetch the value of a register while handling any interlock cycles - virtual inline u32 GetReg(const u32 reg, const u32 delay = 0) = 0; - - // Must be called after all of an instruction's cycles are calculated!!! - virtual inline void SetCycles_L(const u32 reg, const u32 cycles, const u32 type) = 0; -#else - // fetch the value of a register while handling any interlock cycles - inline u32 GetReg(const u32 reg, const u32 delay = 0) - { - return R[reg]; - } - - // Must be called after all of an instruction's cycles are calculated!!! 
- inline void SetCycles_L(const u32 reg, const u32 cycles, const u32 type) {} -#endif - - virtual u64& Timestamp() = 0; - void CheckGdbIncoming(); u32 Num; @@ -224,15 +182,6 @@ public: MemRegion CodeMem; - enum InterlockType - { - ILT_Norm = 0, - ILT_Mul = 1, - }; - - u8 InterlockType[16]; - u64 InterlockTimestamp[16]; - #ifdef JIT_ENABLED u32 FastBlockLookupStart, FastBlockLookupSize; u64* FastBlockLookup; @@ -318,14 +267,14 @@ public: { // code only. always nonseq 32-bit for ARM9. s32 numC = CodeCycles; - Cycles += std::max(numC, CyclesILed + 1); + Cycles += numC; } void AddCycles_CI(s32 numI) override { // code+internal s32 numC = CodeCycles; - numI += 1 + CyclesILed; + numI += 1; Cycles += std::max(numC, numI); } @@ -334,25 +283,6 @@ public: void AddCycles_CDI_SWP() override { AddCycles_CD_STR(); } // uses the same behavior as str void AddCycles_CD_STR() override; void AddCycles_CD_STM() override; - -#ifdef INTERLOCK - // fetch the value of a register while handling any interlock cycles - inline u32 GetReg(const u32 reg, const u32 delay = 0) override - { - if (InterlockTimestamp[reg] > (Timestamp() + delay)) - CyclesILed = InterlockTimestamp[reg] - (Timestamp() + delay); - return R[reg]; - } - - // Must be called after all of an instruction's cycles are calculated!!! 
- inline void SetCycles_L(const u32 reg, const u32 cycles, const u32 type) override - { - InterlockTimestamp[reg] = cycles + Timestamp() + Cycles; - //InterlockType[reg] = type; - } -#endif - - u64& Timestamp() override; void GetCodeMemRegion(u32 addr, MemRegion* region); @@ -417,8 +347,6 @@ public: bool (*GetMemRegion)(u32 addr, bool write, MemRegion* region); - s32 CyclesILed; - #ifdef GDBSTUB_ENABLED u32 ReadMem(u32 addr, int size) override; void WriteMem(u32 addr, int size, u32 v) override; @@ -476,18 +404,6 @@ public: void AddCycles_CD_STR() override { AddCycles_CD(); } void AddCycles_CD_STM() override { AddCycles_CD(); } -#ifdef INTERLOCK - // fetch the value of a register while handling any interlock cycles - inline u32 GetReg(const u32 reg, const u32 delay = 0) override - { - return R[reg]; - } - - // Must be called after all of an instruction's cycles are calculated!!! - inline void SetCycles_L(const u32 reg, const u32 cycles, const u32 type) override{} -#endif - - u64& Timestamp() override; protected: u8 BusRead8(u32 addr) override; u16 BusRead16(u32 addr) override; diff --git a/src/ARMInterpreter.cpp b/src/ARMInterpreter.cpp index 93b347b5..f9623147 100644 --- a/src/ARMInterpreter.cpp +++ b/src/ARMInterpreter.cpp @@ -163,7 +163,7 @@ void A_MSR_REG(ARM* cpu) if ((cpu->CPSR & 0x1F) == 0x10) mask &= 0xFFFFFF00; - u32 val = cpu->GetReg(cpu->CurInstr & 0xF, 1); + u32 val = cpu->R[cpu->CurInstr & 0xF]; // bit4 is forced to 1 val |= 0x00000010; @@ -216,7 +216,7 @@ void A_MCR(ARM* cpu) u32 cn = (cpu->CurInstr >> 16) & 0xF; u32 cm = cpu->CurInstr & 0xF; u32 cpinfo = (cpu->CurInstr >> 5) & 0x7; - u32 val = cpu->GetReg((cpu->CurInstr>>12)&0xF); + u32 val = cpu->R[(cpu->CurInstr>>12)&0xF]; if (((cpu->CurInstr>>12) & 0xF) == 15) val += 4; if (cpu->Num==0 && cp==15) diff --git a/src/ARMInterpreter_ALU.cpp b/src/ARMInterpreter_ALU.cpp index e3208668..bc655996 100644 --- a/src/ARMInterpreter_ALU.cpp +++ b/src/ARMInterpreter_ALU.cpp @@ -160,14 +160,14 @@ inline bool 
OverflowSbc(u32 a, u32 b, u32 carry) cpu->SetC(b & 0x80000000); #define A_CALC_OP2_REG_SHIFT_IMM(shiftop) \ - u32 b = cpu->GetReg(cpu->CurInstr&0xF); \ + u32 b = cpu->R[cpu->CurInstr&0xF]; \ u32 s = (cpu->CurInstr>>7)&0x1F; \ shiftop(b, s); #define A_CALC_OP2_REG_SHIFT_REG(shiftop) \ - u32 b = cpu->GetReg(cpu->CurInstr&0xF); \ + u32 b = cpu->R[cpu->CurInstr&0xF]; \ if ((cpu->CurInstr&0xF)==15) b += 4; \ - shiftop(b, (cpu->GetReg((cpu->CurInstr>>8)&0xF) & 0xFF)); + shiftop(b, (cpu->R[(cpu->CurInstr>>8)&0xF] & 0xFF)); #define A_IMPLEMENT_ALU_OP(x,s) \ @@ -313,7 +313,7 @@ void A_##x##_REG_ROR_REG(ARM* cpu) \ #define A_AND(c) \ - u32 a = cpu->GetReg((cpu->CurInstr>>16) & 0xF); \ + u32 a = cpu->R[(cpu->CurInstr>>16) & 0xF]; \ u32 res = a & b; \ if (c) cpu->AddCycles_CI(c); else cpu->AddCycles_C(); \ if (((cpu->CurInstr>>12) & 0xF) == 15) \ @@ -326,7 +326,7 @@ void A_##x##_REG_ROR_REG(ARM* cpu) \ } #define A_AND_S(c) \ - u32 a = cpu->GetReg((cpu->CurInstr>>16) & 0xF); \ + u32 a = cpu->R[(cpu->CurInstr>>16) & 0xF]; \ u32 res = a & b; \ cpu->SetNZ(res & 0x80000000, \ !res); \ @@ -344,7 +344,7 @@ A_IMPLEMENT_ALU_OP(AND,_S) #define A_EOR(c) \ - u32 a = cpu->GetReg((cpu->CurInstr>>16) & 0xF); \ + u32 a = cpu->R[(cpu->CurInstr>>16) & 0xF]; \ u32 res = a ^ b; \ if (c) cpu->AddCycles_CI(c); else cpu->AddCycles_C(); \ if (((cpu->CurInstr>>12) & 0xF) == 15) \ @@ -357,7 +357,7 @@ A_IMPLEMENT_ALU_OP(AND,_S) } #define A_EOR_S(c) \ - u32 a = cpu->GetReg((cpu->CurInstr>>16) & 0xF); \ + u32 a = cpu->R[(cpu->CurInstr>>16) & 0xF]; \ u32 res = a ^ b; \ cpu->SetNZ(res & 0x80000000, \ !res); \ @@ -375,7 +375,7 @@ A_IMPLEMENT_ALU_OP(EOR,_S) #define A_SUB(c) \ - u32 a = cpu->GetReg((cpu->CurInstr>>16) & 0xF); \ + u32 a = cpu->R[(cpu->CurInstr>>16) & 0xF]; \ u32 res = a - b; \ if (c) cpu->AddCycles_CI(c); else cpu->AddCycles_C(); \ if (((cpu->CurInstr>>12) & 0xF) == 15) \ @@ -388,7 +388,7 @@ A_IMPLEMENT_ALU_OP(EOR,_S) } #define A_SUB_S(c) \ - u32 a = cpu->GetReg((cpu->CurInstr>>16) & 0xF); \ + 
u32 a = cpu->R[(cpu->CurInstr>>16) & 0xF]; \ u32 res = a - b; \ cpu->SetNZCV(res & 0x80000000, \ !res, \ @@ -408,7 +408,7 @@ A_IMPLEMENT_ALU_OP(SUB,) #define A_RSB(c) \ - u32 a = cpu->GetReg((cpu->CurInstr>>16) & 0xF); \ + u32 a = cpu->R[(cpu->CurInstr>>16) & 0xF]; \ u32 res = b - a; \ if (c) cpu->AddCycles_CI(c); else cpu->AddCycles_C(); \ if (((cpu->CurInstr>>12) & 0xF) == 15) \ @@ -421,7 +421,7 @@ A_IMPLEMENT_ALU_OP(SUB,) } #define A_RSB_S(c) \ - u32 a = cpu->GetReg((cpu->CurInstr>>16) & 0xF); \ + u32 a = cpu->R[(cpu->CurInstr>>16) & 0xF]; \ u32 res = b - a; \ cpu->SetNZCV(res & 0x80000000, \ !res, \ @@ -441,7 +441,7 @@ A_IMPLEMENT_ALU_OP(RSB,) #define A_ADD(c) \ - u32 a = cpu->GetReg((cpu->CurInstr>>16) & 0xF); \ + u32 a = cpu->R[(cpu->CurInstr>>16) & 0xF]; \ u32 res = a + b; \ if (c) cpu->AddCycles_CI(c); else cpu->AddCycles_C(); \ if (((cpu->CurInstr>>12) & 0xF) == 15) \ @@ -454,7 +454,7 @@ A_IMPLEMENT_ALU_OP(RSB,) } #define A_ADD_S(c) \ - u32 a = cpu->GetReg((cpu->CurInstr>>16) & 0xF); \ + u32 a = cpu->R[(cpu->CurInstr>>16) & 0xF]; \ u32 res = a + b; \ cpu->SetNZCV(res & 0x80000000, \ !res, \ @@ -474,7 +474,7 @@ A_IMPLEMENT_ALU_OP(ADD,) #define A_ADC(c) \ - u32 a = cpu->GetReg((cpu->CurInstr>>16) & 0xF); \ + u32 a = cpu->R[(cpu->CurInstr>>16) & 0xF]; \ u32 res = a + b + (cpu->CPSR&0x20000000 ? 1:0); \ if (c) cpu->AddCycles_CI(c); else cpu->AddCycles_C(); \ if (((cpu->CurInstr>>12) & 0xF) == 15) \ @@ -487,7 +487,7 @@ A_IMPLEMENT_ALU_OP(ADD,) } #define A_ADC_S(c) \ - u32 a = cpu->GetReg((cpu->CurInstr>>16) & 0xF); \ + u32 a = cpu->R[(cpu->CurInstr>>16) & 0xF]; \ u32 res_tmp = a + b; \ u32 carry = (cpu->CPSR&0x20000000 ? 1:0); \ u32 res = res_tmp + carry; \ @@ -509,7 +509,7 @@ A_IMPLEMENT_ALU_OP(ADC,) #define A_SBC(c) \ - u32 a = cpu->GetReg((cpu->CurInstr>>16) & 0xF); \ + u32 a = cpu->R[(cpu->CurInstr>>16) & 0xF]; \ u32 res = a - b - (cpu->CPSR&0x20000000 ? 
0:1); \ if (c) cpu->AddCycles_CI(c); else cpu->AddCycles_C(); \ if (((cpu->CurInstr>>12) & 0xF) == 15) \ @@ -522,7 +522,7 @@ A_IMPLEMENT_ALU_OP(ADC,) } #define A_SBC_S(c) \ - u32 a = cpu->GetReg((cpu->CurInstr>>16) & 0xF); \ + u32 a = cpu->R[(cpu->CurInstr>>16) & 0xF]; \ u32 res_tmp = a - b; \ u32 carry = (cpu->CPSR&0x20000000 ? 0:1); \ u32 res = res_tmp - carry; \ @@ -544,7 +544,7 @@ A_IMPLEMENT_ALU_OP(SBC,) #define A_RSC(c) \ - u32 a = cpu->GetReg((cpu->CurInstr>>16) & 0xF); \ + u32 a = cpu->R[(cpu->CurInstr>>16) & 0xF]; \ u32 res = b - a - (cpu->CPSR&0x20000000 ? 0:1); \ if (c) cpu->AddCycles_CI(c); else cpu->AddCycles_C(); \ if (((cpu->CurInstr>>12) & 0xF) == 15) \ @@ -557,7 +557,7 @@ A_IMPLEMENT_ALU_OP(SBC,) } #define A_RSC_S(c) \ - u32 a = cpu->GetReg((cpu->CurInstr>>16) & 0xF); \ + u32 a = cpu->R[(cpu->CurInstr>>16) & 0xF]; \ u32 res_tmp = b - a; \ u32 carry = (cpu->CPSR&0x20000000 ? 0:1); \ u32 res = res_tmp - carry; \ @@ -579,7 +579,7 @@ A_IMPLEMENT_ALU_OP(RSC,) #define A_TST(c) \ - u32 a = cpu->GetReg((cpu->CurInstr>>16) & 0xF); \ + u32 a = cpu->R[(cpu->CurInstr>>16) & 0xF]; \ u32 res = a & b; \ cpu->SetNZ(res & 0x80000000, \ !res); \ @@ -589,7 +589,7 @@ A_IMPLEMENT_ALU_TEST(TST,_S) #define A_TEQ(c) \ - u32 a = cpu->GetReg((cpu->CurInstr>>16) & 0xF); \ + u32 a = cpu->R[(cpu->CurInstr>>16) & 0xF]; \ u32 res = a ^ b; \ cpu->SetNZ(res & 0x80000000, \ !res); \ @@ -599,7 +599,7 @@ A_IMPLEMENT_ALU_TEST(TEQ,_S) #define A_CMP(c) \ - u32 a = cpu->GetReg((cpu->CurInstr>>16) & 0xF); \ + u32 a = cpu->R[(cpu->CurInstr>>16) & 0xF]; \ u32 res = a - b; \ cpu->SetNZCV(res & 0x80000000, \ !res, \ @@ -611,7 +611,7 @@ A_IMPLEMENT_ALU_TEST(CMP,) #define A_CMN(c) \ - u32 a = cpu->GetReg((cpu->CurInstr>>16) & 0xF); \ + u32 a = cpu->R[(cpu->CurInstr>>16) & 0xF]; \ u32 res = a + b; \ cpu->SetNZCV(res & 0x80000000, \ !res, \ @@ -623,7 +623,7 @@ A_IMPLEMENT_ALU_TEST(CMN,) #define A_ORR(c) \ - u32 a = cpu->GetReg((cpu->CurInstr>>16) & 0xF); \ + u32 a = cpu->R[(cpu->CurInstr>>16) & 
0xF]; \ u32 res = a | b; \ if (c) cpu->AddCycles_CI(c); else cpu->AddCycles_C(); \ if (((cpu->CurInstr>>12) & 0xF) == 15) \ @@ -636,7 +636,7 @@ A_IMPLEMENT_ALU_TEST(CMN,) } #define A_ORR_S(c) \ - u32 a = cpu->GetReg((cpu->CurInstr>>16) & 0xF); \ + u32 a = cpu->R[(cpu->CurInstr>>16) & 0xF]; \ u32 res = a | b; \ cpu->SetNZ(res & 0x80000000, \ !res); \ @@ -699,7 +699,7 @@ void A_MOV_REG_LSL_IMM_DBG(ARM* cpu) #define A_BIC(c) \ - u32 a = cpu->GetReg((cpu->CurInstr>>16) & 0xF); \ + u32 a = cpu->R[(cpu->CurInstr>>16) & 0xF]; \ u32 res = a & ~b; \ if (c) cpu->AddCycles_CI(c); else cpu->AddCycles_C(); \ if (((cpu->CurInstr>>12) & 0xF) == 15) \ @@ -712,7 +712,7 @@ void A_MOV_REG_LSL_IMM_DBG(ARM* cpu) } #define A_BIC_S(c) \ - u32 a = cpu->GetReg((cpu->CurInstr>>16) & 0xF); \ + u32 a = cpu->R[(cpu->CurInstr>>16) & 0xF]; \ u32 res = a & ~b; \ cpu->SetNZ(res & 0x80000000, \ !res); \ @@ -761,12 +761,18 @@ A_IMPLEMENT_ALU_OP(MVN,_S) void A_MUL(ARM* cpu) { - u32 rm = cpu->GetReg(cpu->CurInstr & 0xF); - u32 rs = cpu->GetReg((cpu->CurInstr >> 8) & 0xF); + u32 rm = cpu->R[cpu->CurInstr & 0xF]; + u32 rs = cpu->R[(cpu->CurInstr >> 8) & 0xF]; u32 res = rm * rs; cpu->R[(cpu->CurInstr >> 16) & 0xF] = res; + if (cpu->CurInstr & (1<<20)) + { + cpu->SetNZ(res & 0x80000000, + !res); + if (cpu->Num==1) cpu->SetC(0); + } u32 cycles; if (cpu->Num == 0) @@ -780,55 +786,53 @@ void A_MUL(ARM* cpu) } cpu->AddCycles_CI(cycles); - if (cpu->CurInstr & (1<<20)) - { - cpu->SetNZ(res & 0x80000000, - !res); - if (cpu->Num==1) cpu->SetC(0); - } - else cpu->SetCycles_L((cpu->CurInstr >> 16) & 0xF, 1, cpu->ILT_Mul); // interlock cycles do not occur with S variants of multiply instructions } void A_MLA(ARM* cpu) { - u32 rm = cpu->GetReg(cpu->CurInstr & 0xF); - u32 rs = cpu->GetReg((cpu->CurInstr >> 8) & 0xF); - u32 rn = cpu->GetReg((cpu->CurInstr >> 12) & 0xF); + u32 rm = cpu->R[cpu->CurInstr & 0xF]; + u32 rs = cpu->R[(cpu->CurInstr >> 8) & 0xF]; + u32 rn = cpu->R[(cpu->CurInstr >> 12) & 0xF]; u32 res = (rm * 
rs) + rn; cpu->R[(cpu->CurInstr >> 16) & 0xF] = res; - - u32 cycles; - if (cpu->Num == 0) - cycles = (cpu->CurInstr & (1<<20)) ? 3 : 1; - else - { - if ((rs & 0xFFFFFF00) == 0x00000000 || (rs & 0xFFFFFF00) == 0xFFFFFF00) cycles = 2; - else if ((rs & 0xFFFF0000) == 0x00000000 || (rs & 0xFFFF0000) == 0xFFFF0000) cycles = 3; - else if ((rs & 0xFF000000) == 0x00000000 || (rs & 0xFF000000) == 0xFF000000) cycles = 4; - else cycles = 5; - } - - cpu->AddCycles_CI(cycles); if (cpu->CurInstr & (1<<20)) { cpu->SetNZ(res & 0x80000000, !res); if (cpu->Num==1) cpu->SetC(0); } - else cpu->SetCycles_L((cpu->CurInstr >> 16) & 0xF, 1, cpu->ILT_Mul); // interlock cycles do not occur with S variants of multiply instructions + + u32 cycles; + if (cpu->Num == 0) + cycles = (cpu->CurInstr & (1<<20)) ? 3 : 1; + else + { + if ((rs & 0xFFFFFF00) == 0x00000000 || (rs & 0xFFFFFF00) == 0xFFFFFF00) cycles = 2; + else if ((rs & 0xFFFF0000) == 0x00000000 || (rs & 0xFFFF0000) == 0xFFFF0000) cycles = 3; + else if ((rs & 0xFF000000) == 0x00000000 || (rs & 0xFF000000) == 0xFF000000) cycles = 4; + else cycles = 5; + } + + cpu->AddCycles_CI(cycles); } void A_UMULL(ARM* cpu) { - u32 rm = cpu->GetReg(cpu->CurInstr & 0xF, 1); - u32 rs = cpu->GetReg((cpu->CurInstr >> 8) & 0xF, 1); + u32 rm = cpu->R[cpu->CurInstr & 0xF]; + u32 rs = cpu->R[(cpu->CurInstr >> 8) & 0xF]; u64 res = (u64)rm * (u64)rs; cpu->R[(cpu->CurInstr >> 12) & 0xF] = (u32)res; cpu->R[(cpu->CurInstr >> 16) & 0xF] = (u32)(res >> 32ULL); + if (cpu->CurInstr & (1<<20)) + { + cpu->SetNZ((u32)(res >> 63ULL), + !res); + if (cpu->Num==1) cpu->SetC(0); + } u32 cycles; if (cpu->Num == 0) @@ -842,27 +846,26 @@ void A_UMULL(ARM* cpu) } cpu->AddCycles_CI(cycles); - if (cpu->CurInstr & (1<<20)) - { - cpu->SetNZ((u32)(res >> 63ULL), - !res); - if (cpu->Num==1) cpu->SetC(0); - } - else cpu->SetCycles_L((cpu->CurInstr >> 16) & 0xF, 1, cpu->ILT_Mul); // interlock cycles do not occur with S variants of multiply instructions } void A_UMLAL(ARM* cpu) { - u32 rm 
= cpu->GetReg(cpu->CurInstr & 0xF, 1); - u32 rs = cpu->GetReg((cpu->CurInstr >> 8) & 0xF, 1); + u32 rm = cpu->R[cpu->CurInstr & 0xF]; + u32 rs = cpu->R[(cpu->CurInstr >> 8) & 0xF]; u64 res = (u64)rm * (u64)rs; - u64 rd = (u64)cpu->GetReg((cpu->CurInstr >> 12) & 0xF, 1) | ((u64)cpu->GetReg((cpu->CurInstr >> 16) & 0xF) << 32ULL); + u64 rd = (u64)cpu->R[(cpu->CurInstr >> 12) & 0xF] | ((u64)cpu->R[(cpu->CurInstr >> 16) & 0xF] << 32ULL); res += rd; cpu->R[(cpu->CurInstr >> 12) & 0xF] = (u32)res; cpu->R[(cpu->CurInstr >> 16) & 0xF] = (u32)(res >> 32ULL); + if (cpu->CurInstr & (1<<20)) + { + cpu->SetNZ((u32)(res >> 63ULL), + !res); + if (cpu->Num==1) cpu->SetC(0); + } u32 cycles; if (cpu->Num == 0) @@ -876,24 +879,23 @@ void A_UMLAL(ARM* cpu) } cpu->AddCycles_CI(cycles); - if (cpu->CurInstr & (1<<20)) - { - cpu->SetNZ((u32)(res >> 63ULL), - !res); - if (cpu->Num==1) cpu->SetC(0); - } - else cpu->SetCycles_L((cpu->CurInstr >> 16) & 0xF, 1, cpu->ILT_Mul); // interlock cycles do not occur with S variants of multiply instructions } void A_SMULL(ARM* cpu) { - u32 rm = cpu->GetReg(cpu->CurInstr & 0xF, 1); - u32 rs = cpu->GetReg((cpu->CurInstr >> 8) & 0xF, 1); + u32 rm = cpu->R[cpu->CurInstr & 0xF]; + u32 rs = cpu->R[(cpu->CurInstr >> 8) & 0xF]; s64 res = (s64)(s32)rm * (s64)(s32)rs; cpu->R[(cpu->CurInstr >> 12) & 0xF] = (u32)res; cpu->R[(cpu->CurInstr >> 16) & 0xF] = (u32)(res >> 32ULL); + if (cpu->CurInstr & (1<<20)) + { + cpu->SetNZ((u32)(res >> 63ULL), + !res); + if (cpu->Num==1) cpu->SetC(0); + } u32 cycles; if (cpu->Num == 0) @@ -907,27 +909,26 @@ void A_SMULL(ARM* cpu) } cpu->AddCycles_CI(cycles); - if (cpu->CurInstr & (1<<20)) - { - cpu->SetNZ((u32)(res >> 63ULL), - !res); - if (cpu->Num==1) cpu->SetC(0); - } - else cpu->SetCycles_L((cpu->CurInstr >> 16) & 0xF, 1, cpu->ILT_Mul); // interlock cycles do not occur with S variants of multiply instructions } void A_SMLAL(ARM* cpu) { - u32 rm = cpu->GetReg(cpu->CurInstr & 0xF, 1); - u32 rs = cpu->GetReg((cpu->CurInstr >> 8) & 
0xF, 1); + u32 rm = cpu->R[cpu->CurInstr & 0xF]; + u32 rs = cpu->R[(cpu->CurInstr >> 8) & 0xF]; s64 res = (s64)(s32)rm * (s64)(s32)rs; - s64 rd = (s64)((u64)cpu->GetReg((cpu->CurInstr >> 12) & 0xF, 1) | ((u64)cpu->GetReg((cpu->CurInstr >> 16) & 0xF) << 32ULL)); + s64 rd = (s64)((u64)cpu->R[(cpu->CurInstr >> 12) & 0xF] | ((u64)cpu->R[(cpu->CurInstr >> 16) & 0xF] << 32ULL)); res += rd; cpu->R[(cpu->CurInstr >> 12) & 0xF] = (u32)res; cpu->R[(cpu->CurInstr >> 16) & 0xF] = (u32)(res >> 32ULL); + if (cpu->CurInstr & (1<<20)) + { + cpu->SetNZ((u32)(res >> 63ULL), + !res); + if (cpu->Num==1) cpu->SetC(0); + } u32 cycles; if (cpu->Num == 0) @@ -939,24 +940,17 @@ void A_SMLAL(ARM* cpu) else if ((rs & 0xFF000000) == 0x00000000 || (rs & 0xFF000000) == 0xFF000000) cycles = 4; else cycles = 5; } - + cpu->AddCycles_CI(cycles); - if (cpu->CurInstr & (1<<20)) - { - cpu->SetNZ((u32)(res >> 63ULL), - !res); - if (cpu->Num==1) cpu->SetC(0); - } - else cpu->SetCycles_L((cpu->CurInstr >> 16) & 0xF, 1, cpu->ILT_Mul); // interlock cycles do not occur with S variants of multiply instructions } void A_SMLAxy(ARM* cpu) { if (cpu->Num != 0) return; - u32 rm = cpu->GetReg(cpu->CurInstr & 0xF, 1); - u32 rs = cpu->GetReg((cpu->CurInstr >> 8) & 0xF, 1); - u32 rn = cpu->GetReg((cpu->CurInstr >> 12) & 0xF, 1); + u32 rm = cpu->R[cpu->CurInstr & 0xF]; + u32 rs = cpu->R[(cpu->CurInstr >> 8) & 0xF]; + u32 rn = cpu->R[(cpu->CurInstr >> 12) & 0xF]; if (cpu->CurInstr & (1<<5)) rm >>= 16; else rm &= 0xFFFF; @@ -970,17 +964,16 @@ void A_SMLAxy(ARM* cpu) if (OverflowAdd(res_mul, rn)) cpu->CPSR |= 0x08000000; - cpu->AddCycles_C(); - cpu->SetCycles_L((cpu->CurInstr >> 16) & 0xF, 1, cpu->ILT_Norm); + cpu->AddCycles_C(); // TODO: interlock?? 
} void A_SMLAWy(ARM* cpu) { if (cpu->Num != 0) return; - u32 rm = cpu->GetReg(cpu->CurInstr & 0xF, 1); - u32 rs = cpu->GetReg((cpu->CurInstr >> 8) & 0xF, 1); - u32 rn = cpu->GetReg((cpu->CurInstr >> 12) & 0xF, 1); + u32 rm = cpu->R[cpu->CurInstr & 0xF]; + u32 rs = cpu->R[(cpu->CurInstr >> 8) & 0xF]; + u32 rn = cpu->R[(cpu->CurInstr >> 12) & 0xF]; if (cpu->CurInstr & (1<<6)) rs >>= 16; else rs &= 0xFFFF; @@ -992,16 +985,15 @@ void A_SMLAWy(ARM* cpu) if (OverflowAdd(res_mul, rn)) cpu->CPSR |= 0x08000000; - cpu->AddCycles_C(); - cpu->SetCycles_L((cpu->CurInstr >> 16) & 0xF, 1, cpu->ILT_Norm); + cpu->AddCycles_C(); // TODO: interlock?? } void A_SMULxy(ARM* cpu) { if (cpu->Num != 0) return; - u32 rm = cpu->GetReg(cpu->CurInstr & 0xF, 1); - u32 rs = cpu->GetReg((cpu->CurInstr >> 8) & 0xF, 1); + u32 rm = cpu->R[cpu->CurInstr & 0xF]; + u32 rs = cpu->R[(cpu->CurInstr >> 8) & 0xF]; if (cpu->CurInstr & (1<<5)) rm >>= 16; else rm &= 0xFFFF; @@ -1011,16 +1003,15 @@ void A_SMULxy(ARM* cpu) u32 res = ((s16)rm * (s16)rs); cpu->R[(cpu->CurInstr >> 16) & 0xF] = res; - cpu->AddCycles_C(); - cpu->SetCycles_L((cpu->CurInstr >> 16) & 0xF, 1, cpu->ILT_Norm); + cpu->AddCycles_C(); // TODO: interlock?? } void A_SMULWy(ARM* cpu) { if (cpu->Num != 0) return; - u32 rm = cpu->GetReg(cpu->CurInstr & 0xF, 1); - u32 rs = cpu->GetReg((cpu->CurInstr >> 8) & 0xF, 1); + u32 rm = cpu->R[cpu->CurInstr & 0xF]; + u32 rs = cpu->R[(cpu->CurInstr >> 8) & 0xF]; if (cpu->CurInstr & (1<<6)) rs >>= 16; else rs &= 0xFFFF; @@ -1028,16 +1019,15 @@ void A_SMULWy(ARM* cpu) u32 res = ((s64)(s32)rm * (s16)rs) >> 16; cpu->R[(cpu->CurInstr >> 16) & 0xF] = res; - cpu->AddCycles_C(); - cpu->SetCycles_L((cpu->CurInstr >> 16) & 0xF, 1, cpu->ILT_Norm); + cpu->AddCycles_C(); // TODO: interlock?? 
} void A_SMLALxy(ARM* cpu) { if (cpu->Num != 0) return; - u32 rm = cpu->GetReg(cpu->CurInstr & 0xF, 0); - u32 rs = cpu->GetReg((cpu->CurInstr >> 8) & 0xF, 0); // yeah this one actually doesn't need two interlock cycles to interlock + u32 rm = cpu->R[cpu->CurInstr & 0xF]; + u32 rs = cpu->R[(cpu->CurInstr >> 8) & 0xF]; if (cpu->CurInstr & (1<<5)) rm >>= 16; else rm &= 0xFFFF; @@ -1052,8 +1042,7 @@ void A_SMLALxy(ARM* cpu) cpu->R[(cpu->CurInstr >> 12) & 0xF] = (u32)res; cpu->R[(cpu->CurInstr >> 16) & 0xF] = (u32)(res >> 32ULL); - cpu->AddCycles_CI(1); - cpu->SetCycles_L((cpu->CurInstr >> 16) & 0xF, 1, cpu->ILT_Norm); + cpu->AddCycles_CI(1); // TODO: interlock?? } @@ -1062,7 +1051,7 @@ void A_CLZ(ARM* cpu) { if (cpu->Num != 0) return A_UNK(cpu); - u32 val = cpu->GetReg(cpu->CurInstr & 0xF, 1); + u32 val = cpu->R[cpu->CurInstr & 0xF]; u32 res = 0; while ((val & 0xFF000000) == 0) @@ -1087,8 +1076,8 @@ void A_QADD(ARM* cpu) { if (cpu->Num != 0) return A_UNK(cpu); - u32 rm = cpu->GetReg(cpu->CurInstr & 0xF, 1); - u32 rn = cpu->GetReg((cpu->CurInstr >> 16) & 0xF, 1); + u32 rm = cpu->R[cpu->CurInstr & 0xF]; + u32 rn = cpu->R[(cpu->CurInstr >> 16) & 0xF]; u32 res = rm + rn; if (OverflowAdd(rm, rn)) @@ -1098,16 +1087,15 @@ void A_QADD(ARM* cpu) } cpu->R[(cpu->CurInstr >> 12) & 0xF] = res; - cpu->AddCycles_C(); - cpu->SetCycles_L((cpu->CurInstr >> 12) & 0xF, 1, cpu->ILT_Norm); + cpu->AddCycles_C(); // TODO: interlock?? } void A_QSUB(ARM* cpu) { if (cpu->Num != 0) return A_UNK(cpu); - u32 rm = cpu->GetReg(cpu->CurInstr & 0xF, 1); - u32 rn = cpu->GetReg((cpu->CurInstr >> 16) & 0xF, 1); + u32 rm = cpu->R[cpu->CurInstr & 0xF]; + u32 rn = cpu->R[(cpu->CurInstr >> 16) & 0xF]; u32 res = rm - rn; if (OverflowSub(rm, rn)) @@ -1117,16 +1105,15 @@ void A_QSUB(ARM* cpu) } cpu->R[(cpu->CurInstr >> 12) & 0xF] = res; - cpu->AddCycles_C(); - cpu->SetCycles_L((cpu->CurInstr >> 12) & 0xF, 1, cpu->ILT_Norm); + cpu->AddCycles_C(); // TODO: interlock?? 
} void A_QDADD(ARM* cpu) { if (cpu->Num != 0) return A_UNK(cpu); - u32 rm = cpu->GetReg(cpu->CurInstr & 0xF, 1); - u32 rn = cpu->GetReg((cpu->CurInstr >> 16) & 0xF, 1); + u32 rm = cpu->R[cpu->CurInstr & 0xF]; + u32 rn = cpu->R[(cpu->CurInstr >> 16) & 0xF]; if (OverflowAdd(rn, rn)) { @@ -1144,16 +1131,15 @@ void A_QDADD(ARM* cpu) } cpu->R[(cpu->CurInstr >> 12) & 0xF] = res; - cpu->AddCycles_C(); - cpu->SetCycles_L((cpu->CurInstr >> 12) & 0xF, 1, cpu->ILT_Norm); + cpu->AddCycles_C(); // TODO: interlock?? } void A_QDSUB(ARM* cpu) { if (cpu->Num != 0) return A_UNK(cpu); - u32 rm = cpu->GetReg(cpu->CurInstr & 0xF, 1); - u32 rn = cpu->GetReg((cpu->CurInstr >> 16) & 0xF, 1); + u32 rm = cpu->R[cpu->CurInstr & 0xF]; + u32 rn = cpu->R[(cpu->CurInstr >> 16) & 0xF]; if (OverflowAdd(rn, rn)) { @@ -1171,8 +1157,7 @@ void A_QDSUB(ARM* cpu) } cpu->R[(cpu->CurInstr >> 12) & 0xF] = res; - cpu->AddCycles_C(); - cpu->SetCycles_L((cpu->CurInstr >> 12) & 0xF, 1, cpu->ILT_Norm); + cpu->AddCycles_C(); // TODO: interlock?? 
} @@ -1183,7 +1168,7 @@ void A_QDSUB(ARM* cpu) void T_LSL_IMM(ARM* cpu) { - u32 op = cpu->GetReg((cpu->CurInstr >> 3) & 0x7); + u32 op = cpu->R[(cpu->CurInstr >> 3) & 0x7]; u32 s = (cpu->CurInstr >> 6) & 0x1F; LSL_IMM_S(op, s); cpu->R[cpu->CurInstr & 0x7] = op; @@ -1194,7 +1179,7 @@ void T_LSL_IMM(ARM* cpu) void T_LSR_IMM(ARM* cpu) { - u32 op = cpu->GetReg((cpu->CurInstr >> 3) & 0x7); + u32 op = cpu->R[(cpu->CurInstr >> 3) & 0x7]; u32 s = (cpu->CurInstr >> 6) & 0x1F; LSR_IMM_S(op, s); cpu->R[cpu->CurInstr & 0x7] = op; @@ -1205,7 +1190,7 @@ void T_LSR_IMM(ARM* cpu) void T_ASR_IMM(ARM* cpu) { - u32 op = cpu->GetReg((cpu->CurInstr >> 3) & 0x7); + u32 op = cpu->R[(cpu->CurInstr >> 3) & 0x7]; u32 s = (cpu->CurInstr >> 6) & 0x1F; ASR_IMM_S(op, s); cpu->R[cpu->CurInstr & 0x7] = op; @@ -1216,8 +1201,8 @@ void T_ASR_IMM(ARM* cpu) void T_ADD_REG_(ARM* cpu) { - u32 a = cpu->GetReg((cpu->CurInstr >> 3) & 0x7); - u32 b = cpu->GetReg((cpu->CurInstr >> 6) & 0x7); + u32 a = cpu->R[(cpu->CurInstr >> 3) & 0x7]; + u32 b = cpu->R[(cpu->CurInstr >> 6) & 0x7]; u32 res = a + b; cpu->R[cpu->CurInstr & 0x7] = res; cpu->SetNZCV(res & 0x80000000, @@ -1229,8 +1214,8 @@ void T_ADD_REG_(ARM* cpu) void T_SUB_REG_(ARM* cpu) { - u32 a = cpu->GetReg((cpu->CurInstr >> 3) & 0x7); - u32 b = cpu->GetReg((cpu->CurInstr >> 6) & 0x7); + u32 a = cpu->R[(cpu->CurInstr >> 3) & 0x7]; + u32 b = cpu->R[(cpu->CurInstr >> 6) & 0x7]; u32 res = a - b; cpu->R[cpu->CurInstr & 0x7] = res; cpu->SetNZCV(res & 0x80000000, @@ -1242,7 +1227,7 @@ void T_SUB_REG_(ARM* cpu) void T_ADD_IMM_(ARM* cpu) { - u32 a = cpu->GetReg((cpu->CurInstr >> 3) & 0x7); + u32 a = cpu->R[(cpu->CurInstr >> 3) & 0x7]; u32 b = (cpu->CurInstr >> 6) & 0x7; u32 res = a + b; cpu->R[cpu->CurInstr & 0x7] = res; @@ -1255,7 +1240,7 @@ void T_ADD_IMM_(ARM* cpu) void T_SUB_IMM_(ARM* cpu) { - u32 a = cpu->GetReg((cpu->CurInstr >> 3) & 0x7); + u32 a = cpu->R[(cpu->CurInstr >> 3) & 0x7]; u32 b = (cpu->CurInstr >> 6) & 0x7; u32 res = a - b; cpu->R[cpu->CurInstr 
& 0x7] = res; @@ -1275,9 +1260,9 @@ void T_MOV_IMM(ARM* cpu) cpu->AddCycles_C(); } -void T_CMP_IMM(ARM* cpu) +void T_CMP_IMM(ARM* cpu) { - u32 a = cpu->GetReg((cpu->CurInstr >> 8) & 0x7); + u32 a = cpu->R[(cpu->CurInstr >> 8) & 0x7]; u32 b = cpu->CurInstr & 0xFF; u32 res = a - b; cpu->SetNZCV(res & 0x80000000, @@ -1289,7 +1274,7 @@ void T_CMP_IMM(ARM* cpu) void T_ADD_IMM(ARM* cpu) { - u32 a = cpu->GetReg((cpu->CurInstr >> 8) & 0x7); + u32 a = cpu->R[(cpu->CurInstr >> 8) & 0x7]; u32 b = cpu->CurInstr & 0xFF; u32 res = a + b; cpu->R[(cpu->CurInstr >> 8) & 0x7] = res; @@ -1302,7 +1287,7 @@ void T_ADD_IMM(ARM* cpu) void T_SUB_IMM(ARM* cpu) { - u32 a = cpu->GetReg((cpu->CurInstr >> 8) & 0x7); + u32 a = cpu->R[(cpu->CurInstr >> 8) & 0x7]; u32 b = cpu->CurInstr & 0xFF; u32 res = a - b; cpu->R[(cpu->CurInstr >> 8) & 0x7] = res; @@ -1316,8 +1301,8 @@ void T_SUB_IMM(ARM* cpu) void T_AND_REG(ARM* cpu) { - u32 a = cpu->GetReg(cpu->CurInstr & 0x7); - u32 b = cpu->GetReg((cpu->CurInstr >> 3) & 0x7); + u32 a = cpu->R[cpu->CurInstr & 0x7]; + u32 b = cpu->R[(cpu->CurInstr >> 3) & 0x7]; u32 res = a & b; cpu->R[cpu->CurInstr & 0x7] = res; cpu->SetNZ(res & 0x80000000, @@ -1327,8 +1312,8 @@ void T_AND_REG(ARM* cpu) void T_EOR_REG(ARM* cpu) { - u32 a = cpu->GetReg(cpu->CurInstr & 0x7); - u32 b = cpu->GetReg((cpu->CurInstr >> 3) & 0x7); + u32 a = cpu->R[cpu->CurInstr & 0x7]; + u32 b = cpu->R[(cpu->CurInstr >> 3) & 0x7]; u32 res = a ^ b; cpu->R[cpu->CurInstr & 0x7] = res; cpu->SetNZ(res & 0x80000000, @@ -1338,8 +1323,8 @@ void T_EOR_REG(ARM* cpu) void T_LSL_REG(ARM* cpu) { - u32 a = cpu->GetReg(cpu->CurInstr & 0x7, 1); - u32 b = cpu->GetReg((cpu->CurInstr >> 3) & 0x7) & 0xFF; + u32 a = cpu->R[cpu->CurInstr & 0x7]; + u32 b = cpu->R[(cpu->CurInstr >> 3) & 0x7] & 0xFF; LSL_REG_S(a, b); cpu->R[cpu->CurInstr & 0x7] = a; cpu->SetNZ(a & 0x80000000, @@ -1349,8 +1334,8 @@ void T_LSL_REG(ARM* cpu) void T_LSR_REG(ARM* cpu) { - u32 a = cpu->GetReg(cpu->CurInstr & 0x7, 1); - u32 b = 
cpu->GetReg((cpu->CurInstr >> 3) & 0x7) & 0xFF; + u32 a = cpu->R[cpu->CurInstr & 0x7]; + u32 b = cpu->R[(cpu->CurInstr >> 3) & 0x7] & 0xFF; LSR_REG_S(a, b); cpu->R[cpu->CurInstr & 0x7] = a; cpu->SetNZ(a & 0x80000000, @@ -1360,8 +1345,8 @@ void T_LSR_REG(ARM* cpu) void T_ASR_REG(ARM* cpu) { - u32 a = cpu->GetReg(cpu->CurInstr & 0x7, 1); - u32 b = cpu->GetReg((cpu->CurInstr >> 3) & 0x7) & 0xFF; + u32 a = cpu->R[cpu->CurInstr & 0x7]; + u32 b = cpu->R[(cpu->CurInstr >> 3) & 0x7] & 0xFF; ASR_REG_S(a, b); cpu->R[cpu->CurInstr & 0x7] = a; cpu->SetNZ(a & 0x80000000, @@ -1371,8 +1356,8 @@ void T_ASR_REG(ARM* cpu) void T_ADC_REG(ARM* cpu) { - u32 a = cpu->GetReg(cpu->CurInstr & 0x7); - u32 b = cpu->GetReg((cpu->CurInstr >> 3) & 0x7); + u32 a = cpu->R[cpu->CurInstr & 0x7]; + u32 b = cpu->R[(cpu->CurInstr >> 3) & 0x7]; u32 res_tmp = a + b; u32 carry = (cpu->CPSR&0x20000000 ? 1:0); u32 res = res_tmp + carry; @@ -1386,8 +1371,8 @@ void T_ADC_REG(ARM* cpu) void T_SBC_REG(ARM* cpu) { - u32 a = cpu->GetReg(cpu->CurInstr & 0x7); - u32 b = cpu->GetReg((cpu->CurInstr >> 3) & 0x7); + u32 a = cpu->R[cpu->CurInstr & 0x7]; + u32 b = cpu->R[(cpu->CurInstr >> 3) & 0x7]; u32 res_tmp = a - b; u32 carry = (cpu->CPSR&0x20000000 ? 
0:1); u32 res = res_tmp - carry; @@ -1401,8 +1386,8 @@ void T_SBC_REG(ARM* cpu) void T_ROR_REG(ARM* cpu) { - u32 a = cpu->GetReg(cpu->CurInstr & 0x7, 1); - u32 b = cpu->GetReg((cpu->CurInstr >> 3) & 0x7) & 0xFF; + u32 a = cpu->R[cpu->CurInstr & 0x7]; + u32 b = cpu->R[(cpu->CurInstr >> 3) & 0x7] & 0xFF; ROR_REG_S(a, b); cpu->R[cpu->CurInstr & 0x7] = a; cpu->SetNZ(a & 0x80000000, @@ -1412,8 +1397,8 @@ void T_ROR_REG(ARM* cpu) void T_TST_REG(ARM* cpu) { - u32 a = cpu->GetReg(cpu->CurInstr & 0x7); - u32 b = cpu->GetReg((cpu->CurInstr >> 3) & 0x7); + u32 a = cpu->R[cpu->CurInstr & 0x7]; + u32 b = cpu->R[(cpu->CurInstr >> 3) & 0x7]; u32 res = a & b; cpu->SetNZ(res & 0x80000000, !res); @@ -1422,7 +1407,7 @@ void T_TST_REG(ARM* cpu) void T_NEG_REG(ARM* cpu) { - u32 b = cpu->GetReg((cpu->CurInstr >> 3) & 0x7); + u32 b = cpu->R[(cpu->CurInstr >> 3) & 0x7]; u32 res = -b; cpu->R[cpu->CurInstr & 0x7] = res; cpu->SetNZCV(res & 0x80000000, @@ -1434,8 +1419,8 @@ void T_NEG_REG(ARM* cpu) void T_CMP_REG(ARM* cpu) { - u32 a = cpu->GetReg(cpu->CurInstr & 0x7); - u32 b = cpu->GetReg((cpu->CurInstr >> 3) & 0x7); + u32 a = cpu->R[cpu->CurInstr & 0x7]; + u32 b = cpu->R[(cpu->CurInstr >> 3) & 0x7]; u32 res = a - b; cpu->SetNZCV(res & 0x80000000, !res, @@ -1446,8 +1431,8 @@ void T_CMP_REG(ARM* cpu) void T_CMN_REG(ARM* cpu) { - u32 a = cpu->GetReg(cpu->CurInstr & 0x7); - u32 b = cpu->GetReg((cpu->CurInstr >> 3) & 0x7); + u32 a = cpu->R[cpu->CurInstr & 0x7]; + u32 b = cpu->R[(cpu->CurInstr >> 3) & 0x7]; u32 res = a + b; cpu->SetNZCV(res & 0x80000000, !res, @@ -1458,8 +1443,8 @@ void T_CMN_REG(ARM* cpu) void T_ORR_REG(ARM* cpu) { - u32 a = cpu->GetReg(cpu->CurInstr & 0x7); - u32 b = cpu->GetReg((cpu->CurInstr >> 3) & 0x7); + u32 a = cpu->R[cpu->CurInstr & 0x7]; + u32 b = cpu->R[(cpu->CurInstr >> 3) & 0x7]; u32 res = a | b; cpu->R[cpu->CurInstr & 0x7] = res; cpu->SetNZ(res & 0x80000000, @@ -1469,8 +1454,8 @@ void T_ORR_REG(ARM* cpu) void T_MUL_REG(ARM* cpu) { - u32 a = cpu->GetReg(cpu->CurInstr 
& 0x7); - u32 b = cpu->GetReg((cpu->CurInstr >> 3) & 0x7); + u32 a = cpu->R[cpu->CurInstr & 0x7]; + u32 b = cpu->R[(cpu->CurInstr >> 3) & 0x7]; u32 res = a * b; cpu->R[cpu->CurInstr & 0x7] = res; cpu->SetNZ(res & 0x80000000, @@ -1494,8 +1479,8 @@ void T_MUL_REG(ARM* cpu) void T_BIC_REG(ARM* cpu) { - u32 a = cpu->GetReg(cpu->CurInstr & 0x7); - u32 b = cpu->GetReg((cpu->CurInstr >> 3) & 0x7); + u32 a = cpu->R[cpu->CurInstr & 0x7]; + u32 b = cpu->R[(cpu->CurInstr >> 3) & 0x7]; u32 res = a & ~b; cpu->R[cpu->CurInstr & 0x7] = res; cpu->SetNZ(res & 0x80000000, @@ -1505,7 +1490,7 @@ void T_BIC_REG(ARM* cpu) void T_MVN_REG(ARM* cpu) { - u32 b = cpu->GetReg((cpu->CurInstr >> 3) & 0x7); + u32 b = cpu->R[(cpu->CurInstr >> 3) & 0x7]; u32 res = ~b; cpu->R[cpu->CurInstr & 0x7] = res; cpu->SetNZ(res & 0x80000000, @@ -1522,8 +1507,8 @@ void T_ADD_HIREG(ARM* cpu) u32 rd = (cpu->CurInstr & 0x7) | ((cpu->CurInstr >> 4) & 0x8); u32 rs = (cpu->CurInstr >> 3) & 0xF; - u32 a = cpu->GetReg(rd); - u32 b = cpu->GetReg(rs); + u32 a = cpu->R[rd]; + u32 b = cpu->R[rs]; cpu->AddCycles_C(); @@ -1542,8 +1527,8 @@ void T_CMP_HIREG(ARM* cpu) u32 rd = (cpu->CurInstr & 0x7) | ((cpu->CurInstr >> 4) & 0x8); u32 rs = (cpu->CurInstr >> 3) & 0xF; - u32 a = cpu->GetReg(rd); - u32 b = cpu->GetReg(rs); + u32 a = cpu->R[rd]; + u32 b = cpu->R[rs]; u32 res = a - b; cpu->SetNZCV(res & 0x80000000, @@ -1562,11 +1547,11 @@ void T_MOV_HIREG(ARM* cpu) if (rd == 15) { - cpu->JumpTo(cpu->GetReg(rs) | 1); + cpu->JumpTo(cpu->R[rs] | 1); } else { - cpu->R[rd] = cpu->GetReg(rs); + cpu->R[rd] = cpu->R[rs]; } // nocash-style debugging hook @@ -1583,7 +1568,7 @@ void T_MOV_HIREG(ARM* cpu) } -void T_ADD_PCREL(ARM* cpu) // checkme: pc shouldn't be able to interlock? +void T_ADD_PCREL(ARM* cpu) { u32 val = cpu->R[15] & ~2; val += ((cpu->CurInstr & 0xFF) << 2); @@ -1591,7 +1576,7 @@ void T_ADD_PCREL(ARM* cpu) // checkme: pc shouldn't be able to interlock? 
cpu->AddCycles_C(); } -void T_ADD_SPREL(ARM* cpu) // checkme: sp shouldn't be able to interlock in thumb? +void T_ADD_SPREL(ARM* cpu) { u32 val = cpu->R[13]; val += ((cpu->CurInstr & 0xFF) << 2); @@ -1599,7 +1584,7 @@ void T_ADD_SPREL(ARM* cpu) // checkme: sp shouldn't be able to interlock in thum cpu->AddCycles_C(); } -void T_ADD_SP(ARM* cpu) // checkme: sp shouldn't be able to interlock in thumb? +void T_ADD_SP(ARM* cpu) { u32 val = cpu->R[13]; if (cpu->CurInstr & (1<<7)) diff --git a/src/ARMInterpreter_Branch.cpp b/src/ARMInterpreter_Branch.cpp index 284dfa75..623be41a 100644 --- a/src/ARMInterpreter_Branch.cpp +++ b/src/ARMInterpreter_Branch.cpp @@ -46,15 +46,15 @@ void A_BLX_IMM(ARM* cpu) cpu->JumpTo(cpu->R[15] + offset + 1); } -void A_BX(ARM* cpu) // verify interlock +void A_BX(ARM* cpu) { - cpu->JumpTo(cpu->GetReg(cpu->CurInstr & 0xF)); + cpu->JumpTo(cpu->R[cpu->CurInstr & 0xF]); } -void A_BLX_REG(ARM* cpu) // verify interlock +void A_BLX_REG(ARM* cpu) { u32 lr = cpu->R[15] - 4; - cpu->JumpTo(cpu->GetReg(cpu->CurInstr & 0xF)); + cpu->JumpTo(cpu->R[cpu->CurInstr & 0xF]); cpu->R[14] = lr; } @@ -71,12 +71,12 @@ void T_BCOND(ARM* cpu) cpu->AddCycles_C(); } -void T_BX(ARM* cpu) // verify interlock +void T_BX(ARM* cpu) { - cpu->JumpTo(cpu->GetReg((cpu->CurInstr >> 3) & 0xF)); + cpu->JumpTo(cpu->R[(cpu->CurInstr >> 3) & 0xF]); } -void T_BLX_REG(ARM* cpu) // verify interlock +void T_BLX_REG(ARM* cpu) { if (cpu->Num==1) { @@ -85,7 +85,7 @@ void T_BLX_REG(ARM* cpu) // verify interlock } u32 lr = cpu->R[15] - 1; - cpu->JumpTo(cpu->GetReg((cpu->CurInstr >> 3) & 0xF)); + cpu->JumpTo(cpu->R[(cpu->CurInstr >> 3) & 0xF]); cpu->R[14] = lr; } diff --git a/src/ARMInterpreter_LoadStore.cpp b/src/ARMInterpreter_LoadStore.cpp index 4a640bc5..e2726005 100644 --- a/src/ARMInterpreter_LoadStore.cpp +++ b/src/ARMInterpreter_LoadStore.cpp @@ -53,7 +53,7 @@ namespace melonDS::ARMInterpreter if (!(cpu->CurInstr & (1<<23))) offset = -offset; #define A_WB_CALC_OFFSET_REG(shiftop) \ - u32 
offset = cpu->GetReg(cpu->CurInstr & 0xF); \ + u32 offset = cpu->R[cpu->CurInstr & 0xF]; \ u32 shift = ((cpu->CurInstr>>7)&0x1F); \ shiftop(offset, shift); \ if (!(cpu->CurInstr & (1<<23))) offset = -offset; @@ -61,8 +61,8 @@ namespace melonDS::ARMInterpreter #define A_STR \ - offset += cpu->GetReg((cpu->CurInstr>>16) & 0xF); \ - u32 storeval = cpu->GetReg((cpu->CurInstr>>12) & 0xF); \ + offset += cpu->R[(cpu->CurInstr>>16) & 0xF]; \ + u32 storeval = cpu->R[(cpu->CurInstr>>12) & 0xF]; \ if (((cpu->CurInstr>>12) & 0xF) == 0xF) \ storeval += 4; \ bool dataabort = !cpu->DataWrite32(offset, storeval); \ @@ -72,8 +72,8 @@ namespace melonDS::ARMInterpreter // TODO: user mode (bit21) #define A_STR_POST \ - u32 addr = cpu->GetReg((cpu->CurInstr>>16) & 0xF); \ - u32 storeval = cpu->GetReg((cpu->CurInstr>>12) & 0xF); \ + u32 addr = cpu->R[(cpu->CurInstr>>16) & 0xF]; \ + u32 storeval = cpu->R[(cpu->CurInstr>>12) & 0xF]; \ if (((cpu->CurInstr>>12) & 0xF) == 0xF) \ storeval += 4; \ bool dataabort = !cpu->DataWrite32(addr, storeval); \ @@ -82,8 +82,8 @@ namespace melonDS::ARMInterpreter cpu->R[(cpu->CurInstr>>16) & 0xF] += offset; #define A_STRB \ - offset += cpu->GetReg((cpu->CurInstr>>16) & 0xF); \ - u32 storeval = cpu->GetReg((cpu->CurInstr>>12) & 0xF); \ + offset += cpu->R[(cpu->CurInstr>>16) & 0xF]; \ + u32 storeval = cpu->R[(cpu->CurInstr>>12) & 0xF]; \ if (((cpu->CurInstr>>12) & 0xF) == 15) storeval+=4; \ bool dataabort = !cpu->DataWrite8(offset, storeval); \ cpu->AddCycles_CD_STR(); \ @@ -92,8 +92,8 @@ namespace melonDS::ARMInterpreter // TODO: user mode (bit21) #define A_STRB_POST \ - u32 addr = cpu->GetReg((cpu->CurInstr>>16) & 0xF); \ - u32 storeval = cpu->GetReg((cpu->CurInstr>>12) & 0xF); \ + u32 addr = cpu->R[(cpu->CurInstr>>16) & 0xF]; \ + u32 storeval = cpu->R[(cpu->CurInstr>>12) & 0xF]; \ if (((cpu->CurInstr>>12) & 0xF) == 15) storeval+=4; \ bool dataabort = !cpu->DataWrite8(addr, storeval); \ cpu->AddCycles_CD_STR(); \ @@ -101,7 +101,7 @@ namespace 
melonDS::ARMInterpreter cpu->R[(cpu->CurInstr>>16) & 0xF] += offset; #define A_LDR \ - offset += cpu->GetReg((cpu->CurInstr>>16) & 0xF); \ + offset += cpu->R[(cpu->CurInstr>>16) & 0xF]; \ u32 val; bool dataabort = !cpu->DataRead32(offset, &val); \ cpu->AddCycles_CDI_LDR(); \ if (dataabort) return; \ @@ -115,12 +115,11 @@ namespace melonDS::ARMInterpreter else \ { \ cpu->R[(cpu->CurInstr>>12) & 0xF] = val; \ - cpu->SetCycles_L((cpu->CurInstr>>12) & 0xF, (offset & 3) ? 2 : 1, cpu->ILT_Norm); \ } // TODO: user mode #define A_LDR_POST \ - u32 addr = cpu->GetReg((cpu->CurInstr>>16) & 0xF); \ + u32 addr = cpu->R[(cpu->CurInstr>>16) & 0xF]; \ u32 val; bool dataabort = !cpu->DataRead32(addr, &val); \ cpu->AddCycles_CDI_LDR(); \ if (dataabort) return; \ @@ -134,37 +133,26 @@ namespace melonDS::ARMInterpreter else \ { \ cpu->R[(cpu->CurInstr>>12) & 0xF] = val; \ - cpu->SetCycles_L((cpu->CurInstr>>12) & 0xF, (addr & 3) ? 2 : 1, cpu->ILT_Norm); \ } #define A_LDRB \ - offset += cpu->GetReg((cpu->CurInstr>>16) & 0xF); \ + offset += cpu->R[(cpu->CurInstr>>16) & 0xF]; \ u32 val; bool dataabort = !cpu->DataRead8(offset, &val); \ cpu->AddCycles_CDI_LDR(); \ if (dataabort) return; \ if (cpu->CurInstr & (1<<21)) cpu->R[(cpu->CurInstr>>16) & 0xF] = offset; \ - if (((cpu->CurInstr>>12) & 0xF) == 15) \ - cpu->JumpTo8_16Bit(val); \ - else \ - { \ - cpu->R[(cpu->CurInstr>>12) & 0xF] = val; \ - cpu->SetCycles_L((cpu->CurInstr>>12) & 0xF, 2, cpu->ILT_Norm); \ - } + if (((cpu->CurInstr>>12) & 0xF) == 15) cpu->JumpTo8_16Bit(val); \ + else cpu->R[(cpu->CurInstr>>12) & 0xF] = val; // TODO: user mode #define A_LDRB_POST \ - u32 addr = cpu->GetReg((cpu->CurInstr>>16) & 0xF); \ + u32 addr = cpu->R[(cpu->CurInstr>>16) & 0xF]; \ u32 val; bool dataabort = !cpu->DataRead8(addr, &val); \ cpu->AddCycles_CDI_LDR(); \ if (dataabort) return; \ cpu->R[(cpu->CurInstr>>16) & 0xF] += offset; \ - if (((cpu->CurInstr>>12) & 0xF) == 15) \ - cpu->JumpTo8_16Bit(val); \ - else \ - { \ - cpu->R[(cpu->CurInstr>>12) & 
0xF] = val; \ - cpu->SetCycles_L((cpu->CurInstr>>12) & 0xF, 2, cpu->ILT_Norm); \ - } + if (((cpu->CurInstr>>12) & 0xF) == 15) cpu->JumpTo8_16Bit(val); \ + else cpu->R[(cpu->CurInstr>>12) & 0xF] = val; @@ -242,14 +230,14 @@ A_IMPLEMENT_WB_LDRSTR(LDRB) if (!(cpu->CurInstr & (1<<23))) offset = -offset; #define A_HD_CALC_OFFSET_REG \ - u32 offset = cpu->GetReg(cpu->CurInstr & 0xF); \ + u32 offset = cpu->R[cpu->CurInstr & 0xF]; \ if (!(cpu->CurInstr & (1<<23))) offset = -offset; #define A_STRH \ - offset += cpu->GetReg((cpu->CurInstr>>16) & 0xF); \ - u32 storeval = cpu->GetReg((cpu->CurInstr>>12) & 0xF); \ + offset += cpu->R[(cpu->CurInstr>>16) & 0xF]; \ + u32 storeval = cpu->R[(cpu->CurInstr>>12) & 0xF]; \ if (((cpu->CurInstr>>12) & 0xF) == 15) storeval+=4; \ bool dataabort = !cpu->DataWrite16(offset, storeval); \ cpu->AddCycles_CD_STR(); \ @@ -257,8 +245,8 @@ A_IMPLEMENT_WB_LDRSTR(LDRB) if (cpu->CurInstr & (1<<21)) cpu->R[(cpu->CurInstr>>16) & 0xF] = offset; #define A_STRH_POST \ - u32 addr = cpu->GetReg((cpu->CurInstr>>16) & 0xF); \ - u32 storeval = cpu->GetReg((cpu->CurInstr>>12) & 0xF); \ + u32 addr = cpu->R[(cpu->CurInstr>>16) & 0xF]; \ + u32 storeval = cpu->R[(cpu->CurInstr>>12) & 0xF]; \ if (((cpu->CurInstr>>12) & 0xF) == 15) storeval+=4; \ bool dataabort = !cpu->DataWrite16(addr, storeval); \ cpu->AddCycles_CD_STR(); \ @@ -269,47 +257,35 @@ A_IMPLEMENT_WB_LDRSTR(LDRB) #define A_LDRD \ if (cpu->Num != 0) return; \ - offset += cpu->GetReg((cpu->CurInstr>>16) & 0xF); \ + offset += cpu->R[(cpu->CurInstr>>16) & 0xF]; \ u32 r = (cpu->CurInstr>>12) & 0xF; \ if (r&1) { A_UNK(cpu); return; } \ - if (!cpu->DataRead32 (offset, &cpu->R[r])) {cpu->AddCycles_CDI_LDR(); return;} \ - u32 val; bool dataabort = !cpu->DataRead32S(offset+4, &val); \ + if (!cpu->DataRead32 (offset , &cpu->R[r ])) {cpu->AddCycles_CDI(); return;} \ + u32 val; if (!cpu->DataRead32S(offset+4, &val)) {cpu->AddCycles_CDI(); return;} \ + if (r == 14) cpu->JumpTo(((((ARMv5*)cpu)->CP15Control & (1<<15)) ? 
(val & ~0x1) : val), cpu->CurInstr & (1<<22)); /* restores cpsr presumably due to shared dna with ldm */ \ + else cpu->R[r+1] = val; \ cpu->AddCycles_CDI_LDM(); \ - if (dataabort) return; \ - if (r == 14) \ - cpu->JumpTo(((((ARMv5*)cpu)->CP15Control & (1<<15)) ? (val & ~0x1) : val), cpu->CurInstr & (1<<22)); /* restores cpsr presumably due to shared dna with ldm */ \ - else \ - { \ - cpu->R[r+1] = val; \ - cpu->SetCycles_L(r+1, 1, cpu->ILT_Norm); \ - } \ - if (cpu->CurInstr & (1<<21)) cpu->R[(cpu->CurInstr>>16) & 0xF] = offset; \ + if (cpu->CurInstr & (1<<21)) cpu->R[(cpu->CurInstr>>16) & 0xF] = offset; #define A_LDRD_POST \ if (cpu->Num != 0) return; \ - u32 addr = cpu->GetReg((cpu->CurInstr>>16) & 0xF); \ + u32 addr = cpu->R[(cpu->CurInstr>>16) & 0xF]; \ u32 r = (cpu->CurInstr>>12) & 0xF; \ if (r&1) { A_UNK(cpu); return; } \ - if (!cpu->DataRead32 (addr, &cpu->R[r])) {cpu->AddCycles_CDI_LDR(); return;} \ - u32 val; bool dataabort = !cpu->DataRead32S(addr+4, &val); \ + if (!cpu->DataRead32 (addr , &cpu->R[r ])) {cpu->AddCycles_CDI(); return;} \ + u32 val; if (!cpu->DataRead32S(addr+4, &val)) {cpu->AddCycles_CDI(); return;} \ + if (r == 14) cpu->JumpTo(((((ARMv5*)cpu)->CP15Control & (1<<15)) ? (val & ~0x1) : val), cpu->CurInstr & (1<<22)); /* restores cpsr presumably due to shared dna with ldm */ \ + else cpu->R[r+1] = val; \ cpu->AddCycles_CDI_LDM(); \ - if (dataabort) return; \ - if (r == 14) \ - cpu->JumpTo(((((ARMv5*)cpu)->CP15Control & (1<<15)) ? 
(val & ~0x1) : val), cpu->CurInstr & (1<<22)); /* restores cpsr presumably due to shared dna with ldm */ \ - else \ - { \ - cpu->R[r+1] = val; \ - cpu->SetCycles_L(r+1, 1, cpu->ILT_Norm); \ - } \ cpu->R[(cpu->CurInstr>>16) & 0xF] += offset; #define A_STRD \ if (cpu->Num != 0) return; \ - offset += cpu->GetReg((cpu->CurInstr>>16) & 0xF); \ + offset += cpu->R[(cpu->CurInstr>>16) & 0xF]; \ u32 r = (cpu->CurInstr>>12) & 0xF; \ if (r&1) { A_UNK(cpu); return; } \ - bool dataabort = !cpu->DataWrite32(offset, cpu->GetReg(r)); /* yes, this data abort behavior is on purpose */ \ - u32 storeval = cpu->GetReg(r+1, cpu->DataCycles); if (r == 14) storeval+=4; \ + bool dataabort = !cpu->DataWrite32(offset, cpu->R[r]); /* yes, this data abort behavior is on purpose */ \ + u32 storeval = cpu->R[r+1]; if (r == 14) storeval+=4; \ dataabort |= !cpu->DataWrite32S (offset+4, storeval, dataabort); /* no, i dont understand it either */ \ cpu->AddCycles_CD_STM(); \ if (dataabort) return; \ @@ -317,102 +293,72 @@ A_IMPLEMENT_WB_LDRSTR(LDRB) #define A_STRD_POST \ if (cpu->Num != 0) return; \ - u32 addr = cpu->GetReg((cpu->CurInstr>>16) & 0xF); \ + u32 addr = cpu->R[(cpu->CurInstr>>16) & 0xF]; \ u32 r = (cpu->CurInstr>>12) & 0xF; \ if (r&1) { A_UNK(cpu); return; } \ - bool dataabort = !cpu->DataWrite32(addr, cpu->GetReg(r)); \ - u32 storeval = cpu->GetReg(r+1, cpu->DataCycles); if (r == 14) storeval+=4; \ + bool dataabort = !cpu->DataWrite32(addr, cpu->R[r]); \ + u32 storeval = cpu->R[r+1]; if (r == 14) storeval+=4; \ dataabort |= !cpu->DataWrite32S (addr+4, storeval, dataabort); \ cpu->AddCycles_CD_STM(); \ if (dataabort) return; \ cpu->R[(cpu->CurInstr>>16) & 0xF] += offset; #define A_LDRH \ - offset += cpu->GetReg((cpu->CurInstr>>16) & 0xF); \ + offset += cpu->R[(cpu->CurInstr>>16) & 0xF]; \ u32 val; bool dataabort = !cpu->DataRead16(offset, &val); \ cpu->AddCycles_CDI_LDR(); \ if (dataabort) return; \ - if (((cpu->CurInstr>>12) & 0xF) == 15) \ - cpu->JumpTo8_16Bit(val); \ - else \ - { \ - 
cpu->R[(cpu->CurInstr>>12) & 0xF] = val; \ - cpu->SetCycles_L((cpu->CurInstr>>12) & 0xF, 2, cpu->ILT_Norm); \ - } \ + if (((cpu->CurInstr>>12) & 0xF) == 15) cpu->JumpTo8_16Bit(val); \ + else cpu->R[(cpu->CurInstr>>12) & 0xF] = val; \ if (cpu->CurInstr & (1<<21)) cpu->R[(cpu->CurInstr>>16) & 0xF] = offset; #define A_LDRH_POST \ - u32 addr = cpu->GetReg((cpu->CurInstr>>16) & 0xF); \ + u32 addr = cpu->R[(cpu->CurInstr>>16) & 0xF]; \ u32 val; bool dataabort = !cpu->DataRead16(addr, &val); \ cpu->AddCycles_CDI_LDR(); \ if (dataabort) return; \ - if (((cpu->CurInstr>>12) & 0xF) == 15) \ - cpu->JumpTo8_16Bit(val); \ - else \ - { \ - cpu->R[(cpu->CurInstr>>12) & 0xF] = val; \ - cpu->SetCycles_L((cpu->CurInstr>>12) & 0xF, 2, cpu->ILT_Norm); \ - } \ + if (((cpu->CurInstr>>12) & 0xF) == 15) cpu->JumpTo8_16Bit(val); \ + else cpu->R[(cpu->CurInstr>>12) & 0xF] = val; \ cpu->R[(cpu->CurInstr>>16) & 0xF] += offset; #define A_LDRSB \ - offset += cpu->GetReg((cpu->CurInstr>>16) & 0xF); \ + offset += cpu->R[(cpu->CurInstr>>16) & 0xF]; \ u32 val; bool dataabort = !cpu->DataRead8(offset, &val); \ cpu->AddCycles_CDI_LDR(); \ if (dataabort) return; \ val = (s32)(s8)val; \ - if (((cpu->CurInstr>>12) & 0xF) == 15) \ - cpu->JumpTo8_16Bit(val); \ - else \ - { \ - cpu->R[(cpu->CurInstr>>12) & 0xF] = val; \ - cpu->SetCycles_L((cpu->CurInstr>>12) & 0xF, 2, cpu->ILT_Norm); \ - } \ + if (((cpu->CurInstr>>12) & 0xF) == 15) cpu->JumpTo8_16Bit(val); \ + else cpu->R[(cpu->CurInstr>>12) & 0xF] = val; \ if (cpu->CurInstr & (1<<21)) cpu->R[(cpu->CurInstr>>16) & 0xF] = offset; #define A_LDRSB_POST \ - u32 addr = cpu->GetReg((cpu->CurInstr>>16) & 0xF); \ + u32 addr = cpu->R[(cpu->CurInstr>>16) & 0xF]; \ u32 val; bool dataabort = !cpu->DataRead8(addr, &val); \ cpu->AddCycles_CDI_LDR(); \ if (dataabort) return; \ val = (s32)(s8)val; \ - if (((cpu->CurInstr>>12) & 0xF) == 15) \ - cpu->JumpTo8_16Bit(val); \ - else \ - { \ - cpu->R[(cpu->CurInstr>>12) & 0xF] = val; \ - cpu->SetCycles_L((cpu->CurInstr>>12) & 
0xF, 2, cpu->ILT_Norm); \ - } \ + if (((cpu->CurInstr>>12) & 0xF) == 15) cpu->JumpTo8_16Bit(val); \ + else cpu->R[(cpu->CurInstr>>12) & 0xF] = val; \ cpu->R[(cpu->CurInstr>>16) & 0xF] += offset; #define A_LDRSH \ - offset += cpu->GetReg((cpu->CurInstr>>16) & 0xF); \ + offset += cpu->R[(cpu->CurInstr>>16) & 0xF]; \ u32 val; bool dataabort = !cpu->DataRead16(offset, &val); \ cpu->AddCycles_CDI_LDR(); \ if (dataabort) return; \ val = (s32)(s16)val; \ - if (((cpu->CurInstr>>12) & 0xF) == 15) \ - cpu->JumpTo8_16Bit(val); \ - else \ - { \ - cpu->R[(cpu->CurInstr>>12) & 0xF] = val; \ - cpu->SetCycles_L((cpu->CurInstr>>12) & 0xF, 2, cpu->ILT_Norm); \ - } \ + if (((cpu->CurInstr>>12) & 0xF) == 15) cpu->JumpTo8_16Bit(val); \ + else cpu->R[(cpu->CurInstr>>12) & 0xF] = val; \ if (cpu->CurInstr & (1<<21)) cpu->R[(cpu->CurInstr>>16) & 0xF] = offset; #define A_LDRSH_POST \ - u32 addr = cpu->GetReg((cpu->CurInstr>>16) & 0xF); \ + u32 addr = cpu->R[(cpu->CurInstr>>16) & 0xF]; \ u32 val; bool dataabort = !cpu->DataRead16(addr, &val); \ cpu->AddCycles_CDI_LDR(); \ if (dataabort) return; \ val = (s32)(s16)val; \ - if (((cpu->CurInstr>>12) & 0xF) == 15) \ - cpu->JumpTo8_16Bit(val); \ - else \ - { \ - cpu->R[(cpu->CurInstr>>12) & 0xF] = val; \ - cpu->SetCycles_L((cpu->CurInstr>>12) & 0xF, 2, cpu->ILT_Norm); \ - } \ + if (((cpu->CurInstr>>12) & 0xF) == 15) cpu->JumpTo8_16Bit(val); \ + else cpu->R[(cpu->CurInstr>>12) & 0xF] = val; \ cpu->R[(cpu->CurInstr>>16) & 0xF] += offset; @@ -452,8 +398,8 @@ A_IMPLEMENT_HD_LDRSTR(LDRSH) void A_SWP(ARM* cpu) { - u32 rm = cpu->GetReg(cpu->CurInstr & 0xF, 1); - u32 base = cpu->GetReg((cpu->CurInstr >> 16) & 0xF); + u32 base = cpu->R[(cpu->CurInstr >> 16) & 0xF]; + u32 rm = cpu->R[cpu->CurInstr & 0xF]; if ((cpu->CurInstr & 0xF) == 15) rm += 4; u32 val; @@ -462,38 +408,20 @@ void A_SWP(ARM* cpu) u32 numD = cpu->DataCycles; if (cpu->DataWrite32(base, rm)) { - cpu->AddCycles_CDI_SWP(); // rd only gets updated if both read and write succeed u32 rd = 
(cpu->CurInstr >> 12) & 0xF; - if (rd != 15) - { - cpu->R[rd] = ROR(val, 8*(base&0x3)); - - if (cpu->Num == 0) - { - u32 cycles; - if (base & 3) // add an extra interlock cycle when doing a misaligned load from a non-itcm address (checkme: does it matter whether you're executing from there?) - { - cycles = ((base < ((ARMv5*)cpu)->ITCMSize) && ((cpu->R[15]-8) < ((ARMv5*)cpu)->ITCMSize)) ? 1 : 2; - } - else cycles = 1; - - cpu->SetCycles_L(rd, cycles, cpu->ILT_Norm); - } - } - else if (cpu->Num == 1) // for some reason these jumps don't work on the arm 9? - cpu->JumpTo(ROR(val, 8*(base&0x3)) & ~1, cpu->ILT_Norm); + if (rd != 15) cpu->R[rd] = ROR(val, 8*(base&0x3)); + else if (cpu->Num==1) cpu->JumpTo(ROR(val, 8*(base&0x3)) & ~1); // for some reason these jumps don't work on the arm 9? } - else cpu->AddCycles_CDI_SWP(); cpu->DataCycles += numD; } - else cpu->AddCycles_CDI_SWP(); + cpu->AddCycles_CDI_SWP(); } void A_SWPB(ARM* cpu) { - u32 rm = cpu->GetReg(cpu->CurInstr & 0xF, 1) & 0xFF; - u32 base = cpu->GetReg((cpu->CurInstr >> 16) & 0xF); + u32 base = cpu->R[(cpu->CurInstr >> 16) & 0xF]; + u32 rm = cpu->R[cpu->CurInstr & 0xF] & 0xFF; if ((cpu->CurInstr & 0xF) == 15) rm += 4; u32 val; @@ -502,24 +430,14 @@ void A_SWPB(ARM* cpu) u32 numD = cpu->DataCycles; if (cpu->DataWrite8(base, rm)) { - cpu->AddCycles_CDI_SWP(); // rd only gets updated if both read and write succeed u32 rd = (cpu->CurInstr >> 12) & 0xF; - if (rd != 15) - { - cpu->R[rd] = val; - - // add an extra interlock cycle when doing a load from a non-itcm address (checkme: does it matter whether you're executing from there?) - if (cpu->Num == 0) - cpu->SetCycles_L(rd, ((base < ((ARMv5*)cpu)->ITCMSize) && ((cpu->R[15]-8) < ((ARMv5*)cpu)->ITCMSize)) ? 1 : 2, cpu->ILT_Norm); - } - else if (cpu->Num == 1)// for some reason these jumps don't work on the arm 9? 
- cpu->JumpTo(val & ~1); + if (rd != 15) cpu->R[rd] = val; + else if (cpu->Num==1) cpu->JumpTo(val & ~1); // for some reason these jumps don't work on the arm 9? } - else cpu->AddCycles_CDI_SWP(); cpu->DataCycles += numD; } - else cpu->AddCycles_CDI_SWP(); + cpu->AddCycles_CDI_SWP(); } @@ -527,12 +445,11 @@ void A_SWPB(ARM* cpu) void A_LDM(ARM* cpu) { u32 baseid = (cpu->CurInstr >> 16) & 0xF; - u32 base = cpu->GetReg(baseid, 1); + u32 base = cpu->R[baseid]; u32 wbbase; u32 oldbase = base; u32 preinc = (cpu->CurInstr & (1<<24)); bool first = true; - u32 lastreg = 0; // TODO: this doesn't support 0 reg LDMs (do those even work?) if (!(cpu->CurInstr & (1<<23))) // decrement { @@ -568,7 +485,6 @@ void A_LDM(ARM* cpu) } first = false; - lastreg = i; if (!preinc) base += 4; } } @@ -582,26 +498,12 @@ void A_LDM(ARM* cpu) { goto dataabort; } - cpu->AddCycles_CDI_LDM(); if (!preinc) base += 4; if (cpu->Num == 1) pc &= ~0x1; } - else - { - cpu->AddCycles_CDI_LDM(); - - if (cpu->Num == 0) - { - u32 lastbase = base; - if (!preinc) lastbase -= 4; - // no interlock occurs when loading from itcm (checkme: does it matter whether you're executing from there?) - if ((((ARMv5*)cpu)->ITCMSize < lastbase) && ((cpu->R[15]-8) > ((ARMv5*)cpu)->ITCMSize) && (cpu->CurInstr & (0x7FFF >> (15 - lastreg)))) - cpu->SetCycles_L(lastreg, 1, cpu->ILT_Norm); - } - } // switch back to previous regs if ((cpu->CurInstr & (1<<22)) && !(cpu->CurInstr & (1<<15))) @@ -635,8 +537,6 @@ void A_LDM(ARM* cpu) if (false) { dataabort: - cpu->AddCycles_CDI_LDM(); - // CHECKME: interlock shouldn't apply when it data aborts, right? 
// switch back to original set of regs if ((cpu->CurInstr & (1<<22)) && !(cpu->CurInstr & (1<<15))) @@ -645,12 +545,14 @@ void A_LDM(ARM* cpu) // restore original value of base in case the reg got written to cpu->R[baseid] = oldbase; } + + cpu->AddCycles_CDI_LDM(); } void A_STM(ARM* cpu) { u32 baseid = (cpu->CurInstr >> 16) & 0xF; - u32 base = cpu->GetReg(baseid, 1); + u32 base = cpu->R[baseid]; u32 oldbase = base; u32 preinc = (cpu->CurInstr & (1<<24)); bool first = true; @@ -694,7 +596,7 @@ void A_STM(ARM* cpu) val = oldbase; else val = base; } - else val = cpu->GetReg(i, 1+cpu->DataCycles); + else val = cpu->R[i]; if (i == 15) val+=4; @@ -738,170 +640,160 @@ void A_STM(ARM* cpu) -void T_LDR_PCREL(ARM* cpu) // checkme: can pc be interlocked? +void T_LDR_PCREL(ARM* cpu) { u32 addr = (cpu->R[15] & ~0x2) + ((cpu->CurInstr & 0xFF) << 2); cpu->DataRead32(addr, &cpu->R[(cpu->CurInstr >> 8) & 0x7]); cpu->AddCycles_CDI_LDR(); - cpu->SetCycles_L((cpu->CurInstr >> 8) & 0x7, 1, cpu->ILT_Norm); // checkme: verify cycle count } -void T_STR_REG(ARM* cpu) +void T_STR_REG(ARM* cpu) { - u32 addr = cpu->GetReg((cpu->CurInstr >> 3) & 0x7) + cpu->GetReg((cpu->CurInstr >> 6) & 0x7); - cpu->DataWrite32(addr, cpu->GetReg(cpu->CurInstr & 0x7, 1)); + u32 addr = cpu->R[(cpu->CurInstr >> 3) & 0x7] + cpu->R[(cpu->CurInstr >> 6) & 0x7]; + cpu->DataWrite32(addr, cpu->R[cpu->CurInstr & 0x7]); cpu->AddCycles_CD_STR(); } void T_STRB_REG(ARM* cpu) { - u32 addr = cpu->GetReg((cpu->CurInstr >> 3) & 0x7) + cpu->GetReg((cpu->CurInstr >> 6) & 0x7); - cpu->DataWrite8(addr, cpu->GetReg(cpu->CurInstr & 0x7, 1)); + u32 addr = cpu->R[(cpu->CurInstr >> 3) & 0x7] + cpu->R[(cpu->CurInstr >> 6) & 0x7]; + cpu->DataWrite8(addr, cpu->R[cpu->CurInstr & 0x7]); cpu->AddCycles_CD_STR(); } void T_LDR_REG(ARM* cpu) { - u32 addr = cpu->GetReg((cpu->CurInstr >> 3) & 0x7) + cpu->GetReg((cpu->CurInstr >> 6) & 0x7); + u32 addr = cpu->R[(cpu->CurInstr >> 3) & 0x7] + cpu->R[(cpu->CurInstr >> 6) & 0x7]; u32 val; if 
(cpu->DataRead32(addr, &val)) cpu->R[cpu->CurInstr & 0x7] = ROR(val, 8*(addr&0x3)); cpu->AddCycles_CDI_LDR(); - cpu->SetCycles_L(cpu->CurInstr & 0x7, (addr & 3) ? 2 : 1, cpu->ILT_Norm); } void T_LDRB_REG(ARM* cpu) { - u32 addr = cpu->GetReg((cpu->CurInstr >> 3) & 0x7) + cpu->GetReg((cpu->CurInstr >> 6) & 0x7); + u32 addr = cpu->R[(cpu->CurInstr >> 3) & 0x7] + cpu->R[(cpu->CurInstr >> 6) & 0x7]; cpu->DataRead8(addr, &cpu->R[cpu->CurInstr & 0x7]); cpu->AddCycles_CDI_LDR(); - cpu->SetCycles_L(cpu->CurInstr & 0x7, 2, cpu->ILT_Norm); } void T_STRH_REG(ARM* cpu) { - u32 addr = cpu->GetReg((cpu->CurInstr >> 3) & 0x7) + cpu->GetReg((cpu->CurInstr >> 6) & 0x7); - cpu->DataWrite16(addr, cpu->GetReg(cpu->CurInstr & 0x7, 1)); + u32 addr = cpu->R[(cpu->CurInstr >> 3) & 0x7] + cpu->R[(cpu->CurInstr >> 6) & 0x7]; + cpu->DataWrite16(addr, cpu->R[cpu->CurInstr & 0x7]); cpu->AddCycles_CD_STR(); } void T_LDRSB_REG(ARM* cpu) { - u32 addr = cpu->GetReg((cpu->CurInstr >> 3) & 0x7) + cpu->GetReg((cpu->CurInstr >> 6) & 0x7); + u32 addr = cpu->R[(cpu->CurInstr >> 3) & 0x7] + cpu->R[(cpu->CurInstr >> 6) & 0x7]; if (cpu->DataRead8(addr, &cpu->R[cpu->CurInstr & 0x7])) cpu->R[cpu->CurInstr & 0x7] = (s32)(s8)cpu->R[cpu->CurInstr & 0x7]; cpu->AddCycles_CDI_LDR(); - cpu->SetCycles_L(cpu->CurInstr & 0x7, 2, cpu->ILT_Norm); } void T_LDRH_REG(ARM* cpu) { - u32 addr = cpu->GetReg((cpu->CurInstr >> 3) & 0x7) + cpu->GetReg((cpu->CurInstr >> 6) & 0x7); + u32 addr = cpu->R[(cpu->CurInstr >> 3) & 0x7] + cpu->R[(cpu->CurInstr >> 6) & 0x7]; cpu->DataRead16(addr, &cpu->R[cpu->CurInstr & 0x7]); cpu->AddCycles_CDI_LDR(); - cpu->SetCycles_L(cpu->CurInstr & 0x7, 2, cpu->ILT_Norm); } void T_LDRSH_REG(ARM* cpu) { - u32 addr = cpu->GetReg((cpu->CurInstr >> 3) & 0x7) + cpu->GetReg((cpu->CurInstr >> 6) & 0x7); + u32 addr = cpu->R[(cpu->CurInstr >> 3) & 0x7] + cpu->R[(cpu->CurInstr >> 6) & 0x7]; if (cpu->DataRead16(addr, &cpu->R[cpu->CurInstr & 0x7])) cpu->R[cpu->CurInstr & 0x7] = (s32)(s16)cpu->R[cpu->CurInstr & 
0x7]; cpu->AddCycles_CDI_LDR(); - cpu->SetCycles_L(cpu->CurInstr & 0x7, 2, cpu->ILT_Norm); } void T_STR_IMM(ARM* cpu) { u32 offset = (cpu->CurInstr >> 4) & 0x7C; - offset += cpu->GetReg((cpu->CurInstr >> 3) & 0x7); + offset += cpu->R[(cpu->CurInstr >> 3) & 0x7]; - cpu->DataWrite32(offset, cpu->GetReg(cpu->CurInstr & 0x7, 1)); - cpu->AddCycles_CD_STR(); + cpu->DataWrite32(offset, cpu->R[cpu->CurInstr & 0x7]); + cpu->AddCycles_CD_LDR(); } void T_LDR_IMM(ARM* cpu) { u32 offset = (cpu->CurInstr >> 4) & 0x7C; - offset += cpu->GetReg((cpu->CurInstr >> 3) & 0x7); + offset += cpu->R[(cpu->CurInstr >> 3) & 0x7]; u32 val; if (cpu->DataRead32(offset, &val)) cpu->R[cpu->CurInstr & 0x7] = ROR(val, 8*(offset&0x3)); cpu->AddCycles_CDI_LDR(); - cpu->SetCycles_L(cpu->CurInstr & 0x7, (offset & 3) ? 2 : 1, cpu->ILT_Norm); } void T_STRB_IMM(ARM* cpu) { u32 offset = (cpu->CurInstr >> 6) & 0x1F; - offset += cpu->GetReg((cpu->CurInstr >> 3) & 0x7); + offset += cpu->R[(cpu->CurInstr >> 3) & 0x7]; - cpu->DataWrite8(offset, cpu->GetReg(cpu->CurInstr & 0x7, 1)); + cpu->DataWrite8(offset, cpu->R[cpu->CurInstr & 0x7]); cpu->AddCycles_CD_STR(); } void T_LDRB_IMM(ARM* cpu) { u32 offset = (cpu->CurInstr >> 6) & 0x1F; - offset += cpu->GetReg((cpu->CurInstr >> 3) & 0x7); + offset += cpu->R[(cpu->CurInstr >> 3) & 0x7]; cpu->DataRead8(offset, &cpu->R[cpu->CurInstr & 0x7]); cpu->AddCycles_CDI_LDR(); - cpu->SetCycles_L(cpu->CurInstr & 0x7, 2, cpu->ILT_Norm); } void T_STRH_IMM(ARM* cpu) { u32 offset = (cpu->CurInstr >> 5) & 0x3E; - offset += cpu->GetReg((cpu->CurInstr >> 3) & 0x7); + offset += cpu->R[(cpu->CurInstr >> 3) & 0x7]; - cpu->DataWrite16(offset, cpu->GetReg(cpu->CurInstr & 0x7, 1)); + cpu->DataWrite16(offset, cpu->R[cpu->CurInstr & 0x7]); cpu->AddCycles_CD_STR(); } void T_LDRH_IMM(ARM* cpu) { u32 offset = (cpu->CurInstr >> 5) & 0x3E; - offset += cpu->GetReg((cpu->CurInstr >> 3) & 0x7); + offset += cpu->R[(cpu->CurInstr >> 3) & 0x7]; cpu->DataRead16(offset, &cpu->R[cpu->CurInstr & 0x7]); 
cpu->AddCycles_CDI_LDR(); - cpu->SetCycles_L(cpu->CurInstr & 0x7, 2, cpu->ILT_Norm); } -void T_STR_SPREL(ARM* cpu) // checkme: can sp be interlocked in thumb mode? +void T_STR_SPREL(ARM* cpu) { u32 offset = (cpu->CurInstr << 2) & 0x3FC; offset += cpu->R[13]; - cpu->DataWrite32(offset, cpu->GetReg((cpu->CurInstr >> 8) & 0x7, 1)); + cpu->DataWrite32(offset, cpu->R[(cpu->CurInstr >> 8) & 0x7]); cpu->AddCycles_CD_STR(); } -void T_LDR_SPREL(ARM* cpu) // checkme: can sp be interlocked in thumb mode? +void T_LDR_SPREL(ARM* cpu) { u32 offset = (cpu->CurInstr << 2) & 0x3FC; offset += cpu->R[13]; cpu->DataRead32(offset, &cpu->R[(cpu->CurInstr >> 8) & 0x7]); cpu->AddCycles_CDI_LDR(); - cpu->SetCycles_L((cpu->CurInstr >> 8) & 0x7, 1, cpu->ILT_Norm); // checkme: verify cycle count } @@ -919,7 +811,7 @@ void T_PUSH(ARM* cpu) if (cpu->CurInstr & (1<<8)) nregs++; - u32 base = cpu->GetReg(13); + u32 base = cpu->R[13]; base -= (nregs<<2); u32 wbbase = base; @@ -927,8 +819,8 @@ void T_PUSH(ARM* cpu) { if (cpu->CurInstr & (1<DataWrite32 (base, cpu->GetReg(i, 1)) - : cpu->DataWrite32S(base, cpu->GetReg(i, 1)))) // verify interlock + if (!(first ? cpu->DataWrite32 (base, cpu->R[i]) + : cpu->DataWrite32S(base, cpu->R[i]))) { goto dataabort; } @@ -952,11 +844,10 @@ void T_PUSH(ARM* cpu) cpu->AddCycles_CD_STM(); } -void T_POP(ARM* cpu) // checkme: can sp be interlocked in thumb mode? +void T_POP(ARM* cpu) { u32 base = cpu->R[13]; bool first = true; - u32 lastreg = 0; for (int i = 0; i < 8; i++) { @@ -986,16 +877,6 @@ void T_POP(ARM* cpu) // checkme: can sp be interlocked in thumb mode? } cpu->R[13] = base; - - cpu->AddCycles_CDI_LDM(); - if (cpu->Num == 0) - { - u32 lastbase = base - 4; - // no interlock occurs when loading from itcm (checkme: does it matter whether you're executing from there?) 
- if ((((ARMv5*)cpu)->ITCMSize < lastbase) && ((cpu->R[15]-8) > ((ARMv5*)cpu)->ITCMSize) && (cpu->CurInstr & (0x7FFF >> (15 - lastreg)))) - cpu->SetCycles_L(lastreg, 1, cpu->ILT_Norm); - } - return; dataabort: cpu->AddCycles_CDI_LDM(); @@ -1003,15 +884,15 @@ void T_POP(ARM* cpu) // checkme: can sp be interlocked in thumb mode? void T_STMIA(ARM* cpu) { - u32 base = cpu->GetReg((cpu->CurInstr >> 8) & 0x7); + u32 base = cpu->R[(cpu->CurInstr >> 8) & 0x7]; bool first = true; for (int i = 0; i < 8; i++) { if (cpu->CurInstr & (1<DataWrite32 (base, cpu->GetReg(i, 1)) - : cpu->DataWrite32S(base, cpu->GetReg(i, 1)))) + if (!(first ? cpu->DataWrite32 (base, cpu->R[i]) + : cpu->DataWrite32S(base, cpu->R[i]))) { goto dataabort; } @@ -1028,9 +909,8 @@ void T_STMIA(ARM* cpu) void T_LDMIA(ARM* cpu) { - u32 base = cpu->GetReg((cpu->CurInstr >> 8) & 0x7); + u32 base = cpu->R[(cpu->CurInstr >> 8) & 0x7]; bool first = true; - u32 lastreg = 0; for (int i = 0; i < 8; i++) { @@ -1043,23 +923,11 @@ void T_LDMIA(ARM* cpu) } first = false; base += 4; - lastreg = i; } } if (!(cpu->CurInstr & (1<<((cpu->CurInstr >> 8) & 0x7)))) cpu->R[(cpu->CurInstr >> 8) & 0x7] = base; - - - cpu->AddCycles_CDI_LDM(); - if (cpu->Num == 0) - { - u32 lastbase = base - 4; - // no interlock occurs when loading from itcm (checkme: does it matter whether you're executing from there?) 
- if ((((ARMv5*)cpu)->ITCMSize < lastbase) && ((cpu->R[15]-8) > ((ARMv5*)cpu)->ITCMSize) && (cpu->CurInstr & (0x7FFF >> (15 - lastreg)))) - cpu->SetCycles_L(lastreg, 1, cpu->ILT_Norm); - } - return; dataabort: cpu->AddCycles_CDI_LDM(); From 4fcd52ed1682de76b96d9e486c1a4ee983f2f593 Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Thu, 11 Jul 2024 20:19:25 -0400 Subject: [PATCH 095/306] someday i will learn to test things before pushing them --- src/ARMInterpreter_LoadStore.cpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/ARMInterpreter_LoadStore.cpp b/src/ARMInterpreter_LoadStore.cpp index e2726005..3df9acdd 100644 --- a/src/ARMInterpreter_LoadStore.cpp +++ b/src/ARMInterpreter_LoadStore.cpp @@ -260,8 +260,8 @@ A_IMPLEMENT_WB_LDRSTR(LDRB) offset += cpu->R[(cpu->CurInstr>>16) & 0xF]; \ u32 r = (cpu->CurInstr>>12) & 0xF; \ if (r&1) { A_UNK(cpu); return; } \ - if (!cpu->DataRead32 (offset , &cpu->R[r ])) {cpu->AddCycles_CDI(); return;} \ - u32 val; if (!cpu->DataRead32S(offset+4, &val)) {cpu->AddCycles_CDI(); return;} \ + if (!cpu->DataRead32 (offset , &cpu->R[r ])) {cpu->AddCycles_CDI_LDM(); return;} \ + u32 val; if (!cpu->DataRead32S(offset+4, &val)) {cpu->AddCycles_CDI_LDM(); return;} \ if (r == 14) cpu->JumpTo(((((ARMv5*)cpu)->CP15Control & (1<<15)) ? 
(val & ~0x1) : val), cpu->CurInstr & (1<<22)); /* restores cpsr presumably due to shared dna with ldm */ \ else cpu->R[r+1] = val; \ cpu->AddCycles_CDI_LDM(); \ @@ -272,8 +272,8 @@ A_IMPLEMENT_WB_LDRSTR(LDRB) u32 addr = cpu->R[(cpu->CurInstr>>16) & 0xF]; \ u32 r = (cpu->CurInstr>>12) & 0xF; \ if (r&1) { A_UNK(cpu); return; } \ - if (!cpu->DataRead32 (addr , &cpu->R[r ])) {cpu->AddCycles_CDI(); return;} \ - u32 val; if (!cpu->DataRead32S(addr+4, &val)) {cpu->AddCycles_CDI(); return;} \ + if (!cpu->DataRead32 (addr , &cpu->R[r ])) {cpu->AddCycles_CDI_LDM(); return;} \ + u32 val; if (!cpu->DataRead32S(addr+4, &val)) {cpu->AddCycles_CDI_LDM(); return;} \ if (r == 14) cpu->JumpTo(((((ARMv5*)cpu)->CP15Control & (1<<15)) ? (val & ~0x1) : val), cpu->CurInstr & (1<<22)); /* restores cpsr presumably due to shared dna with ldm */ \ else cpu->R[r+1] = val; \ cpu->AddCycles_CDI_LDM(); \ @@ -726,7 +726,7 @@ void T_STR_IMM(ARM* cpu) offset += cpu->R[(cpu->CurInstr >> 3) & 0x7]; cpu->DataWrite32(offset, cpu->R[cpu->CurInstr & 0x7]); - cpu->AddCycles_CD_LDR(); + cpu->AddCycles_CD_STR(); } void T_LDR_IMM(ARM* cpu) From 789ef21c700774211467cba3261b1c3d88b0a159 Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Fri, 12 Jul 2024 22:46:22 -0400 Subject: [PATCH 096/306] improve timings for S variants of multiply instructions on arm9 behavior seems to be a quirk of the way they made the interlock cycle mandatory --- src/ARM.h | 13 +++++- src/ARMInterpreter_ALU.cpp | 81 +++++++++++++++++++++++++++----------- 2 files changed, 70 insertions(+), 24 deletions(-) diff --git a/src/ARM.h b/src/ARM.h index dae5d96a..3bbc8735 100644 --- a/src/ARM.h +++ b/src/ARM.h @@ -272,12 +272,23 @@ public: void AddCycles_CI(s32 numI) override { - // code+internal + // code||internal s32 numC = CodeCycles; numI += 1; Cycles += std::max(numC, numI); } + void AddCycles_CIL(s32 numI, s32 numL) + { + // (code||internal)+forced interlock + // used by S variants of multiply 
instructions on the ARM9 + // seems that instead of adding extra hardware logic to allow for handling the memory stage of the instructions during the execute stage + // it instead seems to force a two cycle interlock allowing for the interlocked cycle to be executed without any special logic + presumably an extra cycle to set flags + s32 numC = CodeCycles; + numI += 1; + Cycles += std::max(numC, numI) + numL; + } + void AddCycles_CDI_LDR() override; void AddCycles_CDI_LDM() override; void AddCycles_CDI_SWP() override { AddCycles_CD_STR(); } // uses the same behavior as str diff --git a/src/ARMInterpreter_ALU.cpp b/src/ARMInterpreter_ALU.cpp index bc655996..e7b3ffb5 100644 --- a/src/ARMInterpreter_ALU.cpp +++ b/src/ARMInterpreter_ALU.cpp @@ -774,18 +774,23 @@ void A_MUL(ARM* cpu) if (cpu->Num==1) cpu->SetC(0); } - u32 cycles; if (cpu->Num == 0) - cycles = (cpu->CurInstr & (1<<20)) ? 3 : 1; + { + if (cpu->CurInstr & (1<<20)) + ((ARMv5*)cpu)->AddCycles_CIL(1, 2); + else + cpu->AddCycles_CI(1); + } else { + u32 cycles; if ((rs & 0xFFFFFF00) == 0x00000000 || (rs & 0xFFFFFF00) == 0xFFFFFF00) cycles = 1; else if ((rs & 0xFFFF0000) == 0x00000000 || (rs & 0xFFFF0000) == 0xFFFF0000) cycles = 2; else if ((rs & 0xFF000000) == 0x00000000 || (rs & 0xFF000000) == 0xFF000000) cycles = 3; else cycles = 4; - } - cpu->AddCycles_CI(cycles); + cpu->AddCycles_CI(cycles); + } } void A_MLA(ARM* cpu) @@ -804,18 +809,23 @@ void A_MLA(ARM* cpu) if (cpu->Num==1) cpu->SetC(0); } - u32 cycles; if (cpu->Num == 0) - cycles = (cpu->CurInstr & (1<<20)) ? 
3 : 1; + { + if (cpu->CurInstr & (1<<20)) + ((ARMv5*)cpu)->AddCycles_CIL(1, 2); + else + cpu->AddCycles_CI(1); + } else { + u32 cycles; if ((rs & 0xFFFFFF00) == 0x00000000 || (rs & 0xFFFFFF00) == 0xFFFFFF00) cycles = 2; else if ((rs & 0xFFFF0000) == 0x00000000 || (rs & 0xFFFF0000) == 0xFFFF0000) cycles = 3; else if ((rs & 0xFF000000) == 0x00000000 || (rs & 0xFF000000) == 0xFF000000) cycles = 4; else cycles = 5; - } - cpu->AddCycles_CI(cycles); + cpu->AddCycles_CI(cycles); + } } void A_UMULL(ARM* cpu) @@ -834,18 +844,24 @@ void A_UMULL(ARM* cpu) if (cpu->Num==1) cpu->SetC(0); } - u32 cycles; if (cpu->Num == 0) - cycles = (cpu->CurInstr & (1<<20)) ? 3 : 1; + { + if (cpu->CurInstr & (1<<20)) + ((ARMv5*)cpu)->AddCycles_CIL(1, 2); + else + cpu->AddCycles_CI(1); + } else { + u32 cycles; if ((rs & 0xFFFFFF00) == 0x00000000) cycles = 2; else if ((rs & 0xFFFF0000) == 0x00000000) cycles = 3; else if ((rs & 0xFF000000) == 0x00000000) cycles = 4; else cycles = 5; + + cpu->AddCycles_CI(cycles); } - cpu->AddCycles_CI(cycles); } void A_UMLAL(ARM* cpu) @@ -867,18 +883,24 @@ void A_UMLAL(ARM* cpu) if (cpu->Num==1) cpu->SetC(0); } - u32 cycles; if (cpu->Num == 0) - cycles = (cpu->CurInstr & (1<<20)) ? 3 : 1; + { + if (cpu->CurInstr & (1<<20)) + ((ARMv5*)cpu)->AddCycles_CIL(1, 2); + else + cpu->AddCycles_CI(1); + } else { + u32 cycles; if ((rs & 0xFFFFFF00) == 0x00000000) cycles = 2; else if ((rs & 0xFFFF0000) == 0x00000000) cycles = 3; else if ((rs & 0xFF000000) == 0x00000000) cycles = 4; else cycles = 5; + + cpu->AddCycles_CI(cycles); } - cpu->AddCycles_CI(cycles); } void A_SMULL(ARM* cpu) @@ -897,18 +919,24 @@ void A_SMULL(ARM* cpu) if (cpu->Num==1) cpu->SetC(0); } - u32 cycles; if (cpu->Num == 0) - cycles = (cpu->CurInstr & (1<<20)) ? 
3 : 1; + { + if (cpu->CurInstr & (1<<20)) + ((ARMv5*)cpu)->AddCycles_CIL(1, 2); + else + cpu->AddCycles_CI(1); + } else { + u32 cycles; if ((rs & 0xFFFFFF00) == 0x00000000 || (rs & 0xFFFFFF00) == 0xFFFFFF00) cycles = 2; else if ((rs & 0xFFFF0000) == 0x00000000 || (rs & 0xFFFF0000) == 0xFFFF0000) cycles = 3; else if ((rs & 0xFF000000) == 0x00000000 || (rs & 0xFF000000) == 0xFF000000) cycles = 4; else cycles = 5; + + cpu->AddCycles_CI(cycles); } - cpu->AddCycles_CI(cycles); } void A_SMLAL(ARM* cpu) @@ -930,18 +958,24 @@ void A_SMLAL(ARM* cpu) if (cpu->Num==1) cpu->SetC(0); } - u32 cycles; if (cpu->Num == 0) - cycles = (cpu->CurInstr & (1<<20)) ? 3 : 1; + { + if (cpu->CurInstr & (1<<20)) + ((ARMv5*)cpu)->AddCycles_CIL(1, 2); + else + cpu->AddCycles_CI(1); + } else { + u32 cycles; if ((rs & 0xFFFFFF00) == 0x00000000 || (rs & 0xFFFFFF00) == 0xFFFFFF00) cycles = 2; else if ((rs & 0xFFFF0000) == 0x00000000 || (rs & 0xFFFF0000) == 0xFFFF0000) cycles = 3; else if ((rs & 0xFF000000) == 0x00000000 || (rs & 0xFF000000) == 0xFF000000) cycles = 4; else cycles = 5; + + cpu->AddCycles_CI(cycles); } - cpu->AddCycles_CI(cycles); } void A_SMLAxy(ARM* cpu) @@ -1461,20 +1495,21 @@ void T_MUL_REG(ARM* cpu) cpu->SetNZ(res & 0x80000000, !res); - s32 cycles = 0; if (cpu->Num == 0) { - cycles += 3; + ((ARMv5*)cpu)->AddCycles_CIL(1, 2); // checkme? } else { + s32 cycles = 0; cpu->SetC(0); // carry flag destroyed, they say. whatever that means... 
if (a & 0xFF000000) cycles += 4; else if (a & 0x00FF0000) cycles += 3; else if (a & 0x0000FF00) cycles += 2; else cycles += 1; + + cpu->AddCycles_CI(cycles); } - cpu->AddCycles_CI(cycles); } void T_BIC_REG(ARM* cpu) From 764ee9ea1abf6aecebc96b253393f5b10a6a2381 Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Sat, 13 Jul 2024 08:01:39 -0400 Subject: [PATCH 097/306] improve timings further --- src/ARM.h | 11 ++++++----- src/ARMInterpreter_ALU.cpp | 22 +++++++++++----------- 2 files changed, 17 insertions(+), 16 deletions(-) diff --git a/src/ARM.h b/src/ARM.h index 3bbc8735..8ea553e6 100644 --- a/src/ARM.h +++ b/src/ARM.h @@ -278,12 +278,13 @@ public: Cycles += std::max(numC, numI); } - void AddCycles_CIL(s32 numI, s32 numL) + void AddCycles_CIF(s32 numI, s32 numL) { - // (code||internal)+forced interlock - // used by S variants of multiply instructions on the ARM9 - // seems that instead of adding extra hardware logic to allow for handling the memory stage of the instructions during the execute stage - // it instead seems to force a two cycle interlock allowing for the interlocked cycle to be executed without any special logic + presumably an extra cycle to set flags + // (code||internal)+forced + // used by certain multiply instructions + // seems likely that the execute stage occurs 2 cycles before the fetch stage ends....? 
+ // could also be in some way related to interlock and the memory stage + // though that doesn't explain why some non-S variants trigger this s32 numC = CodeCycles; numI += 1; Cycles += std::max(numC, numI) + numL; diff --git a/src/ARMInterpreter_ALU.cpp b/src/ARMInterpreter_ALU.cpp index e7b3ffb5..00af1dac 100644 --- a/src/ARMInterpreter_ALU.cpp +++ b/src/ARMInterpreter_ALU.cpp @@ -777,7 +777,7 @@ void A_MUL(ARM* cpu) if (cpu->Num == 0) { if (cpu->CurInstr & (1<<20)) - ((ARMv5*)cpu)->AddCycles_CIL(1, 2); + ((ARMv5*)cpu)->AddCycles_CIF(1, 2); else cpu->AddCycles_CI(1); } @@ -812,7 +812,7 @@ void A_MLA(ARM* cpu) if (cpu->Num == 0) { if (cpu->CurInstr & (1<<20)) - ((ARMv5*)cpu)->AddCycles_CIL(1, 2); + ((ARMv5*)cpu)->AddCycles_CIF(1, 2); else cpu->AddCycles_CI(1); } @@ -847,9 +847,9 @@ void A_UMULL(ARM* cpu) if (cpu->Num == 0) { if (cpu->CurInstr & (1<<20)) - ((ARMv5*)cpu)->AddCycles_CIL(1, 2); + ((ARMv5*)cpu)->AddCycles_CIF(1, 3); else - cpu->AddCycles_CI(1); + ((ARMv5*)cpu)->AddCycles_CIF(1, 1); } else { @@ -886,9 +886,9 @@ void A_UMLAL(ARM* cpu) if (cpu->Num == 0) { if (cpu->CurInstr & (1<<20)) - ((ARMv5*)cpu)->AddCycles_CIL(1, 2); + ((ARMv5*)cpu)->AddCycles_CIF(1, 3); else - cpu->AddCycles_CI(1); + ((ARMv5*)cpu)->AddCycles_CIF(1, 1); } else { @@ -922,9 +922,9 @@ void A_SMULL(ARM* cpu) if (cpu->Num == 0) { if (cpu->CurInstr & (1<<20)) - ((ARMv5*)cpu)->AddCycles_CIL(1, 2); + ((ARMv5*)cpu)->AddCycles_CIF(1, 3); else - cpu->AddCycles_CI(1); + ((ARMv5*)cpu)->AddCycles_CIF(1, 1); } else { @@ -961,9 +961,9 @@ void A_SMLAL(ARM* cpu) if (cpu->Num == 0) { if (cpu->CurInstr & (1<<20)) - ((ARMv5*)cpu)->AddCycles_CIL(1, 2); + ((ARMv5*)cpu)->AddCycles_CIF(1, 3); else - cpu->AddCycles_CI(1); + ((ARMv5*)cpu)->AddCycles_CIF(1, 1); } else { @@ -1497,7 +1497,7 @@ void T_MUL_REG(ARM* cpu) if (cpu->Num == 0) { - ((ARMv5*)cpu)->AddCycles_CIL(1, 2); // checkme? 
+ ((ARMv5*)cpu)->AddCycles_CIF(1, 2); } else { From 36f4f2c5d3f360184756d1e2261a28d4a362b2cc Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Fri, 19 Jul 2024 17:52:26 -0400 Subject: [PATCH 098/306] Revert "improve timings further" This reverts commit 764ee9ea1abf6aecebc96b253393f5b10a6a2381. --- src/ARM.h | 11 +++++------ src/ARMInterpreter_ALU.cpp | 22 +++++++++++----------- 2 files changed, 16 insertions(+), 17 deletions(-) diff --git a/src/ARM.h b/src/ARM.h index 8ea553e6..3bbc8735 100644 --- a/src/ARM.h +++ b/src/ARM.h @@ -278,13 +278,12 @@ public: Cycles += std::max(numC, numI); } - void AddCycles_CIF(s32 numI, s32 numL) + void AddCycles_CIL(s32 numI, s32 numL) { - // (code||internal)+forced - // used by certain multiply instructions - // seems likely that the execute stage occurs 2 cycles before the fetch stage ends....? - // could also be in some way related to interlock and the memory stage - // though that doesn't explain why some non-S variants trigger this + // (code||internal)+forced interlock + // used by S variants of multiply instructions on the ARM9 + // seems that instead of adding extra hardware logic to allow for handling the memory stage of the instructions during the execute stage + // it instead seems to force a two cycle interlock allowing for the interlocked cycle to be executed without any special logic + presumably an extra cycle to set flags s32 numC = CodeCycles; numI += 1; Cycles += std::max(numC, numI) + numL; diff --git a/src/ARMInterpreter_ALU.cpp b/src/ARMInterpreter_ALU.cpp index 00af1dac..e7b3ffb5 100644 --- a/src/ARMInterpreter_ALU.cpp +++ b/src/ARMInterpreter_ALU.cpp @@ -777,7 +777,7 @@ void A_MUL(ARM* cpu) if (cpu->Num == 0) { if (cpu->CurInstr & (1<<20)) - ((ARMv5*)cpu)->AddCycles_CIF(1, 2); + ((ARMv5*)cpu)->AddCycles_CIL(1, 2); else cpu->AddCycles_CI(1); } @@ -812,7 +812,7 @@ void A_MLA(ARM* cpu) if (cpu->Num == 0) { if (cpu->CurInstr & (1<<20)) - ((ARMv5*)cpu)->AddCycles_CIF(1, 2); + 
((ARMv5*)cpu)->AddCycles_CIL(1, 2); else cpu->AddCycles_CI(1); } @@ -847,9 +847,9 @@ void A_UMULL(ARM* cpu) if (cpu->Num == 0) { if (cpu->CurInstr & (1<<20)) - ((ARMv5*)cpu)->AddCycles_CIF(1, 3); + ((ARMv5*)cpu)->AddCycles_CIL(1, 2); else - ((ARMv5*)cpu)->AddCycles_CIF(1, 1); + cpu->AddCycles_CI(1); } else { @@ -886,9 +886,9 @@ void A_UMLAL(ARM* cpu) if (cpu->Num == 0) { if (cpu->CurInstr & (1<<20)) - ((ARMv5*)cpu)->AddCycles_CIF(1, 3); + ((ARMv5*)cpu)->AddCycles_CIL(1, 2); else - ((ARMv5*)cpu)->AddCycles_CIF(1, 1); + cpu->AddCycles_CI(1); } else { @@ -922,9 +922,9 @@ void A_SMULL(ARM* cpu) if (cpu->Num == 0) { if (cpu->CurInstr & (1<<20)) - ((ARMv5*)cpu)->AddCycles_CIF(1, 3); + ((ARMv5*)cpu)->AddCycles_CIL(1, 2); else - ((ARMv5*)cpu)->AddCycles_CIF(1, 1); + cpu->AddCycles_CI(1); } else { @@ -961,9 +961,9 @@ void A_SMLAL(ARM* cpu) if (cpu->Num == 0) { if (cpu->CurInstr & (1<<20)) - ((ARMv5*)cpu)->AddCycles_CIF(1, 3); + ((ARMv5*)cpu)->AddCycles_CIL(1, 2); else - ((ARMv5*)cpu)->AddCycles_CIF(1, 1); + cpu->AddCycles_CI(1); } else { @@ -1497,7 +1497,7 @@ void T_MUL_REG(ARM* cpu) if (cpu->Num == 0) { - ((ARMv5*)cpu)->AddCycles_CIF(1, 2); + ((ARMv5*)cpu)->AddCycles_CIL(1, 2); // checkme? } else { From 13578a3cc95ab6c77c913a5167213380ede402f4 Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Fri, 19 Jul 2024 17:52:28 -0400 Subject: [PATCH 099/306] Revert "improve timings for S variants of multiply instructions on arm9" This reverts commit 789ef21c700774211467cba3261b1c3d88b0a159. 
--- src/ARM.h | 13 +----- src/ARMInterpreter_ALU.cpp | 81 +++++++++++--------------------------- 2 files changed, 24 insertions(+), 70 deletions(-) diff --git a/src/ARM.h b/src/ARM.h index 3bbc8735..dae5d96a 100644 --- a/src/ARM.h +++ b/src/ARM.h @@ -272,23 +272,12 @@ public: void AddCycles_CI(s32 numI) override { - // code||internal + // code+internal s32 numC = CodeCycles; numI += 1; Cycles += std::max(numC, numI); } - void AddCycles_CIL(s32 numI, s32 numL) - { - // (code||internal)+forced interlock - // used by S variants of multiply instructions on the ARM9 - // seems that instead of adding extra hardware logic to allow for handling the memory stage of the instructions during the execute stage - // it instead seems to force a two cycle interlock allowing for the interlocked cycle to be executed without any special logic + presumably an extra cycle to set flags - s32 numC = CodeCycles; - numI += 1; - Cycles += std::max(numC, numI) + numL; - } - void AddCycles_CDI_LDR() override; void AddCycles_CDI_LDM() override; void AddCycles_CDI_SWP() override { AddCycles_CD_STR(); } // uses the same behavior as str diff --git a/src/ARMInterpreter_ALU.cpp b/src/ARMInterpreter_ALU.cpp index e7b3ffb5..bc655996 100644 --- a/src/ARMInterpreter_ALU.cpp +++ b/src/ARMInterpreter_ALU.cpp @@ -774,23 +774,18 @@ void A_MUL(ARM* cpu) if (cpu->Num==1) cpu->SetC(0); } + u32 cycles; if (cpu->Num == 0) - { - if (cpu->CurInstr & (1<<20)) - ((ARMv5*)cpu)->AddCycles_CIL(1, 2); - else - cpu->AddCycles_CI(1); - } + cycles = (cpu->CurInstr & (1<<20)) ? 
3 : 1; else { - u32 cycles; if ((rs & 0xFFFFFF00) == 0x00000000 || (rs & 0xFFFFFF00) == 0xFFFFFF00) cycles = 1; else if ((rs & 0xFFFF0000) == 0x00000000 || (rs & 0xFFFF0000) == 0xFFFF0000) cycles = 2; else if ((rs & 0xFF000000) == 0x00000000 || (rs & 0xFF000000) == 0xFF000000) cycles = 3; else cycles = 4; - - cpu->AddCycles_CI(cycles); } + + cpu->AddCycles_CI(cycles); } void A_MLA(ARM* cpu) @@ -809,23 +804,18 @@ void A_MLA(ARM* cpu) if (cpu->Num==1) cpu->SetC(0); } + u32 cycles; if (cpu->Num == 0) - { - if (cpu->CurInstr & (1<<20)) - ((ARMv5*)cpu)->AddCycles_CIL(1, 2); - else - cpu->AddCycles_CI(1); - } + cycles = (cpu->CurInstr & (1<<20)) ? 3 : 1; else { - u32 cycles; if ((rs & 0xFFFFFF00) == 0x00000000 || (rs & 0xFFFFFF00) == 0xFFFFFF00) cycles = 2; else if ((rs & 0xFFFF0000) == 0x00000000 || (rs & 0xFFFF0000) == 0xFFFF0000) cycles = 3; else if ((rs & 0xFF000000) == 0x00000000 || (rs & 0xFF000000) == 0xFF000000) cycles = 4; else cycles = 5; - - cpu->AddCycles_CI(cycles); } + + cpu->AddCycles_CI(cycles); } void A_UMULL(ARM* cpu) @@ -844,24 +834,18 @@ void A_UMULL(ARM* cpu) if (cpu->Num==1) cpu->SetC(0); } + u32 cycles; if (cpu->Num == 0) - { - if (cpu->CurInstr & (1<<20)) - ((ARMv5*)cpu)->AddCycles_CIL(1, 2); - else - cpu->AddCycles_CI(1); - } + cycles = (cpu->CurInstr & (1<<20)) ? 3 : 1; else { - u32 cycles; if ((rs & 0xFFFFFF00) == 0x00000000) cycles = 2; else if ((rs & 0xFFFF0000) == 0x00000000) cycles = 3; else if ((rs & 0xFF000000) == 0x00000000) cycles = 4; else cycles = 5; - - cpu->AddCycles_CI(cycles); } + cpu->AddCycles_CI(cycles); } void A_UMLAL(ARM* cpu) @@ -883,24 +867,18 @@ void A_UMLAL(ARM* cpu) if (cpu->Num==1) cpu->SetC(0); } + u32 cycles; if (cpu->Num == 0) - { - if (cpu->CurInstr & (1<<20)) - ((ARMv5*)cpu)->AddCycles_CIL(1, 2); - else - cpu->AddCycles_CI(1); - } + cycles = (cpu->CurInstr & (1<<20)) ? 
3 : 1; else { - u32 cycles; if ((rs & 0xFFFFFF00) == 0x00000000) cycles = 2; else if ((rs & 0xFFFF0000) == 0x00000000) cycles = 3; else if ((rs & 0xFF000000) == 0x00000000) cycles = 4; else cycles = 5; - - cpu->AddCycles_CI(cycles); } + cpu->AddCycles_CI(cycles); } void A_SMULL(ARM* cpu) @@ -919,24 +897,18 @@ void A_SMULL(ARM* cpu) if (cpu->Num==1) cpu->SetC(0); } + u32 cycles; if (cpu->Num == 0) - { - if (cpu->CurInstr & (1<<20)) - ((ARMv5*)cpu)->AddCycles_CIL(1, 2); - else - cpu->AddCycles_CI(1); - } + cycles = (cpu->CurInstr & (1<<20)) ? 3 : 1; else { - u32 cycles; if ((rs & 0xFFFFFF00) == 0x00000000 || (rs & 0xFFFFFF00) == 0xFFFFFF00) cycles = 2; else if ((rs & 0xFFFF0000) == 0x00000000 || (rs & 0xFFFF0000) == 0xFFFF0000) cycles = 3; else if ((rs & 0xFF000000) == 0x00000000 || (rs & 0xFF000000) == 0xFF000000) cycles = 4; else cycles = 5; - - cpu->AddCycles_CI(cycles); } + cpu->AddCycles_CI(cycles); } void A_SMLAL(ARM* cpu) @@ -958,24 +930,18 @@ void A_SMLAL(ARM* cpu) if (cpu->Num==1) cpu->SetC(0); } + u32 cycles; if (cpu->Num == 0) - { - if (cpu->CurInstr & (1<<20)) - ((ARMv5*)cpu)->AddCycles_CIL(1, 2); - else - cpu->AddCycles_CI(1); - } + cycles = (cpu->CurInstr & (1<<20)) ? 3 : 1; else { - u32 cycles; if ((rs & 0xFFFFFF00) == 0x00000000 || (rs & 0xFFFFFF00) == 0xFFFFFF00) cycles = 2; else if ((rs & 0xFFFF0000) == 0x00000000 || (rs & 0xFFFF0000) == 0xFFFF0000) cycles = 3; else if ((rs & 0xFF000000) == 0x00000000 || (rs & 0xFF000000) == 0xFF000000) cycles = 4; else cycles = 5; - - cpu->AddCycles_CI(cycles); } + cpu->AddCycles_CI(cycles); } void A_SMLAxy(ARM* cpu) @@ -1495,21 +1461,20 @@ void T_MUL_REG(ARM* cpu) cpu->SetNZ(res & 0x80000000, !res); + s32 cycles = 0; if (cpu->Num == 0) { - ((ARMv5*)cpu)->AddCycles_CIL(1, 2); // checkme? + cycles += 3; } else { - s32 cycles = 0; cpu->SetC(0); // carry flag destroyed, they say. whatever that means... 
if (a & 0xFF000000) cycles += 4; else if (a & 0x00FF0000) cycles += 3; else if (a & 0x0000FF00) cycles += 2; else cycles += 1; - - cpu->AddCycles_CI(cycles); } + cpu->AddCycles_CI(cycles); } void T_BIC_REG(ARM* cpu) From 7cd50e7b56755f7def5aeb3d7a2f01037af25928 Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Fri, 19 Jul 2024 17:56:43 -0400 Subject: [PATCH 100/306] fix some multiply timings --- src/ARMInterpreter_ALU.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/ARMInterpreter_ALU.cpp b/src/ARMInterpreter_ALU.cpp index bc655996..37c79904 100644 --- a/src/ARMInterpreter_ALU.cpp +++ b/src/ARMInterpreter_ALU.cpp @@ -836,7 +836,7 @@ void A_UMULL(ARM* cpu) u32 cycles; if (cpu->Num == 0) - cycles = (cpu->CurInstr & (1<<20)) ? 3 : 1; + cycles = (cpu->CurInstr & (1<<20)) ? 4 : 2; else { if ((rs & 0xFFFFFF00) == 0x00000000) cycles = 2; @@ -869,7 +869,7 @@ void A_UMLAL(ARM* cpu) u32 cycles; if (cpu->Num == 0) - cycles = (cpu->CurInstr & (1<<20)) ? 3 : 1; + cycles = (cpu->CurInstr & (1<<20)) ? 4 : 2; else { if ((rs & 0xFFFFFF00) == 0x00000000) cycles = 2; @@ -899,7 +899,7 @@ void A_SMULL(ARM* cpu) u32 cycles; if (cpu->Num == 0) - cycles = (cpu->CurInstr & (1<<20)) ? 3 : 1; + cycles = (cpu->CurInstr & (1<<20)) ? 4 : 2; else { if ((rs & 0xFFFFFF00) == 0x00000000 || (rs & 0xFFFFFF00) == 0xFFFFFF00) cycles = 2; @@ -932,7 +932,7 @@ void A_SMLAL(ARM* cpu) u32 cycles; if (cpu->Num == 0) - cycles = (cpu->CurInstr & (1<<20)) ? 3 : 1; + cycles = (cpu->CurInstr & (1<<20)) ? 
4 : 2; else { if ((rs & 0xFFFFFF00) == 0x00000000 || (rs & 0xFFFFFF00) == 0xFFFFFF00) cycles = 2; From 3c936d84b3b9821ba211f4c9fb5235c493260ad2 Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Sat, 3 Aug 2024 16:20:50 -0400 Subject: [PATCH 101/306] improve mrs, mrc timings --- src/ARMInterpreter.cpp | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/ARMInterpreter.cpp b/src/ARMInterpreter.cpp index f9623147..e4b23641 100644 --- a/src/ARMInterpreter.cpp +++ b/src/ARMInterpreter.cpp @@ -202,7 +202,9 @@ void A_MRS(ARM* cpu) psr = cpu->CPSR; cpu->R[(cpu->CurInstr>>12) & 0xF] = psr; - cpu->AddCycles_C(); + + if (cpu->Num != 1) cpu->AddCycles_CI(1); // arm9 + else cpu->AddCycles_C(); // arm7 } @@ -261,7 +263,8 @@ void A_MRC(ARM* cpu) return A_UNK(cpu); // TODO: check what kind of exception it really is } - cpu->AddCycles_CI(2 + 1); // TODO: checkme + if (cpu->Num != 1) cpu->AddCycles_CI(1); // checkme + else cpu->AddCycles_CI(2 + 1); // TODO: checkme } From 2e421e29e355f7e21e5419a5c0735044325676b6 Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Sat, 27 Jul 2024 15:02:13 -0400 Subject: [PATCH 102/306] cache should be disabled when pu is disabled --- src/CP15.cpp | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/CP15.cpp b/src/CP15.cpp index d5898ac8..cba249fc 100644 --- a/src/CP15.cpp +++ b/src/CP15.cpp @@ -266,8 +266,6 @@ void ARMv5::UpdatePURegions(bool update_all) // PU disabled u8 mask = 0x07; - if (CP15Control & (1<<2)) mask |= 0x30; - if (CP15Control & (1<<12)) mask |= 0x40; memset(PU_UserMap, mask, 0x100000); memset(PU_PrivMap, mask, 0x100000); From 4b703d24b53b1cba9c7ea8324bf89208e72a8a0b Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Mon, 22 Jul 2024 19:39:55 -0400 Subject: [PATCH 103/306] improve msr timings for arm9 --- src/ARMInterpreter.cpp | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git 
a/src/ARMInterpreter.cpp b/src/ARMInterpreter.cpp index e4b23641..0122e082 100644 --- a/src/ARMInterpreter.cpp +++ b/src/ARMInterpreter.cpp @@ -121,7 +121,8 @@ void A_MSR_IMM(ARM* cpu) if (!(cpu->CurInstr & (1<<22))) cpu->UpdateMode(oldpsr, cpu->CPSR); - cpu->AddCycles_C(); + if ((cpu->Num != 1) && (cpu->CurInstr & (0x7<<16))) cpu->AddCycles_CI(2); + else cpu->AddCycles_C(); } void A_MSR_REG(ARM* cpu) @@ -174,7 +175,8 @@ void A_MSR_REG(ARM* cpu) if (!(cpu->CurInstr & (1<<22))) cpu->UpdateMode(oldpsr, cpu->CPSR); - cpu->AddCycles_C(); + if ((cpu->Num != 1) && (cpu->CurInstr & (0x7<<16))) cpu->AddCycles_CI(2); + else cpu->AddCycles_C(); } void A_MRS(ARM* cpu) From ab2a8f128f255895abd323336dea6d257ef42c22 Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Sun, 4 Aug 2024 14:45:28 -0400 Subject: [PATCH 104/306] revert timing tweaks, finish thumb interwork code --- src/ARM.cpp | 126 +------------------------------ src/ARM.h | 45 +++++++---- src/ARMInterpreter_LoadStore.cpp | 106 +++++++++++++------------- src/CP15.cpp | 18 ----- 4 files changed, 83 insertions(+), 212 deletions(-) diff --git a/src/ARM.cpp b/src/ARM.cpp index e1f93a58..16c53dc1 100644 --- a/src/ARM.cpp +++ b/src/ARM.cpp @@ -300,9 +300,6 @@ void ARMv5::JumpTo(u32 addr, bool restorecpsr) u32 oldregion = R[15] >> 24; u32 newregion = addr >> 24; - - if (addr < ITCMSize) CodeRegion = Mem9_ITCM; - else CodeRegion = NDS.ARM9Regions[addr >> 14]; RegionCodeCycles = MemTimings[addr >> 12][0]; @@ -644,7 +641,7 @@ void ARMv5::Execute() R[15] += 2; CurInstr = NextInstr[0]; NextInstr[0] = NextInstr[1]; - if (R[15] & 0x2) { NextInstr[1] >>= 16; CodeCycles = 1; } + if (R[15] & 0x2) { NextInstr[1] >>= 16; CodeCycles = 0; } else NextInstr[1] = CodeRead32(R[15], false); // actually execute @@ -1256,127 +1253,6 @@ bool ARMv4::DataWrite32S(u32 addr, u32 val, bool dataabort) } -void ARMv5::AddCycles_CD_STR() -{ - s32 numC = (R[15] & 0x2) ? 
0 : CodeCycles; - s32 numD = DataCycles; - - s32 early; - if (DataRegion == Mem9_ITCM) - { - early = (CodeRegion == Mem9_ITCM) ? 0 : 2; - } - else if (DataRegion == Mem9_DTCM) - { - early = 2; - } - else if (DataRegion == Mem9_MainRAM) - { - early = (CodeRegion == Mem9_MainRAM) ? 0 : 18; // CHECKME: how early can main ram be? - } - else early = (DataRegion == CodeRegion) ? 4 : 6; - - s32 code = numC - early; - if (code < 0) code = 0; - Cycles += std::max(code + numD, numC); -} - -void ARMv5::AddCycles_CD_STM() -{ - s32 numC = (R[15] & 0x2) ? 0 : CodeCycles; - s32 numD = DataCycles; - - s32 early; - if (DataRegion == Mem9_ITCM) - { - early = (CodeRegion == Mem9_ITCM) ? -1 : 0; // stm adds either: no penalty or benefit to itcm loads, or a 1 cycle penalty if executing from itcm. - } - else if (DataRegion == Mem9_DTCM) - { - early = 2; - } - else if (DataRegion == Mem9_MainRAM) - { - early = (CodeRegion == Mem9_MainRAM) ? 0 : 18; // CHECKME: how early can main ram be? - } - else early = (DataRegion == CodeRegion) ? 4 : 6; - - s32 code = numC - early; - if (code < 0) code = 0; - Cycles += std::max(code + numD, numC); -} - -void ARMv5::AddCycles_CDI_LDR() -{ - // LDR cycles. ARM9 seems to skip the internal cycle here. - s32 numC = (R[15] & 0x2) ? 0 : CodeCycles; - s32 numD = DataCycles; - - // if a 32 bit bus, start 2 cycles early; else, start 4 cycles early - s32 early; - if (DataRegion == Mem9_ITCM) - { - early = (CodeRegion == Mem9_ITCM) ? 0 : 2; - } - else if (DataRegion == Mem9_DTCM) - { - early = 2; - } - else if (DataRegion == Mem9_MainRAM) - { - early = (CodeRegion == Mem9_MainRAM) ? 0 : 6; - } - else early = 6; - - s32 code = numC - early; - if (code < 0) code = 0; - Cycles += std::max(code + numD, numC); -} - -void ARMv5::AddCycles_CDI_LDM() -{ - // LDM cycles. ARM9 seems to skip the internal cycle here. - s32 numC = (R[15] & 0x2) ? 
0 : CodeCycles; - s32 numD = DataCycles; - - // if a 32 bit bus, start 2 cycles early; else, start 4 cycles early - s32 early; - switch (DataRegion) - { - case 0: // background region; - case Mem9_DTCM: - case Mem9_BIOS: - case Mem9_WRAM: - case Mem9_IO: - case Mem9_Pal: // CHECKME - default: - early = 2; - break; - - case Mem9_OAM: // CHECKME - case Mem9_GBAROM: - case Mem9_GBARAM: - early = 4; - break; - - case Mem9_MainRAM: - early = (CodeRegion == Mem9_MainRAM) ? 0 : 4; - break; - - case Mem9_VRAM: // the dsi can toggle the bus width of vram between 32 and 16 bit - early = (NDS.ConsoleType == 0 || !(((DSi&)NDS).SCFG_EXT[0] & (1<<13))) ? 4 : 2; - break; - - case Mem9_ITCM: // itcm data fetches cannot be done at the same time as a code fetch, it'll even incurr a 1 cycle penalty when executing from itcm - early = (CodeRegion == Mem9_ITCM) ? -1 : 0; - break; - } - - s32 code = numC - early; - if (code < 0) code = 0; - Cycles += std::max(code + numD, numC); -} - void ARMv4::AddCycles_C() { // code only. this code fetch is sequential. diff --git a/src/ARM.h b/src/ARM.h index dae5d96a..7558f7a3 100644 --- a/src/ARM.h +++ b/src/ARM.h @@ -266,23 +266,41 @@ public: void AddCycles_C() override { // code only. always nonseq 32-bit for ARM9. - s32 numC = CodeCycles; + s32 numC = (R[15] & 0x2) ? 0 : CodeCycles; Cycles += numC; } void AddCycles_CI(s32 numI) override { // code+internal - s32 numC = CodeCycles; - numI += 1; - Cycles += std::max(numC, numI); + s32 numC = (R[15] & 0x2) ? 0 : CodeCycles; + Cycles += numC + numI; } - void AddCycles_CDI_LDR() override; - void AddCycles_CDI_LDM() override; - void AddCycles_CDI_SWP() override { AddCycles_CD_STR(); } // uses the same behavior as str - void AddCycles_CD_STR() override; - void AddCycles_CD_STM() override; + void AddCycles_CDI() override + { + // LDR/LDM cycles. ARM9 seems to skip the internal cycle there. + // TODO: ITCM data fetches shouldn't be parallelized, they say + s32 numC = (R[15] & 0x2) ? 
0 : CodeCycles; + s32 numD = DataCycles; + + //if (DataRegion != CodeRegion) + Cycles += std::max(numC + numD - 6, std::max(numC, numD)); + //else + // Cycles += numC + numD; + } + + void AddCycles_CD() override + { + // TODO: ITCM data fetches shouldn't be parallelized, they say + s32 numC = (R[15] & 0x2) ? 0 : CodeCycles; + s32 numD = DataCycles; + + //if (DataRegion != CodeRegion) + Cycles += std::max(numC + numD - 6, std::max(numC, numD)); + //else + // Cycles += numC + numD; + } void GetCodeMemRegion(u32 addr, MemRegion* region); @@ -396,13 +414,8 @@ public: bool DataWrite32S(u32 addr, u32 val, bool dataabort = false) override; void AddCycles_C() override; void AddCycles_CI(s32 num) override; - void AddCycles_CDI(); - void AddCycles_CDI_LDR() override { AddCycles_CDI(); } - void AddCycles_CDI_LDM() override { AddCycles_CDI(); } - void AddCycles_CDI_SWP() override { AddCycles_CDI(); } // checkme? - void AddCycles_CD(); - void AddCycles_CD_STR() override { AddCycles_CD(); } - void AddCycles_CD_STM() override { AddCycles_CD(); } + void AddCycles_CDI() override; + void AddCycles_CD() override; protected: u8 BusRead8(u32 addr) override; diff --git a/src/ARMInterpreter_LoadStore.cpp b/src/ARMInterpreter_LoadStore.cpp index 3df9acdd..580c66fc 100644 --- a/src/ARMInterpreter_LoadStore.cpp +++ b/src/ARMInterpreter_LoadStore.cpp @@ -66,7 +66,7 @@ namespace melonDS::ARMInterpreter if (((cpu->CurInstr>>12) & 0xF) == 0xF) \ storeval += 4; \ bool dataabort = !cpu->DataWrite32(offset, storeval); \ - cpu->AddCycles_CD_STR(); \ + cpu->AddCycles_CD(); \ if (dataabort) return; \ if (cpu->CurInstr & (1<<21)) cpu->R[(cpu->CurInstr>>16) & 0xF] = offset; @@ -77,7 +77,7 @@ namespace melonDS::ARMInterpreter if (((cpu->CurInstr>>12) & 0xF) == 0xF) \ storeval += 4; \ bool dataabort = !cpu->DataWrite32(addr, storeval); \ - cpu->AddCycles_CD_STR(); \ + cpu->AddCycles_CD(); \ if (dataabort) return; \ cpu->R[(cpu->CurInstr>>16) & 0xF] += offset; @@ -86,7 +86,7 @@ namespace 
melonDS::ARMInterpreter u32 storeval = cpu->R[(cpu->CurInstr>>12) & 0xF]; \ if (((cpu->CurInstr>>12) & 0xF) == 15) storeval+=4; \ bool dataabort = !cpu->DataWrite8(offset, storeval); \ - cpu->AddCycles_CD_STR(); \ + cpu->AddCycles_CD(); \ if (dataabort) return; \ if (cpu->CurInstr & (1<<21)) cpu->R[(cpu->CurInstr>>16) & 0xF] = offset; @@ -96,20 +96,20 @@ namespace melonDS::ARMInterpreter u32 storeval = cpu->R[(cpu->CurInstr>>12) & 0xF]; \ if (((cpu->CurInstr>>12) & 0xF) == 15) storeval+=4; \ bool dataabort = !cpu->DataWrite8(addr, storeval); \ - cpu->AddCycles_CD_STR(); \ + cpu->AddCycles_CD(); \ if (dataabort) return; \ cpu->R[(cpu->CurInstr>>16) & 0xF] += offset; #define A_LDR \ offset += cpu->R[(cpu->CurInstr>>16) & 0xF]; \ u32 val; bool dataabort = !cpu->DataRead32(offset, &val); \ - cpu->AddCycles_CDI_LDR(); \ + cpu->AddCycles_CDI(); \ if (dataabort) return; \ val = ROR(val, ((offset&0x3)<<3)); \ if (cpu->CurInstr & (1<<21)) cpu->R[(cpu->CurInstr>>16) & 0xF] = offset; \ if (((cpu->CurInstr>>12) & 0xF) == 15) \ { \ - if (cpu->Num==1) val &= ~0x1; \ + if (cpu->Num==1 || (((ARMv5*)cpu)->CP15Control & (1<<15))) val &= ~0x1; \ cpu->JumpTo(val); \ } \ else \ @@ -121,13 +121,13 @@ namespace melonDS::ARMInterpreter #define A_LDR_POST \ u32 addr = cpu->R[(cpu->CurInstr>>16) & 0xF]; \ u32 val; bool dataabort = !cpu->DataRead32(addr, &val); \ - cpu->AddCycles_CDI_LDR(); \ + cpu->AddCycles_CDI(); \ if (dataabort) return; \ val = ROR(val, ((addr&0x3)<<3)); \ cpu->R[(cpu->CurInstr>>16) & 0xF] += offset; \ if (((cpu->CurInstr>>12) & 0xF) == 15) \ { \ - if (cpu->Num==1) val &= ~0x1; \ + if (cpu->Num==1 || (((ARMv5*)cpu)->CP15Control & (1<<15))) val &= ~0x1; \ cpu->JumpTo(val); \ } \ else \ @@ -138,7 +138,7 @@ namespace melonDS::ARMInterpreter #define A_LDRB \ offset += cpu->R[(cpu->CurInstr>>16) & 0xF]; \ u32 val; bool dataabort = !cpu->DataRead8(offset, &val); \ - cpu->AddCycles_CDI_LDR(); \ + cpu->AddCycles_CDI(); \ if (dataabort) return; \ if (cpu->CurInstr & (1<<21)) 
cpu->R[(cpu->CurInstr>>16) & 0xF] = offset; \ if (((cpu->CurInstr>>12) & 0xF) == 15) cpu->JumpTo8_16Bit(val); \ @@ -148,7 +148,7 @@ namespace melonDS::ARMInterpreter #define A_LDRB_POST \ u32 addr = cpu->R[(cpu->CurInstr>>16) & 0xF]; \ u32 val; bool dataabort = !cpu->DataRead8(addr, &val); \ - cpu->AddCycles_CDI_LDR(); \ + cpu->AddCycles_CDI(); \ if (dataabort) return; \ cpu->R[(cpu->CurInstr>>16) & 0xF] += offset; \ if (((cpu->CurInstr>>12) & 0xF) == 15) cpu->JumpTo8_16Bit(val); \ @@ -240,7 +240,7 @@ A_IMPLEMENT_WB_LDRSTR(LDRB) u32 storeval = cpu->R[(cpu->CurInstr>>12) & 0xF]; \ if (((cpu->CurInstr>>12) & 0xF) == 15) storeval+=4; \ bool dataabort = !cpu->DataWrite16(offset, storeval); \ - cpu->AddCycles_CD_STR(); \ + cpu->AddCycles_CD(); \ if (dataabort) return; \ if (cpu->CurInstr & (1<<21)) cpu->R[(cpu->CurInstr>>16) & 0xF] = offset; @@ -249,7 +249,7 @@ A_IMPLEMENT_WB_LDRSTR(LDRB) u32 storeval = cpu->R[(cpu->CurInstr>>12) & 0xF]; \ if (((cpu->CurInstr>>12) & 0xF) == 15) storeval+=4; \ bool dataabort = !cpu->DataWrite16(addr, storeval); \ - cpu->AddCycles_CD_STR(); \ + cpu->AddCycles_CD(); \ if (dataabort) return; \ cpu->R[(cpu->CurInstr>>16) & 0xF] += offset; @@ -260,11 +260,11 @@ A_IMPLEMENT_WB_LDRSTR(LDRB) offset += cpu->R[(cpu->CurInstr>>16) & 0xF]; \ u32 r = (cpu->CurInstr>>12) & 0xF; \ if (r&1) { A_UNK(cpu); return; } \ - if (!cpu->DataRead32 (offset , &cpu->R[r ])) {cpu->AddCycles_CDI_LDM(); return;} \ - u32 val; if (!cpu->DataRead32S(offset+4, &val)) {cpu->AddCycles_CDI_LDM(); return;} \ + if (!cpu->DataRead32 (offset , &cpu->R[r ])) {cpu->AddCycles_CDI(); return;} \ + u32 val; if (!cpu->DataRead32S(offset+4, &val)) {cpu->AddCycles_CDI(); return;} \ if (r == 14) cpu->JumpTo(((((ARMv5*)cpu)->CP15Control & (1<<15)) ? 
(val & ~0x1) : val), cpu->CurInstr & (1<<22)); /* restores cpsr presumably due to shared dna with ldm */ \ else cpu->R[r+1] = val; \ - cpu->AddCycles_CDI_LDM(); \ + cpu->AddCycles_CDI(); \ if (cpu->CurInstr & (1<<21)) cpu->R[(cpu->CurInstr>>16) & 0xF] = offset; #define A_LDRD_POST \ @@ -272,11 +272,11 @@ A_IMPLEMENT_WB_LDRSTR(LDRB) u32 addr = cpu->R[(cpu->CurInstr>>16) & 0xF]; \ u32 r = (cpu->CurInstr>>12) & 0xF; \ if (r&1) { A_UNK(cpu); return; } \ - if (!cpu->DataRead32 (addr , &cpu->R[r ])) {cpu->AddCycles_CDI_LDM(); return;} \ - u32 val; if (!cpu->DataRead32S(addr+4, &val)) {cpu->AddCycles_CDI_LDM(); return;} \ + if (!cpu->DataRead32 (addr , &cpu->R[r ])) {cpu->AddCycles_CDI(); return;} \ + u32 val; if (!cpu->DataRead32S(addr+4, &val)) {cpu->AddCycles_CDI(); return;} \ if (r == 14) cpu->JumpTo(((((ARMv5*)cpu)->CP15Control & (1<<15)) ? (val & ~0x1) : val), cpu->CurInstr & (1<<22)); /* restores cpsr presumably due to shared dna with ldm */ \ else cpu->R[r+1] = val; \ - cpu->AddCycles_CDI_LDM(); \ + cpu->AddCycles_CDI(); \ cpu->R[(cpu->CurInstr>>16) & 0xF] += offset; #define A_STRD \ @@ -287,7 +287,7 @@ A_IMPLEMENT_WB_LDRSTR(LDRB) bool dataabort = !cpu->DataWrite32(offset, cpu->R[r]); /* yes, this data abort behavior is on purpose */ \ u32 storeval = cpu->R[r+1]; if (r == 14) storeval+=4; \ dataabort |= !cpu->DataWrite32S (offset+4, storeval, dataabort); /* no, i dont understand it either */ \ - cpu->AddCycles_CD_STM(); \ + cpu->AddCycles_CD(); \ if (dataabort) return; \ if (cpu->CurInstr & (1<<21)) cpu->R[(cpu->CurInstr>>16) & 0xF] = offset; @@ -299,14 +299,14 @@ A_IMPLEMENT_WB_LDRSTR(LDRB) bool dataabort = !cpu->DataWrite32(addr, cpu->R[r]); \ u32 storeval = cpu->R[r+1]; if (r == 14) storeval+=4; \ dataabort |= !cpu->DataWrite32S (addr+4, storeval, dataabort); \ - cpu->AddCycles_CD_STM(); \ + cpu->AddCycles_CD(); \ if (dataabort) return; \ cpu->R[(cpu->CurInstr>>16) & 0xF] += offset; #define A_LDRH \ offset += cpu->R[(cpu->CurInstr>>16) & 0xF]; \ u32 val; bool 
dataabort = !cpu->DataRead16(offset, &val); \ - cpu->AddCycles_CDI_LDR(); \ + cpu->AddCycles_CDI(); \ if (dataabort) return; \ if (((cpu->CurInstr>>12) & 0xF) == 15) cpu->JumpTo8_16Bit(val); \ else cpu->R[(cpu->CurInstr>>12) & 0xF] = val; \ @@ -315,7 +315,7 @@ A_IMPLEMENT_WB_LDRSTR(LDRB) #define A_LDRH_POST \ u32 addr = cpu->R[(cpu->CurInstr>>16) & 0xF]; \ u32 val; bool dataabort = !cpu->DataRead16(addr, &val); \ - cpu->AddCycles_CDI_LDR(); \ + cpu->AddCycles_CDI(); \ if (dataabort) return; \ if (((cpu->CurInstr>>12) & 0xF) == 15) cpu->JumpTo8_16Bit(val); \ else cpu->R[(cpu->CurInstr>>12) & 0xF] = val; \ @@ -324,7 +324,7 @@ A_IMPLEMENT_WB_LDRSTR(LDRB) #define A_LDRSB \ offset += cpu->R[(cpu->CurInstr>>16) & 0xF]; \ u32 val; bool dataabort = !cpu->DataRead8(offset, &val); \ - cpu->AddCycles_CDI_LDR(); \ + cpu->AddCycles_CDI(); \ if (dataabort) return; \ val = (s32)(s8)val; \ if (((cpu->CurInstr>>12) & 0xF) == 15) cpu->JumpTo8_16Bit(val); \ @@ -334,7 +334,7 @@ A_IMPLEMENT_WB_LDRSTR(LDRB) #define A_LDRSB_POST \ u32 addr = cpu->R[(cpu->CurInstr>>16) & 0xF]; \ u32 val; bool dataabort = !cpu->DataRead8(addr, &val); \ - cpu->AddCycles_CDI_LDR(); \ + cpu->AddCycles_CDI(); \ if (dataabort) return; \ val = (s32)(s8)val; \ if (((cpu->CurInstr>>12) & 0xF) == 15) cpu->JumpTo8_16Bit(val); \ @@ -344,7 +344,7 @@ A_IMPLEMENT_WB_LDRSTR(LDRB) #define A_LDRSH \ offset += cpu->R[(cpu->CurInstr>>16) & 0xF]; \ u32 val; bool dataabort = !cpu->DataRead16(offset, &val); \ - cpu->AddCycles_CDI_LDR(); \ + cpu->AddCycles_CDI(); \ if (dataabort) return; \ val = (s32)(s16)val; \ if (((cpu->CurInstr>>12) & 0xF) == 15) cpu->JumpTo8_16Bit(val); \ @@ -354,7 +354,7 @@ A_IMPLEMENT_WB_LDRSTR(LDRB) #define A_LDRSH_POST \ u32 addr = cpu->R[(cpu->CurInstr>>16) & 0xF]; \ u32 val; bool dataabort = !cpu->DataRead16(addr, &val); \ - cpu->AddCycles_CDI_LDR(); \ + cpu->AddCycles_CDI(); \ if (dataabort) return; \ val = (s32)(s16)val; \ if (((cpu->CurInstr>>12) & 0xF) == 15) cpu->JumpTo8_16Bit(val); \ @@ -415,7 
+415,7 @@ void A_SWP(ARM* cpu) } cpu->DataCycles += numD; } - cpu->AddCycles_CDI_SWP(); + cpu->AddCycles_CDI(); } void A_SWPB(ARM* cpu) @@ -437,7 +437,7 @@ void A_SWPB(ARM* cpu) } cpu->DataCycles += numD; } - cpu->AddCycles_CDI_SWP(); + cpu->AddCycles_CDI(); } @@ -501,7 +501,7 @@ void A_LDM(ARM* cpu) if (!preinc) base += 4; - if (cpu->Num == 1) + if (cpu->Num == 1 || (((ARMv5*)cpu)->CP15Control & (1<<15))) pc &= ~0x1; } @@ -546,7 +546,7 @@ void A_LDM(ARM* cpu) cpu->R[baseid] = oldbase; } - cpu->AddCycles_CDI_LDM(); + cpu->AddCycles_CDI(); } void A_STM(ARM* cpu) @@ -630,7 +630,7 @@ void A_STM(ARM* cpu) cpu->R[baseid] = oldbase; } - cpu->AddCycles_CD_STM(); + cpu->AddCycles_CD(); } @@ -645,7 +645,7 @@ void T_LDR_PCREL(ARM* cpu) u32 addr = (cpu->R[15] & ~0x2) + ((cpu->CurInstr & 0xFF) << 2); cpu->DataRead32(addr, &cpu->R[(cpu->CurInstr >> 8) & 0x7]); - cpu->AddCycles_CDI_LDR(); + cpu->AddCycles_CDI(); } @@ -654,7 +654,7 @@ void T_STR_REG(ARM* cpu) u32 addr = cpu->R[(cpu->CurInstr >> 3) & 0x7] + cpu->R[(cpu->CurInstr >> 6) & 0x7]; cpu->DataWrite32(addr, cpu->R[cpu->CurInstr & 0x7]); - cpu->AddCycles_CD_STR(); + cpu->AddCycles_CD(); } void T_STRB_REG(ARM* cpu) @@ -662,7 +662,7 @@ void T_STRB_REG(ARM* cpu) u32 addr = cpu->R[(cpu->CurInstr >> 3) & 0x7] + cpu->R[(cpu->CurInstr >> 6) & 0x7]; cpu->DataWrite8(addr, cpu->R[cpu->CurInstr & 0x7]); - cpu->AddCycles_CD_STR(); + cpu->AddCycles_CD(); } void T_LDR_REG(ARM* cpu) @@ -673,7 +673,7 @@ void T_LDR_REG(ARM* cpu) if (cpu->DataRead32(addr, &val)) cpu->R[cpu->CurInstr & 0x7] = ROR(val, 8*(addr&0x3)); - cpu->AddCycles_CDI_LDR(); + cpu->AddCycles_CDI(); } void T_LDRB_REG(ARM* cpu) @@ -681,7 +681,7 @@ void T_LDRB_REG(ARM* cpu) u32 addr = cpu->R[(cpu->CurInstr >> 3) & 0x7] + cpu->R[(cpu->CurInstr >> 6) & 0x7]; cpu->DataRead8(addr, &cpu->R[cpu->CurInstr & 0x7]); - cpu->AddCycles_CDI_LDR(); + cpu->AddCycles_CDI(); } @@ -690,7 +690,7 @@ void T_STRH_REG(ARM* cpu) u32 addr = cpu->R[(cpu->CurInstr >> 3) & 0x7] + cpu->R[(cpu->CurInstr >> 
6) & 0x7]; cpu->DataWrite16(addr, cpu->R[cpu->CurInstr & 0x7]); - cpu->AddCycles_CD_STR(); + cpu->AddCycles_CD(); } void T_LDRSB_REG(ARM* cpu) @@ -699,7 +699,7 @@ void T_LDRSB_REG(ARM* cpu) if (cpu->DataRead8(addr, &cpu->R[cpu->CurInstr & 0x7])) cpu->R[cpu->CurInstr & 0x7] = (s32)(s8)cpu->R[cpu->CurInstr & 0x7]; - cpu->AddCycles_CDI_LDR(); + cpu->AddCycles_CDI(); } void T_LDRH_REG(ARM* cpu) @@ -707,7 +707,7 @@ void T_LDRH_REG(ARM* cpu) u32 addr = cpu->R[(cpu->CurInstr >> 3) & 0x7] + cpu->R[(cpu->CurInstr >> 6) & 0x7]; cpu->DataRead16(addr, &cpu->R[cpu->CurInstr & 0x7]); - cpu->AddCycles_CDI_LDR(); + cpu->AddCycles_CDI(); } void T_LDRSH_REG(ARM* cpu) @@ -716,7 +716,7 @@ void T_LDRSH_REG(ARM* cpu) if (cpu->DataRead16(addr, &cpu->R[cpu->CurInstr & 0x7])) cpu->R[cpu->CurInstr & 0x7] = (s32)(s16)cpu->R[cpu->CurInstr & 0x7]; - cpu->AddCycles_CDI_LDR(); + cpu->AddCycles_CDI(); } @@ -726,7 +726,7 @@ void T_STR_IMM(ARM* cpu) offset += cpu->R[(cpu->CurInstr >> 3) & 0x7]; cpu->DataWrite32(offset, cpu->R[cpu->CurInstr & 0x7]); - cpu->AddCycles_CD_STR(); + cpu->AddCycles_CD(); } void T_LDR_IMM(ARM* cpu) @@ -737,7 +737,7 @@ void T_LDR_IMM(ARM* cpu) u32 val; if (cpu->DataRead32(offset, &val)) cpu->R[cpu->CurInstr & 0x7] = ROR(val, 8*(offset&0x3)); - cpu->AddCycles_CDI_LDR(); + cpu->AddCycles_CDI(); } void T_STRB_IMM(ARM* cpu) @@ -746,7 +746,7 @@ void T_STRB_IMM(ARM* cpu) offset += cpu->R[(cpu->CurInstr >> 3) & 0x7]; cpu->DataWrite8(offset, cpu->R[cpu->CurInstr & 0x7]); - cpu->AddCycles_CD_STR(); + cpu->AddCycles_CD(); } void T_LDRB_IMM(ARM* cpu) @@ -755,7 +755,7 @@ void T_LDRB_IMM(ARM* cpu) offset += cpu->R[(cpu->CurInstr >> 3) & 0x7]; cpu->DataRead8(offset, &cpu->R[cpu->CurInstr & 0x7]); - cpu->AddCycles_CDI_LDR(); + cpu->AddCycles_CDI(); } @@ -765,7 +765,7 @@ void T_STRH_IMM(ARM* cpu) offset += cpu->R[(cpu->CurInstr >> 3) & 0x7]; cpu->DataWrite16(offset, cpu->R[cpu->CurInstr & 0x7]); - cpu->AddCycles_CD_STR(); + cpu->AddCycles_CD(); } void T_LDRH_IMM(ARM* cpu) @@ -774,7 +774,7 
@@ void T_LDRH_IMM(ARM* cpu) offset += cpu->R[(cpu->CurInstr >> 3) & 0x7]; cpu->DataRead16(offset, &cpu->R[cpu->CurInstr & 0x7]); - cpu->AddCycles_CDI_LDR(); + cpu->AddCycles_CDI(); } @@ -784,7 +784,7 @@ void T_STR_SPREL(ARM* cpu) offset += cpu->R[13]; cpu->DataWrite32(offset, cpu->R[(cpu->CurInstr >> 8) & 0x7]); - cpu->AddCycles_CD_STR(); + cpu->AddCycles_CD(); } void T_LDR_SPREL(ARM* cpu) @@ -793,7 +793,7 @@ void T_LDR_SPREL(ARM* cpu) offset += cpu->R[13]; cpu->DataRead32(offset, &cpu->R[(cpu->CurInstr >> 8) & 0x7]); - cpu->AddCycles_CDI_LDR(); + cpu->AddCycles_CDI(); } @@ -841,7 +841,7 @@ void T_PUSH(ARM* cpu) cpu->R[13] = wbbase; dataabort: - cpu->AddCycles_CD_STM(); + cpu->AddCycles_CD(); } void T_POP(ARM* cpu) @@ -871,7 +871,7 @@ void T_POP(ARM* cpu) { goto dataabort; } - if (cpu->Num==1) pc |= 0x1; + if (cpu->Num==1 || (((ARMv5*)cpu)->CP15Control & (1<<15))) pc |= 0x1; cpu->JumpTo(pc); base += 4; } @@ -879,7 +879,7 @@ void T_POP(ARM* cpu) cpu->R[13] = base; dataabort: - cpu->AddCycles_CDI_LDM(); + cpu->AddCycles_CDI(); } void T_STMIA(ARM* cpu) @@ -904,7 +904,7 @@ void T_STMIA(ARM* cpu) // TODO: check "Rb included in Rlist" case cpu->R[(cpu->CurInstr >> 8) & 0x7] = base; dataabort: - cpu->AddCycles_CD_STM(); + cpu->AddCycles_CD(); } void T_LDMIA(ARM* cpu) @@ -930,7 +930,7 @@ void T_LDMIA(ARM* cpu) cpu->R[(cpu->CurInstr >> 8) & 0x7] = base; dataabort: - cpu->AddCycles_CDI_LDM(); + cpu->AddCycles_CDI(); } diff --git a/src/CP15.cpp b/src/CP15.cpp index cba249fc..bf1d2edc 100644 --- a/src/CP15.cpp +++ b/src/CP15.cpp @@ -815,21 +815,18 @@ bool ARMv5::DataRead8(u32 addr, u32* val) if (addr < ITCMSize) { - DataRegion = Mem9_ITCM; DataCycles = 1; *val = *(u8*)&ITCM[addr & (ITCMPhysicalSize - 1)]; return true; } if ((addr & DTCMMask) == DTCMBase) { - DataRegion = Mem9_DTCM; DataCycles = 1; *val = *(u8*)&DTCM[addr & (DTCMPhysicalSize - 1)]; return true; } *val = BusRead8(addr); - DataRegion = NDS.ARM9Regions[addr >> 14]; DataCycles = MemTimings[addr >> 12][1]; return 
true; } @@ -846,21 +843,18 @@ bool ARMv5::DataRead16(u32 addr, u32* val) if (addr < ITCMSize) { - DataRegion = Mem9_ITCM; DataCycles = 1; *val = *(u16*)&ITCM[addr & (ITCMPhysicalSize - 1)]; return true; } if ((addr & DTCMMask) == DTCMBase) { - DataRegion = Mem9_DTCM; DataCycles = 1; *val = *(u16*)&DTCM[addr & (DTCMPhysicalSize - 1)]; return true; } *val = BusRead16(addr); - DataRegion = NDS.ARM9Regions[addr >> 14]; DataCycles = MemTimings[addr >> 12][1]; return true; } @@ -877,21 +871,18 @@ bool ARMv5::DataRead32(u32 addr, u32* val) if (addr < ITCMSize) { - DataRegion = Mem9_ITCM; DataCycles = 1; *val = *(u32*)&ITCM[addr & (ITCMPhysicalSize - 1)]; return true; } if ((addr & DTCMMask) == DTCMBase) { - DataRegion = Mem9_DTCM; DataCycles = 1; *val = *(u32*)&DTCM[addr & (DTCMPhysicalSize - 1)]; return true; } *val = BusRead32(addr); - DataRegion = NDS.ARM9Regions[addr >> 14]; DataCycles = MemTimings[addr >> 12][2]; return true; } @@ -934,7 +925,6 @@ bool ARMv5::DataWrite8(u32 addr, u8 val) if (addr < ITCMSize) { - DataRegion = Mem9_ITCM; DataCycles = 1; *(u8*)&ITCM[addr & (ITCMPhysicalSize - 1)] = val; NDS.JIT.CheckAndInvalidate<0, ARMJIT_Memory::memregion_ITCM>(addr); @@ -942,14 +932,12 @@ bool ARMv5::DataWrite8(u32 addr, u8 val) } if ((addr & DTCMMask) == DTCMBase) { - DataRegion = Mem9_DTCM; DataCycles = 1; *(u8*)&DTCM[addr & (DTCMPhysicalSize - 1)] = val; return true; } BusWrite8(addr, val); - DataRegion = NDS.ARM9Regions[addr >> 14]; DataCycles = MemTimings[addr >> 12][1]; return true; } @@ -966,7 +954,6 @@ bool ARMv5::DataWrite16(u32 addr, u16 val) if (addr < ITCMSize) { - DataRegion = Mem9_ITCM; DataCycles = 1; *(u16*)&ITCM[addr & (ITCMPhysicalSize - 1)] = val; NDS.JIT.CheckAndInvalidate<0, ARMJIT_Memory::memregion_ITCM>(addr); @@ -974,14 +961,12 @@ bool ARMv5::DataWrite16(u32 addr, u16 val) } if ((addr & DTCMMask) == DTCMBase) { - DataRegion = Mem9_DTCM; DataCycles = 1; *(u16*)&DTCM[addr & (DTCMPhysicalSize - 1)] = val; return true; } BusWrite16(addr, val); - 
DataRegion = NDS.ARM9Regions[addr >> 14]; DataCycles = MemTimings[addr >> 12][1]; return true; } @@ -998,7 +983,6 @@ bool ARMv5::DataWrite32(u32 addr, u32 val) if (addr < ITCMSize) { - DataRegion = Mem9_ITCM; DataCycles = 1; *(u32*)&ITCM[addr & (ITCMPhysicalSize - 1)] = val; NDS.JIT.CheckAndInvalidate<0, ARMJIT_Memory::memregion_ITCM>(addr); @@ -1006,14 +990,12 @@ bool ARMv5::DataWrite32(u32 addr, u32 val) } if ((addr & DTCMMask) == DTCMBase) { - DataRegion = Mem9_DTCM; DataCycles = 1; *(u32*)&DTCM[addr & (DTCMPhysicalSize - 1)] = val; return true; } BusWrite32(addr, val); - DataRegion = NDS.ARM9Regions[addr >> 14]; DataCycles = MemTimings[addr >> 12][2]; return true; } From 346ac1380f043c62afc42c0aff5c67b7c56be47b Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Sun, 4 Aug 2024 15:21:23 -0400 Subject: [PATCH 105/306] forgot to remove a thingy when removing timing reworks --- src/ARM.h | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/src/ARM.h b/src/ARM.h index 7558f7a3..b41389e1 100644 --- a/src/ARM.h +++ b/src/ARM.h @@ -140,11 +140,8 @@ public: virtual void AddCycles_C() = 0; virtual void AddCycles_CI(s32 numI) = 0; - virtual void AddCycles_CDI_LDR() = 0; - virtual void AddCycles_CDI_LDM() = 0; - virtual void AddCycles_CDI_SWP() = 0; - virtual void AddCycles_CD_STR() = 0; - virtual void AddCycles_CD_STM() = 0; + virtual void AddCycles_CDI() = 0; + virtual void AddCycles_CD() = 0; void CheckGdbIncoming(); From 587958e6781cf4d44cd9c611f2589e7d3fc36e5d Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Sun, 4 Aug 2024 23:31:20 -0400 Subject: [PATCH 106/306] Improve accuracy of prefetch aborts comes with a small-ish performance hit --- src/ARM.cpp | 38 ++++++++++++++++++-------------------- src/CP15.cpp | 29 ++++++++++++++--------------- 2 files changed, 32 insertions(+), 35 deletions(-) diff --git a/src/ARM.cpp b/src/ARM.cpp index 6ac387b2..ae55514a 100644 --- 
a/src/ARM.cpp +++ b/src/ARM.cpp @@ -343,12 +343,6 @@ void ARMv5::JumpTo(u32 addr, bool restorecpsr) CPSR &= ~0x20; } - if (!(PU_Map[addr>>12] & 0x04)) - { - PrefetchAbort(); - return; - } - NDS.MonitorARM9Jump(addr); } @@ -575,15 +569,6 @@ void ARMv5::PrefetchAbort() CPSR |= 0x97; UpdateMode(oldcpsr, CPSR); - // this shouldn't happen, but if it does, we're stuck in some nasty endless loop - // so better take care of it - if (!(PU_Map[ExceptionBase>>12] & 0x04)) - { - Log(LogLevel::Error, "!!!!! EXCEPTION REGION NOT EXECUTABLE. THIS IS VERY BAD!!\n"); - NDS.Stop(Platform::StopReason::BadExceptionRegion); - return; - } - R_ABT[2] = oldcpsr; R[14] = R[15] + (oldcpsr & 0x20 ? 2 : 0); JumpTo(ExceptionBase + 0x0C); @@ -685,10 +670,18 @@ void ARMv5::Execute() NextInstr[0] = NextInstr[1]; if (R[15] & 0x2) { NextInstr[1] >>= 16; CodeCycles = 0; } else NextInstr[1] = CodeRead32(R[15], false); - + + // handle aborted instructions + if (!(PU_Map[(R[15]-4)>>12] & 0x04)) [[unlikely]] + { + PrefetchAbort(); + } // actually execute - u32 icode = (CurInstr >> 6) & 0x3FF; - ARMInterpreter::THUMBInstrTable[icode](this); + else [[likely]] + { + u32 icode = (CurInstr >> 6) & 0x3FF; + ARMInterpreter::THUMBInstrTable[icode](this); + } } else { @@ -700,9 +693,14 @@ void ARMv5::Execute() CurInstr = NextInstr[0]; NextInstr[0] = NextInstr[1]; NextInstr[1] = CodeRead32(R[15], false); - + + // handle aborted instructions + if (!(PU_Map[(R[15]-8)>>12] & 0x04)) [[unlikely]] // todo: check for bkpt instruction? 
+ { + PrefetchAbort(); + } // actually execute - if (CheckCondition(CurInstr >> 28)) + else if (CheckCondition(CurInstr >> 28)) [[likely]] { u32 icode = ((CurInstr >> 4) & 0xF) | ((CurInstr >> 16) & 0xFF0); ARMInterpreter::ARMInstrTable[icode](this); diff --git a/src/CP15.cpp b/src/CP15.cpp index bf1d2edc..6fcaff93 100644 --- a/src/CP15.cpp +++ b/src/CP15.cpp @@ -773,14 +773,13 @@ u32 ARMv5::CP15Read(u32 id) const u32 ARMv5::CodeRead32(u32 addr, bool branch) { - /*if (branch || (!(addr & 0xFFF))) + // prefetch abort + // the actual exception is not raised until the aborted instruction is executed + if (!(PU_Map[addr>>12] & 0x04)) [[unlikely]] { - if (!(PU_Map[addr>>12] & 0x04)) - { - PrefetchAbort(); - return 0; - } - }*/ + CodeCycles = 1; + return 0; + } if (addr < ITCMSize) { @@ -807,7 +806,7 @@ u32 ARMv5::CodeRead32(u32 addr, bool branch) bool ARMv5::DataRead8(u32 addr, u32* val) { - if (!(PU_Map[addr>>12] & 0x01)) + if (!(PU_Map[addr>>12] & 0x01)) [[unlikely]] { DataAbort(); return false; @@ -833,7 +832,7 @@ bool ARMv5::DataRead8(u32 addr, u32* val) bool ARMv5::DataRead16(u32 addr, u32* val) { - if (!(PU_Map[addr>>12] & 0x01)) + if (!(PU_Map[addr>>12] & 0x01)) [[unlikely]] { DataAbort(); return false; @@ -861,7 +860,7 @@ bool ARMv5::DataRead16(u32 addr, u32* val) bool ARMv5::DataRead32(u32 addr, u32* val) { - if (!(PU_Map[addr>>12] & 0x01)) + if (!(PU_Map[addr>>12] & 0x01)) [[unlikely]] { DataAbort(); return false; @@ -889,7 +888,7 @@ bool ARMv5::DataRead32(u32 addr, u32* val) bool ARMv5::DataRead32S(u32 addr, u32* val) { - if (!(PU_Map[addr>>12] & 0x01)) + if (!(PU_Map[addr>>12] & 0x01)) [[unlikely]] { DataAbort(); return false; @@ -917,7 +916,7 @@ bool ARMv5::DataRead32S(u32 addr, u32* val) bool ARMv5::DataWrite8(u32 addr, u8 val) { - if (!(PU_Map[addr>>12] & 0x02)) + if (!(PU_Map[addr>>12] & 0x02)) [[unlikely]] { DataAbort(); return false; @@ -944,7 +943,7 @@ bool ARMv5::DataWrite8(u32 addr, u8 val) bool ARMv5::DataWrite16(u32 addr, u16 val) { - if 
(!(PU_Map[addr>>12] & 0x02)) + if (!(PU_Map[addr>>12] & 0x02)) [[unlikely]] { DataAbort(); return false; @@ -973,7 +972,7 @@ bool ARMv5::DataWrite16(u32 addr, u16 val) bool ARMv5::DataWrite32(u32 addr, u32 val) { - if (!(PU_Map[addr>>12] & 0x02)) + if (!(PU_Map[addr>>12] & 0x02)) [[unlikely]] { DataAbort(); return false; @@ -1002,7 +1001,7 @@ bool ARMv5::DataWrite32(u32 addr, u32 val) bool ARMv5::DataWrite32S(u32 addr, u32 val, bool dataabort) { - if (!(PU_Map[addr>>12] & 0x02)) + if (!(PU_Map[addr>>12] & 0x02)) [[unlikely]] { if (!dataabort) DataAbort(); return false; From 0dc619d6155b0f6533ff35d13cf5f00add4b1939 Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Mon, 5 Aug 2024 11:41:25 -0400 Subject: [PATCH 107/306] Revert "Improve accuracy of prefetch aborts" This reverts commit 587958e6781cf4d44cd9c611f2589e7d3fc36e5d. --- src/ARM.cpp | 38 ++++++++++++++++++++------------------ src/CP15.cpp | 29 +++++++++++++++-------------- 2 files changed, 35 insertions(+), 32 deletions(-) diff --git a/src/ARM.cpp b/src/ARM.cpp index ae55514a..6ac387b2 100644 --- a/src/ARM.cpp +++ b/src/ARM.cpp @@ -343,6 +343,12 @@ void ARMv5::JumpTo(u32 addr, bool restorecpsr) CPSR &= ~0x20; } + if (!(PU_Map[addr>>12] & 0x04)) + { + PrefetchAbort(); + return; + } + NDS.MonitorARM9Jump(addr); } @@ -569,6 +575,15 @@ void ARMv5::PrefetchAbort() CPSR |= 0x97; UpdateMode(oldcpsr, CPSR); + // this shouldn't happen, but if it does, we're stuck in some nasty endless loop + // so better take care of it + if (!(PU_Map[ExceptionBase>>12] & 0x04)) + { + Log(LogLevel::Error, "!!!!! EXCEPTION REGION NOT EXECUTABLE. THIS IS VERY BAD!!\n"); + NDS.Stop(Platform::StopReason::BadExceptionRegion); + return; + } + R_ABT[2] = oldcpsr; R[14] = R[15] + (oldcpsr & 0x20 ? 
2 : 0); JumpTo(ExceptionBase + 0x0C); @@ -670,18 +685,10 @@ void ARMv5::Execute() NextInstr[0] = NextInstr[1]; if (R[15] & 0x2) { NextInstr[1] >>= 16; CodeCycles = 0; } else NextInstr[1] = CodeRead32(R[15], false); - - // handle aborted instructions - if (!(PU_Map[(R[15]-4)>>12] & 0x04)) [[unlikely]] - { - PrefetchAbort(); - } + // actually execute - else [[likely]] - { - u32 icode = (CurInstr >> 6) & 0x3FF; - ARMInterpreter::THUMBInstrTable[icode](this); - } + u32 icode = (CurInstr >> 6) & 0x3FF; + ARMInterpreter::THUMBInstrTable[icode](this); } else { @@ -693,14 +700,9 @@ void ARMv5::Execute() CurInstr = NextInstr[0]; NextInstr[0] = NextInstr[1]; NextInstr[1] = CodeRead32(R[15], false); - - // handle aborted instructions - if (!(PU_Map[(R[15]-8)>>12] & 0x04)) [[unlikely]] // todo: check for bkpt instruction? - { - PrefetchAbort(); - } + // actually execute - else if (CheckCondition(CurInstr >> 28)) [[likely]] + if (CheckCondition(CurInstr >> 28)) { u32 icode = ((CurInstr >> 4) & 0xF) | ((CurInstr >> 16) & 0xFF0); ARMInterpreter::ARMInstrTable[icode](this); diff --git a/src/CP15.cpp b/src/CP15.cpp index 6fcaff93..bf1d2edc 100644 --- a/src/CP15.cpp +++ b/src/CP15.cpp @@ -773,13 +773,14 @@ u32 ARMv5::CP15Read(u32 id) const u32 ARMv5::CodeRead32(u32 addr, bool branch) { - // prefetch abort - // the actual exception is not raised until the aborted instruction is executed - if (!(PU_Map[addr>>12] & 0x04)) [[unlikely]] + /*if (branch || (!(addr & 0xFFF))) { - CodeCycles = 1; - return 0; - } + if (!(PU_Map[addr>>12] & 0x04)) + { + PrefetchAbort(); + return 0; + } + }*/ if (addr < ITCMSize) { @@ -806,7 +807,7 @@ u32 ARMv5::CodeRead32(u32 addr, bool branch) bool ARMv5::DataRead8(u32 addr, u32* val) { - if (!(PU_Map[addr>>12] & 0x01)) [[unlikely]] + if (!(PU_Map[addr>>12] & 0x01)) { DataAbort(); return false; @@ -832,7 +833,7 @@ bool ARMv5::DataRead8(u32 addr, u32* val) bool ARMv5::DataRead16(u32 addr, u32* val) { - if (!(PU_Map[addr>>12] & 0x01)) [[unlikely]] + if 
(!(PU_Map[addr>>12] & 0x01)) { DataAbort(); return false; @@ -860,7 +861,7 @@ bool ARMv5::DataRead16(u32 addr, u32* val) bool ARMv5::DataRead32(u32 addr, u32* val) { - if (!(PU_Map[addr>>12] & 0x01)) [[unlikely]] + if (!(PU_Map[addr>>12] & 0x01)) { DataAbort(); return false; @@ -888,7 +889,7 @@ bool ARMv5::DataRead32(u32 addr, u32* val) bool ARMv5::DataRead32S(u32 addr, u32* val) { - if (!(PU_Map[addr>>12] & 0x01)) [[unlikely]] + if (!(PU_Map[addr>>12] & 0x01)) { DataAbort(); return false; @@ -916,7 +917,7 @@ bool ARMv5::DataRead32S(u32 addr, u32* val) bool ARMv5::DataWrite8(u32 addr, u8 val) { - if (!(PU_Map[addr>>12] & 0x02)) [[unlikely]] + if (!(PU_Map[addr>>12] & 0x02)) { DataAbort(); return false; @@ -943,7 +944,7 @@ bool ARMv5::DataWrite8(u32 addr, u8 val) bool ARMv5::DataWrite16(u32 addr, u16 val) { - if (!(PU_Map[addr>>12] & 0x02)) [[unlikely]] + if (!(PU_Map[addr>>12] & 0x02)) { DataAbort(); return false; @@ -972,7 +973,7 @@ bool ARMv5::DataWrite16(u32 addr, u16 val) bool ARMv5::DataWrite32(u32 addr, u32 val) { - if (!(PU_Map[addr>>12] & 0x02)) [[unlikely]] + if (!(PU_Map[addr>>12] & 0x02)) { DataAbort(); return false; @@ -1001,7 +1002,7 @@ bool ARMv5::DataWrite32(u32 addr, u32 val) bool ARMv5::DataWrite32S(u32 addr, u32 val, bool dataabort) { - if (!(PU_Map[addr>>12] & 0x02)) [[unlikely]] + if (!(PU_Map[addr>>12] & 0x02)) { if (!dataabort) DataAbort(); return false; From eedd2806f9d6c7505130db9fe57d97ce7415e2ba Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Mon, 5 Aug 2024 12:37:42 -0400 Subject: [PATCH 108/306] Reapply "Improve accuracy of prefetch aborts" This reverts commit 0dc619d6155b0f6533ff35d13cf5f00add4b1939. 
--- src/ARM.cpp | 38 ++++++++++++++++++-------------------- src/CP15.cpp | 29 ++++++++++++++--------------- 2 files changed, 32 insertions(+), 35 deletions(-) diff --git a/src/ARM.cpp b/src/ARM.cpp index 6ac387b2..ae55514a 100644 --- a/src/ARM.cpp +++ b/src/ARM.cpp @@ -343,12 +343,6 @@ void ARMv5::JumpTo(u32 addr, bool restorecpsr) CPSR &= ~0x20; } - if (!(PU_Map[addr>>12] & 0x04)) - { - PrefetchAbort(); - return; - } - NDS.MonitorARM9Jump(addr); } @@ -575,15 +569,6 @@ void ARMv5::PrefetchAbort() CPSR |= 0x97; UpdateMode(oldcpsr, CPSR); - // this shouldn't happen, but if it does, we're stuck in some nasty endless loop - // so better take care of it - if (!(PU_Map[ExceptionBase>>12] & 0x04)) - { - Log(LogLevel::Error, "!!!!! EXCEPTION REGION NOT EXECUTABLE. THIS IS VERY BAD!!\n"); - NDS.Stop(Platform::StopReason::BadExceptionRegion); - return; - } - R_ABT[2] = oldcpsr; R[14] = R[15] + (oldcpsr & 0x20 ? 2 : 0); JumpTo(ExceptionBase + 0x0C); @@ -685,10 +670,18 @@ void ARMv5::Execute() NextInstr[0] = NextInstr[1]; if (R[15] & 0x2) { NextInstr[1] >>= 16; CodeCycles = 0; } else NextInstr[1] = CodeRead32(R[15], false); - + + // handle aborted instructions + if (!(PU_Map[(R[15]-4)>>12] & 0x04)) [[unlikely]] + { + PrefetchAbort(); + } // actually execute - u32 icode = (CurInstr >> 6) & 0x3FF; - ARMInterpreter::THUMBInstrTable[icode](this); + else [[likely]] + { + u32 icode = (CurInstr >> 6) & 0x3FF; + ARMInterpreter::THUMBInstrTable[icode](this); + } } else { @@ -700,9 +693,14 @@ void ARMv5::Execute() CurInstr = NextInstr[0]; NextInstr[0] = NextInstr[1]; NextInstr[1] = CodeRead32(R[15], false); - + + // handle aborted instructions + if (!(PU_Map[(R[15]-8)>>12] & 0x04)) [[unlikely]] // todo: check for bkpt instruction? 
+ { + PrefetchAbort(); + } // actually execute - if (CheckCondition(CurInstr >> 28)) + else if (CheckCondition(CurInstr >> 28)) [[likely]] { u32 icode = ((CurInstr >> 4) & 0xF) | ((CurInstr >> 16) & 0xFF0); ARMInterpreter::ARMInstrTable[icode](this); diff --git a/src/CP15.cpp b/src/CP15.cpp index bf1d2edc..6fcaff93 100644 --- a/src/CP15.cpp +++ b/src/CP15.cpp @@ -773,14 +773,13 @@ u32 ARMv5::CP15Read(u32 id) const u32 ARMv5::CodeRead32(u32 addr, bool branch) { - /*if (branch || (!(addr & 0xFFF))) + // prefetch abort + // the actual exception is not raised until the aborted instruction is executed + if (!(PU_Map[addr>>12] & 0x04)) [[unlikely]] { - if (!(PU_Map[addr>>12] & 0x04)) - { - PrefetchAbort(); - return 0; - } - }*/ + CodeCycles = 1; + return 0; + } if (addr < ITCMSize) { @@ -807,7 +806,7 @@ u32 ARMv5::CodeRead32(u32 addr, bool branch) bool ARMv5::DataRead8(u32 addr, u32* val) { - if (!(PU_Map[addr>>12] & 0x01)) + if (!(PU_Map[addr>>12] & 0x01)) [[unlikely]] { DataAbort(); return false; @@ -833,7 +832,7 @@ bool ARMv5::DataRead8(u32 addr, u32* val) bool ARMv5::DataRead16(u32 addr, u32* val) { - if (!(PU_Map[addr>>12] & 0x01)) + if (!(PU_Map[addr>>12] & 0x01)) [[unlikely]] { DataAbort(); return false; @@ -861,7 +860,7 @@ bool ARMv5::DataRead16(u32 addr, u32* val) bool ARMv5::DataRead32(u32 addr, u32* val) { - if (!(PU_Map[addr>>12] & 0x01)) + if (!(PU_Map[addr>>12] & 0x01)) [[unlikely]] { DataAbort(); return false; @@ -889,7 +888,7 @@ bool ARMv5::DataRead32(u32 addr, u32* val) bool ARMv5::DataRead32S(u32 addr, u32* val) { - if (!(PU_Map[addr>>12] & 0x01)) + if (!(PU_Map[addr>>12] & 0x01)) [[unlikely]] { DataAbort(); return false; @@ -917,7 +916,7 @@ bool ARMv5::DataRead32S(u32 addr, u32* val) bool ARMv5::DataWrite8(u32 addr, u8 val) { - if (!(PU_Map[addr>>12] & 0x02)) + if (!(PU_Map[addr>>12] & 0x02)) [[unlikely]] { DataAbort(); return false; @@ -944,7 +943,7 @@ bool ARMv5::DataWrite8(u32 addr, u8 val) bool ARMv5::DataWrite16(u32 addr, u16 val) { - if 
(!(PU_Map[addr>>12] & 0x02)) + if (!(PU_Map[addr>>12] & 0x02)) [[unlikely]] { DataAbort(); return false; @@ -973,7 +972,7 @@ bool ARMv5::DataWrite16(u32 addr, u16 val) bool ARMv5::DataWrite32(u32 addr, u32 val) { - if (!(PU_Map[addr>>12] & 0x02)) + if (!(PU_Map[addr>>12] & 0x02)) [[unlikely]] { DataAbort(); return false; @@ -1002,7 +1001,7 @@ bool ARMv5::DataWrite32(u32 addr, u32 val) bool ARMv5::DataWrite32S(u32 addr, u32 val, bool dataabort) { - if (!(PU_Map[addr>>12] & 0x02)) + if (!(PU_Map[addr>>12] & 0x02)) [[unlikely]] { if (!dataabort) DataAbort(); return false; From a85b2bfb5647c4e228ea8683ca0481b6a69c2619 Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Mon, 5 Aug 2024 14:57:17 -0400 Subject: [PATCH 109/306] tweak when irqs are triggered and fix prefetch aborts also ig add some comments next to the svc funcs so that someone searching for "swi" can find them easier --- src/ARM.cpp | 41 +++++++++++++++++++++-------------------- src/ARMInterpreter.cpp | 4 ++-- 2 files changed, 23 insertions(+), 22 deletions(-) diff --git a/src/ARM.cpp b/src/ARM.cpp index ae55514a..e01e0e36 100644 --- a/src/ARM.cpp +++ b/src/ARM.cpp @@ -548,7 +548,7 @@ void ARM::TriggerIRQ() UpdateMode(oldcpsr, CPSR); R_IRQ[2] = oldcpsr; - R[14] = R[15] + (oldcpsr & 0x20 ? 2 : 0); + R[14] = R[15] - (oldcpsr & 0x20 ? 0 : 4); JumpTo(ExceptionBase + 0x18); // ARDS cheat support @@ -570,7 +570,7 @@ void ARMv5::PrefetchAbort() UpdateMode(oldcpsr, CPSR); R_ABT[2] = oldcpsr; - R[14] = R[15] + (oldcpsr & 0x20 ? 2 : 0); + R[14] = R[15] - (oldcpsr & 0x20 ? 
0 : 4); JumpTo(ExceptionBase + 0x0C); } @@ -609,7 +609,7 @@ void ARMv5::Execute() { Halted = 0; if (NDS.IME[0] & 0x1) - TriggerIRQ(); + IRQ = 1; } else { @@ -671,13 +671,13 @@ void ARMv5::Execute() if (R[15] & 0x2) { NextInstr[1] >>= 16; CodeCycles = 0; } else NextInstr[1] = CodeRead32(R[15], false); - // handle aborted instructions - if (!(PU_Map[(R[15]-4)>>12] & 0x04)) [[unlikely]] + + if (IRQ && !(CPSR & 0x80)) TriggerIRQ(); + else if (!(PU_Map[(R[15]-4)>>12] & 0x04)) [[unlikely]] // handle aborted instructions { PrefetchAbort(); } - // actually execute - else [[likely]] + else [[likely]] // actually execute { u32 icode = (CurInstr >> 6) & 0x3FF; ARMInterpreter::THUMBInstrTable[icode](this); @@ -694,13 +694,13 @@ void ARMv5::Execute() NextInstr[0] = NextInstr[1]; NextInstr[1] = CodeRead32(R[15], false); - // handle aborted instructions - if (!(PU_Map[(R[15]-8)>>12] & 0x04)) [[unlikely]] // todo: check for bkpt instruction? + + if (IRQ && !(CPSR & 0x80)) TriggerIRQ(); + else if (!(PU_Map[(R[15]-8)>>12] & 0x04)) [[unlikely]] // handle aborted instructions { PrefetchAbort(); } - // actually execute - else if (CheckCondition(CurInstr >> 28)) [[likely]] + else if (CheckCondition(CurInstr >> 28)) [[likely]] // actually execute { u32 icode = ((CurInstr >> 4) & 0xF) | ((CurInstr >> 16) & 0xFF0); ARMInterpreter::ARMInstrTable[icode](this); @@ -727,8 +727,6 @@ void ARMv5::Execute() if (NDS::IME[0] & 0x1) TriggerIRQ(); }*/ - if (IRQ) TriggerIRQ(); - } NDS.ARM9Timestamp += Cycles; @@ -760,7 +758,7 @@ void ARMv4::Execute() { Halted = 0; if (NDS.IME[1] & 0x1) - TriggerIRQ(); + IRQ = 1; } else { @@ -820,9 +818,13 @@ void ARMv4::Execute() NextInstr[0] = NextInstr[1]; NextInstr[1] = CodeRead16(R[15]); - // actually execute - u32 icode = (CurInstr >> 6); - ARMInterpreter::THUMBInstrTable[icode](this); + if (IRQ && !(CPSR & 0x80)) TriggerIRQ(); + else + { + // actually execute + u32 icode = (CurInstr >> 6); + ARMInterpreter::THUMBInstrTable[icode](this); + } } else { @@ -835,8 
+837,8 @@ void ARMv4::Execute() NextInstr[0] = NextInstr[1]; NextInstr[1] = CodeRead32(R[15]); - // actually execute - if (CheckCondition(CurInstr >> 28)) + if (IRQ && !(CPSR & 0x80)) TriggerIRQ(); + else if (CheckCondition(CurInstr >> 28)) // actually execute { u32 icode = ((CurInstr >> 4) & 0xF) | ((CurInstr >> 16) & 0xFF0); ARMInterpreter::ARMInstrTable[icode](this); @@ -859,7 +861,6 @@ void ARMv4::Execute() if (NDS::IME[1] & 0x1) TriggerIRQ(); }*/ - if (IRQ) TriggerIRQ(); } NDS.ARM7Timestamp += Cycles; diff --git a/src/ARMInterpreter.cpp b/src/ARMInterpreter.cpp index 8ca85976..6e6c9a8d 100644 --- a/src/ARMInterpreter.cpp +++ b/src/ARMInterpreter.cpp @@ -271,7 +271,7 @@ void A_MRC(ARM* cpu) -void A_SVC(ARM* cpu) +void A_SVC(ARM* cpu) // A_SWI { u32 oldcpsr = cpu->CPSR; cpu->CPSR &= ~0xBF; @@ -283,7 +283,7 @@ void A_SVC(ARM* cpu) cpu->JumpTo(cpu->ExceptionBase + 0x08); } -void T_SVC(ARM* cpu) +void T_SVC(ARM* cpu) // T_SWI { u32 oldcpsr = cpu->CPSR; cpu->CPSR &= ~0xBF; From 332a39dbafde51d6b703c78635246aaa098f337b Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Mon, 5 Aug 2024 16:14:17 -0400 Subject: [PATCH 110/306] fix JIT being borked --- src/ARM.cpp | 37 ++++++++++++++++++++++++++----------- src/ARM.h | 1 + 2 files changed, 27 insertions(+), 11 deletions(-) diff --git a/src/ARM.cpp b/src/ARM.cpp index e01e0e36..9919cbcb 100644 --- a/src/ARM.cpp +++ b/src/ARM.cpp @@ -537,6 +537,7 @@ void ARM::UpdateMode(u32 oldmode, u32 newmode, bool phony) } } +template void ARM::TriggerIRQ() { if (CPSR & 0x80) @@ -548,7 +549,10 @@ void ARM::TriggerIRQ() UpdateMode(oldcpsr, CPSR); R_IRQ[2] = oldcpsr; - R[14] = R[15] - (oldcpsr & 0x20 ? 0 : 4); + if constexpr (mode == CPUExecuteMode::JIT) + R[14] = R[15] + (oldcpsr & 0x20 ? 2 : 0); + else + R[14] = R[15] - (oldcpsr & 0x20 ? 
0 : 4); JumpTo(ExceptionBase + 0x18); // ARDS cheat support @@ -559,6 +563,11 @@ void ARM::TriggerIRQ() NDS.AREngine.RunCheats(); } } +template void ARM::TriggerIRQ(); +template void ARM::TriggerIRQ(); +#ifdef JIT_ENABLED +template void ARM::TriggerIRQ(); +#endif void ARMv5::PrefetchAbort() { @@ -609,7 +618,10 @@ void ARMv5::Execute() { Halted = 0; if (NDS.IME[0] & 0x1) - IRQ = 1; + { + if constexpr (mode == CPUExecuteMode::JIT) TriggerIRQ(); + else IRQ = 1; + } } else { @@ -643,7 +655,7 @@ void ARMv5::Execute() { // this order is crucial otherwise idle loops waiting for an IRQ won't function if (IRQ) - TriggerIRQ(); + TriggerIRQ(); if (Halted || IdleLoop) { @@ -672,7 +684,7 @@ void ARMv5::Execute() else NextInstr[1] = CodeRead32(R[15], false); - if (IRQ && !(CPSR & 0x80)) TriggerIRQ(); + if (IRQ && !(CPSR & 0x80)) TriggerIRQ(); else if (!(PU_Map[(R[15]-4)>>12] & 0x04)) [[unlikely]] // handle aborted instructions { PrefetchAbort(); @@ -695,7 +707,7 @@ void ARMv5::Execute() NextInstr[1] = CodeRead32(R[15], false); - if (IRQ && !(CPSR & 0x80)) TriggerIRQ(); + if (IRQ && !(CPSR & 0x80)) TriggerIRQ(); else if (!(PU_Map[(R[15]-8)>>12] & 0x04)) [[unlikely]] // handle aborted instructions { PrefetchAbort(); @@ -725,7 +737,7 @@ void ARMv5::Execute() /*if (NDS::IF[0] & NDS::IE[0]) { if (NDS::IME[0] & 0x1) - TriggerIRQ(); + TriggerIRQ(); }*/ } @@ -758,7 +770,10 @@ void ARMv4::Execute() { Halted = 0; if (NDS.IME[1] & 0x1) - IRQ = 1; + { + if constexpr (mode == CPUExecuteMode::JIT) TriggerIRQ(); + else IRQ = 1; + } } else { @@ -791,7 +806,7 @@ void ARMv4::Execute() if (StopExecution) { if (IRQ) - TriggerIRQ(); + TriggerIRQ(); if (Halted || IdleLoop) { @@ -818,7 +833,7 @@ void ARMv4::Execute() NextInstr[0] = NextInstr[1]; NextInstr[1] = CodeRead16(R[15]); - if (IRQ && !(CPSR & 0x80)) TriggerIRQ(); + if (IRQ && !(CPSR & 0x80)) TriggerIRQ(); else { // actually execute @@ -837,7 +852,7 @@ void ARMv4::Execute() NextInstr[0] = NextInstr[1]; NextInstr[1] = CodeRead32(R[15]); - if 
(IRQ && !(CPSR & 0x80)) TriggerIRQ(); + if (IRQ && !(CPSR & 0x80)) TriggerIRQ(); else if (CheckCondition(CurInstr >> 28)) // actually execute { u32 icode = ((CurInstr >> 4) & 0xF) | ((CurInstr >> 16) & 0xFF0); @@ -859,7 +874,7 @@ void ARMv4::Execute() /*if (NDS::IF[1] & NDS::IE[1]) { if (NDS::IME[1] & 0x1) - TriggerIRQ(); + TriggerIRQ(); }*/ } diff --git a/src/ARM.h b/src/ARM.h index d4d3f5d4..2603e646 100644 --- a/src/ARM.h +++ b/src/ARM.h @@ -129,6 +129,7 @@ public: void UpdateMode(u32 oldmode, u32 newmode, bool phony = false); + template void TriggerIRQ(); void SetupCodeMem(u32 addr); From 40e8e8e7bd9574a37d794a5da1a76664c2f22f35 Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Fri, 23 Aug 2024 19:13:17 -0400 Subject: [PATCH 111/306] rework single load/stores to use a shared instruction --- src/ARMInterpreter_LoadStore.cpp | 314 ++++++++++--------------------- 1 file changed, 101 insertions(+), 213 deletions(-) diff --git a/src/ARMInterpreter_LoadStore.cpp b/src/ARMInterpreter_LoadStore.cpp index 580c66fc..734b57d0 100644 --- a/src/ARMInterpreter_LoadStore.cpp +++ b/src/ARMInterpreter_LoadStore.cpp @@ -58,101 +58,100 @@ namespace melonDS::ARMInterpreter shiftop(offset, shift); \ if (!(cpu->CurInstr & (1<<23))) offset = -offset; +enum class Writeback +{ + None = 0, + Pre, + Post, +}; + +template +void LoadSingle(ARM* cpu, u8 rd, u8 rn, s32 offset) +{ + static_assert((size == 8) || (size == 16) || (size == 32), "dummy this function only takes 8/16/32 for size!!!"); + + u32 addr; + if constexpr (writeback != Writeback::Post) addr = offset + cpu->R[rn]; + else addr = cpu->R[rn]; + + u32 val; + bool dataabort; + if constexpr (size == 8) dataabort = !cpu->DataRead8 (addr, &val); + if constexpr (size == 16) dataabort = !cpu->DataRead16(addr, &val); + if constexpr (size == 32) dataabort = !cpu->DataRead32(addr, &val); + + cpu->AddCycles_CDI(); + if (dataabort) return; + + if constexpr (size == 8 && signror) val = (s32)(s8)val; + if 
constexpr (size == 16 && signror) val = (s32)(s16)val; + if constexpr (size == 32 && signror) val = ROR(val, ((addr&0x3)<<3)); + + if constexpr (writeback != Writeback::None) cpu->R[rn] += offset; + + if (rd == 15) + { + if (cpu->Num==1 || (((ARMv5*)cpu)->CP15Control & (1<<15))) val &= ~0x1; + cpu->JumpTo(val); + } + else cpu->R[rd] = val; +} + +template +void StoreSingle(ARM* cpu, u8 rd, u8 rn, s32 offset) +{ + static_assert((size == 8) || (size == 16) || (size == 32), "dummy this function only takes 8/16/32 for size!!!"); + + u32 addr; + if constexpr (writeback != Writeback::Post) addr = offset + cpu->R[rn]; + else addr = cpu->R[rn]; + + u32 storeval = cpu->R[rd]; + if (rd == 15) storeval += 4; + + bool dataabort; + if constexpr (size == 8) dataabort = !cpu->DataWrite8 (addr, storeval); + if constexpr (size == 16) dataabort = !cpu->DataWrite16(addr, storeval); + if constexpr (size == 32) dataabort = !cpu->DataWrite32(addr, storeval); + + cpu->AddCycles_CD(); + if (dataabort) return; + + if constexpr (writeback != Writeback::None) cpu->R[rn] += offset; +} #define A_STR \ - offset += cpu->R[(cpu->CurInstr>>16) & 0xF]; \ - u32 storeval = cpu->R[(cpu->CurInstr>>12) & 0xF]; \ - if (((cpu->CurInstr>>12) & 0xF) == 0xF) \ - storeval += 4; \ - bool dataabort = !cpu->DataWrite32(offset, storeval); \ - cpu->AddCycles_CD(); \ - if (dataabort) return; \ - if (cpu->CurInstr & (1<<21)) cpu->R[(cpu->CurInstr>>16) & 0xF] = offset; + if (cpu->CurInstr & (1<<21)) StoreSingle<32, Writeback::Pre>(cpu, ((cpu->CurInstr>>12) & 0xF), ((cpu->CurInstr>>16) & 0xF), offset); \ + else StoreSingle<32, Writeback::None>(cpu, ((cpu->CurInstr>>12) & 0xF), ((cpu->CurInstr>>16) & 0xF), offset); // TODO: user mode (bit21) #define A_STR_POST \ - u32 addr = cpu->R[(cpu->CurInstr>>16) & 0xF]; \ - u32 storeval = cpu->R[(cpu->CurInstr>>12) & 0xF]; \ - if (((cpu->CurInstr>>12) & 0xF) == 0xF) \ - storeval += 4; \ - bool dataabort = !cpu->DataWrite32(addr, storeval); \ - cpu->AddCycles_CD(); \ - if 
(dataabort) return; \ - cpu->R[(cpu->CurInstr>>16) & 0xF] += offset; + StoreSingle<32, Writeback::Post>(cpu, ((cpu->CurInstr>>12) & 0xF), ((cpu->CurInstr>>16) & 0xF), offset); #define A_STRB \ - offset += cpu->R[(cpu->CurInstr>>16) & 0xF]; \ - u32 storeval = cpu->R[(cpu->CurInstr>>12) & 0xF]; \ - if (((cpu->CurInstr>>12) & 0xF) == 15) storeval+=4; \ - bool dataabort = !cpu->DataWrite8(offset, storeval); \ - cpu->AddCycles_CD(); \ - if (dataabort) return; \ - if (cpu->CurInstr & (1<<21)) cpu->R[(cpu->CurInstr>>16) & 0xF] = offset; + if (cpu->CurInstr & (1<<21)) StoreSingle<8, Writeback::Pre>(cpu, ((cpu->CurInstr>>12) & 0xF), ((cpu->CurInstr>>16) & 0xF), offset); \ + else StoreSingle<8, Writeback::None>(cpu, ((cpu->CurInstr>>12) & 0xF), ((cpu->CurInstr>>16) & 0xF), offset); // TODO: user mode (bit21) #define A_STRB_POST \ - u32 addr = cpu->R[(cpu->CurInstr>>16) & 0xF]; \ - u32 storeval = cpu->R[(cpu->CurInstr>>12) & 0xF]; \ - if (((cpu->CurInstr>>12) & 0xF) == 15) storeval+=4; \ - bool dataabort = !cpu->DataWrite8(addr, storeval); \ - cpu->AddCycles_CD(); \ - if (dataabort) return; \ - cpu->R[(cpu->CurInstr>>16) & 0xF] += offset; + StoreSingle<8, Writeback::Post>(cpu, ((cpu->CurInstr>>12) & 0xF), ((cpu->CurInstr>>16) & 0xF), offset); #define A_LDR \ - offset += cpu->R[(cpu->CurInstr>>16) & 0xF]; \ - u32 val; bool dataabort = !cpu->DataRead32(offset, &val); \ - cpu->AddCycles_CDI(); \ - if (dataabort) return; \ - val = ROR(val, ((offset&0x3)<<3)); \ - if (cpu->CurInstr & (1<<21)) cpu->R[(cpu->CurInstr>>16) & 0xF] = offset; \ - if (((cpu->CurInstr>>12) & 0xF) == 15) \ - { \ - if (cpu->Num==1 || (((ARMv5*)cpu)->CP15Control & (1<<15))) val &= ~0x1; \ - cpu->JumpTo(val); \ - } \ - else \ - { \ - cpu->R[(cpu->CurInstr>>12) & 0xF] = val; \ - } + if (cpu->CurInstr & (1<<21)) LoadSingle(cpu, ((cpu->CurInstr>>12) & 0xF), ((cpu->CurInstr>>16) & 0xF), offset); \ + else LoadSingle(cpu, ((cpu->CurInstr>>12) & 0xF), ((cpu->CurInstr>>16) & 0xF), offset); // TODO: user mode #define 
A_LDR_POST \ - u32 addr = cpu->R[(cpu->CurInstr>>16) & 0xF]; \ - u32 val; bool dataabort = !cpu->DataRead32(addr, &val); \ - cpu->AddCycles_CDI(); \ - if (dataabort) return; \ - val = ROR(val, ((addr&0x3)<<3)); \ - cpu->R[(cpu->CurInstr>>16) & 0xF] += offset; \ - if (((cpu->CurInstr>>12) & 0xF) == 15) \ - { \ - if (cpu->Num==1 || (((ARMv5*)cpu)->CP15Control & (1<<15))) val &= ~0x1; \ - cpu->JumpTo(val); \ - } \ - else \ - { \ - cpu->R[(cpu->CurInstr>>12) & 0xF] = val; \ - } + LoadSingle(cpu, ((cpu->CurInstr>>12) & 0xF), ((cpu->CurInstr>>16) & 0xF), offset); #define A_LDRB \ - offset += cpu->R[(cpu->CurInstr>>16) & 0xF]; \ - u32 val; bool dataabort = !cpu->DataRead8(offset, &val); \ - cpu->AddCycles_CDI(); \ - if (dataabort) return; \ - if (cpu->CurInstr & (1<<21)) cpu->R[(cpu->CurInstr>>16) & 0xF] = offset; \ - if (((cpu->CurInstr>>12) & 0xF) == 15) cpu->JumpTo8_16Bit(val); \ - else cpu->R[(cpu->CurInstr>>12) & 0xF] = val; + if (cpu->CurInstr & (1<<21)) LoadSingle(cpu, ((cpu->CurInstr>>12) & 0xF), ((cpu->CurInstr>>16) & 0xF), offset); \ + else LoadSingle(cpu, ((cpu->CurInstr>>12) & 0xF), ((cpu->CurInstr>>16) & 0xF), offset); // TODO: user mode #define A_LDRB_POST \ - u32 addr = cpu->R[(cpu->CurInstr>>16) & 0xF]; \ - u32 val; bool dataabort = !cpu->DataRead8(addr, &val); \ - cpu->AddCycles_CDI(); \ - if (dataabort) return; \ - cpu->R[(cpu->CurInstr>>16) & 0xF] += offset; \ - if (((cpu->CurInstr>>12) & 0xF) == 15) cpu->JumpTo8_16Bit(val); \ - else cpu->R[(cpu->CurInstr>>12) & 0xF] = val; + LoadSingle(cpu, ((cpu->CurInstr>>12) & 0xF), ((cpu->CurInstr>>16) & 0xF), offset); @@ -236,22 +235,11 @@ A_IMPLEMENT_WB_LDRSTR(LDRB) #define A_STRH \ - offset += cpu->R[(cpu->CurInstr>>16) & 0xF]; \ - u32 storeval = cpu->R[(cpu->CurInstr>>12) & 0xF]; \ - if (((cpu->CurInstr>>12) & 0xF) == 15) storeval+=4; \ - bool dataabort = !cpu->DataWrite16(offset, storeval); \ - cpu->AddCycles_CD(); \ - if (dataabort) return; \ - if (cpu->CurInstr & (1<<21)) cpu->R[(cpu->CurInstr>>16) & 0xF] = 
offset; + if (cpu->CurInstr & (1<<21)) StoreSingle<16, Writeback::Pre>(cpu, ((cpu->CurInstr>>12) & 0xF), ((cpu->CurInstr>>16) & 0xF), offset); \ + else StoreSingle<16, Writeback::None>(cpu, ((cpu->CurInstr>>12) & 0xF), ((cpu->CurInstr>>16) & 0xF), offset); #define A_STRH_POST \ - u32 addr = cpu->R[(cpu->CurInstr>>16) & 0xF]; \ - u32 storeval = cpu->R[(cpu->CurInstr>>12) & 0xF]; \ - if (((cpu->CurInstr>>12) & 0xF) == 15) storeval+=4; \ - bool dataabort = !cpu->DataWrite16(addr, storeval); \ - cpu->AddCycles_CD(); \ - if (dataabort) return; \ - cpu->R[(cpu->CurInstr>>16) & 0xF] += offset; + StoreSingle<16, Writeback::Post>(cpu, ((cpu->CurInstr>>12) & 0xF), ((cpu->CurInstr>>16) & 0xF), offset); // TODO: CHECK LDRD/STRD TIMINGS!! @@ -304,62 +292,25 @@ A_IMPLEMENT_WB_LDRSTR(LDRB) cpu->R[(cpu->CurInstr>>16) & 0xF] += offset; #define A_LDRH \ - offset += cpu->R[(cpu->CurInstr>>16) & 0xF]; \ - u32 val; bool dataabort = !cpu->DataRead16(offset, &val); \ - cpu->AddCycles_CDI(); \ - if (dataabort) return; \ - if (((cpu->CurInstr>>12) & 0xF) == 15) cpu->JumpTo8_16Bit(val); \ - else cpu->R[(cpu->CurInstr>>12) & 0xF] = val; \ - if (cpu->CurInstr & (1<<21)) cpu->R[(cpu->CurInstr>>16) & 0xF] = offset; + if (cpu->CurInstr & (1<<21)) LoadSingle(cpu, ((cpu->CurInstr>>12) & 0xF), ((cpu->CurInstr>>16) & 0xF), offset); \ + else LoadSingle(cpu, ((cpu->CurInstr>>12) & 0xF), ((cpu->CurInstr>>16) & 0xF), offset); #define A_LDRH_POST \ - u32 addr = cpu->R[(cpu->CurInstr>>16) & 0xF]; \ - u32 val; bool dataabort = !cpu->DataRead16(addr, &val); \ - cpu->AddCycles_CDI(); \ - if (dataabort) return; \ - if (((cpu->CurInstr>>12) & 0xF) == 15) cpu->JumpTo8_16Bit(val); \ - else cpu->R[(cpu->CurInstr>>12) & 0xF] = val; \ - cpu->R[(cpu->CurInstr>>16) & 0xF] += offset; + LoadSingle(cpu, ((cpu->CurInstr>>12) & 0xF), ((cpu->CurInstr>>16) & 0xF), offset); #define A_LDRSB \ - offset += cpu->R[(cpu->CurInstr>>16) & 0xF]; \ - u32 val; bool dataabort = !cpu->DataRead8(offset, &val); \ - cpu->AddCycles_CDI(); \ 
- if (dataabort) return; \ - val = (s32)(s8)val; \ - if (((cpu->CurInstr>>12) & 0xF) == 15) cpu->JumpTo8_16Bit(val); \ - else cpu->R[(cpu->CurInstr>>12) & 0xF] = val; \ - if (cpu->CurInstr & (1<<21)) cpu->R[(cpu->CurInstr>>16) & 0xF] = offset; + if (cpu->CurInstr & (1<<21)) LoadSingle(cpu, ((cpu->CurInstr>>12) & 0xF), ((cpu->CurInstr>>16) & 0xF), offset); \ + else LoadSingle(cpu, ((cpu->CurInstr>>12) & 0xF), ((cpu->CurInstr>>16) & 0xF), offset); #define A_LDRSB_POST \ - u32 addr = cpu->R[(cpu->CurInstr>>16) & 0xF]; \ - u32 val; bool dataabort = !cpu->DataRead8(addr, &val); \ - cpu->AddCycles_CDI(); \ - if (dataabort) return; \ - val = (s32)(s8)val; \ - if (((cpu->CurInstr>>12) & 0xF) == 15) cpu->JumpTo8_16Bit(val); \ - else cpu->R[(cpu->CurInstr>>12) & 0xF] = val; \ - cpu->R[(cpu->CurInstr>>16) & 0xF] += offset; + LoadSingle(cpu, ((cpu->CurInstr>>12) & 0xF), ((cpu->CurInstr>>16) & 0xF), offset); #define A_LDRSH \ - offset += cpu->R[(cpu->CurInstr>>16) & 0xF]; \ - u32 val; bool dataabort = !cpu->DataRead16(offset, &val); \ - cpu->AddCycles_CDI(); \ - if (dataabort) return; \ - val = (s32)(s16)val; \ - if (((cpu->CurInstr>>12) & 0xF) == 15) cpu->JumpTo8_16Bit(val); \ - else cpu->R[(cpu->CurInstr>>12) & 0xF] = val; \ - if (cpu->CurInstr & (1<<21)) cpu->R[(cpu->CurInstr>>16) & 0xF] = offset; + if (cpu->CurInstr & (1<<21)) LoadSingle(cpu, ((cpu->CurInstr>>12) & 0xF), ((cpu->CurInstr>>16) & 0xF), offset); \ + else LoadSingle(cpu, ((cpu->CurInstr>>12) & 0xF), ((cpu->CurInstr>>16) & 0xF), offset); #define A_LDRSH_POST \ - u32 addr = cpu->R[(cpu->CurInstr>>16) & 0xF]; \ - u32 val; bool dataabort = !cpu->DataRead16(addr, &val); \ - cpu->AddCycles_CDI(); \ - if (dataabort) return; \ - val = (s32)(s16)val; \ - if (((cpu->CurInstr>>12) & 0xF) == 15) cpu->JumpTo8_16Bit(val); \ - else cpu->R[(cpu->CurInstr>>12) & 0xF] = val; \ - cpu->R[(cpu->CurInstr>>16) & 0xF] += offset; + LoadSingle(cpu, ((cpu->CurInstr>>12) & 0xF), ((cpu->CurInstr>>16) & 0xF), offset); #define 
A_IMPLEMENT_HD_LDRSTR(x) \ @@ -651,149 +602,86 @@ void T_LDR_PCREL(ARM* cpu) void T_STR_REG(ARM* cpu) { - u32 addr = cpu->R[(cpu->CurInstr >> 3) & 0x7] + cpu->R[(cpu->CurInstr >> 6) & 0x7]; - cpu->DataWrite32(addr, cpu->R[cpu->CurInstr & 0x7]); - - cpu->AddCycles_CD(); + StoreSingle<32, Writeback::None>(cpu, (cpu->CurInstr & 0x7), ((cpu->CurInstr >> 3) & 0x7), cpu->R[(cpu->CurInstr >> 6) & 0x7]); } void T_STRB_REG(ARM* cpu) { - u32 addr = cpu->R[(cpu->CurInstr >> 3) & 0x7] + cpu->R[(cpu->CurInstr >> 6) & 0x7]; - cpu->DataWrite8(addr, cpu->R[cpu->CurInstr & 0x7]); - - cpu->AddCycles_CD(); + StoreSingle<8, Writeback::None>(cpu, (cpu->CurInstr & 0x7), ((cpu->CurInstr >> 3) & 0x7), cpu->R[(cpu->CurInstr >> 6) & 0x7]); } void T_LDR_REG(ARM* cpu) { - u32 addr = cpu->R[(cpu->CurInstr >> 3) & 0x7] + cpu->R[(cpu->CurInstr >> 6) & 0x7]; - - u32 val; - if (cpu->DataRead32(addr, &val)) - cpu->R[cpu->CurInstr & 0x7] = ROR(val, 8*(addr&0x3)); - - cpu->AddCycles_CDI(); + LoadSingle(cpu, (cpu->CurInstr & 0x7), ((cpu->CurInstr >> 3) & 0x7), cpu->R[(cpu->CurInstr >> 6) & 0x7]); } void T_LDRB_REG(ARM* cpu) { - u32 addr = cpu->R[(cpu->CurInstr >> 3) & 0x7] + cpu->R[(cpu->CurInstr >> 6) & 0x7]; - cpu->DataRead8(addr, &cpu->R[cpu->CurInstr & 0x7]); - - cpu->AddCycles_CDI(); + LoadSingle(cpu, (cpu->CurInstr & 0x7), ((cpu->CurInstr >> 3) & 0x7), cpu->R[(cpu->CurInstr >> 6) & 0x7]); } void T_STRH_REG(ARM* cpu) { - u32 addr = cpu->R[(cpu->CurInstr >> 3) & 0x7] + cpu->R[(cpu->CurInstr >> 6) & 0x7]; - cpu->DataWrite16(addr, cpu->R[cpu->CurInstr & 0x7]); - - cpu->AddCycles_CD(); + StoreSingle<16, Writeback::None>(cpu, (cpu->CurInstr & 0x7), ((cpu->CurInstr >> 3) & 0x7), cpu->R[(cpu->CurInstr >> 6) & 0x7]); } void T_LDRSB_REG(ARM* cpu) { - u32 addr = cpu->R[(cpu->CurInstr >> 3) & 0x7] + cpu->R[(cpu->CurInstr >> 6) & 0x7]; - if (cpu->DataRead8(addr, &cpu->R[cpu->CurInstr & 0x7])) - cpu->R[cpu->CurInstr & 0x7] = (s32)(s8)cpu->R[cpu->CurInstr & 0x7]; - - cpu->AddCycles_CDI(); + LoadSingle(cpu, 
(cpu->CurInstr & 0x7), ((cpu->CurInstr >> 3) & 0x7), cpu->R[(cpu->CurInstr >> 6) & 0x7]); } void T_LDRH_REG(ARM* cpu) { - u32 addr = cpu->R[(cpu->CurInstr >> 3) & 0x7] + cpu->R[(cpu->CurInstr >> 6) & 0x7]; - cpu->DataRead16(addr, &cpu->R[cpu->CurInstr & 0x7]); - - cpu->AddCycles_CDI(); + LoadSingle(cpu, (cpu->CurInstr & 0x7), ((cpu->CurInstr >> 3) & 0x7), cpu->R[(cpu->CurInstr >> 6) & 0x7]); } void T_LDRSH_REG(ARM* cpu) { - u32 addr = cpu->R[(cpu->CurInstr >> 3) & 0x7] + cpu->R[(cpu->CurInstr >> 6) & 0x7]; - if (cpu->DataRead16(addr, &cpu->R[cpu->CurInstr & 0x7])) - cpu->R[cpu->CurInstr & 0x7] = (s32)(s16)cpu->R[cpu->CurInstr & 0x7]; - - cpu->AddCycles_CDI(); + LoadSingle(cpu, (cpu->CurInstr & 0x7), ((cpu->CurInstr >> 3) & 0x7), cpu->R[(cpu->CurInstr >> 6) & 0x7]); } void T_STR_IMM(ARM* cpu) { - u32 offset = (cpu->CurInstr >> 4) & 0x7C; - offset += cpu->R[(cpu->CurInstr >> 3) & 0x7]; - - cpu->DataWrite32(offset, cpu->R[cpu->CurInstr & 0x7]); - cpu->AddCycles_CD(); + StoreSingle<32, Writeback::None>(cpu, (cpu->CurInstr & 0x7), ((cpu->CurInstr >> 3) & 0x7), ((cpu->CurInstr >> 4) & 0x7C)); } void T_LDR_IMM(ARM* cpu) { - u32 offset = (cpu->CurInstr >> 4) & 0x7C; - offset += cpu->R[(cpu->CurInstr >> 3) & 0x7]; - - u32 val; - if (cpu->DataRead32(offset, &val)) - cpu->R[cpu->CurInstr & 0x7] = ROR(val, 8*(offset&0x3)); - cpu->AddCycles_CDI(); + LoadSingle(cpu, (cpu->CurInstr & 0x7), ((cpu->CurInstr >> 3) & 0x7), ((cpu->CurInstr >> 4) & 0x7C)); } void T_STRB_IMM(ARM* cpu) { - u32 offset = (cpu->CurInstr >> 6) & 0x1F; - offset += cpu->R[(cpu->CurInstr >> 3) & 0x7]; - - cpu->DataWrite8(offset, cpu->R[cpu->CurInstr & 0x7]); - cpu->AddCycles_CD(); + StoreSingle<8, Writeback::None>(cpu, (cpu->CurInstr & 0x7), ((cpu->CurInstr >> 3) & 0x7), ((cpu->CurInstr >> 6) & 0x1F)); } void T_LDRB_IMM(ARM* cpu) { - u32 offset = (cpu->CurInstr >> 6) & 0x1F; - offset += cpu->R[(cpu->CurInstr >> 3) & 0x7]; - - cpu->DataRead8(offset, &cpu->R[cpu->CurInstr & 0x7]); - cpu->AddCycles_CDI(); + 
LoadSingle(cpu, (cpu->CurInstr & 0x7), ((cpu->CurInstr >> 3) & 0x7), ((cpu->CurInstr >> 6) & 0x1F)); } void T_STRH_IMM(ARM* cpu) { - u32 offset = (cpu->CurInstr >> 5) & 0x3E; - offset += cpu->R[(cpu->CurInstr >> 3) & 0x7]; - - cpu->DataWrite16(offset, cpu->R[cpu->CurInstr & 0x7]); - cpu->AddCycles_CD(); + StoreSingle<16, Writeback::None>(cpu, (cpu->CurInstr & 0x7), ((cpu->CurInstr >> 3) & 0x7), ((cpu->CurInstr >> 5) & 0x3E)); } void T_LDRH_IMM(ARM* cpu) { - u32 offset = (cpu->CurInstr >> 5) & 0x3E; - offset += cpu->R[(cpu->CurInstr >> 3) & 0x7]; - - cpu->DataRead16(offset, &cpu->R[cpu->CurInstr & 0x7]); - cpu->AddCycles_CDI(); + LoadSingle(cpu, (cpu->CurInstr & 0x7), ((cpu->CurInstr >> 3) & 0x7), ((cpu->CurInstr >> 5) & 0x3E)); } void T_STR_SPREL(ARM* cpu) { - u32 offset = (cpu->CurInstr << 2) & 0x3FC; - offset += cpu->R[13]; - - cpu->DataWrite32(offset, cpu->R[(cpu->CurInstr >> 8) & 0x7]); - cpu->AddCycles_CD(); + StoreSingle<32, Writeback::None>(cpu, ((cpu->CurInstr >> 8) & 0x7), 13, ((cpu->CurInstr << 2) & 0x3FC)); } void T_LDR_SPREL(ARM* cpu) { - u32 offset = (cpu->CurInstr << 2) & 0x3FC; - offset += cpu->R[13]; - - cpu->DataRead32(offset, &cpu->R[(cpu->CurInstr >> 8) & 0x7]); - cpu->AddCycles_CDI(); + LoadSingle(cpu, ((cpu->CurInstr >> 8) & 0x7), 13, ((cpu->CurInstr << 2) & 0x3FC)); } From f692e7391af134e55e998e3e7e5e65c764f07927 Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Mon, 26 Aug 2024 16:31:20 -0400 Subject: [PATCH 112/306] the docs lied to me (again) --- src/ARMInterpreter.cpp | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/src/ARMInterpreter.cpp b/src/ARMInterpreter.cpp index 6e6c9a8d..15ec42db 100644 --- a/src/ARMInterpreter.cpp +++ b/src/ARMInterpreter.cpp @@ -121,8 +121,7 @@ void A_MSR_IMM(ARM* cpu) if (!(cpu->CurInstr & (1<<22))) cpu->UpdateMode(oldpsr, cpu->CPSR); - if ((cpu->Num != 1) && (cpu->CurInstr & (0x7<<16))) cpu->AddCycles_CI(2); - else cpu->AddCycles_C(); + 
cpu->AddCycles_C(); } void A_MSR_REG(ARM* cpu) @@ -175,8 +174,7 @@ void A_MSR_REG(ARM* cpu) if (!(cpu->CurInstr & (1<<22))) cpu->UpdateMode(oldpsr, cpu->CPSR); - if ((cpu->Num != 1) && (cpu->CurInstr & (0x7<<16))) cpu->AddCycles_CI(2); - else cpu->AddCycles_C(); + cpu->AddCycles_C(); } void A_MRS(ARM* cpu) From a9aad74539392e55443b012e868ffbcc83585af8 Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Mon, 26 Aug 2024 20:43:27 -0400 Subject: [PATCH 113/306] implement user mode load/stores --- src/ARMInterpreter_LoadStore.cpp | 31 ++++++++++++++++++++++--------- 1 file changed, 22 insertions(+), 9 deletions(-) diff --git a/src/ARMInterpreter_LoadStore.cpp b/src/ARMInterpreter_LoadStore.cpp index 734b57d0..c9128666 100644 --- a/src/ARMInterpreter_LoadStore.cpp +++ b/src/ARMInterpreter_LoadStore.cpp @@ -63,6 +63,7 @@ enum class Writeback None = 0, Pre, Post, + Trans, }; template @@ -71,15 +72,27 @@ void LoadSingle(ARM* cpu, u8 rd, u8 rn, s32 offset) static_assert((size == 8) || (size == 16) || (size == 32), "dummy this function only takes 8/16/32 for size!!!"); u32 addr; - if constexpr (writeback != Writeback::Post) addr = offset + cpu->R[rn]; + if constexpr (writeback < Writeback::Post) addr = offset + cpu->R[rn]; else addr = cpu->R[rn]; + if constexpr (writeback == Writeback::Trans) + { + if (cpu->Num == 0) + ((ARMv5*)cpu)->PU_Map = ((ARMv5*)cpu)->PU_UserMap; + } + u32 val; bool dataabort; if constexpr (size == 8) dataabort = !cpu->DataRead8 (addr, &val); if constexpr (size == 16) dataabort = !cpu->DataRead16(addr, &val); if constexpr (size == 32) dataabort = !cpu->DataRead32(addr, &val); + if constexpr (writeback == Writeback::Trans) + { + if (cpu->Num == 0 && (cpu->CPSR & 0x1F) != 0x10) + ((ARMv5*)cpu)->PU_Map = ((ARMv5*)cpu)->PU_PrivMap; + } + cpu->AddCycles_CDI(); if (dataabort) return; @@ -125,33 +138,33 @@ void StoreSingle(ARM* cpu, u8 rd, u8 rn, s32 offset) if (cpu->CurInstr & (1<<21)) StoreSingle<32, Writeback::Pre>(cpu, 
((cpu->CurInstr>>12) & 0xF), ((cpu->CurInstr>>16) & 0xF), offset); \ else StoreSingle<32, Writeback::None>(cpu, ((cpu->CurInstr>>12) & 0xF), ((cpu->CurInstr>>16) & 0xF), offset); -// TODO: user mode (bit21) #define A_STR_POST \ - StoreSingle<32, Writeback::Post>(cpu, ((cpu->CurInstr>>12) & 0xF), ((cpu->CurInstr>>16) & 0xF), offset); + if (cpu->CurInstr & (1<<21)) StoreSingle<32, Writeback::Trans>(cpu, ((cpu->CurInstr>>12) & 0xF), ((cpu->CurInstr>>16) & 0xF), offset); \ + else StoreSingle<32, Writeback::Post>(cpu, ((cpu->CurInstr>>12) & 0xF), ((cpu->CurInstr>>16) & 0xF), offset); #define A_STRB \ if (cpu->CurInstr & (1<<21)) StoreSingle<8, Writeback::Pre>(cpu, ((cpu->CurInstr>>12) & 0xF), ((cpu->CurInstr>>16) & 0xF), offset); \ else StoreSingle<8, Writeback::None>(cpu, ((cpu->CurInstr>>12) & 0xF), ((cpu->CurInstr>>16) & 0xF), offset); -// TODO: user mode (bit21) #define A_STRB_POST \ - StoreSingle<8, Writeback::Post>(cpu, ((cpu->CurInstr>>12) & 0xF), ((cpu->CurInstr>>16) & 0xF), offset); + if (cpu->CurInstr & (1<<21)) StoreSingle<8, Writeback::Trans>(cpu, ((cpu->CurInstr>>12) & 0xF), ((cpu->CurInstr>>16) & 0xF), offset); \ + else StoreSingle<8, Writeback::Post>(cpu, ((cpu->CurInstr>>12) & 0xF), ((cpu->CurInstr>>16) & 0xF), offset); #define A_LDR \ if (cpu->CurInstr & (1<<21)) LoadSingle(cpu, ((cpu->CurInstr>>12) & 0xF), ((cpu->CurInstr>>16) & 0xF), offset); \ else LoadSingle(cpu, ((cpu->CurInstr>>12) & 0xF), ((cpu->CurInstr>>16) & 0xF), offset); -// TODO: user mode #define A_LDR_POST \ - LoadSingle(cpu, ((cpu->CurInstr>>12) & 0xF), ((cpu->CurInstr>>16) & 0xF), offset); + if (cpu->CurInstr & (1<<21)) LoadSingle(cpu, ((cpu->CurInstr>>12) & 0xF), ((cpu->CurInstr>>16) & 0xF), offset); \ + else LoadSingle(cpu, ((cpu->CurInstr>>12) & 0xF), ((cpu->CurInstr>>16) & 0xF), offset); #define A_LDRB \ if (cpu->CurInstr & (1<<21)) LoadSingle(cpu, ((cpu->CurInstr>>12) & 0xF), ((cpu->CurInstr>>16) & 0xF), offset); \ else LoadSingle(cpu, ((cpu->CurInstr>>12) & 0xF), 
((cpu->CurInstr>>16) & 0xF), offset); -// TODO: user mode #define A_LDRB_POST \ - LoadSingle(cpu, ((cpu->CurInstr>>12) & 0xF), ((cpu->CurInstr>>16) & 0xF), offset); + if (cpu->CurInstr & (1<<21)) LoadSingle(cpu, ((cpu->CurInstr>>12) & 0xF), ((cpu->CurInstr>>16) & 0xF), offset); \ + else LoadSingle(cpu, ((cpu->CurInstr>>12) & 0xF), ((cpu->CurInstr>>16) & 0xF), offset); From be290da23ccf76a6f76d32347558c52c4bf67b6a Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Tue, 27 Aug 2024 17:23:18 -0400 Subject: [PATCH 114/306] de-duplicate swp(b) --- src/ARMInterpreter_LoadStore.cpp | 43 +++++++++++++++----------------- 1 file changed, 20 insertions(+), 23 deletions(-) diff --git a/src/ARMInterpreter_LoadStore.cpp b/src/ARMInterpreter_LoadStore.cpp index c9128666..8deeaa4f 100644 --- a/src/ARMInterpreter_LoadStore.cpp +++ b/src/ARMInterpreter_LoadStore.cpp @@ -360,48 +360,45 @@ A_IMPLEMENT_HD_LDRSTR(LDRSH) -void A_SWP(ARM* cpu) +template +inline void SWP(ARM* cpu) { u32 base = cpu->R[(cpu->CurInstr >> 16) & 0xF]; u32 rm = cpu->R[cpu->CurInstr & 0xF]; if ((cpu->CurInstr & 0xF) == 15) rm += 4; u32 val; - if (cpu->DataRead32(base, &val)) + if ((byte ? cpu->DataRead8 (base, &val) + : cpu->DataRead32(base, &val))) { u32 numD = cpu->DataCycles; - if (cpu->DataWrite32(base, rm)) + + if ((byte ? cpu->DataWrite8 (base, rm) + : cpu->DataWrite32(base, rm))) { // rd only gets updated if both read and write succeed u32 rd = (cpu->CurInstr >> 12) & 0xF; - if (rd != 15) cpu->R[rd] = ROR(val, 8*(base&0x3)); - else if (cpu->Num==1) cpu->JumpTo(ROR(val, 8*(base&0x3)) & ~1); // for some reason these jumps don't work on the arm 9? + + if constexpr (!byte) val = ROR(val, 8*(base&0x3)); + + if (rd != 15) cpu->R[rd] = val; + else if (cpu->Num==1) cpu->JumpTo(val & ~1); // for some reason these jumps don't seem to work on the arm 9? 
} + cpu->DataCycles += numD; } + cpu->AddCycles_CDI(); } +void A_SWP(ARM* cpu) +{ + void SWP(ARM* cpu); +} + void A_SWPB(ARM* cpu) { - u32 base = cpu->R[(cpu->CurInstr >> 16) & 0xF]; - u32 rm = cpu->R[cpu->CurInstr & 0xF] & 0xFF; - if ((cpu->CurInstr & 0xF) == 15) rm += 4; - - u32 val; - if (cpu->DataRead8(base, &val)) - { - u32 numD = cpu->DataCycles; - if (cpu->DataWrite8(base, rm)) - { - // rd only gets updated if both read and write succeed - u32 rd = (cpu->CurInstr >> 12) & 0xF; - if (rd != 15) cpu->R[rd] = val; - else if (cpu->Num==1) cpu->JumpTo(val & ~1); // for some reason these jumps don't work on the arm 9? - } - cpu->DataCycles += numD; - } - cpu->AddCycles_CDI(); + void SWP(ARM* cpu); } From 685c4828a253e53ab52e0d10abff264055a429b4 Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Wed, 28 Aug 2024 13:45:46 -0400 Subject: [PATCH 115/306] try not forgetting about stores lol --- src/ARMInterpreter_LoadStore.cpp | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/src/ARMInterpreter_LoadStore.cpp b/src/ARMInterpreter_LoadStore.cpp index 8deeaa4f..a8f8cb1b 100644 --- a/src/ARMInterpreter_LoadStore.cpp +++ b/src/ARMInterpreter_LoadStore.cpp @@ -116,17 +116,29 @@ void StoreSingle(ARM* cpu, u8 rd, u8 rn, s32 offset) static_assert((size == 8) || (size == 16) || (size == 32), "dummy this function only takes 8/16/32 for size!!!"); u32 addr; - if constexpr (writeback != Writeback::Post) addr = offset + cpu->R[rn]; + if constexpr (writeback < Writeback::Post) addr = offset + cpu->R[rn]; else addr = cpu->R[rn]; u32 storeval = cpu->R[rd]; if (rd == 15) storeval += 4; + + if constexpr (writeback == Writeback::Trans) + { + if (cpu->Num == 0) + ((ARMv5*)cpu)->PU_Map = ((ARMv5*)cpu)->PU_UserMap; + } bool dataabort; if constexpr (size == 8) dataabort = !cpu->DataWrite8 (addr, storeval); if constexpr (size == 16) dataabort = !cpu->DataWrite16(addr, storeval); if constexpr (size == 32) dataabort = 
!cpu->DataWrite32(addr, storeval); + if constexpr (writeback == Writeback::Trans) + { + if (cpu->Num == 0 && (cpu->CPSR & 0x1F) != 0x10) + ((ARMv5*)cpu)->PU_Map = ((ARMv5*)cpu)->PU_PrivMap; + } + cpu->AddCycles_CD(); if (dataabort) return; From 00038217382c2666d598f1b0582698a4bb186a4f Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Wed, 28 Aug 2024 22:04:22 -0400 Subject: [PATCH 116/306] apparently i never tested this --- src/ARMInterpreter_LoadStore.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/ARMInterpreter_LoadStore.cpp b/src/ARMInterpreter_LoadStore.cpp index a8f8cb1b..659deaef 100644 --- a/src/ARMInterpreter_LoadStore.cpp +++ b/src/ARMInterpreter_LoadStore.cpp @@ -405,12 +405,12 @@ inline void SWP(ARM* cpu) void A_SWP(ARM* cpu) { - void SWP(ARM* cpu); + SWP(cpu); } void A_SWPB(ARM* cpu) { - void SWP(ARM* cpu); + SWP(cpu); } From 2d081a6e02fd48275156b255d985bf064cfb1ad8 Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Sun, 1 Sep 2024 15:15:59 -0400 Subject: [PATCH 117/306] improve arm7 timings --- src/ARM.cpp | 236 ++++++++++++++++++++++++++++++++++++---------------- src/ARM.h | 15 ++-- 2 files changed, 172 insertions(+), 79 deletions(-) diff --git a/src/ARM.cpp b/src/ARM.cpp index 7cfa3589..e3c07397 100644 --- a/src/ARM.cpp +++ b/src/ARM.cpp @@ -190,6 +190,8 @@ void ARM::Reset() BreakReq = false; #endif + MainRAMTimestamp = 0; + // zorp JumpTo(ExceptionBase); } @@ -201,6 +203,13 @@ void ARMv5::Reset() ARM::Reset(); } +void ARMv4::Reset() +{ + Nonseq = true; + + ARM::Reset(); +} + void ARM::DoSavestate(Savestate* file) { @@ -377,22 +386,15 @@ void ARMv4::JumpTo(u32 addr, bool restorecpsr) else addr &= ~0x1; } - u32 oldregion = R[15] >> 23; - u32 newregion = addr >> 23; - - CodeRegion = addr >> 24; - CodeCycles = addr >> 15; // cheato - if (addr & 0x1) { addr &= ~0x1; R[15] = addr+2; - //if (newregion != oldregion) SetupCodeMem(addr); - + Nonseq = 
true; NextInstr[0] = CodeRead16(addr); + Nonseq = false; NextInstr[1] = CodeRead16(addr+2); - Cycles += NDS.ARM7MemTimings[CodeCycles][0] + NDS.ARM7MemTimings[CodeCycles][1]; CPSR |= 0x20; } @@ -400,12 +402,11 @@ void ARMv4::JumpTo(u32 addr, bool restorecpsr) { addr &= ~0x3; R[15] = addr+4; - - //if (newregion != oldregion) SetupCodeMem(addr); - + + Nonseq = true; NextInstr[0] = CodeRead32(addr); + Nonseq = false; NextInstr[1] = CodeRead32(addr+4); - Cycles += NDS.ARM7MemTimings[CodeCycles][2] + NDS.ARM7MemTimings[CodeCycles][3]; CPSR &= ~0x20; } @@ -815,7 +816,7 @@ void ARMv4::Execute() { if ((Halted == 1 || IdleLoop) && NDS.ARM7Timestamp < NDS.ARM7Target) { - Cycles = 0; + //Cycles = 0; NDS.ARM7Timestamp = NDS.ARM7Target; } IdleLoop = 0; @@ -882,8 +883,8 @@ void ARMv4::Execute() }*/ } - NDS.ARM7Timestamp += Cycles; - Cycles = 0; + //NDS.ARM7Timestamp += Cycles; + //Cycles = 0; } if (Halted == 2) @@ -1152,58 +1153,161 @@ u32 ARMv5::ReadMem(u32 addr, int size) } #endif +u16 ARMv4::CodeRead16(u32 addr) +{ + if ((addr >> 24) == 0x02) + { + if (NDS.ARM7Timestamp < MainRAMTimestamp) NDS.ARM7Timestamp = MainRAMTimestamp; + } + + NDS.ARM7Timestamp += NDS.ARM7MemTimings[addr>>15][Nonseq?0:1]; + + if ((addr >> 24) == 0x02) + { + MainRAMTimestamp = NDS.ARM7Timestamp; + NDS.ARM7Timestamp -= 3; + } + + return BusRead16(addr); +} + +u32 ARMv4::CodeRead32(u32 addr) +{ + if ((addr >> 24) == 0x02) + { + if (NDS.ARM7Timestamp < MainRAMTimestamp) NDS.ARM7Timestamp = MainRAMTimestamp; + } + + NDS.ARM7Timestamp += NDS.ARM7MemTimings[addr>>15][Nonseq?2:3]; + + if ((addr >> 24) == 0x02) + { + MainRAMTimestamp = NDS.ARM7Timestamp; + NDS.ARM7Timestamp -= 3; + } + + return BusRead32(addr); +} + bool ARMv4::DataRead8(u32 addr, u32* val) { + if ((addr >> 24) == 0x02) + { + if (NDS.ARM7Timestamp < MainRAMTimestamp) NDS.ARM7Timestamp = MainRAMTimestamp; + } + + NDS.ARM7Timestamp += NDS.ARM7MemTimings[addr >> 15][0]; + + if ((addr >> 24) == 0x02) + { + MainRAMTimestamp = NDS.ARM7Timestamp; + 
NDS.ARM7Timestamp -= 3; + } + *val = BusRead8(addr); - DataRegion = addr; - DataCycles = NDS.ARM7MemTimings[addr >> 15][0]; return true; } bool ARMv4::DataRead16(u32 addr, u32* val) { addr &= ~1; + + if ((addr >> 24) == 0x02) + { + if (NDS.ARM7Timestamp < MainRAMTimestamp) NDS.ARM7Timestamp = MainRAMTimestamp; + } + + NDS.ARM7Timestamp += NDS.ARM7MemTimings[addr >> 15][0]; + + if ((addr >> 24) == 0x02) + { + MainRAMTimestamp = NDS.ARM7Timestamp; + NDS.ARM7Timestamp -= 3; + } *val = BusRead16(addr); - DataRegion = addr; - DataCycles = NDS.ARM7MemTimings[addr >> 15][0]; return true; } bool ARMv4::DataRead32(u32 addr, u32* val) { addr &= ~3; + + if ((addr >> 24) == 0x02) + { + if (NDS.ARM7Timestamp < MainRAMTimestamp) NDS.ARM7Timestamp = MainRAMTimestamp; + } + + NDS.ARM7Timestamp += NDS.ARM7MemTimings[addr >> 15][2]; + + if ((addr >> 24) == 0x02) + { + MainRAMTimestamp = NDS.ARM7Timestamp; + NDS.ARM7Timestamp -= 3; + } *val = BusRead32(addr); - DataRegion = addr; - DataCycles = NDS.ARM7MemTimings[addr >> 15][2]; return true; } bool ARMv4::DataRead32S(u32 addr, u32* val) { addr &= ~3; + + if ((addr >> 24) == 0x02) + { + if (NDS.ARM7Timestamp < MainRAMTimestamp) NDS.ARM7Timestamp = MainRAMTimestamp; + } + + NDS.ARM7Timestamp += NDS.ARM7MemTimings[addr >> 15][3]; + + if ((addr >> 24) == 0x02) + { + MainRAMTimestamp = NDS.ARM7Timestamp; + NDS.ARM7Timestamp -= 3; + } *val = BusRead32(addr); - DataCycles += NDS.ARM7MemTimings[addr >> 15][3]; return true; } bool ARMv4::DataWrite8(u32 addr, u8 val) { + if ((addr >> 24) == 0x02) + { + if (NDS.ARM7Timestamp < MainRAMTimestamp) NDS.ARM7Timestamp = MainRAMTimestamp; + } + + NDS.ARM7Timestamp += NDS.ARM7MemTimings[addr >> 15][0]; + + if ((addr >> 24) == 0x02) + { + MainRAMTimestamp = NDS.ARM7Timestamp; + NDS.ARM7Timestamp -= 5; + } + BusWrite8(addr, val); - DataRegion = addr; - DataCycles = NDS.ARM7MemTimings[addr >> 15][0]; return true; } bool ARMv4::DataWrite16(u32 addr, u16 val) { addr &= ~1; + + if ((addr >> 24) == 0x02) + { 
+ if (NDS.ARM7Timestamp < MainRAMTimestamp) NDS.ARM7Timestamp = MainRAMTimestamp; + } + + NDS.ARM7Timestamp += NDS.ARM7MemTimings[addr >> 15][0]; + + if ((addr >> 24) == 0x02) + { + MainRAMTimestamp = NDS.ARM7Timestamp; + NDS.ARM7Timestamp -= 5; + } BusWrite16(addr, val); - DataRegion = addr; - DataCycles = NDS.ARM7MemTimings[addr >> 15][0]; return true; } @@ -1211,9 +1315,20 @@ bool ARMv4::DataWrite32(u32 addr, u32 val) { addr &= ~3; + if ((addr >> 24) == 0x02) + { + if (NDS.ARM7Timestamp < MainRAMTimestamp) NDS.ARM7Timestamp = MainRAMTimestamp; + } + + NDS.ARM7Timestamp += NDS.ARM7MemTimings[addr >> 15][2]; + + if ((addr >> 24) == 0x02) + { + MainRAMTimestamp = NDS.ARM7Timestamp; + NDS.ARM7Timestamp -= 5; + } + BusWrite32(addr, val); - DataRegion = addr; - DataCycles = NDS.ARM7MemTimings[addr >> 15][2]; return true; } @@ -1221,8 +1336,20 @@ bool ARMv4::DataWrite32S(u32 addr, u32 val, bool dataabort) { addr &= ~3; + if ((addr >> 24) == 0x02) + { + if (NDS.ARM7Timestamp < MainRAMTimestamp) NDS.ARM7Timestamp = MainRAMTimestamp; + } + + NDS.ARM7Timestamp += NDS.ARM7MemTimings[addr >> 15][3]; + + if ((addr >> 24) == 0x02) + { + MainRAMTimestamp = NDS.ARM7Timestamp; + NDS.ARM7Timestamp -= 5; + } + BusWrite32(addr, val); - DataCycles += NDS.ARM7MemTimings[addr >> 15][3]; return true; } @@ -1230,63 +1357,30 @@ bool ARMv4::DataWrite32S(u32 addr, u32 val, bool dataabort) void ARMv4::AddCycles_C() { // code only. this code fetch is sequential. - Cycles += NDS.ARM7MemTimings[CodeCycles][(CPSR&0x20)?1:3]; + Nonseq = false; } void ARMv4::AddCycles_CI(s32 num) { // code+internal. results in a nonseq code fetch. - Cycles += NDS.ARM7MemTimings[CodeCycles][(CPSR&0x20)?0:2] + num; + NDS.ARM7Timestamp += num; + + Nonseq = true; } void ARMv4::AddCycles_CDI() { // LDR/LDM cycles. 
- s32 numC = NDS.ARM7MemTimings[CodeCycles][(CPSR&0x20)?0:2]; - s32 numD = DataCycles; + NDS.ARM7Timestamp += 1; - if ((DataRegion >> 24) == 0x02) // mainRAM - { - if (CodeRegion == 0x02) - Cycles += numC + numD; - else - { - numC++; - Cycles += std::max(numC + numD - 3, std::max(numC, numD)); - } - } - else if (CodeRegion == 0x02) - { - numD++; - Cycles += std::max(numC + numD - 3, std::max(numC, numD)); - } - else - { - Cycles += numC + numD + 1; - } + Nonseq = true; } void ARMv4::AddCycles_CD() { // TODO: max gain should be 5c when writing to mainRAM - s32 numC = NDS.ARM7MemTimings[CodeCycles][(CPSR&0x20)?0:2]; - s32 numD = DataCycles; - - if ((DataRegion >> 24) == 0x02) - { - if (CodeRegion == 0x02) - Cycles += numC + numD; - else - Cycles += std::max(numC + numD - 3, std::max(numC, numD)); - } - else if (CodeRegion == 0x02) - { - Cycles += std::max(numC + numD - 3, std::max(numC, numD)); - } - else - { - Cycles += numC + numD; - } + + Nonseq = true; } u8 ARMv5::BusRead8(u32 addr) diff --git a/src/ARM.h b/src/ARM.h index 2603e646..f878d94b 100644 --- a/src/ARM.h +++ b/src/ARM.h @@ -185,6 +185,8 @@ public: MemRegion CodeMem; + u64 MainRAMTimestamp; + #ifdef JIT_ENABLED u32 FastBlockLookupStart, FastBlockLookupSize; u64* FastBlockLookup; @@ -384,6 +386,8 @@ class ARMv4 : public ARM { public: ARMv4(melonDS::NDS& nds, std::optional gdb, bool jit); + + void Reset() override; void FillPipeline() override; @@ -393,15 +397,10 @@ public: template void Execute(); - u16 CodeRead16(u32 addr) - { - return BusRead16(addr); - } + bool Nonseq; - u32 CodeRead32(u32 addr) - { - return BusRead32(addr); - } + u16 CodeRead16(u32 addr); + u32 CodeRead32(u32 addr); bool DataRead8(u32 addr, u32* val) override; bool DataRead16(u32 addr, u32* val) override; From 7cfc4b5849c8713201afb609cd65e6027dc120db Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Mon, 2 Sep 2024 07:35:49 -0400 Subject: [PATCH 118/306] ARM7: vram is 32 bit? 
--- src/NDS.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/NDS.cpp b/src/NDS.cpp index 1023d3c0..89b334d8 100644 --- a/src/NDS.cpp +++ b/src/NDS.cpp @@ -261,7 +261,7 @@ void NDS::InitTimings() SetARM7RegionTimings(0x02000, 0x03000, Mem7_MainRAM, 16, 8, 1); // main RAM SetARM7RegionTimings(0x03000, 0x04000, Mem7_WRAM, 32, 1, 1); // ARM7/shared WRAM SetARM7RegionTimings(0x04000, 0x04800, Mem7_IO, 32, 1, 1); // IO - SetARM7RegionTimings(0x06000, 0x07000, Mem7_VRAM, 16, 1, 1); // ARM7 VRAM + SetARM7RegionTimings(0x06000, 0x07000, Mem7_VRAM, 32, 1, 1); // ARM7 VRAM // handled later: GBA slot, wifi } From 299713e412f401977a62ddebe77a03cb6b3689cd Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Thu, 5 Sep 2024 09:13:46 -0400 Subject: [PATCH 119/306] basic arm9 set up --- src/ARM.cpp | 37 +++++++++++---- src/ARM.h | 56 ++++++++++++----------- src/CP15.cpp | 127 ++++++++++++++++++++++++++++++++++++++++++++++++--- 3 files changed, 177 insertions(+), 43 deletions(-) diff --git a/src/ARM.cpp b/src/ARM.cpp index e3c07397..105607f5 100644 --- a/src/ARM.cpp +++ b/src/ARM.cpp @@ -151,6 +151,7 @@ void ARM::Reset() { Cycles = 0; Halted = 0; + DataCycles = 0; IRQ = 0; @@ -199,6 +200,13 @@ void ARM::Reset() void ARMv5::Reset() { PU_Map = PU_PrivMap; + + TimestampActual = 0; + InterlockMem = 16; + InterlockWBCur = 16; + InterlockWBPrev = 16; + Store = false; + InterlockMask = 0; ARM::Reset(); } @@ -310,14 +318,12 @@ void ARMv5::JumpTo(u32 addr, bool restorecpsr) u32 oldregion = R[15] >> 24; u32 newregion = addr >> 24; - RegionCodeCycles = MemTimings[addr >> 12][0]; - if (addr & 0x1) { addr &= ~0x1; R[15] = addr+2; - if (newregion != oldregion) SetupCodeMem(addr); + //if (newregion != oldregion) SetupCodeMem(addr); // two-opcodes-at-once fetch // doesn't matter if we put garbage in the MSbs there @@ -342,7 +348,7 @@ void ARMv5::JumpTo(u32 addr, bool restorecpsr) addr &= ~0x3; R[15] = addr+4; - if (newregion != oldregion) 
SetupCodeMem(addr); + //if (newregion != oldregion) SetupCodeMem(addr); NextInstr[0] = CodeRead32(addr, true); Cycles += CodeCycles; @@ -744,8 +750,8 @@ void ARMv5::Execute() }*/ } - NDS.ARM9Timestamp += Cycles; - Cycles = 0; + //NDS.ARM9Timestamp += Cycles; + //Cycles = 0; } if (Halted == 2) @@ -816,7 +822,7 @@ void ARMv4::Execute() { if ((Halted == 1 || IdleLoop) && NDS.ARM7Timestamp < NDS.ARM7Target) { - //Cycles = 0; + Cycles = 0; NDS.ARM7Timestamp = NDS.ARM7Target; } IdleLoop = 0; @@ -882,9 +888,6 @@ void ARMv4::Execute() TriggerIRQ(); }*/ } - - //NDS.ARM7Timestamp += Cycles; - //Cycles = 0; } if (Halted == 2) @@ -1153,6 +1156,20 @@ u32 ARMv5::ReadMem(u32 addr, int size) } #endif + +void ARMv5::AddCycles_CI(s32 numI) +{ + NDS.ARM9Timestamp += numI; +} + +void ARMv5::AddCycles_MW() +{ + u64 TimestampActual = DataCycles + NDS.ARM9Timestamp; + s32 cycles = DataCycles - (3< 0) NDS.ARM9Timestamp += cycles; +} + u16 ARMv4::CodeRead16(u32 addr) { if ((addr >> 24) == 0x02) diff --git a/src/ARM.h b/src/ARM.h index f878d94b..31ff56cc 100644 --- a/src/ARM.h +++ b/src/ARM.h @@ -265,44 +265,41 @@ public: bool DataWrite16(u32 addr, u16 val) override; bool DataWrite32(u32 addr, u32 val) override; bool DataWrite32S(u32 addr, u32 val, bool dataabort = false) override; + + template + void ExecuteStage(u8 rn, u8 rm) + { + static_assert((nregs < 2), "too many regs"); + + if constexpr (nregs == 1) + { + InterlockMask = 1 << rn; + } + if constexpr (nregs == 2) + { + InterlockMask = 1 << rn | 1 << rm; + } + + AddCycles_C(); + } + void AddCycles_C() override { - // code only. always nonseq 32-bit for ARM9. - s32 numC = (R[15] & 0x2) ? 0 : CodeCycles; - Cycles += numC; } - void AddCycles_CI(s32 numI) override - { - // code+internal - s32 numC = (R[15] & 0x2) ? 0 : CodeCycles; - Cycles += numC + numI; - } + void AddCycles_CI(s32 numI) override; + + void AddCycles_MW(); void AddCycles_CDI() override { - // LDR/LDM cycles. ARM9 seems to skip the internal cycle there. 
- // TODO: ITCM data fetches shouldn't be parallelized, they say - s32 numC = (R[15] & 0x2) ? 0 : CodeCycles; - s32 numD = DataCycles; - - //if (DataRegion != CodeRegion) - Cycles += std::max(numC + numD - 6, std::max(numC, numD)); - //else - // Cycles += numC + numD; + AddCycles_MW(); } void AddCycles_CD() override { - // TODO: ITCM data fetches shouldn't be parallelized, they say - s32 numC = (R[15] & 0x2) ? 0 : CodeCycles; - s32 numD = DataCycles; - - //if (DataRegion != CodeRegion) - Cycles += std::max(numC + numD - 6, std::max(numC, numD)); - //else - // Cycles += numC + numD; + AddCycles_MW(); } void GetCodeMemRegion(u32 addr, MemRegion* region); @@ -367,6 +364,13 @@ public: u8* CurICacheLine; bool (*GetMemRegion)(u32 addr, bool write, MemRegion* region); + + u64 TimestampActual; + u8 InterlockMem; + u8 InterlockWBCur; + u8 InterlockWBPrev; + bool Store; + u16 InterlockMask; #ifdef GDBSTUB_ENABLED u32 ReadMem(u32 addr, int size) override; diff --git a/src/CP15.cpp b/src/CP15.cpp index 6fcaff93..eb84d3ee 100644 --- a/src/CP15.cpp +++ b/src/CP15.cpp @@ -299,13 +299,16 @@ void ARMv5::UpdateRegionTimings(u32 addrstart, u32 addrend) u8 pu = PU_Map[i]; u8* bustimings = NDS.ARM9MemTimings[i >> 2]; + // checkme: should these be (bus timings shifted) - 1 or ((bustimings - 1) shifted) + 1 + // should the last cycle be halved...? 
+ if (pu & 0x40) { MemTimings[i][0] = 0xFF;//kCodeCacheTiming; } else { - MemTimings[i][0] = bustimings[2] << NDS.ARM9ClockShift; + MemTimings[i][0] = ((bustimings[2] - 1) << NDS.ARM9ClockShift) + 1; } if (pu & 0x10) @@ -316,9 +319,9 @@ void ARMv5::UpdateRegionTimings(u32 addrstart, u32 addrend) } else { - MemTimings[i][1] = bustimings[0] << NDS.ARM9ClockShift; - MemTimings[i][2] = bustimings[2] << NDS.ARM9ClockShift; - MemTimings[i][3] = bustimings[3] << NDS.ARM9ClockShift; + MemTimings[i][1] = ((bustimings[0] - 1) << NDS.ARM9ClockShift) + 1; + MemTimings[i][2] = ((bustimings[2] - 1) << NDS.ARM9ClockShift) + 1; + MemTimings[i][3] = ((bustimings[3] - 1) << NDS.ARM9ClockShift) + 1; } } } @@ -784,10 +787,11 @@ u32 ARMv5::CodeRead32(u32 addr, bool branch) if (addr < ITCMSize) { CodeCycles = 1; + if ((DataRegion == Mem9_ITCM) && (TimestampActual >= NDS.ARM9Timestamp)) NDS.ARM9Timestamp = TimestampActual + 1; return *(u32*)&ITCM[addr & (ITCMPhysicalSize - 1)]; } - CodeCycles = RegionCodeCycles; + CodeCycles = MemTimings[addr >> 12][0]; if (CodeCycles == 0xFF) // cached memory. 
hax { if (branch || !(addr & 0x1F)) @@ -798,7 +802,21 @@ u32 ARMv5::CodeRead32(u32 addr, bool branch) //return *(u32*)&CurICacheLine[addr & 0x1C]; } - if (CodeMem.Mem) return *(u32*)&CodeMem.Mem[addr & CodeMem.Mask]; + NDS.ARM9Timestamp = NDS.ARM9Timestamp + ((1<> 24) == 0x02) + { + if (NDS.ARM9Timestamp < MainRAMTimestamp) NDS.ARM9Timestamp = MainRAMTimestamp; + + if (DataRegion == Mem9_MainRAM) NDS.ARM9Timestamp += CodeCycles; + } + + if (CodeRegion == DataRegion && Store) NDS.ARM9Timestamp += (1<>12] & 0x01)) [[unlikely]] { DataAbort(); @@ -815,15 +834,26 @@ bool ARMv5::DataRead8(u32 addr, u32* val) if (addr < ITCMSize) { DataCycles = 1; + DataRegion = Mem9_ITCM; *val = *(u8*)&ITCM[addr & (ITCMPhysicalSize - 1)]; return true; } if ((addr & DTCMMask) == DTCMBase) { DataCycles = 1; + DataRegion = Mem9_DTCM; *val = *(u8*)&DTCM[addr & (DTCMPhysicalSize - 1)]; return true; } + + NDS.ARM9Timestamp = NDS.ARM9Timestamp + ((1<> 24) == 0x02) + { + if (NDS.ARM9Timestamp < MainRAMTimestamp) NDS.ARM9Timestamp = MainRAMTimestamp; + DataRegion = Mem9_MainRAM; + } + else DataRegion = NDS.ARM9Regions[addr>>14]; *val = BusRead8(addr); DataCycles = MemTimings[addr >> 12][1]; @@ -832,6 +862,7 @@ bool ARMv5::DataRead8(u32 addr, u32* val) bool ARMv5::DataRead16(u32 addr, u32* val) { + Store = false; if (!(PU_Map[addr>>12] & 0x01)) [[unlikely]] { DataAbort(); @@ -843,15 +874,26 @@ bool ARMv5::DataRead16(u32 addr, u32* val) if (addr < ITCMSize) { DataCycles = 1; + DataRegion = Mem9_ITCM; *val = *(u16*)&ITCM[addr & (ITCMPhysicalSize - 1)]; return true; } if ((addr & DTCMMask) == DTCMBase) { DataCycles = 1; + DataRegion = Mem9_DTCM; *val = *(u16*)&DTCM[addr & (DTCMPhysicalSize - 1)]; return true; } + + NDS.ARM9Timestamp = NDS.ARM9Timestamp + ((1<> 24) == 0x02) + { + if (NDS.ARM9Timestamp < MainRAMTimestamp) NDS.ARM9Timestamp = MainRAMTimestamp; + DataRegion = Mem9_MainRAM; + } + else DataRegion = NDS.ARM9Regions[addr>>14]; *val = BusRead16(addr); DataCycles = MemTimings[addr >> 12][1]; 
@@ -860,6 +902,7 @@ bool ARMv5::DataRead16(u32 addr, u32* val) bool ARMv5::DataRead32(u32 addr, u32* val) { + Store = false; if (!(PU_Map[addr>>12] & 0x01)) [[unlikely]] { DataAbort(); @@ -871,16 +914,27 @@ bool ARMv5::DataRead32(u32 addr, u32* val) if (addr < ITCMSize) { DataCycles = 1; + DataRegion = Mem9_ITCM; *val = *(u32*)&ITCM[addr & (ITCMPhysicalSize - 1)]; return true; } if ((addr & DTCMMask) == DTCMBase) { DataCycles = 1; + DataRegion = Mem9_DTCM; *val = *(u32*)&DTCM[addr & (DTCMPhysicalSize - 1)]; return true; } + NDS.ARM9Timestamp = NDS.ARM9Timestamp + ((1<> 24) == 0x02) + { + if (NDS.ARM9Timestamp < MainRAMTimestamp) NDS.ARM9Timestamp = MainRAMTimestamp; + DataRegion = Mem9_MainRAM; + } + else DataRegion = NDS.ARM9Regions[addr>>14]; + *val = BusRead32(addr); DataCycles = MemTimings[addr >> 12][2]; return true; @@ -899,23 +953,36 @@ bool ARMv5::DataRead32S(u32 addr, u32* val) if (addr < ITCMSize) { DataCycles += 1; + DataRegion = Mem9_ITCM; *val = *(u32*)&ITCM[addr & (ITCMPhysicalSize - 1)]; return true; } if ((addr & DTCMMask) == DTCMBase) { DataCycles += 1; + DataRegion = Mem9_DTCM; *val = *(u32*)&DTCM[addr & (DTCMPhysicalSize - 1)]; return true; } + NDS.ARM9Timestamp = NDS.ARM9Timestamp + ((1<> 24) == 0x02) + { + if (NDS.ARM9Timestamp < MainRAMTimestamp) NDS.ARM9Timestamp = MainRAMTimestamp; + DataRegion = Mem9_MainRAM; + } + else DataRegion = NDS.ARM9Regions[addr>>14]; + *val = BusRead32(addr); - DataCycles += MemTimings[addr >> 12][3]; + NDS.ARM9Timestamp += DataCycles; + DataCycles = MemTimings[addr >> 12][3]; return true; } bool ARMv5::DataWrite8(u32 addr, u8 val) { + Store = true; if (!(PU_Map[addr>>12] & 0x02)) [[unlikely]] { DataAbort(); @@ -925,6 +992,7 @@ bool ARMv5::DataWrite8(u32 addr, u8 val) if (addr < ITCMSize) { DataCycles = 1; + DataRegion = Mem9_ITCM; *(u8*)&ITCM[addr & (ITCMPhysicalSize - 1)] = val; NDS.JIT.CheckAndInvalidate<0, ARMJIT_Memory::memregion_ITCM>(addr); return true; @@ -932,10 +1000,20 @@ bool ARMv5::DataWrite8(u32 addr, 
u8 val) if ((addr & DTCMMask) == DTCMBase) { DataCycles = 1; + DataRegion = Mem9_DTCM; *(u8*)&DTCM[addr & (DTCMPhysicalSize - 1)] = val; return true; } + NDS.ARM9Timestamp = NDS.ARM9Timestamp + ((1<> 24) == 0x02) + { + if (NDS.ARM9Timestamp < MainRAMTimestamp) NDS.ARM9Timestamp = MainRAMTimestamp; + DataRegion = Mem9_MainRAM; + } + else DataRegion = NDS.ARM9Regions[addr>>14]; + BusWrite8(addr, val); DataCycles = MemTimings[addr >> 12][1]; return true; @@ -943,6 +1021,7 @@ bool ARMv5::DataWrite8(u32 addr, u8 val) bool ARMv5::DataWrite16(u32 addr, u16 val) { + Store = true; if (!(PU_Map[addr>>12] & 0x02)) [[unlikely]] { DataAbort(); @@ -954,6 +1033,7 @@ bool ARMv5::DataWrite16(u32 addr, u16 val) if (addr < ITCMSize) { DataCycles = 1; + DataRegion = Mem9_ITCM; *(u16*)&ITCM[addr & (ITCMPhysicalSize - 1)] = val; NDS.JIT.CheckAndInvalidate<0, ARMJIT_Memory::memregion_ITCM>(addr); return true; @@ -961,10 +1041,20 @@ bool ARMv5::DataWrite16(u32 addr, u16 val) if ((addr & DTCMMask) == DTCMBase) { DataCycles = 1; + DataRegion = Mem9_DTCM; *(u16*)&DTCM[addr & (DTCMPhysicalSize - 1)] = val; return true; } + NDS.ARM9Timestamp = NDS.ARM9Timestamp + ((1<> 24) == 0x02) + { + if (NDS.ARM9Timestamp < MainRAMTimestamp) NDS.ARM9Timestamp = MainRAMTimestamp; + DataRegion = Mem9_MainRAM; + } + else DataRegion = NDS.ARM9Regions[addr>>14]; + BusWrite16(addr, val); DataCycles = MemTimings[addr >> 12][1]; return true; @@ -972,6 +1062,7 @@ bool ARMv5::DataWrite16(u32 addr, u16 val) bool ARMv5::DataWrite32(u32 addr, u32 val) { + Store = true; if (!(PU_Map[addr>>12] & 0x02)) [[unlikely]] { DataAbort(); @@ -983,6 +1074,7 @@ bool ARMv5::DataWrite32(u32 addr, u32 val) if (addr < ITCMSize) { DataCycles = 1; + DataRegion = Mem9_ITCM; *(u32*)&ITCM[addr & (ITCMPhysicalSize - 1)] = val; NDS.JIT.CheckAndInvalidate<0, ARMJIT_Memory::memregion_ITCM>(addr); return true; @@ -990,10 +1082,20 @@ bool ARMv5::DataWrite32(u32 addr, u32 val) if ((addr & DTCMMask) == DTCMBase) { DataCycles = 1; + DataRegion = 
Mem9_DTCM; *(u32*)&DTCM[addr & (DTCMPhysicalSize - 1)] = val; return true; } + NDS.ARM9Timestamp = NDS.ARM9Timestamp + ((1<> 24) == 0x02) + { + if (NDS.ARM9Timestamp < MainRAMTimestamp) NDS.ARM9Timestamp = MainRAMTimestamp; + DataRegion = Mem9_MainRAM; + } + else DataRegion = NDS.ARM9Regions[addr>>14]; + BusWrite32(addr, val); DataCycles = MemTimings[addr >> 12][2]; return true; @@ -1012,6 +1114,7 @@ bool ARMv5::DataWrite32S(u32 addr, u32 val, bool dataabort) if (addr < ITCMSize) { DataCycles += 1; + DataRegion = Mem9_ITCM; *(u32*)&ITCM[addr & (ITCMPhysicalSize - 1)] = val; #ifdef JIT_ENABLED NDS.JIT.CheckAndInvalidate<0, ARMJIT_Memory::memregion_ITCM>(addr); @@ -1021,10 +1124,20 @@ bool ARMv5::DataWrite32S(u32 addr, u32 val, bool dataabort) if ((addr & DTCMMask) == DTCMBase) { DataCycles += 1; + DataRegion = Mem9_DTCM; *(u32*)&DTCM[addr & (DTCMPhysicalSize - 1)] = val; return true; } + NDS.ARM9Timestamp = NDS.ARM9Timestamp + ((1<> 24) == 0x02) + { + if (NDS.ARM9Timestamp < MainRAMTimestamp) NDS.ARM9Timestamp = MainRAMTimestamp; + DataRegion = Mem9_MainRAM; + } + else DataRegion = NDS.ARM9Regions[addr>>14]; + BusWrite32(addr, val); DataCycles += MemTimings[addr >> 12][3]; return true; From ceb5a9febee5ab552d9e60dfce8b82c1f24dabf4 Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Fri, 6 Sep 2024 03:59:59 -0400 Subject: [PATCH 120/306] draw (most of) the rest of the owl --- src/ARM.cpp | 13 ++++---- src/ARM.h | 13 ++++---- src/ARMInterpreter.cpp | 14 ++++++-- src/ARMInterpreter_ALU.cpp | 34 +++++++++++++++----- src/CP15.cpp | 65 +++++++++++++++++++++++++++++--------- src/NDS.h | 1 + 6 files changed, 102 insertions(+), 38 deletions(-) diff --git a/src/ARM.cpp b/src/ARM.cpp index 105607f5..8bac58a2 100644 --- a/src/ARM.cpp +++ b/src/ARM.cpp @@ -1157,17 +1157,18 @@ u32 ARMv5::ReadMem(u32 addr, int size) #endif -void ARMv5::AddCycles_CI(s32 numI) +void ARMv5::AddCycles_CI(s32 numX) { - NDS.ARM9Timestamp += numI; + NDS.ARM9Timestamp 
+= numX; } -void ARMv5::AddCycles_MW() +void ARMv5::AddCycles_MW(s32 numM) { - u64 TimestampActual = DataCycles + NDS.ARM9Timestamp; - s32 cycles = DataCycles - (3< 0) NDS.ARM9Timestamp += cycles; + numM -= 3< 0) NDS.ARM9Timestamp += numM; } u16 ARMv4::CodeRead16(u32 addr) diff --git a/src/ARM.h b/src/ARM.h index 31ff56cc..6fa74e22 100644 --- a/src/ARM.h +++ b/src/ARM.h @@ -284,22 +284,20 @@ public: } - void AddCycles_C() override - { - } + void AddCycles_C() override {} - void AddCycles_CI(s32 numI) override; + void AddCycles_CI(s32 numX) override; - void AddCycles_MW(); + void AddCycles_MW(s32 numM); void AddCycles_CDI() override { - AddCycles_MW(); + AddCycles_MW(DataCycles); } void AddCycles_CD() override { - AddCycles_MW(); + AddCycles_MW(DataCycles); } void GetCodeMemRegion(u32 addr, MemRegion* region); @@ -365,6 +363,7 @@ public: bool (*GetMemRegion)(u32 addr, bool write, MemRegion* region); + u64 ITCMTimestamp; u64 TimestampActual; u8 InterlockMem; u8 InterlockWBCur; diff --git a/src/ARMInterpreter.cpp b/src/ARMInterpreter.cpp index 15ec42db..108d343e 100644 --- a/src/ARMInterpreter.cpp +++ b/src/ARMInterpreter.cpp @@ -203,7 +203,12 @@ void A_MRS(ARM* cpu) cpu->R[(cpu->CurInstr>>12) & 0xF] = psr; - if (cpu->Num != 1) cpu->AddCycles_CI(1); // arm9 + if (cpu->Num != 1) // arm9 + { + cpu->AddCycles_C(); // 1 X + cpu->DataRegion = Mem9_Null; + ((ARMv5*)cpu)->AddCycles_MW(2); // 2 M + } else cpu->AddCycles_C(); // arm7 } @@ -263,7 +268,12 @@ void A_MRC(ARM* cpu) return A_UNK(cpu); // TODO: check what kind of exception it really is } - if (cpu->Num != 1) cpu->AddCycles_CI(1); // checkme + if (cpu->Num != 1) + { + cpu->AddCycles_C(); // 1 Execute cycle + cpu->DataRegion = Mem9_Null; + ((ARMv5*)cpu)->AddCycles_MW(2); // 2 Memory cycles + } else cpu->AddCycles_CI(2 + 1); // TODO: checkme } diff --git a/src/ARMInterpreter_ALU.cpp b/src/ARMInterpreter_ALU.cpp index 37c79904..2e9c3078 100644 --- a/src/ARMInterpreter_ALU.cpp +++ b/src/ARMInterpreter_ALU.cpp @@ -774,18 
+774,26 @@ void A_MUL(ARM* cpu) if (cpu->Num==1) cpu->SetC(0); } - u32 cycles; if (cpu->Num == 0) - cycles = (cpu->CurInstr & (1<<20)) ? 3 : 1; + { + if (cpu->CurInstr & (1<<20)) cpu->AddCycles_CI(3); + else + { + cpu->AddCycles_C(); // 1 X + cpu->DataRegion = Mem9_Null; + ((ARMv5*)cpu)->AddCycles_MW(2); // 2 M + } + } else { + u32 cycles; if ((rs & 0xFFFFFF00) == 0x00000000 || (rs & 0xFFFFFF00) == 0xFFFFFF00) cycles = 1; else if ((rs & 0xFFFF0000) == 0x00000000 || (rs & 0xFFFF0000) == 0xFFFF0000) cycles = 2; else if ((rs & 0xFF000000) == 0x00000000 || (rs & 0xFF000000) == 0xFF000000) cycles = 3; else cycles = 4; + cpu->AddCycles_CI(cycles); } - cpu->AddCycles_CI(cycles); } void A_MLA(ARM* cpu) @@ -804,18 +812,26 @@ void A_MLA(ARM* cpu) if (cpu->Num==1) cpu->SetC(0); } - u32 cycles; if (cpu->Num == 0) - cycles = (cpu->CurInstr & (1<<20)) ? 3 : 1; + { + if (cpu->CurInstr & (1<<20)) cpu->AddCycles_CI(3); + else + { + cpu->AddCycles_C(); // 1 X + cpu->DataRegion = Mem9_Null; + ((ARMv5*)cpu)->AddCycles_MW(2); // 2 M + } + } else { + u32 cycles; if ((rs & 0xFFFFFF00) == 0x00000000 || (rs & 0xFFFFFF00) == 0xFFFFFF00) cycles = 2; else if ((rs & 0xFFFF0000) == 0x00000000 || (rs & 0xFFFF0000) == 0xFFFF0000) cycles = 3; else if ((rs & 0xFF000000) == 0x00000000 || (rs & 0xFF000000) == 0xFF000000) cycles = 4; else cycles = 5; + cpu->AddCycles_CI(cycles); } - cpu->AddCycles_CI(cycles); } void A_UMULL(ARM* cpu) @@ -1041,8 +1057,10 @@ void A_SMLALxy(ARM* cpu) cpu->R[(cpu->CurInstr >> 12) & 0xF] = (u32)res; cpu->R[(cpu->CurInstr >> 16) & 0xF] = (u32)(res >> 32ULL); - - cpu->AddCycles_CI(1); // TODO: interlock?? 
+ + cpu->AddCycles_C(); // 1 X + cpu->DataRegion = Mem9_Null; + ((ARMv5*)cpu)->AddCycles_MW(2); // 2 M } diff --git a/src/CP15.cpp b/src/CP15.cpp index eb84d3ee..bcbfbb24 100644 --- a/src/CP15.cpp +++ b/src/CP15.cpp @@ -321,7 +321,7 @@ void ARMv5::UpdateRegionTimings(u32 addrstart, u32 addrend) { MemTimings[i][1] = ((bustimings[0] - 1) << NDS.ARM9ClockShift) + 1; MemTimings[i][2] = ((bustimings[2] - 1) << NDS.ARM9ClockShift) + 1; - MemTimings[i][3] = ((bustimings[3] - 1) << NDS.ARM9ClockShift) + 1; + MemTimings[i][3] = bustimings[3] << NDS.ARM9ClockShift; // inaccurate but ehgh } } } @@ -781,13 +781,21 @@ u32 ARMv5::CodeRead32(u32 addr, bool branch) if (!(PU_Map[addr>>12] & 0x04)) [[unlikely]] { CodeCycles = 1; + + NDS.ARM9Timestamp += CodeCycles; + if (NDS.ARM9Timestamp < TimestampActual) NDS.ARM9Timestamp = TimestampActual; + DataRegion = Mem9_Null; return 0; } if (addr < ITCMSize) { CodeCycles = 1; - if ((DataRegion == Mem9_ITCM) && (TimestampActual >= NDS.ARM9Timestamp)) NDS.ARM9Timestamp = TimestampActual + 1; + + if (NDS.ARM9Timestamp < ITCMTimestamp) NDS.ARM9Timestamp = ITCMTimestamp; + NDS.ARM9Timestamp += CodeCycles; + if (NDS.ARM9Timestamp < TimestampActual) NDS.ARM9Timestamp = TimestampActual; + DataRegion = Mem9_Null; return *(u32*)&ITCM[addr & (ITCMPhysicalSize - 1)]; } @@ -811,13 +819,13 @@ u32 ARMv5::CodeRead32(u32 addr, bool branch) if (DataRegion == Mem9_MainRAM) NDS.ARM9Timestamp += CodeCycles; } - if (CodeRegion == DataRegion && Store) NDS.ARM9Timestamp += (1<>14] == DataRegion && Store) NDS.ARM9Timestamp += (1<> 12][1]; if ((addr >> 24) == 0x02) { if (NDS.ARM9Timestamp < MainRAMTimestamp) NDS.ARM9Timestamp = MainRAMTimestamp; + MainRAMTimestamp = NDS.ARM9Timestamp + DataCycles; DataRegion = Mem9_MainRAM; } else DataRegion = NDS.ARM9Regions[addr>>14]; *val = BusRead8(addr); - DataCycles = MemTimings[addr >> 12][1]; return true; } @@ -874,6 +885,7 @@ bool ARMv5::DataRead16(u32 addr, u32* val) if (addr < ITCMSize) { DataCycles = 1; + ITCMTimestamp 
= NDS.ARM9Timestamp + DataCycles; DataRegion = Mem9_ITCM; *val = *(u16*)&ITCM[addr & (ITCMPhysicalSize - 1)]; return true; @@ -887,16 +899,18 @@ bool ARMv5::DataRead16(u32 addr, u32* val) } NDS.ARM9Timestamp = NDS.ARM9Timestamp + ((1<> 12][1]; if ((addr >> 24) == 0x02) { if (NDS.ARM9Timestamp < MainRAMTimestamp) NDS.ARM9Timestamp = MainRAMTimestamp; + MainRAMTimestamp = NDS.ARM9Timestamp + DataCycles; DataRegion = Mem9_MainRAM; } else DataRegion = NDS.ARM9Regions[addr>>14]; *val = BusRead16(addr); - DataCycles = MemTimings[addr >> 12][1]; return true; } @@ -914,6 +928,7 @@ bool ARMv5::DataRead32(u32 addr, u32* val) if (addr < ITCMSize) { DataCycles = 1; + ITCMTimestamp = NDS.ARM9Timestamp + DataCycles; DataRegion = Mem9_ITCM; *val = *(u32*)&ITCM[addr & (ITCMPhysicalSize - 1)]; return true; @@ -927,16 +942,18 @@ bool ARMv5::DataRead32(u32 addr, u32* val) } NDS.ARM9Timestamp = NDS.ARM9Timestamp + ((1<> 12][2]; if ((addr >> 24) == 0x02) { if (NDS.ARM9Timestamp < MainRAMTimestamp) NDS.ARM9Timestamp = MainRAMTimestamp; + MainRAMTimestamp = NDS.ARM9Timestamp + DataCycles; DataRegion = Mem9_MainRAM; } else DataRegion = NDS.ARM9Regions[addr>>14]; *val = BusRead32(addr); - DataCycles = MemTimings[addr >> 12][2]; return true; } @@ -953,6 +970,7 @@ bool ARMv5::DataRead32S(u32 addr, u32* val) if (addr < ITCMSize) { DataCycles += 1; + ITCMTimestamp = NDS.ARM9Timestamp + DataCycles; DataRegion = Mem9_ITCM; *val = *(u32*)&ITCM[addr & (ITCMPhysicalSize - 1)]; return true; @@ -964,19 +982,21 @@ bool ARMv5::DataRead32S(u32 addr, u32* val) *val = *(u32*)&DTCM[addr & (DTCMPhysicalSize - 1)]; return true; } + + NDS.ARM9Timestamp += DataCycles; + DataCycles = MemTimings[addr >> 12][3]; NDS.ARM9Timestamp = NDS.ARM9Timestamp + ((1<> 24) == 0x02) { if (NDS.ARM9Timestamp < MainRAMTimestamp) NDS.ARM9Timestamp = MainRAMTimestamp; + MainRAMTimestamp = NDS.ARM9Timestamp + DataCycles; DataRegion = Mem9_MainRAM; } else DataRegion = NDS.ARM9Regions[addr>>14]; *val = BusRead32(addr); - 
NDS.ARM9Timestamp += DataCycles; - DataCycles = MemTimings[addr >> 12][3]; return true; } @@ -992,6 +1012,7 @@ bool ARMv5::DataWrite8(u32 addr, u8 val) if (addr < ITCMSize) { DataCycles = 1; + ITCMTimestamp = NDS.ARM9Timestamp + DataCycles; DataRegion = Mem9_ITCM; *(u8*)&ITCM[addr & (ITCMPhysicalSize - 1)] = val; NDS.JIT.CheckAndInvalidate<0, ARMJIT_Memory::memregion_ITCM>(addr); @@ -1006,16 +1027,19 @@ bool ARMv5::DataWrite8(u32 addr, u8 val) } NDS.ARM9Timestamp = NDS.ARM9Timestamp + ((1<> 12][1]; if ((addr >> 24) == 0x02) { if (NDS.ARM9Timestamp < MainRAMTimestamp) NDS.ARM9Timestamp = MainRAMTimestamp; DataRegion = Mem9_MainRAM; + MainRAMTimestamp = NDS.ARM9Timestamp + DataCycles; + DataCycles -= (2<>14]; BusWrite8(addr, val); - DataCycles = MemTimings[addr >> 12][1]; return true; } @@ -1033,6 +1057,7 @@ bool ARMv5::DataWrite16(u32 addr, u16 val) if (addr < ITCMSize) { DataCycles = 1; + ITCMTimestamp = NDS.ARM9Timestamp + DataCycles; DataRegion = Mem9_ITCM; *(u16*)&ITCM[addr & (ITCMPhysicalSize - 1)] = val; NDS.JIT.CheckAndInvalidate<0, ARMJIT_Memory::memregion_ITCM>(addr); @@ -1047,16 +1072,19 @@ bool ARMv5::DataWrite16(u32 addr, u16 val) } NDS.ARM9Timestamp = NDS.ARM9Timestamp + ((1<> 12][1]; if ((addr >> 24) == 0x02) { if (NDS.ARM9Timestamp < MainRAMTimestamp) NDS.ARM9Timestamp = MainRAMTimestamp; DataRegion = Mem9_MainRAM; + MainRAMTimestamp = NDS.ARM9Timestamp + DataCycles; + DataCycles -= (2<>14]; BusWrite16(addr, val); - DataCycles = MemTimings[addr >> 12][1]; return true; } @@ -1074,6 +1102,7 @@ bool ARMv5::DataWrite32(u32 addr, u32 val) if (addr < ITCMSize) { DataCycles = 1; + ITCMTimestamp = NDS.ARM9Timestamp + DataCycles; DataRegion = Mem9_ITCM; *(u32*)&ITCM[addr & (ITCMPhysicalSize - 1)] = val; NDS.JIT.CheckAndInvalidate<0, ARMJIT_Memory::memregion_ITCM>(addr); @@ -1088,16 +1117,19 @@ bool ARMv5::DataWrite32(u32 addr, u32 val) } NDS.ARM9Timestamp = NDS.ARM9Timestamp + ((1<> 12][2]; if ((addr >> 24) == 0x02) { if (NDS.ARM9Timestamp < MainRAMTimestamp) 
NDS.ARM9Timestamp = MainRAMTimestamp; DataRegion = Mem9_MainRAM; + MainRAMTimestamp = NDS.ARM9Timestamp + DataCycles; + DataCycles -= (2<>14]; BusWrite32(addr, val); - DataCycles = MemTimings[addr >> 12][2]; return true; } @@ -1114,6 +1146,7 @@ bool ARMv5::DataWrite32S(u32 addr, u32 val, bool dataabort) if (addr < ITCMSize) { DataCycles += 1; + ITCMTimestamp = NDS.ARM9Timestamp + DataCycles; DataRegion = Mem9_ITCM; *(u32*)&ITCM[addr & (ITCMPhysicalSize - 1)] = val; #ifdef JIT_ENABLED @@ -1130,14 +1163,16 @@ bool ARMv5::DataWrite32S(u32 addr, u32 val, bool dataabort) } NDS.ARM9Timestamp = NDS.ARM9Timestamp + ((1<> 24) == 0x02) { - if (NDS.ARM9Timestamp < MainRAMTimestamp) NDS.ARM9Timestamp = MainRAMTimestamp; + if ((DataRegion != Mem9_MainRAM) && ((NDS.ARM9Timestamp + DataCycles) < MainRAMTimestamp)) NDS.ARM9Timestamp = MainRAMTimestamp - DataCycles; + MainRAMTimestamp = NDS.ARM9Timestamp + DataCycles; DataRegion = Mem9_MainRAM; } else DataRegion = NDS.ARM9Regions[addr>>14]; - + BusWrite32(addr, val); DataCycles += MemTimings[addr >> 12][3]; return true; diff --git a/src/NDS.h b/src/NDS.h index b2bfb385..6ee64ac8 100644 --- a/src/NDS.h +++ b/src/NDS.h @@ -192,6 +192,7 @@ enum Mem9_VRAM = 0x00000100, Mem9_GBAROM = 0x00020000, Mem9_GBARAM = 0x00040000, + Mem9_Null = 0x80000000, Mem7_BIOS = 0x00000001, Mem7_MainRAM = 0x00000002, From 41db7b9df6337e8860a8f5bbc00c25a1ed0b7b30 Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Sat, 7 Sep 2024 13:45:33 -0400 Subject: [PATCH 121/306] fix main ram adding cycles twice if code and memory region are the same --- src/CP15.cpp | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/src/CP15.cpp b/src/CP15.cpp index bcbfbb24..db1c67ea 100644 --- a/src/CP15.cpp +++ b/src/CP15.cpp @@ -814,12 +814,9 @@ u32 ARMv5::CodeRead32(u32 addr, bool branch) if ((addr >> 24) == 0x02) { - if (NDS.ARM9Timestamp < MainRAMTimestamp) NDS.ARM9Timestamp = MainRAMTimestamp; - - if (DataRegion == 
Mem9_MainRAM) NDS.ARM9Timestamp += CodeCycles; + if (NDS.ARM9Timestamp < MainRAMTimestamp) NDS.ARM9Timestamp = MainRAMTimestamp + ((1<>14] == DataRegion && Store) NDS.ARM9Timestamp += (1<>14] == DataRegion && Store) NDS.ARM9Timestamp += (1< Date: Sun, 8 Sep 2024 11:10:31 -0400 Subject: [PATCH 122/306] fix thumb "no fetches" --- src/ARM.cpp | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/src/ARM.cpp b/src/ARM.cpp index 8bac58a2..f398f03e 100644 --- a/src/ARM.cpp +++ b/src/ARM.cpp @@ -689,8 +689,17 @@ void ARMv5::Execute() R[15] += 2; CurInstr = NextInstr[0]; NextInstr[0] = NextInstr[1]; - if (R[15] & 0x2) { NextInstr[1] >>= 16; CodeCycles = 0; } - else NextInstr[1] = CodeRead32(R[15], false); + if (R[15] & 0x2) + { + // no fetch is performed. + // unclear if it's a "1 cycle fetch" or a legitmately 0 cycle fetch stage? + // in practice it doesn't matter though. + NextInstr[1] >>= 16; + NDS.ARM9Timestamp++; + if (NDS.ARM9Timestamp < TimestampActual) NDS.ARM9Timestamp = TimestampActual; + DataRegion = Mem9_Null; + } + else NextInstr[1] = CodeRead32(R[15], false); if (IRQ && !(CPSR & 0x80)) TriggerIRQ(); From cacaf0ec7c3a3c3ed2b94d8b55a6c5a2a21b7bc6 Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Sun, 8 Sep 2024 12:05:10 -0400 Subject: [PATCH 123/306] make it work --- src/CP15.cpp | 65 +++++++++++++++++++++++++++++++++++++++------------- src/NDS.h | 1 + 2 files changed, 50 insertions(+), 16 deletions(-) diff --git a/src/CP15.cpp b/src/CP15.cpp index c2524bdb..b7f984a3 100644 --- a/src/CP15.cpp +++ b/src/CP15.cpp @@ -379,7 +379,6 @@ u32 ARMv5::ICacheLookup(const u32 addr) { if ((ICacheTags[id+set] & ~(CACHE_FLAG_DIRTY_MASK | CACHE_FLAG_SET_MASK)) == (tag | CACHE_FLAG_VALID)) { - CodeCycles = 1; u32 *cacheLine = (u32 *)&ICache[(id+set) << ICACHE_LINELENGTH_LOG2]; if (CP15BISTTestStateRegister & CP15_BIST_TR_DISABLE_ICACHE_STREAMING) [[unlikely]] { @@ -395,6 +394,9 @@ u32 ARMv5::ICacheLookup(const u32 
addr) return NDS.ARM9Read32(addr & ~3); } } + NDS.ARM9Timestamp += 1; + if (NDS.ARM9Timestamp < TimestampActual) NDS.ARM9Timestamp = TimestampActual; + DataRegion = Mem9_Null; return cacheLine[(addr & (ICACHE_LINELENGTH -1)) >> 2]; } } @@ -460,7 +462,20 @@ u32 ARMv5::ICacheLookup(const u32 addr) // ouch :/ //printf("cache miss %08X: %d/%d\n", addr, NDS::ARM9MemTimings[addr >> 14][2], NDS::ARM9MemTimings[addr >> 14][3]); // first N32 remaining S32 - CodeCycles = (NDS.ARM9MemTimings[tag >> 14][2] + (NDS.ARM9MemTimings[tag >> 14][3] * ((DCACHE_LINELENGTH / 4) - 1))) << NDS.ARM9ClockShift; + + NDS.ARM9Timestamp = NDS.ARM9Timestamp + ((1<> 24) == 0x02) + { + if (NDS.ARM9Timestamp < MainRAMTimestamp) NDS.ARM9Timestamp = MainRAMTimestamp + ((1<>14] == DataRegion && Store) NDS.ARM9Timestamp += (1<> 14][2] + (NDS.ARM9MemTimings[tag >> 14][3] * ((DCACHE_LINELENGTH / 4) - 1)) - 1) << NDS.ARM9ClockShift) + 1; + DataRegion = Mem9_Null; return ptr[(addr & (ICACHE_LINELENGTH-1)) >> 2]; } @@ -513,7 +528,6 @@ u32 ARMv5::DCacheLookup(const u32 addr) { if ((DCacheTags[id+set] & ~(CACHE_FLAG_DIRTY_MASK | CACHE_FLAG_SET_MASK)) == (tag | CACHE_FLAG_VALID)) { - DataCycles = 1; u32 *cacheLine = (u32 *)&DCache[(id+set) << DCACHE_LINELENGTH_LOG2]; if (CP15BISTTestStateRegister & CP15_BIST_TR_DISABLE_DCACHE_STREAMING) [[unlikely]] { @@ -533,6 +547,8 @@ u32 ARMv5::DCacheLookup(const u32 addr) return BusRead32(addr & ~3); } } + DataCycles += 1; + DataRegion = Mem9_DCache; //Log(LogLevel::Debug, "DCache hit at %08lx returned %08x from set %i, line %i\n", addr, cacheLine[(addr & (ICACHE_LINELENGTH -1)) >> 2], set, id>>2); return cacheLine[(addr & (ICACHE_LINELENGTH -1)) >> 2]; } @@ -615,7 +631,20 @@ u32 ARMv5::DCacheLookup(const u32 addr) // ouch :/ //printf("cache miss %08X: %d/%d\n", addr, NDS::ARM9MemTimings[addr >> 14][2], NDS::ARM9MemTimings[addr >> 14][3]); // first N32 remaining S32 - DataCycles += (NDS.ARM9MemTimings[tag >> 14][2] + (NDS.ARM9MemTimings[tag >> 14][3] * ((DCACHE_LINELENGTH 
/ 4) - 1))) << NDS.ARM9ClockShift; + NDS.ARM9Timestamp = NDS.ARM9Timestamp + ((1<> 12][2]; + + if ((addr >> 24) == 0x02) + { + if (NDS.ARM9Timestamp < MainRAMTimestamp) NDS.ARM9Timestamp = MainRAMTimestamp; + MainRAMTimestamp = NDS.ARM9Timestamp + DataCycles; + DataRegion = Mem9_MainRAM; + } + else DataRegion = NDS.ARM9Regions[addr>>14]; + + NDS.ARM9Timestamp += ((NDS.ARM9MemTimings[tag >> 14][2] + (NDS.ARM9MemTimings[tag >> 14][3] * ((DCACHE_LINELENGTH / 4) - 2)) - 1) << NDS.ARM9ClockShift) + 1; + DataCycles = NDS.ARM9MemTimings[tag>>14][3] << NDS.ARM9ClockShift; return ptr[(addr & (DCACHE_LINELENGTH-1)) >> 2]; } @@ -632,7 +661,8 @@ bool ARMv5::DCacheWrite32(const u32 addr, const u32 val) { u32 *cacheLine = (u32 *)&DCache[(id+set) << DCACHE_LINELENGTH_LOG2]; cacheLine[(addr & (DCACHE_LINELENGTH-1)) >> 2] = val; - DataCycles = 1; + DataCycles += 1; + DataRegion = Mem9_DCache; #if !DISABLE_CACHEWRITEBACK if (PU_Map[addr >> CP15_MAP_ENTRYSIZE_LOG2] & CP15_MAP_DCACHEWRITEBACK) { @@ -667,6 +697,7 @@ bool ARMv5::DCacheWrite16(const u32 addr, const u16 val) u16 *cacheLine = (u16 *)&DCache[(id+set) << DCACHE_LINELENGTH_LOG2]; cacheLine[(addr & (DCACHE_LINELENGTH-1)) >> 1] = val; DataCycles = 1; + DataRegion = Mem9_DCache; #if !DISABLE_CACHEWRITEBACK if (PU_Map[addr >> CP15_MAP_ENTRYSIZE_LOG2] & CP15_MAP_DCACHEWRITEBACK) { @@ -702,6 +733,7 @@ bool ARMv5::DCacheWrite8(const u32 addr, const u8 val) u8 *cacheLine = &DCache[(id+set) << DCACHE_LINELENGTH_LOG2]; cacheLine[addr & (DCACHE_LINELENGTH-1)] = val; DataCycles = 1; + DataRegion = Mem9_DCache; #if !DISABLE_CACHEWRITEBACK if (PU_Map[addr >> CP15_MAP_ENTRYSIZE_LOG2] & CP15_MAP_DCACHEWRITEBACK) { @@ -1562,9 +1594,6 @@ u32 ARMv5::CP15Read(const u32 id) const // TCM are handled here. 
// TODO: later on, handle PU -u32 ARMv5::CodeRead32(const u32 addr, bool const branch) -{ - u32 ARMv5::CodeRead32(u32 addr, bool branch) { // prefetch abort @@ -1650,8 +1679,9 @@ bool ARMv5::DataRead8(u32 addr, u32* val) { if (IsAddressDCachable(addr)) { + DataCycles = 0; *val = (DCacheLookup(addr) >> (8 * (addr & 3))) & 0xff; - return; + return true; } } } @@ -1708,8 +1738,9 @@ bool ARMv5::DataRead16(u32 addr, u32* val) { if (IsAddressDCachable(addr)) { + DataCycles = 0; *val = (DCacheLookup(addr) >> (8* (addr & 2))) & 0xffff; - return; + return true; } } } @@ -1769,8 +1800,9 @@ bool ARMv5::DataRead32(u32 addr, u32* val) { if (IsAddressDCachable(addr)) { + DataCycles = 0; *val = DCacheLookup(addr); - return; + return true; } } } @@ -1828,7 +1860,7 @@ bool ARMv5::DataRead32S(u32 addr, u32* val) if (IsAddressDCachable(addr)) { *val = DCacheLookup(addr); - return; + return true; } } } @@ -1886,7 +1918,7 @@ bool ARMv5::DataWrite8(u32 addr, u8 val) if (IsAddressDCachable(addr)) { if (DCacheWrite8(addr, val)) - return; + return true; } } } @@ -1946,7 +1978,7 @@ bool ARMv5::DataWrite16(u32 addr, u16 val) if (IsAddressDCachable(addr)) { if (DCacheWrite16(addr, val)) - return; + return true; } } } @@ -2006,8 +2038,9 @@ bool ARMv5::DataWrite32(u32 addr, u32 val) { if (IsAddressDCachable(addr)) { + DataCycles = 0; if (DCacheWrite32(addr, val)) - return; + return true; } } } @@ -2067,7 +2100,7 @@ bool ARMv5::DataWrite32S(u32 addr, u32 val, bool dataabort) if (IsAddressDCachable(addr)) { if (DCacheWrite32(addr, val)) - return; + return true; } } } diff --git a/src/NDS.h b/src/NDS.h index 45313572..e23b1f27 100644 --- a/src/NDS.h +++ b/src/NDS.h @@ -192,6 +192,7 @@ enum Mem9_VRAM = 0x00000100, Mem9_GBAROM = 0x00020000, Mem9_GBARAM = 0x00040000, + Mem9_DCache = 0x40000000, Mem9_Null = 0x80000000, Mem7_BIOS = 0x00000001, From c5ac682f04814d7c92d5e0eb8c85e093578f2f4b Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Thu, 12 Sep 2024 12:41:49 
-0400 Subject: [PATCH 124/306] improve data abort handling further --- src/ARM.cpp | 2 +- src/ARM.h | 6 +- src/ARMInterpreter_LoadStore.cpp | 228 ++++++++++++++++++------------- src/CP15.cpp | 34 +++-- 4 files changed, 162 insertions(+), 108 deletions(-) diff --git a/src/ARM.cpp b/src/ARM.cpp index 7cfa3589..a7c6c11e 100644 --- a/src/ARM.cpp +++ b/src/ARM.cpp @@ -1217,7 +1217,7 @@ bool ARMv4::DataWrite32(u32 addr, u32 val) return true; } -bool ARMv4::DataWrite32S(u32 addr, u32 val, bool dataabort) +bool ARMv4::DataWrite32S(u32 addr, u32 val) { addr &= ~3; diff --git a/src/ARM.h b/src/ARM.h index 2603e646..26080b51 100644 --- a/src/ARM.h +++ b/src/ARM.h @@ -142,7 +142,7 @@ public: virtual bool DataWrite8(u32 addr, u8 val) = 0; virtual bool DataWrite16(u32 addr, u16 val) = 0; virtual bool DataWrite32(u32 addr, u32 val) = 0; - virtual bool DataWrite32S(u32 addr, u32 val, bool dataabort = false) = 0; + virtual bool DataWrite32S(u32 addr, u32 val) = 0; virtual void AddCycles_C() = 0; virtual void AddCycles_CI(s32 numI) = 0; @@ -262,7 +262,7 @@ public: bool DataWrite8(u32 addr, u8 val) override; bool DataWrite16(u32 addr, u16 val) override; bool DataWrite32(u32 addr, u32 val) override; - bool DataWrite32S(u32 addr, u32 val, bool dataabort = false) override; + bool DataWrite32S(u32 addr, u32 val) override; void AddCycles_C() override { @@ -410,7 +410,7 @@ public: bool DataWrite8(u32 addr, u8 val) override; bool DataWrite16(u32 addr, u16 val) override; bool DataWrite32(u32 addr, u32 val) override; - bool DataWrite32S(u32 addr, u32 val, bool dataabort = false) override; + bool DataWrite32S(u32 addr, u32 val) override; void AddCycles_C() override; void AddCycles_CI(s32 num) override; void AddCycles_CDI() override; diff --git a/src/ARMInterpreter_LoadStore.cpp b/src/ARMInterpreter_LoadStore.cpp index 659deaef..bbbe08fd 100644 --- a/src/ARMInterpreter_LoadStore.cpp +++ b/src/ARMInterpreter_LoadStore.cpp @@ -82,10 +82,10 @@ void LoadSingle(ARM* cpu, u8 rd, u8 rn, s32 offset) } 
u32 val; - bool dataabort; - if constexpr (size == 8) dataabort = !cpu->DataRead8 (addr, &val); - if constexpr (size == 16) dataabort = !cpu->DataRead16(addr, &val); - if constexpr (size == 32) dataabort = !cpu->DataRead32(addr, &val); + bool dabort; + if constexpr (size == 8) dabort = !cpu->DataRead8 (addr, &val); + if constexpr (size == 16) dabort = !cpu->DataRead16(addr, &val); + if constexpr (size == 32) dabort = !cpu->DataRead32(addr, &val); if constexpr (writeback == Writeback::Trans) { @@ -94,8 +94,11 @@ void LoadSingle(ARM* cpu, u8 rd, u8 rn, s32 offset) } cpu->AddCycles_CDI(); - if (dataabort) return; - + if (dabort) + { + ((ARMv5*)cpu)->DataAbort(); + return; + } if constexpr (size == 8 && signror) val = (s32)(s8)val; if constexpr (size == 16 && signror) val = (s32)(s16)val; if constexpr (size == 32 && signror) val = ROR(val, ((addr&0x3)<<3)); @@ -128,10 +131,10 @@ void StoreSingle(ARM* cpu, u8 rd, u8 rn, s32 offset) ((ARMv5*)cpu)->PU_Map = ((ARMv5*)cpu)->PU_UserMap; } - bool dataabort; - if constexpr (size == 8) dataabort = !cpu->DataWrite8 (addr, storeval); - if constexpr (size == 16) dataabort = !cpu->DataWrite16(addr, storeval); - if constexpr (size == 32) dataabort = !cpu->DataWrite32(addr, storeval); + bool dabort; + if constexpr (size == 8) dabort = !cpu->DataWrite8 (addr, storeval); + if constexpr (size == 16) dabort = !cpu->DataWrite16(addr, storeval); + if constexpr (size == 32) dabort = !cpu->DataWrite32(addr, storeval); if constexpr (writeback == Writeback::Trans) { @@ -140,7 +143,11 @@ void StoreSingle(ARM* cpu, u8 rd, u8 rn, s32 offset) } cpu->AddCycles_CD(); - if (dataabort) return; + if (dabort) + { + ((ARMv5*)cpu)->DataAbort(); + return; + } if constexpr (writeback != Writeback::None) cpu->R[rn] += offset; } @@ -273,8 +280,12 @@ A_IMPLEMENT_WB_LDRSTR(LDRB) offset += cpu->R[(cpu->CurInstr>>16) & 0xF]; \ u32 r = (cpu->CurInstr>>12) & 0xF; \ if (r&1) { A_UNK(cpu); return; } \ - if (!cpu->DataRead32 (offset , &cpu->R[r ])) 
{cpu->AddCycles_CDI(); return;} \ - u32 val; if (!cpu->DataRead32S(offset+4, &val)) {cpu->AddCycles_CDI(); return;} \ + bool dabort = !cpu->DataRead32(offset, &cpu->R[r]); \ + u32 val; dabort |= !cpu->DataRead32S(offset+4, &val); \ + if (dabort) { \ + cpu->AddCycles_CDI(); \ + ((ARMv5*)cpu)->DataAbort(); \ + return; } \ if (r == 14) cpu->JumpTo(((((ARMv5*)cpu)->CP15Control & (1<<15)) ? (val & ~0x1) : val), cpu->CurInstr & (1<<22)); /* restores cpsr presumably due to shared dna with ldm */ \ else cpu->R[r+1] = val; \ cpu->AddCycles_CDI(); \ @@ -285,8 +296,12 @@ A_IMPLEMENT_WB_LDRSTR(LDRB) u32 addr = cpu->R[(cpu->CurInstr>>16) & 0xF]; \ u32 r = (cpu->CurInstr>>12) & 0xF; \ if (r&1) { A_UNK(cpu); return; } \ - if (!cpu->DataRead32 (addr , &cpu->R[r ])) {cpu->AddCycles_CDI(); return;} \ - u32 val; if (!cpu->DataRead32S(addr+4, &val)) {cpu->AddCycles_CDI(); return;} \ + bool dabort = !cpu->DataRead32(addr, &cpu->R[r]); \ + u32 val; dabort |= !cpu->DataRead32S(addr+4, &val); \ + if (dabort) { \ + cpu->AddCycles_CDI(); \ + ((ARMv5*)cpu)->DataAbort(); \ + return; } \ if (r == 14) cpu->JumpTo(((((ARMv5*)cpu)->CP15Control & (1<<15)) ? 
(val & ~0x1) : val), cpu->CurInstr & (1<<22)); /* restores cpsr presumably due to shared dna with ldm */ \ else cpu->R[r+1] = val; \ cpu->AddCycles_CDI(); \ @@ -297,11 +312,13 @@ A_IMPLEMENT_WB_LDRSTR(LDRB) offset += cpu->R[(cpu->CurInstr>>16) & 0xF]; \ u32 r = (cpu->CurInstr>>12) & 0xF; \ if (r&1) { A_UNK(cpu); return; } \ - bool dataabort = !cpu->DataWrite32(offset, cpu->R[r]); /* yes, this data abort behavior is on purpose */ \ + bool dabort = !cpu->DataWrite32(offset, cpu->R[r]); /* yes, this data abort behavior is on purpose */ \ u32 storeval = cpu->R[r+1]; if (r == 14) storeval+=4; \ - dataabort |= !cpu->DataWrite32S (offset+4, storeval, dataabort); /* no, i dont understand it either */ \ + dabort |= !cpu->DataWrite32S (offset+4, storeval); /* no, i dont understand it either */ \ cpu->AddCycles_CD(); \ - if (dataabort) return; \ + if (dabort) { \ + ((ARMv5*)cpu)->DataAbort(); \ + return; } \ if (cpu->CurInstr & (1<<21)) cpu->R[(cpu->CurInstr>>16) & 0xF] = offset; #define A_STRD_POST \ @@ -309,11 +326,13 @@ A_IMPLEMENT_WB_LDRSTR(LDRB) u32 addr = cpu->R[(cpu->CurInstr>>16) & 0xF]; \ u32 r = (cpu->CurInstr>>12) & 0xF; \ if (r&1) { A_UNK(cpu); return; } \ - bool dataabort = !cpu->DataWrite32(addr, cpu->R[r]); \ + bool dabort = !cpu->DataWrite32(addr, cpu->R[r]); \ u32 storeval = cpu->R[r+1]; if (r == 14) storeval+=4; \ - dataabort |= !cpu->DataWrite32S (addr+4, storeval, dataabort); \ + dabort |= !cpu->DataWrite32S (addr+4, storeval); \ cpu->AddCycles_CD(); \ - if (dataabort) return; \ + if (dabort) { \ + ((ARMv5*)cpu)->DataAbort(); \ + return; } \ cpu->R[(cpu->CurInstr>>16) & 0xF] += offset; #define A_LDRH \ @@ -396,9 +415,11 @@ inline void SWP(ARM* cpu) if (rd != 15) cpu->R[rd] = val; else if (cpu->Num==1) cpu->JumpTo(val & ~1); // for some reason these jumps don't seem to work on the arm 9? 
} + else ((ARMv5*)cpu)->DataAbort(); cpu->DataCycles += numD; } + else ((ARMv5*)cpu)->DataAbort(); cpu->AddCycles_CDI(); } @@ -423,6 +444,7 @@ void A_LDM(ARM* cpu) u32 oldbase = base; u32 preinc = (cpu->CurInstr & (1<<24)); bool first = true; + bool dabort = false; if (!(cpu->CurInstr & (1<<23))) // decrement { @@ -451,11 +473,12 @@ void A_LDM(ARM* cpu) if (cpu->CurInstr & (1<DataRead32 (base, &cpu->R[i]) - : cpu->DataRead32S(base, &cpu->R[i]))) - { - goto dataabort; - } + u32 val; + dabort |= !(first ? cpu->DataRead32 (base, &val) + : cpu->DataRead32S(base, &val)); + + // remaining loads still occur but are not written to a reg after a data abort is raised + if (!dabort) cpu->R[i] = val; first = false; if (!preinc) base += 4; @@ -466,11 +489,8 @@ void A_LDM(ARM* cpu) if ((cpu->CurInstr & (1<<15))) { if (preinc) base += 4; - if (!(first ? cpu->DataRead32 (base, &pc) - : cpu->DataRead32S(base, &pc))) - { - goto dataabort; - } + dabort |= !(first ? cpu->DataRead32 (base, &pc) + : cpu->DataRead32S(base, &pc)); if (!preinc) base += 4; @@ -482,6 +502,14 @@ void A_LDM(ARM* cpu) if ((cpu->CurInstr & (1<<22)) && !(cpu->CurInstr & (1<<15))) cpu->UpdateMode((cpu->CPSR&~0x1F)|0x10, cpu->CPSR, true); + // handle data aborts + if (dabort) + { + cpu->AddCycles_CDI(); + ((ARMv5*)cpu)->DataAbort(); + return; + } + // writeback to base if (cpu->CurInstr & (1<<21)) { @@ -506,19 +534,6 @@ void A_LDM(ARM* cpu) if (cpu->CurInstr & (1<<15)) cpu->JumpTo(pc, cpu->CurInstr & (1<<22)); - // jump here if a data abort occurred; writeback is ignored, and any jumps were aborted - if (false) - { - dataabort: - - // switch back to original set of regs - if ((cpu->CurInstr & (1<<22)) && !(cpu->CurInstr & (1<<15))) - cpu->UpdateMode((cpu->CPSR&~0x1F)|0x10, cpu->CPSR, true); - - // restore original value of base in case the reg got written to - cpu->R[baseid] = oldbase; - } - cpu->AddCycles_CDI(); } @@ -529,6 +544,7 @@ void A_STM(ARM* cpu) u32 oldbase = base; u32 preinc = (cpu->CurInstr & (1<<24)); 
bool first = true; + bool dabort = false; if (!(cpu->CurInstr & (1<<23))) { @@ -573,11 +589,8 @@ void A_STM(ARM* cpu) if (i == 15) val+=4; - if (!(first ? cpu->DataWrite32 (base, val) - : cpu->DataWrite32S(base, val))) - { - goto dataabort; - } + dabort |= !(first ? cpu->DataWrite32 (base, val) + : cpu->DataWrite32S(base, val)); first = false; @@ -587,21 +600,20 @@ void A_STM(ARM* cpu) if (cpu->CurInstr & (1<<22)) cpu->UpdateMode((cpu->CPSR&~0x1F)|0x10, cpu->CPSR, true); + + // handle data aborts + if (dabort) + { + // restore original value of base + cpu->R[baseid] = oldbase; + cpu->AddCycles_CD(); + ((ARMv5*)cpu)->DataAbort(); + return; + } if ((cpu->CurInstr & (1<<23)) && (cpu->CurInstr & (1<<21))) cpu->R[baseid] = base; - // jump here if a data abort occurred - if (false) - { - dataabort: - - if (cpu->CurInstr & (1<<22)) - cpu->UpdateMode((cpu->CPSR&~0x1F)|0x10, cpu->CPSR, true); - - // restore original value of base - cpu->R[baseid] = oldbase; - } cpu->AddCycles_CD(); } @@ -616,9 +628,13 @@ void A_STM(ARM* cpu) void T_LDR_PCREL(ARM* cpu) { u32 addr = (cpu->R[15] & ~0x2) + ((cpu->CurInstr & 0xFF) << 2); - cpu->DataRead32(addr, &cpu->R[(cpu->CurInstr >> 8) & 0x7]); + bool dabort = !cpu->DataRead32(addr, &cpu->R[(cpu->CurInstr >> 8) & 0x7]); cpu->AddCycles_CDI(); + if (dabort) + { + ((ARMv5*)cpu)->DataAbort(); + } } @@ -711,6 +727,7 @@ void T_PUSH(ARM* cpu) { int nregs = 0; bool first = true; + bool dabort = false; for (int i = 0; i < 8; i++) { @@ -729,11 +746,9 @@ void T_PUSH(ARM* cpu) { if (cpu->CurInstr & (1<DataWrite32 (base, cpu->R[i]) - : cpu->DataWrite32S(base, cpu->R[i]))) - { - goto dataabort; - } + dabort |= !(first ? cpu->DataWrite32 (base, cpu->R[i]) + : cpu->DataWrite32S(base, cpu->R[i])); + first = false; base += 4; } @@ -741,16 +756,19 @@ void T_PUSH(ARM* cpu) if (cpu->CurInstr & (1<<8)) { - if (!(first ? cpu->DataWrite32 (base, cpu->R[14]) - : cpu->DataWrite32S(base, cpu->R[14]))) - { - goto dataabort; - } + dabort |= !(first ? 
cpu->DataWrite32 (base, cpu->R[14]) + : cpu->DataWrite32S(base, cpu->R[14])); + } + + if (dabort) + { + cpu->AddCycles_CD(); + ((ARMv5*)cpu)->DataAbort(); + return; } cpu->R[13] = wbbase; - dataabort: cpu->AddCycles_CD(); } @@ -758,16 +776,18 @@ void T_POP(ARM* cpu) { u32 base = cpu->R[13]; bool first = true; + bool dabort = false; for (int i = 0; i < 8; i++) { if (cpu->CurInstr & (1<DataRead32 (base, &cpu->R[i]) - : cpu->DataRead32S(base, &cpu->R[i]))) - { - goto dataabort; - } + u32 val; + dabort |= !(first ? cpu->DataRead32 (base, &val) + : cpu->DataRead32S(base, &val)); + + if (!dabort) cpu->R[i] = val; + first = false; base += 4; } @@ -776,19 +796,25 @@ void T_POP(ARM* cpu) if (cpu->CurInstr & (1<<8)) { u32 pc; - if (!(first ? cpu->DataRead32 (base, &pc) - : cpu->DataRead32S(base, &pc))) - { - goto dataabort; - } + dabort |= !(first ? cpu->DataRead32 (base, &pc) + : cpu->DataRead32S(base, &pc)); + + if (dabort) goto dataabort; if (cpu->Num==1 || (((ARMv5*)cpu)->CP15Control & (1<<15))) pc |= 0x1; cpu->JumpTo(pc); base += 4; } + if (dabort) + { + dataabort: + cpu->AddCycles_CDI(); + ((ARMv5*)cpu)->DataAbort(); + return; + } + cpu->R[13] = base; - dataabort: cpu->AddCycles_CDI(); } @@ -796,24 +822,29 @@ void T_STMIA(ARM* cpu) { u32 base = cpu->R[(cpu->CurInstr >> 8) & 0x7]; bool first = true; + bool dabort = false; for (int i = 0; i < 8; i++) { if (cpu->CurInstr & (1<DataWrite32 (base, cpu->R[i]) - : cpu->DataWrite32S(base, cpu->R[i]))) - { - goto dataabort; - } + dabort |= !(first ? 
cpu->DataWrite32 (base, cpu->R[i]) + : cpu->DataWrite32S(base, cpu->R[i])); + first = false; base += 4; } } + if (dabort) + { + cpu->AddCycles_CD(); + ((ARMv5*)cpu)->DataAbort(); + return; + } + // TODO: check "Rb included in Rlist" case cpu->R[(cpu->CurInstr >> 8) & 0x7] = base; - dataabort: cpu->AddCycles_CD(); } @@ -821,25 +852,32 @@ void T_LDMIA(ARM* cpu) { u32 base = cpu->R[(cpu->CurInstr >> 8) & 0x7]; bool first = true; + bool dabort = false; for (int i = 0; i < 8; i++) { if (cpu->CurInstr & (1<DataRead32 (base, &cpu->R[i]) - : cpu->DataRead32S(base, &cpu->R[i]))) - { - goto dataabort; - } + u32 val; + dabort |= !(first ? cpu->DataRead32 (base, &val) + : cpu->DataRead32S(base, &val)); + + if (!dabort) cpu->R[i] = val; first = false; base += 4; } } + if (dabort) + { + cpu->AddCycles_CDI(); + ((ARMv5*)cpu)->DataAbort(); + return; + } + if (!(cpu->CurInstr & (1<<((cpu->CurInstr >> 8) & 0x7)))) cpu->R[(cpu->CurInstr >> 8) & 0x7] = base; - dataabort: cpu->AddCycles_CDI(); } diff --git a/src/CP15.cpp b/src/CP15.cpp index 6fcaff93..5bffb185 100644 --- a/src/CP15.cpp +++ b/src/CP15.cpp @@ -806,9 +806,11 @@ u32 ARMv5::CodeRead32(u32 addr, bool branch) bool ARMv5::DataRead8(u32 addr, u32* val) { + // Data Aborts + // Exception is handled in the actual instruction implementation if (!(PU_Map[addr>>12] & 0x01)) [[unlikely]] { - DataAbort(); + DataCycles = 1; return false; } @@ -832,9 +834,11 @@ bool ARMv5::DataRead8(u32 addr, u32* val) bool ARMv5::DataRead16(u32 addr, u32* val) { + // Data Aborts + // Exception is handled in the actual instruction implementation if (!(PU_Map[addr>>12] & 0x01)) [[unlikely]] { - DataAbort(); + DataCycles = 1; return false; } @@ -860,9 +864,11 @@ bool ARMv5::DataRead16(u32 addr, u32* val) bool ARMv5::DataRead32(u32 addr, u32* val) { + // Data Aborts + // Exception is handled in the actual instruction implementation if (!(PU_Map[addr>>12] & 0x01)) [[unlikely]] { - DataAbort(); + DataCycles = 1; return false; } @@ -888,9 +894,11 @@ bool 
ARMv5::DataRead32(u32 addr, u32* val) bool ARMv5::DataRead32S(u32 addr, u32* val) { + // Data Aborts + // Exception is handled in the actual instruction implementation if (!(PU_Map[addr>>12] & 0x01)) [[unlikely]] { - DataAbort(); + DataCycles += 1; return false; } @@ -916,9 +924,11 @@ bool ARMv5::DataRead32S(u32 addr, u32* val) bool ARMv5::DataWrite8(u32 addr, u8 val) { + // Data Aborts + // Exception is handled in the actual instruction implementation if (!(PU_Map[addr>>12] & 0x02)) [[unlikely]] { - DataAbort(); + DataCycles = 1; return false; } @@ -943,9 +953,11 @@ bool ARMv5::DataWrite8(u32 addr, u8 val) bool ARMv5::DataWrite16(u32 addr, u16 val) { + // Data Aborts + // Exception is handled in the actual instruction implementation if (!(PU_Map[addr>>12] & 0x02)) [[unlikely]] { - DataAbort(); + DataCycles = 1; return false; } @@ -972,9 +984,11 @@ bool ARMv5::DataWrite16(u32 addr, u16 val) bool ARMv5::DataWrite32(u32 addr, u32 val) { + // Data Aborts + // Exception is handled in the actual instruction implementation if (!(PU_Map[addr>>12] & 0x02)) [[unlikely]] { - DataAbort(); + DataCycles = 1; return false; } @@ -999,11 +1013,13 @@ bool ARMv5::DataWrite32(u32 addr, u32 val) return true; } -bool ARMv5::DataWrite32S(u32 addr, u32 val, bool dataabort) +bool ARMv5::DataWrite32S(u32 addr, u32 val) { + // Data Aborts + // Exception is handled in the actual instruction implementation if (!(PU_Map[addr>>12] & 0x02)) [[unlikely]] { - if (!dataabort) DataAbort(); + DataCycles += 1; return false; } From a0d71135a1ff2ddd55b0e26b5b55ef5260fdb448 Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Fri, 13 Sep 2024 07:33:18 -0400 Subject: [PATCH 125/306] very minor optimization attempt --- src/ARMInterpreter_LoadStore.cpp | 34 ++++++++++++++++---------------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/src/ARMInterpreter_LoadStore.cpp b/src/ARMInterpreter_LoadStore.cpp index bbbe08fd..bf187aca 100644 --- 
a/src/ARMInterpreter_LoadStore.cpp +++ b/src/ARMInterpreter_LoadStore.cpp @@ -94,7 +94,7 @@ void LoadSingle(ARM* cpu, u8 rd, u8 rn, s32 offset) } cpu->AddCycles_CDI(); - if (dabort) + if (dabort) [[unlikely]] { ((ARMv5*)cpu)->DataAbort(); return; @@ -143,7 +143,7 @@ void StoreSingle(ARM* cpu, u8 rd, u8 rn, s32 offset) } cpu->AddCycles_CD(); - if (dabort) + if (dabort) [[unlikely]] { ((ARMv5*)cpu)->DataAbort(); return; @@ -316,7 +316,7 @@ A_IMPLEMENT_WB_LDRSTR(LDRB) u32 storeval = cpu->R[r+1]; if (r == 14) storeval+=4; \ dabort |= !cpu->DataWrite32S (offset+4, storeval); /* no, i dont understand it either */ \ cpu->AddCycles_CD(); \ - if (dabort) { \ + if (dabort) [[unlikely]] { \ ((ARMv5*)cpu)->DataAbort(); \ return; } \ if (cpu->CurInstr & (1<<21)) cpu->R[(cpu->CurInstr>>16) & 0xF] = offset; @@ -330,7 +330,7 @@ A_IMPLEMENT_WB_LDRSTR(LDRB) u32 storeval = cpu->R[r+1]; if (r == 14) storeval+=4; \ dabort |= !cpu->DataWrite32S (addr+4, storeval); \ cpu->AddCycles_CD(); \ - if (dabort) { \ + if (dabort) [[unlikely]] { \ ((ARMv5*)cpu)->DataAbort(); \ return; } \ cpu->R[(cpu->CurInstr>>16) & 0xF] += offset; @@ -400,12 +400,12 @@ inline void SWP(ARM* cpu) u32 val; if ((byte ? cpu->DataRead8 (base, &val) - : cpu->DataRead32(base, &val))) + : cpu->DataRead32(base, &val))) [[likely]] { u32 numD = cpu->DataCycles; if ((byte ? 
cpu->DataWrite8 (base, rm) - : cpu->DataWrite32(base, rm))) + : cpu->DataWrite32(base, rm))) [[likely]] { // rd only gets updated if both read and write succeed u32 rd = (cpu->CurInstr >> 12) & 0xF; @@ -478,7 +478,7 @@ void A_LDM(ARM* cpu) : cpu->DataRead32S(base, &val)); // remaining loads still occur but are not written to a reg after a data abort is raised - if (!dabort) cpu->R[i] = val; + if (!dabort) [[likely]] cpu->R[i] = val; first = false; if (!preinc) base += 4; @@ -503,7 +503,7 @@ void A_LDM(ARM* cpu) cpu->UpdateMode((cpu->CPSR&~0x1F)|0x10, cpu->CPSR, true); // handle data aborts - if (dabort) + if (dabort) [[unlikely]] { cpu->AddCycles_CDI(); ((ARMv5*)cpu)->DataAbort(); @@ -602,7 +602,7 @@ void A_STM(ARM* cpu) cpu->UpdateMode((cpu->CPSR&~0x1F)|0x10, cpu->CPSR, true); // handle data aborts - if (dabort) + if (dabort) [[unlikely]] { // restore original value of base cpu->R[baseid] = oldbase; @@ -631,7 +631,7 @@ void T_LDR_PCREL(ARM* cpu) bool dabort = !cpu->DataRead32(addr, &cpu->R[(cpu->CurInstr >> 8) & 0x7]); cpu->AddCycles_CDI(); - if (dabort) + if (dabort) [[unlikely]] { ((ARMv5*)cpu)->DataAbort(); } @@ -760,7 +760,7 @@ void T_PUSH(ARM* cpu) : cpu->DataWrite32S(base, cpu->R[14])); } - if (dabort) + if (dabort) [[unlikely]] { cpu->AddCycles_CD(); ((ARMv5*)cpu)->DataAbort(); @@ -786,7 +786,7 @@ void T_POP(ARM* cpu) dabort |= !(first ? cpu->DataRead32 (base, &val) : cpu->DataRead32S(base, &val)); - if (!dabort) cpu->R[i] = val; + if (!dabort) [[likely]] cpu->R[i] = val; first = false; base += 4; @@ -799,13 +799,13 @@ void T_POP(ARM* cpu) dabort |= !(first ? 
cpu->DataRead32 (base, &pc) : cpu->DataRead32S(base, &pc)); - if (dabort) goto dataabort; + if (dabort) [[unlikely]] goto dataabort; if (cpu->Num==1 || (((ARMv5*)cpu)->CP15Control & (1<<15))) pc |= 0x1; cpu->JumpTo(pc); base += 4; } - if (dabort) + if (dabort) [[unlikely]] { dataabort: cpu->AddCycles_CDI(); @@ -836,7 +836,7 @@ void T_STMIA(ARM* cpu) } } - if (dabort) + if (dabort) [[unlikely]] { cpu->AddCycles_CD(); ((ARMv5*)cpu)->DataAbort(); @@ -862,13 +862,13 @@ void T_LDMIA(ARM* cpu) dabort |= !(first ? cpu->DataRead32 (base, &val) : cpu->DataRead32S(base, &val)); - if (!dabort) cpu->R[i] = val; + if (!dabort) [[likely]] cpu->R[i] = val; first = false; base += 4; } } - if (dabort) + if (dabort) [[unlikely]] { cpu->AddCycles_CDI(); ((ARMv5*)cpu)->DataAbort(); From 3b9a9e4eb3d8de840ceab9e0ff57c8dc1092d6a5 Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Mon, 16 Sep 2024 10:23:15 -0400 Subject: [PATCH 126/306] multiply instructions can't write to r15 --- src/ARMInterpreter_ALU.cpp | 62 ++++++++++++++++++++++++++------------ 1 file changed, 43 insertions(+), 19 deletions(-) diff --git a/src/ARMInterpreter_ALU.cpp b/src/ARMInterpreter_ALU.cpp index 37c79904..350ed168 100644 --- a/src/ARMInterpreter_ALU.cpp +++ b/src/ARMInterpreter_ALU.cpp @@ -766,7 +766,9 @@ void A_MUL(ARM* cpu) u32 res = rm * rs; - cpu->R[(cpu->CurInstr >> 16) & 0xF] = res; + if (((cpu->CurInstr >> 16) & 0xF) != 15) // check arm7 + cpu->R[(cpu->CurInstr >> 16) & 0xF] = res; + if (cpu->CurInstr & (1<<20)) { cpu->SetNZ(res & 0x80000000, @@ -795,8 +797,10 @@ void A_MLA(ARM* cpu) u32 rn = cpu->R[(cpu->CurInstr >> 12) & 0xF]; u32 res = (rm * rs) + rn; + + if (((cpu->CurInstr >> 16) & 0xF) != 15) // check arm7 + cpu->R[(cpu->CurInstr >> 16) & 0xF] = res; - cpu->R[(cpu->CurInstr >> 16) & 0xF] = res; if (cpu->CurInstr & (1<<20)) { cpu->SetNZ(res & 0x80000000, @@ -825,8 +829,11 @@ void A_UMULL(ARM* cpu) u64 res = (u64)rm * (u64)rs; - cpu->R[(cpu->CurInstr >> 12) & 0xF] 
= (u32)res; - cpu->R[(cpu->CurInstr >> 16) & 0xF] = (u32)(res >> 32ULL); + if (((cpu->CurInstr >> 12) & 0xF) != 15) + cpu->R[(cpu->CurInstr >> 12) & 0xF] = (u32)res; + if (((cpu->CurInstr >> 16) & 0xF) != 15) + cpu->R[(cpu->CurInstr >> 16) & 0xF] = (u32)(res >> 32ULL); + if (cpu->CurInstr & (1<<20)) { cpu->SetNZ((u32)(res >> 63ULL), @@ -857,9 +864,12 @@ void A_UMLAL(ARM* cpu) u64 rd = (u64)cpu->R[(cpu->CurInstr >> 12) & 0xF] | ((u64)cpu->R[(cpu->CurInstr >> 16) & 0xF] << 32ULL); res += rd; + + if (((cpu->CurInstr >> 12) & 0xF) != 15) + cpu->R[(cpu->CurInstr >> 12) & 0xF] = (u32)res; + if (((cpu->CurInstr >> 16) & 0xF) != 15) + cpu->R[(cpu->CurInstr >> 16) & 0xF] = (u32)(res >> 32ULL); - cpu->R[(cpu->CurInstr >> 12) & 0xF] = (u32)res; - cpu->R[(cpu->CurInstr >> 16) & 0xF] = (u32)(res >> 32ULL); if (cpu->CurInstr & (1<<20)) { cpu->SetNZ((u32)(res >> 63ULL), @@ -887,9 +897,12 @@ void A_SMULL(ARM* cpu) u32 rs = cpu->R[(cpu->CurInstr >> 8) & 0xF]; s64 res = (s64)(s32)rm * (s64)(s32)rs; + + if (((cpu->CurInstr >> 12) & 0xF) != 15) + cpu->R[(cpu->CurInstr >> 12) & 0xF] = (u32)res; + if (((cpu->CurInstr >> 16) & 0xF) != 15) + cpu->R[(cpu->CurInstr >> 16) & 0xF] = (u32)(res >> 32ULL); - cpu->R[(cpu->CurInstr >> 12) & 0xF] = (u32)res; - cpu->R[(cpu->CurInstr >> 16) & 0xF] = (u32)(res >> 32ULL); if (cpu->CurInstr & (1<<20)) { cpu->SetNZ((u32)(res >> 63ULL), @@ -920,9 +933,12 @@ void A_SMLAL(ARM* cpu) s64 rd = (s64)((u64)cpu->R[(cpu->CurInstr >> 12) & 0xF] | ((u64)cpu->R[(cpu->CurInstr >> 16) & 0xF] << 32ULL)); res += rd; + + if (((cpu->CurInstr >> 12) & 0xF) != 15) + cpu->R[(cpu->CurInstr >> 12) & 0xF] = (u32)res; + if (((cpu->CurInstr >> 16) & 0xF) != 15) + cpu->R[(cpu->CurInstr >> 16) & 0xF] = (u32)(res >> 32ULL); - cpu->R[(cpu->CurInstr >> 12) & 0xF] = (u32)res; - cpu->R[(cpu->CurInstr >> 16) & 0xF] = (u32)(res >> 32ULL); if (cpu->CurInstr & (1<<20)) { cpu->SetNZ((u32)(res >> 63ULL), @@ -959,8 +975,10 @@ void A_SMLAxy(ARM* cpu) u32 res_mul = ((s16)rm * (s16)rs); u32 res = 
res_mul + rn; + + if (((cpu->CurInstr >> 16) & 0xF) != 15) + cpu->R[(cpu->CurInstr >> 16) & 0xF] = res; - cpu->R[(cpu->CurInstr >> 16) & 0xF] = res; if (OverflowAdd(res_mul, rn)) cpu->CPSR |= 0x08000000; @@ -980,8 +998,9 @@ void A_SMLAWy(ARM* cpu) u32 res_mul = ((s64)(s32)rm * (s16)rs) >> 16; u32 res = res_mul + rn; - - cpu->R[(cpu->CurInstr >> 16) & 0xF] = res; + + if (((cpu->CurInstr >> 16) & 0xF) != 15) + cpu->R[(cpu->CurInstr >> 16) & 0xF] = res; if (OverflowAdd(res_mul, rn)) cpu->CPSR |= 0x08000000; @@ -1001,8 +1020,9 @@ void A_SMULxy(ARM* cpu) else rs &= 0xFFFF; u32 res = ((s16)rm * (s16)rs); - - cpu->R[(cpu->CurInstr >> 16) & 0xF] = res; + + if (((cpu->CurInstr >> 16) & 0xF) != 15) + cpu->R[(cpu->CurInstr >> 16) & 0xF] = res; cpu->AddCycles_C(); // TODO: interlock?? } @@ -1017,8 +1037,9 @@ void A_SMULWy(ARM* cpu) else rs &= 0xFFFF; u32 res = ((s64)(s32)rm * (s16)rs) >> 16; - - cpu->R[(cpu->CurInstr >> 16) & 0xF] = res; + + if (((cpu->CurInstr >> 16) & 0xF) != 15) + cpu->R[(cpu->CurInstr >> 16) & 0xF] = res; cpu->AddCycles_C(); // TODO: interlock?? } @@ -1039,8 +1060,11 @@ void A_SMLALxy(ARM* cpu) s64 rd = (s64)((u64)cpu->R[(cpu->CurInstr >> 12) & 0xF] | ((u64)cpu->R[(cpu->CurInstr >> 16) & 0xF] << 32ULL)); res += rd; - cpu->R[(cpu->CurInstr >> 12) & 0xF] = (u32)res; - cpu->R[(cpu->CurInstr >> 16) & 0xF] = (u32)(res >> 32ULL); + if (((cpu->CurInstr >> 12) & 0xF) != 15) + cpu->R[(cpu->CurInstr >> 12) & 0xF] = (u32)res; + + if (((cpu->CurInstr >> 16) & 0xF) != 15) + cpu->R[(cpu->CurInstr >> 16) & 0xF] = (u32)(res >> 32ULL); cpu->AddCycles_CI(1); // TODO: interlock?? 
} From ac8c942565f956402f681c9cc8fa8b6eb6e0e74b Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Mon, 16 Sep 2024 13:10:13 -0400 Subject: [PATCH 127/306] sat add/sub also fail to jump --- src/ARMInterpreter_ALU.cpp | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/src/ARMInterpreter_ALU.cpp b/src/ARMInterpreter_ALU.cpp index 350ed168..54c1d6d3 100644 --- a/src/ARMInterpreter_ALU.cpp +++ b/src/ARMInterpreter_ALU.cpp @@ -1110,7 +1110,9 @@ void A_QADD(ARM* cpu) cpu->CPSR |= 0x08000000; } - cpu->R[(cpu->CurInstr >> 12) & 0xF] = res; + if (((cpu->CurInstr >> 12) & 0xF) != 15) + cpu->R[(cpu->CurInstr >> 12) & 0xF] = res; + cpu->AddCycles_C(); // TODO: interlock?? } @@ -1127,8 +1129,10 @@ void A_QSUB(ARM* cpu) res = (res & 0x80000000) ? 0x7FFFFFFF : 0x80000000; cpu->CPSR |= 0x08000000; } + + if (((cpu->CurInstr >> 12) & 0xF) != 15) + cpu->R[(cpu->CurInstr >> 12) & 0xF] = res; - cpu->R[(cpu->CurInstr >> 12) & 0xF] = res; cpu->AddCycles_C(); // TODO: interlock?? } @@ -1153,8 +1157,10 @@ void A_QDADD(ARM* cpu) res = (res & 0x80000000) ? 0x7FFFFFFF : 0x80000000; cpu->CPSR |= 0x08000000; } + + if (((cpu->CurInstr >> 12) & 0xF) != 15) + cpu->R[(cpu->CurInstr >> 12) & 0xF] = res; - cpu->R[(cpu->CurInstr >> 12) & 0xF] = res; cpu->AddCycles_C(); // TODO: interlock?? } @@ -1179,8 +1185,10 @@ void A_QDSUB(ARM* cpu) res = (res & 0x80000000) ? 0x7FFFFFFF : 0x80000000; cpu->CPSR |= 0x08000000; } + + if (((cpu->CurInstr >> 12) & 0xF) != 15) + cpu->R[(cpu->CurInstr >> 12) & 0xF] = res; - cpu->R[(cpu->CurInstr >> 12) & 0xF] = res; cpu->AddCycles_C(); // TODO: interlock?? 
} From e2f3dd1e6f1ae6602c5dc63f65ffb4908203ad7f Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Mon, 16 Sep 2024 13:27:36 -0400 Subject: [PATCH 128/306] clarify --- src/ARMInterpreter_ALU.cpp | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/ARMInterpreter_ALU.cpp b/src/ARMInterpreter_ALU.cpp index 54c1d6d3..46c703cd 100644 --- a/src/ARMInterpreter_ALU.cpp +++ b/src/ARMInterpreter_ALU.cpp @@ -766,7 +766,8 @@ void A_MUL(ARM* cpu) u32 res = rm * rs; - if (((cpu->CurInstr >> 16) & 0xF) != 15) // check arm7 + // all multiply instructions fail writes to r15 on arm7/9 + if (((cpu->CurInstr >> 16) & 0xF) != 15) cpu->R[(cpu->CurInstr >> 16) & 0xF] = res; if (cpu->CurInstr & (1<<20)) @@ -798,7 +799,7 @@ void A_MLA(ARM* cpu) u32 res = (rm * rs) + rn; - if (((cpu->CurInstr >> 16) & 0xF) != 15) // check arm7 + if (((cpu->CurInstr >> 16) & 0xF) != 15) cpu->R[(cpu->CurInstr >> 16) & 0xF] = res; if (cpu->CurInstr & (1<<20)) @@ -1110,6 +1111,7 @@ void A_QADD(ARM* cpu) cpu->CPSR |= 0x08000000; } + // all saturated math instructions fail writes to r15 if (((cpu->CurInstr >> 12) & 0xF) != 15) cpu->R[(cpu->CurInstr >> 12) & 0xF] = res; From e5654ec541528f606b27c854bb4d2ae981ab79d2 Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Mon, 16 Sep 2024 17:50:09 -0400 Subject: [PATCH 129/306] r15 mrc mrs --- src/ARMInterpreter.cpp | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/src/ARMInterpreter.cpp b/src/ARMInterpreter.cpp index 15ec42db..8ce15db1 100644 --- a/src/ARMInterpreter.cpp +++ b/src/ARMInterpreter.cpp @@ -201,7 +201,12 @@ void A_MRS(ARM* cpu) else psr = cpu->CPSR; - cpu->R[(cpu->CurInstr>>12) & 0xF] = psr; + if (((cpu->CurInstr>>12) & 0xF) == 15) + { + if (cpu->Num == 1) // doesn't seem to jump on the arm9? checkme + cpu->JumpTo(psr & ~0x1); // checkme: this shouldn't be able to switch to thumb? 
+ } + else cpu->R[(cpu->CurInstr>>12) & 0xF] = psr; if (cpu->Num != 1) cpu->AddCycles_CI(1); // arm9 else cpu->AddCycles_C(); // arm7 @@ -248,12 +253,13 @@ void A_MRC(ARM* cpu) u32 cn = (cpu->CurInstr >> 16) & 0xF; u32 cm = cpu->CurInstr & 0xF; u32 cpinfo = (cpu->CurInstr >> 5) & 0x7; + u32 rd = (cpu->CurInstr>>12) & 0xF; - if (cpu->Num==0 && cp==15) + if (cpu->Num==0 && cp==15 && rd!=15) { - cpu->R[(cpu->CurInstr>>12)&0xF] = ((ARMv5*)cpu)->CP15Read((cn<<8)|(cm<<4)|cpinfo); + cpu->R[rd] = ((ARMv5*)cpu)->CP15Read((cn<<8)|(cm<<4)|cpinfo); } - else if (cpu->Num==1 && cp==14) + else if (cpu->Num==1 && cp==14 && rd!=15) { Log(LogLevel::Debug, "MRC p14,%d,%d,%d on ARM7\n", cn, cm, cpinfo); } From 89e8549a556c0172feece95ce35dfeb61b01f2c1 Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Mon, 16 Sep 2024 21:27:31 -0400 Subject: [PATCH 130/306] implement comparison instrs w/ rd == 15 --- src/ARMInterpreter_ALU.cpp | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/src/ARMInterpreter_ALU.cpp b/src/ARMInterpreter_ALU.cpp index 46c703cd..f04ab9b5 100644 --- a/src/ARMInterpreter_ALU.cpp +++ b/src/ARMInterpreter_ALU.cpp @@ -583,6 +583,11 @@ A_IMPLEMENT_ALU_OP(RSC,) u32 res = a & b; \ cpu->SetNZ(res & 0x80000000, \ !res); \ + if (((cpu->CurInstr>>12) & 0xF) == 15) /* yes this instruction has a secret rd for some reason */ \ + { \ + if (cpu->Num == 1) cpu->RestoreCPSR(); /* ARM7TDMI restores cpsr and does ___not___ flush the pipeline. 
*/ \ + else Platform::Log(Platform::LogLevel::Warn, "UNIMPLEMENTED: WHO IS USING TST w/ rd == 15???"); \ + } \ if (c) cpu->AddCycles_CI(c); else cpu->AddCycles_C(); A_IMPLEMENT_ALU_TEST(TST,_S) @@ -593,6 +598,11 @@ A_IMPLEMENT_ALU_TEST(TST,_S) u32 res = a ^ b; \ cpu->SetNZ(res & 0x80000000, \ !res); \ + if (((cpu->CurInstr>>12) & 0xF) == 15) /* yes this instruction has a secret rd for some reason */ \ + { \ + if (cpu->Num == 1) cpu->RestoreCPSR(); /* ARM7TDMI restores cpsr and does ___not___ flush the pipeline. */ \ + else Platform::Log(Platform::LogLevel::Warn, "UNIMPLEMENTED: WHO IS USING TEQ w/ rd == 15???"); \ + } \ if (c) cpu->AddCycles_CI(c); else cpu->AddCycles_C(); A_IMPLEMENT_ALU_TEST(TEQ,_S) @@ -605,6 +615,11 @@ A_IMPLEMENT_ALU_TEST(TEQ,_S) !res, \ CarrySub(a, b), \ OverflowSub(a, b)); \ + if (((cpu->CurInstr>>12) & 0xF) == 15) /* yes this instruction has a secret rd for some reason */ \ + { \ + if (cpu->Num == 1) cpu->RestoreCPSR(); /* ARM7TDMI restores cpsr and does ___not___ flush the pipeline. */ \ + else Platform::Log(Platform::LogLevel::Warn, "UNIMPLEMENTED: WHO IS USING CMP w/ rd == 15???"); \ + } \ if (c) cpu->AddCycles_CI(c); else cpu->AddCycles_C(); A_IMPLEMENT_ALU_TEST(CMP,) @@ -617,6 +632,11 @@ A_IMPLEMENT_ALU_TEST(CMP,) !res, \ CarryAdd(a, b), \ OverflowAdd(a, b)); \ + if (((cpu->CurInstr>>12) & 0xF) == 15) /* yes this instruction has a secret rd for some reason */ \ + { \ + if (cpu->Num == 1) cpu->RestoreCPSR(); /* ARM7TDMI restores cpsr and does ___not___ flush the pipeline. */ \ + else Platform::Log(Platform::LogLevel::Warn, "UNIMPLEMENTED: WHO IS USING CMN w/ rd == 15???"); \ + } \ if (c) cpu->AddCycles_CI(c); else cpu->AddCycles_C(); A_IMPLEMENT_ALU_TEST(CMN,) @@ -1569,6 +1589,11 @@ void T_CMP_HIREG(ARM* cpu) !res, CarrySub(a, b), OverflowSub(a, b)); + if (rd == 15) \ + { \ + if (cpu->Num == 1) cpu->RestoreCPSR(); /* ARM7TDMI restores cpsr and does ___not___ flush the pipeline. 
*/ \ + else Platform::Log(Platform::LogLevel::Warn, "UNIMPLEMENTED: CMP HIREG w/ rd == 15."); \ + } \ cpu->AddCycles_C(); } From 6ebabde39217e948406f4aff123f8a4afdf7f30b Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Wed, 18 Sep 2024 19:23:23 -0400 Subject: [PATCH 131/306] implement changing thumb bit. and bkpt ig probably wrong --- src/ARM.cpp | 33 ++++++++++++++++++++++++++++++--- src/ARM.h | 6 +++++- src/ARMInterpreter.cpp | 20 ++++++++++++++------ src/ARMInterpreter.h | 1 + src/ARM_InstrInfo.cpp | 1 + src/ARM_InstrTable.h | 2 +- 6 files changed, 52 insertions(+), 11 deletions(-) diff --git a/src/ARM.cpp b/src/ARM.cpp index a7c6c11e..c7fea92d 100644 --- a/src/ARM.cpp +++ b/src/ARM.cpp @@ -201,6 +201,13 @@ void ARMv5::Reset() ARM::Reset(); } +void ARMv4::Reset() +{ + Thumb = false; + + ARM::Reset(); +} + void ARM::DoSavestate(Savestate* file) { @@ -395,6 +402,7 @@ void ARMv4::JumpTo(u32 addr, bool restorecpsr) Cycles += NDS.ARM7MemTimings[CodeCycles][0] + NDS.ARM7MemTimings[CodeCycles][1]; CPSR |= 0x20; + Thumb = true; } else { @@ -408,6 +416,7 @@ void ARMv4::JumpTo(u32 addr, bool restorecpsr) Cycles += NDS.ARM7MemTimings[CodeCycles][2] + NDS.ARM7MemTimings[CodeCycles][3]; CPSR &= ~0x20; + Thumb = false; } } @@ -724,7 +733,12 @@ void ARMv5::Execute() ARMInterpreter::A_BLX_IMM(this); } else - AddCycles_C(); + { + if ((((CurInstr >> 4) & 0xF) | ((CurInstr >> 16) & 0xFF0)) == 0x127) + ARMInterpreter::A_BKPT(this); // always passes regardless of condition code + else + AddCycles_C(); + } } // TODO optimize this shit!!! 
@@ -826,8 +840,11 @@ void ARMv4::Execute() else #endif { - if (CPSR & 0x20) // THUMB + if (Thumb) // THUMB { + Thumb = (CPSR & 0x20); + bool fix = !Thumb; + if constexpr (mode == CPUExecuteMode::InterpreterGDB) GdbCheckC(); @@ -841,12 +858,22 @@ void ARMv4::Execute() else { // actually execute - u32 icode = (CurInstr >> 6); + u32 icode = (CurInstr >> 6) & 0x3FF; ARMInterpreter::THUMBInstrTable[icode](this); } + + if (fix) [[unlikely]] + { + // probably wrong? + // fixup + R[15] &= ~0x3; + NextInstr[1] = CodeRead32(R[15]); + } } else { + Thumb = (CPSR & 0x20); + if constexpr (mode == CPUExecuteMode::InterpreterGDB) GdbCheckC(); diff --git a/src/ARM.h b/src/ARM.h index 26080b51..8d640a30 100644 --- a/src/ARM.h +++ b/src/ARM.h @@ -385,6 +385,8 @@ class ARMv4 : public ARM public: ARMv4(melonDS::NDS& nds, std::optional gdb, bool jit); + void Reset() override; + void FillPipeline() override; void JumpTo(u32 addr, bool restorecpsr = false) override; @@ -393,7 +395,7 @@ public: template void Execute(); - u16 CodeRead16(u32 addr) + u32 CodeRead16(u32 addr) { return BusRead16(addr); } @@ -403,6 +405,8 @@ public: return BusRead32(addr); } + bool Thumb; + bool DataRead8(u32 addr, u32* val) override; bool DataRead16(u32 addr, u32* val) override; bool DataRead32(u32 addr, u32* val) override; diff --git a/src/ARMInterpreter.cpp b/src/ARMInterpreter.cpp index 8ce15db1..979e3bb8 100644 --- a/src/ARMInterpreter.cpp +++ b/src/ARMInterpreter.cpp @@ -69,6 +69,14 @@ void T_UNK(ARM* cpu) cpu->JumpTo(cpu->ExceptionBase + 0x04); } +void A_BKPT(ARM* cpu) +{ + if (cpu->Num == 1) A_UNK(cpu); // checkme + + Log(LogLevel::Warn, "BKPT: "); // combine with the prefetch abort warning message + ((ARMv5*)cpu)->PrefetchAbort(); +} + void A_MSR_IMM(ARM* cpu) @@ -105,9 +113,6 @@ void A_MSR_IMM(ARM* cpu) //if (cpu->CurInstr & (1<<18)) mask |= 0x00FF0000; // unused by arm 7 & 9 if (cpu->CurInstr & (1<<19)) mask |= ((cpu->Num==1) ? 
0xF0000000 : 0xF8000000); - if (!(cpu->CurInstr & (1<<22))) - mask &= 0xFFFFFFDF; - if ((cpu->CPSR & 0x1F) == 0x10) mask &= 0xFFFFFF00; u32 val = ROR((cpu->CurInstr & 0xFF), ((cpu->CurInstr >> 7) & 0x1E)); @@ -121,6 +126,9 @@ void A_MSR_IMM(ARM* cpu) if (!(cpu->CurInstr & (1<<22))) cpu->UpdateMode(oldpsr, cpu->CPSR); + if (cpu->Num == 0) + if (!(oldpsr & 0x20) && (cpu->CPSR & 0x20)) cpu->NextInstr[1] &= 0xFFFF; // checkme: probably not the right way to handle this + cpu->AddCycles_C(); } @@ -158,9 +166,6 @@ void A_MSR_REG(ARM* cpu) //if (cpu->CurInstr & (1<<18)) mask |= 0x00FF0000; // unused by arm 7 & 9 if (cpu->CurInstr & (1<<19)) mask |= ((cpu->Num==1) ? 0xF0000000 : 0xF8000000); - if (!(cpu->CurInstr & (1<<22))) - mask &= 0xFFFFFFDF; - if ((cpu->CPSR & 0x1F) == 0x10) mask &= 0xFFFFFF00; u32 val = cpu->R[cpu->CurInstr & 0xF]; @@ -173,6 +178,9 @@ void A_MSR_REG(ARM* cpu) if (!(cpu->CurInstr & (1<<22))) cpu->UpdateMode(oldpsr, cpu->CPSR); + + if (cpu->Num == 0) + if (!(oldpsr & 0x20) && (cpu->CPSR & 0x20)) cpu->NextInstr[1] &= 0xFFFF; // checkme: probably not the right way to handle this cpu->AddCycles_C(); } diff --git a/src/ARMInterpreter.h b/src/ARMInterpreter.h index 1066ac69..4c5ddafe 100644 --- a/src/ARMInterpreter.h +++ b/src/ARMInterpreter.h @@ -36,6 +36,7 @@ void A_MRS(ARM* cpu); void A_MCR(ARM* cpu); void A_MRC(ARM* cpu); void A_SVC(ARM* cpu); +void A_BKPT(ARM* cpu); void T_SVC(ARM* cpu); diff --git a/src/ARM_InstrInfo.cpp b/src/ARM_InstrInfo.cpp index 58838307..d1be9761 100644 --- a/src/ARM_InstrInfo.cpp +++ b/src/ARM_InstrInfo.cpp @@ -194,6 +194,7 @@ const u32 A_BX = A_BranchAlways | A_Read0 | ak(ak_BX); const u32 A_BLX_REG = A_BranchAlways | A_Link | A_Read0 | ak(ak_BLX_REG); const u32 A_UNK = A_BranchAlways | A_Link | ak(ak_UNK); +const u32 A_BKPT = A_BranchAlways | A_Link | ak(ak_UNK); const u32 A_MSR_IMM = ak(ak_MSR_IMM); const u32 A_MSR_REG = A_Read0 | ak(ak_MSR_REG); const u32 A_MRS = A_Write12 | ak(ak_MRS); diff --git a/src/ARM_InstrTable.h 
b/src/ARM_InstrTable.h index 8213c2e0..2c480f8d 100644 --- a/src/ARM_InstrTable.h +++ b/src/ARM_InstrTable.h @@ -130,7 +130,7 @@ INSTRFUNC_PROTO(ARMInstrTable[4096]) = // 0001 0010 0000 A_MSR_REG, A_BX, A_UNK, A_BLX_REG, - A_UNK, A_QSUB, A_UNK, A_UNK, + A_UNK, A_QSUB, A_UNK, A_BKPT, A_SMLAWy, A_UNK, A_SMULWy, A_STRH_REG, A_SMLAWy, A_LDRD_REG, A_SMULWy, A_STRD_REG, From 45f87a1c8d529289f031619e3b13d4e6d67c3d57 Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Thu, 19 Sep 2024 20:57:55 -0400 Subject: [PATCH 132/306] prevent t bit changes without pipeline flush on arm7 idk what's happening fully and its gonna be slow to emulate most likely we'll figure this out later --- src/ARM.cpp | 26 ++------------ src/ARM.h | 6 +--- src/ARMInterpreter.cpp | 24 ++++++++++--- src/ARMInterpreter_ALU.cpp | 71 +++++++++++++++++++++++++++++++------- 4 files changed, 80 insertions(+), 47 deletions(-) diff --git a/src/ARM.cpp b/src/ARM.cpp index c7fea92d..6518b751 100644 --- a/src/ARM.cpp +++ b/src/ARM.cpp @@ -201,13 +201,6 @@ void ARMv5::Reset() ARM::Reset(); } -void ARMv4::Reset() -{ - Thumb = false; - - ARM::Reset(); -} - void ARM::DoSavestate(Savestate* file) { @@ -402,7 +395,6 @@ void ARMv4::JumpTo(u32 addr, bool restorecpsr) Cycles += NDS.ARM7MemTimings[CodeCycles][0] + NDS.ARM7MemTimings[CodeCycles][1]; CPSR |= 0x20; - Thumb = true; } else { @@ -416,7 +408,6 @@ void ARMv4::JumpTo(u32 addr, bool restorecpsr) Cycles += NDS.ARM7MemTimings[CodeCycles][2] + NDS.ARM7MemTimings[CodeCycles][3]; CPSR &= ~0x20; - Thumb = false; } } @@ -840,11 +831,8 @@ void ARMv4::Execute() else #endif { - if (Thumb) // THUMB + if (CPSR & 0x20) // THUMB { - Thumb = (CPSR & 0x20); - bool fix = !Thumb; - if constexpr (mode == CPUExecuteMode::InterpreterGDB) GdbCheckC(); @@ -858,22 +846,12 @@ void ARMv4::Execute() else { // actually execute - u32 icode = (CurInstr >> 6) & 0x3FF; + u32 icode = (CurInstr >> 6); ARMInterpreter::THUMBInstrTable[icode](this); } - - if (fix) 
[[unlikely]] - { - // probably wrong? - // fixup - R[15] &= ~0x3; - NextInstr[1] = CodeRead32(R[15]); - } } else { - Thumb = (CPSR & 0x20); - if constexpr (mode == CPUExecuteMode::InterpreterGDB) GdbCheckC(); diff --git a/src/ARM.h b/src/ARM.h index 8d640a30..26080b51 100644 --- a/src/ARM.h +++ b/src/ARM.h @@ -385,8 +385,6 @@ class ARMv4 : public ARM public: ARMv4(melonDS::NDS& nds, std::optional gdb, bool jit); - void Reset() override; - void FillPipeline() override; void JumpTo(u32 addr, bool restorecpsr = false) override; @@ -395,7 +393,7 @@ public: template void Execute(); - u32 CodeRead16(u32 addr) + u16 CodeRead16(u32 addr) { return BusRead16(addr); } @@ -405,8 +403,6 @@ public: return BusRead32(addr); } - bool Thumb; - bool DataRead8(u32 addr, u32* val) override; bool DataRead16(u32 addr, u32* val) override; bool DataRead32(u32 addr, u32* val) override; diff --git a/src/ARMInterpreter.cpp b/src/ARMInterpreter.cpp index 979e3bb8..b11913ef 100644 --- a/src/ARMInterpreter.cpp +++ b/src/ARMInterpreter.cpp @@ -126,8 +126,15 @@ void A_MSR_IMM(ARM* cpu) if (!(cpu->CurInstr & (1<<22))) cpu->UpdateMode(oldpsr, cpu->CPSR); - if (cpu->Num == 0) - if (!(oldpsr & 0x20) && (cpu->CPSR & 0x20)) cpu->NextInstr[1] &= 0xFFFF; // checkme: probably not the right way to handle this + if (!(oldpsr & 0x20) && (cpu->CPSR & 0x20)) + { + if (cpu->Num == 0) cpu->NextInstr[1] &= 0xFFFF; // checkme: probably not the right way to handle this + else + { + Platform::Log(Platform::LogLevel::Warn, "UNIMPLEMENTED: MSR IMM T bit change on ARM7\n"); + cpu->CPSR = (cpu->CPSR & ~0x20) | (oldpsr & 0x20); // keep it from crashing the emulator at least + } + } cpu->AddCycles_C(); } @@ -178,9 +185,16 @@ void A_MSR_REG(ARM* cpu) if (!(cpu->CurInstr & (1<<22))) cpu->UpdateMode(oldpsr, cpu->CPSR); - - if (cpu->Num == 0) - if (!(oldpsr & 0x20) && (cpu->CPSR & 0x20)) cpu->NextInstr[1] &= 0xFFFF; // checkme: probably not the right way to handle this + + if (!(oldpsr & 0x20) && (cpu->CPSR & 0x20)) + { + if 
(cpu->Num == 0) cpu->NextInstr[1] &= 0xFFFF; // checkme: probably not the right way to handle this + else + { + Platform::Log(Platform::LogLevel::Warn, "UNIMPLEMENTED: MSR REG T bit change on ARM7\n"); + cpu->CPSR = (cpu->CPSR & ~0x20) | (oldpsr & 0x20); // keep it from crashing the emulator at least + } + } cpu->AddCycles_C(); } diff --git a/src/ARMInterpreter_ALU.cpp b/src/ARMInterpreter_ALU.cpp index f04ab9b5..fd60b5f0 100644 --- a/src/ARMInterpreter_ALU.cpp +++ b/src/ARMInterpreter_ALU.cpp @@ -585,8 +585,17 @@ A_IMPLEMENT_ALU_OP(RSC,) !res); \ if (((cpu->CurInstr>>12) & 0xF) == 15) /* yes this instruction has a secret rd for some reason */ \ { \ - if (cpu->Num == 1) cpu->RestoreCPSR(); /* ARM7TDMI restores cpsr and does ___not___ flush the pipeline. */ \ - else Platform::Log(Platform::LogLevel::Warn, "UNIMPLEMENTED: WHO IS USING TST w/ rd == 15???"); \ + if (cpu->Num == 1) \ + { \ + u32 oldcpsr = cpu->CPSR; \ + cpu->RestoreCPSR(); /* ARM7TDMI restores cpsr and does ___not___ flush the pipeline. */ \ + if (!(oldpsr & 0x20) && (cpu->CPSR & 0x20)) \ + { \ + Platform::Log(Platform::LogLevel::Warn, "UNIMPLEMENTED: TST T bit change on ARM7\n"); \ + cpu->CPSR = (cpu->CPSR & ~0x20) | (oldpsr & 0x20); /* keep it from crashing the emulator at least */ \ + } \ + } \ + else Platform::Log(Platform::LogLevel::Warn, "UNIMPLEMENTED: TST w/ rd == 15 on ARM9\n"); \ } \ if (c) cpu->AddCycles_CI(c); else cpu->AddCycles_C(); @@ -600,8 +609,17 @@ A_IMPLEMENT_ALU_TEST(TST,_S) !res); \ if (((cpu->CurInstr>>12) & 0xF) == 15) /* yes this instruction has a secret rd for some reason */ \ { \ - if (cpu->Num == 1) cpu->RestoreCPSR(); /* ARM7TDMI restores cpsr and does ___not___ flush the pipeline. */ \ - else Platform::Log(Platform::LogLevel::Warn, "UNIMPLEMENTED: WHO IS USING TEQ w/ rd == 15???"); \ + if (cpu->Num == 1) \ + { \ + u32 oldcpsr = cpu->CPSR; \ + cpu->RestoreCPSR(); /* ARM7TDMI restores cpsr and does ___not___ flush the pipeline. 
*/ \ + if (!(oldpsr & 0x20) && (cpu->CPSR & 0x20)) \ + { \ + Platform::Log(Platform::LogLevel::Warn, "UNIMPLEMENTED: TEQ T bit change on ARM7\n"); \ + cpu->CPSR = (cpu->CPSR & ~0x20) | (oldpsr & 0x20); /* keep it from crashing the emulator at least */ \ + } \ + } \ + else Platform::Log(Platform::LogLevel::Warn, "UNIMPLEMENTED: TEQ w/ rd == 15 on ARM9\n"); \ } \ if (c) cpu->AddCycles_CI(c); else cpu->AddCycles_C(); @@ -617,8 +635,17 @@ A_IMPLEMENT_ALU_TEST(TEQ,_S) OverflowSub(a, b)); \ if (((cpu->CurInstr>>12) & 0xF) == 15) /* yes this instruction has a secret rd for some reason */ \ { \ - if (cpu->Num == 1) cpu->RestoreCPSR(); /* ARM7TDMI restores cpsr and does ___not___ flush the pipeline. */ \ - else Platform::Log(Platform::LogLevel::Warn, "UNIMPLEMENTED: WHO IS USING CMP w/ rd == 15???"); \ + if (cpu->Num == 1) \ + { \ + u32 oldcpsr = cpu->CPSR; \ + cpu->RestoreCPSR(); /* ARM7TDMI restores cpsr and does ___not___ flush the pipeline. */ \ + if (!(oldpsr & 0x20) && (cpu->CPSR & 0x20)) \ + { \ + Platform::Log(Platform::LogLevel::Warn, "UNIMPLEMENTED: CMP T bit change on ARM7\n"); \ + cpu->CPSR = (cpu->CPSR & ~0x20) | (oldpsr & 0x20); /* keep it from crashing the emulator at least */ \ + } \ + } \ + else Platform::Log(Platform::LogLevel::Warn, "UNIMPLEMENTED: CMP w/ rd == 15 on ARM9\n"); \ } \ if (c) cpu->AddCycles_CI(c); else cpu->AddCycles_C(); @@ -634,8 +661,17 @@ A_IMPLEMENT_ALU_TEST(CMP,) OverflowAdd(a, b)); \ if (((cpu->CurInstr>>12) & 0xF) == 15) /* yes this instruction has a secret rd for some reason */ \ { \ - if (cpu->Num == 1) cpu->RestoreCPSR(); /* ARM7TDMI restores cpsr and does ___not___ flush the pipeline. */ \ - else Platform::Log(Platform::LogLevel::Warn, "UNIMPLEMENTED: WHO IS USING CMN w/ rd == 15???"); \ + if (cpu->Num == 1) \ + { \ + u32 oldcpsr = cpu->CPSR; \ + cpu->RestoreCPSR(); /* ARM7TDMI restores cpsr and does ___not___ flush the pipeline. 
*/ \ + if (!(oldpsr & 0x20) && (cpu->CPSR & 0x20)) \ + { \ + Platform::Log(Platform::LogLevel::Warn, "UNIMPLEMENTED: CMN T bit change on ARM7\n"); \ + cpu->CPSR = (cpu->CPSR & ~0x20) | (oldpsr & 0x20); /* keep it from crashing the emulator at least */ \ + } \ + } \ + else Platform::Log(Platform::LogLevel::Warn, "UNIMPLEMENTED: CMN w/ rd == 15 on ARM9\n"); \ } \ if (c) cpu->AddCycles_CI(c); else cpu->AddCycles_C(); @@ -1589,11 +1625,20 @@ void T_CMP_HIREG(ARM* cpu) !res, CarrySub(a, b), OverflowSub(a, b)); - if (rd == 15) \ - { \ - if (cpu->Num == 1) cpu->RestoreCPSR(); /* ARM7TDMI restores cpsr and does ___not___ flush the pipeline. */ \ - else Platform::Log(Platform::LogLevel::Warn, "UNIMPLEMENTED: CMP HIREG w/ rd == 15."); \ - } \ + if (rd == 15) + { + if (cpu->Num == 1) + { + u32 oldcpsr = cpu->CPSR; + cpu->RestoreCPSR(); // ARM7TDMI restores cpsr and does ___not___ flush the pipeline. + if (!(oldpsr & 0x20) && (cpu->CPSR & 0x20)) + { + Platform::Log(Platform::LogLevel::Warn, "UNIMPLEMENTED: MSR REG T bit change on ARM7\n"); + cpu->CPSR = (cpu->CPSR & ~0x20) | (oldpsr & 0x20); // keep it from crashing the emulator at least + } + } + else Platform::Log(Platform::LogLevel::Warn, "UNIMPLEMENTED: CMP HIREG w/ rd == 15 on ARM9\n"); + } cpu->AddCycles_C(); } From c1338147137dab672034c9f3074c1bace7b31e4f Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Fri, 20 Sep 2024 04:39:16 -0400 Subject: [PATCH 133/306] some day i will remember to test before pushing --- src/ARMInterpreter_ALU.cpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/ARMInterpreter_ALU.cpp b/src/ARMInterpreter_ALU.cpp index fd60b5f0..a638a49c 100644 --- a/src/ARMInterpreter_ALU.cpp +++ b/src/ARMInterpreter_ALU.cpp @@ -587,7 +587,7 @@ A_IMPLEMENT_ALU_OP(RSC,) { \ if (cpu->Num == 1) \ { \ - u32 oldcpsr = cpu->CPSR; \ + u32 oldpsr = cpu->CPSR; \ cpu->RestoreCPSR(); /* ARM7TDMI restores cpsr and does ___not___ flush the pipeline. 
*/ \ if (!(oldpsr & 0x20) && (cpu->CPSR & 0x20)) \ { \ @@ -611,7 +611,7 @@ A_IMPLEMENT_ALU_TEST(TST,_S) { \ if (cpu->Num == 1) \ { \ - u32 oldcpsr = cpu->CPSR; \ + u32 oldpsr = cpu->CPSR; \ cpu->RestoreCPSR(); /* ARM7TDMI restores cpsr and does ___not___ flush the pipeline. */ \ if (!(oldpsr & 0x20) && (cpu->CPSR & 0x20)) \ { \ @@ -637,7 +637,7 @@ A_IMPLEMENT_ALU_TEST(TEQ,_S) { \ if (cpu->Num == 1) \ { \ - u32 oldcpsr = cpu->CPSR; \ + u32 oldpsr = cpu->CPSR; \ cpu->RestoreCPSR(); /* ARM7TDMI restores cpsr and does ___not___ flush the pipeline. */ \ if (!(oldpsr & 0x20) && (cpu->CPSR & 0x20)) \ { \ @@ -663,7 +663,7 @@ A_IMPLEMENT_ALU_TEST(CMP,) { \ if (cpu->Num == 1) \ { \ - u32 oldcpsr = cpu->CPSR; \ + u32 oldpsr = cpu->CPSR; \ cpu->RestoreCPSR(); /* ARM7TDMI restores cpsr and does ___not___ flush the pipeline. */ \ if (!(oldpsr & 0x20) && (cpu->CPSR & 0x20)) \ { \ @@ -1629,7 +1629,7 @@ void T_CMP_HIREG(ARM* cpu) { if (cpu->Num == 1) { - u32 oldcpsr = cpu->CPSR; + u32 oldpsr = cpu->CPSR; cpu->RestoreCPSR(); // ARM7TDMI restores cpsr and does ___not___ flush the pipeline. 
if (!(oldpsr & 0x20) && (cpu->CPSR & 0x20)) { From 7afa805afc3719b145424c642f3858d3be75169a Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Fri, 20 Sep 2024 05:37:51 -0400 Subject: [PATCH 134/306] slightly better code --- src/ARMInterpreter.cpp | 10 +++++----- src/ARMInterpreter_ALU.cpp | 30 +++++++++++++++--------------- 2 files changed, 20 insertions(+), 20 deletions(-) diff --git a/src/ARMInterpreter.cpp b/src/ARMInterpreter.cpp index b11913ef..72d1e189 100644 --- a/src/ARMInterpreter.cpp +++ b/src/ARMInterpreter.cpp @@ -126,13 +126,13 @@ void A_MSR_IMM(ARM* cpu) if (!(cpu->CurInstr & (1<<22))) cpu->UpdateMode(oldpsr, cpu->CPSR); - if (!(oldpsr & 0x20) && (cpu->CPSR & 0x20)) + if (cpu->CPSR & 0x20) [[unlikely]] { if (cpu->Num == 0) cpu->NextInstr[1] &= 0xFFFF; // checkme: probably not the right way to handle this else { - Platform::Log(Platform::LogLevel::Warn, "UNIMPLEMENTED: MSR IMM T bit change on ARM7\n"); - cpu->CPSR = (cpu->CPSR & ~0x20) | (oldpsr & 0x20); // keep it from crashing the emulator at least + Platform::Log(Platform::LogLevel::Warn, "UNIMPLEMENTED: MSR REG T bit change on ARM7\n"); + cpu->CPSR &= ~0x20; // keep it from crashing the emulator at least } } @@ -186,13 +186,13 @@ void A_MSR_REG(ARM* cpu) if (!(cpu->CurInstr & (1<<22))) cpu->UpdateMode(oldpsr, cpu->CPSR); - if (!(oldpsr & 0x20) && (cpu->CPSR & 0x20)) + if (cpu->CPSR & 0x20) [[unlikely]] { if (cpu->Num == 0) cpu->NextInstr[1] &= 0xFFFF; // checkme: probably not the right way to handle this else { Platform::Log(Platform::LogLevel::Warn, "UNIMPLEMENTED: MSR REG T bit change on ARM7\n"); - cpu->CPSR = (cpu->CPSR & ~0x20) | (oldpsr & 0x20); // keep it from crashing the emulator at least + cpu->CPSR &= ~0x20; // keep it from crashing the emulator at least } } diff --git a/src/ARMInterpreter_ALU.cpp b/src/ARMInterpreter_ALU.cpp index a638a49c..9305fc42 100644 --- a/src/ARMInterpreter_ALU.cpp +++ b/src/ARMInterpreter_ALU.cpp @@ -583,16 +583,16 @@ 
A_IMPLEMENT_ALU_OP(RSC,) u32 res = a & b; \ cpu->SetNZ(res & 0x80000000, \ !res); \ - if (((cpu->CurInstr>>12) & 0xF) == 15) /* yes this instruction has a secret rd for some reason */ \ + if (((cpu->CurInstr>>12) & 0xF) == 15) [[unlikely]] /* yes this instruction has a secret rd for some reason */ \ { \ if (cpu->Num == 1) \ { \ u32 oldpsr = cpu->CPSR; \ cpu->RestoreCPSR(); /* ARM7TDMI restores cpsr and does ___not___ flush the pipeline. */ \ - if (!(oldpsr & 0x20) && (cpu->CPSR & 0x20)) \ + if (cpu->CPSR & 0x20) \ { \ Platform::Log(Platform::LogLevel::Warn, "UNIMPLEMENTED: TST T bit change on ARM7\n"); \ - cpu->CPSR = (cpu->CPSR & ~0x20) | (oldpsr & 0x20); /* keep it from crashing the emulator at least */ \ + cpu->CPSR &= ~0x20; /* keep it from crashing the emulator at least */ \ } \ } \ else Platform::Log(Platform::LogLevel::Warn, "UNIMPLEMENTED: TST w/ rd == 15 on ARM9\n"); \ @@ -607,16 +607,16 @@ A_IMPLEMENT_ALU_TEST(TST,_S) u32 res = a ^ b; \ cpu->SetNZ(res & 0x80000000, \ !res); \ - if (((cpu->CurInstr>>12) & 0xF) == 15) /* yes this instruction has a secret rd for some reason */ \ + if (((cpu->CurInstr>>12) & 0xF) == 15) [[unlikely]] /* yes this instruction has a secret rd for some reason */ \ { \ if (cpu->Num == 1) \ { \ u32 oldpsr = cpu->CPSR; \ cpu->RestoreCPSR(); /* ARM7TDMI restores cpsr and does ___not___ flush the pipeline. 
*/ \ - if (!(oldpsr & 0x20) && (cpu->CPSR & 0x20)) \ + if (cpu->CPSR & 0x20) \ { \ Platform::Log(Platform::LogLevel::Warn, "UNIMPLEMENTED: TEQ T bit change on ARM7\n"); \ - cpu->CPSR = (cpu->CPSR & ~0x20) | (oldpsr & 0x20); /* keep it from crashing the emulator at least */ \ + cpu->CPSR &= ~0x20; /* keep it from crashing the emulator at least */ \ } \ } \ else Platform::Log(Platform::LogLevel::Warn, "UNIMPLEMENTED: TEQ w/ rd == 15 on ARM9\n"); \ @@ -633,16 +633,16 @@ A_IMPLEMENT_ALU_TEST(TEQ,_S) !res, \ CarrySub(a, b), \ OverflowSub(a, b)); \ - if (((cpu->CurInstr>>12) & 0xF) == 15) /* yes this instruction has a secret rd for some reason */ \ + if (((cpu->CurInstr>>12) & 0xF) == 15) [[unlikely]] /* yes this instruction has a secret rd for some reason */ \ { \ if (cpu->Num == 1) \ { \ u32 oldpsr = cpu->CPSR; \ cpu->RestoreCPSR(); /* ARM7TDMI restores cpsr and does ___not___ flush the pipeline. */ \ - if (!(oldpsr & 0x20) && (cpu->CPSR & 0x20)) \ + if (cpu->CPSR & 0x20) \ { \ Platform::Log(Platform::LogLevel::Warn, "UNIMPLEMENTED: CMP T bit change on ARM7\n"); \ - cpu->CPSR = (cpu->CPSR & ~0x20) | (oldpsr & 0x20); /* keep it from crashing the emulator at least */ \ + cpu->CPSR &= ~0x20; /* keep it from crashing the emulator at least */ \ } \ } \ else Platform::Log(Platform::LogLevel::Warn, "UNIMPLEMENTED: CMP w/ rd == 15 on ARM9\n"); \ @@ -659,16 +659,16 @@ A_IMPLEMENT_ALU_TEST(CMP,) !res, \ CarryAdd(a, b), \ OverflowAdd(a, b)); \ - if (((cpu->CurInstr>>12) & 0xF) == 15) /* yes this instruction has a secret rd for some reason */ \ + if (((cpu->CurInstr>>12) & 0xF) == 15) [[unlikely]] /* yes this instruction has a secret rd for some reason */ \ { \ if (cpu->Num == 1) \ { \ u32 oldpsr = cpu->CPSR; \ cpu->RestoreCPSR(); /* ARM7TDMI restores cpsr and does ___not___ flush the pipeline. 
*/ \ - if (!(oldpsr & 0x20) && (cpu->CPSR & 0x20)) \ + if (cpu->CPSR & 0x20) \ { \ Platform::Log(Platform::LogLevel::Warn, "UNIMPLEMENTED: CMN T bit change on ARM7\n"); \ - cpu->CPSR = (cpu->CPSR & ~0x20) | (oldpsr & 0x20); /* keep it from crashing the emulator at least */ \ + cpu->CPSR &= ~0x20; /* keep it from crashing the emulator at least */ \ } \ } \ else Platform::Log(Platform::LogLevel::Warn, "UNIMPLEMENTED: CMN w/ rd == 15 on ARM9\n"); \ @@ -1625,16 +1625,16 @@ void T_CMP_HIREG(ARM* cpu) !res, CarrySub(a, b), OverflowSub(a, b)); - if (rd == 15) + if (rd == 15) [[unlikely]] { if (cpu->Num == 1) { u32 oldpsr = cpu->CPSR; cpu->RestoreCPSR(); // ARM7TDMI restores cpsr and does ___not___ flush the pipeline. - if (!(oldpsr & 0x20) && (cpu->CPSR & 0x20)) + if (!(cpu->CPSR & 0x20)) { Platform::Log(Platform::LogLevel::Warn, "UNIMPLEMENTED: MSR REG T bit change on ARM7\n"); - cpu->CPSR = (cpu->CPSR & ~0x20) | (oldpsr & 0x20); // keep it from crashing the emulator at least + cpu->CPSR |= 0x20; // keep it from crashing the emulator at least } } else Platform::Log(Platform::LogLevel::Warn, "UNIMPLEMENTED: CMP HIREG w/ rd == 15 on ARM9\n"); From 157e9c5b046199658d5c5e12a3a5b29bf944a451 Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Fri, 20 Sep 2024 13:34:27 -0400 Subject: [PATCH 135/306] reimplement changing t bit with arm7 kinda slow though? 
--- src/ARM.cpp | 52 +++++++++++++++++++++++++++-------- src/ARM.h | 3 +++ src/ARMInterpreter.cpp | 4 +-- src/ARMInterpreter_ALU.cpp | 55 ++++---------------------------------- 4 files changed, 51 insertions(+), 63 deletions(-) diff --git a/src/ARM.cpp b/src/ARM.cpp index 6518b751..ade9649f 100644 --- a/src/ARM.cpp +++ b/src/ARM.cpp @@ -385,6 +385,7 @@ void ARMv4::JumpTo(u32 addr, bool restorecpsr) if (addr & 0x1) { + Thumb = true; addr &= ~0x1; R[15] = addr+2; @@ -398,6 +399,7 @@ void ARMv4::JumpTo(u32 addr, bool restorecpsr) } else { + Thumb = false; addr &= ~0x3; R[15] = addr+4; @@ -831,35 +833,63 @@ void ARMv4::Execute() else #endif { - if (CPSR & 0x20) // THUMB + if (Thumb) // THUMB { + // attempt to delay t bit changes without a pipeline flush (msr) by one instruction + Thumb = CPSR & 0x20; if constexpr (mode == CPUExecuteMode::InterpreterGDB) GdbCheckC(); // prefetch - R[15] += 2; - CurInstr = NextInstr[0]; - NextInstr[0] = NextInstr[1]; - NextInstr[1] = CodeRead16(R[15]); + // thumb bit can change without a flush and is usually delayed 1 instruction + // but if the code fetch takes more than 1 cycle(?) 
it can take effect early for just the code fetch + if (!Thumb && (NDS.ARM7MemTimings[CodeCycles][2] > 1)) [[unlikely]] // checkme + { + R[15] = (R[15] + 4) & ~0x3; + CurInstr = NextInstr[0]; + NextInstr[0] = NextInstr[1]; + NextInstr[1] = CodeRead32(R[15]); + } + else + { + R[15] += 2; + CurInstr = NextInstr[0]; + NextInstr[0] = NextInstr[1]; + NextInstr[1] = CodeRead16(R[15]); + } if (IRQ && !(CPSR & 0x80)) TriggerIRQ(); else { // actually execute - u32 icode = (CurInstr >> 6); + u32 icode = (CurInstr >> 6) & 0x3FF; ARMInterpreter::THUMBInstrTable[icode](this); } } else { + // attempt to delay t bit changes without a pipeline flush (msr) by one instruction + Thumb = CPSR & 0x20; if constexpr (mode == CPUExecuteMode::InterpreterGDB) GdbCheckC(); - + // prefetch - R[15] += 4; - CurInstr = NextInstr[0]; - NextInstr[0] = NextInstr[1]; - NextInstr[1] = CodeRead32(R[15]); + // thumb bit can change without a flush and is usually delayed 1 instruction + // but if the code fetch takes more than 1 cycle(?) it can take effect early for just the code fetch + if (Thumb && (NDS.ARM7MemTimings[CodeCycles][2] > 1)) [[unlikely]] // checkme? 
+ { + R[15] = (R[15] + 4) & ~0x3; + CurInstr = NextInstr[0]; + NextInstr[0] = NextInstr[1]; + NextInstr[1] = CodeRead16(R[15]); + } + else + { + R[15] = (R[15] + 4) & ~0x3; + CurInstr = NextInstr[0]; + NextInstr[0] = NextInstr[1]; + NextInstr[1] = CodeRead32(R[15]); + } if (IRQ && !(CPSR & 0x80)) TriggerIRQ(); else if (CheckCondition(CurInstr >> 28)) // actually execute diff --git a/src/ARM.h b/src/ARM.h index 26080b51..81d6be39 100644 --- a/src/ARM.h +++ b/src/ARM.h @@ -416,6 +416,9 @@ public: void AddCycles_CDI() override; void AddCycles_CD() override; +private: + bool Thumb; + protected: u8 BusRead8(u32 addr) override; u16 BusRead16(u32 addr) override; diff --git a/src/ARMInterpreter.cpp b/src/ARMInterpreter.cpp index 72d1e189..cc19df3b 100644 --- a/src/ARMInterpreter.cpp +++ b/src/ARMInterpreter.cpp @@ -129,11 +129,11 @@ void A_MSR_IMM(ARM* cpu) if (cpu->CPSR & 0x20) [[unlikely]] { if (cpu->Num == 0) cpu->NextInstr[1] &= 0xFFFF; // checkme: probably not the right way to handle this - else + /*else { Platform::Log(Platform::LogLevel::Warn, "UNIMPLEMENTED: MSR REG T bit change on ARM7\n"); cpu->CPSR &= ~0x20; // keep it from crashing the emulator at least - } + }*/ } cpu->AddCycles_C(); diff --git a/src/ARMInterpreter_ALU.cpp b/src/ARMInterpreter_ALU.cpp index 9305fc42..abe2bce0 100644 --- a/src/ARMInterpreter_ALU.cpp +++ b/src/ARMInterpreter_ALU.cpp @@ -585,16 +585,7 @@ A_IMPLEMENT_ALU_OP(RSC,) !res); \ if (((cpu->CurInstr>>12) & 0xF) == 15) [[unlikely]] /* yes this instruction has a secret rd for some reason */ \ { \ - if (cpu->Num == 1) \ - { \ - u32 oldpsr = cpu->CPSR; \ - cpu->RestoreCPSR(); /* ARM7TDMI restores cpsr and does ___not___ flush the pipeline. 
*/ \ - if (cpu->CPSR & 0x20) \ - { \ - Platform::Log(Platform::LogLevel::Warn, "UNIMPLEMENTED: TST T bit change on ARM7\n"); \ - cpu->CPSR &= ~0x20; /* keep it from crashing the emulator at least */ \ - } \ - } \ + if (cpu->Num == 1) cpu->RestoreCPSR(); /* ARM7 restores cpsr and does ___not___ flush the pipeline. */ \ else Platform::Log(Platform::LogLevel::Warn, "UNIMPLEMENTED: TST w/ rd == 15 on ARM9\n"); \ } \ if (c) cpu->AddCycles_CI(c); else cpu->AddCycles_C(); @@ -609,16 +600,7 @@ A_IMPLEMENT_ALU_TEST(TST,_S) !res); \ if (((cpu->CurInstr>>12) & 0xF) == 15) [[unlikely]] /* yes this instruction has a secret rd for some reason */ \ { \ - if (cpu->Num == 1) \ - { \ - u32 oldpsr = cpu->CPSR; \ - cpu->RestoreCPSR(); /* ARM7TDMI restores cpsr and does ___not___ flush the pipeline. */ \ - if (cpu->CPSR & 0x20) \ - { \ - Platform::Log(Platform::LogLevel::Warn, "UNIMPLEMENTED: TEQ T bit change on ARM7\n"); \ - cpu->CPSR &= ~0x20; /* keep it from crashing the emulator at least */ \ - } \ - } \ + if (cpu->Num == 1) cpu->RestoreCPSR(); /* ARM7 restores cpsr and does ___not___ flush the pipeline. */ \ else Platform::Log(Platform::LogLevel::Warn, "UNIMPLEMENTED: TEQ w/ rd == 15 on ARM9\n"); \ } \ if (c) cpu->AddCycles_CI(c); else cpu->AddCycles_C(); @@ -635,16 +617,7 @@ A_IMPLEMENT_ALU_TEST(TEQ,_S) OverflowSub(a, b)); \ if (((cpu->CurInstr>>12) & 0xF) == 15) [[unlikely]] /* yes this instruction has a secret rd for some reason */ \ { \ - if (cpu->Num == 1) \ - { \ - u32 oldpsr = cpu->CPSR; \ - cpu->RestoreCPSR(); /* ARM7TDMI restores cpsr and does ___not___ flush the pipeline. */ \ - if (cpu->CPSR & 0x20) \ - { \ - Platform::Log(Platform::LogLevel::Warn, "UNIMPLEMENTED: CMP T bit change on ARM7\n"); \ - cpu->CPSR &= ~0x20; /* keep it from crashing the emulator at least */ \ - } \ - } \ + if (cpu->Num == 1) cpu->RestoreCPSR(); /* ARM7 restores cpsr and does ___not___ flush the pipeline. 
*/ \ else Platform::Log(Platform::LogLevel::Warn, "UNIMPLEMENTED: CMP w/ rd == 15 on ARM9\n"); \ } \ if (c) cpu->AddCycles_CI(c); else cpu->AddCycles_C(); @@ -661,16 +634,7 @@ A_IMPLEMENT_ALU_TEST(CMP,) OverflowAdd(a, b)); \ if (((cpu->CurInstr>>12) & 0xF) == 15) [[unlikely]] /* yes this instruction has a secret rd for some reason */ \ { \ - if (cpu->Num == 1) \ - { \ - u32 oldpsr = cpu->CPSR; \ - cpu->RestoreCPSR(); /* ARM7TDMI restores cpsr and does ___not___ flush the pipeline. */ \ - if (cpu->CPSR & 0x20) \ - { \ - Platform::Log(Platform::LogLevel::Warn, "UNIMPLEMENTED: CMN T bit change on ARM7\n"); \ - cpu->CPSR &= ~0x20; /* keep it from crashing the emulator at least */ \ - } \ - } \ + if (cpu->Num == 1) cpu->RestoreCPSR(); /* ARM7 restores cpsr and does ___not___ flush the pipeline. */ \ else Platform::Log(Platform::LogLevel::Warn, "UNIMPLEMENTED: CMN w/ rd == 15 on ARM9\n"); \ } \ if (c) cpu->AddCycles_CI(c); else cpu->AddCycles_C(); @@ -1627,16 +1591,7 @@ void T_CMP_HIREG(ARM* cpu) OverflowSub(a, b)); if (rd == 15) [[unlikely]] { - if (cpu->Num == 1) - { - u32 oldpsr = cpu->CPSR; - cpu->RestoreCPSR(); // ARM7TDMI restores cpsr and does ___not___ flush the pipeline. - if (!(cpu->CPSR & 0x20)) - { - Platform::Log(Platform::LogLevel::Warn, "UNIMPLEMENTED: MSR REG T bit change on ARM7\n"); - cpu->CPSR |= 0x20; // keep it from crashing the emulator at least - } - } + if (cpu->Num == 1) cpu->RestoreCPSR(); // ARM7 restores cpsr and does ___not___ flush the pipeline. else Platform::Log(Platform::LogLevel::Warn, "UNIMPLEMENTED: CMP HIREG w/ rd == 15 on ARM9\n"); } cpu->AddCycles_C(); From 8d451dff48b3932225d9d2b222ec7bdeedbda265 Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Fri, 20 Sep 2024 23:47:40 -0400 Subject: [PATCH 136/306] misaligned pc.......... 
--- src/ARM.cpp | 25 ++++++++++++++++--------- 1 file changed, 16 insertions(+), 9 deletions(-) diff --git a/src/ARM.cpp b/src/ARM.cpp index ade9649f..509027c5 100644 --- a/src/ARM.cpp +++ b/src/ARM.cpp @@ -837,6 +837,7 @@ void ARMv4::Execute() { // attempt to delay t bit changes without a pipeline flush (msr) by one instruction Thumb = CPSR & 0x20; + bool fix = !Thumb; if constexpr (mode == CPUExecuteMode::InterpreterGDB) GdbCheckC(); @@ -845,17 +846,17 @@ void ARMv4::Execute() // but if the code fetch takes more than 1 cycle(?) it can take effect early for just the code fetch if (!Thumb && (NDS.ARM7MemTimings[CodeCycles][2] > 1)) [[unlikely]] // checkme { - R[15] = (R[15] + 4) & ~0x3; + R[15] += 4; CurInstr = NextInstr[0]; NextInstr[0] = NextInstr[1]; - NextInstr[1] = CodeRead32(R[15]); + NextInstr[1] = CodeRead32(R[15] & ~3); } - else + else [[likely]] { R[15] += 2; CurInstr = NextInstr[0]; NextInstr[0] = NextInstr[1]; - NextInstr[1] = CodeRead16(R[15]); + NextInstr[1] = CodeRead16(R[15] & ~1); } if (IRQ && !(CPSR & 0x80)) TriggerIRQ(); @@ -865,6 +866,12 @@ void ARMv4::Execute() u32 icode = (CurInstr >> 6) & 0x3FF; ARMInterpreter::THUMBInstrTable[icode](this); } + + if (fix) [[unlikely]] // attempt at fixing flushless t bit changes + { + R[15] += 2; // yes it can end up misaligned. that's correct. + NextInstr[1] = CodeRead32(R[15] & ~3); + } } else { @@ -878,17 +885,17 @@ void ARMv4::Execute() // but if the code fetch takes more than 1 cycle(?) it can take effect early for just the code fetch if (Thumb && (NDS.ARM7MemTimings[CodeCycles][2] > 1)) [[unlikely]] // checkme? 
{ - R[15] = (R[15] + 4) & ~0x3; + R[15] += 4; CurInstr = NextInstr[0]; NextInstr[0] = NextInstr[1]; - NextInstr[1] = CodeRead16(R[15]); + NextInstr[1] = CodeRead16(R[15] & ~1); } - else + else [[likely]] { - R[15] = (R[15] + 4) & ~0x3; + R[15] += 4; CurInstr = NextInstr[0]; NextInstr[0] = NextInstr[1]; - NextInstr[1] = CodeRead32(R[15]); + NextInstr[1] = CodeRead32(R[15] & ~3); } if (IRQ && !(CPSR & 0x80)) TriggerIRQ(); From 7b0d71dbbedcea1ab0a311147a75cdd3909d0995 Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Sun, 22 Sep 2024 19:57:33 -0400 Subject: [PATCH 137/306] Revert T bit changing support for arm7 i cannot comprehend what is happening currently --- src/ARM.cpp | 59 +++++++------------------------------- src/ARM.h | 3 -- src/ARMInterpreter.cpp | 4 +-- src/ARMInterpreter_ALU.cpp | 55 +++++++++++++++++++++++++++++++---- 4 files changed, 63 insertions(+), 58 deletions(-) diff --git a/src/ARM.cpp b/src/ARM.cpp index 509027c5..6518b751 100644 --- a/src/ARM.cpp +++ b/src/ARM.cpp @@ -385,7 +385,6 @@ void ARMv4::JumpTo(u32 addr, bool restorecpsr) if (addr & 0x1) { - Thumb = true; addr &= ~0x1; R[15] = addr+2; @@ -399,7 +398,6 @@ void ARMv4::JumpTo(u32 addr, bool restorecpsr) } else { - Thumb = false; addr &= ~0x3; R[15] = addr+4; @@ -833,70 +831,35 @@ void ARMv4::Execute() else #endif { - if (Thumb) // THUMB + if (CPSR & 0x20) // THUMB { - // attempt to delay t bit changes without a pipeline flush (msr) by one instruction - Thumb = CPSR & 0x20; - bool fix = !Thumb; if constexpr (mode == CPUExecuteMode::InterpreterGDB) GdbCheckC(); // prefetch - // thumb bit can change without a flush and is usually delayed 1 instruction - // but if the code fetch takes more than 1 cycle(?) 
it can take effect early for just the code fetch - if (!Thumb && (NDS.ARM7MemTimings[CodeCycles][2] > 1)) [[unlikely]] // checkme - { - R[15] += 4; - CurInstr = NextInstr[0]; - NextInstr[0] = NextInstr[1]; - NextInstr[1] = CodeRead32(R[15] & ~3); - } - else [[likely]] - { - R[15] += 2; - CurInstr = NextInstr[0]; - NextInstr[0] = NextInstr[1]; - NextInstr[1] = CodeRead16(R[15] & ~1); - } + R[15] += 2; + CurInstr = NextInstr[0]; + NextInstr[0] = NextInstr[1]; + NextInstr[1] = CodeRead16(R[15]); if (IRQ && !(CPSR & 0x80)) TriggerIRQ(); else { // actually execute - u32 icode = (CurInstr >> 6) & 0x3FF; + u32 icode = (CurInstr >> 6); ARMInterpreter::THUMBInstrTable[icode](this); } - - if (fix) [[unlikely]] // attempt at fixing flushless t bit changes - { - R[15] += 2; // yes it can end up misaligned. that's correct. - NextInstr[1] = CodeRead32(R[15] & ~3); - } } else { - // attempt to delay t bit changes without a pipeline flush (msr) by one instruction - Thumb = CPSR & 0x20; if constexpr (mode == CPUExecuteMode::InterpreterGDB) GdbCheckC(); - + // prefetch - // thumb bit can change without a flush and is usually delayed 1 instruction - // but if the code fetch takes more than 1 cycle(?) it can take effect early for just the code fetch - if (Thumb && (NDS.ARM7MemTimings[CodeCycles][2] > 1)) [[unlikely]] // checkme? 
- { - R[15] += 4; - CurInstr = NextInstr[0]; - NextInstr[0] = NextInstr[1]; - NextInstr[1] = CodeRead16(R[15] & ~1); - } - else [[likely]] - { - R[15] += 4; - CurInstr = NextInstr[0]; - NextInstr[0] = NextInstr[1]; - NextInstr[1] = CodeRead32(R[15] & ~3); - } + R[15] += 4; + CurInstr = NextInstr[0]; + NextInstr[0] = NextInstr[1]; + NextInstr[1] = CodeRead32(R[15]); if (IRQ && !(CPSR & 0x80)) TriggerIRQ(); else if (CheckCondition(CurInstr >> 28)) // actually execute diff --git a/src/ARM.h b/src/ARM.h index 81d6be39..26080b51 100644 --- a/src/ARM.h +++ b/src/ARM.h @@ -416,9 +416,6 @@ public: void AddCycles_CDI() override; void AddCycles_CD() override; -private: - bool Thumb; - protected: u8 BusRead8(u32 addr) override; u16 BusRead16(u32 addr) override; diff --git a/src/ARMInterpreter.cpp b/src/ARMInterpreter.cpp index cc19df3b..72d1e189 100644 --- a/src/ARMInterpreter.cpp +++ b/src/ARMInterpreter.cpp @@ -129,11 +129,11 @@ void A_MSR_IMM(ARM* cpu) if (cpu->CPSR & 0x20) [[unlikely]] { if (cpu->Num == 0) cpu->NextInstr[1] &= 0xFFFF; // checkme: probably not the right way to handle this - /*else + else { Platform::Log(Platform::LogLevel::Warn, "UNIMPLEMENTED: MSR REG T bit change on ARM7\n"); cpu->CPSR &= ~0x20; // keep it from crashing the emulator at least - }*/ + } } cpu->AddCycles_C(); diff --git a/src/ARMInterpreter_ALU.cpp b/src/ARMInterpreter_ALU.cpp index abe2bce0..9305fc42 100644 --- a/src/ARMInterpreter_ALU.cpp +++ b/src/ARMInterpreter_ALU.cpp @@ -585,7 +585,16 @@ A_IMPLEMENT_ALU_OP(RSC,) !res); \ if (((cpu->CurInstr>>12) & 0xF) == 15) [[unlikely]] /* yes this instruction has a secret rd for some reason */ \ { \ - if (cpu->Num == 1) cpu->RestoreCPSR(); /* ARM7 restores cpsr and does ___not___ flush the pipeline. */ \ + if (cpu->Num == 1) \ + { \ + u32 oldpsr = cpu->CPSR; \ + cpu->RestoreCPSR(); /* ARM7TDMI restores cpsr and does ___not___ flush the pipeline. 
*/ \ + if (cpu->CPSR & 0x20) \ + { \ + Platform::Log(Platform::LogLevel::Warn, "UNIMPLEMENTED: TST T bit change on ARM7\n"); \ + cpu->CPSR &= ~0x20; /* keep it from crashing the emulator at least */ \ + } \ + } \ else Platform::Log(Platform::LogLevel::Warn, "UNIMPLEMENTED: TST w/ rd == 15 on ARM9\n"); \ } \ if (c) cpu->AddCycles_CI(c); else cpu->AddCycles_C(); @@ -600,7 +609,16 @@ A_IMPLEMENT_ALU_TEST(TST,_S) !res); \ if (((cpu->CurInstr>>12) & 0xF) == 15) [[unlikely]] /* yes this instruction has a secret rd for some reason */ \ { \ - if (cpu->Num == 1) cpu->RestoreCPSR(); /* ARM7 restores cpsr and does ___not___ flush the pipeline. */ \ + if (cpu->Num == 1) \ + { \ + u32 oldpsr = cpu->CPSR; \ + cpu->RestoreCPSR(); /* ARM7TDMI restores cpsr and does ___not___ flush the pipeline. */ \ + if (cpu->CPSR & 0x20) \ + { \ + Platform::Log(Platform::LogLevel::Warn, "UNIMPLEMENTED: TEQ T bit change on ARM7\n"); \ + cpu->CPSR &= ~0x20; /* keep it from crashing the emulator at least */ \ + } \ + } \ else Platform::Log(Platform::LogLevel::Warn, "UNIMPLEMENTED: TEQ w/ rd == 15 on ARM9\n"); \ } \ if (c) cpu->AddCycles_CI(c); else cpu->AddCycles_C(); @@ -617,7 +635,16 @@ A_IMPLEMENT_ALU_TEST(TEQ,_S) OverflowSub(a, b)); \ if (((cpu->CurInstr>>12) & 0xF) == 15) [[unlikely]] /* yes this instruction has a secret rd for some reason */ \ { \ - if (cpu->Num == 1) cpu->RestoreCPSR(); /* ARM7 restores cpsr and does ___not___ flush the pipeline. */ \ + if (cpu->Num == 1) \ + { \ + u32 oldpsr = cpu->CPSR; \ + cpu->RestoreCPSR(); /* ARM7TDMI restores cpsr and does ___not___ flush the pipeline. 
*/ \ + if (cpu->CPSR & 0x20) \ + { \ + Platform::Log(Platform::LogLevel::Warn, "UNIMPLEMENTED: CMP T bit change on ARM7\n"); \ + cpu->CPSR &= ~0x20; /* keep it from crashing the emulator at least */ \ + } \ + } \ else Platform::Log(Platform::LogLevel::Warn, "UNIMPLEMENTED: CMP w/ rd == 15 on ARM9\n"); \ } \ if (c) cpu->AddCycles_CI(c); else cpu->AddCycles_C(); @@ -634,7 +661,16 @@ A_IMPLEMENT_ALU_TEST(CMP,) OverflowAdd(a, b)); \ if (((cpu->CurInstr>>12) & 0xF) == 15) [[unlikely]] /* yes this instruction has a secret rd for some reason */ \ { \ - if (cpu->Num == 1) cpu->RestoreCPSR(); /* ARM7 restores cpsr and does ___not___ flush the pipeline. */ \ + if (cpu->Num == 1) \ + { \ + u32 oldpsr = cpu->CPSR; \ + cpu->RestoreCPSR(); /* ARM7TDMI restores cpsr and does ___not___ flush the pipeline. */ \ + if (cpu->CPSR & 0x20) \ + { \ + Platform::Log(Platform::LogLevel::Warn, "UNIMPLEMENTED: CMN T bit change on ARM7\n"); \ + cpu->CPSR &= ~0x20; /* keep it from crashing the emulator at least */ \ + } \ + } \ else Platform::Log(Platform::LogLevel::Warn, "UNIMPLEMENTED: CMN w/ rd == 15 on ARM9\n"); \ } \ if (c) cpu->AddCycles_CI(c); else cpu->AddCycles_C(); @@ -1591,7 +1627,16 @@ void T_CMP_HIREG(ARM* cpu) OverflowSub(a, b)); if (rd == 15) [[unlikely]] { - if (cpu->Num == 1) cpu->RestoreCPSR(); // ARM7 restores cpsr and does ___not___ flush the pipeline. + if (cpu->Num == 1) + { + u32 oldpsr = cpu->CPSR; + cpu->RestoreCPSR(); // ARM7TDMI restores cpsr and does ___not___ flush the pipeline. 
+ if (!(cpu->CPSR & 0x20)) + { + Platform::Log(Platform::LogLevel::Warn, "UNIMPLEMENTED: MSR REG T bit change on ARM7\n"); + cpu->CPSR |= 0x20; // keep it from crashing the emulator at least + } + } else Platform::Log(Platform::LogLevel::Warn, "UNIMPLEMENTED: CMP HIREG w/ rd == 15 on ARM9\n"); } cpu->AddCycles_C(); From 8af790beeec9e1a74648e8cb01a3492efcb6d340 Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Mon, 23 Sep 2024 15:00:35 -0400 Subject: [PATCH 138/306] ldm/str with empty rlist --- src/ARMInterpreter_LoadStore.cpp | 88 ++++++++++++++++++++++++++++++++ 1 file changed, 88 insertions(+) diff --git a/src/ARMInterpreter_LoadStore.cpp b/src/ARMInterpreter_LoadStore.cpp index bf187aca..f181476b 100644 --- a/src/ARMInterpreter_LoadStore.cpp +++ b/src/ARMInterpreter_LoadStore.cpp @@ -434,7 +434,59 @@ void A_SWPB(ARM* cpu) SWP(cpu); } +void ReglessLDMSTM(ARM* cpu, const bool load, const u8 baseid, const bool writeback, const bool decrement, bool preinc, const bool usermode) +{ + if (cpu->Num == 1) + { + u32 base = cpu->R[baseid]; + if (decrement) + { + preinc = !preinc; + base -= 0x40; + } + if (preinc) base+=4; + + if (load) + { + u32 pc; + if (cpu->DataRead32(base, &pc)) + { + cpu->AddCycles_CDI(); + cpu->JumpTo(pc, usermode); // checkme can we restore cpsr? 
+ } + else + { + cpu->AddCycles_CDI(); + ((ARMv5*)cpu)->DataAbort(); + return; + } + } + else + { + if (!cpu->DataWrite32(base, cpu->R[15])) + { + cpu->AddCycles_CD(); + ((ARMv5*)cpu)->DataAbort(); + return; + } + else + { + cpu->AddCycles_CD(); + } + } + } + else + { + cpu->AddCycles_C(); // checkme + } + + if (writeback) + { + if (decrement) cpu->R[baseid] -= 0x40; + else cpu->R[baseid] += 0x40; + } +} void A_LDM(ARM* cpu) { @@ -445,6 +497,12 @@ void A_LDM(ARM* cpu) u32 preinc = (cpu->CurInstr & (1<<24)); bool first = true; bool dabort = false; + + if (!(cpu->CurInstr & 0xFFFF)) [[unlikely]] + { + ReglessLDMSTM(cpu, true, baseid, cpu->CurInstr & (1<<21), !(cpu->CurInstr & (1<<23)), preinc, cpu->CurInstr & (1<<22)); + return; + } if (!(cpu->CurInstr & (1<<23))) // decrement { @@ -545,6 +603,12 @@ void A_STM(ARM* cpu) u32 preinc = (cpu->CurInstr & (1<<24)); bool first = true; bool dabort = false; + + if (!(cpu->CurInstr & 0xFFFF)) [[unlikely]] + { + ReglessLDMSTM(cpu, false, baseid, cpu->CurInstr & (1<<21), !(cpu->CurInstr & (1<<23)), preinc, false); + return; + } if (!(cpu->CurInstr & (1<<23))) { @@ -737,6 +801,12 @@ void T_PUSH(ARM* cpu) if (cpu->CurInstr & (1<<8)) nregs++; + + if (!nregs) [[unlikely]] + { + ReglessLDMSTM(cpu, false, 13, true, true, true, false); + return; + } u32 base = cpu->R[13]; base -= (nregs<<2); @@ -777,6 +847,12 @@ void T_POP(ARM* cpu) u32 base = cpu->R[13]; bool first = true; bool dabort = false; + + if (!(cpu->CurInstr & 0x1FF)) [[unlikely]] + { + ReglessLDMSTM(cpu, true, 13, true, false, false, false); + return; + } for (int i = 0; i < 8; i++) { @@ -823,6 +899,12 @@ void T_STMIA(ARM* cpu) u32 base = cpu->R[(cpu->CurInstr >> 8) & 0x7]; bool first = true; bool dabort = false; + + if (!(cpu->CurInstr & 0xFF)) [[unlikely]] + { + ReglessLDMSTM(cpu, false, (cpu->CurInstr >> 8) & 0x7, true, false, false, false); + return; + } for (int i = 0; i < 8; i++) { @@ -853,6 +935,12 @@ void T_LDMIA(ARM* cpu) u32 base = cpu->R[(cpu->CurInstr >> 8) & 
0x7]; bool first = true; bool dabort = false; + + if (!(cpu->CurInstr & 0xFF)) [[unlikely]] + { + ReglessLDMSTM(cpu, true, (cpu->CurInstr >> 8) & 0x7, true, false, false, false); + return; + } for (int i = 0; i < 8; i++) { From 3b73f21bb7b5836bea24d82b5ce8326dee6ac7f9 Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Mon, 23 Sep 2024 16:12:23 -0400 Subject: [PATCH 139/306] str r15 is incremented by +2/+4 oop --- src/ARMInterpreter_LoadStore.cpp | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/src/ARMInterpreter_LoadStore.cpp b/src/ARMInterpreter_LoadStore.cpp index f181476b..e8e6accc 100644 --- a/src/ARMInterpreter_LoadStore.cpp +++ b/src/ARMInterpreter_LoadStore.cpp @@ -434,7 +434,7 @@ void A_SWPB(ARM* cpu) SWP(cpu); } -void ReglessLDMSTM(ARM* cpu, const bool load, const u8 baseid, const bool writeback, const bool decrement, bool preinc, const bool usermode) +void ReglessLDMSTM(ARM* cpu, const bool load, const u8 baseid, const bool writeback, const bool decrement, bool preinc, const bool usermode, const bool thumb) { if (cpu->Num == 1) { @@ -464,7 +464,7 @@ void ReglessLDMSTM(ARM* cpu, const bool load, const u8 baseid, const bool writeb } else { - if (!cpu->DataWrite32(base, cpu->R[15])) + if (!cpu->DataWrite32(base, cpu->R[15] + (thumb ? 
2 : 4))) { cpu->AddCycles_CD(); ((ARMv5*)cpu)->DataAbort(); @@ -500,7 +500,7 @@ void A_LDM(ARM* cpu) if (!(cpu->CurInstr & 0xFFFF)) [[unlikely]] { - ReglessLDMSTM(cpu, true, baseid, cpu->CurInstr & (1<<21), !(cpu->CurInstr & (1<<23)), preinc, cpu->CurInstr & (1<<22)); + ReglessLDMSTM(cpu, true, baseid, cpu->CurInstr & (1<<21), !(cpu->CurInstr & (1<<23)), preinc, cpu->CurInstr & (1<<22), false); return; } @@ -606,7 +606,7 @@ void A_STM(ARM* cpu) if (!(cpu->CurInstr & 0xFFFF)) [[unlikely]] { - ReglessLDMSTM(cpu, false, baseid, cpu->CurInstr & (1<<21), !(cpu->CurInstr & (1<<23)), preinc, false); + ReglessLDMSTM(cpu, false, baseid, cpu->CurInstr & (1<<21), !(cpu->CurInstr & (1<<23)), preinc, false, false); return; } @@ -804,7 +804,7 @@ void T_PUSH(ARM* cpu) if (!nregs) [[unlikely]] { - ReglessLDMSTM(cpu, false, 13, true, true, true, false); + ReglessLDMSTM(cpu, false, 13, true, true, true, false, true); return; } @@ -850,7 +850,7 @@ void T_POP(ARM* cpu) if (!(cpu->CurInstr & 0x1FF)) [[unlikely]] { - ReglessLDMSTM(cpu, true, 13, true, false, false, false); + ReglessLDMSTM(cpu, true, 13, true, false, false, false, true); return; } @@ -902,7 +902,7 @@ void T_STMIA(ARM* cpu) if (!(cpu->CurInstr & 0xFF)) [[unlikely]] { - ReglessLDMSTM(cpu, false, (cpu->CurInstr >> 8) & 0x7, true, false, false, false); + ReglessLDMSTM(cpu, false, (cpu->CurInstr >> 8) & 0x7, true, false, false, false, true); return; } @@ -938,7 +938,7 @@ void T_LDMIA(ARM* cpu) if (!(cpu->CurInstr & 0xFF)) [[unlikely]] { - ReglessLDMSTM(cpu, true, (cpu->CurInstr >> 8) & 0x7, true, false, false, false); + ReglessLDMSTM(cpu, true, (cpu->CurInstr >> 8) & 0x7, true, false, false, false, true); return; } From 7fb18b11552374df2cd51454cca7ce8fbdc583bc Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Mon, 23 Sep 2024 20:03:58 -0400 Subject: [PATCH 140/306] clean up code --- src/ARMInterpreter_LoadStore.cpp | 28 +++++++--------------------- 1 file changed, 7 insertions(+), 21 
deletions(-) diff --git a/src/ARMInterpreter_LoadStore.cpp b/src/ARMInterpreter_LoadStore.cpp index e8e6accc..59b9bc30 100644 --- a/src/ARMInterpreter_LoadStore.cpp +++ b/src/ARMInterpreter_LoadStore.cpp @@ -450,30 +450,16 @@ void ReglessLDMSTM(ARM* cpu, const bool load, const u8 baseid, const bool writeb if (load) { u32 pc; - if (cpu->DataRead32(base, &pc)) - { - cpu->AddCycles_CDI(); - cpu->JumpTo(pc, usermode); // checkme can we restore cpsr? - } - else - { - cpu->AddCycles_CDI(); - ((ARMv5*)cpu)->DataAbort(); - return; - } + cpu->DataRead32(base, &pc); + + cpu->AddCycles_CDI(); + cpu->JumpTo(pc, usermode); } else { - if (!cpu->DataWrite32(base, cpu->R[15] + (thumb ? 2 : 4))) - { - cpu->AddCycles_CD(); - ((ARMv5*)cpu)->DataAbort(); - return; - } - else - { - cpu->AddCycles_CD(); - } + cpu->DataWrite32(base, cpu->R[15] + (thumb ? 2 : 4)); + + cpu->AddCycles_CD(); } } else From e1d4fbef750ce25e0b0c2c3f0f69bef7b5c79e85 Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Tue, 24 Sep 2024 09:47:32 -0400 Subject: [PATCH 141/306] i can't reproduce this anymore --- src/ARM.cpp | 26 -------------------------- src/ARM.h | 3 --- 2 files changed, 29 deletions(-) diff --git a/src/ARM.cpp b/src/ARM.cpp index 6518b751..c194cc71 100644 --- a/src/ARM.cpp +++ b/src/ARM.cpp @@ -346,27 +346,6 @@ void ARMv5::JumpTo(u32 addr, bool restorecpsr) NDS.MonitorARM9Jump(addr); } -void ARMv5::JumpTo8_16Bit(const u32 addr) -{ - // 8 and 16 loads (signed included) to pc - if (!(CP15Control & 0x1)) - { - // if the pu is disabled it behaves like a normal jump - JumpTo((CP15Control & (1<<15)) ? 
(addr & ~0x1) : addr); - } - else - { - if (addr & 0x3) - { - // if the pu is enabled it will always prefetch abort if not word aligned - // although it will still attempt (and fail) to enter thumb mode if enabled - if ((addr & 0x1) && !(CP15Control & (1<<15))) CPSR |= 0x20; - PrefetchAbort(); - } - else JumpTo(addr); - } -} - void ARMv4::JumpTo(u32 addr, bool restorecpsr) { if (restorecpsr) @@ -411,11 +390,6 @@ void ARMv4::JumpTo(u32 addr, bool restorecpsr) } } -void ARMv4::JumpTo8_16Bit(const u32 addr) -{ - JumpTo(addr & ~1); // checkme? -} - void ARM::RestoreCPSR() { u32 oldcpsr = CPSR; diff --git a/src/ARM.h b/src/ARM.h index 26080b51..e7156d72 100644 --- a/src/ARM.h +++ b/src/ARM.h @@ -75,7 +75,6 @@ public: virtual void FillPipeline() = 0; virtual void JumpTo(u32 addr, bool restorecpsr = false) = 0; - virtual void JumpTo8_16Bit(u32 addr) = 0; void RestoreCPSR(); void Halt(u32 halt) @@ -244,7 +243,6 @@ public: void FillPipeline() override; void JumpTo(u32 addr, bool restorecpsr = false) override; - void JumpTo8_16Bit(const u32 addr) override; void PrefetchAbort(); void DataAbort(); @@ -388,7 +386,6 @@ public: void FillPipeline() override; void JumpTo(u32 addr, bool restorecpsr = false) override; - void JumpTo8_16Bit(const u32 addr) override; template void Execute(); From 3065141ed751778523876b92c54f9b89c33becec Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Tue, 24 Sep 2024 17:03:18 -0400 Subject: [PATCH 142/306] probably not faster --- src/ARM.cpp | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/src/ARM.cpp b/src/ARM.cpp index c194cc71..ac3fe200 100644 --- a/src/ARM.cpp +++ b/src/ARM.cpp @@ -697,13 +697,12 @@ void ARMv5::Execute() { ARMInterpreter::A_BLX_IMM(this); } - else + else if ((CurInstr & 0x0FF000F0) == 0x01200070) { - if ((((CurInstr >> 4) & 0xF) | ((CurInstr >> 16) & 0xFF0)) == 0x127) - ARMInterpreter::A_BKPT(this); // always passes regardless of condition code - else - AddCycles_C(); 
+ ARMInterpreter::A_BKPT(this); // always passes regardless of condition code } + else + AddCycles_C(); } // TODO optimize this shit!!! From a11208ec6db98722579c047a78717408db1463be Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Tue, 24 Sep 2024 21:02:17 -0400 Subject: [PATCH 143/306] oops --- src/ARM.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/ARM.cpp b/src/ARM.cpp index ac3fe200..f97c26e2 100644 --- a/src/ARM.cpp +++ b/src/ARM.cpp @@ -523,9 +523,11 @@ void ARM::TriggerIRQ() UpdateMode(oldcpsr, CPSR); R_IRQ[2] = oldcpsr; +#ifdef JIT_ENABLED if constexpr (mode == CPUExecuteMode::JIT) R[14] = R[15] + (oldcpsr & 0x20 ? 2 : 0); else +#endif R[14] = R[15] - (oldcpsr & 0x20 ? 0 : 4); JumpTo(ExceptionBase + 0x18); From 6e30cf3bfb015a7372cbc24074adca2b526ea30c Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Tue, 8 Oct 2024 17:18:09 -0400 Subject: [PATCH 144/306] functional write buffer prototype --- src/ARM.cpp | 7 +- src/ARM.h | 13 ++ src/CP15.cpp | 506 +++++++++++++++++++++++++++++++++++++++------------ 3 files changed, 406 insertions(+), 120 deletions(-) diff --git a/src/ARM.cpp b/src/ARM.cpp index d547174e..7072978d 100644 --- a/src/ARM.cpp +++ b/src/ARM.cpp @@ -208,6 +208,9 @@ void ARMv5::Reset() Store = false; InterlockMask = 0; + WBWritePointer = 16; + WBFillPointer = 0; + ARM::Reset(); } @@ -609,6 +612,7 @@ void ARMv5::Execute() else { NDS.ARM9Timestamp = NDS.ARM9Target; + WriteBufferCheck(); return; } } @@ -742,6 +746,7 @@ void ARMv5::Execute() //NDS.ARM9Timestamp += Cycles; //Cycles = 0; } + WriteBufferCheck(); if (Halted == 2) Halted = 0; @@ -757,7 +762,7 @@ void ARMv4::Execute() { if constexpr (mode == CPUExecuteMode::InterpreterGDB) GdbCheckB(); - + if (Halted) { if (Halted == 2) diff --git a/src/ARM.h b/src/ARM.h index 569f936a..12675023 100644 --- a/src/ARM.h +++ b/src/ARM.h @@ -291,11 +291,13 @@ public: void AddCycles_CDI() override { AddCycles_MW(DataCycles); 
+ DataCycles = 0; } void AddCycles_CD() override { AddCycles_MW(DataCycles); + DataCycles = 0; } void GetCodeMemRegion(u32 addr, MemRegion* region); @@ -314,6 +316,10 @@ public: void ICacheLookup(u32 addr); void ICacheInvalidateByAddr(u32 addr); void ICacheInvalidateAll(); + + void WriteBufferCheck(); + void WriteBufferWrite(u32 val, u8 flag, u8 cycles, u32 addr = 0); + void WriteBufferDrain(); void CP15Write(u32 id, u32 val); u32 CP15Read(u32 id) const; @@ -369,6 +375,13 @@ public: bool Store; u16 InterlockMask; + u8 WBWritePointer; + u8 WBFillPointer; + u32 WBAddr; // current working address for the write buffer + u32 storeaddr[16]; // debugging + u64 WBCycles[16]; // timestamp each write will complete + u64 WriteBufferFifo[16]; // 0-31: value | 62-63: 0 byte, 1 half, 2 word, 3 addr + #ifdef GDBSTUB_ENABLED u32 ReadMem(u32 addr, int size) override; void WriteMem(u32 addr, int size, u32 v) override; diff --git a/src/CP15.cpp b/src/CP15.cpp index c83c5a49..613c1bd5 100644 --- a/src/CP15.cpp +++ b/src/CP15.cpp @@ -172,13 +172,13 @@ void ARMv5::UpdatePURegion(u32 n) if (CP15Control & (1<<2)) { datacache = (PU_DataCacheable >> n) & 0x1; - datawrite = (PU_DataCacheWrite >> n) & 0x1; } else { - datacache = 0; - datawrite = 0; + datacache = false; } + + datawrite = (PU_DataCacheWrite >> n) & 0x1; u32 rgn = PU_Region[n]; if (!(rgn & (1<<0))) @@ -224,12 +224,12 @@ void ARMv5::UpdatePURegion(u32 n) { privmask |= 0x10; usermask |= 0x10; - - if (datawrite & 0x1) - { - privmask |= 0x20; - usermask |= 0x20; - } + } + + if (datawrite & 0x1) + { + privmask |= 0x20; + usermask |= 0x20; } if (codecache & 0x1) @@ -438,6 +438,223 @@ void ARMv5::ICacheInvalidateAll() ICacheTags[i] = 1; } +void ARMv5::WriteBufferCheck() +{ + if (WBWritePointer == 16) return; + + while (WBCycles[WBWritePointer] <= (NDS.ARM9Timestamp + DataCycles)) + { + //printf("drainingwb %lli, %i %08X %i\n", WBCycles[WBWritePointer], WBWritePointer, WBAddr, WriteBufferFifo[WBWritePointer] >> 62); + switch 
((u64)WriteBufferFifo[WBWritePointer] >> 62) + { + case 0: // byte + { + u8 val = WriteBufferFifo[WBWritePointer] & 0xFF; + if (WBAddr < ITCMSize) + { + *(u8*)&ITCM[WBAddr & (ITCMPhysicalSize - 1)] = val; + NDS.JIT.CheckAndInvalidate<0, ARMJIT_Memory::memregion_ITCM>(WBAddr); + } + else if ((WBAddr & DTCMMask) == DTCMBase) *(u8*)&DTCM[WBAddr & (DTCMPhysicalSize - 1)] = val; + else BusWrite8(storeaddr[WBWritePointer], val); + if (WBAddr != storeaddr[WBWritePointer]) printf("ERROR!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n"); + break; + } + case 1: // halfword + { + u16 val = WriteBufferFifo[WBWritePointer] & 0xFFFF; + if (WBAddr < ITCMSize) + { + *(u16*)&ITCM[WBAddr & (ITCMPhysicalSize - 2)] = val; + NDS.JIT.CheckAndInvalidate<0, ARMJIT_Memory::memregion_ITCM>(WBAddr); + } + else if ((WBAddr & DTCMMask) == DTCMBase) *(u16*)&DTCM[WBAddr & (DTCMPhysicalSize - 2)] = val; + else BusWrite16(storeaddr[WBWritePointer], val); + if (WBAddr != storeaddr[WBWritePointer]) printf("ERROR!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n"); + break; + } + case 2: // word + { + u32 val = WriteBufferFifo[WBWritePointer] & 0xFFFFFFFF; + if (WBAddr < ITCMSize) + { + *(u32*)&ITCM[WBAddr & (ITCMPhysicalSize - 4)] = val; + NDS.JIT.CheckAndInvalidate<0, ARMJIT_Memory::memregion_ITCM>(WBAddr); + } + else if ((WBAddr & DTCMMask) == DTCMBase) *(u32*)&DTCM[WBAddr & (DTCMPhysicalSize - 4)] = val; + else BusWrite32(storeaddr[WBWritePointer], val); + if (WBAddr != storeaddr[WBWritePointer]) printf("ERROR!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n"); + WBAddr += 4; + break; + } + case 3: // address update + WBAddr = WriteBufferFifo[WBWritePointer] & 0xFFFFFFFF; + break; + } + + WBWritePointer = (WBWritePointer + 1) & 0xF; + if (WBWritePointer == WBFillPointer) + { + WBWritePointer = 16; + WBFillPointer = 0; + break; + } + } +} + +void ARMv5::WriteBufferWrite(u32 val, u8 flag, u8 cycles, u32 addr) +{ + WriteBufferCheck(); + + if (WBFillPointer == WBWritePointer) + { + //printf("forcedrainingwb %lli, %i %08X %i\n", 
WBCycles[WBWritePointer], WBWritePointer, WBAddr, WriteBufferFifo[WBWritePointer] >> 62); + if (NDS.ARM9Timestamp < WBCycles[WBWritePointer]) + { + NDS.ARM9Timestamp = WBCycles[WBWritePointer]; + DataCycles = 0; // checkme + } + + switch ((u64)WriteBufferFifo[WBWritePointer] >> 62) + { + case 0: // byte + { + u8 val = WriteBufferFifo[WBWritePointer] & 0xFF; + if (WBAddr < ITCMSize) + { + *(u8*)&ITCM[WBAddr & (ITCMPhysicalSize - 1)] = val; + NDS.JIT.CheckAndInvalidate<0, ARMJIT_Memory::memregion_ITCM>(WBAddr); + } + else if ((WBAddr & DTCMMask) == DTCMBase) *(u8*)&DTCM[WBAddr & (DTCMPhysicalSize - 1)] = val; + else BusWrite8(storeaddr[WBWritePointer], val); + if (WBAddr != storeaddr[WBWritePointer]) printf("ERROR!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n"); + break; + } + case 1: // halfword + { + u16 val = WriteBufferFifo[WBWritePointer] & 0xFFFF; + if (WBAddr < ITCMSize) + { + *(u16*)&ITCM[WBAddr & (ITCMPhysicalSize - 2)] = val; + NDS.JIT.CheckAndInvalidate<0, ARMJIT_Memory::memregion_ITCM>(WBAddr); + } + else if ((WBAddr & DTCMMask) == DTCMBase) *(u16*)&DTCM[WBAddr & (DTCMPhysicalSize - 2)] = val; + else BusWrite16(storeaddr[WBWritePointer], val); + if (WBAddr != storeaddr[WBWritePointer]) printf("ERROR!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n"); + break; + } + case 2: // word + { + u32 val = WriteBufferFifo[WBWritePointer] & 0xFFFFFFFF; + if (WBAddr < ITCMSize) + { + *(u32*)&ITCM[WBAddr & (ITCMPhysicalSize - 4)] = val; + NDS.JIT.CheckAndInvalidate<0, ARMJIT_Memory::memregion_ITCM>(WBAddr); + } + else if ((WBAddr & DTCMMask) == DTCMBase) *(u32*)&DTCM[WBAddr & (DTCMPhysicalSize - 4)] = val; + else BusWrite32(storeaddr[WBWritePointer], val); + if (WBAddr != storeaddr[WBWritePointer]) printf("ERROR!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n"); + WBAddr += 4; + break; + } + case 3: // address update + WBAddr = WriteBufferFifo[WBWritePointer] & 0xFFFFFFFF; + break; + } + + WBWritePointer = (WBWritePointer + 1) & 0xF; + if (WBWritePointer == WBFillPointer) + { + WBWritePointer = 16; + 
WBFillPointer = 0; + } + } + + //printf("fillingwb %lli %i %i %08X %i\n", NDS.ARM9Timestamp, WBWritePointer, WBFillPointer, val, flag); + if (WBWritePointer == 16) + { + WBCycles[WBFillPointer] = NDS.ARM9Timestamp + DataCycles + cycles; + WBWritePointer = 0; + } + else + { + WBCycles[WBFillPointer] = WBCycles[(WBFillPointer-1) & 0xF] + cycles; + } + WriteBufferFifo[WBFillPointer] = val | (u64)flag << 62; + storeaddr[WBFillPointer] = addr; + WBFillPointer = (WBFillPointer + 1) & 0xF; +} + +void ARMv5::WriteBufferDrain() +{ + if (WBWritePointer == 16) return; + + while (true) + { + //printf("fullydrainingwb %lli, %i %08X %i\n", WBCycles[WBWritePointer], WBWritePointer, WBAddr, WriteBufferFifo[WBWritePointer] >> 62); + if (NDS.ARM9Timestamp < WBCycles[WBWritePointer]) + { + NDS.ARM9Timestamp = WBCycles[WBWritePointer]; + DataCycles = 0; // checkme + } + + switch (WriteBufferFifo[WBWritePointer] >> 62) + { + case 0: // byte + { + u8 val = WriteBufferFifo[WBWritePointer] & 0xFF; + if (WBAddr < ITCMSize) + { + *(u8*)&ITCM[WBAddr & (ITCMPhysicalSize - 1)] = val; + NDS.JIT.CheckAndInvalidate<0, ARMJIT_Memory::memregion_ITCM>(WBAddr); + } + else if ((WBAddr & DTCMMask) == DTCMBase) *(u8*)&DTCM[WBAddr & (DTCMPhysicalSize - 1)] = val; + else BusWrite8(storeaddr[WBWritePointer], val); + if (WBAddr != storeaddr[WBWritePointer]) printf("ERROR!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n"); + break; + } + case 1: // halfword + { + u16 val = WriteBufferFifo[WBWritePointer] & 0xFFFF; + if (WBAddr < ITCMSize) + { + *(u16*)&ITCM[WBAddr & (ITCMPhysicalSize - 2)] = val; + NDS.JIT.CheckAndInvalidate<0, ARMJIT_Memory::memregion_ITCM>(WBAddr); + } + else if ((WBAddr & DTCMMask) == DTCMBase) *(u16*)&DTCM[WBAddr & (DTCMPhysicalSize - 2)] = val; + else BusWrite16(storeaddr[WBWritePointer], val); + if (WBAddr != storeaddr[WBWritePointer]) printf("ERROR!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n"); + break; + } + case 2: // word + { + u32 val = WriteBufferFifo[WBWritePointer] & 0xFFFFFFFF; + if (WBAddr < 
ITCMSize) + { + *(u32*)&ITCM[WBAddr & (ITCMPhysicalSize - 4)] = val; + NDS.JIT.CheckAndInvalidate<0, ARMJIT_Memory::memregion_ITCM>(WBAddr); + } + else if ((WBAddr & DTCMMask) == DTCMBase) *(u32*)&DTCM[WBAddr & (DTCMPhysicalSize - 4)] = val; + else BusWrite32(storeaddr[WBWritePointer], val); + if (WBAddr != storeaddr[WBWritePointer]) printf("ERROR!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n"); + WBAddr += 4; + break; + } + case 3: // address update + WBAddr = WriteBufferFifo[WBWritePointer] & 0xFFFFFFFF; + break; + } + + WBWritePointer = (WBWritePointer + 1) & 0xF; + if (WBWritePointer == WBFillPointer) + { + WBWritePointer = 16; + WBFillPointer = 0; + break; + } + } + //printf("wbdrained\n"); +} void ARMv5::CP15Write(u32 id, u32 val) { @@ -788,6 +1005,9 @@ u32 ARMv5::CodeRead32(u32 addr, bool branch) return 0; } + if ((PU_Map[addr>>12] & 0x30)) + WriteBufferDrain(); + if (addr < ITCMSize) { CodeCycles = 1; @@ -837,6 +1057,9 @@ bool ARMv5::DataRead8(u32 addr, u32* val) DataCycles = 1; return false; } + + if ((PU_Map[addr>>12] & 0x30)) + WriteBufferDrain(); if (addr < ITCMSize) { @@ -882,6 +1105,8 @@ bool ARMv5::DataRead16(u32 addr, u32* val) } addr &= ~1; + if ((PU_Map[addr>>CP15_MAP_ENTRYSIZE_LOG2] & (CP15_MAP_DCACHEWRITEBACK | CP15_MAP_DCACHEABLE))) + WriteBufferDrain(); if (addr < ITCMSize) { @@ -928,6 +1153,9 @@ bool ARMv5::DataRead32(u32 addr, u32* val) addr &= ~3; + if ((PU_Map[addr>>12] & 0x30)) + WriteBufferDrain(); + if (addr < ITCMSize) { DataCycles = 1; @@ -972,6 +1200,9 @@ bool ARMv5::DataRead32S(u32 addr, u32* val) addr &= ~3; + if ((PU_Map[addr>>12] & 0x30)) + WriteBufferDrain(); + if (addr < ITCMSize) { DataCycles += 1; @@ -1016,37 +1247,46 @@ bool ARMv5::DataWrite8(u32 addr, u8 val) return false; } - if (addr < ITCMSize) + if (!(PU_Map[addr>>12] & (0x30))) { - DataCycles = 1; - ITCMTimestamp = NDS.ARM9Timestamp + DataCycles; - DataRegion = Mem9_ITCM; - *(u8*)&ITCM[addr & (ITCMPhysicalSize - 1)] = val; - NDS.JIT.CheckAndInvalidate<0, 
ARMJIT_Memory::memregion_ITCM>(addr); - return true; - } - if ((addr & DTCMMask) == DTCMBase) - { - DataCycles = 1; - DataRegion = Mem9_DTCM; - *(u8*)&DTCM[addr & (DTCMPhysicalSize - 1)] = val; - return true; - } - - NDS.ARM9Timestamp = NDS.ARM9Timestamp + ((1<(addr); + return true; + } + if ((addr & DTCMMask) == DTCMBase) + { + DataCycles = 1; + DataRegion = Mem9_DTCM; + *(u8*)&DTCM[addr & (DTCMPhysicalSize - 1)] = val; + return true; + } - DataCycles = MemTimings[addr >> 12][1]; + NDS.ARM9Timestamp = NDS.ARM9Timestamp + ((1<> 12][1]; - if ((addr >> 24) == 0x02) - { - if (NDS.ARM9Timestamp < MainRAMTimestamp) NDS.ARM9Timestamp = MainRAMTimestamp; - DataRegion = Mem9_MainRAM; - MainRAMTimestamp = NDS.ARM9Timestamp + DataCycles; - DataCycles -= (2<> 24) == 0x02) + { + if (NDS.ARM9Timestamp < MainRAMTimestamp) NDS.ARM9Timestamp = MainRAMTimestamp; + DataRegion = Mem9_MainRAM; + MainRAMTimestamp = NDS.ARM9Timestamp + DataCycles; + DataCycles -= (2<>14]; + + BusWrite8(addr, val); + } + else + { + DataCycles = 1; + WriteBufferWrite(addr, 3, 1); + WriteBufferWrite(val, 0, MemTimings[addr >> 12][1], addr); } - else DataRegion = NDS.ARM9Regions[addr>>14]; - - BusWrite8(addr, val); return true; } @@ -1063,37 +1303,47 @@ bool ARMv5::DataWrite16(u32 addr, u16 val) addr &= ~1; - if (addr < ITCMSize) - { - DataCycles = 1; - ITCMTimestamp = NDS.ARM9Timestamp + DataCycles; - DataRegion = Mem9_ITCM; - *(u16*)&ITCM[addr & (ITCMPhysicalSize - 1)] = val; - NDS.JIT.CheckAndInvalidate<0, ARMJIT_Memory::memregion_ITCM>(addr); - return true; - } - if ((addr & DTCMMask) == DTCMBase) - { - DataCycles = 1; - DataRegion = Mem9_DTCM; - *(u16*)&DTCM[addr & (DTCMPhysicalSize - 1)] = val; - return true; - } - NDS.ARM9Timestamp = NDS.ARM9Timestamp + ((1<>12] & 0x30)) + { + if (addr < ITCMSize) + { + DataCycles = 1; + ITCMTimestamp = NDS.ARM9Timestamp + DataCycles; + DataRegion = Mem9_ITCM; + *(u16*)&ITCM[addr & (ITCMPhysicalSize - 1)] = val; + NDS.JIT.CheckAndInvalidate<0, 
ARMJIT_Memory::memregion_ITCM>(addr); + return true; + } + if ((addr & DTCMMask) == DTCMBase) + { + DataCycles = 1; + DataRegion = Mem9_DTCM; + *(u16*)&DTCM[addr & (DTCMPhysicalSize - 1)] = val; + return true; + } - DataCycles = MemTimings[addr >> 12][1]; + NDS.ARM9Timestamp = NDS.ARM9Timestamp + ((1<> 12][1]; - if ((addr >> 24) == 0x02) - { - if (NDS.ARM9Timestamp < MainRAMTimestamp) NDS.ARM9Timestamp = MainRAMTimestamp; - DataRegion = Mem9_MainRAM; - MainRAMTimestamp = NDS.ARM9Timestamp + DataCycles; - DataCycles -= (2<> 24) == 0x02) + { + if (NDS.ARM9Timestamp < MainRAMTimestamp) NDS.ARM9Timestamp = MainRAMTimestamp; + DataRegion = Mem9_MainRAM; + MainRAMTimestamp = NDS.ARM9Timestamp + DataCycles; + DataCycles -= (2<>14]; + + BusWrite16(addr, val); + } + else + { + DataCycles = 1; + WriteBufferWrite(addr, 3, 1); + WriteBufferWrite(val, 1, MemTimings[addr >> 12][1], addr); } - else DataRegion = NDS.ARM9Regions[addr>>14]; - - BusWrite16(addr, val); return true; } @@ -1110,37 +1360,47 @@ bool ARMv5::DataWrite32(u32 addr, u32 val) addr &= ~3; - if (addr < ITCMSize) - { - DataCycles = 1; - ITCMTimestamp = NDS.ARM9Timestamp + DataCycles; - DataRegion = Mem9_ITCM; - *(u32*)&ITCM[addr & (ITCMPhysicalSize - 1)] = val; - NDS.JIT.CheckAndInvalidate<0, ARMJIT_Memory::memregion_ITCM>(addr); - return true; - } - if ((addr & DTCMMask) == DTCMBase) - { - DataCycles = 1; - DataRegion = Mem9_DTCM; - *(u32*)&DTCM[addr & (DTCMPhysicalSize - 1)] = val; - return true; - } - NDS.ARM9Timestamp = NDS.ARM9Timestamp + ((1<>12] & 0x30)) + { + if (addr < ITCMSize) + { + DataCycles = 1; + ITCMTimestamp = NDS.ARM9Timestamp + DataCycles; + DataRegion = Mem9_ITCM; + *(u32*)&ITCM[addr & (ITCMPhysicalSize - 1)] = val; + NDS.JIT.CheckAndInvalidate<0, ARMJIT_Memory::memregion_ITCM>(addr); + return true; + } + if ((addr & DTCMMask) == DTCMBase) + { + DataCycles = 1; + DataRegion = Mem9_DTCM; + *(u32*)&DTCM[addr & (DTCMPhysicalSize - 1)] = val; + return true; + } - DataCycles = MemTimings[addr >> 
12][2]; + NDS.ARM9Timestamp = NDS.ARM9Timestamp + ((1<> 12][2]; - if ((addr >> 24) == 0x02) - { - if (NDS.ARM9Timestamp < MainRAMTimestamp) NDS.ARM9Timestamp = MainRAMTimestamp; - DataRegion = Mem9_MainRAM; - MainRAMTimestamp = NDS.ARM9Timestamp + DataCycles; - DataCycles -= (2<> 24) == 0x02) + { + if (NDS.ARM9Timestamp < MainRAMTimestamp) NDS.ARM9Timestamp = MainRAMTimestamp; + DataRegion = Mem9_MainRAM; + MainRAMTimestamp = NDS.ARM9Timestamp + DataCycles; + DataCycles -= (2<>14]; + + BusWrite32(addr, val); + } + else + { + DataCycles = 1; + WriteBufferWrite(addr, 3, 1); + WriteBufferWrite(val, 2, MemTimings[addr >> 12][2], addr); } - else DataRegion = NDS.ARM9Regions[addr>>14]; - - BusWrite32(addr, val); return true; } @@ -1156,38 +1416,46 @@ bool ARMv5::DataWrite32S(u32 addr, u32 val) addr &= ~3; - if (addr < ITCMSize) + + if (!(PU_Map[addr>>12] & 0x30)) + { + if (addr < ITCMSize) + { + DataCycles += 1; + ITCMTimestamp = NDS.ARM9Timestamp + DataCycles; + DataRegion = Mem9_ITCM; + *(u32*)&ITCM[addr & (ITCMPhysicalSize - 1)] = val; + #ifdef JIT_ENABLED + NDS.JIT.CheckAndInvalidate<0, ARMJIT_Memory::memregion_ITCM>(addr); + #endif + return true; + } + if ((addr & DTCMMask) == DTCMBase) + { + DataCycles += 1; + DataRegion = Mem9_DTCM; + *(u32*)&DTCM[addr & (DTCMPhysicalSize - 1)] = val; + return true; + } + + NDS.ARM9Timestamp = NDS.ARM9Timestamp + ((1<> 24) == 0x02) + { + if ((DataRegion != Mem9_MainRAM) && ((NDS.ARM9Timestamp + DataCycles) < MainRAMTimestamp)) NDS.ARM9Timestamp = MainRAMTimestamp - DataCycles; + MainRAMTimestamp = NDS.ARM9Timestamp + DataCycles; + DataRegion = Mem9_MainRAM; + } + else DataRegion = NDS.ARM9Regions[addr>>14]; + + BusWrite32(addr, val); + DataCycles += MemTimings[addr >> 12][3]; + } + else { DataCycles += 1; - ITCMTimestamp = NDS.ARM9Timestamp + DataCycles; - DataRegion = Mem9_ITCM; - *(u32*)&ITCM[addr & (ITCMPhysicalSize - 1)] = val; -#ifdef JIT_ENABLED - NDS.JIT.CheckAndInvalidate<0, ARMJIT_Memory::memregion_ITCM>(addr); -#endif - 
return true; + WriteBufferWrite(val, 2, MemTimings[addr >> 12][3], addr); } - if ((addr & DTCMMask) == DTCMBase) - { - DataCycles += 1; - DataRegion = Mem9_DTCM; - *(u32*)&DTCM[addr & (DTCMPhysicalSize - 1)] = val; - return true; - } - - NDS.ARM9Timestamp = NDS.ARM9Timestamp + ((1<> 24) == 0x02) - { - if ((DataRegion != Mem9_MainRAM) && ((NDS.ARM9Timestamp + DataCycles) < MainRAMTimestamp)) NDS.ARM9Timestamp = MainRAMTimestamp - DataCycles; - MainRAMTimestamp = NDS.ARM9Timestamp + DataCycles; - DataRegion = Mem9_MainRAM; - } - else DataRegion = NDS.ARM9Regions[addr>>14]; - - BusWrite32(addr, val); - DataCycles += MemTimings[addr >> 12][3]; return true; } From 9cf065e54f9555dfd919c6d92f0da946e26d7044 Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Wed, 9 Oct 2024 17:37:25 -0400 Subject: [PATCH 145/306] idk --- src/CP15.cpp | 22 +++++++++------------- 1 file changed, 9 insertions(+), 13 deletions(-) diff --git a/src/CP15.cpp b/src/CP15.cpp index 613c1bd5..d085cf02 100644 --- a/src/CP15.cpp +++ b/src/CP15.cpp @@ -457,7 +457,6 @@ void ARMv5::WriteBufferCheck() } else if ((WBAddr & DTCMMask) == DTCMBase) *(u8*)&DTCM[WBAddr & (DTCMPhysicalSize - 1)] = val; else BusWrite8(storeaddr[WBWritePointer], val); - if (WBAddr != storeaddr[WBWritePointer]) printf("ERROR!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n"); break; } case 1: // halfword @@ -470,7 +469,6 @@ void ARMv5::WriteBufferCheck() } else if ((WBAddr & DTCMMask) == DTCMBase) *(u16*)&DTCM[WBAddr & (DTCMPhysicalSize - 2)] = val; else BusWrite16(storeaddr[WBWritePointer], val); - if (WBAddr != storeaddr[WBWritePointer]) printf("ERROR!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n"); break; } case 2: // word @@ -483,7 +481,6 @@ void ARMv5::WriteBufferCheck() } else if ((WBAddr & DTCMMask) == DTCMBase) *(u32*)&DTCM[WBAddr & (DTCMPhysicalSize - 4)] = val; else BusWrite32(storeaddr[WBWritePointer], val); - if (WBAddr != storeaddr[WBWritePointer]) printf("ERROR!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n"); WBAddr 
+= 4; break; } @@ -527,7 +524,6 @@ void ARMv5::WriteBufferWrite(u32 val, u8 flag, u8 cycles, u32 addr) } else if ((WBAddr & DTCMMask) == DTCMBase) *(u8*)&DTCM[WBAddr & (DTCMPhysicalSize - 1)] = val; else BusWrite8(storeaddr[WBWritePointer], val); - if (WBAddr != storeaddr[WBWritePointer]) printf("ERROR!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n"); break; } case 1: // halfword @@ -540,7 +536,6 @@ void ARMv5::WriteBufferWrite(u32 val, u8 flag, u8 cycles, u32 addr) } else if ((WBAddr & DTCMMask) == DTCMBase) *(u16*)&DTCM[WBAddr & (DTCMPhysicalSize - 2)] = val; else BusWrite16(storeaddr[WBWritePointer], val); - if (WBAddr != storeaddr[WBWritePointer]) printf("ERROR!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n"); break; } case 2: // word @@ -553,7 +548,6 @@ void ARMv5::WriteBufferWrite(u32 val, u8 flag, u8 cycles, u32 addr) } else if ((WBAddr & DTCMMask) == DTCMBase) *(u32*)&DTCM[WBAddr & (DTCMPhysicalSize - 4)] = val; else BusWrite32(storeaddr[WBWritePointer], val); - if (WBAddr != storeaddr[WBWritePointer]) printf("ERROR!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n"); WBAddr += 4; break; } @@ -610,7 +604,6 @@ void ARMv5::WriteBufferDrain() } else if ((WBAddr & DTCMMask) == DTCMBase) *(u8*)&DTCM[WBAddr & (DTCMPhysicalSize - 1)] = val; else BusWrite8(storeaddr[WBWritePointer], val); - if (WBAddr != storeaddr[WBWritePointer]) printf("ERROR!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n"); break; } case 1: // halfword @@ -623,7 +616,6 @@ void ARMv5::WriteBufferDrain() } else if ((WBAddr & DTCMMask) == DTCMBase) *(u16*)&DTCM[WBAddr & (DTCMPhysicalSize - 2)] = val; else BusWrite16(storeaddr[WBWritePointer], val); - if (WBAddr != storeaddr[WBWritePointer]) printf("ERROR!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n"); break; } case 2: // word @@ -636,7 +628,6 @@ void ARMv5::WriteBufferDrain() } else if ((WBAddr & DTCMMask) == DTCMBase) *(u32*)&DTCM[WBAddr & (DTCMPhysicalSize - 4)] = val; else BusWrite32(storeaddr[WBWritePointer], val); - if (WBAddr != storeaddr[WBWritePointer]) 
printf("ERROR!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n"); WBAddr += 4; break; } @@ -1218,8 +1209,11 @@ bool ARMv5::DataRead32S(u32 addr, u32* val) *val = *(u32*)&DTCM[addr & (DTCMPhysicalSize - 1)]; return true; } - + NDS.ARM9Timestamp += DataCycles; + + if (!(addr & 0x3FF)) return DataRead32(addr, val); // bursts cannot cross a 1kb boundary + DataCycles = MemTimings[addr >> 12][3]; NDS.ARM9Timestamp = NDS.ARM9Timestamp + ((1<> 24) == 0x02) { if ((DataRegion != Mem9_MainRAM) && ((NDS.ARM9Timestamp + DataCycles) < MainRAMTimestamp)) NDS.ARM9Timestamp = MainRAMTimestamp - DataCycles; From 35c382acabac3084391b83e3be85db46b974e83f Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Wed, 9 Oct 2024 17:51:00 -0400 Subject: [PATCH 146/306] jit --- src/CP15.cpp | 30 +++++++++++++++++++++++++++--- 1 file changed, 27 insertions(+), 3 deletions(-) diff --git a/src/CP15.cpp b/src/CP15.cpp index d085cf02..c49034af 100644 --- a/src/CP15.cpp +++ b/src/CP15.cpp @@ -453,7 +453,9 @@ void ARMv5::WriteBufferCheck() if (WBAddr < ITCMSize) { *(u8*)&ITCM[WBAddr & (ITCMPhysicalSize - 1)] = val; +#ifdef JIT_ENABLED NDS.JIT.CheckAndInvalidate<0, ARMJIT_Memory::memregion_ITCM>(WBAddr); +#endif } else if ((WBAddr & DTCMMask) == DTCMBase) *(u8*)&DTCM[WBAddr & (DTCMPhysicalSize - 1)] = val; else BusWrite8(storeaddr[WBWritePointer], val); @@ -465,7 +467,9 @@ void ARMv5::WriteBufferCheck() if (WBAddr < ITCMSize) { *(u16*)&ITCM[WBAddr & (ITCMPhysicalSize - 2)] = val; +#ifdef JIT_ENABLED NDS.JIT.CheckAndInvalidate<0, ARMJIT_Memory::memregion_ITCM>(WBAddr); +#endif } else if ((WBAddr & DTCMMask) == DTCMBase) *(u16*)&DTCM[WBAddr & (DTCMPhysicalSize - 2)] = val; else BusWrite16(storeaddr[WBWritePointer], val); @@ -477,7 +481,9 @@ void ARMv5::WriteBufferCheck() if (WBAddr < ITCMSize) { *(u32*)&ITCM[WBAddr & (ITCMPhysicalSize - 4)] = val; +#ifdef JIT_ENABLED NDS.JIT.CheckAndInvalidate<0, ARMJIT_Memory::memregion_ITCM>(WBAddr); +#endif } else if ((WBAddr & DTCMMask) == 
DTCMBase) *(u32*)&DTCM[WBAddr & (DTCMPhysicalSize - 4)] = val; else BusWrite32(storeaddr[WBWritePointer], val); @@ -520,7 +526,9 @@ void ARMv5::WriteBufferWrite(u32 val, u8 flag, u8 cycles, u32 addr) if (WBAddr < ITCMSize) { *(u8*)&ITCM[WBAddr & (ITCMPhysicalSize - 1)] = val; +#ifdef JIT_ENABLED NDS.JIT.CheckAndInvalidate<0, ARMJIT_Memory::memregion_ITCM>(WBAddr); +#endif } else if ((WBAddr & DTCMMask) == DTCMBase) *(u8*)&DTCM[WBAddr & (DTCMPhysicalSize - 1)] = val; else BusWrite8(storeaddr[WBWritePointer], val); @@ -532,7 +540,9 @@ void ARMv5::WriteBufferWrite(u32 val, u8 flag, u8 cycles, u32 addr) if (WBAddr < ITCMSize) { *(u16*)&ITCM[WBAddr & (ITCMPhysicalSize - 2)] = val; +#ifdef JIT_ENABLED NDS.JIT.CheckAndInvalidate<0, ARMJIT_Memory::memregion_ITCM>(WBAddr); +#endif } else if ((WBAddr & DTCMMask) == DTCMBase) *(u16*)&DTCM[WBAddr & (DTCMPhysicalSize - 2)] = val; else BusWrite16(storeaddr[WBWritePointer], val); @@ -544,7 +554,9 @@ void ARMv5::WriteBufferWrite(u32 val, u8 flag, u8 cycles, u32 addr) if (WBAddr < ITCMSize) { *(u32*)&ITCM[WBAddr & (ITCMPhysicalSize - 4)] = val; +#ifdef JIT_ENABLED NDS.JIT.CheckAndInvalidate<0, ARMJIT_Memory::memregion_ITCM>(WBAddr); +#endif } else if ((WBAddr & DTCMMask) == DTCMBase) *(u32*)&DTCM[WBAddr & (DTCMPhysicalSize - 4)] = val; else BusWrite32(storeaddr[WBWritePointer], val); @@ -600,7 +612,9 @@ void ARMv5::WriteBufferDrain() if (WBAddr < ITCMSize) { *(u8*)&ITCM[WBAddr & (ITCMPhysicalSize - 1)] = val; +#ifdef JIT_ENABLED NDS.JIT.CheckAndInvalidate<0, ARMJIT_Memory::memregion_ITCM>(WBAddr); +#endif } else if ((WBAddr & DTCMMask) == DTCMBase) *(u8*)&DTCM[WBAddr & (DTCMPhysicalSize - 1)] = val; else BusWrite8(storeaddr[WBWritePointer], val); @@ -612,7 +626,9 @@ void ARMv5::WriteBufferDrain() if (WBAddr < ITCMSize) { *(u16*)&ITCM[WBAddr & (ITCMPhysicalSize - 2)] = val; +#ifdef JIT_ENABLED NDS.JIT.CheckAndInvalidate<0, ARMJIT_Memory::memregion_ITCM>(WBAddr); +#endif } else if ((WBAddr & DTCMMask) == DTCMBase) *(u16*)&DTCM[WBAddr 
& (DTCMPhysicalSize - 2)] = val; else BusWrite16(storeaddr[WBWritePointer], val); @@ -624,7 +640,9 @@ void ARMv5::WriteBufferDrain() if (WBAddr < ITCMSize) { *(u32*)&ITCM[WBAddr & (ITCMPhysicalSize - 4)] = val; +#ifdef JIT_ENABLED NDS.JIT.CheckAndInvalidate<0, ARMJIT_Memory::memregion_ITCM>(WBAddr); +#endif } else if ((WBAddr & DTCMMask) == DTCMBase) *(u32*)&DTCM[WBAddr & (DTCMPhysicalSize - 4)] = val; else BusWrite32(storeaddr[WBWritePointer], val); @@ -1096,7 +1114,7 @@ bool ARMv5::DataRead16(u32 addr, u32* val) } addr &= ~1; - if ((PU_Map[addr>>CP15_MAP_ENTRYSIZE_LOG2] & (CP15_MAP_DCACHEWRITEBACK | CP15_MAP_DCACHEABLE))) + if ((PU_Map[addr>>12] & 0x30)) WriteBufferDrain(); if (addr < ITCMSize) @@ -1249,7 +1267,9 @@ bool ARMv5::DataWrite8(u32 addr, u8 val) ITCMTimestamp = NDS.ARM9Timestamp + DataCycles; DataRegion = Mem9_ITCM; *(u8*)&ITCM[addr & (ITCMPhysicalSize - 1)] = val; +#ifdef JIT_ENABLED NDS.JIT.CheckAndInvalidate<0, ARMJIT_Memory::memregion_ITCM>(addr); +#endif return true; } if ((addr & DTCMMask) == DTCMBase) @@ -1306,7 +1326,9 @@ bool ARMv5::DataWrite16(u32 addr, u16 val) ITCMTimestamp = NDS.ARM9Timestamp + DataCycles; DataRegion = Mem9_ITCM; *(u16*)&ITCM[addr & (ITCMPhysicalSize - 1)] = val; +#ifdef JIT_ENABLED NDS.JIT.CheckAndInvalidate<0, ARMJIT_Memory::memregion_ITCM>(addr); +#endif return true; } if ((addr & DTCMMask) == DTCMBase) @@ -1363,7 +1385,9 @@ bool ARMv5::DataWrite32(u32 addr, u32 val) ITCMTimestamp = NDS.ARM9Timestamp + DataCycles; DataRegion = Mem9_ITCM; *(u32*)&ITCM[addr & (ITCMPhysicalSize - 1)] = val; +#ifdef JIT_ENABLED NDS.JIT.CheckAndInvalidate<0, ARMJIT_Memory::memregion_ITCM>(addr); +#endif return true; } if ((addr & DTCMMask) == DTCMBase) @@ -1419,9 +1443,9 @@ bool ARMv5::DataWrite32S(u32 addr, u32 val) ITCMTimestamp = NDS.ARM9Timestamp + DataCycles; DataRegion = Mem9_ITCM; *(u32*)&ITCM[addr & (ITCMPhysicalSize - 1)] = val; - #ifdef JIT_ENABLED +#ifdef JIT_ENABLED NDS.JIT.CheckAndInvalidate<0, 
ARMJIT_Memory::memregion_ITCM>(addr); - #endif +#endif return true; } if ((addr & DTCMMask) == DTCMBase) From f2bc0fae4d67d02cf8d63941b2e227b7b476d7e3 Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Wed, 9 Oct 2024 18:05:26 -0400 Subject: [PATCH 147/306] cache imp --- src/CP15.cpp | 28 ++++++++++++++++++++++++---- 1 file changed, 24 insertions(+), 4 deletions(-) diff --git a/src/CP15.cpp b/src/CP15.cpp index e213e112..bb228404 100644 --- a/src/CP15.cpp +++ b/src/CP15.cpp @@ -385,6 +385,7 @@ u32 ARMv5::ICacheLookup(const u32 addr) // Disabled ICACHE Streaming: // retreive the data from memory, even if the data was cached // See arm946e-s Rev 1 technical manual, 2.3.15 "Register 15, test State Register") + WriteBufferDrain(); CodeCycles = NDS.ARM9MemTimings[tag >> 14][2]; if (CodeMem.Mem) { @@ -407,6 +408,7 @@ u32 ARMv5::ICacheLookup(const u32 addr) // BIST test State register (See arm946e-s Rev 1 technical manual, 2.3.15 "Register 15, test State Register") if (CP15BISTTestStateRegister & CP15_BIST_TR_DISABLE_ICACHE_LINEFILL) [[unlikely]] { + WriteBufferDrain(); CodeCycles = NDS.ARM9MemTimings[tag >> 14][2]; if (CodeMem.Mem) { @@ -446,6 +448,8 @@ u32 ARMv5::ICacheLookup(const u32 addr) line += id; u32* ptr = (u32 *)&ICache[line << ICACHE_LINELENGTH_LOG2]; + + WriteBufferDrain(); if (CodeMem.Mem) { @@ -534,6 +538,7 @@ u32 ARMv5::DCacheLookup(const u32 addr) // Disabled DCACHE Streaming: // retreive the data from memory, even if the data was cached // See arm946e-s Rev 1 technical manual, 2.3.15 "Register 15, test State Register") + WriteBufferDrain(); DataCycles = NDS.ARM9MemTimings[tag >> 14][2]; if (addr < ITCMSize) { @@ -560,6 +565,7 @@ u32 ARMv5::DCacheLookup(const u32 addr) // BIST test State register (See arm946e-s Rev 1 technical manual, 2.3.15 "Register 15, test State Register") if (CP15BISTTestStateRegister & CP15_BIST_TR_DISABLE_DCACHE_LINEFILL) [[unlikely]] { + WriteBufferDrain(); DataCycles = NDS.ARM9MemTimings[tag 
>> 14][2]; if (addr < ITCMSize) { @@ -609,6 +615,8 @@ u32 ARMv5::DCacheLookup(const u32 addr) // Datacycles will be incremented by the required cycles to do so DCacheClearByASetAndWay(line & (DCACHE_SETS-1), line >> DCACHE_SETS_LOG2); #endif + + WriteBufferDrain(); //Log(LogLevel::Debug,"DCache miss, load @ %08x\n", tag); for (int i = 0; i < DCACHE_LINELENGTH; i+=sizeof(u32)) { @@ -831,7 +839,13 @@ void ARMv5::DCacheClearByASetAndWay(const u8 cacheSet, const u8 cacheLine) if (DCacheTags[index] & CACHE_FLAG_DIRTY_LOWERHALF) { - //Log(LogLevel::Debug, "Writing back %i / %i, lower half -> %08lx\n", cacheSet, cacheLine, tag); + WriteBufferWrite(tag, 3, 1); + WriteBufferWrite(ptr[0x00], 2, MemTimings[tag >> 12][2], tag+0x00); + WriteBufferWrite(ptr[0x04], 2, MemTimings[tag >> 12][3], tag+0x04); + WriteBufferWrite(ptr[0x08], 2, MemTimings[tag >> 12][3], tag+0x08); + WriteBufferWrite(ptr[0x0C], 2, MemTimings[tag >> 12][3], tag+0x0C); + DataCycles += 5; + /*//Log(LogLevel::Debug, "Writing back %i / %i, lower half -> %08lx\n", cacheSet, cacheLine, tag); for (int i = 0; i < DCACHE_LINELENGTH / 2; i+=sizeof(u32)) { //Log(LogLevel::Debug, " WB Value %08x\n", ptr[i >> 2]); @@ -848,12 +862,12 @@ void ARMv5::DCacheClearByASetAndWay(const u8 cacheSet, const u8 cacheLine) BusWrite32(tag+i, ptr[i >> 2]); } } - DataCycles += (NDS.ARM9MemTimings[tag >> 14][2] + (NDS.ARM9MemTimings[tag >> 14][3] * ((DCACHE_LINELENGTH / 8) - 1))) << NDS.ARM9ClockShift; + DataCycles += (NDS.ARM9MemTimings[tag >> 14][2] + (NDS.ARM9MemTimings[tag >> 14][3] * ((DCACHE_LINELENGTH / 8) - 1))) << NDS.ARM9ClockShift;*/ } if (DCacheTags[index] & CACHE_FLAG_DIRTY_UPPERHALF) { //Log(LogLevel::Debug, "Writing back %i / %i, upper half-> %08lx\n", cacheSet, cacheLine, tag); - for (int i = DCACHE_LINELENGTH / 2; i < DCACHE_LINELENGTH; i+=sizeof(u32)) + /*for (int i = DCACHE_LINELENGTH / 2; i < DCACHE_LINELENGTH; i+=sizeof(u32)) { //Log(LogLevel::Debug, " WB Value %08x\n", ptr[i >> 2]); if (tag+i < ITCMSize) @@ -869,7 
+883,13 @@ void ARMv5::DCacheClearByASetAndWay(const u8 cacheSet, const u8 cacheLine) BusWrite32(tag+i, ptr[i >> 2]); } } - DataCycles += (NDS.ARM9MemTimings[tag >> 14][2] + (NDS.ARM9MemTimings[tag >> 14][3] * ((DCACHE_LINELENGTH / 8) - 1))) << NDS.ARM9ClockShift; + DataCycles += (NDS.ARM9MemTimings[tag >> 14][2] + (NDS.ARM9MemTimings[tag >> 14][3] * ((DCACHE_LINELENGTH / 8) - 1))) << NDS.ARM9ClockShift;*/ + WriteBufferWrite(tag+0x10, 3, 1); + WriteBufferWrite(ptr[0x10], 2, MemTimings[tag >> 12][2], tag+0x10); + WriteBufferWrite(ptr[0x14], 2, MemTimings[tag >> 12][3], tag+0x14); + WriteBufferWrite(ptr[0x18], 2, MemTimings[tag >> 12][3], tag+0x18); + WriteBufferWrite(ptr[0x1C], 2, MemTimings[tag >> 12][3], tag+0x1C); + DataCycles += 5; } DCacheTags[index] &= ~(CACHE_FLAG_DIRTY_LOWERHALF | CACHE_FLAG_DIRTY_UPPERHALF); #endif From 746f6edb0ab7b94f892a927fe810800224d4a71f Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Wed, 9 Oct 2024 18:06:17 -0400 Subject: [PATCH 148/306] should addr writes to the fifo take 1 cycle? probably? 
--- src/CP15.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/CP15.cpp b/src/CP15.cpp index c49034af..acf15695 100644 --- a/src/CP15.cpp +++ b/src/CP15.cpp @@ -1297,7 +1297,7 @@ bool ARMv5::DataWrite8(u32 addr, u8 val) } else { - DataCycles = 1; + DataCycles = 2; WriteBufferWrite(addr, 3, 1); WriteBufferWrite(val, 0, MemTimings[addr >> 12][1], addr); } @@ -1356,7 +1356,7 @@ bool ARMv5::DataWrite16(u32 addr, u16 val) } else { - DataCycles = 1; + DataCycles = 2; WriteBufferWrite(addr, 3, 1); WriteBufferWrite(val, 1, MemTimings[addr >> 12][1], addr); } @@ -1415,7 +1415,7 @@ bool ARMv5::DataWrite32(u32 addr, u32 val) } else { - DataCycles = 1; + DataCycles = 2; WriteBufferWrite(addr, 3, 1); WriteBufferWrite(val, 2, MemTimings[addr >> 12][2], addr); } From 60234a96b8c9acebb082b7de67f46e2b0cca9a63 Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Wed, 9 Oct 2024 18:10:02 -0400 Subject: [PATCH 149/306] im dumb --- src/CP15.cpp | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/src/CP15.cpp b/src/CP15.cpp index d9cba070..a88321f8 100644 --- a/src/CP15.cpp +++ b/src/CP15.cpp @@ -840,10 +840,10 @@ void ARMv5::DCacheClearByASetAndWay(const u8 cacheSet, const u8 cacheLine) if (DCacheTags[index] & CACHE_FLAG_DIRTY_LOWERHALF) { WriteBufferWrite(tag, 3, 1); - WriteBufferWrite(ptr[0x00], 2, MemTimings[tag >> 12][2], tag+0x00); - WriteBufferWrite(ptr[0x04], 2, MemTimings[tag >> 12][3], tag+0x04); - WriteBufferWrite(ptr[0x08], 2, MemTimings[tag >> 12][3], tag+0x08); - WriteBufferWrite(ptr[0x0C], 2, MemTimings[tag >> 12][3], tag+0x0C); + WriteBufferWrite(ptr[0], 2, MemTimings[tag >> 12][2], tag+0x00); + WriteBufferWrite(ptr[1], 2, MemTimings[tag >> 12][3], tag+0x04); + WriteBufferWrite(ptr[2], 2, MemTimings[tag >> 12][3], tag+0x08); + WriteBufferWrite(ptr[3], 2, MemTimings[tag >> 12][3], tag+0x0C); DataCycles += 5; /*//Log(LogLevel::Debug, "Writing back %i / %i, lower half -> 
%08lx\n", cacheSet, cacheLine, tag); for (int i = 0; i < DCACHE_LINELENGTH / 2; i+=sizeof(u32)) @@ -885,10 +885,10 @@ void ARMv5::DCacheClearByASetAndWay(const u8 cacheSet, const u8 cacheLine) } DataCycles += (NDS.ARM9MemTimings[tag >> 14][2] + (NDS.ARM9MemTimings[tag >> 14][3] * ((DCACHE_LINELENGTH / 8) - 1))) << NDS.ARM9ClockShift;*/ WriteBufferWrite(tag+0x10, 3, 1); - WriteBufferWrite(ptr[0x10], 2, MemTimings[tag >> 12][2], tag+0x10); - WriteBufferWrite(ptr[0x14], 2, MemTimings[tag >> 12][3], tag+0x14); - WriteBufferWrite(ptr[0x18], 2, MemTimings[tag >> 12][3], tag+0x18); - WriteBufferWrite(ptr[0x1C], 2, MemTimings[tag >> 12][3], tag+0x1C); + WriteBufferWrite(ptr[4], 2, MemTimings[tag >> 12][2], tag+0x10); + WriteBufferWrite(ptr[5], 2, MemTimings[tag >> 12][3], tag+0x14); + WriteBufferWrite(ptr[6], 2, MemTimings[tag >> 12][3], tag+0x18); + WriteBufferWrite(ptr[7], 2, MemTimings[tag >> 12][3], tag+0x1C); DataCycles += 5; } DCacheTags[index] &= ~(CACHE_FLAG_DIRTY_LOWERHALF | CACHE_FLAG_DIRTY_UPPERHALF); From 2c3ef9f90303d1f2e9a7f6ee987d0981ce22b749 Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Wed, 9 Oct 2024 19:00:01 -0400 Subject: [PATCH 150/306] writing to the write buffer seems to require bus cycle alignment --- src/CP15.cpp | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/src/CP15.cpp b/src/CP15.cpp index acf15695..c0da4711 100644 --- a/src/CP15.cpp +++ b/src/CP15.cpp @@ -1297,7 +1297,8 @@ bool ARMv5::DataWrite8(u32 addr, u8 val) } else { - DataCycles = 2; + NDS.ARM9Timestamp = NDS.ARM9Timestamp + ((1<> 12][1], addr); } @@ -1356,7 +1357,8 @@ bool ARMv5::DataWrite16(u32 addr, u16 val) } else { - DataCycles = 2; + NDS.ARM9Timestamp = NDS.ARM9Timestamp + ((1<> 12][1], addr); } @@ -1415,7 +1417,8 @@ bool ARMv5::DataWrite32(u32 addr, u32 val) } else { - DataCycles = 2; + NDS.ARM9Timestamp = NDS.ARM9Timestamp + ((1<> 12][2], addr); } @@ -1456,7 +1459,7 @@ bool ARMv5::DataWrite32S(u32 addr, u32 
val) return true; } - DataCycles += ((NDS.ARM9Timestamp + ((1<> 12][3], addr); } From e25dca003020cc39de90456c58340b69d0d108f1 Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Thu, 10 Oct 2024 03:14:01 -0400 Subject: [PATCH 151/306] writing to the write buffer has a 1 cycle delay before it can be done again --- src/ARM.cpp | 1 + src/ARM.h | 1 + src/CP15.cpp | 11 +++++++---- 3 files changed, 9 insertions(+), 4 deletions(-) diff --git a/src/ARM.cpp b/src/ARM.cpp index 7072978d..8c74a248 100644 --- a/src/ARM.cpp +++ b/src/ARM.cpp @@ -210,6 +210,7 @@ void ARMv5::Reset() WBWritePointer = 16; WBFillPointer = 0; + WBDelay = 0; ARM::Reset(); } diff --git a/src/ARM.h b/src/ARM.h index 12675023..16f4b72a 100644 --- a/src/ARM.h +++ b/src/ARM.h @@ -377,6 +377,7 @@ public: u8 WBWritePointer; u8 WBFillPointer; + u64 WBDelay; u32 WBAddr; // current working address for the write buffer u32 storeaddr[16]; // debugging u64 WBCycles[16]; // timestamp each write will complete diff --git a/src/CP15.cpp b/src/CP15.cpp index c0da4711..1d082f04 100644 --- a/src/CP15.cpp +++ b/src/CP15.cpp @@ -1297,10 +1297,11 @@ bool ARMv5::DataWrite8(u32 addr, u8 val) } else { - NDS.ARM9Timestamp = NDS.ARM9Timestamp + ((1< NDS.ARM9Timestamp) NDS.ARM9Timestamp = WBDelay; DataCycles = 1; WriteBufferWrite(addr, 3, 1); WriteBufferWrite(val, 0, MemTimings[addr >> 12][1], addr); + WBDelay = NDS.ARM9Timestamp + 2; } return true; } @@ -1357,10 +1358,11 @@ bool ARMv5::DataWrite16(u32 addr, u16 val) } else { - NDS.ARM9Timestamp = NDS.ARM9Timestamp + ((1< NDS.ARM9Timestamp) NDS.ARM9Timestamp = WBDelay; DataCycles = 1; WriteBufferWrite(addr, 3, 1); WriteBufferWrite(val, 1, MemTimings[addr >> 12][1], addr); + WBDelay = NDS.ARM9Timestamp + 2; } return true; } @@ -1417,10 +1419,11 @@ bool ARMv5::DataWrite32(u32 addr, u32 val) } else { - NDS.ARM9Timestamp = NDS.ARM9Timestamp + ((1< NDS.ARM9Timestamp) NDS.ARM9Timestamp = WBDelay; DataCycles = 1; WriteBufferWrite(addr, 3, 1); 
WriteBufferWrite(val, 2, MemTimings[addr >> 12][2], addr); + WBDelay = NDS.ARM9Timestamp + 2; } return true; } @@ -1476,9 +1479,9 @@ bool ARMv5::DataWrite32S(u32 addr, u32 val) } else { - DataCycles += (((NDS.ARM9Timestamp + DataCycles) + ((1<> 12][3], addr); + WBDelay = NDS.ARM9Timestamp + DataCycles + 1; } return true; } From 53b38c363fb820f97c4f44b8601540970f9017cf Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Thu, 10 Oct 2024 03:32:53 -0400 Subject: [PATCH 152/306] ok no it didn't lie to me --- src/ARMInterpreter.cpp | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/ARMInterpreter.cpp b/src/ARMInterpreter.cpp index 72d1e189..1f95c1f8 100644 --- a/src/ARMInterpreter.cpp +++ b/src/ARMInterpreter.cpp @@ -136,7 +136,8 @@ void A_MSR_IMM(ARM* cpu) } } - cpu->AddCycles_C(); + if ((cpu->Num != 1) && (cpu->CurInstr & (0x7<<16))) cpu->AddCycles_CI(2); + else cpu->AddCycles_C(); } void A_MSR_REG(ARM* cpu) @@ -196,7 +197,8 @@ void A_MSR_REG(ARM* cpu) } } - cpu->AddCycles_C(); + if ((cpu->Num != 1) && (cpu->CurInstr & (0x7<<16))) cpu->AddCycles_CI(2); + else cpu->AddCycles_C(); } void A_MRS(ARM* cpu) From 3870216fd06f634c2b0864feea5c4353ca94fd0b Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Thu, 10 Oct 2024 03:53:51 -0400 Subject: [PATCH 153/306] correction: --- src/ARMInterpreter.cpp | 30 ++++++++++++++++++++++++------ 1 file changed, 24 insertions(+), 6 deletions(-) diff --git a/src/ARMInterpreter.cpp b/src/ARMInterpreter.cpp index 1f95c1f8..2b14de73 100644 --- a/src/ARMInterpreter.cpp +++ b/src/ARMInterpreter.cpp @@ -98,7 +98,8 @@ void A_MSR_IMM(ARM* cpu) case 0x1A: case 0x1B: psr = &cpu->R_UND[2]; break; default: - cpu->AddCycles_C(); + if (cpu->Num != 1) cpu->AddCycles_C(); // arm 7 + else cpu->AddCycles_CI(2); // arm 9 return; } } @@ -135,8 +136,16 @@ void A_MSR_IMM(ARM* cpu) cpu->CPSR &= ~0x20; // keep it from crashing the emulator at least } } - - if 
((cpu->Num != 1) && (cpu->CurInstr & (0x7<<16))) cpu->AddCycles_CI(2); + + if (cpu->Num != 1) + { + if (cpu->CurInstr & (1<<22)) + { + cpu->AddCycles_CI(2); // spsr_fsxc + } + else if (cpu->CurInstr & (0x7<<16)) cpu->AddCycles_CI(2); // cpsr_sxc + else cpu->AddCycles_C(); + } else cpu->AddCycles_C(); } @@ -159,7 +168,8 @@ void A_MSR_REG(ARM* cpu) case 0x1A: case 0x1B: psr = &cpu->R_UND[2]; break; default: - cpu->AddCycles_C(); + if (cpu->Num != 1) cpu->AddCycles_C(); // arm 7 + else cpu->AddCycles_CI(2); // arm 9 return; } } @@ -196,8 +206,16 @@ void A_MSR_REG(ARM* cpu) cpu->CPSR &= ~0x20; // keep it from crashing the emulator at least } } - - if ((cpu->Num != 1) && (cpu->CurInstr & (0x7<<16))) cpu->AddCycles_CI(2); + + if (cpu->Num != 1) + { + if (cpu->CurInstr & (1<<22)) + { + cpu->AddCycles_CI(2); // spsr_fsxc + } + else if (cpu->CurInstr & (0x7<<16)) cpu->AddCycles_CI(2); // cpsr_sxc + else cpu->AddCycles_C(); + } else cpu->AddCycles_C(); } From 93dce82b078a1df48fb2d74a6091300f5e807b37 Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Thu, 10 Oct 2024 10:48:17 -0400 Subject: [PATCH 154/306] implement cmp with "rd == 15" on arm9 cmp and friends with bits 12-15 set to 1 borrow characteristics from their legacy 26 bit p variants thumb version does nothing of note --- src/ARMInterpreter.cpp | 4 +- src/ARMInterpreter_ALU.cpp | 82 ++++++++++++++++++++++++-------------- 2 files changed, 54 insertions(+), 32 deletions(-) diff --git a/src/ARMInterpreter.cpp b/src/ARMInterpreter.cpp index 2b14de73..a04b6140 100644 --- a/src/ARMInterpreter.cpp +++ b/src/ARMInterpreter.cpp @@ -141,7 +141,7 @@ void A_MSR_IMM(ARM* cpu) { if (cpu->CurInstr & (1<<22)) { - cpu->AddCycles_CI(2); // spsr_fsxc + cpu->AddCycles_CI(2); // spsr } else if (cpu->CurInstr & (0x7<<16)) cpu->AddCycles_CI(2); // cpsr_sxc else cpu->AddCycles_C(); @@ -211,7 +211,7 @@ void A_MSR_REG(ARM* cpu) { if (cpu->CurInstr & (1<<22)) { - cpu->AddCycles_CI(2); // spsr_fsxc + 
cpu->AddCycles_CI(2); // spsr } else if (cpu->CurInstr & (0x7<<16)) cpu->AddCycles_CI(2); // cpsr_sxc else cpu->AddCycles_C(); diff --git a/src/ARMInterpreter_ALU.cpp b/src/ARMInterpreter_ALU.cpp index 9305fc42..83fc1944 100644 --- a/src/ARMInterpreter_ALU.cpp +++ b/src/ARMInterpreter_ALU.cpp @@ -581,12 +581,12 @@ A_IMPLEMENT_ALU_OP(RSC,) #define A_TST(c) \ u32 a = cpu->R[(cpu->CurInstr>>16) & 0xF]; \ u32 res = a & b; \ - cpu->SetNZ(res & 0x80000000, \ - !res); \ - if (((cpu->CurInstr>>12) & 0xF) == 15) [[unlikely]] /* yes this instruction has a secret rd for some reason */ \ + if (((cpu->CurInstr>>12) & 0xF) == 15) [[unlikely]] /* this seems to trigger alu rd==15 behavior for arm7 and legacy instruction behavior for arm9 */ \ { \ if (cpu->Num == 1) \ { \ + cpu->SetNZ(res & 0x80000000, \ + !res); \ u32 oldpsr = cpu->CPSR; \ cpu->RestoreCPSR(); /* ARM7TDMI restores cpsr and does ___not___ flush the pipeline. */ \ if (cpu->CPSR & 0x20) \ @@ -595,7 +595,12 @@ A_IMPLEMENT_ALU_OP(RSC,) cpu->CPSR &= ~0x20; /* keep it from crashing the emulator at least */ \ } \ } \ - else Platform::Log(Platform::LogLevel::Warn, "UNIMPLEMENTED: TST w/ rd == 15 on ARM9\n"); \ + else cpu->JumpTo(res & ~1, true); /* TSTP dna, doesn't update flags */ \ + } \ + else \ + { \ + cpu->SetNZ(res & 0x80000000, \ + !res); \ } \ if (c) cpu->AddCycles_CI(c); else cpu->AddCycles_C(); @@ -605,12 +610,12 @@ A_IMPLEMENT_ALU_TEST(TST,_S) #define A_TEQ(c) \ u32 a = cpu->R[(cpu->CurInstr>>16) & 0xF]; \ u32 res = a ^ b; \ - cpu->SetNZ(res & 0x80000000, \ - !res); \ - if (((cpu->CurInstr>>12) & 0xF) == 15) [[unlikely]] /* yes this instruction has a secret rd for some reason */ \ + if (((cpu->CurInstr>>12) & 0xF) == 15) [[unlikely]] /* this seems to trigger alu rd==15 behavior for arm7 and legacy instruction behavior for arm9 */ \ { \ if (cpu->Num == 1) \ { \ + cpu->SetNZ(res & 0x80000000, \ + !res); \ u32 oldpsr = cpu->CPSR; \ cpu->RestoreCPSR(); /* ARM7TDMI restores cpsr and does ___not___ flush the pipeline. 
*/ \ if (cpu->CPSR & 0x20) \ @@ -619,7 +624,12 @@ A_IMPLEMENT_ALU_TEST(TST,_S) cpu->CPSR &= ~0x20; /* keep it from crashing the emulator at least */ \ } \ } \ - else Platform::Log(Platform::LogLevel::Warn, "UNIMPLEMENTED: TEQ w/ rd == 15 on ARM9\n"); \ + else cpu->JumpTo(res & ~1, true); /* TEQP dna, doesn't update flags */ \ + } \ + else \ + { \ + cpu->SetNZ(res & 0x80000000, \ + !res); \ } \ if (c) cpu->AddCycles_CI(c); else cpu->AddCycles_C(); @@ -629,14 +639,14 @@ A_IMPLEMENT_ALU_TEST(TEQ,_S) #define A_CMP(c) \ u32 a = cpu->R[(cpu->CurInstr>>16) & 0xF]; \ u32 res = a - b; \ - cpu->SetNZCV(res & 0x80000000, \ - !res, \ - CarrySub(a, b), \ - OverflowSub(a, b)); \ - if (((cpu->CurInstr>>12) & 0xF) == 15) [[unlikely]] /* yes this instruction has a secret rd for some reason */ \ + if (((cpu->CurInstr>>12) & 0xF) == 15) [[unlikely]] /* this seems to trigger alu rd==15 behavior for arm7 and legacy instruction behavior for arm9 */ \ { \ if (cpu->Num == 1) \ { \ + cpu->SetNZCV(res & 0x80000000, \ + !res, \ + CarrySub(a, b), \ + OverflowSub(a, b)); \ u32 oldpsr = cpu->CPSR; \ cpu->RestoreCPSR(); /* ARM7TDMI restores cpsr and does ___not___ flush the pipeline. 
*/ \ if (cpu->CPSR & 0x20) \ @@ -645,7 +655,14 @@ A_IMPLEMENT_ALU_TEST(TEQ,_S) cpu->CPSR &= ~0x20; /* keep it from crashing the emulator at least */ \ } \ } \ - else Platform::Log(Platform::LogLevel::Warn, "UNIMPLEMENTED: CMP w/ rd == 15 on ARM9\n"); \ + else cpu->JumpTo(res & ~1, true); /* CMPP dna, doesn't update flags */ \ + } \ + else \ + { \ + cpu->SetNZCV(res & 0x80000000, \ + !res, \ + CarrySub(a, b), \ + OverflowSub(a, b)); \ } \ if (c) cpu->AddCycles_CI(c); else cpu->AddCycles_C(); @@ -655,14 +672,14 @@ A_IMPLEMENT_ALU_TEST(CMP,) #define A_CMN(c) \ u32 a = cpu->R[(cpu->CurInstr>>16) & 0xF]; \ u32 res = a + b; \ - cpu->SetNZCV(res & 0x80000000, \ - !res, \ - CarryAdd(a, b), \ - OverflowAdd(a, b)); \ - if (((cpu->CurInstr>>12) & 0xF) == 15) [[unlikely]] /* yes this instruction has a secret rd for some reason */ \ + if (((cpu->CurInstr>>12) & 0xF) == 15) [[unlikely]] /* this seems to trigger alu rd==15 behavior for arm7 and legacy instruction behavior for arm9 */ \ { \ if (cpu->Num == 1) \ { \ + cpu->SetNZCV(res & 0x80000000, \ + !res, \ + CarryAdd(a, b), \ + OverflowAdd(a, b)); \ u32 oldpsr = cpu->CPSR; \ cpu->RestoreCPSR(); /* ARM7TDMI restores cpsr and does ___not___ flush the pipeline. 
*/ \ if (cpu->CPSR & 0x20) \ @@ -671,7 +688,14 @@ A_IMPLEMENT_ALU_TEST(CMP,) cpu->CPSR &= ~0x20; /* keep it from crashing the emulator at least */ \ } \ } \ - else Platform::Log(Platform::LogLevel::Warn, "UNIMPLEMENTED: CMN w/ rd == 15 on ARM9\n"); \ + else cpu->JumpTo(res & ~1, true); /* CMNP dna, doesn't update flags */ \ + } \ + else \ + { \ + cpu->SetNZCV(res & 0x80000000, \ + !res, \ + CarryAdd(a, b), \ + OverflowAdd(a, b)); \ } \ if (c) cpu->AddCycles_CI(c); else cpu->AddCycles_C(); @@ -1625,20 +1649,18 @@ void T_CMP_HIREG(ARM* cpu) !res, CarrySub(a, b), OverflowSub(a, b)); - if (rd == 15) [[unlikely]] + + if ((cpu->Num == 1) && (rd == 15)) { - if (cpu->Num == 1) + u32 oldpsr = cpu->CPSR; + cpu->RestoreCPSR(); // ARM7TDMI restores cpsr and does ___not___ flush the pipeline. + if (!(cpu->CPSR & 0x20)) { - u32 oldpsr = cpu->CPSR; - cpu->RestoreCPSR(); // ARM7TDMI restores cpsr and does ___not___ flush the pipeline. - if (!(cpu->CPSR & 0x20)) - { - Platform::Log(Platform::LogLevel::Warn, "UNIMPLEMENTED: MSR REG T bit change on ARM7\n"); - cpu->CPSR |= 0x20; // keep it from crashing the emulator at least - } + Platform::Log(Platform::LogLevel::Warn, "UNIMPLEMENTED: MSR REG T bit change on ARM7\n"); + cpu->CPSR |= 0x20; // keep it from crashing the emulator at least } - else Platform::Log(Platform::LogLevel::Warn, "UNIMPLEMENTED: CMP HIREG w/ rd == 15 on ARM9\n"); } + cpu->AddCycles_C(); } From 787d0c9afcd963380eb364b72ac71e7012d85689 Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Thu, 10 Oct 2024 11:09:07 -0400 Subject: [PATCH 155/306] mrc r15 updates flags also my prior implementation made mrc w/ r15 raise an exception by accident oops! 
--- src/ARMInterpreter.cpp | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/src/ARMInterpreter.cpp b/src/ARMInterpreter.cpp index a04b6140..82dc6876 100644 --- a/src/ARMInterpreter.cpp +++ b/src/ARMInterpreter.cpp @@ -297,11 +297,17 @@ void A_MRC(ARM* cpu) u32 cpinfo = (cpu->CurInstr >> 5) & 0x7; u32 rd = (cpu->CurInstr>>12) & 0xF; - if (cpu->Num==0 && cp==15 && rd!=15) + if (cpu->Num==0 && cp==15) { - cpu->R[rd] = ((ARMv5*)cpu)->CP15Read((cn<<8)|(cm<<4)|cpinfo); + if (rd != 15) cpu->R[rd] = ((ARMv5*)cpu)->CP15Read((cn<<8)|(cm<<4)|cpinfo); + else + { + // r15 updates the top 4 bits of the cpsr, done to "allow for conditional branching based on coprocessor status" + u32 flags = ((ARMv5*)cpu)->CP15Read((cn<<8)|(cm<<4)|cpinfo) & 0xF0000000; + cpu->CPSR = (cpu->CPSR & ~0xF0000000) | flags; + } } - else if (cpu->Num==1 && cp==14 && rd!=15) + else if (cpu->Num==1 && cp==14) { Log(LogLevel::Debug, "MRC p14,%d,%d,%d on ARM7\n", cn, cm, cpinfo); } From 34bba2589e3ddbe94642dd3325a49179cdf27f82 Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Thu, 10 Oct 2024 20:51:52 -0400 Subject: [PATCH 156/306] tcm (and cache?) 
reads dont trigger write buffer drains additionally drains are triggered even in no cache + no buffer regions despite documentation not specifying such --- src/CP15.cpp | 24 ++++++++++-------------- 1 file changed, 10 insertions(+), 14 deletions(-) diff --git a/src/CP15.cpp b/src/CP15.cpp index 1d082f04..3ea67f3f 100644 --- a/src/CP15.cpp +++ b/src/CP15.cpp @@ -1014,9 +1014,6 @@ u32 ARMv5::CodeRead32(u32 addr, bool branch) return 0; } - if ((PU_Map[addr>>12] & 0x30)) - WriteBufferDrain(); - if (addr < ITCMSize) { CodeCycles = 1; @@ -1039,6 +1036,8 @@ u32 ARMv5::CodeRead32(u32 addr, bool branch) //return *(u32*)&CurICacheLine[addr & 0x1C]; } + WriteBufferDrain(); + NDS.ARM9Timestamp = NDS.ARM9Timestamp + ((1<> 24) == 0x02) @@ -1066,9 +1065,6 @@ bool ARMv5::DataRead8(u32 addr, u32* val) DataCycles = 1; return false; } - - if ((PU_Map[addr>>12] & 0x30)) - WriteBufferDrain(); if (addr < ITCMSize) { @@ -1085,6 +1081,8 @@ bool ARMv5::DataRead8(u32 addr, u32* val) *val = *(u8*)&DTCM[addr & (DTCMPhysicalSize - 1)]; return true; } + + WriteBufferDrain(); NDS.ARM9Timestamp = NDS.ARM9Timestamp + ((1<>12] & 0x30)) - WriteBufferDrain(); if (addr < ITCMSize) { @@ -1133,6 +1129,8 @@ bool ARMv5::DataRead16(u32 addr, u32* val) return true; } + WriteBufferDrain(); + NDS.ARM9Timestamp = NDS.ARM9Timestamp + ((1<> 12][1]; @@ -1162,9 +1160,6 @@ bool ARMv5::DataRead32(u32 addr, u32* val) addr &= ~3; - if ((PU_Map[addr>>12] & 0x30)) - WriteBufferDrain(); - if (addr < ITCMSize) { DataCycles = 1; @@ -1181,6 +1176,8 @@ bool ARMv5::DataRead32(u32 addr, u32* val) return true; } + WriteBufferDrain(); + NDS.ARM9Timestamp = NDS.ARM9Timestamp + ((1<> 12][2]; @@ -1209,9 +1206,6 @@ bool ARMv5::DataRead32S(u32 addr, u32* val) addr &= ~3; - if ((PU_Map[addr>>12] & 0x30)) - WriteBufferDrain(); - if (addr < ITCMSize) { DataCycles += 1; @@ -1228,6 +1222,8 @@ bool ARMv5::DataRead32S(u32 addr, u32* val) return true; } + WriteBufferDrain(); + NDS.ARM9Timestamp += DataCycles; if (!(addr & 0x3FF)) return 
DataRead32(addr, val); // bursts cannot cross a 1kb boundary From 3d246ddf739af6b9c22e01349f9d139b3ab60c79 Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Thu, 10 Oct 2024 22:54:33 -0400 Subject: [PATCH 157/306] tcms just aren't bufferable --- src/CP15.cpp | 245 ++++++++++++++++++--------------------------------- 1 file changed, 85 insertions(+), 160 deletions(-) diff --git a/src/CP15.cpp b/src/CP15.cpp index 3ea67f3f..ce56949b 100644 --- a/src/CP15.cpp +++ b/src/CP15.cpp @@ -450,43 +450,19 @@ void ARMv5::WriteBufferCheck() case 0: // byte { u8 val = WriteBufferFifo[WBWritePointer] & 0xFF; - if (WBAddr < ITCMSize) - { - *(u8*)&ITCM[WBAddr & (ITCMPhysicalSize - 1)] = val; -#ifdef JIT_ENABLED - NDS.JIT.CheckAndInvalidate<0, ARMJIT_Memory::memregion_ITCM>(WBAddr); -#endif - } - else if ((WBAddr & DTCMMask) == DTCMBase) *(u8*)&DTCM[WBAddr & (DTCMPhysicalSize - 1)] = val; - else BusWrite8(storeaddr[WBWritePointer], val); + BusWrite8(storeaddr[WBWritePointer], val); break; } case 1: // halfword { u16 val = WriteBufferFifo[WBWritePointer] & 0xFFFF; - if (WBAddr < ITCMSize) - { - *(u16*)&ITCM[WBAddr & (ITCMPhysicalSize - 2)] = val; -#ifdef JIT_ENABLED - NDS.JIT.CheckAndInvalidate<0, ARMJIT_Memory::memregion_ITCM>(WBAddr); -#endif - } - else if ((WBAddr & DTCMMask) == DTCMBase) *(u16*)&DTCM[WBAddr & (DTCMPhysicalSize - 2)] = val; - else BusWrite16(storeaddr[WBWritePointer], val); + BusWrite16(storeaddr[WBWritePointer], val); break; } case 2: // word { u32 val = WriteBufferFifo[WBWritePointer] & 0xFFFFFFFF; - if (WBAddr < ITCMSize) - { - *(u32*)&ITCM[WBAddr & (ITCMPhysicalSize - 4)] = val; -#ifdef JIT_ENABLED - NDS.JIT.CheckAndInvalidate<0, ARMJIT_Memory::memregion_ITCM>(WBAddr); -#endif - } - else if ((WBAddr & DTCMMask) == DTCMBase) *(u32*)&DTCM[WBAddr & (DTCMPhysicalSize - 4)] = val; - else BusWrite32(storeaddr[WBWritePointer], val); + BusWrite32(storeaddr[WBWritePointer], val); WBAddr += 4; break; } @@ -523,43 +499,19 @@ void 
ARMv5::WriteBufferWrite(u32 val, u8 flag, u8 cycles, u32 addr) case 0: // byte { u8 val = WriteBufferFifo[WBWritePointer] & 0xFF; - if (WBAddr < ITCMSize) - { - *(u8*)&ITCM[WBAddr & (ITCMPhysicalSize - 1)] = val; -#ifdef JIT_ENABLED - NDS.JIT.CheckAndInvalidate<0, ARMJIT_Memory::memregion_ITCM>(WBAddr); -#endif - } - else if ((WBAddr & DTCMMask) == DTCMBase) *(u8*)&DTCM[WBAddr & (DTCMPhysicalSize - 1)] = val; - else BusWrite8(storeaddr[WBWritePointer], val); + BusWrite8(storeaddr[WBWritePointer], val); break; } case 1: // halfword { u16 val = WriteBufferFifo[WBWritePointer] & 0xFFFF; - if (WBAddr < ITCMSize) - { - *(u16*)&ITCM[WBAddr & (ITCMPhysicalSize - 2)] = val; -#ifdef JIT_ENABLED - NDS.JIT.CheckAndInvalidate<0, ARMJIT_Memory::memregion_ITCM>(WBAddr); -#endif - } - else if ((WBAddr & DTCMMask) == DTCMBase) *(u16*)&DTCM[WBAddr & (DTCMPhysicalSize - 2)] = val; - else BusWrite16(storeaddr[WBWritePointer], val); + BusWrite16(storeaddr[WBWritePointer], val); break; } case 2: // word { u32 val = WriteBufferFifo[WBWritePointer] & 0xFFFFFFFF; - if (WBAddr < ITCMSize) - { - *(u32*)&ITCM[WBAddr & (ITCMPhysicalSize - 4)] = val; -#ifdef JIT_ENABLED - NDS.JIT.CheckAndInvalidate<0, ARMJIT_Memory::memregion_ITCM>(WBAddr); -#endif - } - else if ((WBAddr & DTCMMask) == DTCMBase) *(u32*)&DTCM[WBAddr & (DTCMPhysicalSize - 4)] = val; - else BusWrite32(storeaddr[WBWritePointer], val); + BusWrite32(storeaddr[WBWritePointer], val); WBAddr += 4; break; } @@ -609,43 +561,19 @@ void ARMv5::WriteBufferDrain() case 0: // byte { u8 val = WriteBufferFifo[WBWritePointer] & 0xFF; - if (WBAddr < ITCMSize) - { - *(u8*)&ITCM[WBAddr & (ITCMPhysicalSize - 1)] = val; -#ifdef JIT_ENABLED - NDS.JIT.CheckAndInvalidate<0, ARMJIT_Memory::memregion_ITCM>(WBAddr); -#endif - } - else if ((WBAddr & DTCMMask) == DTCMBase) *(u8*)&DTCM[WBAddr & (DTCMPhysicalSize - 1)] = val; - else BusWrite8(storeaddr[WBWritePointer], val); + BusWrite8(storeaddr[WBWritePointer], val); break; } case 1: // halfword { u16 val = 
WriteBufferFifo[WBWritePointer] & 0xFFFF; - if (WBAddr < ITCMSize) - { - *(u16*)&ITCM[WBAddr & (ITCMPhysicalSize - 2)] = val; -#ifdef JIT_ENABLED - NDS.JIT.CheckAndInvalidate<0, ARMJIT_Memory::memregion_ITCM>(WBAddr); -#endif - } - else if ((WBAddr & DTCMMask) == DTCMBase) *(u16*)&DTCM[WBAddr & (DTCMPhysicalSize - 2)] = val; - else BusWrite16(storeaddr[WBWritePointer], val); + BusWrite16(storeaddr[WBWritePointer], val); break; } case 2: // word { u32 val = WriteBufferFifo[WBWritePointer] & 0xFFFFFFFF; - if (WBAddr < ITCMSize) - { - *(u32*)&ITCM[WBAddr & (ITCMPhysicalSize - 4)] = val; -#ifdef JIT_ENABLED - NDS.JIT.CheckAndInvalidate<0, ARMJIT_Memory::memregion_ITCM>(WBAddr); -#endif - } - else if ((WBAddr & DTCMMask) == DTCMBase) *(u32*)&DTCM[WBAddr & (DTCMPhysicalSize - 4)] = val; - else BusWrite32(storeaddr[WBWritePointer], val); + BusWrite32(storeaddr[WBWritePointer], val); WBAddr += 4; break; } @@ -1255,29 +1183,29 @@ bool ARMv5::DataWrite8(u32 addr, u8 val) return false; } + if (addr < ITCMSize) + { + DataCycles = 1; + ITCMTimestamp = NDS.ARM9Timestamp + DataCycles; + DataRegion = Mem9_ITCM; + *(u8*)&ITCM[addr & (ITCMPhysicalSize - 1)] = val; +#ifdef JIT_ENABLED + NDS.JIT.CheckAndInvalidate<0, ARMJIT_Memory::memregion_ITCM>(addr); +#endif + return true; + } + if ((addr & DTCMMask) == DTCMBase) + { + DataCycles = 1; + DataRegion = Mem9_DTCM; + *(u8*)&DTCM[addr & (DTCMPhysicalSize - 1)] = val; + return true; + } + if (!(PU_Map[addr>>12] & (0x30))) { - if (addr < ITCMSize) - { - DataCycles = 1; - ITCMTimestamp = NDS.ARM9Timestamp + DataCycles; - DataRegion = Mem9_ITCM; - *(u8*)&ITCM[addr & (ITCMPhysicalSize - 1)] = val; -#ifdef JIT_ENABLED - NDS.JIT.CheckAndInvalidate<0, ARMJIT_Memory::memregion_ITCM>(addr); -#endif - return true; - } - if ((addr & DTCMMask) == DTCMBase) - { - DataCycles = 1; - DataRegion = Mem9_DTCM; - *(u8*)&DTCM[addr & (DTCMPhysicalSize - 1)] = val; - return true; - } - NDS.ARM9Timestamp = NDS.ARM9Timestamp + ((1<> 12][1]; if ((addr >> 24) == 
0x02) @@ -1315,30 +1243,29 @@ bool ARMv5::DataWrite16(u32 addr, u16 val) addr &= ~1; + if (addr < ITCMSize) + { + DataCycles = 1; + ITCMTimestamp = NDS.ARM9Timestamp + DataCycles; + DataRegion = Mem9_ITCM; + *(u16*)&ITCM[addr & (ITCMPhysicalSize - 1)] = val; +#ifdef JIT_ENABLED + NDS.JIT.CheckAndInvalidate<0, ARMJIT_Memory::memregion_ITCM>(addr); +#endif + return true; + } + if ((addr & DTCMMask) == DTCMBase) + { + DataCycles = 1; + DataRegion = Mem9_DTCM; + *(u16*)&DTCM[addr & (DTCMPhysicalSize - 1)] = val; + return true; + } if (!(PU_Map[addr>>12] & 0x30)) { - if (addr < ITCMSize) - { - DataCycles = 1; - ITCMTimestamp = NDS.ARM9Timestamp + DataCycles; - DataRegion = Mem9_ITCM; - *(u16*)&ITCM[addr & (ITCMPhysicalSize - 1)] = val; -#ifdef JIT_ENABLED - NDS.JIT.CheckAndInvalidate<0, ARMJIT_Memory::memregion_ITCM>(addr); -#endif - return true; - } - if ((addr & DTCMMask) == DTCMBase) - { - DataCycles = 1; - DataRegion = Mem9_DTCM; - *(u16*)&DTCM[addr & (DTCMPhysicalSize - 1)] = val; - return true; - } - NDS.ARM9Timestamp = NDS.ARM9Timestamp + ((1<> 12][1]; if ((addr >> 24) == 0x02) @@ -1376,30 +1303,29 @@ bool ARMv5::DataWrite32(u32 addr, u32 val) addr &= ~3; + if (addr < ITCMSize) + { + DataCycles = 1; + ITCMTimestamp = NDS.ARM9Timestamp + DataCycles; + DataRegion = Mem9_ITCM; + *(u32*)&ITCM[addr & (ITCMPhysicalSize - 1)] = val; +#ifdef JIT_ENABLED + NDS.JIT.CheckAndInvalidate<0, ARMJIT_Memory::memregion_ITCM>(addr); +#endif + return true; + } + if ((addr & DTCMMask) == DTCMBase) + { + DataCycles = 1; + DataRegion = Mem9_DTCM; + *(u32*)&DTCM[addr & (DTCMPhysicalSize - 1)] = val; + return true; + } if (!(PU_Map[addr>>12] & 0x30)) { - if (addr < ITCMSize) - { - DataCycles = 1; - ITCMTimestamp = NDS.ARM9Timestamp + DataCycles; - DataRegion = Mem9_ITCM; - *(u32*)&ITCM[addr & (ITCMPhysicalSize - 1)] = val; -#ifdef JIT_ENABLED - NDS.JIT.CheckAndInvalidate<0, ARMJIT_Memory::memregion_ITCM>(addr); -#endif - return true; - } - if ((addr & DTCMMask) == DTCMBase) - { - 
DataCycles = 1; - DataRegion = Mem9_DTCM; - *(u32*)&DTCM[addr & (DTCMPhysicalSize - 1)] = val; - return true; - } - NDS.ARM9Timestamp = NDS.ARM9Timestamp + ((1<> 12][2]; if ((addr >> 24) == 0x02) @@ -1436,28 +1362,27 @@ bool ARMv5::DataWrite32S(u32 addr, u32 val) addr &= ~3; + if (addr < ITCMSize) + { + DataCycles += 1; + ITCMTimestamp = NDS.ARM9Timestamp + DataCycles; + DataRegion = Mem9_ITCM; + *(u32*)&ITCM[addr & (ITCMPhysicalSize - 1)] = val; +#ifdef JIT_ENABLED + NDS.JIT.CheckAndInvalidate<0, ARMJIT_Memory::memregion_ITCM>(addr); +#endif + return true; + } + if ((addr & DTCMMask) == DTCMBase) + { + DataCycles += 1; + DataRegion = Mem9_DTCM; + *(u32*)&DTCM[addr & (DTCMPhysicalSize - 1)] = val; + return true; + } if (!(PU_Map[addr>>12] & 0x30)) { - if (addr < ITCMSize) - { - DataCycles += 1; - ITCMTimestamp = NDS.ARM9Timestamp + DataCycles; - DataRegion = Mem9_ITCM; - *(u32*)&ITCM[addr & (ITCMPhysicalSize - 1)] = val; -#ifdef JIT_ENABLED - NDS.JIT.CheckAndInvalidate<0, ARMJIT_Memory::memregion_ITCM>(addr); -#endif - return true; - } - if ((addr & DTCMMask) == DTCMBase) - { - DataCycles += 1; - DataRegion = Mem9_DTCM; - *(u32*)&DTCM[addr & (DTCMPhysicalSize - 1)] = val; - return true; - } - DataCycles += (((NDS.ARM9Timestamp + DataCycles) + ((1< Date: Thu, 10 Oct 2024 23:24:20 -0400 Subject: [PATCH 158/306] implement drain write buffer cache command --- src/CP15.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/CP15.cpp b/src/CP15.cpp index ce56949b..b1ed52a5 100644 --- a/src/CP15.cpp +++ b/src/CP15.cpp @@ -786,7 +786,9 @@ void ARMv5::CP15Write(u32 id, u32 val) case 0x7A2: //printf("flush data cache SI\n"); return; - + case 0x7A4: + WriteBufferDrain(); + return; case 0x910: DTCMSetting = val & 0xFFFFF03E; From a8722d8c562ca2dc42485bb97aac6f76d1bd058c Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Fri, 11 Oct 2024 02:47:41 -0400 Subject: [PATCH 159/306] tcms shouldn't be cacheable --- src/CP15.cpp 
| 319 +++++++++++++++++++++++---------------------------- 1 file changed, 145 insertions(+), 174 deletions(-) diff --git a/src/CP15.cpp b/src/CP15.cpp index 7cf8109f..fd088b1f 100644 --- a/src/CP15.cpp +++ b/src/CP15.cpp @@ -540,17 +540,7 @@ u32 ARMv5::DCacheLookup(const u32 addr) // See arm946e-s Rev 1 technical manual, 2.3.15 "Register 15, test State Register") WriteBufferDrain(); DataCycles = NDS.ARM9MemTimings[tag >> 14][2]; - if (addr < ITCMSize) - { - return *(u32*)&ITCM[addr & (ITCMPhysicalSize - 3)]; - } else - if ((addr & DTCMMask) == DTCMBase) - { - return *(u32*)&DTCM[addr & (DTCMPhysicalSize - 3)]; - } else - { - return BusRead32(addr & ~3); - } + return BusRead32(addr & ~3); } DataCycles += 1; DataRegion = Mem9_DCache; @@ -567,17 +557,7 @@ u32 ARMv5::DCacheLookup(const u32 addr) { WriteBufferDrain(); DataCycles = NDS.ARM9MemTimings[tag >> 14][2]; - if (addr < ITCMSize) - { - return *(u32*)&ITCM[addr & (ITCMPhysicalSize - 3)]; - } else - if ((addr & DTCMMask) == DTCMBase) - { - return *(u32*)&DTCM[addr & (DTCMPhysicalSize - 3)]; - } else - { - return BusRead32(addr & ~3); - } + return BusRead32(addr & ~3); } u32 line; @@ -620,17 +600,7 @@ u32 ARMv5::DCacheLookup(const u32 addr) //Log(LogLevel::Debug,"DCache miss, load @ %08x\n", tag); for (int i = 0; i < DCACHE_LINELENGTH; i+=sizeof(u32)) { - if (tag+i < ITCMSize) - { - ptr[i >> 2] = *(u32*)&ITCM[(tag+i) & (ITCMPhysicalSize - 1)]; - } else - if (((tag+i) & DTCMMask) == DTCMBase) - { - ptr[i >> 2] = *(u32*)&DTCM[(tag+i) & (DTCMPhysicalSize - 1)]; - } else - { - ptr[i >> 2] = BusRead32(tag+i); - } + ptr[i >> 2] = BusRead32(tag+i); //Log(LogLevel::Debug,"DCache store @ %08x: %08x in set %i, line %i\n", tag+i, *(u32*)&ptr[i >> 2], line & 3, line >> 2); } @@ -1783,6 +1753,17 @@ u32 ARMv5::CodeRead32(u32 addr, bool branch) DataRegion = Mem9_Null; return 0; } + + if (addr < ITCMSize) + { + CodeCycles = 1; + + if (NDS.ARM9Timestamp < ITCMTimestamp) NDS.ARM9Timestamp = ITCMTimestamp; + NDS.ARM9Timestamp += 
CodeCycles; + if (NDS.ARM9Timestamp < TimestampActual) NDS.ARM9Timestamp = TimestampActual; + DataRegion = Mem9_Null; + return *(u32*)&ITCM[addr & (ITCMPhysicalSize - 1)]; + } #if !DISABLE_ICACHE #ifdef JIT_ENABLED @@ -1799,17 +1780,6 @@ u32 ARMv5::CodeRead32(u32 addr, bool branch) } #endif - if (addr < ITCMSize) - { - CodeCycles = 1; - - if (NDS.ARM9Timestamp < ITCMTimestamp) NDS.ARM9Timestamp = ITCMTimestamp; - NDS.ARM9Timestamp += CodeCycles; - if (NDS.ARM9Timestamp < TimestampActual) NDS.ARM9Timestamp = TimestampActual; - DataRegion = Mem9_Null; - return *(u32*)&ITCM[addr & (ITCMPhysicalSize - 1)]; - } - CodeCycles = MemTimings[addr >> 12][0]; if (CodeCycles == 0xFF) // cached memory. hax { @@ -1848,6 +1818,22 @@ bool ARMv5::DataRead8(u32 addr, u32* val) DataCycles = 1; return false; } + + if (addr < ITCMSize) + { + DataCycles = 1; + ITCMTimestamp = NDS.ARM9Timestamp + DataCycles; + DataRegion = Mem9_ITCM; + *val = *(u8*)&ITCM[addr & (ITCMPhysicalSize - 1)]; + return true; + } + if ((addr & DTCMMask) == DTCMBase) + { + DataCycles = 1; + DataRegion = Mem9_DTCM; + *val = *(u8*)&DTCM[addr & (DTCMPhysicalSize - 1)]; + return true; + } #if !DISABLE_DCACHE #ifdef JIT_ENABLED @@ -1865,22 +1851,6 @@ bool ARMv5::DataRead8(u32 addr, u32* val) } } #endif - - if (addr < ITCMSize) - { - DataCycles = 1; - ITCMTimestamp = NDS.ARM9Timestamp + DataCycles; - DataRegion = Mem9_ITCM; - *val = *(u8*)&ITCM[addr & (ITCMPhysicalSize - 1)]; - return true; - } - if ((addr & DTCMMask) == DTCMBase) - { - DataCycles = 1; - DataRegion = Mem9_DTCM; - *val = *(u8*)&DTCM[addr & (DTCMPhysicalSize - 1)]; - return true; - } WriteBufferDrain(); @@ -1910,23 +1880,7 @@ bool ARMv5::DataRead16(u32 addr, u32* val) DataCycles = 1; return false; } - - #if !DISABLE_DCACHE - #ifdef JIT_ENABLED - if (!NDS.IsJITEnabled()) - #endif - { - if (CP15Control & CP15_CACHE_CR_DCACHEENABLE) - { - if (IsAddressDCachable(addr)) - { - DataCycles = 0; - *val = (DCacheLookup(addr) >> (8* (addr & 2))) & 0xffff; - return 
true; - } - } - } - #endif + addr &= ~1; if (addr < ITCMSize) @@ -1944,6 +1898,23 @@ bool ARMv5::DataRead16(u32 addr, u32* val) *val = *(u16*)&DTCM[addr & (DTCMPhysicalSize - 1)]; return true; } + + #if !DISABLE_DCACHE + #ifdef JIT_ENABLED + if (!NDS.IsJITEnabled()) + #endif + { + if (CP15Control & CP15_CACHE_CR_DCACHEENABLE) + { + if (IsAddressDCachable(addr)) + { + DataCycles = 0; + *val = (DCacheLookup(addr) >> (8* (addr & 2))) & 0xffff; + return true; + } + } + } + #endif WriteBufferDrain(); @@ -1976,6 +1947,22 @@ bool ARMv5::DataRead32(u32 addr, u32* val) addr &= ~3; + if (addr < ITCMSize) + { + DataCycles = 1; + ITCMTimestamp = NDS.ARM9Timestamp + DataCycles; + DataRegion = Mem9_ITCM; + *val = *(u32*)&ITCM[addr & (ITCMPhysicalSize - 1)]; + return true; + } + if ((addr & DTCMMask) == DTCMBase) + { + DataCycles = 1; + DataRegion = Mem9_DTCM; + *val = *(u32*)&DTCM[addr & (DTCMPhysicalSize - 1)]; + return true; + } + #if !DISABLE_DCACHE #ifdef JIT_ENABLED if (!NDS.IsJITEnabled()) @@ -1993,22 +1980,6 @@ bool ARMv5::DataRead32(u32 addr, u32* val) } #endif - if (addr < ITCMSize) - { - DataCycles = 1; - ITCMTimestamp = NDS.ARM9Timestamp + DataCycles; - DataRegion = Mem9_ITCM; - *val = *(u32*)&ITCM[addr & (ITCMPhysicalSize - 1)]; - return true; - } - if ((addr & DTCMMask) == DTCMBase) - { - DataCycles = 1; - DataRegion = Mem9_DTCM; - *val = *(u32*)&DTCM[addr & (DTCMPhysicalSize - 1)]; - return true; - } - WriteBufferDrain(); NDS.ARM9Timestamp = NDS.ARM9Timestamp + ((1<>12] & (0x30))) { NDS.ARM9Timestamp = NDS.ARM9Timestamp + ((1<>12] & 0x30)) { NDS.ARM9Timestamp = NDS.ARM9Timestamp + ((1<>12] & 0x30)) { NDS.ARM9Timestamp = NDS.ARM9Timestamp + ((1<>12] & 0x30)) { DataCycles += (((NDS.ARM9Timestamp + DataCycles) + ((1< Date: Sat, 12 Oct 2024 11:10:06 -0400 Subject: [PATCH 160/306] make empty r-list instructions a bit nicer pass bools as a single u8 instead and combine thumb and restore cpsr flags since they're mutually exclusive --- src/ARMInterpreter_LoadStore.cpp | 50 
++++++++++++++++++++++---------- 1 file changed, 34 insertions(+), 16 deletions(-) diff --git a/src/ARMInterpreter_LoadStore.cpp b/src/ARMInterpreter_LoadStore.cpp index 59b9bc30..84203310 100644 --- a/src/ARMInterpreter_LoadStore.cpp +++ b/src/ARMInterpreter_LoadStore.cpp @@ -434,30 +434,40 @@ void A_SWPB(ARM* cpu) SWP(cpu); } -void ReglessLDMSTM(ARM* cpu, const bool load, const u8 baseid, const bool writeback, const bool decrement, bool preinc, const bool usermode, const bool thumb) +void EmptyRListLDMSTM(ARM* cpu, const u8 baseid, const u8 flags) { + enum // flags + { + load = (1<<0), + writeback = (1<<1), + decrement = (1<<2), + preinc = (1<<3), + restoreorthumb = (1<<4), // specifies restore cpsr for loads, thumb instr for stores + }; + if (cpu->Num == 1) { u32 base = cpu->R[baseid]; + bool flagpreinc = flags & preinc; - if (decrement) + if (flags & decrement) { - preinc = !preinc; + flagpreinc = !flagpreinc; base -= 0x40; } - if (preinc) base+=4; + if (flagpreinc) base+=4; - if (load) + if (flags & load) { u32 pc; cpu->DataRead32(base, &pc); cpu->AddCycles_CDI(); - cpu->JumpTo(pc, usermode); + cpu->JumpTo(pc, flags & restoreorthumb); } else { - cpu->DataWrite32(base, cpu->R[15] + (thumb ? 2 : 4)); + cpu->DataWrite32(base, cpu->R[15] + ((flags & restoreorthumb) ? 
2 : 4)); cpu->AddCycles_CD(); } @@ -467,10 +477,10 @@ void ReglessLDMSTM(ARM* cpu, const bool load, const u8 baseid, const bool writeb cpu->AddCycles_C(); // checkme } - if (writeback) + if (flags & writeback) { - if (decrement) cpu->R[baseid] -= 0x40; - else cpu->R[baseid] += 0x40; + if (flags & decrement) cpu->R[baseid] -= 0x40; + else cpu->R[baseid] += 0x40; } } @@ -486,7 +496,11 @@ void A_LDM(ARM* cpu) if (!(cpu->CurInstr & 0xFFFF)) [[unlikely]] { - ReglessLDMSTM(cpu, true, baseid, cpu->CurInstr & (1<<21), !(cpu->CurInstr & (1<<23)), preinc, cpu->CurInstr & (1<<22), false); + EmptyRListLDMSTM(cpu, baseid, ((1 << 0) | // load + (((cpu->CurInstr >> 21) & 1) << 1) | // writeback + ((!(cpu->CurInstr & (1<<23))) << 2) | // decrement + ((preinc >> 24) << 3) | // preinc + (((cpu->CurInstr >> 22) & 1) << 4))); // restore return; } @@ -592,7 +606,11 @@ void A_STM(ARM* cpu) if (!(cpu->CurInstr & 0xFFFF)) [[unlikely]] { - ReglessLDMSTM(cpu, false, baseid, cpu->CurInstr & (1<<21), !(cpu->CurInstr & (1<<23)), preinc, false, false); + EmptyRListLDMSTM(cpu, baseid, ((0 << 0) | // load + (((cpu->CurInstr >> 21) & 1) << 1) | // writeback + ((!(cpu->CurInstr & (1<<23))) << 2) | // decrement + ((preinc >> 24) << 3) | // preinc + (0 << 4))); // thumb return; } @@ -790,7 +808,7 @@ void T_PUSH(ARM* cpu) if (!nregs) [[unlikely]] { - ReglessLDMSTM(cpu, false, 13, true, true, true, false, true); + EmptyRListLDMSTM(cpu, 13, 0b11110); return; } @@ -836,7 +854,7 @@ void T_POP(ARM* cpu) if (!(cpu->CurInstr & 0x1FF)) [[unlikely]] { - ReglessLDMSTM(cpu, true, 13, true, false, false, false, true); + EmptyRListLDMSTM(cpu, 13, 0b00011); return; } @@ -888,7 +906,7 @@ void T_STMIA(ARM* cpu) if (!(cpu->CurInstr & 0xFF)) [[unlikely]] { - ReglessLDMSTM(cpu, false, (cpu->CurInstr >> 8) & 0x7, true, false, false, false, true); + EmptyRListLDMSTM(cpu, (cpu->CurInstr >> 8) & 0x7, 0b10010); return; } @@ -924,7 +942,7 @@ void T_LDMIA(ARM* cpu) if (!(cpu->CurInstr & 0xFF)) [[unlikely]] { - 
ReglessLDMSTM(cpu, true, (cpu->CurInstr >> 8) & 0x7, true, false, false, false, true); + EmptyRListLDMSTM(cpu, (cpu->CurInstr >> 8) & 0x7, 0b00011); return; } From 1afefdce1d5fa63766306947356cdd070cc6542f Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Sun, 13 Oct 2024 08:39:07 -0400 Subject: [PATCH 161/306] use sse for set lookups --- src/CP15.cpp | 91 ++++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 89 insertions(+), 2 deletions(-) diff --git a/src/CP15.cpp b/src/CP15.cpp index fd088b1f..ca5a17b6 100644 --- a/src/CP15.cpp +++ b/src/CP15.cpp @@ -18,6 +18,9 @@ #include #include +#if defined(__x86_64__) +#include +#endif #include "NDS.h" #include "DSi.h" #include "ARM.h" @@ -374,10 +377,24 @@ u32 ARMv5::ICacheLookup(const u32 addr) { const u32 tag = (addr & ~(ICACHE_LINELENGTH - 1)); const u32 id = ((addr >> ICACHE_LINELENGTH_LOG2) & (ICACHE_LINESPERSET-1)) << ICACHE_SETS_LOG2; + +#if defined(__x86_64__) + __m128i tags; memcpy(&tags, &ICacheTags[id], 16); + __m128i mask = _mm_set1_epi32(~(CACHE_FLAG_DIRTY_MASK | CACHE_FLAG_SET_MASK)); + __m128i cmp = _mm_set1_epi32(tag | CACHE_FLAG_VALID); + tags = _mm_and_si128(tags, mask); + cmp = _mm_cmpeq_epi32(tags, cmp); + u32 set = _mm_movemask_epi8(cmp); + if (!set) goto miss; + else set = (__builtin_ctz(set) >> 2); + + { +#else for (int set = 0; set < ICACHE_SETS; set++) { if ((ICacheTags[id+set] & ~(CACHE_FLAG_DIRTY_MASK | CACHE_FLAG_SET_MASK)) == (tag | CACHE_FLAG_VALID)) +#endif { u32 *cacheLine = (u32 *)&ICache[(id+set) << ICACHE_LINELENGTH_LOG2]; if (CP15BISTTestStateRegister & CP15_BIST_TR_DISABLE_ICACHE_STREAMING) [[unlikely]] @@ -403,7 +420,7 @@ u32 ARMv5::ICacheLookup(const u32 addr) } // cache miss - + miss: // We do not fill the cacheline if it is disabled in the // BIST test State register (See arm946e-s Rev 1 technical manual, 2.3.15 "Register 15, test State Register") if (CP15BISTTestStateRegister & CP15_BIST_TR_DISABLE_ICACHE_LINEFILL) [[unlikely]] 
@@ -528,9 +545,23 @@ u32 ARMv5::DCacheLookup(const u32 addr) const u32 tag = (addr & ~(DCACHE_LINELENGTH - 1)); const u32 id = ((addr >> DCACHE_LINELENGTH_LOG2) & (DCACHE_LINESPERSET-1)) << DCACHE_SETS_LOG2; +#if defined(__x86_64__) + __m128i tags; memcpy(&tags, &DCacheTags[id], 16); + __m128i mask = _mm_set1_epi32(~(CACHE_FLAG_DIRTY_MASK | CACHE_FLAG_SET_MASK)); + __m128i cmp = _mm_set1_epi32(tag | CACHE_FLAG_VALID); + tags = _mm_and_si128(tags, mask); + cmp = _mm_cmpeq_epi32(tags, cmp); + u32 set = _mm_movemask_epi8(cmp); + + if (!set) goto miss; + else set = (__builtin_ctz(set) >> 2); + + { +#else for (int set = 0; set < DCACHE_SETS; set++) { if ((DCacheTags[id+set] & ~(CACHE_FLAG_DIRTY_MASK | CACHE_FLAG_SET_MASK)) == (tag | CACHE_FLAG_VALID)) +#endif { u32 *cacheLine = (u32 *)&DCache[(id+set) << DCACHE_LINELENGTH_LOG2]; if (CP15BISTTestStateRegister & CP15_BIST_TR_DISABLE_DCACHE_STREAMING) [[unlikely]] @@ -550,7 +581,7 @@ u32 ARMv5::DCacheLookup(const u32 addr) } // cache miss - + miss: // We do not fill the cacheline if it is disabled in the // BIST test State register (See arm946e-s Rev 1 technical manual, 2.3.15 "Register 15, test State Register") if (CP15BISTTestStateRegister & CP15_BIST_TR_DISABLE_DCACHE_LINEFILL) [[unlikely]] @@ -632,10 +663,24 @@ bool ARMv5::DCacheWrite32(const u32 addr, const u32 val) const u32 id = ((addr >> DCACHE_LINELENGTH_LOG2) & (DCACHE_LINESPERSET-1)) << DCACHE_SETS_LOG2; //Log(LogLevel::Debug, "Cache write 32: %08lx <= %08lx\n", addr, val); + +#if defined(__x86_64__) + __m128i tags; memcpy(&tags, &DCacheTags[id], 16); + __m128i mask = _mm_set1_epi32(~(CACHE_FLAG_DIRTY_MASK | CACHE_FLAG_SET_MASK)); + __m128i cmp = _mm_set1_epi32(tag); + tags = _mm_and_si128(tags, mask); + cmp = _mm_cmpeq_epi32(tags, cmp); + u32 set = _mm_movemask_epi8(cmp); + if (!set) return false; + else set = (__builtin_ctz(set) >> 2); + + { +#else for (int set = 0; set < DCACHE_SETS; set++) { if ((DCacheTags[id+set] & ~(CACHE_FLAG_DIRTY_MASK | 
CACHE_FLAG_SET_MASK)) == tag) +#endif { u32 *cacheLine = (u32 *)&DCache[(id+set) << DCACHE_LINELENGTH_LOG2]; cacheLine[(addr & (DCACHE_LINELENGTH-1)) >> 2] = val; @@ -667,10 +712,24 @@ bool ARMv5::DCacheWrite16(const u32 addr, const u16 val) const u32 tag = (addr & ~(DCACHE_LINELENGTH - 1)) | CACHE_FLAG_VALID; const u32 id = ((addr >> DCACHE_LINELENGTH_LOG2) & (DCACHE_LINESPERSET-1)) << DCACHE_SETS_LOG2; //Log(LogLevel::Debug, "Cache write 16: %08lx <= %04x\n", addr, val); + +#if defined(__x86_64__) + __m128i tags; memcpy(&tags, &DCacheTags[id], 16); + __m128i mask = _mm_set1_epi32(~(CACHE_FLAG_DIRTY_MASK | CACHE_FLAG_SET_MASK)); + __m128i cmp = _mm_set1_epi32(tag); + tags = _mm_and_si128(tags, mask); + cmp = _mm_cmpeq_epi32(tags, cmp); + u32 set = _mm_movemask_epi8(cmp); + if (!set) return false; + else set = (__builtin_ctz(set) >> 2); + + { +#else for (int set = 0; set < DCACHE_SETS; set++) { if ((DCacheTags[id+set] & ~(CACHE_FLAG_DIRTY_MASK | CACHE_FLAG_SET_MASK)) == tag) +#endif { u16 *cacheLine = (u16 *)&DCache[(id+set) << DCACHE_LINELENGTH_LOG2]; cacheLine[(addr & (DCACHE_LINELENGTH-1)) >> 1] = val; @@ -703,10 +762,24 @@ bool ARMv5::DCacheWrite8(const u32 addr, const u8 val) const u32 id = ((addr >> DCACHE_LINELENGTH_LOG2) & (DCACHE_LINESPERSET-1)) << DCACHE_SETS_LOG2; //Log(LogLevel::Debug, "Cache write 8: %08lx <= %02x\n", addr, val); + +#if defined(__x86_64__) + __m128i tags; memcpy(&tags, &DCacheTags[id], 16); + __m128i mask = _mm_set1_epi32(~(CACHE_FLAG_DIRTY_MASK | CACHE_FLAG_SET_MASK)); + __m128i cmp = _mm_set1_epi32(tag); + tags = _mm_and_si128(tags, mask); + cmp = _mm_cmpeq_epi32(tags, cmp); + u32 set = _mm_movemask_epi8(cmp); + if (!set) return false; + else set = (__builtin_ctz(set) >> 2); + + { +#else for (int set = 0; set < DCACHE_SETS; set++) { if ((DCacheTags[id+set] & ~(CACHE_FLAG_DIRTY_MASK | CACHE_FLAG_SET_MASK)) == tag) +#endif { u8 *cacheLine = &DCache[(id+set) << DCACHE_LINELENGTH_LOG2]; cacheLine[addr & (DCACHE_LINELENGTH-1)] = val; @@ 
-738,10 +811,24 @@ void ARMv5::DCacheInvalidateByAddr(const u32 addr) { const u32 tag = (addr & ~(DCACHE_LINELENGTH - 1)) | CACHE_FLAG_VALID; const u32 id = ((addr >> DCACHE_LINELENGTH_LOG2) & (DCACHE_LINESPERSET-1)) << DCACHE_SETS_LOG2; + +#if defined(__x86_64__) + __m128i tags; memcpy(&tags, &DCacheTags[id], 16); + __m128i mask = _mm_set1_epi32(~(CACHE_FLAG_DIRTY_MASK | CACHE_FLAG_SET_MASK)); + __m128i cmp = _mm_set1_epi32(tag); + tags = _mm_and_si128(tags, mask); + cmp = _mm_cmpeq_epi32(tags, cmp); + u32 set = _mm_movemask_epi8(cmp); + if (!set) return; + else set = (__builtin_ctz(set) >> 2); + + { +#else for (int set = 0; set < DCACHE_SETS; set++) { if ((DCacheTags[id+set] & ~(CACHE_FLAG_DIRTY_MASK | CACHE_FLAG_SET_MASK)) == tag) +#endif { //Log(LogLevel::Debug,"DCache invalidated %08lx\n", addr & ~(ICACHE_LINELENGTH-1)); DCacheTags[id+set] &= ~CACHE_FLAG_VALID; From 801f43dfc5a1d3b57213837b72143703cd2c7d56 Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Sun, 13 Oct 2024 10:14:54 -0400 Subject: [PATCH 162/306] reimplement codemem i dont feel like i actually had a good reason for disabling this... 
--- src/ARM.cpp | 4 ++-- src/CP15.cpp | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/ARM.cpp b/src/ARM.cpp index 8c74a248..9e768047 100644 --- a/src/ARM.cpp +++ b/src/ARM.cpp @@ -327,7 +327,7 @@ void ARMv5::JumpTo(u32 addr, bool restorecpsr) addr &= ~0x1; R[15] = addr+2; - //if (newregion != oldregion) SetupCodeMem(addr); + if (newregion != oldregion) SetupCodeMem(addr); // two-opcodes-at-once fetch // doesn't matter if we put garbage in the MSbs there @@ -352,7 +352,7 @@ void ARMv5::JumpTo(u32 addr, bool restorecpsr) addr &= ~0x3; R[15] = addr+4; - //if (newregion != oldregion) SetupCodeMem(addr); + if (newregion != oldregion) SetupCodeMem(addr); NextInstr[0] = CodeRead32(addr, true); Cycles += CodeCycles; diff --git a/src/CP15.cpp b/src/CP15.cpp index b1ed52a5..9ac9d568 100644 --- a/src/CP15.cpp +++ b/src/CP15.cpp @@ -979,8 +979,8 @@ u32 ARMv5::CodeRead32(u32 addr, bool branch) NDS.ARM9Timestamp += CodeCycles; if (NDS.ARM9Timestamp < TimestampActual) NDS.ARM9Timestamp = TimestampActual; - //if (CodeMem.Mem) return *(u32*)&CodeMem.Mem[addr & CodeMem.Mask]; DataRegion = Mem9_Null; + if (CodeMem.Mem) return *(u32*)&CodeMem.Mem[addr & CodeMem.Mask]; return BusRead32(addr); } From 026719acef269f17e0f44d3efd2da71453001740 Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Mon, 14 Oct 2024 20:05:06 -0400 Subject: [PATCH 163/306] improve timing model --- src/ARM.cpp | 1 + src/ARM.h | 21 ++-- src/CP15.cpp | 263 +++++++++++++++++++++++---------------------------- 3 files changed, 135 insertions(+), 150 deletions(-) diff --git a/src/ARM.cpp b/src/ARM.cpp index 9e768047..b783431c 100644 --- a/src/ARM.cpp +++ b/src/ARM.cpp @@ -211,6 +211,7 @@ void ARMv5::Reset() WBWritePointer = 16; WBFillPointer = 0; WBDelay = 0; + WBWriting = false; ARM::Reset(); } diff --git a/src/ARM.h b/src/ARM.h index 16f4b72a..31044108 100644 --- a/src/ARM.h +++ b/src/ARM.h @@ -317,6 +317,7 @@ public: void ICacheInvalidateByAddr(u32 addr); 
void ICacheInvalidateAll(); + template inline bool WriteBufferHandle(); void WriteBufferCheck(); void WriteBufferWrite(u32 val, u8 flag, u8 cycles, u32 addr = 0); void WriteBufferDrain(); @@ -375,13 +376,19 @@ public: bool Store; u16 InterlockMask; - u8 WBWritePointer; - u8 WBFillPointer; - u64 WBDelay; - u32 WBAddr; // current working address for the write buffer - u32 storeaddr[16]; // debugging - u64 WBCycles[16]; // timestamp each write will complete - u64 WriteBufferFifo[16]; // 0-31: value | 62-63: 0 byte, 1 half, 2 word, 3 addr + + u8 WBWritePointer; // which entry to attempt to write next; should always be ANDed with 0xF after incrementing + u8 WBFillPointer; // where the next entry should be added; should always be ANDed with 0xF after incrementing + bool WBWriting; // whether the buffer is actively trying to perform a write + u8 WBCurCycles; // how long the current write will take; bit 7 is a flag used to indicate main ram + u64 WBCurVal; // current value being written; 0-31: val | 62-32: flag; 0 = byte; 1 = halfword; 2 = word; 3 = address (invalid in this variable) + u32 WBCurAddr; // address the write buffer is currently writing to + u32 storeaddr[16]; // temp until i figure out why using the fifo address entries directly didn't work + u8 WBCycles[16]; // num cycles for each write; bit 7 is a flag used to indicate main ram + u64 WriteBufferFifo[16]; // 0-31: val | 62-32: flag; 0 = byte; 1 = halfword; 2 = word; 3 = address + u64 WBTimestamp; // current timestamp in bus cycles + u64 WBMainRAMDelay; // timestamp in bus cycles used to emulate the delay before the next main ram write can begin + u64 WBDelay; // timestamp in bus cycles use for the delay before next write to the write buffer can occur (seems to be a 1 cycle delay after a write to it) #ifdef GDBSTUB_ENABLED u32 ReadMem(u32 addr, int size) override; diff --git a/src/CP15.cpp b/src/CP15.cpp index 9ac9d568..14b87c73 100644 --- a/src/CP15.cpp +++ b/src/CP15.cpp @@ -438,159 +438,109 @@ void 
ARMv5::ICacheInvalidateAll() ICacheTags[i] = 1; } +template +inline bool ARMv5::WriteBufferHandle() +{ + // handle write buffer writes + if (WBWriting) + { + bool mainram = (WBCurCycles >= 0x80); + + u64 ts; + if (mainram) ts = std::max(WBTimestamp, WBMainRAMDelay) + (WBCurCycles & 0x7F); + else ts = WBTimestamp + (WBCurCycles & 0x7F); + + if (!force && ts > ((NDS.ARM9Timestamp + DataCycles) >> NDS.ARM9ClockShift)) return true; + if ( force && ts > ((NDS.ARM9Timestamp + DataCycles) >> NDS.ARM9ClockShift)) + { + NDS.ARM9Timestamp = ((ts - 1) << NDS.ARM9ClockShift) + 1; + DataCycles = 0; // checkme + } + + WBTimestamp = ts; + if (mainram) WBMainRAMDelay = WBTimestamp + 2; + + switch (WBCurVal >> 62) + { + case 0: // byte + BusWrite8 (WBCurAddr, WBCurVal); + break; + case 1: // halfword + BusWrite16(WBCurAddr, WBCurVal); + break; + case 2: // word + BusWrite32(WBCurAddr, WBCurVal); + break; + default: // address ie. invalid + Platform::Log(Platform::LogLevel::Warn, "WHY ARE WE TRYING TO WRITE AN ADDRESS VIA THE WRITE BUFFER! 
PANIC!!!\n", (u8)(WBCurVal >> 62)); + break; + } + + //printf("writing: adr: %i, val: %lli, cyl: %i", WBCurAddr, WBCurVal, WBCurCycles); + WBWriting = false; + } + + // check if write buffer is empty + if (WBWritePointer == 16) return true; + // attempt to drain write buffer + if ((WriteBufferFifo[WBWritePointer] >> 62) != 3) // not an address + { + if (WBCycles[WBWritePointer] >= 0x80) // main ram handling + { + u64 ts = ((NDS.ARM9Timestamp + DataCycles) >> NDS.ARM9ClockShift); + if (!force && (WBMainRAMDelay > ts)) return true; + if ( force && (WBMainRAMDelay > ts)) + { + NDS.ARM9Timestamp = ((WBMainRAMDelay - 1) << NDS.ARM9ClockShift) + 1; + DataCycles = 0; + } + } + + WBCurVal = WriteBufferFifo[WBWritePointer]; + WBCurCycles = WBCycles[WBWritePointer]; + WBCurAddr = storeaddr[WBWritePointer]; + WBWriting = true; + } + else + { + // todo: i want to set the address here instead + } + + WBWritePointer = (WBWritePointer + 1) & 0xF; + if (WBWritePointer == WBFillPointer) + { + WBWritePointer = 16; + WBFillPointer = 0; + } + return false; +} + void ARMv5::WriteBufferCheck() { - if (WBWritePointer == 16) return; - - while (WBCycles[WBWritePointer] <= (NDS.ARM9Timestamp + DataCycles)) - { - //printf("drainingwb %lli, %i %08X %i\n", WBCycles[WBWritePointer], WBWritePointer, WBAddr, WriteBufferFifo[WBWritePointer] >> 62); - switch ((u64)WriteBufferFifo[WBWritePointer] >> 62) - { - case 0: // byte - { - u8 val = WriteBufferFifo[WBWritePointer] & 0xFF; - BusWrite8(storeaddr[WBWritePointer], val); - break; - } - case 1: // halfword - { - u16 val = WriteBufferFifo[WBWritePointer] & 0xFFFF; - BusWrite16(storeaddr[WBWritePointer], val); - break; - } - case 2: // word - { - u32 val = WriteBufferFifo[WBWritePointer] & 0xFFFFFFFF; - BusWrite32(storeaddr[WBWritePointer], val); - WBAddr += 4; - break; - } - case 3: // address update - WBAddr = WriteBufferFifo[WBWritePointer] & 0xFFFFFFFF; - break; - } - - WBWritePointer = (WBWritePointer + 1) & 0xF; - if (WBWritePointer == 
WBFillPointer) - { - WBWritePointer = 16; - WBFillPointer = 0; - break; - } - } + while (!WriteBufferHandle()); // loop until we've cleared out all writeable entries } void ARMv5::WriteBufferWrite(u32 val, u8 flag, u8 cycles, u32 addr) { WriteBufferCheck(); - if (WBFillPointer == WBWritePointer) + if (WBFillPointer == WBWritePointer) // if the write buffer is full then we stall the cpu until room is made + WriteBufferHandle(); + else if (WBWritePointer == 16) // indicates empty write buffer { - //printf("forcedrainingwb %lli, %i %08X %i\n", WBCycles[WBWritePointer], WBWritePointer, WBAddr, WriteBufferFifo[WBWritePointer] >> 62); - if (NDS.ARM9Timestamp < WBCycles[WBWritePointer]) - { - NDS.ARM9Timestamp = WBCycles[WBWritePointer]; - DataCycles = 0; // checkme - } - - switch ((u64)WriteBufferFifo[WBWritePointer] >> 62) - { - case 0: // byte - { - u8 val = WriteBufferFifo[WBWritePointer] & 0xFF; - BusWrite8(storeaddr[WBWritePointer], val); - break; - } - case 1: // halfword - { - u16 val = WriteBufferFifo[WBWritePointer] & 0xFFFF; - BusWrite16(storeaddr[WBWritePointer], val); - break; - } - case 2: // word - { - u32 val = WriteBufferFifo[WBWritePointer] & 0xFFFFFFFF; - BusWrite32(storeaddr[WBWritePointer], val); - WBAddr += 4; - break; - } - case 3: // address update - WBAddr = WriteBufferFifo[WBWritePointer] & 0xFFFFFFFF; - break; - } - - WBWritePointer = (WBWritePointer + 1) & 0xF; - if (WBWritePointer == WBFillPointer) - { - WBWritePointer = 16; - WBFillPointer = 0; - } - } - - //printf("fillingwb %lli %i %i %08X %i\n", NDS.ARM9Timestamp, WBWritePointer, WBFillPointer, val, flag); - if (WBWritePointer == 16) - { - WBCycles[WBFillPointer] = NDS.ARM9Timestamp + DataCycles + cycles; WBWritePointer = 0; + WBTimestamp = (((NDS.ARM9Timestamp + DataCycles) + ((1<> NDS.ARM9ClockShift; } - else - { - WBCycles[WBFillPointer] = WBCycles[(WBFillPointer-1) & 0xF] + cycles; - } + WriteBufferFifo[WBFillPointer] = val | (u64)flag << 62; + WBCycles[WBFillPointer] = cycles; 
storeaddr[WBFillPointer] = addr; WBFillPointer = (WBFillPointer + 1) & 0xF; } void ARMv5::WriteBufferDrain() { - if (WBWritePointer == 16) return; - - while (true) - { - //printf("fullydrainingwb %lli, %i %08X %i\n", WBCycles[WBWritePointer], WBWritePointer, WBAddr, WriteBufferFifo[WBWritePointer] >> 62); - if (NDS.ARM9Timestamp < WBCycles[WBWritePointer]) - { - NDS.ARM9Timestamp = WBCycles[WBWritePointer]; - DataCycles = 0; // checkme - } - - switch (WriteBufferFifo[WBWritePointer] >> 62) - { - case 0: // byte - { - u8 val = WriteBufferFifo[WBWritePointer] & 0xFF; - BusWrite8(storeaddr[WBWritePointer], val); - break; - } - case 1: // halfword - { - u16 val = WriteBufferFifo[WBWritePointer] & 0xFFFF; - BusWrite16(storeaddr[WBWritePointer], val); - break; - } - case 2: // word - { - u32 val = WriteBufferFifo[WBWritePointer] & 0xFFFFFFFF; - BusWrite32(storeaddr[WBWritePointer], val); - WBAddr += 4; - break; - } - case 3: // address update - WBAddr = WriteBufferFifo[WBWritePointer] & 0xFFFFFFFF; - break; - } - - WBWritePointer = (WBWritePointer + 1) & 0xF; - if (WBWritePointer == WBFillPointer) - { - WBWritePointer = 16; - WBFillPointer = 0; - break; - } - } - //printf("wbdrained\n"); + while (!WriteBufferHandle()); // loop until drained fully } void ARMv5::CP15Write(u32 id, u32 val) @@ -1224,9 +1174,16 @@ bool ARMv5::DataWrite8(u32 addr, u8 val) else { if (WBDelay > NDS.ARM9Timestamp) NDS.ARM9Timestamp = WBDelay; + + u8 cycles = NDS.ARM9MemTimings[addr>>14][0]; + if ((addr >> 24) == 0x02) + { + cycles = (cycles - 2) & 0x80; + } + + WriteBufferWrite(addr, 3, 0); + WriteBufferWrite(val, 0, cycles, addr); DataCycles = 1; - WriteBufferWrite(addr, 3, 1); - WriteBufferWrite(val, 0, MemTimings[addr >> 12][1], addr); WBDelay = NDS.ARM9Timestamp + 2; } return true; @@ -1284,9 +1241,16 @@ bool ARMv5::DataWrite16(u32 addr, u16 val) else { if (WBDelay > NDS.ARM9Timestamp) NDS.ARM9Timestamp = WBDelay; + + u8 cycles = NDS.ARM9MemTimings[addr>>14][0]; + if ((addr >> 24) == 0x02) + 
{ + cycles = (cycles - 2) & 0x80; + } + + WriteBufferWrite(addr, 3, 0); + WriteBufferWrite(val, 1, cycles, addr); DataCycles = 1; - WriteBufferWrite(addr, 3, 1); - WriteBufferWrite(val, 1, MemTimings[addr >> 12][1], addr); WBDelay = NDS.ARM9Timestamp + 2; } return true; @@ -1344,9 +1308,16 @@ bool ARMv5::DataWrite32(u32 addr, u32 val) else { if (WBDelay > NDS.ARM9Timestamp) NDS.ARM9Timestamp = WBDelay; + + u8 cycles = NDS.ARM9MemTimings[addr>>14][2]; + if ((addr >> 24) == 0x02) + { + cycles = (cycles - 2) & 0x80; + } + + WriteBufferWrite(addr, 3, 0); + WriteBufferWrite(val, 2, cycles, addr); DataCycles = 1; - WriteBufferWrite(addr, 3, 1); - WriteBufferWrite(val, 2, MemTimings[addr >> 12][2], addr); WBDelay = NDS.ARM9Timestamp + 2; } return true; @@ -1385,7 +1356,7 @@ bool ARMv5::DataWrite32S(u32 addr, u32 val) if (!(PU_Map[addr>>12] & 0x30)) { - DataCycles += (((NDS.ARM9Timestamp + DataCycles) + ((1<>14][3]; + if ((addr >> 24) == 0x02) + { + cycles = (cycles - 2) & 0x80; + } + + WriteBufferWrite(val, 2, cycles, addr); DataCycles += 1; - WriteBufferWrite(val, 2, MemTimings[addr >> 12][3], addr); WBDelay = NDS.ARM9Timestamp + DataCycles + 1; } return true; From ca7d938bb137013beb1bbfbc0c0083e2fb51f9d4 Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Mon, 14 Oct 2024 20:18:23 -0400 Subject: [PATCH 164/306] update for new write buffer implementation --- src/CP15.cpp | 72 +++++++++++++++++++--------------------------------- 1 file changed, 26 insertions(+), 46 deletions(-) diff --git a/src/CP15.cpp b/src/CP15.cpp index 4f4bc34c..3d4e8a4e 100644 --- a/src/CP15.cpp +++ b/src/CP15.cpp @@ -896,56 +896,36 @@ void ARMv5::DCacheClearByASetAndWay(const u8 cacheSet, const u8 cacheLine) if (DCacheTags[index] & CACHE_FLAG_DIRTY_LOWERHALF) { - WriteBufferWrite(tag, 3, 1); - WriteBufferWrite(ptr[0], 2, MemTimings[tag >> 12][2], tag+0x00); - WriteBufferWrite(ptr[1], 2, MemTimings[tag >> 12][3], tag+0x04); - WriteBufferWrite(ptr[2], 2, 
MemTimings[tag >> 12][3], tag+0x08); - WriteBufferWrite(ptr[3], 2, MemTimings[tag >> 12][3], tag+0x0C); + if (WBDelay > NDS.ARM9Timestamp) NDS.ARM9Timestamp = WBDelay; + + u8 cyclesn = NDS.ARM9MemTimings[tag>>14][2]; + if ((tag >> 24) == 0x02) cyclesn = (cyclesn - 2) & 0x80; + + u8 cycless = NDS.ARM9MemTimings[tag>>14][3]; + if ((tag >> 24) == 0x02) cycless = (cycless - 2) & 0x80; + + WriteBufferWrite(tag, 3, 0); + WriteBufferWrite(ptr[0], 2, cyclesn, tag+0x00); + WriteBufferWrite(ptr[1], 2, cycless, tag+0x04); + WriteBufferWrite(ptr[2], 2, cycless, tag+0x08); + WriteBufferWrite(ptr[3], 2, cycless, tag+0x0C); DataCycles += 5; - /*//Log(LogLevel::Debug, "Writing back %i / %i, lower half -> %08lx\n", cacheSet, cacheLine, tag); - for (int i = 0; i < DCACHE_LINELENGTH / 2; i+=sizeof(u32)) - { - //Log(LogLevel::Debug, " WB Value %08x\n", ptr[i >> 2]); - if (tag+i < ITCMSize) - { - *(u32*)&ITCM[(tag+i) & (ITCMPhysicalSize - 1)] = ptr[i >> 2]; - NDS.JIT.CheckAndInvalidate<0, ARMJIT_Memory::memregion_ITCM>(tag+i); - } else - if (((tag+i) & DTCMMask) == DTCMBase) - { - *(u32*)&DTCM[(tag+i) & (DTCMPhysicalSize - 1)] = ptr[i >> 2]; - } else - { - BusWrite32(tag+i, ptr[i >> 2]); - } - } - DataCycles += (NDS.ARM9MemTimings[tag >> 14][2] + (NDS.ARM9MemTimings[tag >> 14][3] * ((DCACHE_LINELENGTH / 8) - 1))) << NDS.ARM9ClockShift;*/ } if (DCacheTags[index] & CACHE_FLAG_DIRTY_UPPERHALF) { - //Log(LogLevel::Debug, "Writing back %i / %i, upper half-> %08lx\n", cacheSet, cacheLine, tag); - /*for (int i = DCACHE_LINELENGTH / 2; i < DCACHE_LINELENGTH; i+=sizeof(u32)) - { - //Log(LogLevel::Debug, " WB Value %08x\n", ptr[i >> 2]); - if (tag+i < ITCMSize) - { - *(u32*)&ITCM[(tag+i) & (ITCMPhysicalSize - 1)] = ptr[i >> 2]; - NDS.JIT.CheckAndInvalidate<0, ARMJIT_Memory::memregion_ITCM>(tag+i); - } else - if (((tag+i) & DTCMMask) == DTCMBase) - { - *(u32*)&DTCM[(tag+i) & (DTCMPhysicalSize - 1)] = ptr[i >> 2]; - } else - { - BusWrite32(tag+i, ptr[i >> 2]); - } - } - DataCycles += 
(NDS.ARM9MemTimings[tag >> 14][2] + (NDS.ARM9MemTimings[tag >> 14][3] * ((DCACHE_LINELENGTH / 8) - 1))) << NDS.ARM9ClockShift;*/ - WriteBufferWrite(tag+0x10, 3, 1); - WriteBufferWrite(ptr[4], 2, MemTimings[tag >> 12][2], tag+0x10); - WriteBufferWrite(ptr[5], 2, MemTimings[tag >> 12][3], tag+0x14); - WriteBufferWrite(ptr[6], 2, MemTimings[tag >> 12][3], tag+0x18); - WriteBufferWrite(ptr[7], 2, MemTimings[tag >> 12][3], tag+0x1C); + if (WBDelay > NDS.ARM9Timestamp) NDS.ARM9Timestamp = WBDelay; + + u8 cyclesn = NDS.ARM9MemTimings[tag>>14][2]; + if ((tag >> 24) == 0x02) cyclesn = (cyclesn - 2) & 0x80; + + u8 cycless = NDS.ARM9MemTimings[tag>>14][3]; + if ((tag >> 24) == 0x02) cycless = (cycless - 2) & 0x80; + + WriteBufferWrite(tag+0x10, 3, 0); + WriteBufferWrite(ptr[4], 2, cyclesn, tag+0x10); + WriteBufferWrite(ptr[5], 2, cycless, tag+0x14); + WriteBufferWrite(ptr[6], 2, cycless, tag+0x18); + WriteBufferWrite(ptr[7], 2, cycless, tag+0x1C); DataCycles += 5; } DCacheTags[index] &= ~(CACHE_FLAG_DIRTY_LOWERHALF | CACHE_FLAG_DIRTY_UPPERHALF); From 263dd20ec3d0d495b9e233ff25eae66e12b5d3a5 Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Mon, 14 Oct 2024 22:48:25 -0400 Subject: [PATCH 165/306] nvmnvmnvm --- src/CP15.cpp | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/src/CP15.cpp b/src/CP15.cpp index 14b87c73..7d8ff01a 100644 --- a/src/CP15.cpp +++ b/src/CP15.cpp @@ -915,8 +915,9 @@ u32 ARMv5::CodeRead32(u32 addr, bool branch) //return *(u32*)&CurICacheLine[addr & 0x1C]; } - - WriteBufferDrain(); + + if (PU_Map[addr>>12] & 0x30) + WriteBufferDrain(); NDS.ARM9Timestamp = NDS.ARM9Timestamp + ((1<>12] & 0x30) + WriteBufferDrain(); NDS.ARM9Timestamp = NDS.ARM9Timestamp + ((1<>12] & 0x30) + WriteBufferDrain(); NDS.ARM9Timestamp = NDS.ARM9Timestamp + ((1<>12] & 0x30) + WriteBufferDrain(); NDS.ARM9Timestamp = NDS.ARM9Timestamp + ((1<>12] & 0x30) + WriteBufferDrain(); NDS.ARM9Timestamp += DataCycles; 
From d8d2fcd94a51b2739abaadfe4a87a22c72952b3e Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Mon, 14 Oct 2024 23:43:11 -0400 Subject: [PATCH 166/306] more optimizations --- src/ARM.cpp | 2 +- src/ARM.h | 2 +- src/CP15.cpp | 146 ++++++++++++++++++++------------------------------- 3 files changed, 58 insertions(+), 92 deletions(-) diff --git a/src/ARM.cpp b/src/ARM.cpp index b783431c..e90fee28 100644 --- a/src/ARM.cpp +++ b/src/ARM.cpp @@ -269,7 +269,7 @@ void ARM::DoSavestate(Savestate* file) if (!Num) { SetupCodeMem(R[15]); // should fix it - ((ARMv5*)this)->RegionCodeCycles = ((ARMv5*)this)->MemTimings[R[15] >> 12][0]; + ((ARMv5*)this)->RegionCodeCycles = ((ARMv5*)this)->MemTimings[R[15] >> 12][2]; if ((CPSR & 0x1F) == 0x10) ((ARMv5*)this)->PU_Map = ((ARMv5*)this)->PU_UserMap; diff --git a/src/ARM.h b/src/ARM.h index 57908573..dbf2d9cb 100644 --- a/src/ARM.h +++ b/src/ARM.h @@ -670,7 +670,7 @@ public: u8* PU_Map; //! Current valid Region Mapping (is either @ref PU_PrivMap or PU_UserMap) // code/16N/32N/32S - u8 MemTimings[CP15_MAP_ENTRYCOUNT][4]; + u8 MemTimings[CP15_MAP_ENTRYCOUNT][3]; bool (*GetMemRegion)(u32 addr, bool write, MemRegion* region); diff --git a/src/CP15.cpp b/src/CP15.cpp index 3d4e8a4e..bb2e1045 100644 --- a/src/CP15.cpp +++ b/src/CP15.cpp @@ -338,26 +338,26 @@ void ARMv5::UpdateRegionTimings(u32 addrstart, u32 addrend) // checkme: should these be (bus timings shifted) - 1 or ((bustimings - 1) shifted) + 1 // should the last cycle be halved...? 
- if (pu & CP15_MAP_ICACHEABLE) + /*if (pu & CP15_MAP_ICACHEABLE) { MemTimings[i][0] = 0xFF;//kCodeCacheTiming; } else { MemTimings[i][0] = ((bustimings[2] - 1) << NDS.ARM9ClockShift) + 1; - } - + }*/ + /* if (pu & CP15_MAP_DCACHEABLE) { MemTimings[i][1] = kDataCacheTiming; MemTimings[i][2] = kDataCacheTiming; MemTimings[i][3] = 1; } - else + else*/ { - MemTimings[i][1] = ((bustimings[0] - 1) << NDS.ARM9ClockShift) + 1; - MemTimings[i][2] = ((bustimings[2] - 1) << NDS.ARM9ClockShift) + 1; - MemTimings[i][3] = bustimings[3] << NDS.ARM9ClockShift; // inaccurate but ehgh + MemTimings[i][0] = ((bustimings[0] - 1) << NDS.ARM9ClockShift) + 1; + MemTimings[i][1] = ((bustimings[2] - 1) << NDS.ARM9ClockShift) + 1; + MemTimings[i][2] = bustimings[3] << NDS.ARM9ClockShift; // inaccurate but ehgh } } } @@ -1784,27 +1784,17 @@ u32 ARMv5::CodeRead32(u32 addr, bool branch) #if !DISABLE_ICACHE #ifdef JIT_ENABLED - if (!NDS.IsJITEnabled()) + //if (!NDS.IsJITEnabled()) #endif { - if (CP15Control & CP15_CACHE_CR_ICACHEENABLE) + if (IsAddressICachable(addr)) { - if (IsAddressICachable(addr)) - { - return ICacheLookup(addr); - } + return ICacheLookup(addr); } - } #endif + } - CodeCycles = MemTimings[addr >> 12][0]; - if (CodeCycles == 0xFF) // cached memory. 
hax - { - if (branch || !(addr & (ICACHE_LINELENGTH-1))) - CodeCycles = kCodeCacheTiming;//ICacheLookup(addr); - else - CodeCycles = 1; - } + CodeCycles = MemTimings[addr >> 12][1]; WriteBufferDrain(); @@ -1854,17 +1844,14 @@ bool ARMv5::DataRead8(u32 addr, u32* val) #if !DISABLE_DCACHE #ifdef JIT_ENABLED - if (!NDS.IsJITEnabled()) + //if (!NDS.IsJITEnabled()) #endif { - if (CP15Control & CP15_CACHE_CR_DCACHEENABLE) + if (IsAddressDCachable(addr)) { - if (IsAddressDCachable(addr)) - { - DataCycles = 0; - *val = (DCacheLookup(addr) >> (8 * (addr & 3))) & 0xff; - return true; - } + DataCycles = 0; + *val = (DCacheLookup(addr) >> (8 * (addr & 3))) & 0xff; + return true; } } #endif @@ -1873,7 +1860,7 @@ bool ARMv5::DataRead8(u32 addr, u32* val) NDS.ARM9Timestamp = NDS.ARM9Timestamp + ((1<> 12][1]; + DataCycles = MemTimings[addr >> 12][0]; if ((addr >> 24) == 0x02) { @@ -1918,17 +1905,14 @@ bool ARMv5::DataRead16(u32 addr, u32* val) #if !DISABLE_DCACHE #ifdef JIT_ENABLED - if (!NDS.IsJITEnabled()) + //if (!NDS.IsJITEnabled()) #endif { - if (CP15Control & CP15_CACHE_CR_DCACHEENABLE) + if (IsAddressDCachable(addr)) { - if (IsAddressDCachable(addr)) - { - DataCycles = 0; - *val = (DCacheLookup(addr) >> (8* (addr & 2))) & 0xffff; - return true; - } + DataCycles = 0; + *val = (DCacheLookup(addr) >> (8* (addr & 2))) & 0xffff; + return true; } } #endif @@ -1937,7 +1921,7 @@ bool ARMv5::DataRead16(u32 addr, u32* val) NDS.ARM9Timestamp = NDS.ARM9Timestamp + ((1<> 12][1]; + DataCycles = MemTimings[addr >> 12][0]; if ((addr >> 24) == 0x02) { @@ -1982,17 +1966,14 @@ bool ARMv5::DataRead32(u32 addr, u32* val) #if !DISABLE_DCACHE #ifdef JIT_ENABLED - if (!NDS.IsJITEnabled()) + //if (!NDS.IsJITEnabled()) #endif { - if (CP15Control & CP15_CACHE_CR_DCACHEENABLE) + if (IsAddressDCachable(addr)) { - if (IsAddressDCachable(addr)) - { - DataCycles = 0; - *val = DCacheLookup(addr); - return true; - } + DataCycles = 0; + *val = DCacheLookup(addr); + return true; } } #endif @@ -2001,7 +1982,7 
@@ bool ARMv5::DataRead32(u32 addr, u32* val) NDS.ARM9Timestamp = NDS.ARM9Timestamp + ((1<> 12][2]; + DataCycles = MemTimings[addr >> 12][1]; if ((addr >> 24) == 0x02) { @@ -2045,16 +2026,13 @@ bool ARMv5::DataRead32S(u32 addr, u32* val) #if !DISABLE_DCACHE #ifdef JIT_ENABLED - if (!NDS.IsJITEnabled()) + //if (!NDS.IsJITEnabled()) #endif { - if (CP15Control & CP15_CACHE_CR_DCACHEENABLE) + if (IsAddressDCachable(addr)) { - if (IsAddressDCachable(addr)) - { - *val = DCacheLookup(addr); - return true; - } + *val = DCacheLookup(addr); + return true; } } #endif @@ -2065,7 +2043,7 @@ bool ARMv5::DataRead32S(u32 addr, u32* val) if (!(addr & 0x3FF)) return DataRead32(addr, val); // bursts cannot cross a 1kb boundary - DataCycles = MemTimings[addr >> 12][3]; + DataCycles = MemTimings[addr >> 12][2]; NDS.ARM9Timestamp = NDS.ARM9Timestamp + ((1<> 12][1]; + DataCycles = MemTimings[addr >> 12][0]; if ((addr >> 24) == 0x02) { @@ -2196,16 +2171,13 @@ bool ARMv5::DataWrite16(u32 addr, u16 val) #if !DISABLE_DCACHE #ifdef JIT_ENABLED - if (!NDS.IsJITEnabled()) + //if (!NDS.IsJITEnabled()) #endif { - if (CP15Control & CP15_CACHE_CR_DCACHEENABLE) + if (IsAddressDCachable(addr)) { - if (IsAddressDCachable(addr)) - { - if (DCacheWrite16(addr, val)) - return true; - } + if (DCacheWrite16(addr, val)) + return true; } } #endif @@ -2214,7 +2186,7 @@ bool ARMv5::DataWrite16(u32 addr, u16 val) { NDS.ARM9Timestamp = NDS.ARM9Timestamp + ((1<> 12][1]; + DataCycles = MemTimings[addr >> 12][0]; if ((addr >> 24) == 0x02) { @@ -2279,17 +2251,14 @@ bool ARMv5::DataWrite32(u32 addr, u32 val) #if !DISABLE_DCACHE #ifdef JIT_ENABLED - if (!NDS.IsJITEnabled()) + //if (!NDS.IsJITEnabled()) #endif { - if (CP15Control & CP15_CACHE_CR_DCACHEENABLE) + if (IsAddressDCachable(addr)) { - if (IsAddressDCachable(addr)) - { - DataCycles = 0; - if (DCacheWrite32(addr, val)) - return true; - } + DataCycles = 0; + if (DCacheWrite32(addr, val)) + return true; } } #endif @@ -2298,7 +2267,7 @@ bool ARMv5::DataWrite32(u32 
addr, u32 val) { NDS.ARM9Timestamp = NDS.ARM9Timestamp + ((1<> 12][2]; + DataCycles = MemTimings[addr >> 12][1]; if ((addr >> 24) == 0x02) { @@ -2362,16 +2331,13 @@ bool ARMv5::DataWrite32S(u32 addr, u32 val) #if !DISABLE_DCACHE #ifdef JIT_ENABLED - if (!NDS.IsJITEnabled()) + //if (!NDS.IsJITEnabled()) #endif { - if (CP15Control & CP15_CACHE_CR_DCACHEENABLE) + if (IsAddressDCachable(addr)) { - if (IsAddressDCachable(addr)) - { - if (DCacheWrite32(addr, val)) - return true; - } + if (DCacheWrite32(addr, val)) + return true; } } #endif @@ -2391,7 +2357,7 @@ bool ARMv5::DataWrite32S(u32 addr, u32 val) else DataRegion = NDS.ARM9Regions[addr>>14]; BusWrite32(addr, val); - DataCycles += MemTimings[addr >> 12][3]; + DataCycles += MemTimings[addr >> 12][2]; } else { From d476593eec26f29da7d2cfda61c0e3a386f36e2b Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Tue, 15 Oct 2024 12:32:13 -0400 Subject: [PATCH 167/306] add notes --- src/CP15.cpp | 116 +++++++++++++++++++++++++++++---------------------- 1 file changed, 67 insertions(+), 49 deletions(-) diff --git a/src/CP15.cpp b/src/CP15.cpp index bb2e1045..309c6d01 100644 --- a/src/CP15.cpp +++ b/src/CP15.cpp @@ -379,18 +379,21 @@ u32 ARMv5::ICacheLookup(const u32 addr) const u32 id = ((addr >> ICACHE_LINELENGTH_LOG2) & (ICACHE_LINESPERSET-1)) << ICACHE_SETS_LOG2; #if defined(__x86_64__) - __m128i tags; memcpy(&tags, &ICacheTags[id], 16); - __m128i mask = _mm_set1_epi32(~(CACHE_FLAG_DIRTY_MASK | CACHE_FLAG_SET_MASK)); - __m128i cmp = _mm_set1_epi32(tag | CACHE_FLAG_VALID); - tags = _mm_and_si128(tags, mask); - cmp = _mm_cmpeq_epi32(tags, cmp); - u32 set = _mm_movemask_epi8(cmp); + // we use sse here to greatly speed up checking for valid sets vs the fallback for loop - if (!set) goto miss; - else set = (__builtin_ctz(set) >> 2); + __m128i tags; memcpy(&tags, &ICacheTags[id], 16); // load the tags for all 4 sets, one for each 32 bits + __m128i mask = _mm_set1_epi32(~(CACHE_FLAG_DIRTY_MASK 
| CACHE_FLAG_SET_MASK)); // load copies of the mask into each 32 bits + __m128i cmp = _mm_set1_epi32(tag | CACHE_FLAG_VALID); // load the tag we're checking for into each 32 bit + tags = _mm_and_si128(tags, mask); // mask out the bits we dont want to check for + cmp = _mm_cmpeq_epi32(tags, cmp); // compare to see if any bits match + u32 set = _mm_movemask_epi8(cmp); // move the 8 msb of each field into a single 32 bit integer + + if (!set) goto miss; // check if none of them were a match + else set = (__builtin_ctz(set) >> 2); // count trailing zeros and right shift to figure out which set had a match { #else + // fallback for loop; slow for (int set = 0; set < ICACHE_SETS; set++) { if ((ICacheTags[id+set] & ~(CACHE_FLAG_DIRTY_MASK | CACHE_FLAG_SET_MASK)) == (tag | CACHE_FLAG_VALID)) @@ -546,18 +549,21 @@ u32 ARMv5::DCacheLookup(const u32 addr) const u32 id = ((addr >> DCACHE_LINELENGTH_LOG2) & (DCACHE_LINESPERSET-1)) << DCACHE_SETS_LOG2; #if defined(__x86_64__) - __m128i tags; memcpy(&tags, &DCacheTags[id], 16); - __m128i mask = _mm_set1_epi32(~(CACHE_FLAG_DIRTY_MASK | CACHE_FLAG_SET_MASK)); - __m128i cmp = _mm_set1_epi32(tag | CACHE_FLAG_VALID); - tags = _mm_and_si128(tags, mask); - cmp = _mm_cmpeq_epi32(tags, cmp); - u32 set = _mm_movemask_epi8(cmp); + // we use sse here to greatly speed up checking for valid sets vs the fallback for loop - if (!set) goto miss; - else set = (__builtin_ctz(set) >> 2); + __m128i tags; memcpy(&tags, &DCacheTags[id], 16); // load the tags for all 4 sets, one for each 32 bits + __m128i mask = _mm_set1_epi32(~(CACHE_FLAG_DIRTY_MASK | CACHE_FLAG_SET_MASK)); // load copies of the mask into each 32 bits + __m128i cmp = _mm_set1_epi32(tag | CACHE_FLAG_VALID); // load the tag we're checking for into each 32 bit + tags = _mm_and_si128(tags, mask); // mask out the bits we dont want to check for + cmp = _mm_cmpeq_epi32(tags, cmp); // compare to see if any bits match + u32 set = _mm_movemask_epi8(cmp); // move the 8 msb of each field into a 
single 32 bit integer + + if (!set) goto miss; // check if none of them were a match + else set = (__builtin_ctz(set) >> 2); // count trailing zeros and right shift to figure out which set had a match { #else + // fallback for loop; slow for (int set = 0; set < DCACHE_SETS; set++) { if ((DCacheTags[id+set] & ~(CACHE_FLAG_DIRTY_MASK | CACHE_FLAG_SET_MASK)) == (tag | CACHE_FLAG_VALID)) @@ -665,18 +671,21 @@ bool ARMv5::DCacheWrite32(const u32 addr, const u32 val) //Log(LogLevel::Debug, "Cache write 32: %08lx <= %08lx\n", addr, val); #if defined(__x86_64__) - __m128i tags; memcpy(&tags, &DCacheTags[id], 16); - __m128i mask = _mm_set1_epi32(~(CACHE_FLAG_DIRTY_MASK | CACHE_FLAG_SET_MASK)); - __m128i cmp = _mm_set1_epi32(tag); - tags = _mm_and_si128(tags, mask); - cmp = _mm_cmpeq_epi32(tags, cmp); - u32 set = _mm_movemask_epi8(cmp); + // we use sse here to greatly speed up checking for valid sets vs the fallback for loop - if (!set) return false; - else set = (__builtin_ctz(set) >> 2); + __m128i tags; memcpy(&tags, &DCacheTags[id], 16); // load the tags for all 4 sets, one for each 32 bits + __m128i mask = _mm_set1_epi32(~(CACHE_FLAG_DIRTY_MASK | CACHE_FLAG_SET_MASK)); // load copies of the mask into each 32 bits + __m128i cmp = _mm_set1_epi32(tag); // load the tag we're checking for into each 32 bit + tags = _mm_and_si128(tags, mask); // mask out the bits we dont want to check for + cmp = _mm_cmpeq_epi32(tags, cmp); // compare to see if any bits match + u32 set = _mm_movemask_epi8(cmp); // move the 8 msb of each field into a single 32 bit integer + + if (!set) return false; // check if none of them were a match + else set = (__builtin_ctz(set) >> 2); // count trailing zeros and right shift to figure out which set had a match { #else + // fallback for loop; slow for (int set = 0; set < DCACHE_SETS; set++) { if ((DCacheTags[id+set] & ~(CACHE_FLAG_DIRTY_MASK | CACHE_FLAG_SET_MASK)) == tag) @@ -714,18 +723,21 @@ bool ARMv5::DCacheWrite16(const u32 addr, const u16 val) 
//Log(LogLevel::Debug, "Cache write 16: %08lx <= %04x\n", addr, val); #if defined(__x86_64__) - __m128i tags; memcpy(&tags, &DCacheTags[id], 16); - __m128i mask = _mm_set1_epi32(~(CACHE_FLAG_DIRTY_MASK | CACHE_FLAG_SET_MASK)); - __m128i cmp = _mm_set1_epi32(tag); - tags = _mm_and_si128(tags, mask); - cmp = _mm_cmpeq_epi32(tags, cmp); - u32 set = _mm_movemask_epi8(cmp); + // we use sse here to greatly speed up checking for valid sets vs the fallback for loop - if (!set) return false; - else set = (__builtin_ctz(set) >> 2); + __m128i tags; memcpy(&tags, &DCacheTags[id], 16); // load the tags for all 4 sets, one for each 32 bits + __m128i mask = _mm_set1_epi32(~(CACHE_FLAG_DIRTY_MASK | CACHE_FLAG_SET_MASK)); // load copies of the mask into each 32 bits + __m128i cmp = _mm_set1_epi32(tag); // load the tag we're checking for into each 32 bit + tags = _mm_and_si128(tags, mask); // mask out the bits we dont want to check for + cmp = _mm_cmpeq_epi32(tags, cmp); // compare to see if any bits match + u32 set = _mm_movemask_epi8(cmp); // move the 8 msb of each field into a single 32 bit integer + + if (!set) return false; // check if none of them were a match + else set = (__builtin_ctz(set) >> 2); // count trailing zeros and right shift to figure out which set had a match { #else + // fallback for loop; slow for (int set = 0; set < DCACHE_SETS; set++) { if ((DCacheTags[id+set] & ~(CACHE_FLAG_DIRTY_MASK | CACHE_FLAG_SET_MASK)) == tag) @@ -764,18 +776,21 @@ bool ARMv5::DCacheWrite8(const u32 addr, const u8 val) //Log(LogLevel::Debug, "Cache write 8: %08lx <= %02x\n", addr, val); #if defined(__x86_64__) - __m128i tags; memcpy(&tags, &DCacheTags[id], 16); - __m128i mask = _mm_set1_epi32(~(CACHE_FLAG_DIRTY_MASK | CACHE_FLAG_SET_MASK)); - __m128i cmp = _mm_set1_epi32(tag); - tags = _mm_and_si128(tags, mask); - cmp = _mm_cmpeq_epi32(tags, cmp); - u32 set = _mm_movemask_epi8(cmp); + // we use sse here to greatly speed up checking for valid sets vs the fallback for loop - if (!set) 
return false; - else set = (__builtin_ctz(set) >> 2); + __m128i tags; memcpy(&tags, &DCacheTags[id], 16); // load the tags for all 4 sets, one for each 32 bits + __m128i mask = _mm_set1_epi32(~(CACHE_FLAG_DIRTY_MASK | CACHE_FLAG_SET_MASK)); // load copies of the mask into each 32 bits + __m128i cmp = _mm_set1_epi32(tag); // load the tag we're checking for into each 32 bit + tags = _mm_and_si128(tags, mask); // mask out the bits we dont want to check for + cmp = _mm_cmpeq_epi32(tags, cmp); // compare to see if any bits match + u32 set = _mm_movemask_epi8(cmp); // move the 8 msb of each field into a single 32 bit integer + + if (!set) return false; // check if none of them were a match + else set = (__builtin_ctz(set) >> 2); // count trailing zeros and right shift to figure out which set had a match { #else + // fallback for loop; slow for (int set = 0; set < DCACHE_SETS; set++) { if ((DCacheTags[id+set] & ~(CACHE_FLAG_DIRTY_MASK | CACHE_FLAG_SET_MASK)) == tag) @@ -813,18 +828,21 @@ void ARMv5::DCacheInvalidateByAddr(const u32 addr) const u32 id = ((addr >> DCACHE_LINELENGTH_LOG2) & (DCACHE_LINESPERSET-1)) << DCACHE_SETS_LOG2; #if defined(__x86_64__) - __m128i tags; memcpy(&tags, &DCacheTags[id], 16); - __m128i mask = _mm_set1_epi32(~(CACHE_FLAG_DIRTY_MASK | CACHE_FLAG_SET_MASK)); - __m128i cmp = _mm_set1_epi32(tag); - tags = _mm_and_si128(tags, mask); - cmp = _mm_cmpeq_epi32(tags, cmp); - u32 set = _mm_movemask_epi8(cmp); + // we use sse here to greatly speed up checking for valid sets vs the fallback for loop - if (!set) return; - else set = (__builtin_ctz(set) >> 2); + __m128i tags; memcpy(&tags, &DCacheTags[id], 16); // load the tags for all 4 sets, one for each 32 bits + __m128i mask = _mm_set1_epi32(~(CACHE_FLAG_DIRTY_MASK | CACHE_FLAG_SET_MASK)); // load copies of the mask into each 32 bits + __m128i cmp = _mm_set1_epi32(tag); // load the tag we're checking for into each 32 bit + tags = _mm_and_si128(tags, mask); // mask out the bits we dont want to check for 
+ cmp = _mm_cmpeq_epi32(tags, cmp); // compare to see if any bits match + u32 set = _mm_movemask_epi8(cmp); // move the 8 msb of each field into a single 32 bit integer + + if (!set) return; // check if none of them were a match + else set = (__builtin_ctz(set) >> 2); // count trailing zeros and right shift to figure out which set had a match { #else + // fallback for loop; slow for (int set = 0; set < DCACHE_SETS; set++) { if ((DCacheTags[id+set] & ~(CACHE_FLAG_DIRTY_MASK | CACHE_FLAG_SET_MASK)) == tag) @@ -911,7 +929,7 @@ void ARMv5::DCacheClearByASetAndWay(const u8 cacheSet, const u8 cacheLine) WriteBufferWrite(ptr[3], 2, cycless, tag+0x0C); DataCycles += 5; } - if (DCacheTags[index] & CACHE_FLAG_DIRTY_UPPERHALF) + if (DCacheTags[index] & CACHE_FLAG_DIRTY_UPPERHALF) // todo: check how this behaves when both fields need to be written { if (WBDelay > NDS.ARM9Timestamp) NDS.ARM9Timestamp = WBDelay; From 5f003eb967bfe5a4571e6830462f5e167dcf83f8 Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Tue, 15 Oct 2024 20:23:03 -0400 Subject: [PATCH 168/306] fix builds with jit disabled --- src/ARM.cpp | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/ARM.cpp b/src/ARM.cpp index f97c26e2..6ac989af 100644 --- a/src/ARM.cpp +++ b/src/ARM.cpp @@ -595,8 +595,11 @@ void ARMv5::Execute() Halted = 0; if (NDS.IME[0] & 0x1) { +#ifdef JIT_ENABLED if constexpr (mode == CPUExecuteMode::JIT) TriggerIRQ(); - else IRQ = 1; + else +#endif + IRQ = 1; } } else From 460fd45aed65543bb704bed15bd06e5168ff6b6a Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Tue, 15 Oct 2024 20:26:01 -0400 Subject: [PATCH 169/306] remove some old code --- src/CP15.cpp | 24 +++--------------------- 1 file changed, 3 insertions(+), 21 deletions(-) diff --git a/src/CP15.cpp b/src/CP15.cpp index 579824cb..6abf83ee 100644 --- a/src/CP15.cpp +++ b/src/CP15.cpp @@ -338,27 +338,9 @@ void ARMv5::UpdateRegionTimings(u32 
addrstart, u32 addrend) // checkme: should these be (bus timings shifted) - 1 or ((bustimings - 1) shifted) + 1 // should the last cycle be halved...? - /*if (pu & CP15_MAP_ICACHEABLE) - { - MemTimings[i][0] = 0xFF;//kCodeCacheTiming; - } - else - { - MemTimings[i][0] = ((bustimings[2] - 1) << NDS.ARM9ClockShift) + 1; - }*/ - /* - if (pu & CP15_MAP_DCACHEABLE) - { - MemTimings[i][1] = kDataCacheTiming; - MemTimings[i][2] = kDataCacheTiming; - MemTimings[i][3] = 1; - } - else*/ - { - MemTimings[i][0] = ((bustimings[0] - 1) << NDS.ARM9ClockShift) + 1; - MemTimings[i][1] = ((bustimings[2] - 1) << NDS.ARM9ClockShift) + 1; - MemTimings[i][2] = bustimings[3] << NDS.ARM9ClockShift; // inaccurate but ehgh - } + MemTimings[i][0] = ((bustimings[0] - 1) << NDS.ARM9ClockShift) + 1; + MemTimings[i][1] = ((bustimings[2] - 1) << NDS.ARM9ClockShift) + 1; + MemTimings[i][2] = ((bustimings[3] - 1) << NDS.ARM9ClockShift) + 1; } } From c00b188c056be4480174645215b94d21d33f2ebe Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Tue, 15 Oct 2024 21:08:07 -0400 Subject: [PATCH 170/306] im dumb --- src/CP15.cpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/CP15.cpp b/src/CP15.cpp index 7d8ff01a..44b7e88a 100644 --- a/src/CP15.cpp +++ b/src/CP15.cpp @@ -529,7 +529,7 @@ void ARMv5::WriteBufferWrite(u32 val, u8 flag, u8 cycles, u32 addr) else if (WBWritePointer == 16) // indicates empty write buffer { WBWritePointer = 0; - WBTimestamp = (((NDS.ARM9Timestamp + DataCycles) + ((1<> NDS.ARM9ClockShift; + WBTimestamp = (((NDS.ARM9Timestamp + DataCycles + 1) + ((1<> NDS.ARM9ClockShift; } WriteBufferFifo[WBFillPointer] = val | (u64)flag << 62; @@ -1183,7 +1183,7 @@ bool ARMv5::DataWrite8(u32 addr, u8 val) u8 cycles = NDS.ARM9MemTimings[addr>>14][0]; if ((addr >> 24) == 0x02) { - cycles = (cycles - 2) & 0x80; + cycles = (cycles - 2) | 0x80; } WriteBufferWrite(addr, 3, 0); @@ -1250,7 +1250,7 @@ bool ARMv5::DataWrite16(u32 addr, 
u16 val) u8 cycles = NDS.ARM9MemTimings[addr>>14][0]; if ((addr >> 24) == 0x02) { - cycles = (cycles - 2) & 0x80; + cycles = (cycles - 2) | 0x80; } WriteBufferWrite(addr, 3, 0); @@ -1317,7 +1317,7 @@ bool ARMv5::DataWrite32(u32 addr, u32 val) u8 cycles = NDS.ARM9MemTimings[addr>>14][2]; if ((addr >> 24) == 0x02) { - cycles = (cycles - 2) & 0x80; + cycles = (cycles - 2) | 0x80; } WriteBufferWrite(addr, 3, 0); @@ -1381,7 +1381,7 @@ bool ARMv5::DataWrite32S(u32 addr, u32 val) u8 cycles = NDS.ARM9MemTimings[addr>>14][3]; if ((addr >> 24) == 0x02) { - cycles = (cycles - 2) & 0x80; + cycles = (cycles - 2) | 0x80; } WriteBufferWrite(val, 2, cycles, addr); From c605c93d8e21e1383a7ecd10e8651eba3b95b84e Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Tue, 15 Oct 2024 21:08:46 -0400 Subject: [PATCH 171/306] still dumb --- src/CP15.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/CP15.cpp b/src/CP15.cpp index 93aad098..3807373c 100644 --- a/src/CP15.cpp +++ b/src/CP15.cpp @@ -899,10 +899,10 @@ void ARMv5::DCacheClearByASetAndWay(const u8 cacheSet, const u8 cacheLine) if (WBDelay > NDS.ARM9Timestamp) NDS.ARM9Timestamp = WBDelay; u8 cyclesn = NDS.ARM9MemTimings[tag>>14][2]; - if ((tag >> 24) == 0x02) cyclesn = (cyclesn - 2) & 0x80; + if ((tag >> 24) == 0x02) cyclesn = (cyclesn - 2) | 0x80; u8 cycless = NDS.ARM9MemTimings[tag>>14][3]; - if ((tag >> 24) == 0x02) cycless = (cycless - 2) & 0x80; + if ((tag >> 24) == 0x02) cycless = (cycless - 2) | 0x80; WriteBufferWrite(tag, 3, 0); WriteBufferWrite(ptr[0], 2, cyclesn, tag+0x00); @@ -916,10 +916,10 @@ void ARMv5::DCacheClearByASetAndWay(const u8 cacheSet, const u8 cacheLine) if (WBDelay > NDS.ARM9Timestamp) NDS.ARM9Timestamp = WBDelay; u8 cyclesn = NDS.ARM9MemTimings[tag>>14][2]; - if ((tag >> 24) == 0x02) cyclesn = (cyclesn - 2) & 0x80; + if ((tag >> 24) == 0x02) cyclesn = (cyclesn - 2) | 0x80; u8 cycless = NDS.ARM9MemTimings[tag>>14][3]; - if ((tag >> 24) == 
0x02) cycless = (cycless - 2) & 0x80; + if ((tag >> 24) == 0x02) cycless = (cycless - 2) | 0x80; WriteBufferWrite(tag+0x10, 3, 0); WriteBufferWrite(ptr[4], 2, cyclesn, tag+0x10); From 21763ceed3fb95fbf73ac55b4ae3ec0cda63dac8 Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Tue, 15 Oct 2024 21:20:10 -0400 Subject: [PATCH 172/306] reduce memtimings lut granularity --- src/ARM.h | 2 +- src/CP15.cpp | 33 ++++++++++++--------------------- src/DSi.cpp | 2 +- src/NDS.cpp | 2 +- 4 files changed, 15 insertions(+), 24 deletions(-) diff --git a/src/ARM.h b/src/ARM.h index dbf2d9cb..b0d5abe3 100644 --- a/src/ARM.h +++ b/src/ARM.h @@ -670,7 +670,7 @@ public: u8* PU_Map; //! Current valid Region Mapping (is either @ref PU_PrivMap or PU_UserMap) // code/16N/32N/32S - u8 MemTimings[CP15_MAP_ENTRYCOUNT][3]; + u8 MemTimings[0x40000][3]; bool (*GetMemRegion)(u32 addr, bool write, MemRegion* region); diff --git a/src/CP15.cpp b/src/CP15.cpp index 3807373c..9e4736f2 100644 --- a/src/CP15.cpp +++ b/src/CP15.cpp @@ -292,8 +292,6 @@ void ARMv5::UpdatePURegion(const u32 n) PU_UserMap[i] = usermask; PU_PrivMap[i] = privmask; } - - UpdateRegionTimings(start, end); } void ARMv5::UpdatePURegions(const bool update_all) @@ -307,7 +305,6 @@ void ARMv5::UpdatePURegions(const bool update_all) memset(PU_UserMap, mask, CP15_MAP_ENTRYCOUNT); memset(PU_PrivMap, mask, CP15_MAP_ENTRYCOUNT); - UpdateRegionTimings(0x00000, CP15_MAP_ENTRYCOUNT); return; } @@ -322,10 +319,6 @@ void ARMv5::UpdatePURegions(const bool update_all) UpdatePURegion(n); } - // TODO: this is way unoptimized - // should be okay unless the game keeps changing shit, tho - if (update_all) UpdateRegionTimings(0x00000, CP15_MAP_ENTRYCOUNT); - // TODO: throw exception if the region we're running in has become non-executable, I guess } @@ -333,8 +326,7 @@ void ARMv5::UpdateRegionTimings(u32 addrstart, u32 addrend) { for (u32 i = addrstart; i < addrend; i++) { - u8 pu = PU_Map[i]; - u8* bustimings = 
NDS.ARM9MemTimings[i >> 2]; + u8* bustimings = NDS.ARM9MemTimings[i]; // checkme: should these be (bus timings shifted) - 1 or ((bustimings - 1) shifted) + 1 // should the last cycle be halved...? @@ -630,7 +622,8 @@ u32 ARMv5::DCacheLookup(const u32 addr) // first N32 remaining S32 NDS.ARM9Timestamp = NDS.ARM9Timestamp + ((1<> 12][2]; + NDS.ARM9Timestamp += ((NDS.ARM9MemTimings[tag >> 14][2] + (NDS.ARM9MemTimings[tag >> 14][3] * ((DCACHE_LINELENGTH / 4) - 2)) - 1) << NDS.ARM9ClockShift) + 1; + DataCycles = NDS.ARM9MemTimings[tag>>14][3] << NDS.ARM9ClockShift; if ((addr >> 24) == 0x02) { @@ -640,8 +633,6 @@ u32 ARMv5::DCacheLookup(const u32 addr) } else DataRegion = NDS.ARM9Regions[addr>>14]; - NDS.ARM9Timestamp += ((NDS.ARM9MemTimings[tag >> 14][2] + (NDS.ARM9MemTimings[tag >> 14][3] * ((DCACHE_LINELENGTH / 4) - 2)) - 1) << NDS.ARM9ClockShift) + 1; - DataCycles = NDS.ARM9MemTimings[tag>>14][3] << NDS.ARM9ClockShift; return ptr[(addr & (DCACHE_LINELENGTH-1)) >> 2]; } @@ -1794,7 +1785,7 @@ u32 ARMv5::CodeRead32(u32 addr, bool branch) #endif } - CodeCycles = MemTimings[addr >> 12][1]; + CodeCycles = MemTimings[addr >> 14][1]; if (PU_Map[addr>>12] & 0x30) WriteBufferDrain(); @@ -1862,7 +1853,7 @@ bool ARMv5::DataRead8(u32 addr, u32* val) NDS.ARM9Timestamp = NDS.ARM9Timestamp + ((1<> 12][0]; + DataCycles = MemTimings[addr >> 14][0]; if ((addr >> 24) == 0x02) { @@ -1924,7 +1915,7 @@ bool ARMv5::DataRead16(u32 addr, u32* val) NDS.ARM9Timestamp = NDS.ARM9Timestamp + ((1<> 12][0]; + DataCycles = MemTimings[addr >> 14][0]; if ((addr >> 24) == 0x02) { @@ -1986,7 +1977,7 @@ bool ARMv5::DataRead32(u32 addr, u32* val) NDS.ARM9Timestamp = NDS.ARM9Timestamp + ((1<> 12][1]; + DataCycles = MemTimings[addr >> 14][1]; if ((addr >> 24) == 0x02) { @@ -2048,7 +2039,7 @@ bool ARMv5::DataRead32S(u32 addr, u32* val) if (!(addr & 0x3FF)) return DataRead32(addr, val); // bursts cannot cross a 1kb boundary - DataCycles = MemTimings[addr >> 12][2]; + DataCycles = MemTimings[addr >> 14][2]; 
NDS.ARM9Timestamp = NDS.ARM9Timestamp + ((1<> 12][0]; + DataCycles = MemTimings[addr >> 14][0]; if ((addr >> 24) == 0x02) { @@ -2191,7 +2182,7 @@ bool ARMv5::DataWrite16(u32 addr, u16 val) { NDS.ARM9Timestamp = NDS.ARM9Timestamp + ((1<> 12][0]; + DataCycles = MemTimings[addr >> 14][0]; if ((addr >> 24) == 0x02) { @@ -2272,7 +2263,7 @@ bool ARMv5::DataWrite32(u32 addr, u32 val) { NDS.ARM9Timestamp = NDS.ARM9Timestamp + ((1<> 12][1]; + DataCycles = MemTimings[addr >> 14][1]; if ((addr >> 24) == 0x02) { @@ -2362,7 +2353,7 @@ bool ARMv5::DataWrite32S(u32 addr, u32 val) else DataRegion = NDS.ARM9Regions[addr>>14]; BusWrite32(addr, val); - DataCycles += MemTimings[addr >> 12][2]; + DataCycles += MemTimings[addr >> 14][2]; } else { diff --git a/src/DSi.cpp b/src/DSi.cpp index 0e35841f..9b947ba5 100644 --- a/src/DSi.cpp +++ b/src/DSi.cpp @@ -1288,7 +1288,7 @@ void DSi::Set_SCFG_Clock9(u16 val) ARM9Timestamp <<= ARM9ClockShift; ARM9Target <<= ARM9ClockShift; - ARM9.UpdateRegionTimings(0x00000, 0x100000); + ARM9.UpdateRegionTimings(0x00000, 0x40000); } void DSi::Set_SCFG_MC(u32 val) diff --git a/src/NDS.cpp b/src/NDS.cpp index e0edf03b..591c22a0 100644 --- a/src/NDS.cpp +++ b/src/NDS.cpp @@ -179,7 +179,7 @@ void NDS::SetARM9RegionTimings(u32 addrstart, u32 addrend, u32 region, int buswi ARM9Regions[i] = region; } - ARM9.UpdateRegionTimings(addrstart<<2, addrend<<2); + ARM9.UpdateRegionTimings(addrstart, addrend); } void NDS::SetARM7RegionTimings(u32 addrstart, u32 addrend, u32 region, int buswidth, int nonseq, int seq) From 52ddaa73cf9a96a124d58552419ea1624849dea0 Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Tue, 15 Oct 2024 22:36:21 -0400 Subject: [PATCH 173/306] fix resets --- src/ARM.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/ARM.cpp b/src/ARM.cpp index 04de1ca6..b573747b 100644 --- a/src/ARM.cpp +++ b/src/ARM.cpp @@ -212,6 +212,7 @@ void ARMv5::Reset() WBFillPointer = 0; WBDelay = 0; WBWriting = false; + 
WBMainRAMDelay = 0; ARM::Reset(); } From d7212643f1eab31731e9a0c1d43a36c9923281b6 Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Wed, 16 Oct 2024 12:21:32 -0400 Subject: [PATCH 174/306] move arm9 code fetches into the cycle add routine setting up for re-adding interlocks --- src/ARM.cpp | 40 ++++++++++++++++++++------------ src/ARM.h | 5 +++- src/ARMInterpreter.cpp | 4 ++++ src/ARMInterpreter_Branch.cpp | 12 ++++++++-- src/ARMInterpreter_LoadStore.cpp | 20 ++++++++++++++++ 5 files changed, 63 insertions(+), 18 deletions(-) diff --git a/src/ARM.cpp b/src/ARM.cpp index b573747b..6a51318f 100644 --- a/src/ARM.cpp +++ b/src/ARM.cpp @@ -527,6 +527,7 @@ void ARM::UpdateMode(u32 oldmode, u32 newmode, bool phony) template void ARM::TriggerIRQ() { + AddCycles_C(); if (CPSR & 0x80) return; @@ -560,6 +561,7 @@ template void ARM::TriggerIRQ(); void ARMv5::PrefetchAbort() { + AddCycles_C(); Log(LogLevel::Warn, "ARM9: prefetch abort (%08X)\n", R[15]); u32 oldcpsr = CPSR; @@ -675,19 +677,11 @@ void ARMv5::Execute() R[15] += 2; CurInstr = NextInstr[0]; NextInstr[0] = NextInstr[1]; - if (R[15] & 0x2) - { - // no fetch is performed. - // unclear if it's a "1 cycle fetch" or a legitmately 0 cycle fetch stage? - // in practice it doesn't matter though. 
- NextInstr[1] >>= 16; - NDS.ARM9Timestamp++; - if (NDS.ARM9Timestamp < TimestampActual) NDS.ARM9Timestamp = TimestampActual; - DataRegion = Mem9_Null; - } - else NextInstr[1] = CodeRead32(R[15], false); - - + // code fetch is done during the execute stage cycle handling + if (R[15] & 0x2) NullFetch = true; + else NullFetch = false; + PC = R[15]; + if (IRQ && !(CPSR & 0x80)) TriggerIRQ(); else if (!(PU_Map[(R[15]-4)>>12] & 0x04)) [[unlikely]] // handle aborted instructions { @@ -708,8 +702,9 @@ void ARMv5::Execute() R[15] += 4; CurInstr = NextInstr[0]; NextInstr[0] = NextInstr[1]; - NextInstr[1] = CodeRead32(R[15], false); - + // code fetch is done during the execute stage cycle handling + NullFetch = false; + PC = R[15]; if (IRQ && !(CPSR & 0x80)) TriggerIRQ(); else if (!(PU_Map[(R[15]-8)>>12] & 0x04)) [[unlikely]] // handle aborted instructions @@ -1157,8 +1152,23 @@ u32 ARMv5::ReadMem(u32 addr, int size) #endif +inline void ARMv5::CodeFetch() +{ + if (NullFetch) + { + // no fetch is performed. + // in practice it doesn't matter though. 
+ NextInstr[1] >>= 16; + NDS.ARM9Timestamp++; + if (NDS.ARM9Timestamp < TimestampActual) NDS.ARM9Timestamp = TimestampActual; + DataRegion = Mem9_Null; + } + else NextInstr[1] = CodeRead32(PC, false); +} + void ARMv5::AddCycles_CI(s32 numX) { + CodeFetch(); NDS.ARM9Timestamp += numX; } diff --git a/src/ARM.h b/src/ARM.h index 31044108..327f72c3 100644 --- a/src/ARM.h +++ b/src/ARM.h @@ -281,8 +281,9 @@ public: AddCycles_C(); } + void CodeFetch(); - void AddCycles_C() override {} + void AddCycles_C() override { CodeFetch(); } void AddCycles_CI(s32 numX) override; @@ -375,6 +376,8 @@ public: u8 InterlockWBPrev; bool Store; u16 InterlockMask; + bool NullFetch; + u32 PC; u8 WBWritePointer; // which entry to attempt to write next; should always be ANDed with 0xF after incrementing diff --git a/src/ARMInterpreter.cpp b/src/ARMInterpreter.cpp index 44948183..b1d856a0 100644 --- a/src/ARMInterpreter.cpp +++ b/src/ARMInterpreter.cpp @@ -36,6 +36,7 @@ namespace melonDS::ARMInterpreter void A_UNK(ARM* cpu) { + cpu->AddCycles_C(); Log(LogLevel::Warn, "undefined ARM%d instruction %08X @ %08X\n", cpu->Num?7:9, cpu->CurInstr, cpu->R[15]-8); #ifdef GDBSTUB_ENABLED cpu->GdbStub.Enter(cpu->GdbStub.IsConnected(), Gdb::TgtStatus::FaultInsn, cpu->R[15]-8); @@ -54,6 +55,7 @@ void A_UNK(ARM* cpu) void T_UNK(ARM* cpu) { + cpu->AddCycles_C(); Log(LogLevel::Warn, "undefined THUMB%d instruction %04X @ %08X\n", cpu->Num?7:9, cpu->CurInstr, cpu->R[15]-4); #ifdef GDBSTUB_ENABLED cpu->GdbStub.Enter(cpu->GdbStub.IsConnected(), Gdb::TgtStatus::FaultInsn, cpu->R[15]-4); @@ -335,6 +337,7 @@ void A_MRC(ARM* cpu) void A_SVC(ARM* cpu) // A_SWI { + cpu->AddCycles_C(); u32 oldcpsr = cpu->CPSR; cpu->CPSR &= ~0xBF; cpu->CPSR |= 0x93; @@ -347,6 +350,7 @@ void A_SVC(ARM* cpu) // A_SWI void T_SVC(ARM* cpu) // T_SWI { + cpu->AddCycles_C(); u32 oldcpsr = cpu->CPSR; cpu->CPSR &= ~0xBF; cpu->CPSR |= 0x93; diff --git a/src/ARMInterpreter_Branch.cpp b/src/ARMInterpreter_Branch.cpp index 623be41a..a95aa27d 100644 
--- a/src/ARMInterpreter_Branch.cpp +++ b/src/ARMInterpreter_Branch.cpp @@ -27,12 +27,14 @@ using Platform::LogLevel; void A_B(ARM* cpu) { + cpu->AddCycles_C(); s32 offset = (s32)(cpu->CurInstr << 8) >> 6; cpu->JumpTo(cpu->R[15] + offset); } void A_BL(ARM* cpu) { + cpu->AddCycles_C(); s32 offset = (s32)(cpu->CurInstr << 8) >> 6; cpu->R[14] = cpu->R[15] - 4; cpu->JumpTo(cpu->R[15] + offset); @@ -40,6 +42,7 @@ void A_BL(ARM* cpu) void A_BLX_IMM(ARM* cpu) { + cpu->AddCycles_C(); s32 offset = (s32)(cpu->CurInstr << 8) >> 6; if (cpu->CurInstr & 0x01000000) offset += 2; cpu->R[14] = cpu->R[15] - 4; @@ -48,11 +51,13 @@ void A_BLX_IMM(ARM* cpu) void A_BX(ARM* cpu) { + cpu->AddCycles_C(); cpu->JumpTo(cpu->R[cpu->CurInstr & 0xF]); } void A_BLX_REG(ARM* cpu) { + cpu->AddCycles_C(); u32 lr = cpu->R[15] - 4; cpu->JumpTo(cpu->R[cpu->CurInstr & 0xF]); cpu->R[14] = lr; @@ -62,22 +67,23 @@ void A_BLX_REG(ARM* cpu) void T_BCOND(ARM* cpu) { + cpu->AddCycles_C(); if (cpu->CheckCondition((cpu->CurInstr >> 8) & 0xF)) { s32 offset = (s32)(cpu->CurInstr << 24) >> 23; cpu->JumpTo(cpu->R[15] + offset + 1); } - else - cpu->AddCycles_C(); } void T_BX(ARM* cpu) { + cpu->AddCycles_C(); cpu->JumpTo(cpu->R[(cpu->CurInstr >> 3) & 0xF]); } void T_BLX_REG(ARM* cpu) { + cpu->AddCycles_C(); if (cpu->Num==1) { Log(LogLevel::Warn, "!! 
THUMB BLX_REG ON ARM7\n"); @@ -91,6 +97,7 @@ void T_BLX_REG(ARM* cpu) void T_B(ARM* cpu) { + cpu->AddCycles_C(); s32 offset = (s32)((cpu->CurInstr & 0x7FF) << 21) >> 20; cpu->JumpTo(cpu->R[15] + offset + 1); } @@ -104,6 +111,7 @@ void T_BL_LONG_1(ARM* cpu) void T_BL_LONG_2(ARM* cpu) { + cpu->AddCycles_C(); s32 offset = (cpu->CurInstr & 0x7FF) << 1; u32 pc = cpu->R[14] + offset; cpu->R[14] = (cpu->R[15] - 2) | 1; diff --git a/src/ARMInterpreter_LoadStore.cpp b/src/ARMInterpreter_LoadStore.cpp index 84203310..bab4f25b 100644 --- a/src/ARMInterpreter_LoadStore.cpp +++ b/src/ARMInterpreter_LoadStore.cpp @@ -24,6 +24,12 @@ namespace melonDS::ARMInterpreter { +void ExecuteStage(ARM* cpu) +{ + if (cpu->Num == 0) cpu->AddCycles_C(); +} + + // copypasta from ALU. bad #define LSL_IMM(x, s) \ x <<= s; @@ -69,6 +75,7 @@ enum class Writeback template void LoadSingle(ARM* cpu, u8 rd, u8 rn, s32 offset) { + ExecuteStage(cpu); static_assert((size == 8) || (size == 16) || (size == 32), "dummy this function only takes 8/16/32 for size!!!"); u32 addr; @@ -116,6 +123,7 @@ void LoadSingle(ARM* cpu, u8 rd, u8 rn, s32 offset) template void StoreSingle(ARM* cpu, u8 rd, u8 rn, s32 offset) { + ExecuteStage(cpu); static_assert((size == 8) || (size == 16) || (size == 32), "dummy this function only takes 8/16/32 for size!!!"); u32 addr; @@ -277,6 +285,7 @@ A_IMPLEMENT_WB_LDRSTR(LDRB) #define A_LDRD \ if (cpu->Num != 0) return; \ + ExecuteStage(cpu); \ offset += cpu->R[(cpu->CurInstr>>16) & 0xF]; \ u32 r = (cpu->CurInstr>>12) & 0xF; \ if (r&1) { A_UNK(cpu); return; } \ @@ -293,6 +302,7 @@ A_IMPLEMENT_WB_LDRSTR(LDRB) #define A_LDRD_POST \ if (cpu->Num != 0) return; \ + ExecuteStage(cpu); \ u32 addr = cpu->R[(cpu->CurInstr>>16) & 0xF]; \ u32 r = (cpu->CurInstr>>12) & 0xF; \ if (r&1) { A_UNK(cpu); return; } \ @@ -309,6 +319,7 @@ A_IMPLEMENT_WB_LDRSTR(LDRB) #define A_STRD \ if (cpu->Num != 0) return; \ + ExecuteStage(cpu); \ offset += cpu->R[(cpu->CurInstr>>16) & 0xF]; \ u32 r = (cpu->CurInstr>>12) 
& 0xF; \ if (r&1) { A_UNK(cpu); return; } \ @@ -323,6 +334,7 @@ A_IMPLEMENT_WB_LDRSTR(LDRB) #define A_STRD_POST \ if (cpu->Num != 0) return; \ + ExecuteStage(cpu); \ u32 addr = cpu->R[(cpu->CurInstr>>16) & 0xF]; \ u32 r = (cpu->CurInstr>>12) & 0xF; \ if (r&1) { A_UNK(cpu); return; } \ @@ -394,6 +406,7 @@ A_IMPLEMENT_HD_LDRSTR(LDRSH) template inline void SWP(ARM* cpu) { + ExecuteStage(cpu); u32 base = cpu->R[(cpu->CurInstr >> 16) & 0xF]; u32 rm = cpu->R[cpu->CurInstr & 0xF]; if ((cpu->CurInstr & 0xF) == 15) rm += 4; @@ -486,6 +499,7 @@ void EmptyRListLDMSTM(ARM* cpu, const u8 baseid, const u8 flags) void A_LDM(ARM* cpu) { + ExecuteStage(cpu); u32 baseid = (cpu->CurInstr >> 16) & 0xF; u32 base = cpu->R[baseid]; u32 wbbase; @@ -597,6 +611,7 @@ void A_LDM(ARM* cpu) void A_STM(ARM* cpu) { + ExecuteStage(cpu); u32 baseid = (cpu->CurInstr >> 16) & 0xF; u32 base = cpu->R[baseid]; u32 oldbase = base; @@ -695,6 +710,7 @@ void A_STM(ARM* cpu) void T_LDR_PCREL(ARM* cpu) { + ExecuteStage(cpu); u32 addr = (cpu->R[15] & ~0x2) + ((cpu->CurInstr & 0xFF) << 2); bool dabort = !cpu->DataRead32(addr, &cpu->R[(cpu->CurInstr >> 8) & 0x7]); @@ -793,6 +809,7 @@ void T_LDR_SPREL(ARM* cpu) void T_PUSH(ARM* cpu) { + ExecuteStage(cpu); int nregs = 0; bool first = true; bool dabort = false; @@ -848,6 +865,7 @@ void T_PUSH(ARM* cpu) void T_POP(ARM* cpu) { + ExecuteStage(cpu); u32 base = cpu->R[13]; bool first = true; bool dabort = false; @@ -900,6 +918,7 @@ void T_POP(ARM* cpu) void T_STMIA(ARM* cpu) { + ExecuteStage(cpu); u32 base = cpu->R[(cpu->CurInstr >> 8) & 0x7]; bool first = true; bool dabort = false; @@ -936,6 +955,7 @@ void T_STMIA(ARM* cpu) void T_LDMIA(ARM* cpu) { + ExecuteStage(cpu); u32 base = cpu->R[(cpu->CurInstr >> 8) & 0x7]; bool first = true; bool dabort = false; From 26a6e887ad3dad5a1fc913553136dae232c3325d Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Wed, 16 Oct 2024 22:26:56 -0400 Subject: [PATCH 175/306] aarch64 neon impl take one 
fingers crossed it compiles! --- src/CP15.cpp | 104 +++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 104 insertions(+) diff --git a/src/CP15.cpp b/src/CP15.cpp index 9e4736f2..2079645b 100644 --- a/src/CP15.cpp +++ b/src/CP15.cpp @@ -20,6 +20,8 @@ #include #if defined(__x86_64__) #include +#elif defined(__ARM_NEON) +#include #endif #include "NDS.h" #include "DSi.h" @@ -365,6 +367,25 @@ u32 ARMv5::ICacheLookup(const u32 addr) if (!set) goto miss; // check if none of them were a match else set = (__builtin_ctz(set) >> 2); // count trailing zeros and right shift to figure out which set had a match + { +#elif defined(__ARM_NEON) + uint32x4_t tags = { ICacheTags[id+0], ICacheTags[id+1], ICacheTags[id+2], ICacheTags[id+3] }; // load tags + uint32x4_t mask = { ~(CACHE_FLAG_DIRTY_MASK | CACHE_FLAG_SET_MASK), + ~(CACHE_FLAG_DIRTY_MASK | CACHE_FLAG_SET_MASK), + ~(CACHE_FLAG_DIRTY_MASK | CACHE_FLAG_SET_MASK), + ~(CACHE_FLAG_DIRTY_MASK | CACHE_FLAG_SET_MASK) }; // load mask + uint32x4_t cmp = { tag | CACHE_FLAG_VALID, + tag | CACHE_FLAG_VALID, + tag | CACHE_FLAG_VALID, + tag | CACHE_FLAG_VALID }; // load tag and flag we're checking for + tags = vandq_u32(tags, mask); // mask out bits we dont wanna check for + cmp = vceqq_u32(tags, cmp); + uint16x4_t res = vmovn_u32(cmp); + u64 set; memcpy(&set, &res, 4); + + if (!set) goto miss; + else set = __builtin_ctz(set) >> 3; + { #else // fallback for loop; slow @@ -535,6 +556,25 @@ u32 ARMv5::DCacheLookup(const u32 addr) if (!set) goto miss; // check if none of them were a match else set = (__builtin_ctz(set) >> 2); // count trailing zeros and right shift to figure out which set had a match + { +#elif defined(__ARM_NEON) + uint32x4_t tags = { DCacheTags[id+0], DCacheTags[id+1], DCacheTags[id+2], DCacheTags[id+3] }; // load tags + uint32x4_t mask = { ~(CACHE_FLAG_DIRTY_MASK | CACHE_FLAG_SET_MASK), + ~(CACHE_FLAG_DIRTY_MASK | CACHE_FLAG_SET_MASK), + ~(CACHE_FLAG_DIRTY_MASK | CACHE_FLAG_SET_MASK), + 
~(CACHE_FLAG_DIRTY_MASK | CACHE_FLAG_SET_MASK) }; // load mask + uint32x4_t cmp = { tag | CACHE_FLAG_VALID, + tag | CACHE_FLAG_VALID, + tag | CACHE_FLAG_VALID, + tag | CACHE_FLAG_VALID }; // load tag and flag we're checking for + tags = vandq_u32(tags, mask); // mask out bits we dont wanna check for + cmp = vceqq_u32(tags, cmp); + uint16x4_t res = vmovn_u32(cmp); + u64 set; memcpy(&set, &res, 4); + + if (!set) goto miss; + else set = __builtin_ctz(set) >> 3; + { #else // fallback for loop; slow @@ -656,6 +696,22 @@ bool ARMv5::DCacheWrite32(const u32 addr, const u32 val) if (!set) return false; // check if none of them were a match else set = (__builtin_ctz(set) >> 2); // count trailing zeros and right shift to figure out which set had a match + { +#elif defined(__ARM_NEON) + uint32x4_t tags = { DCacheTags[id+0], DCacheTags[id+1], DCacheTags[id+2], DCacheTags[id+3] }; // load tags + uint32x4_t mask = { ~(CACHE_FLAG_DIRTY_MASK | CACHE_FLAG_SET_MASK), + ~(CACHE_FLAG_DIRTY_MASK | CACHE_FLAG_SET_MASK), + ~(CACHE_FLAG_DIRTY_MASK | CACHE_FLAG_SET_MASK), + ~(CACHE_FLAG_DIRTY_MASK | CACHE_FLAG_SET_MASK) }; // load mask + uint32x4_t cmp = { tag, tag, tag, tag }; // load tag and flag we're checking for + tags = vandq_u32(tags, mask); // mask out bits we dont wanna check for + cmp = vceqq_u32(tags, cmp); + uint16x4_t res = vmovn_u32(cmp); + u64 set; memcpy(&set, &res, 4); + + if (!set) return false; + else set = __builtin_ctz(set) >> 3; + { #else // fallback for loop; slow @@ -708,6 +764,22 @@ bool ARMv5::DCacheWrite16(const u32 addr, const u16 val) if (!set) return false; // check if none of them were a match else set = (__builtin_ctz(set) >> 2); // count trailing zeros and right shift to figure out which set had a match + { +#elif defined(__ARM_NEON) + uint32x4_t tags = { DCacheTags[id+0], DCacheTags[id+1], DCacheTags[id+2], DCacheTags[id+3] }; // load tags + uint32x4_t mask = { ~(CACHE_FLAG_DIRTY_MASK | CACHE_FLAG_SET_MASK), + ~(CACHE_FLAG_DIRTY_MASK | 
CACHE_FLAG_SET_MASK), + ~(CACHE_FLAG_DIRTY_MASK | CACHE_FLAG_SET_MASK), + ~(CACHE_FLAG_DIRTY_MASK | CACHE_FLAG_SET_MASK) }; // load mask + uint32x4_t cmp = { tag, tag, tag, tag }; // load tag and flag we're checking for + tags = vandq_u32(tags, mask); // mask out bits we dont wanna check for + cmp = vceqq_u32(tags, cmp); + uint16x4_t res = vmovn_u32(cmp); + u64 set; memcpy(&set, &res, 4); + + if (!set) return false; + else set = __builtin_ctz(set) >> 3; + { #else // fallback for loop; slow @@ -761,6 +833,22 @@ bool ARMv5::DCacheWrite8(const u32 addr, const u8 val) if (!set) return false; // check if none of them were a match else set = (__builtin_ctz(set) >> 2); // count trailing zeros and right shift to figure out which set had a match + { +#elif defined(__ARM_NEON) + uint32x4_t tags = { DCacheTags[id+0], DCacheTags[id+1], DCacheTags[id+2], DCacheTags[id+3] }; // load tags + uint32x4_t mask = { ~(CACHE_FLAG_DIRTY_MASK | CACHE_FLAG_SET_MASK), + ~(CACHE_FLAG_DIRTY_MASK | CACHE_FLAG_SET_MASK), + ~(CACHE_FLAG_DIRTY_MASK | CACHE_FLAG_SET_MASK), + ~(CACHE_FLAG_DIRTY_MASK | CACHE_FLAG_SET_MASK) }; // load mask + uint32x4_t cmp = { tag, tag, tag, tag }; // load tag and flag we're checking for + tags = vandq_u32(tags, mask); // mask out bits we dont wanna check for + cmp = vceqq_u32(tags, cmp); + uint16x4_t res = vmovn_u32(cmp); + u64 set; memcpy(&set, &res, 4); + + if (!set) return false; + else set = __builtin_ctz(set) >> 3; + { #else // fallback for loop; slow @@ -813,6 +901,22 @@ void ARMv5::DCacheInvalidateByAddr(const u32 addr) if (!set) return; // check if none of them were a match else set = (__builtin_ctz(set) >> 2); // count trailing zeros and right shift to figure out which set had a match + { +#elif defined(__ARM_NEON) + uint32x4_t tags = { DCacheTags[id+0], DCacheTags[id+1], DCacheTags[id+2], DCacheTags[id+3] }; // load tags + uint32x4_t mask = { ~(CACHE_FLAG_DIRTY_MASK | CACHE_FLAG_SET_MASK), + ~(CACHE_FLAG_DIRTY_MASK | CACHE_FLAG_SET_MASK), + 
~(CACHE_FLAG_DIRTY_MASK | CACHE_FLAG_SET_MASK), + ~(CACHE_FLAG_DIRTY_MASK | CACHE_FLAG_SET_MASK) }; // load mask + uint32x4_t cmp = { tag, tag, tag, tag }; // load tag and flag we're checking for + tags = vandq_u32(tags, mask); // mask out bits we dont wanna check for + cmp = vceqq_u32(tags, cmp); + uint16x4_t res = vmovn_u32(cmp); + u64 set; memcpy(&set, &res, 4); + + if (!set) return; + else set = __builtin_ctz(set) >> 3; + { #else // fallback for loop; slow From ffb24e70882646407f0d802a2fce61f3af6dcf09 Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Wed, 16 Oct 2024 22:42:24 -0400 Subject: [PATCH 176/306] wrong bitshift --- src/CP15.cpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/CP15.cpp b/src/CP15.cpp index 2079645b..7d887be1 100644 --- a/src/CP15.cpp +++ b/src/CP15.cpp @@ -384,7 +384,7 @@ u32 ARMv5::ICacheLookup(const u32 addr) u64 set; memcpy(&set, &res, 4); if (!set) goto miss; - else set = __builtin_ctz(set) >> 3; + else set = __builtin_ctz(set) >> 4; { #else @@ -573,7 +573,7 @@ u32 ARMv5::DCacheLookup(const u32 addr) u64 set; memcpy(&set, &res, 4); if (!set) goto miss; - else set = __builtin_ctz(set) >> 3; + else set = __builtin_ctz(set) >> 4; { #else @@ -710,7 +710,7 @@ bool ARMv5::DCacheWrite32(const u32 addr, const u32 val) u64 set; memcpy(&set, &res, 4); if (!set) return false; - else set = __builtin_ctz(set) >> 3; + else set = __builtin_ctz(set) >> 4; { #else @@ -778,7 +778,7 @@ bool ARMv5::DCacheWrite16(const u32 addr, const u16 val) u64 set; memcpy(&set, &res, 4); if (!set) return false; - else set = __builtin_ctz(set) >> 3; + else set = __builtin_ctz(set) >> 4; { #else @@ -847,7 +847,7 @@ bool ARMv5::DCacheWrite8(const u32 addr, const u8 val) u64 set; memcpy(&set, &res, 4); if (!set) return false; - else set = __builtin_ctz(set) >> 3; + else set = __builtin_ctz(set) >> 4; { #else @@ -915,7 +915,7 @@ void ARMv5::DCacheInvalidateByAddr(const u32 addr) u64 set; 
memcpy(&set, &res, 4); if (!set) return; - else set = __builtin_ctz(set) >> 3; + else set = __builtin_ctz(set) >> 4; { #else From 68e8ff41eb02bb1ee5de411a5042cc6e7e04b9af Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Thu, 17 Oct 2024 10:08:11 -0400 Subject: [PATCH 177/306] this barely makes a difference in practice but it's less inefficient --- src/CP15.cpp | 36 ++++++++++++++++++------------------ 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/src/CP15.cpp b/src/CP15.cpp index 7d887be1..5aa0e7ac 100644 --- a/src/CP15.cpp +++ b/src/CP15.cpp @@ -361,11 +361,11 @@ u32 ARMv5::ICacheLookup(const u32 addr) __m128i mask = _mm_set1_epi32(~(CACHE_FLAG_DIRTY_MASK | CACHE_FLAG_SET_MASK)); // load copies of the mask into each 32 bits __m128i cmp = _mm_set1_epi32(tag | CACHE_FLAG_VALID); // load the tag we're checking for into each 32 bit tags = _mm_and_si128(tags, mask); // mask out the bits we dont want to check for - cmp = _mm_cmpeq_epi32(tags, cmp); // compare to see if any bits match - u32 set = _mm_movemask_epi8(cmp); // move the 8 msb of each field into a single 32 bit integer + cmp = _mm_cmpeq_epi32(tags, cmp); // compare to see if any bits match; sets all bits of each value to either 0 or 1 depending on the result + u32 set = _mm_movemask_ps(_mm_castsi128_ps(cmp)); // move the "sign bits" of each field into the low 4 bits of a 32 bit integer if (!set) goto miss; // check if none of them were a match - else set = (__builtin_ctz(set) >> 2); // count trailing zeros and right shift to figure out which set had a match + else set = __builtin_ctz(set); // count trailing zeros and right shift to figure out which set had a match { #elif defined(__ARM_NEON) @@ -550,11 +550,11 @@ u32 ARMv5::DCacheLookup(const u32 addr) __m128i mask = _mm_set1_epi32(~(CACHE_FLAG_DIRTY_MASK | CACHE_FLAG_SET_MASK)); // load copies of the mask into each 32 bits __m128i cmp = _mm_set1_epi32(tag | CACHE_FLAG_VALID); // load the tag we're 
checking for into each 32 bit tags = _mm_and_si128(tags, mask); // mask out the bits we dont want to check for - cmp = _mm_cmpeq_epi32(tags, cmp); // compare to see if any bits match - u32 set = _mm_movemask_epi8(cmp); // move the 8 msb of each field into a single 32 bit integer + cmp = _mm_cmpeq_epi32(tags, cmp); // compare to see if any bits match; sets all bits of each value to either 0 or 1 depending on the result + u32 set = _mm_movemask_ps(_mm_castsi128_ps(cmp)); // move the "sign bits" of each field into the low 4 bits of a 32 bit integer if (!set) goto miss; // check if none of them were a match - else set = (__builtin_ctz(set) >> 2); // count trailing zeros and right shift to figure out which set had a match + else set = __builtin_ctz(set); // count trailing zeros and right shift to figure out which set had a match { #elif defined(__ARM_NEON) @@ -690,11 +690,11 @@ bool ARMv5::DCacheWrite32(const u32 addr, const u32 val) __m128i mask = _mm_set1_epi32(~(CACHE_FLAG_DIRTY_MASK | CACHE_FLAG_SET_MASK)); // load copies of the mask into each 32 bits __m128i cmp = _mm_set1_epi32(tag); // load the tag we're checking for into each 32 bit tags = _mm_and_si128(tags, mask); // mask out the bits we dont want to check for - cmp = _mm_cmpeq_epi32(tags, cmp); // compare to see if any bits match - u32 set = _mm_movemask_epi8(cmp); // move the 8 msb of each field into a single 32 bit integer + cmp = _mm_cmpeq_epi32(tags, cmp); // compare to see if any bits match; sets all bits of each value to either 0 or 1 depending on the result + u32 set = _mm_movemask_ps(_mm_castsi128_ps(cmp)); // move the "sign bits" of each field into the low 4 bits of a 32 bit integer if (!set) return false; // check if none of them were a match - else set = (__builtin_ctz(set) >> 2); // count trailing zeros and right shift to figure out which set had a match + else set = __builtin_ctz(set); // count trailing zeros and right shift to figure out which set had a match { #elif defined(__ARM_NEON) @@ 
-758,11 +758,11 @@ bool ARMv5::DCacheWrite16(const u32 addr, const u16 val) __m128i mask = _mm_set1_epi32(~(CACHE_FLAG_DIRTY_MASK | CACHE_FLAG_SET_MASK)); // load copies of the mask into each 32 bits __m128i cmp = _mm_set1_epi32(tag); // load the tag we're checking for into each 32 bit tags = _mm_and_si128(tags, mask); // mask out the bits we dont want to check for - cmp = _mm_cmpeq_epi32(tags, cmp); // compare to see if any bits match - u32 set = _mm_movemask_epi8(cmp); // move the 8 msb of each field into a single 32 bit integer + cmp = _mm_cmpeq_epi32(tags, cmp); // compare to see if any bits match; sets all bits of each value to either 0 or 1 depending on the result + u32 set = _mm_movemask_ps(_mm_castsi128_ps(cmp)); // move the "sign bits" of each field into the low 4 bits of a 32 bit integer if (!set) return false; // check if none of them were a match - else set = (__builtin_ctz(set) >> 2); // count trailing zeros and right shift to figure out which set had a match + else set = __builtin_ctz(set); // count trailing zeros and right shift to figure out which set had a match { #elif defined(__ARM_NEON) @@ -827,11 +827,11 @@ bool ARMv5::DCacheWrite8(const u32 addr, const u8 val) __m128i mask = _mm_set1_epi32(~(CACHE_FLAG_DIRTY_MASK | CACHE_FLAG_SET_MASK)); // load copies of the mask into each 32 bits __m128i cmp = _mm_set1_epi32(tag); // load the tag we're checking for into each 32 bit tags = _mm_and_si128(tags, mask); // mask out the bits we dont want to check for - cmp = _mm_cmpeq_epi32(tags, cmp); // compare to see if any bits match - u32 set = _mm_movemask_epi8(cmp); // move the 8 msb of each field into a single 32 bit integer + cmp = _mm_cmpeq_epi32(tags, cmp); // compare to see if any bits match; sets all bits of each value to either 0 or 1 depending on the result + u32 set = _mm_movemask_ps(_mm_castsi128_ps(cmp)); // move the "sign bits" of each field into the low 4 bits of a 32 bit integer if (!set) return false; // check if none of them were a match - 
else set = (__builtin_ctz(set) >> 2); // count trailing zeros and right shift to figure out which set had a match + else set = __builtin_ctz(set); // count trailing zeros and right shift to figure out which set had a match { #elif defined(__ARM_NEON) @@ -895,11 +895,11 @@ void ARMv5::DCacheInvalidateByAddr(const u32 addr) __m128i mask = _mm_set1_epi32(~(CACHE_FLAG_DIRTY_MASK | CACHE_FLAG_SET_MASK)); // load copies of the mask into each 32 bits __m128i cmp = _mm_set1_epi32(tag); // load the tag we're checking for into each 32 bit tags = _mm_and_si128(tags, mask); // mask out the bits we dont want to check for - cmp = _mm_cmpeq_epi32(tags, cmp); // compare to see if any bits match - u32 set = _mm_movemask_epi8(cmp); // move the 8 msb of each field into a single 32 bit integer + cmp = _mm_cmpeq_epi32(tags, cmp); // compare to see if any bits match; sets all bits of each value to either 0 or 1 depending on the result + u32 set = _mm_movemask_ps(_mm_castsi128_ps(cmp)); // move the "sign bits" of each field into the low 4 bits of a 32 bit integer if (!set) return; // check if none of them were a match - else set = (__builtin_ctz(set) >> 2); // count trailing zeros and right shift to figure out which set had a match + else set = __builtin_ctz(set); // count trailing zeros and right shift to figure out which set had a match { #elif defined(__ARM_NEON) From e2a810147f032bee153e2dc0a6fb73067e06e5ba Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Fri, 18 Oct 2024 15:00:55 -0400 Subject: [PATCH 178/306] re-add interlocks breaks gcc debug builds for ??? 
reason --- src/ARM.cpp | 40 ++++- src/ARM.h | 33 ++-- src/ARMInterpreter.cpp | 4 + src/ARMInterpreter_ALU.cpp | 275 +++++++++++++++++++++++++---- src/ARMInterpreter_Branch.cpp | 4 + src/ARMInterpreter_LoadStore.cpp | 290 ++++++++++++++++++++----------- 6 files changed, 473 insertions(+), 173 deletions(-) diff --git a/src/ARM.cpp b/src/ARM.cpp index 6a51318f..7b8fbf46 100644 --- a/src/ARM.cpp +++ b/src/ARM.cpp @@ -200,13 +200,11 @@ void ARM::Reset() void ARMv5::Reset() { PU_Map = PU_PrivMap; + Store = false; TimestampActual = 0; - InterlockMem = 16; - InterlockWBCur = 16; - InterlockWBPrev = 16; - Store = false; - InterlockMask = 0; + ILCurrReg = 16; + ILPrevReg = 16; WBWritePointer = 16; WBFillPointer = 0; @@ -1152,7 +1150,7 @@ u32 ARMv5::ReadMem(u32 addr, int size) #endif -inline void ARMv5::CodeFetch() +void ARMv5::CodeFetch() { if (NullFetch) { @@ -1181,6 +1179,36 @@ void ARMv5::AddCycles_MW(s32 numM) if (numM > 0) NDS.ARM9Timestamp += numM; } +template +void ARMv5::HandleInterlocksExecute(u16 ilmask) +{ + if ((bitfield && (ilmask & (1< ILCurrTime) NDS.ARM9Timestamp = ILCurrTime; + ILCurrReg = 16; + ILPrevReg = 16; + return; + } + else if ((bitfield && (ilmask & (1< ILPrevTime) NDS.ARM9Timestamp = ILPrevTime; + } + + ILPrevReg = ILCurrReg; + ILPrevTime = ILCurrTime; + ILCurrReg = 16; +} +template void ARMv5::HandleInterlocksExecute(u16 ilmask); +template void ARMv5::HandleInterlocksExecute(u16 ilmask); + +void ARMv5::HandleInterlocksMemory(u8 reg) +{ + if ((reg != ILPrevReg) || (NDS.ARM9Timestamp <= ILPrevTime)) return; + + NDS.ARM9Timestamp = ILPrevTime; + ILPrevTime = 16; +} + u16 ARMv4::CodeRead16(u32 addr) { if ((addr >> 24) == 0x02) diff --git a/src/ARM.h b/src/ARM.h index 327f72c3..bacdf367 100644 --- a/src/ARM.h +++ b/src/ARM.h @@ -263,23 +263,6 @@ public: bool DataWrite16(u32 addr, u16 val) override; bool DataWrite32(u32 addr, u32 val) override; bool DataWrite32S(u32 addr, u32 val) override; - - template - void ExecuteStage(u8 rn, u8 rm) - { - 
static_assert((nregs < 2), "too many regs"); - - if constexpr (nregs == 1) - { - InterlockMask = 1 << rn; - } - if constexpr (nregs == 2) - { - InterlockMask = 1 << rn | 1 << rm; - } - - AddCycles_C(); - } void CodeFetch(); @@ -300,6 +283,10 @@ public: AddCycles_MW(DataCycles); DataCycles = 0; } + + template + void HandleInterlocksExecute(u16 ilmask); + void HandleInterlocksMemory(u8 reg); void GetCodeMemRegion(u32 addr, MemRegion* region); @@ -371,14 +358,14 @@ public: u64 ITCMTimestamp; u64 TimestampActual; - u8 InterlockMem; - u8 InterlockWBCur; - u8 InterlockWBPrev; - bool Store; - u16 InterlockMask; - bool NullFetch; u32 PC; + bool NullFetch; + bool Store; + u8 ILCurrReg; + u8 ILPrevReg; + u64 ILCurrTime; + u64 ILPrevTime; u8 WBWritePointer; // which entry to attempt to write next; should always be ANDed with 0xF after incrementing u8 WBFillPointer; // where the next entry should be added; should always be ANDed with 0xF after incrementing diff --git a/src/ARMInterpreter.cpp b/src/ARMInterpreter.cpp index b1d856a0..614f3b53 100644 --- a/src/ARMInterpreter.cpp +++ b/src/ARMInterpreter.cpp @@ -153,6 +153,8 @@ void A_MSR_IMM(ARM* cpu) void A_MSR_REG(ARM* cpu) { + if (cpu->Num == 0) ((ARMv5*)cpu)->HandleInterlocksExecute(cpu->CurInstr & 0xF); + u32* psr; if (cpu->CurInstr & (1<<22)) { @@ -275,6 +277,8 @@ void A_MCR(ARM* cpu) u32 val = cpu->R[(cpu->CurInstr>>12)&0xF]; if (((cpu->CurInstr>>12) & 0xF) == 15) val += 4; + if (cpu->Num == 0) ((ARMv5*)cpu)->HandleInterlocksExecute((cpu->CurInstr>>12)&0xF); + if (cpu->Num==0 && cp==15) { ((ARMv5*)cpu)->CP15Write((cn<<8)|(cm<<4)|cpinfo, val); diff --git a/src/ARMInterpreter_ALU.cpp b/src/ARMInterpreter_ALU.cpp index e9439d2a..0d2fb5af 100644 --- a/src/ARMInterpreter_ALU.cpp +++ b/src/ARMInterpreter_ALU.cpp @@ -152,22 +152,26 @@ inline bool OverflowSbc(u32 a, u32 b, u32 carry) #define A_CALC_OP2_IMM \ - u32 b = ROR(cpu->CurInstr&0xFF, (cpu->CurInstr>>7)&0x1E); + u32 b = ROR(cpu->CurInstr&0xFF, (cpu->CurInstr>>7)&0x1E); \ + 
u16 ilmask = 0; #define A_CALC_OP2_IMM_S \ u32 b = ROR(cpu->CurInstr&0xFF, (cpu->CurInstr>>7)&0x1E); \ if ((cpu->CurInstr>>7)&0x1E) \ - cpu->SetC(b & 0x80000000); + cpu->SetC(b & 0x80000000); \ + u16 ilmask = 0; #define A_CALC_OP2_REG_SHIFT_IMM(shiftop) \ u32 b = cpu->R[cpu->CurInstr&0xF]; \ u32 s = (cpu->CurInstr>>7)&0x1F; \ - shiftop(b, s); + shiftop(b, s); \ + u16 ilmask = 1 << (cpu->CurInstr&0xF); #define A_CALC_OP2_REG_SHIFT_REG(shiftop) \ u32 b = cpu->R[cpu->CurInstr&0xF]; \ if ((cpu->CurInstr&0xF)==15) b += 4; \ - shiftop(b, (cpu->R[(cpu->CurInstr>>8)&0xF] & 0xFF)); + shiftop(b, (cpu->R[(cpu->CurInstr>>8)&0xF] & 0xFF)); \ + u16 ilmask = 1 << (cpu->CurInstr&0xF); #define A_IMPLEMENT_ALU_OP(x,s) \ @@ -377,6 +381,7 @@ A_IMPLEMENT_ALU_OP(EOR,_S) #define A_SUB(c) \ u32 a = cpu->R[(cpu->CurInstr>>16) & 0xF]; \ u32 res = a - b; \ + if (cpu->Num == 0) ((ARMv5*)cpu)->HandleInterlocksExecute(ilmask | (1 <<((cpu->CurInstr>>16) & 0xF))); \ if (c) cpu->AddCycles_CI(c); else cpu->AddCycles_C(); \ if (((cpu->CurInstr>>12) & 0xF) == 15) \ { \ @@ -394,6 +399,7 @@ A_IMPLEMENT_ALU_OP(EOR,_S) !res, \ CarrySub(a, b), \ OverflowSub(a, b)); \ + if (cpu->Num == 0) ((ARMv5*)cpu)->HandleInterlocksExecute(ilmask | (1 <<((cpu->CurInstr>>16) & 0xF))); \ if (c) cpu->AddCycles_CI(c); else cpu->AddCycles_C(); \ if (((cpu->CurInstr>>12) & 0xF) == 15) \ { \ @@ -410,6 +416,7 @@ A_IMPLEMENT_ALU_OP(SUB,) #define A_RSB(c) \ u32 a = cpu->R[(cpu->CurInstr>>16) & 0xF]; \ u32 res = b - a; \ + if (cpu->Num == 0) ((ARMv5*)cpu)->HandleInterlocksExecute(ilmask | (1 <<((cpu->CurInstr>>16) & 0xF))); \ if (c) cpu->AddCycles_CI(c); else cpu->AddCycles_C(); \ if (((cpu->CurInstr>>12) & 0xF) == 15) \ { \ @@ -427,6 +434,7 @@ A_IMPLEMENT_ALU_OP(SUB,) !res, \ CarrySub(b, a), \ OverflowSub(b, a)); \ + if (cpu->Num == 0) ((ARMv5*)cpu)->HandleInterlocksExecute(ilmask | (1 <<((cpu->CurInstr>>16) & 0xF))); \ if (c) cpu->AddCycles_CI(c); else cpu->AddCycles_C(); \ if (((cpu->CurInstr>>12) & 0xF) == 15) \ { \ @@ -443,6 
+451,7 @@ A_IMPLEMENT_ALU_OP(RSB,) #define A_ADD(c) \ u32 a = cpu->R[(cpu->CurInstr>>16) & 0xF]; \ u32 res = a + b; \ + if (cpu->Num == 0) ((ARMv5*)cpu)->HandleInterlocksExecute(ilmask | (1 <<((cpu->CurInstr>>16) & 0xF))); \ if (c) cpu->AddCycles_CI(c); else cpu->AddCycles_C(); \ if (((cpu->CurInstr>>12) & 0xF) == 15) \ { \ @@ -460,6 +469,7 @@ A_IMPLEMENT_ALU_OP(RSB,) !res, \ CarryAdd(a, b), \ OverflowAdd(a, b)); \ + if (cpu->Num == 0) ((ARMv5*)cpu)->HandleInterlocksExecute(ilmask | (1 <<((cpu->CurInstr>>16) & 0xF))); \ if (c) cpu->AddCycles_CI(c); else cpu->AddCycles_C(); \ if (((cpu->CurInstr>>12) & 0xF) == 15) \ { \ @@ -476,6 +486,7 @@ A_IMPLEMENT_ALU_OP(ADD,) #define A_ADC(c) \ u32 a = cpu->R[(cpu->CurInstr>>16) & 0xF]; \ u32 res = a + b + (cpu->CPSR&0x20000000 ? 1:0); \ + if (cpu->Num == 0) ((ARMv5*)cpu)->HandleInterlocksExecute(ilmask | (1 <<((cpu->CurInstr>>16) & 0xF))); \ if (c) cpu->AddCycles_CI(c); else cpu->AddCycles_C(); \ if (((cpu->CurInstr>>12) & 0xF) == 15) \ { \ @@ -495,6 +506,7 @@ A_IMPLEMENT_ALU_OP(ADD,) !res, \ CarryAdd(a, b) | CarryAdd(res_tmp, carry), \ OverflowAdc(a, b, carry)); \ + if (cpu->Num == 0) ((ARMv5*)cpu)->HandleInterlocksExecute(ilmask | (1 <<((cpu->CurInstr>>16) & 0xF))); \ if (c) cpu->AddCycles_CI(c); else cpu->AddCycles_C(); \ if (((cpu->CurInstr>>12) & 0xF) == 15) \ { \ @@ -511,6 +523,7 @@ A_IMPLEMENT_ALU_OP(ADC,) #define A_SBC(c) \ u32 a = cpu->R[(cpu->CurInstr>>16) & 0xF]; \ u32 res = a - b - (cpu->CPSR&0x20000000 ? 
0:1); \ + if (cpu->Num == 0) ((ARMv5*)cpu)->HandleInterlocksExecute(ilmask | (1 <<((cpu->CurInstr>>16) & 0xF))); \ if (c) cpu->AddCycles_CI(c); else cpu->AddCycles_C(); \ if (((cpu->CurInstr>>12) & 0xF) == 15) \ { \ @@ -530,6 +543,7 @@ A_IMPLEMENT_ALU_OP(ADC,) !res, \ CarrySub(a, b) & CarrySub(res_tmp, carry), \ OverflowSbc(a, b, carry)); \ + if (cpu->Num == 0) ((ARMv5*)cpu)->HandleInterlocksExecute(ilmask | (1 <<((cpu->CurInstr>>16) & 0xF))); \ if (c) cpu->AddCycles_CI(c); else cpu->AddCycles_C(); \ if (((cpu->CurInstr>>12) & 0xF) == 15) \ { \ @@ -546,6 +560,7 @@ A_IMPLEMENT_ALU_OP(SBC,) #define A_RSC(c) \ u32 a = cpu->R[(cpu->CurInstr>>16) & 0xF]; \ u32 res = b - a - (cpu->CPSR&0x20000000 ? 0:1); \ + if (cpu->Num == 0) ((ARMv5*)cpu)->HandleInterlocksExecute(ilmask | (1 <<((cpu->CurInstr>>16) & 0xF))); \ if (c) cpu->AddCycles_CI(c); else cpu->AddCycles_C(); \ if (((cpu->CurInstr>>12) & 0xF) == 15) \ { \ @@ -565,6 +580,7 @@ A_IMPLEMENT_ALU_OP(SBC,) !res, \ CarrySub(b, a) & CarrySub(res_tmp, carry), \ OverflowSbc(b, a, carry)); \ + if (cpu->Num == 0) ((ARMv5*)cpu)->HandleInterlocksExecute(ilmask | (1 <<((cpu->CurInstr>>16) & 0xF))); \ if (c) cpu->AddCycles_CI(c); else cpu->AddCycles_C(); \ if (((cpu->CurInstr>>12) & 0xF) == 15) \ { \ @@ -581,6 +597,8 @@ A_IMPLEMENT_ALU_OP(RSC,) #define A_TST(c) \ u32 a = cpu->R[(cpu->CurInstr>>16) & 0xF]; \ u32 res = a & b; \ + if (cpu->Num == 0) ((ARMv5*)cpu)->HandleInterlocksExecute(ilmask | (1 <<((cpu->CurInstr>>16) & 0xF))); \ + if (c) cpu->AddCycles_CI(c); else cpu->AddCycles_C(); \ if (((cpu->CurInstr>>12) & 0xF) == 15) [[unlikely]] /* this seems to trigger alu rd==15 behavior for arm7 and legacy instruction behavior for arm9 */ \ { \ if (cpu->Num == 1) \ @@ -601,8 +619,7 @@ A_IMPLEMENT_ALU_OP(RSC,) { \ cpu->SetNZ(res & 0x80000000, \ !res); \ - } \ - if (c) cpu->AddCycles_CI(c); else cpu->AddCycles_C(); + } A_IMPLEMENT_ALU_TEST(TST,_S) @@ -610,6 +627,8 @@ A_IMPLEMENT_ALU_TEST(TST,_S) #define A_TEQ(c) \ u32 a = 
cpu->R[(cpu->CurInstr>>16) & 0xF]; \ u32 res = a ^ b; \ + if (cpu->Num == 0) ((ARMv5*)cpu)->HandleInterlocksExecute(ilmask | (1 <<((cpu->CurInstr>>16) & 0xF))); \ + if (c) cpu->AddCycles_CI(c); else cpu->AddCycles_C(); \ if (((cpu->CurInstr>>12) & 0xF) == 15) [[unlikely]] /* this seems to trigger alu rd==15 behavior for arm7 and legacy instruction behavior for arm9 */ \ { \ if (cpu->Num == 1) \ @@ -630,8 +649,7 @@ A_IMPLEMENT_ALU_TEST(TST,_S) { \ cpu->SetNZ(res & 0x80000000, \ !res); \ - } \ - if (c) cpu->AddCycles_CI(c); else cpu->AddCycles_C(); + } A_IMPLEMENT_ALU_TEST(TEQ,_S) @@ -639,6 +657,8 @@ A_IMPLEMENT_ALU_TEST(TEQ,_S) #define A_CMP(c) \ u32 a = cpu->R[(cpu->CurInstr>>16) & 0xF]; \ u32 res = a - b; \ + if (cpu->Num == 0) ((ARMv5*)cpu)->HandleInterlocksExecute(ilmask | (1 <<((cpu->CurInstr>>16) & 0xF))); \ + if (c) cpu->AddCycles_CI(c); else cpu->AddCycles_C(); \ if (((cpu->CurInstr>>12) & 0xF) == 15) [[unlikely]] /* this seems to trigger alu rd==15 behavior for arm7 and legacy instruction behavior for arm9 */ \ { \ if (cpu->Num == 1) \ @@ -663,8 +683,7 @@ A_IMPLEMENT_ALU_TEST(TEQ,_S) !res, \ CarrySub(a, b), \ OverflowSub(a, b)); \ - } \ - if (c) cpu->AddCycles_CI(c); else cpu->AddCycles_C(); + } A_IMPLEMENT_ALU_TEST(CMP,) @@ -672,6 +691,8 @@ A_IMPLEMENT_ALU_TEST(CMP,) #define A_CMN(c) \ u32 a = cpu->R[(cpu->CurInstr>>16) & 0xF]; \ u32 res = a + b; \ + if (cpu->Num == 0) ((ARMv5*)cpu)->HandleInterlocksExecute(ilmask | (1 <<((cpu->CurInstr>>16) & 0xF))); \ + if (c) cpu->AddCycles_CI(c); else cpu->AddCycles_C(); \ if (((cpu->CurInstr>>12) & 0xF) == 15) [[unlikely]] /* this seems to trigger alu rd==15 behavior for arm7 and legacy instruction behavior for arm9 */ \ { \ if (cpu->Num == 1) \ @@ -696,8 +717,7 @@ A_IMPLEMENT_ALU_TEST(CMP,) !res, \ CarryAdd(a, b), \ OverflowAdd(a, b)); \ - } \ - if (c) cpu->AddCycles_CI(c); else cpu->AddCycles_C(); + } A_IMPLEMENT_ALU_TEST(CMN,) @@ -705,6 +725,7 @@ A_IMPLEMENT_ALU_TEST(CMN,) #define A_ORR(c) \ u32 a = 
cpu->R[(cpu->CurInstr>>16) & 0xF]; \ u32 res = a | b; \ + if (cpu->Num == 0) ((ARMv5*)cpu)->HandleInterlocksExecute(ilmask | (1 <<((cpu->CurInstr>>16) & 0xF))); \ if (c) cpu->AddCycles_CI(c); else cpu->AddCycles_C(); \ if (((cpu->CurInstr>>12) & 0xF) == 15) \ { \ @@ -720,6 +741,7 @@ A_IMPLEMENT_ALU_TEST(CMN,) u32 res = a | b; \ cpu->SetNZ(res & 0x80000000, \ !res); \ + if (cpu->Num == 0) ((ARMv5*)cpu)->HandleInterlocksExecute(ilmask | (1 <<((cpu->CurInstr>>16) & 0xF))); \ if (c) cpu->AddCycles_CI(c); else cpu->AddCycles_C(); \ if (((cpu->CurInstr>>12) & 0xF) == 15) \ { \ @@ -734,6 +756,7 @@ A_IMPLEMENT_ALU_OP(ORR,_S) #define A_MOV(c) \ + if (cpu->Num == 0) ((ARMv5*)cpu)->HandleInterlocksExecute(ilmask); \ if (c) cpu->AddCycles_CI(c); else cpu->AddCycles_C(); \ if (((cpu->CurInstr>>12) & 0xF) == 15) \ { \ @@ -747,6 +770,7 @@ A_IMPLEMENT_ALU_OP(ORR,_S) #define A_MOV_S(c) \ cpu->SetNZ(b & 0x80000000, \ !b); \ + if (cpu->Num == 0) ((ARMv5*)cpu)->HandleInterlocksExecute(ilmask); \ if (c) cpu->AddCycles_CI(c); else cpu->AddCycles_C(); \ if (((cpu->CurInstr>>12) & 0xF) == 15) \ { \ @@ -781,6 +805,7 @@ void A_MOV_REG_LSL_IMM_DBG(ARM* cpu) #define A_BIC(c) \ u32 a = cpu->R[(cpu->CurInstr>>16) & 0xF]; \ u32 res = a & ~b; \ + if (cpu->Num == 0) ((ARMv5*)cpu)->HandleInterlocksExecute(ilmask | (1 <<((cpu->CurInstr>>16) & 0xF))); \ if (c) cpu->AddCycles_CI(c); else cpu->AddCycles_C(); \ if (((cpu->CurInstr>>12) & 0xF) == 15) \ { \ @@ -796,6 +821,7 @@ void A_MOV_REG_LSL_IMM_DBG(ARM* cpu) u32 res = a & ~b; \ cpu->SetNZ(res & 0x80000000, \ !res); \ + if (cpu->Num == 0) ((ARMv5*)cpu)->HandleInterlocksExecute(ilmask | (1 <<((cpu->CurInstr>>16) & 0xF))); \ if (c) cpu->AddCycles_CI(c); else cpu->AddCycles_C(); \ if (((cpu->CurInstr>>12) & 0xF) == 15) \ { \ @@ -811,6 +837,7 @@ A_IMPLEMENT_ALU_OP(BIC,_S) #define A_MVN(c) \ b = ~b; \ + if (cpu->Num == 0) ((ARMv5*)cpu)->HandleInterlocksExecute(ilmask); \ if (c) cpu->AddCycles_CI(c); else cpu->AddCycles_C(); \ if (((cpu->CurInstr>>12) & 
0xF) == 15) \ { \ @@ -825,6 +852,7 @@ A_IMPLEMENT_ALU_OP(BIC,_S) b = ~b; \ cpu->SetNZ(b & 0x80000000, \ !b); \ + if (cpu->Num == 0) ((ARMv5*)cpu)->HandleInterlocksExecute(ilmask); \ if (c) cpu->AddCycles_CI(c); else cpu->AddCycles_C(); \ if (((cpu->CurInstr>>12) & 0xF) == 15) \ { \ @@ -859,12 +887,17 @@ void A_MUL(ARM* cpu) if (cpu->Num == 0) { - if (cpu->CurInstr & (1<<20)) cpu->AddCycles_CI(3); + ((ARMv5*)cpu)->HandleInterlocksExecute((1 << (cpu->CurInstr & 0xF)) | + (1 << ((cpu->CurInstr >> 8) & 0xF))); + if (cpu->CurInstr & (1<<20)) cpu->AddCycles_CI(3); // S else { cpu->AddCycles_C(); // 1 X + cpu->DataRegion = Mem9_Null; ((ARMv5*)cpu)->AddCycles_MW(2); // 2 M + ((ARMv5*)cpu)->ILCurrReg = (cpu->CurInstr >> 16) & 0xF; + ((ARMv5*)cpu)->ILCurrTime = ((ARMv5*)cpu)->TimestampActual + ((ARMv5*)cpu)->DataCycles; } } else @@ -899,12 +932,18 @@ void A_MLA(ARM* cpu) if (cpu->Num == 0) { + ((ARMv5*)cpu)->HandleInterlocksExecute((1 << (cpu->CurInstr & 0xF)) | + (1 << ((cpu->CurInstr >> 8) & 0xF)) | + (1 << ((cpu->CurInstr >> 12) & 0xF))); if (cpu->CurInstr & (1<<20)) cpu->AddCycles_CI(3); else { cpu->AddCycles_C(); // 1 X + cpu->DataRegion = Mem9_Null; ((ARMv5*)cpu)->AddCycles_MW(2); // 2 M + ((ARMv5*)cpu)->ILCurrReg = (cpu->CurInstr >> 16) & 0xF; + ((ARMv5*)cpu)->ILCurrTime = ((ARMv5*)cpu)->TimestampActual + ((ARMv5*)cpu)->DataCycles; } } else @@ -938,18 +977,31 @@ void A_UMULL(ARM* cpu) if (cpu->Num==1) cpu->SetC(0); } - u32 cycles; if (cpu->Num == 0) - cycles = (cpu->CurInstr & (1<<20)) ? 
4 : 2; + { + ((ARMv5*)cpu)->HandleInterlocksExecute((1 << (cpu->CurInstr & 0xF)) | + (1 << ((cpu->CurInstr >> 8) & 0xF))); + if (cpu->CurInstr & (1<<20)) cpu->AddCycles_CI(4); + else + { + cpu->AddCycles_CI(2); + + cpu->DataRegion = Mem9_Null; + ((ARMv5*)cpu)->AddCycles_MW(1); // dummy memory stage for interlock handling + ((ARMv5*)cpu)->ILCurrReg = (cpu->CurInstr >> 16) & 0xF; // only one rd interlocks + ((ARMv5*)cpu)->ILCurrTime = ((ARMv5*)cpu)->TimestampActual + ((ARMv5*)cpu)->DataCycles; + } + } else { + u32 cycles; if ((rs & 0xFFFFFF00) == 0x00000000) cycles = 2; else if ((rs & 0xFFFF0000) == 0x00000000) cycles = 3; else if ((rs & 0xFF000000) == 0x00000000) cycles = 4; else cycles = 5; + cpu->AddCycles_CI(cycles); } - cpu->AddCycles_CI(cycles); } void A_UMLAL(ARM* cpu) @@ -974,18 +1026,33 @@ void A_UMLAL(ARM* cpu) if (cpu->Num==1) cpu->SetC(0); } - u32 cycles; if (cpu->Num == 0) - cycles = (cpu->CurInstr & (1<<20)) ? 4 : 2; + { + ((ARMv5*)cpu)->HandleInterlocksExecute((1 << (cpu->CurInstr & 0xF)) | + (1 << ((cpu->CurInstr >> 8) & 0xF)) | + (1 << ((cpu->CurInstr >> 12) & 0xF)) | + (1 << ((cpu->CurInstr >> 16) & 0xF))); + if (cpu->CurInstr & (1<<20)) cpu->AddCycles_CI(4); + else + { + cpu->AddCycles_CI(2); + + cpu->DataRegion = Mem9_Null; + ((ARMv5*)cpu)->AddCycles_MW(1); // dummy memory stage for interlock handling + ((ARMv5*)cpu)->ILCurrReg = (cpu->CurInstr >> 16) & 0xF; // only one rd interlocks + ((ARMv5*)cpu)->ILCurrTime = ((ARMv5*)cpu)->TimestampActual + ((ARMv5*)cpu)->DataCycles; + } + } else { + u32 cycles; if ((rs & 0xFFFFFF00) == 0x00000000) cycles = 2; else if ((rs & 0xFFFF0000) == 0x00000000) cycles = 3; else if ((rs & 0xFF000000) == 0x00000000) cycles = 4; else cycles = 5; + cpu->AddCycles_CI(cycles); } - cpu->AddCycles_CI(cycles); } void A_SMULL(ARM* cpu) @@ -1007,18 +1074,30 @@ void A_SMULL(ARM* cpu) if (cpu->Num==1) cpu->SetC(0); } - u32 cycles; if (cpu->Num == 0) - cycles = (cpu->CurInstr & (1<<20)) ? 
4 : 2; + { + ((ARMv5*)cpu)->HandleInterlocksExecute((1 << (cpu->CurInstr & 0xF)) | + (1 << ((cpu->CurInstr >> 8) & 0xF))); + if (cpu->CurInstr & (1<<20)) cpu->AddCycles_CI(4); + else + { + cpu->AddCycles_CI(2); + + cpu->DataRegion = Mem9_Null; + ((ARMv5*)cpu)->AddCycles_MW(1); // dummy memory stage for interlock handling + ((ARMv5*)cpu)->ILCurrReg = (cpu->CurInstr >> 16) & 0xF; // only one rd interlocks + ((ARMv5*)cpu)->ILCurrTime = ((ARMv5*)cpu)->TimestampActual + ((ARMv5*)cpu)->DataCycles; + } + } else { + u32 cycles; if ((rs & 0xFFFFFF00) == 0x00000000 || (rs & 0xFFFFFF00) == 0xFFFFFF00) cycles = 2; else if ((rs & 0xFFFF0000) == 0x00000000 || (rs & 0xFFFF0000) == 0xFFFF0000) cycles = 3; else if ((rs & 0xFF000000) == 0x00000000 || (rs & 0xFF000000) == 0xFF000000) cycles = 4; else cycles = 5; + cpu->AddCycles_CI(cycles); } - - cpu->AddCycles_CI(cycles); } void A_SMLAL(ARM* cpu) @@ -1043,18 +1122,32 @@ void A_SMLAL(ARM* cpu) if (cpu->Num==1) cpu->SetC(0); } - u32 cycles; if (cpu->Num == 0) - cycles = (cpu->CurInstr & (1<<20)) ? 
4 : 2; + { + ((ARMv5*)cpu)->HandleInterlocksExecute((1 << (cpu->CurInstr & 0xF)) | + (1 << ((cpu->CurInstr >> 8) & 0xF)) | + (1 << ((cpu->CurInstr >> 12) & 0xF)) | + (1 << ((cpu->CurInstr >> 16) & 0xF))); + if (cpu->CurInstr & (1<<20)) cpu->AddCycles_CI(4); + else + { + cpu->AddCycles_CI(2); + + cpu->DataRegion = Mem9_Null; + ((ARMv5*)cpu)->AddCycles_MW(1); // dummy memory stage for interlock handling + ((ARMv5*)cpu)->ILCurrReg = (cpu->CurInstr >> 16) & 0xF; // only one rd interlocks + ((ARMv5*)cpu)->ILCurrTime = ((ARMv5*)cpu)->TimestampActual + ((ARMv5*)cpu)->DataCycles; + } + } else { + u32 cycles; if ((rs & 0xFFFFFF00) == 0x00000000 || (rs & 0xFFFFFF00) == 0xFFFFFF00) cycles = 2; else if ((rs & 0xFFFF0000) == 0x00000000 || (rs & 0xFFFF0000) == 0xFFFF0000) cycles = 3; else if ((rs & 0xFF000000) == 0x00000000 || (rs & 0xFF000000) == 0xFF000000) cycles = 4; else cycles = 5; + cpu->AddCycles_CI(cycles); } - - cpu->AddCycles_CI(cycles); } void A_SMLAxy(ARM* cpu) @@ -1078,8 +1171,17 @@ void A_SMLAxy(ARM* cpu) if (OverflowAdd(res_mul, rn)) cpu->CPSR |= 0x08000000; + - cpu->AddCycles_C(); // TODO: interlock?? + ((ARMv5*)cpu)->HandleInterlocksExecute((1 << (cpu->CurInstr & 0xF)) | + (1 << ((cpu->CurInstr >> 8) & 0xF)) | + (1 << ((cpu->CurInstr >> 12) & 0xF))); + cpu->AddCycles_C(); + + cpu->DataRegion = Mem9_Null; + ((ARMv5*)cpu)->AddCycles_MW(1); // dummy memory stage for interlock handling + ((ARMv5*)cpu)->ILCurrReg = (cpu->CurInstr >> 16) & 0xF; // only one rd interlocks + ((ARMv5*)cpu)->ILCurrTime = ((ARMv5*)cpu)->TimestampActual + ((ARMv5*)cpu)->DataCycles; } void A_SMLAWy(ARM* cpu) @@ -1101,7 +1203,16 @@ void A_SMLAWy(ARM* cpu) if (OverflowAdd(res_mul, rn)) cpu->CPSR |= 0x08000000; - cpu->AddCycles_C(); // TODO: interlock?? 
+ + ((ARMv5*)cpu)->HandleInterlocksExecute((1 << (cpu->CurInstr & 0xF)) | + (1 << ((cpu->CurInstr >> 8) & 0xF)) | + (1 << ((cpu->CurInstr >> 12) & 0xF))); + cpu->AddCycles_C(); + + cpu->DataRegion = Mem9_Null; + ((ARMv5*)cpu)->AddCycles_MW(1); // dummy memory stage for interlock handling + ((ARMv5*)cpu)->ILCurrReg = (cpu->CurInstr >> 16) & 0xF; // only one rd interlocks + ((ARMv5*)cpu)->ILCurrTime = ((ARMv5*)cpu)->TimestampActual + ((ARMv5*)cpu)->DataCycles; } void A_SMULxy(ARM* cpu) @@ -1120,7 +1231,16 @@ void A_SMULxy(ARM* cpu) if (((cpu->CurInstr >> 16) & 0xF) != 15) cpu->R[(cpu->CurInstr >> 16) & 0xF] = res; - cpu->AddCycles_C(); // TODO: interlock?? + + + ((ARMv5*)cpu)->HandleInterlocksExecute((1 << (cpu->CurInstr & 0xF)) | + (1 << ((cpu->CurInstr >> 8) & 0xF))); + cpu->AddCycles_C(); + + cpu->DataRegion = Mem9_Null; + ((ARMv5*)cpu)->AddCycles_MW(1); // dummy memory stage for interlock handling + ((ARMv5*)cpu)->ILCurrReg = (cpu->CurInstr >> 16) & 0xF; // only one rd interlocks + ((ARMv5*)cpu)->ILCurrTime = ((ARMv5*)cpu)->TimestampActual + ((ARMv5*)cpu)->DataCycles; } void A_SMULWy(ARM* cpu) @@ -1137,7 +1257,16 @@ void A_SMULWy(ARM* cpu) if (((cpu->CurInstr >> 16) & 0xF) != 15) cpu->R[(cpu->CurInstr >> 16) & 0xF] = res; - cpu->AddCycles_C(); // TODO: interlock?? 
+ + + ((ARMv5*)cpu)->HandleInterlocksExecute((1 << (cpu->CurInstr & 0xF)) | + (1 << ((cpu->CurInstr >> 8) & 0xF))); + cpu->AddCycles_C(); + + cpu->DataRegion = Mem9_Null; + ((ARMv5*)cpu)->AddCycles_MW(1); // dummy memory stage for interlock handling + ((ARMv5*)cpu)->ILCurrReg = (cpu->CurInstr >> 16) & 0xF; // only one rd interlocks + ((ARMv5*)cpu)->ILCurrTime = ((ARMv5*)cpu)->TimestampActual + ((ARMv5*)cpu)->DataCycles; } void A_SMLALxy(ARM* cpu) @@ -1162,10 +1291,18 @@ void A_SMLALxy(ARM* cpu) if (((cpu->CurInstr >> 16) & 0xF) != 15) cpu->R[(cpu->CurInstr >> 16) & 0xF] = (u32)(res >> 32ULL); - + + + + ((ARMv5*)cpu)->HandleInterlocksExecute((1 << (cpu->CurInstr & 0xF)) | + (1 << ((cpu->CurInstr >> 8) & 0xF)) | + (1 << ((cpu->CurInstr >> 12) & 0xF)) | + (1 << ((cpu->CurInstr >> 16) & 0xF))); cpu->AddCycles_C(); // 1 X cpu->DataRegion = Mem9_Null; ((ARMv5*)cpu)->AddCycles_MW(2); // 2 M + ((ARMv5*)cpu)->ILCurrReg = (cpu->CurInstr >> 16) & 0xF; // only one rd interlocks + ((ARMv5*)cpu)->ILCurrTime = ((ARMv5*)cpu)->TimestampActual + ((ARMv5*)cpu)->DataCycles; } @@ -1192,6 +1329,8 @@ void A_CLZ(ARM* cpu) if (((cpu->CurInstr >> 12) & 0xF) == 15) cpu->JumpTo(res & ~1); else cpu->R[(cpu->CurInstr >> 12) & 0xF] = res; + + ((ARMv5*)cpu)->HandleInterlocksExecute(cpu->CurInstr & 0xF); cpu->AddCycles_C(); } @@ -1213,7 +1352,13 @@ void A_QADD(ARM* cpu) if (((cpu->CurInstr >> 12) & 0xF) != 15) cpu->R[(cpu->CurInstr >> 12) & 0xF] = res; - cpu->AddCycles_C(); // TODO: interlock?? 
+ ((ARMv5*)cpu)->HandleInterlocksExecute((1 << (cpu->CurInstr & 0xF)) | (1 << ((cpu->CurInstr >> 16) & 0xF))); + cpu->AddCycles_C(); + + cpu->DataRegion = Mem9_Null; + ((ARMv5*)cpu)->AddCycles_MW(1); // dummy memory stage for interlock handling + ((ARMv5*)cpu)->ILCurrReg = (cpu->CurInstr >> 12) & 0xF; // only one rd interlocks + ((ARMv5*)cpu)->ILCurrTime = ((ARMv5*)cpu)->TimestampActual + ((ARMv5*)cpu)->DataCycles; } void A_QSUB(ARM* cpu) @@ -1233,7 +1378,13 @@ void A_QSUB(ARM* cpu) if (((cpu->CurInstr >> 12) & 0xF) != 15) cpu->R[(cpu->CurInstr >> 12) & 0xF] = res; - cpu->AddCycles_C(); // TODO: interlock?? + ((ARMv5*)cpu)->HandleInterlocksExecute((1 << (cpu->CurInstr & 0xF)) | (1 << ((cpu->CurInstr >> 16) & 0xF))); + cpu->AddCycles_C(); + + cpu->DataRegion = Mem9_Null; + ((ARMv5*)cpu)->AddCycles_MW(1); // dummy memory stage for interlock handling + ((ARMv5*)cpu)->ILCurrReg = (cpu->CurInstr >> 12) & 0xF; // only one rd interlocks + ((ARMv5*)cpu)->ILCurrTime = ((ARMv5*)cpu)->TimestampActual + ((ARMv5*)cpu)->DataCycles; } void A_QDADD(ARM* cpu) @@ -1261,7 +1412,13 @@ void A_QDADD(ARM* cpu) if (((cpu->CurInstr >> 12) & 0xF) != 15) cpu->R[(cpu->CurInstr >> 12) & 0xF] = res; - cpu->AddCycles_C(); // TODO: interlock?? + ((ARMv5*)cpu)->HandleInterlocksExecute((1 << (cpu->CurInstr & 0xF)) | (1 << ((cpu->CurInstr >> 16) & 0xF))); + cpu->AddCycles_C(); + + cpu->DataRegion = Mem9_Null; + ((ARMv5*)cpu)->AddCycles_MW(1); // dummy memory stage for interlock handling + ((ARMv5*)cpu)->ILCurrReg = (cpu->CurInstr >> 12) & 0xF; // only one rd interlocks + ((ARMv5*)cpu)->ILCurrTime = ((ARMv5*)cpu)->TimestampActual + ((ARMv5*)cpu)->DataCycles; } void A_QDSUB(ARM* cpu) @@ -1289,7 +1446,13 @@ void A_QDSUB(ARM* cpu) if (((cpu->CurInstr >> 12) & 0xF) != 15) cpu->R[(cpu->CurInstr >> 12) & 0xF] = res; - cpu->AddCycles_C(); // TODO: interlock?? 
+ ((ARMv5*)cpu)->HandleInterlocksExecute((1 << (cpu->CurInstr & 0xF)) | (1 << ((cpu->CurInstr >> 16) & 0xF))); + cpu->AddCycles_C(); + + cpu->DataRegion = Mem9_Null; + ((ARMv5*)cpu)->AddCycles_MW(1); // dummy memory stage for interlock handling + ((ARMv5*)cpu)->ILCurrReg = (cpu->CurInstr >> 12) & 0xF; // only one rd interlocks + ((ARMv5*)cpu)->ILCurrTime = ((ARMv5*)cpu)->TimestampActual + ((ARMv5*)cpu)->DataCycles; } @@ -1306,6 +1469,7 @@ void T_LSL_IMM(ARM* cpu) cpu->R[cpu->CurInstr & 0x7] = op; cpu->SetNZ(op & 0x80000000, !op); + if (cpu->Num == 0) ((ARMv5*)cpu)->HandleInterlocksExecute((cpu->CurInstr >> 3) & 0x7); cpu->AddCycles_C(); } @@ -1317,6 +1481,7 @@ void T_LSR_IMM(ARM* cpu) cpu->R[cpu->CurInstr & 0x7] = op; cpu->SetNZ(op & 0x80000000, !op); + if (cpu->Num == 0) ((ARMv5*)cpu)->HandleInterlocksExecute((cpu->CurInstr >> 3) & 0x7); cpu->AddCycles_C(); } @@ -1328,6 +1493,7 @@ void T_ASR_IMM(ARM* cpu) cpu->R[cpu->CurInstr & 0x7] = op; cpu->SetNZ(op & 0x80000000, !op); + if (cpu->Num == 0) ((ARMv5*)cpu)->HandleInterlocksExecute((cpu->CurInstr >> 3) & 0x7); cpu->AddCycles_C(); } @@ -1341,6 +1507,7 @@ void T_ADD_REG_(ARM* cpu) !res, CarryAdd(a, b), OverflowAdd(a, b)); + if (cpu->Num == 0) ((ARMv5*)cpu)->HandleInterlocksExecute((1 << ((cpu->CurInstr >> 3) & 0x7)) | (1 << ((cpu->CurInstr >> 6) & 0x7))); cpu->AddCycles_C(); } @@ -1354,6 +1521,7 @@ void T_SUB_REG_(ARM* cpu) !res, CarrySub(a, b), OverflowSub(a, b)); + if (cpu->Num == 0) ((ARMv5*)cpu)->HandleInterlocksExecute((1 << ((cpu->CurInstr >> 3) & 0x7)) | (1 << ((cpu->CurInstr >> 6) & 0x7))); cpu->AddCycles_C(); } @@ -1367,6 +1535,7 @@ void T_ADD_IMM_(ARM* cpu) !res, CarryAdd(a, b), OverflowAdd(a, b)); + if (cpu->Num == 0) ((ARMv5*)cpu)->HandleInterlocksExecute((cpu->CurInstr >> 3) & 0x7); cpu->AddCycles_C(); } @@ -1380,6 +1549,7 @@ void T_SUB_IMM_(ARM* cpu) !res, CarrySub(a, b), OverflowSub(a, b)); + if (cpu->Num == 0) ((ARMv5*)cpu)->HandleInterlocksExecute((cpu->CurInstr >> 3) & 0x7); cpu->AddCycles_C(); } @@ 
-1389,6 +1559,7 @@ void T_MOV_IMM(ARM* cpu) cpu->R[(cpu->CurInstr >> 8) & 0x7] = b; cpu->SetNZ(0, !b); + if (cpu->Num == 0) ((ARMv5*)cpu)->HandleInterlocksExecute((cpu->CurInstr >> 8) & 0x7); cpu->AddCycles_C(); } @@ -1401,6 +1572,7 @@ void T_CMP_IMM(ARM* cpu) !res, CarrySub(a, b), OverflowSub(a, b)); + if (cpu->Num == 0) ((ARMv5*)cpu)->HandleInterlocksExecute((cpu->CurInstr >> 8) & 0x7); cpu->AddCycles_C(); } @@ -1414,6 +1586,7 @@ void T_ADD_IMM(ARM* cpu) !res, CarryAdd(a, b), OverflowAdd(a, b)); + if (cpu->Num == 0) ((ARMv5*)cpu)->HandleInterlocksExecute((cpu->CurInstr >> 8) & 0x7); cpu->AddCycles_C(); } @@ -1427,6 +1600,7 @@ void T_SUB_IMM(ARM* cpu) !res, CarrySub(a, b), OverflowSub(a, b)); + if (cpu->Num == 0) ((ARMv5*)cpu)->HandleInterlocksExecute((cpu->CurInstr >> 8) & 0x7); cpu->AddCycles_C(); } @@ -1439,6 +1613,7 @@ void T_AND_REG(ARM* cpu) cpu->R[cpu->CurInstr & 0x7] = res; cpu->SetNZ(res & 0x80000000, !res); + if (cpu->Num == 0) ((ARMv5*)cpu)->HandleInterlocksExecute((1 << (cpu->CurInstr & 0x7)) | (1 << ((cpu->CurInstr >> 3) & 0x7))); cpu->AddCycles_C(); } @@ -1450,6 +1625,7 @@ void T_EOR_REG(ARM* cpu) cpu->R[cpu->CurInstr & 0x7] = res; cpu->SetNZ(res & 0x80000000, !res); + if (cpu->Num == 0) ((ARMv5*)cpu)->HandleInterlocksExecute((1 << (cpu->CurInstr & 0x7)) | (1 << ((cpu->CurInstr >> 3) & 0x7))); cpu->AddCycles_C(); } @@ -1461,6 +1637,7 @@ void T_LSL_REG(ARM* cpu) cpu->R[cpu->CurInstr & 0x7] = a; cpu->SetNZ(a & 0x80000000, !a); + if (cpu->Num == 0) ((ARMv5*)cpu)->HandleInterlocksExecute((1 << (cpu->CurInstr & 0x7)) | (1 << ((cpu->CurInstr >> 3) & 0x7))); cpu->AddCycles_CI(1); } @@ -1472,6 +1649,7 @@ void T_LSR_REG(ARM* cpu) cpu->R[cpu->CurInstr & 0x7] = a; cpu->SetNZ(a & 0x80000000, !a); + if (cpu->Num == 0) ((ARMv5*)cpu)->HandleInterlocksExecute((1 << (cpu->CurInstr & 0x7)) | (1 << ((cpu->CurInstr >> 3) & 0x7))); cpu->AddCycles_CI(1); } @@ -1483,6 +1661,7 @@ void T_ASR_REG(ARM* cpu) cpu->R[cpu->CurInstr & 0x7] = a; cpu->SetNZ(a & 0x80000000, !a); + if 
(cpu->Num == 0) ((ARMv5*)cpu)->HandleInterlocksExecute((1 << (cpu->CurInstr & 0x7)) | (1 << ((cpu->CurInstr >> 3) & 0x7))); cpu->AddCycles_CI(1); } @@ -1498,6 +1677,7 @@ void T_ADC_REG(ARM* cpu) !res, CarryAdd(a, b) | CarryAdd(res_tmp, carry), OverflowAdc(a, b, carry)); + if (cpu->Num == 0) ((ARMv5*)cpu)->HandleInterlocksExecute((1 << (cpu->CurInstr & 0x7)) | (1 << ((cpu->CurInstr >> 3) & 0x7))); cpu->AddCycles_C(); } @@ -1513,6 +1693,7 @@ void T_SBC_REG(ARM* cpu) !res, CarrySub(a, b) & CarrySub(res_tmp, carry), OverflowSbc(a, b, carry)); + if (cpu->Num == 0) ((ARMv5*)cpu)->HandleInterlocksExecute((1 << (cpu->CurInstr & 0x7)) | (1 << ((cpu->CurInstr >> 3) & 0x7))); cpu->AddCycles_C(); } @@ -1524,6 +1705,7 @@ void T_ROR_REG(ARM* cpu) cpu->R[cpu->CurInstr & 0x7] = a; cpu->SetNZ(a & 0x80000000, !a); + if (cpu->Num == 0) ((ARMv5*)cpu)->HandleInterlocksExecute((1 << (cpu->CurInstr & 0x7)) | (1 << ((cpu->CurInstr >> 3) & 0x7))); cpu->AddCycles_CI(1); } @@ -1534,6 +1716,7 @@ void T_TST_REG(ARM* cpu) u32 res = a & b; cpu->SetNZ(res & 0x80000000, !res); + if (cpu->Num == 0) ((ARMv5*)cpu)->HandleInterlocksExecute((1 << (cpu->CurInstr & 0x7)) | (1 << ((cpu->CurInstr >> 3) & 0x7))); cpu->AddCycles_C(); } @@ -1546,6 +1729,7 @@ void T_NEG_REG(ARM* cpu) !res, CarrySub(0, b), OverflowSub(0, b)); + if (cpu->Num == 0) ((ARMv5*)cpu)->HandleInterlocksExecute((cpu->CurInstr >> 3) & 0x7); cpu->AddCycles_C(); } @@ -1558,6 +1742,7 @@ void T_CMP_REG(ARM* cpu) !res, CarrySub(a, b), OverflowSub(a, b)); + if (cpu->Num == 0) ((ARMv5*)cpu)->HandleInterlocksExecute((1 << (cpu->CurInstr & 0x7)) | (1 << ((cpu->CurInstr >> 3) & 0x7))); cpu->AddCycles_C(); } @@ -1570,6 +1755,7 @@ void T_CMN_REG(ARM* cpu) !res, CarryAdd(a, b), OverflowAdd(a, b)); + if (cpu->Num == 0) ((ARMv5*)cpu)->HandleInterlocksExecute((1 << (cpu->CurInstr & 0x7)) | (1 << ((cpu->CurInstr >> 3) & 0x7))); cpu->AddCycles_C(); } @@ -1581,6 +1767,7 @@ void T_ORR_REG(ARM* cpu) cpu->R[cpu->CurInstr & 0x7] = res; cpu->SetNZ(res & 
0x80000000, !res); + if (cpu->Num == 0) ((ARMv5*)cpu)->HandleInterlocksExecute((1 << (cpu->CurInstr & 0x7)) | (1 << ((cpu->CurInstr >> 3) & 0x7))); cpu->AddCycles_C(); } @@ -1597,6 +1784,7 @@ void T_MUL_REG(ARM* cpu) if (cpu->Num == 0) { cycles += 3; + ((ARMv5*)cpu)->HandleInterlocksExecute((1 << (cpu->CurInstr & 0x7)) | (1 << ((cpu->CurInstr >> 3) & 0x7))); } else { @@ -1606,7 +1794,7 @@ void T_MUL_REG(ARM* cpu) else if (a & 0x0000FF00) cycles += 2; else cycles += 1; } - cpu->AddCycles_CI(cycles); + cpu->AddCycles_CI(cycles); // implemented as S variant, doesn't interlock } void T_BIC_REG(ARM* cpu) @@ -1617,6 +1805,7 @@ void T_BIC_REG(ARM* cpu) cpu->R[cpu->CurInstr & 0x7] = res; cpu->SetNZ(res & 0x80000000, !res); + if (cpu->Num == 0) ((ARMv5*)cpu)->HandleInterlocksExecute((1 << (cpu->CurInstr & 0x7)) | (1 << ((cpu->CurInstr >> 3) & 0x7))); cpu->AddCycles_C(); } @@ -1627,6 +1816,7 @@ void T_MVN_REG(ARM* cpu) cpu->R[cpu->CurInstr & 0x7] = res; cpu->SetNZ(res & 0x80000000, !res); + if (cpu->Num == 0) ((ARMv5*)cpu)->HandleInterlocksExecute((cpu->CurInstr >> 3) & 0x7); cpu->AddCycles_C(); } @@ -1643,7 +1833,8 @@ void T_ADD_HIREG(ARM* cpu) u32 b = cpu->R[rs]; cpu->AddCycles_C(); - + + if (cpu->Num == 0) ((ARMv5*)cpu)->HandleInterlocksExecute((1 << rd) | (1 << rs)); if (rd == 15) { cpu->JumpTo((a + b) | 1); @@ -1678,6 +1869,7 @@ void T_CMP_HIREG(ARM* cpu) cpu->CPSR |= 0x20; // keep it from crashing the emulator at least } } + else if (cpu->Num == 0) ((ARMv5*)cpu)->HandleInterlocksExecute((1 << rd) | (1 << rs)); cpu->AddCycles_C(); } @@ -1687,6 +1879,7 @@ void T_MOV_HIREG(ARM* cpu) u32 rd = (cpu->CurInstr & 0x7) | ((cpu->CurInstr >> 4) & 0x8); u32 rs = (cpu->CurInstr >> 3) & 0xF; + if (cpu->Num == 0) ((ARMv5*)cpu)->HandleInterlocksExecute((1 << rd) | (1 << rs)); cpu->AddCycles_C(); if (rd == 15) @@ -1717,6 +1910,8 @@ void T_ADD_PCREL(ARM* cpu) u32 val = cpu->R[15] & ~2; val += ((cpu->CurInstr & 0xFF) << 2); cpu->R[(cpu->CurInstr >> 8) & 0x7] = val; + + if (cpu->Num == 0) 
((ARMv5*)cpu)->HandleInterlocksExecute(15); cpu->AddCycles_C(); } @@ -1725,6 +1920,8 @@ void T_ADD_SPREL(ARM* cpu) u32 val = cpu->R[13]; val += ((cpu->CurInstr & 0xFF) << 2); cpu->R[(cpu->CurInstr >> 8) & 0x7] = val; + + if (cpu->Num == 0) ((ARMv5*)cpu)->HandleInterlocksExecute(13); cpu->AddCycles_C(); } @@ -1736,6 +1933,8 @@ void T_ADD_SP(ARM* cpu) else val += ((cpu->CurInstr & 0x7F) << 2); cpu->R[13] = val; + + if (cpu->Num == 0) ((ARMv5*)cpu)->HandleInterlocksExecute(13); cpu->AddCycles_C(); } diff --git a/src/ARMInterpreter_Branch.cpp b/src/ARMInterpreter_Branch.cpp index a95aa27d..35bbbc52 100644 --- a/src/ARMInterpreter_Branch.cpp +++ b/src/ARMInterpreter_Branch.cpp @@ -51,12 +51,14 @@ void A_BLX_IMM(ARM* cpu) void A_BX(ARM* cpu) { + if (cpu->Num==0) ((ARMv5*)cpu)->HandleInterlocksExecute(cpu->CurInstr&0xF); cpu->AddCycles_C(); cpu->JumpTo(cpu->R[cpu->CurInstr & 0xF]); } void A_BLX_REG(ARM* cpu) { + if (cpu->Num==0) ((ARMv5*)cpu)->HandleInterlocksExecute(cpu->CurInstr&0xF); cpu->AddCycles_C(); u32 lr = cpu->R[15] - 4; cpu->JumpTo(cpu->R[cpu->CurInstr & 0xF]); @@ -77,12 +79,14 @@ void T_BCOND(ARM* cpu) void T_BX(ARM* cpu) { + if (cpu->Num==0) ((ARMv5*)cpu)->HandleInterlocksExecute((cpu->CurInstr >> 3) & 0xF); cpu->AddCycles_C(); cpu->JumpTo(cpu->R[(cpu->CurInstr >> 3) & 0xF]); } void T_BLX_REG(ARM* cpu) { + if (cpu->Num==0) ((ARMv5*)cpu)->HandleInterlocksExecute((cpu->CurInstr >> 3) & 0xF); cpu->AddCycles_C(); if (cpu->Num==1) { diff --git a/src/ARMInterpreter_LoadStore.cpp b/src/ARMInterpreter_LoadStore.cpp index bab4f25b..3022b94d 100644 --- a/src/ARMInterpreter_LoadStore.cpp +++ b/src/ARMInterpreter_LoadStore.cpp @@ -18,15 +18,20 @@ #include #include "ARM.h" +#include "NDS.h" namespace melonDS::ARMInterpreter { - -void ExecuteStage(ARM* cpu) +template +inline void ExecuteStage(ARM* cpu, u16 ilmask) { - if (cpu->Num == 0) cpu->AddCycles_C(); + if (cpu->Num == 0) + { + ((ARMv5*)cpu)->HandleInterlocksExecute(ilmask); + ((ARMv5*)cpu)->AddCycles_C(); + } } @@ 
-56,13 +61,15 @@ void ExecuteStage(ARM* cpu) #define A_WB_CALC_OFFSET_IMM \ u32 offset = (cpu->CurInstr & 0xFFF); \ - if (!(cpu->CurInstr & (1<<23))) offset = -offset; + if (!(cpu->CurInstr & (1<<23))) offset = -offset; \ + u16 ilmask = 0; #define A_WB_CALC_OFFSET_REG(shiftop) \ u32 offset = cpu->R[cpu->CurInstr & 0xF]; \ u32 shift = ((cpu->CurInstr>>7)&0x1F); \ shiftop(offset, shift); \ - if (!(cpu->CurInstr & (1<<23))) offset = -offset; + if (!(cpu->CurInstr & (1<<23))) offset = -offset; \ + u16 ilmask = 1 << (cpu->CurInstr & 0xF); enum class Writeback { @@ -72,11 +79,12 @@ enum class Writeback Trans, }; -template -void LoadSingle(ARM* cpu, u8 rd, u8 rn, s32 offset) +template +void LoadSingle(ARM* cpu, u8 rd, u8 rn, s32 offset, u16 ilmask) { - ExecuteStage(cpu); static_assert((size == 8) || (size == 16) || (size == 32), "dummy this function only takes 8/16/32 for size!!!"); + + ExecuteStage(cpu, (ilmask | (1<R[rn]; @@ -117,15 +125,25 @@ void LoadSingle(ARM* cpu, u8 rd, u8 rn, s32 offset) if (cpu->Num==1 || (((ARMv5*)cpu)->CP15Control & (1<<15))) val &= ~0x1; cpu->JumpTo(val); } - else cpu->R[rd] = val; + else + { + cpu->R[rd] = val; + if (cpu->Num == 0) + { + ((ARMv5*)cpu)->ILCurrReg = rd; + bool extra = ((size < 32) || (signror && (addr&0x3))); + ((ARMv5*)cpu)->ILCurrTime = ((ARMv5*)cpu)->TimestampActual + ((ARMv5*)cpu)->DataCycles + extra; + } + } } -template -void StoreSingle(ARM* cpu, u8 rd, u8 rn, s32 offset) +template +void StoreSingle(ARM* cpu, u8 rd, u8 rn, s32 offset, u16 ilmask) { - ExecuteStage(cpu); static_assert((size == 8) || (size == 16) || (size == 32), "dummy this function only takes 8/16/32 for size!!!"); + ExecuteStage(cpu, (ilmask | (1<R[rn]; else addr = cpu->R[rn]; @@ -139,6 +157,8 @@ void StoreSingle(ARM* cpu, u8 rd, u8 rn, s32 offset) ((ARMv5*)cpu)->PU_Map = ((ARMv5*)cpu)->PU_UserMap; } + if (cpu->Num == 0) ((ARMv5*)cpu)->HandleInterlocksMemory(rd); + bool dabort; if constexpr (size == 8) dabort = !cpu->DataWrite8 (addr, storeval); if 
constexpr (size == 16) dabort = !cpu->DataWrite16(addr, storeval); @@ -162,36 +182,36 @@ void StoreSingle(ARM* cpu, u8 rd, u8 rn, s32 offset) #define A_STR \ - if (cpu->CurInstr & (1<<21)) StoreSingle<32, Writeback::Pre>(cpu, ((cpu->CurInstr>>12) & 0xF), ((cpu->CurInstr>>16) & 0xF), offset); \ - else StoreSingle<32, Writeback::None>(cpu, ((cpu->CurInstr>>12) & 0xF), ((cpu->CurInstr>>16) & 0xF), offset); + if (cpu->CurInstr & (1<<21)) StoreSingle<32, Writeback::Pre, true>(cpu, ((cpu->CurInstr>>12) & 0xF), ((cpu->CurInstr>>16) & 0xF), offset, ilmask); \ + else StoreSingle<32, Writeback::None, true>(cpu, ((cpu->CurInstr>>12) & 0xF), ((cpu->CurInstr>>16) & 0xF), offset, ilmask); #define A_STR_POST \ - if (cpu->CurInstr & (1<<21)) StoreSingle<32, Writeback::Trans>(cpu, ((cpu->CurInstr>>12) & 0xF), ((cpu->CurInstr>>16) & 0xF), offset); \ - else StoreSingle<32, Writeback::Post>(cpu, ((cpu->CurInstr>>12) & 0xF), ((cpu->CurInstr>>16) & 0xF), offset); + if (cpu->CurInstr & (1<<21)) StoreSingle<32, Writeback::Trans, true>(cpu, ((cpu->CurInstr>>12) & 0xF), ((cpu->CurInstr>>16) & 0xF), offset, ilmask); \ + else StoreSingle<32, Writeback::Post, true>(cpu, ((cpu->CurInstr>>12) & 0xF), ((cpu->CurInstr>>16) & 0xF), offset, ilmask); #define A_STRB \ - if (cpu->CurInstr & (1<<21)) StoreSingle<8, Writeback::Pre>(cpu, ((cpu->CurInstr>>12) & 0xF), ((cpu->CurInstr>>16) & 0xF), offset); \ - else StoreSingle<8, Writeback::None>(cpu, ((cpu->CurInstr>>12) & 0xF), ((cpu->CurInstr>>16) & 0xF), offset); + if (cpu->CurInstr & (1<<21)) StoreSingle<8, Writeback::Pre, true>(cpu, ((cpu->CurInstr>>12) & 0xF), ((cpu->CurInstr>>16) & 0xF), offset, ilmask); \ + else StoreSingle<8, Writeback::None, true>(cpu, ((cpu->CurInstr>>12) & 0xF), ((cpu->CurInstr>>16) & 0xF), offset, ilmask); #define A_STRB_POST \ - if (cpu->CurInstr & (1<<21)) StoreSingle<8, Writeback::Trans>(cpu, ((cpu->CurInstr>>12) & 0xF), ((cpu->CurInstr>>16) & 0xF), offset); \ - else StoreSingle<8, Writeback::Post>(cpu, ((cpu->CurInstr>>12) 
& 0xF), ((cpu->CurInstr>>16) & 0xF), offset); + if (cpu->CurInstr & (1<<21)) StoreSingle<8, Writeback::Trans, true>(cpu, ((cpu->CurInstr>>12) & 0xF), ((cpu->CurInstr>>16) & 0xF), offset, ilmask); \ + else StoreSingle<8, Writeback::Post, true>(cpu, ((cpu->CurInstr>>12) & 0xF), ((cpu->CurInstr>>16) & 0xF), offset, ilmask); #define A_LDR \ - if (cpu->CurInstr & (1<<21)) LoadSingle(cpu, ((cpu->CurInstr>>12) & 0xF), ((cpu->CurInstr>>16) & 0xF), offset); \ - else LoadSingle(cpu, ((cpu->CurInstr>>12) & 0xF), ((cpu->CurInstr>>16) & 0xF), offset); + if (cpu->CurInstr & (1<<21)) LoadSingle(cpu, ((cpu->CurInstr>>12) & 0xF), ((cpu->CurInstr>>16) & 0xF), offset, ilmask); \ + else LoadSingle(cpu, ((cpu->CurInstr>>12) & 0xF), ((cpu->CurInstr>>16) & 0xF), offset, ilmask); #define A_LDR_POST \ - if (cpu->CurInstr & (1<<21)) LoadSingle(cpu, ((cpu->CurInstr>>12) & 0xF), ((cpu->CurInstr>>16) & 0xF), offset); \ - else LoadSingle(cpu, ((cpu->CurInstr>>12) & 0xF), ((cpu->CurInstr>>16) & 0xF), offset); + if (cpu->CurInstr & (1<<21)) LoadSingle(cpu, ((cpu->CurInstr>>12) & 0xF), ((cpu->CurInstr>>16) & 0xF), offset, ilmask); \ + else LoadSingle(cpu, ((cpu->CurInstr>>12) & 0xF), ((cpu->CurInstr>>16) & 0xF), offset, ilmask); #define A_LDRB \ - if (cpu->CurInstr & (1<<21)) LoadSingle(cpu, ((cpu->CurInstr>>12) & 0xF), ((cpu->CurInstr>>16) & 0xF), offset); \ - else LoadSingle(cpu, ((cpu->CurInstr>>12) & 0xF), ((cpu->CurInstr>>16) & 0xF), offset); + if (cpu->CurInstr & (1<<21)) LoadSingle(cpu, ((cpu->CurInstr>>12) & 0xF), ((cpu->CurInstr>>16) & 0xF), offset, ilmask); \ + else LoadSingle(cpu, ((cpu->CurInstr>>12) & 0xF), ((cpu->CurInstr>>16) & 0xF), offset, ilmask); #define A_LDRB_POST \ - if (cpu->CurInstr & (1<<21)) LoadSingle(cpu, ((cpu->CurInstr>>12) & 0xF), ((cpu->CurInstr>>16) & 0xF), offset); \ - else LoadSingle(cpu, ((cpu->CurInstr>>12) & 0xF), ((cpu->CurInstr>>16) & 0xF), offset); + if (cpu->CurInstr & (1<<21)) LoadSingle(cpu, ((cpu->CurInstr>>12) & 0xF), ((cpu->CurInstr>>16) & 0xF), 
offset, ilmask); \ + else LoadSingle(cpu, ((cpu->CurInstr>>12) & 0xF), ((cpu->CurInstr>>16) & 0xF), offset, ilmask); @@ -266,63 +286,72 @@ A_IMPLEMENT_WB_LDRSTR(LDRB) #define A_HD_CALC_OFFSET_IMM \ u32 offset = (cpu->CurInstr & 0xF) | ((cpu->CurInstr >> 4) & 0xF0); \ - if (!(cpu->CurInstr & (1<<23))) offset = -offset; + if (!(cpu->CurInstr & (1<<23))) offset = -offset; \ + u16 ilmask = 0; #define A_HD_CALC_OFFSET_REG \ u32 offset = cpu->R[cpu->CurInstr & 0xF]; \ - if (!(cpu->CurInstr & (1<<23))) offset = -offset; + if (!(cpu->CurInstr & (1<<23))) offset = -offset; \ + u16 ilmask = 1 << (cpu->CurInstr & 0xF); #define A_STRH \ - if (cpu->CurInstr & (1<<21)) StoreSingle<16, Writeback::Pre>(cpu, ((cpu->CurInstr>>12) & 0xF), ((cpu->CurInstr>>16) & 0xF), offset); \ - else StoreSingle<16, Writeback::None>(cpu, ((cpu->CurInstr>>12) & 0xF), ((cpu->CurInstr>>16) & 0xF), offset); + if (cpu->CurInstr & (1<<21)) StoreSingle<16, Writeback::Pre, true>(cpu, ((cpu->CurInstr>>12) & 0xF), ((cpu->CurInstr>>16) & 0xF), offset, ilmask); \ + else StoreSingle<16, Writeback::None, true>(cpu, ((cpu->CurInstr>>12) & 0xF), ((cpu->CurInstr>>16) & 0xF), offset, ilmask); #define A_STRH_POST \ - StoreSingle<16, Writeback::Post>(cpu, ((cpu->CurInstr>>12) & 0xF), ((cpu->CurInstr>>16) & 0xF), offset); + StoreSingle<16, Writeback::Post, true>(cpu, ((cpu->CurInstr>>12) & 0xF), ((cpu->CurInstr>>16) & 0xF), offset, ilmask); // TODO: CHECK LDRD/STRD TIMINGS!! #define A_LDRD \ if (cpu->Num != 0) return; \ - ExecuteStage(cpu); \ offset += cpu->R[(cpu->CurInstr>>16) & 0xF]; \ u32 r = (cpu->CurInstr>>12) & 0xF; \ if (r&1) { A_UNK(cpu); return; } \ + ExecuteStage(cpu, ilmask | (1 << ((cpu->CurInstr>>16) & 0xF))); \ bool dabort = !cpu->DataRead32(offset, &cpu->R[r]); \ u32 val; dabort |= !cpu->DataRead32S(offset+4, &val); \ + cpu->AddCycles_CDI(); \ if (dabort) { \ - cpu->AddCycles_CDI(); \ ((ARMv5*)cpu)->DataAbort(); \ return; } \ if (r == 14) cpu->JumpTo(((((ARMv5*)cpu)->CP15Control & (1<<15)) ? 
(val & ~0x1) : val), cpu->CurInstr & (1<<22)); /* restores cpsr presumably due to shared dna with ldm */ \ - else cpu->R[r+1] = val; \ - cpu->AddCycles_CDI(); \ + else { \ + cpu->R[r+1] = val; \ + if (cpu->Num == 0) { \ + ((ARMv5*)cpu)->ILCurrReg = r+1; \ + ((ARMv5*)cpu)->ILCurrTime = ((ARMv5*)cpu)->TimestampActual + ((ARMv5*)cpu)->DataCycles; } } \ if (cpu->CurInstr & (1<<21)) cpu->R[(cpu->CurInstr>>16) & 0xF] = offset; #define A_LDRD_POST \ if (cpu->Num != 0) return; \ - ExecuteStage(cpu); \ u32 addr = cpu->R[(cpu->CurInstr>>16) & 0xF]; \ u32 r = (cpu->CurInstr>>12) & 0xF; \ if (r&1) { A_UNK(cpu); return; } \ + ExecuteStage(cpu, ilmask | (1 << ((cpu->CurInstr>>16) & 0xF))); \ bool dabort = !cpu->DataRead32(addr, &cpu->R[r]); \ u32 val; dabort |= !cpu->DataRead32S(addr+4, &val); \ + cpu->AddCycles_CDI(); \ if (dabort) { \ - cpu->AddCycles_CDI(); \ ((ARMv5*)cpu)->DataAbort(); \ return; } \ if (r == 14) cpu->JumpTo(((((ARMv5*)cpu)->CP15Control & (1<<15)) ? (val & ~0x1) : val), cpu->CurInstr & (1<<22)); /* restores cpsr presumably due to shared dna with ldm */ \ - else cpu->R[r+1] = val; \ - cpu->AddCycles_CDI(); \ + else { \ + cpu->R[r+1] = val; \ + if (cpu->Num == 0) { \ + ((ARMv5*)cpu)->ILCurrReg = r+1; \ + ((ARMv5*)cpu)->ILCurrTime = ((ARMv5*)cpu)->TimestampActual + ((ARMv5*)cpu)->DataCycles; } } \ cpu->R[(cpu->CurInstr>>16) & 0xF] += offset; #define A_STRD \ if (cpu->Num != 0) return; \ - ExecuteStage(cpu); \ offset += cpu->R[(cpu->CurInstr>>16) & 0xF]; \ u32 r = (cpu->CurInstr>>12) & 0xF; \ if (r&1) { A_UNK(cpu); return; } \ + ExecuteStage(cpu, ilmask | (1 << ((cpu->CurInstr>>16) & 0xF))); \ + ((ARMv5*)cpu)->HandleInterlocksMemory(r); \ bool dabort = !cpu->DataWrite32(offset, cpu->R[r]); /* yes, this data abort behavior is on purpose */ \ u32 storeval = cpu->R[r+1]; if (r == 14) storeval+=4; \ dabort |= !cpu->DataWrite32S (offset+4, storeval); /* no, i dont understand it either */ \ @@ -334,10 +363,11 @@ A_IMPLEMENT_WB_LDRSTR(LDRB) #define A_STRD_POST \ if 
(cpu->Num != 0) return; \ - ExecuteStage(cpu); \ u32 addr = cpu->R[(cpu->CurInstr>>16) & 0xF]; \ u32 r = (cpu->CurInstr>>12) & 0xF; \ if (r&1) { A_UNK(cpu); return; } \ + ExecuteStage(cpu, ilmask | (1 << ((cpu->CurInstr>>16) & 0xF))); \ + ((ARMv5*)cpu)->HandleInterlocksMemory(r); \ bool dabort = !cpu->DataWrite32(addr, cpu->R[r]); \ u32 storeval = cpu->R[r+1]; if (r == 14) storeval+=4; \ dabort |= !cpu->DataWrite32S (addr+4, storeval); \ @@ -348,25 +378,25 @@ A_IMPLEMENT_WB_LDRSTR(LDRB) cpu->R[(cpu->CurInstr>>16) & 0xF] += offset; #define A_LDRH \ - if (cpu->CurInstr & (1<<21)) LoadSingle(cpu, ((cpu->CurInstr>>12) & 0xF), ((cpu->CurInstr>>16) & 0xF), offset); \ - else LoadSingle(cpu, ((cpu->CurInstr>>12) & 0xF), ((cpu->CurInstr>>16) & 0xF), offset); + if (cpu->CurInstr & (1<<21)) LoadSingle(cpu, ((cpu->CurInstr>>12) & 0xF), ((cpu->CurInstr>>16) & 0xF), offset, ilmask); \ + else LoadSingle(cpu, ((cpu->CurInstr>>12) & 0xF), ((cpu->CurInstr>>16) & 0xF), offset, ilmask); #define A_LDRH_POST \ - LoadSingle(cpu, ((cpu->CurInstr>>12) & 0xF), ((cpu->CurInstr>>16) & 0xF), offset); + LoadSingle(cpu, ((cpu->CurInstr>>12) & 0xF), ((cpu->CurInstr>>16) & 0xF), offset, ilmask); #define A_LDRSB \ - if (cpu->CurInstr & (1<<21)) LoadSingle(cpu, ((cpu->CurInstr>>12) & 0xF), ((cpu->CurInstr>>16) & 0xF), offset); \ - else LoadSingle(cpu, ((cpu->CurInstr>>12) & 0xF), ((cpu->CurInstr>>16) & 0xF), offset); + if (cpu->CurInstr & (1<<21)) LoadSingle(cpu, ((cpu->CurInstr>>12) & 0xF), ((cpu->CurInstr>>16) & 0xF), offset, ilmask); \ + else LoadSingle(cpu, ((cpu->CurInstr>>12) & 0xF), ((cpu->CurInstr>>16) & 0xF), offset, ilmask); #define A_LDRSB_POST \ - LoadSingle(cpu, ((cpu->CurInstr>>12) & 0xF), ((cpu->CurInstr>>16) & 0xF), offset); + LoadSingle(cpu, ((cpu->CurInstr>>12) & 0xF), ((cpu->CurInstr>>16) & 0xF), offset, ilmask); #define A_LDRSH \ - if (cpu->CurInstr & (1<<21)) LoadSingle(cpu, ((cpu->CurInstr>>12) & 0xF), ((cpu->CurInstr>>16) & 0xF), offset); \ - else LoadSingle(cpu, 
((cpu->CurInstr>>12) & 0xF), ((cpu->CurInstr>>16) & 0xF), offset); + if (cpu->CurInstr & (1<<21)) LoadSingle(cpu, ((cpu->CurInstr>>12) & 0xF), ((cpu->CurInstr>>16) & 0xF), offset, ilmask); \ + else LoadSingle(cpu, ((cpu->CurInstr>>12) & 0xF), ((cpu->CurInstr>>16) & 0xF), offset, ilmask); #define A_LDRSH_POST \ - LoadSingle(cpu, ((cpu->CurInstr>>12) & 0xF), ((cpu->CurInstr>>16) & 0xF), offset); + LoadSingle(cpu, ((cpu->CurInstr>>12) & 0xF), ((cpu->CurInstr>>16) & 0xF), offset, ilmask); #define A_IMPLEMENT_HD_LDRSTR(x) \ @@ -406,7 +436,7 @@ A_IMPLEMENT_HD_LDRSTR(LDRSH) template inline void SWP(ARM* cpu) { - ExecuteStage(cpu); + ExecuteStage(cpu, ((cpu->CurInstr >> 16) & 0xF)); u32 base = cpu->R[(cpu->CurInstr >> 16) & 0xF]; u32 rm = cpu->R[cpu->CurInstr & 0xF]; if ((cpu->CurInstr & 0xF) == 15) rm += 4; @@ -415,7 +445,7 @@ inline void SWP(ARM* cpu) if ((byte ? cpu->DataRead8 (base, &val) : cpu->DataRead32(base, &val))) [[likely]] { - u32 numD = cpu->DataCycles; + cpu->NDS.ARM9Timestamp += cpu->DataCycles; // checkme if ((byte ? cpu->DataWrite8 (base, rm) : cpu->DataWrite32(base, rm))) [[likely]] @@ -424,17 +454,27 @@ inline void SWP(ARM* cpu) u32 rd = (cpu->CurInstr >> 12) & 0xF; if constexpr (!byte) val = ROR(val, 8*(base&0x3)); + + cpu->AddCycles_CDI(); - if (rd != 15) cpu->R[rd] = val; + if (rd != 15) + { + cpu->R[rd] = val; + if (cpu->Num == 0) + { + ((ARMv5*)cpu)->ILCurrReg = rd; + bool extra = (byte || (base&0x3)); + ((ARMv5*)cpu)->ILCurrTime = ((ARMv5*)cpu)->TimestampActual + ((ARMv5*)cpu)->DataCycles + extra; + } + } else if (cpu->Num==1) cpu->JumpTo(val & ~1); // for some reason these jumps don't seem to work on the arm 9? 
+ return; } - else ((ARMv5*)cpu)->DataAbort(); - - cpu->DataCycles += numD; } - else ((ARMv5*)cpu)->DataAbort(); + // data abort handling cpu->AddCycles_CDI(); + ((ARMv5*)cpu)->DataAbort(); } void A_SWP(ARM* cpu) @@ -499,7 +539,6 @@ void EmptyRListLDMSTM(ARM* cpu, const u8 baseid, const u8 flags) void A_LDM(ARM* cpu) { - ExecuteStage(cpu); u32 baseid = (cpu->CurInstr >> 16) & 0xF; u32 base = cpu->R[baseid]; u32 wbbase; @@ -518,6 +557,8 @@ void A_LDM(ARM* cpu) return; } + ExecuteStage(cpu, baseid); + if (!(cpu->CurInstr & (1<<23))) // decrement { // decrement is actually an increment starting from the end address @@ -573,11 +614,12 @@ void A_LDM(ARM* cpu) // switch back to previous regs if ((cpu->CurInstr & (1<<22)) && !(cpu->CurInstr & (1<<15))) cpu->UpdateMode((cpu->CPSR&~0x1F)|0x10, cpu->CPSR, true); + + cpu->AddCycles_CDI(); // handle data aborts if (dabort) [[unlikely]] { - cpu->AddCycles_CDI(); ((ARMv5*)cpu)->DataAbort(); return; } @@ -601,17 +643,20 @@ void A_LDM(ARM* cpu) else cpu->R[baseid] = wbbase; } - + // jump if pc got written if (cpu->CurInstr & (1<<15)) cpu->JumpTo(pc, cpu->CurInstr & (1<<22)); - - cpu->AddCycles_CDI(); + else if (cpu->Num == 0) + { + u8 lastreg = 31 - __builtin_clz(cpu->CurInstr & 0x7FFF); + ((ARMv5*)cpu)->ILCurrReg = lastreg; + ((ARMv5*)cpu)->ILCurrTime = ((ARMv5*)cpu)->TimestampActual + ((ARMv5*)cpu)->DataCycles; + } } void A_STM(ARM* cpu) { - ExecuteStage(cpu); u32 baseid = (cpu->CurInstr >> 16) & 0xF; u32 base = cpu->R[baseid]; u32 oldbase = base; @@ -628,6 +673,8 @@ void A_STM(ARM* cpu) (0 << 4))); // thumb return; } + + ExecuteStage(cpu, baseid); if (!(cpu->CurInstr & (1<<23))) { @@ -655,6 +702,8 @@ void A_STM(ARM* cpu) cpu->UpdateMode(cpu->CPSR, (cpu->CPSR&~0x1F)|0x10, true); } + if (cpu->Num == 0) ((ARMv5*)cpu)->HandleInterlocksMemory(__builtin_ctz(cpu->CurInstr)); + for (u32 i = 0; i < 16; i++) { if (cpu->CurInstr & (1<CurInstr & (1<<22)) cpu->UpdateMode((cpu->CPSR&~0x1F)|0x10, cpu->CPSR, true); - + + cpu->AddCycles_CD(); + 
// handle data aborts if (dabort) [[unlikely]] { // restore original value of base cpu->R[baseid] = oldbase; - cpu->AddCycles_CD(); ((ARMv5*)cpu)->DataAbort(); return; } if ((cpu->CurInstr & (1<<23)) && (cpu->CurInstr & (1<<21))) cpu->R[baseid] = base; - - - cpu->AddCycles_CD(); } @@ -710,106 +757,108 @@ void A_STM(ARM* cpu) void T_LDR_PCREL(ARM* cpu) { - ExecuteStage(cpu); + ExecuteStage(cpu, 15); u32 addr = (cpu->R[15] & ~0x2) + ((cpu->CurInstr & 0xFF) << 2); bool dabort = !cpu->DataRead32(addr, &cpu->R[(cpu->CurInstr >> 8) & 0x7]); cpu->AddCycles_CDI(); - if (dabort) [[unlikely]] + if (dabort) [[unlikely]] ((ARMv5*)cpu)->DataAbort(); + else if (cpu->Num == 0) { - ((ARMv5*)cpu)->DataAbort(); + ((ARMv5*)cpu)->ILCurrReg = (cpu->CurInstr >> 8) & 0x7; + ((ARMv5*)cpu)->ILCurrTime = ((ARMv5*)cpu)->TimestampActual + ((ARMv5*)cpu)->DataCycles; } } void T_STR_REG(ARM* cpu) { - StoreSingle<32, Writeback::None>(cpu, (cpu->CurInstr & 0x7), ((cpu->CurInstr >> 3) & 0x7), cpu->R[(cpu->CurInstr >> 6) & 0x7]); + StoreSingle<32, Writeback::None, true>(cpu, (cpu->CurInstr & 0x7), ((cpu->CurInstr >> 3) & 0x7), cpu->R[(cpu->CurInstr >> 6) & 0x7], (1 << ((cpu->CurInstr >> 6) & 0x7))); } void T_STRB_REG(ARM* cpu) { - StoreSingle<8, Writeback::None>(cpu, (cpu->CurInstr & 0x7), ((cpu->CurInstr >> 3) & 0x7), cpu->R[(cpu->CurInstr >> 6) & 0x7]); + StoreSingle<8, Writeback::None, true>(cpu, (cpu->CurInstr & 0x7), ((cpu->CurInstr >> 3) & 0x7), cpu->R[(cpu->CurInstr >> 6) & 0x7], (1 << ((cpu->CurInstr >> 6) & 0x7))); } void T_LDR_REG(ARM* cpu) { - LoadSingle(cpu, (cpu->CurInstr & 0x7), ((cpu->CurInstr >> 3) & 0x7), cpu->R[(cpu->CurInstr >> 6) & 0x7]); + LoadSingle(cpu, (cpu->CurInstr & 0x7), ((cpu->CurInstr >> 3) & 0x7), cpu->R[(cpu->CurInstr >> 6) & 0x7], (1 << ((cpu->CurInstr >> 6) & 0x7))); } void T_LDRB_REG(ARM* cpu) { - LoadSingle(cpu, (cpu->CurInstr & 0x7), ((cpu->CurInstr >> 3) & 0x7), cpu->R[(cpu->CurInstr >> 6) & 0x7]); + LoadSingle(cpu, (cpu->CurInstr & 0x7), ((cpu->CurInstr >> 3) & 
0x7), cpu->R[(cpu->CurInstr >> 6) & 0x7], (1 << ((cpu->CurInstr >> 6) & 0x7))); } void T_STRH_REG(ARM* cpu) { - StoreSingle<16, Writeback::None>(cpu, (cpu->CurInstr & 0x7), ((cpu->CurInstr >> 3) & 0x7), cpu->R[(cpu->CurInstr >> 6) & 0x7]); + StoreSingle<16, Writeback::None, true>(cpu, (cpu->CurInstr & 0x7), ((cpu->CurInstr >> 3) & 0x7), cpu->R[(cpu->CurInstr >> 6) & 0x7], (1 << ((cpu->CurInstr >> 6) & 0x7))); } void T_LDRSB_REG(ARM* cpu) { - LoadSingle(cpu, (cpu->CurInstr & 0x7), ((cpu->CurInstr >> 3) & 0x7), cpu->R[(cpu->CurInstr >> 6) & 0x7]); + LoadSingle(cpu, (cpu->CurInstr & 0x7), ((cpu->CurInstr >> 3) & 0x7), cpu->R[(cpu->CurInstr >> 6) & 0x7], (1 << ((cpu->CurInstr >> 6) & 0x7))); } void T_LDRH_REG(ARM* cpu) { - LoadSingle(cpu, (cpu->CurInstr & 0x7), ((cpu->CurInstr >> 3) & 0x7), cpu->R[(cpu->CurInstr >> 6) & 0x7]); + LoadSingle(cpu, (cpu->CurInstr & 0x7), ((cpu->CurInstr >> 3) & 0x7), cpu->R[(cpu->CurInstr >> 6) & 0x7], (1 << ((cpu->CurInstr >> 6) & 0x7))); } void T_LDRSH_REG(ARM* cpu) { - LoadSingle(cpu, (cpu->CurInstr & 0x7), ((cpu->CurInstr >> 3) & 0x7), cpu->R[(cpu->CurInstr >> 6) & 0x7]); + LoadSingle(cpu, (cpu->CurInstr & 0x7), ((cpu->CurInstr >> 3) & 0x7), cpu->R[(cpu->CurInstr >> 6) & 0x7], (1 << ((cpu->CurInstr >> 6) & 0x7))); } void T_STR_IMM(ARM* cpu) { - StoreSingle<32, Writeback::None>(cpu, (cpu->CurInstr & 0x7), ((cpu->CurInstr >> 3) & 0x7), ((cpu->CurInstr >> 4) & 0x7C)); + StoreSingle<32, Writeback::None, false>(cpu, (cpu->CurInstr & 0x7), ((cpu->CurInstr >> 3) & 0x7), ((cpu->CurInstr >> 4) & 0x7C), 0); } void T_LDR_IMM(ARM* cpu) { - LoadSingle(cpu, (cpu->CurInstr & 0x7), ((cpu->CurInstr >> 3) & 0x7), ((cpu->CurInstr >> 4) & 0x7C)); + LoadSingle(cpu, (cpu->CurInstr & 0x7), ((cpu->CurInstr >> 3) & 0x7), ((cpu->CurInstr >> 4) & 0x7C), 0); } void T_STRB_IMM(ARM* cpu) { - StoreSingle<8, Writeback::None>(cpu, (cpu->CurInstr & 0x7), ((cpu->CurInstr >> 3) & 0x7), ((cpu->CurInstr >> 6) & 0x1F)); + StoreSingle<8, Writeback::None, false>(cpu, 
(cpu->CurInstr & 0x7), ((cpu->CurInstr >> 3) & 0x7), ((cpu->CurInstr >> 6) & 0x1F), 0); } void T_LDRB_IMM(ARM* cpu) { - LoadSingle(cpu, (cpu->CurInstr & 0x7), ((cpu->CurInstr >> 3) & 0x7), ((cpu->CurInstr >> 6) & 0x1F)); + LoadSingle(cpu, (cpu->CurInstr & 0x7), ((cpu->CurInstr >> 3) & 0x7), ((cpu->CurInstr >> 6) & 0x1F), 0); } void T_STRH_IMM(ARM* cpu) { - StoreSingle<16, Writeback::None>(cpu, (cpu->CurInstr & 0x7), ((cpu->CurInstr >> 3) & 0x7), ((cpu->CurInstr >> 5) & 0x3E)); + StoreSingle<16, Writeback::None, false>(cpu, (cpu->CurInstr & 0x7), ((cpu->CurInstr >> 3) & 0x7), ((cpu->CurInstr >> 5) & 0x3E), 0); } void T_LDRH_IMM(ARM* cpu) { - LoadSingle(cpu, (cpu->CurInstr & 0x7), ((cpu->CurInstr >> 3) & 0x7), ((cpu->CurInstr >> 5) & 0x3E)); + LoadSingle(cpu, (cpu->CurInstr & 0x7), ((cpu->CurInstr >> 3) & 0x7), ((cpu->CurInstr >> 5) & 0x3E), 0); } void T_STR_SPREL(ARM* cpu) { - StoreSingle<32, Writeback::None>(cpu, ((cpu->CurInstr >> 8) & 0x7), 13, ((cpu->CurInstr << 2) & 0x3FC)); + StoreSingle<32, Writeback::None, false>(cpu, ((cpu->CurInstr >> 8) & 0x7), 13, ((cpu->CurInstr << 2) & 0x3FC), 0); } void T_LDR_SPREL(ARM* cpu) { - LoadSingle(cpu, ((cpu->CurInstr >> 8) & 0x7), 13, ((cpu->CurInstr << 2) & 0x3FC)); + LoadSingle(cpu, ((cpu->CurInstr >> 8) & 0x7), 13, ((cpu->CurInstr << 2) & 0x3FC), 0); } void T_PUSH(ARM* cpu) { - ExecuteStage(cpu); + ExecuteStage(cpu, 13); int nregs = 0; bool first = true; bool dabort = false; @@ -833,6 +882,13 @@ void T_PUSH(ARM* cpu) base -= (nregs<<2); u32 wbbase = base; + if (cpu->Num == 0) + { + u8 firstreg = __builtin_ctz(cpu->CurInstr); + if (firstreg == 8) firstreg = 14; + ((ARMv5*)cpu)->HandleInterlocksMemory(firstreg); + } + for (int i = 0; i < 8; i++) { if (cpu->CurInstr & (1<DataWrite32S(base, cpu->R[14])); } + cpu->AddCycles_CD(); + if (dabort) [[unlikely]] { - cpu->AddCycles_CD(); ((ARMv5*)cpu)->DataAbort(); return; } cpu->R[13] = wbbase; - - cpu->AddCycles_CD(); } void T_POP(ARM* cpu) { - ExecuteStage(cpu); + 
ExecuteStage(cpu, 13); u32 base = cpu->R[13]; bool first = true; bool dabort = false; - + if (!(cpu->CurInstr & 0x1FF)) [[unlikely]] { EmptyRListLDMSTM(cpu, 13, 0b00011); @@ -898,10 +953,23 @@ void T_POP(ARM* cpu) : cpu->DataRead32S(base, &pc)); if (dabort) [[unlikely]] goto dataabort; + + cpu->AddCycles_CDI(); if (cpu->Num==1 || (((ARMv5*)cpu)->CP15Control & (1<<15))) pc |= 0x1; cpu->JumpTo(pc); base += 4; } + else + { + cpu->AddCycles_CDI(); + + if (cpu->Num == 0) + { + u8 lastreg = 31 - __builtin_clz(cpu->CurInstr & 0xFF); + ((ARMv5*)cpu)->ILCurrReg = lastreg; + ((ARMv5*)cpu)->ILCurrTime = ((ARMv5*)cpu)->TimestampActual + ((ARMv5*)cpu)->DataCycles; + } + } if (dabort) [[unlikely]] { @@ -912,23 +980,27 @@ void T_POP(ARM* cpu) } cpu->R[13] = base; - - cpu->AddCycles_CDI(); } void T_STMIA(ARM* cpu) { - ExecuteStage(cpu); + ExecuteStage(cpu, ((cpu->CurInstr >> 8) & 0x7)); u32 base = cpu->R[(cpu->CurInstr >> 8) & 0x7]; bool first = true; bool dabort = false; - + if (!(cpu->CurInstr & 0xFF)) [[unlikely]] { EmptyRListLDMSTM(cpu, (cpu->CurInstr >> 8) & 0x7, 0b10010); return; } + if (cpu->Num == 0) + { + u8 firstreg = __builtin_ctz(cpu->CurInstr); + ((ARMv5*)cpu)->HandleInterlocksMemory(firstreg); + } + for (int i = 0; i < 8; i++) { if (cpu->CurInstr & (1<AddCycles_CD(); + if (dabort) [[unlikely]] { - cpu->AddCycles_CD(); ((ARMv5*)cpu)->DataAbort(); return; } // TODO: check "Rb included in Rlist" case cpu->R[(cpu->CurInstr >> 8) & 0x7] = base; - cpu->AddCycles_CD(); } void T_LDMIA(ARM* cpu) { - ExecuteStage(cpu); + ExecuteStage(cpu, ((cpu->CurInstr >> 8) & 0x7)); u32 base = cpu->R[(cpu->CurInstr >> 8) & 0x7]; bool first = true; bool dabort = false; - + if (!(cpu->CurInstr & 0xFF)) [[unlikely]] { EmptyRListLDMSTM(cpu, (cpu->CurInstr >> 8) & 0x7, 0b00011); @@ -980,17 +1052,23 @@ void T_LDMIA(ARM* cpu) } } + cpu->AddCycles_CDI(); + if (dabort) [[unlikely]] { - cpu->AddCycles_CDI(); ((ARMv5*)cpu)->DataAbort(); return; } + + if (cpu->Num == 0) + { + u8 lastreg = 31 - 
__builtin_clz(cpu->CurInstr & 0xFF); + ((ARMv5*)cpu)->ILCurrReg = lastreg; + ((ARMv5*)cpu)->ILCurrTime = ((ARMv5*)cpu)->TimestampActual + ((ARMv5*)cpu)->DataCycles; + } if (!(cpu->CurInstr & (1<<((cpu->CurInstr >> 8) & 0x7)))) cpu->R[(cpu->CurInstr >> 8) & 0x7] = base; - - cpu->AddCycles_CDI(); } From 0e6d3fd8341c6e41bc140d0c6c1b5d78df651e3f Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Fri, 18 Oct 2024 16:32:02 -0400 Subject: [PATCH 179/306] fix bad logical leaps --- src/ARM.cpp | 6 +++--- src/ARMInterpreter_ALU.cpp | 30 +++++++++++++++--------------- src/ARMInterpreter_LoadStore.cpp | 16 ++++++++-------- 3 files changed, 26 insertions(+), 26 deletions(-) diff --git a/src/ARM.cpp b/src/ARM.cpp index 7b8fbf46..75d88aea 100644 --- a/src/ARM.cpp +++ b/src/ARM.cpp @@ -1184,14 +1184,14 @@ void ARMv5::HandleInterlocksExecute(u16 ilmask) { if ((bitfield && (ilmask & (1< ILCurrTime) NDS.ARM9Timestamp = ILCurrTime; + if (NDS.ARM9Timestamp < ILCurrTime) NDS.ARM9Timestamp = ILCurrTime; ILCurrReg = 16; ILPrevReg = 16; return; } else if ((bitfield && (ilmask & (1< ILPrevTime) NDS.ARM9Timestamp = ILPrevTime; + if (NDS.ARM9Timestamp < ILPrevTime) NDS.ARM9Timestamp = ILPrevTime; } ILPrevReg = ILCurrReg; @@ -1203,7 +1203,7 @@ template void ARMv5::HandleInterlocksExecute(u16 ilmask); void ARMv5::HandleInterlocksMemory(u8 reg) { - if ((reg != ILPrevReg) || (NDS.ARM9Timestamp <= ILPrevTime)) return; + if ((reg != ILPrevReg) || (NDS.ARM9Timestamp >= ILPrevTime)) return; NDS.ARM9Timestamp = ILPrevTime; ILPrevTime = 16; diff --git a/src/ARMInterpreter_ALU.cpp b/src/ARMInterpreter_ALU.cpp index 0d2fb5af..ce2a0cd7 100644 --- a/src/ARMInterpreter_ALU.cpp +++ b/src/ARMInterpreter_ALU.cpp @@ -897,7 +897,7 @@ void A_MUL(ARM* cpu) cpu->DataRegion = Mem9_Null; ((ARMv5*)cpu)->AddCycles_MW(2); // 2 M ((ARMv5*)cpu)->ILCurrReg = (cpu->CurInstr >> 16) & 0xF; - ((ARMv5*)cpu)->ILCurrTime = ((ARMv5*)cpu)->TimestampActual + ((ARMv5*)cpu)->DataCycles; + 
((ARMv5*)cpu)->ILCurrTime = ((ARMv5*)cpu)->TimestampActual; } } else @@ -943,7 +943,7 @@ void A_MLA(ARM* cpu) cpu->DataRegion = Mem9_Null; ((ARMv5*)cpu)->AddCycles_MW(2); // 2 M ((ARMv5*)cpu)->ILCurrReg = (cpu->CurInstr >> 16) & 0xF; - ((ARMv5*)cpu)->ILCurrTime = ((ARMv5*)cpu)->TimestampActual + ((ARMv5*)cpu)->DataCycles; + ((ARMv5*)cpu)->ILCurrTime = ((ARMv5*)cpu)->TimestampActual; } } else @@ -989,7 +989,7 @@ void A_UMULL(ARM* cpu) cpu->DataRegion = Mem9_Null; ((ARMv5*)cpu)->AddCycles_MW(1); // dummy memory stage for interlock handling ((ARMv5*)cpu)->ILCurrReg = (cpu->CurInstr >> 16) & 0xF; // only one rd interlocks - ((ARMv5*)cpu)->ILCurrTime = ((ARMv5*)cpu)->TimestampActual + ((ARMv5*)cpu)->DataCycles; + ((ARMv5*)cpu)->ILCurrTime = ((ARMv5*)cpu)->TimestampActual; } } else @@ -1040,7 +1040,7 @@ void A_UMLAL(ARM* cpu) cpu->DataRegion = Mem9_Null; ((ARMv5*)cpu)->AddCycles_MW(1); // dummy memory stage for interlock handling ((ARMv5*)cpu)->ILCurrReg = (cpu->CurInstr >> 16) & 0xF; // only one rd interlocks - ((ARMv5*)cpu)->ILCurrTime = ((ARMv5*)cpu)->TimestampActual + ((ARMv5*)cpu)->DataCycles; + ((ARMv5*)cpu)->ILCurrTime = ((ARMv5*)cpu)->TimestampActual; } } else @@ -1086,7 +1086,7 @@ void A_SMULL(ARM* cpu) cpu->DataRegion = Mem9_Null; ((ARMv5*)cpu)->AddCycles_MW(1); // dummy memory stage for interlock handling ((ARMv5*)cpu)->ILCurrReg = (cpu->CurInstr >> 16) & 0xF; // only one rd interlocks - ((ARMv5*)cpu)->ILCurrTime = ((ARMv5*)cpu)->TimestampActual + ((ARMv5*)cpu)->DataCycles; + ((ARMv5*)cpu)->ILCurrTime = ((ARMv5*)cpu)->TimestampActual; } } else @@ -1136,7 +1136,7 @@ void A_SMLAL(ARM* cpu) cpu->DataRegion = Mem9_Null; ((ARMv5*)cpu)->AddCycles_MW(1); // dummy memory stage for interlock handling ((ARMv5*)cpu)->ILCurrReg = (cpu->CurInstr >> 16) & 0xF; // only one rd interlocks - ((ARMv5*)cpu)->ILCurrTime = ((ARMv5*)cpu)->TimestampActual + ((ARMv5*)cpu)->DataCycles; + ((ARMv5*)cpu)->ILCurrTime = ((ARMv5*)cpu)->TimestampActual; } } else @@ -1181,7 +1181,7 @@ void 
A_SMLAxy(ARM* cpu) cpu->DataRegion = Mem9_Null; ((ARMv5*)cpu)->AddCycles_MW(1); // dummy memory stage for interlock handling ((ARMv5*)cpu)->ILCurrReg = (cpu->CurInstr >> 16) & 0xF; // only one rd interlocks - ((ARMv5*)cpu)->ILCurrTime = ((ARMv5*)cpu)->TimestampActual + ((ARMv5*)cpu)->DataCycles; + ((ARMv5*)cpu)->ILCurrTime = ((ARMv5*)cpu)->TimestampActual; } void A_SMLAWy(ARM* cpu) @@ -1212,7 +1212,7 @@ void A_SMLAWy(ARM* cpu) cpu->DataRegion = Mem9_Null; ((ARMv5*)cpu)->AddCycles_MW(1); // dummy memory stage for interlock handling ((ARMv5*)cpu)->ILCurrReg = (cpu->CurInstr >> 16) & 0xF; // only one rd interlocks - ((ARMv5*)cpu)->ILCurrTime = ((ARMv5*)cpu)->TimestampActual + ((ARMv5*)cpu)->DataCycles; + ((ARMv5*)cpu)->ILCurrTime = ((ARMv5*)cpu)->TimestampActual; } void A_SMULxy(ARM* cpu) @@ -1240,7 +1240,7 @@ void A_SMULxy(ARM* cpu) cpu->DataRegion = Mem9_Null; ((ARMv5*)cpu)->AddCycles_MW(1); // dummy memory stage for interlock handling ((ARMv5*)cpu)->ILCurrReg = (cpu->CurInstr >> 16) & 0xF; // only one rd interlocks - ((ARMv5*)cpu)->ILCurrTime = ((ARMv5*)cpu)->TimestampActual + ((ARMv5*)cpu)->DataCycles; + ((ARMv5*)cpu)->ILCurrTime = ((ARMv5*)cpu)->TimestampActual; } void A_SMULWy(ARM* cpu) @@ -1266,7 +1266,7 @@ void A_SMULWy(ARM* cpu) cpu->DataRegion = Mem9_Null; ((ARMv5*)cpu)->AddCycles_MW(1); // dummy memory stage for interlock handling ((ARMv5*)cpu)->ILCurrReg = (cpu->CurInstr >> 16) & 0xF; // only one rd interlocks - ((ARMv5*)cpu)->ILCurrTime = ((ARMv5*)cpu)->TimestampActual + ((ARMv5*)cpu)->DataCycles; + ((ARMv5*)cpu)->ILCurrTime = ((ARMv5*)cpu)->TimestampActual; } void A_SMLALxy(ARM* cpu) @@ -1302,7 +1302,7 @@ void A_SMLALxy(ARM* cpu) cpu->DataRegion = Mem9_Null; ((ARMv5*)cpu)->AddCycles_MW(2); // 2 M ((ARMv5*)cpu)->ILCurrReg = (cpu->CurInstr >> 16) & 0xF; // only one rd interlocks - ((ARMv5*)cpu)->ILCurrTime = ((ARMv5*)cpu)->TimestampActual + ((ARMv5*)cpu)->DataCycles; + ((ARMv5*)cpu)->ILCurrTime = ((ARMv5*)cpu)->TimestampActual; } @@ -1358,7 +1358,7 @@ void 
A_QADD(ARM* cpu) cpu->DataRegion = Mem9_Null; ((ARMv5*)cpu)->AddCycles_MW(1); // dummy memory stage for interlock handling ((ARMv5*)cpu)->ILCurrReg = (cpu->CurInstr >> 12) & 0xF; // only one rd interlocks - ((ARMv5*)cpu)->ILCurrTime = ((ARMv5*)cpu)->TimestampActual + ((ARMv5*)cpu)->DataCycles; + ((ARMv5*)cpu)->ILCurrTime = ((ARMv5*)cpu)->TimestampActual; } void A_QSUB(ARM* cpu) @@ -1384,7 +1384,7 @@ void A_QSUB(ARM* cpu) cpu->DataRegion = Mem9_Null; ((ARMv5*)cpu)->AddCycles_MW(1); // dummy memory stage for interlock handling ((ARMv5*)cpu)->ILCurrReg = (cpu->CurInstr >> 12) & 0xF; // only one rd interlocks - ((ARMv5*)cpu)->ILCurrTime = ((ARMv5*)cpu)->TimestampActual + ((ARMv5*)cpu)->DataCycles; + ((ARMv5*)cpu)->ILCurrTime = ((ARMv5*)cpu)->TimestampActual; } void A_QDADD(ARM* cpu) @@ -1418,7 +1418,7 @@ void A_QDADD(ARM* cpu) cpu->DataRegion = Mem9_Null; ((ARMv5*)cpu)->AddCycles_MW(1); // dummy memory stage for interlock handling ((ARMv5*)cpu)->ILCurrReg = (cpu->CurInstr >> 12) & 0xF; // only one rd interlocks - ((ARMv5*)cpu)->ILCurrTime = ((ARMv5*)cpu)->TimestampActual + ((ARMv5*)cpu)->DataCycles; + ((ARMv5*)cpu)->ILCurrTime = ((ARMv5*)cpu)->TimestampActual; } void A_QDSUB(ARM* cpu) @@ -1452,7 +1452,7 @@ void A_QDSUB(ARM* cpu) cpu->DataRegion = Mem9_Null; ((ARMv5*)cpu)->AddCycles_MW(1); // dummy memory stage for interlock handling ((ARMv5*)cpu)->ILCurrReg = (cpu->CurInstr >> 12) & 0xF; // only one rd interlocks - ((ARMv5*)cpu)->ILCurrTime = ((ARMv5*)cpu)->TimestampActual + ((ARMv5*)cpu)->DataCycles; + ((ARMv5*)cpu)->ILCurrTime = ((ARMv5*)cpu)->TimestampActual; } diff --git a/src/ARMInterpreter_LoadStore.cpp b/src/ARMInterpreter_LoadStore.cpp index 3022b94d..93ea54dc 100644 --- a/src/ARMInterpreter_LoadStore.cpp +++ b/src/ARMInterpreter_LoadStore.cpp @@ -132,7 +132,7 @@ void LoadSingle(ARM* cpu, u8 rd, u8 rn, s32 offset, u16 ilmask) { ((ARMv5*)cpu)->ILCurrReg = rd; bool extra = ((size < 32) || (signror && (addr&0x3))); - ((ARMv5*)cpu)->ILCurrTime = 
((ARMv5*)cpu)->TimestampActual + ((ARMv5*)cpu)->DataCycles + extra; + ((ARMv5*)cpu)->ILCurrTime = ((ARMv5*)cpu)->TimestampActual + extra; } } } @@ -322,7 +322,7 @@ A_IMPLEMENT_WB_LDRSTR(LDRB) cpu->R[r+1] = val; \ if (cpu->Num == 0) { \ ((ARMv5*)cpu)->ILCurrReg = r+1; \ - ((ARMv5*)cpu)->ILCurrTime = ((ARMv5*)cpu)->TimestampActual + ((ARMv5*)cpu)->DataCycles; } } \ + ((ARMv5*)cpu)->ILCurrTime = ((ARMv5*)cpu)->TimestampActual; } } \ if (cpu->CurInstr & (1<<21)) cpu->R[(cpu->CurInstr>>16) & 0xF] = offset; #define A_LDRD_POST \ @@ -342,7 +342,7 @@ A_IMPLEMENT_WB_LDRSTR(LDRB) cpu->R[r+1] = val; \ if (cpu->Num == 0) { \ ((ARMv5*)cpu)->ILCurrReg = r+1; \ - ((ARMv5*)cpu)->ILCurrTime = ((ARMv5*)cpu)->TimestampActual + ((ARMv5*)cpu)->DataCycles; } } \ + ((ARMv5*)cpu)->ILCurrTime = ((ARMv5*)cpu)->TimestampActual; } } \ cpu->R[(cpu->CurInstr>>16) & 0xF] += offset; #define A_STRD \ @@ -464,7 +464,7 @@ inline void SWP(ARM* cpu) { ((ARMv5*)cpu)->ILCurrReg = rd; bool extra = (byte || (base&0x3)); - ((ARMv5*)cpu)->ILCurrTime = ((ARMv5*)cpu)->TimestampActual + ((ARMv5*)cpu)->DataCycles + extra; + ((ARMv5*)cpu)->ILCurrTime = ((ARMv5*)cpu)->TimestampActual + extra; } } else if (cpu->Num==1) cpu->JumpTo(val & ~1); // for some reason these jumps don't seem to work on the arm 9? 
@@ -651,7 +651,7 @@ void A_LDM(ARM* cpu) { u8 lastreg = 31 - __builtin_clz(cpu->CurInstr & 0x7FFF); ((ARMv5*)cpu)->ILCurrReg = lastreg; - ((ARMv5*)cpu)->ILCurrTime = ((ARMv5*)cpu)->TimestampActual + ((ARMv5*)cpu)->DataCycles; + ((ARMv5*)cpu)->ILCurrTime = ((ARMv5*)cpu)->TimestampActual; } } @@ -766,7 +766,7 @@ void T_LDR_PCREL(ARM* cpu) else if (cpu->Num == 0) { ((ARMv5*)cpu)->ILCurrReg = (cpu->CurInstr >> 8) & 0x7; - ((ARMv5*)cpu)->ILCurrTime = ((ARMv5*)cpu)->TimestampActual + ((ARMv5*)cpu)->DataCycles; + ((ARMv5*)cpu)->ILCurrTime = ((ARMv5*)cpu)->TimestampActual; } } @@ -967,7 +967,7 @@ void T_POP(ARM* cpu) { u8 lastreg = 31 - __builtin_clz(cpu->CurInstr & 0xFF); ((ARMv5*)cpu)->ILCurrReg = lastreg; - ((ARMv5*)cpu)->ILCurrTime = ((ARMv5*)cpu)->TimestampActual + ((ARMv5*)cpu)->DataCycles; + ((ARMv5*)cpu)->ILCurrTime = ((ARMv5*)cpu)->TimestampActual; } } @@ -1064,7 +1064,7 @@ void T_LDMIA(ARM* cpu) { u8 lastreg = 31 - __builtin_clz(cpu->CurInstr & 0xFF); ((ARMv5*)cpu)->ILCurrReg = lastreg; - ((ARMv5*)cpu)->ILCurrTime = ((ARMv5*)cpu)->TimestampActual + ((ARMv5*)cpu)->DataCycles; + ((ARMv5*)cpu)->ILCurrTime = ((ARMv5*)cpu)->TimestampActual; } if (!(cpu->CurInstr & (1<<((cpu->CurInstr >> 8) & 0x7)))) From 8ff0946b8a148326dcfcef51f925561f1520bea3 Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Fri, 18 Oct 2024 17:10:00 -0400 Subject: [PATCH 180/306] mrc causes interlocks --- src/ARMInterpreter.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/ARMInterpreter.cpp b/src/ARMInterpreter.cpp index 614f3b53..9585e7ce 100644 --- a/src/ARMInterpreter.cpp +++ b/src/ARMInterpreter.cpp @@ -333,6 +333,8 @@ void A_MRC(ARM* cpu) cpu->AddCycles_C(); // 1 Execute cycle cpu->DataRegion = Mem9_Null; ((ARMv5*)cpu)->AddCycles_MW(2); // 2 Memory cycles + ((ARMv5*)cpu)->ILCurrReg = (cpu->CurInstr >> 12) & 0xF; // only one rd interlocks + ((ARMv5*)cpu)->ILCurrTime = ((ARMv5*)cpu)->TimestampActual; } else cpu->AddCycles_CI(2 + 1); // TODO: 
checkme } From 9ed4c665920bc1ed5ea1ccf8452020d292f62ee2 Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Fri, 18 Oct 2024 17:56:56 -0400 Subject: [PATCH 181/306] add support for 1 reg ldm/stm a9 timings and fix a bug --- src/ARMInterpreter_LoadStore.cpp | 78 +++++++++++++++++++++++++------- src/CP15.cpp | 4 +- 2 files changed, 63 insertions(+), 19 deletions(-) diff --git a/src/ARMInterpreter_LoadStore.cpp b/src/ARMInterpreter_LoadStore.cpp index 93ea54dc..b965c078 100644 --- a/src/ARMInterpreter_LoadStore.cpp +++ b/src/ARMInterpreter_LoadStore.cpp @@ -617,6 +617,12 @@ void A_LDM(ARM* cpu) cpu->AddCycles_CDI(); + if (__builtin_popcount(cpu->CurInstr & 0xFFFF) == 1) [[unlikely]] // single reg + { + if (cpu->Num == 0) cpu->NDS.ARM9Timestamp = ((ARMv5*)cpu)->TimestampActual; // on arm9 single reg ldm/stm cannot overlap memory and fetch stages + else; // CHECKME: ARM7 timing behavior? + } + // handle data aborts if (dabort) [[unlikely]] { @@ -735,6 +741,12 @@ void A_STM(ARM* cpu) cpu->AddCycles_CD(); + if (__builtin_popcount(cpu->CurInstr & 0xFFFF) == 1) [[unlikely]] // single reg + { + if (cpu->Num == 0) cpu->NDS.ARM9Timestamp = ((ARMv5*)cpu)->TimestampActual; // on arm9 single reg ldm/stm cannot overlap memory and fetch stages + else; // CHECKME: ARM7 timing behavior? + } + // handle data aborts if (dabort) [[unlikely]] { @@ -909,6 +921,12 @@ void T_PUSH(ARM* cpu) cpu->AddCycles_CD(); + if (__builtin_popcount(cpu->CurInstr & 0x1FF) == 1) [[unlikely]] // single reg + { + if (cpu->Num == 0) cpu->NDS.ARM9Timestamp = ((ARMv5*)cpu)->TimestampActual; // on arm9 single reg ldm/stm cannot overlap memory and fetch stages + else; // CHECKME: ARM7 timing behavior? + } + if (dabort) [[unlikely]] { ((ARMv5*)cpu)->DataAbort(); @@ -952,33 +970,47 @@ void T_POP(ARM* cpu) dabort |= !(first ? 
cpu->DataRead32 (base, &pc) : cpu->DataRead32S(base, &pc)); - if (dabort) [[unlikely]] goto dataabort; + if (!dabort) [[likely]] + { + cpu->AddCycles_CDI(); - cpu->AddCycles_CDI(); - if (cpu->Num==1 || (((ARMv5*)cpu)->CP15Control & (1<<15))) pc |= 0x1; - cpu->JumpTo(pc); - base += 4; + if (__builtin_popcount(cpu->CurInstr & 0x1FF) == 1) [[unlikely]] // single reg + { + if (cpu->Num == 0) cpu->NDS.ARM9Timestamp = ((ARMv5*)cpu)->TimestampActual; // on arm9 single reg ldm/stm cannot overlap memory and fetch stages + else; // CHECKME: ARM7 timing behavior? + } + + if (cpu->Num==1 || (((ARMv5*)cpu)->CP15Control & (1<<15))) pc |= 0x1; + cpu->JumpTo(pc); + base += 4; + } } else { cpu->AddCycles_CDI(); + if (__builtin_popcount(cpu->CurInstr & 0x1FF) == 1) [[unlikely]] // single reg + { + if (cpu->Num == 0) cpu->NDS.ARM9Timestamp = ((ARMv5*)cpu)->TimestampActual; // on arm9 single reg ldm/stm cannot overlap memory and fetch stages + else; // CHECKME: ARM7 timing behavior? + } + if (cpu->Num == 0) { - u8 lastreg = 31 - __builtin_clz(cpu->CurInstr & 0xFF); - ((ARMv5*)cpu)->ILCurrReg = lastreg; - ((ARMv5*)cpu)->ILCurrTime = ((ARMv5*)cpu)->TimestampActual; + if (dabort) [[unlikely]] + { + ((ARMv5*)cpu)->DataAbort(); + return; + } + else + { + u8 lastreg = 31 - __builtin_clz(cpu->CurInstr & 0xFF); + ((ARMv5*)cpu)->ILCurrReg = lastreg; + ((ARMv5*)cpu)->ILCurrTime = ((ARMv5*)cpu)->TimestampActual; + } } } - if (dabort) [[unlikely]] - { - dataabort: - cpu->AddCycles_CDI(); - ((ARMv5*)cpu)->DataAbort(); - return; - } - cpu->R[13] = base; } @@ -1015,6 +1047,12 @@ void T_STMIA(ARM* cpu) cpu->AddCycles_CD(); + if (__builtin_popcount(cpu->CurInstr & 0xFF) == 1) [[unlikely]] // single reg + { + if (cpu->Num == 0) cpu->NDS.ARM9Timestamp = ((ARMv5*)cpu)->TimestampActual; // on arm9 single reg ldm/stm cannot overlap memory and fetch stages + else; // CHECKME: ARM7 timing behavior? 
+ } + if (dabort) [[unlikely]] { ((ARMv5*)cpu)->DataAbort(); @@ -1054,12 +1092,18 @@ void T_LDMIA(ARM* cpu) cpu->AddCycles_CDI(); + if (__builtin_popcount(cpu->CurInstr & 0xFF) == 1) [[unlikely]] // single reg + { + if (cpu->Num == 0) cpu->NDS.ARM9Timestamp = ((ARMv5*)cpu)->TimestampActual; // on arm9 single reg ldm/stm cannot overlap memory and fetch stages + else; // CHECKME: ARM7 timing behavior? + } + if (dabort) [[unlikely]] { ((ARMv5*)cpu)->DataAbort(); return; } - + if (cpu->Num == 0) { u8 lastreg = 31 - __builtin_clz(cpu->CurInstr & 0xFF); diff --git a/src/CP15.cpp b/src/CP15.cpp index 44b7e88a..6c9c7951 100644 --- a/src/CP15.cpp +++ b/src/CP15.cpp @@ -321,7 +321,7 @@ void ARMv5::UpdateRegionTimings(u32 addrstart, u32 addrend) { MemTimings[i][1] = ((bustimings[0] - 1) << NDS.ARM9ClockShift) + 1; MemTimings[i][2] = ((bustimings[2] - 1) << NDS.ARM9ClockShift) + 1; - MemTimings[i][3] = bustimings[3] << NDS.ARM9ClockShift; // inaccurate but ehgh + MemTimings[i][3] = ((bustimings[3] - 1) << NDS.ARM9ClockShift) + 1;; // inaccurate but ehgh } } } @@ -1367,7 +1367,7 @@ bool ARMv5::DataWrite32S(u32 addr, u32 val) if ((addr >> 24) == 0x02) { - if ((DataRegion != Mem9_MainRAM) && ((NDS.ARM9Timestamp + DataCycles) < MainRAMTimestamp)) NDS.ARM9Timestamp = MainRAMTimestamp - DataCycles; + if ((DataRegion != Mem9_MainRAM) && ((NDS.ARM9Timestamp + DataCycles) < MainRAMTimestamp)) DataCycles += MainRAMTimestamp - NDS.ARM9Timestamp; MainRAMTimestamp = NDS.ARM9Timestamp + DataCycles; DataRegion = Mem9_MainRAM; } From e33d19cf14d2f0669d65de3f36b989ee54ef8c59 Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Fri, 18 Oct 2024 21:08:56 -0400 Subject: [PATCH 182/306] fix a few misc things? 
--- src/ARM.cpp | 19 +++++-- src/ARMInterpreter_LoadStore.cpp | 86 +++++++++++++++++++++++++------- src/CP15.cpp | 58 +++++++++++---------- 3 files changed, 115 insertions(+), 48 deletions(-) diff --git a/src/ARM.cpp b/src/ARM.cpp index 75d88aea..8bcdd452 100644 --- a/src/ARM.cpp +++ b/src/ARM.cpp @@ -1184,14 +1184,24 @@ void ARMv5::HandleInterlocksExecute(u16 ilmask) { if ((bitfield && (ilmask & (1<(u16 ilmask); void ARMv5::HandleInterlocksMemory(u8 reg) { if ((reg != ILPrevReg) || (NDS.ARM9Timestamp >= ILPrevTime)) return; - + + u64 diff = ILPrevTime - NDS.ARM9Timestamp; // should always be 1? NDS.ARM9Timestamp = ILPrevTime; + MainRAMTimestamp += diff; + ITCMTimestamp += diff; ILPrevTime = 16; } diff --git a/src/ARMInterpreter_LoadStore.cpp b/src/ARMInterpreter_LoadStore.cpp index b965c078..28256207 100644 --- a/src/ARMInterpreter_LoadStore.cpp +++ b/src/ARMInterpreter_LoadStore.cpp @@ -157,7 +157,11 @@ void StoreSingle(ARM* cpu, u8 rd, u8 rn, s32 offset, u16 ilmask) ((ARMv5*)cpu)->PU_Map = ((ARMv5*)cpu)->PU_UserMap; } - if (cpu->Num == 0) ((ARMv5*)cpu)->HandleInterlocksMemory(rd); + if (cpu->Num == 0) + { + ((ARMv5*)cpu)->HandleInterlocksMemory(rd); + ((ARMv5*)cpu)->Store = true; + } bool dabort; if constexpr (size == 8) dabort = !cpu->DataWrite8 (addr, storeval); @@ -614,14 +618,20 @@ void A_LDM(ARM* cpu) // switch back to previous regs if ((cpu->CurInstr & (1<<22)) && !(cpu->CurInstr & (1<<15))) cpu->UpdateMode((cpu->CPSR&~0x1F)|0x10, cpu->CPSR, true); - - cpu->AddCycles_CDI(); + if (__builtin_popcount(cpu->CurInstr & 0xFFFF) == 1) [[unlikely]] // single reg { + if (cpu->Num == 0 && cpu->DataRegion == Mem9_ITCM) cpu->NDS.ARM9Timestamp += 1; + cpu->AddCycles_CDI(); if (cpu->Num == 0) cpu->NDS.ARM9Timestamp = ((ARMv5*)cpu)->TimestampActual; // on arm9 single reg ldm/stm cannot overlap memory and fetch stages else; // CHECKME: ARM7 timing behavior? 
} + else + { + if (cpu->Num == 0 && cpu->DataRegion == Mem9_ITCM) cpu->NDS.ARM9Timestamp += 2; + cpu->AddCycles_CDI(); + } // handle data aborts if (dabort) [[unlikely]] @@ -739,13 +749,18 @@ void A_STM(ARM* cpu) if (cpu->CurInstr & (1<<22)) cpu->UpdateMode((cpu->CPSR&~0x1F)|0x10, cpu->CPSR, true); - cpu->AddCycles_CD(); - if (__builtin_popcount(cpu->CurInstr & 0xFFFF) == 1) [[unlikely]] // single reg { + if (cpu->Num == 0 && cpu->DataRegion == Mem9_ITCM) cpu->NDS.ARM9Timestamp += 1; + cpu->AddCycles_CD(); if (cpu->Num == 0) cpu->NDS.ARM9Timestamp = ((ARMv5*)cpu)->TimestampActual; // on arm9 single reg ldm/stm cannot overlap memory and fetch stages else; // CHECKME: ARM7 timing behavior? } + else + { + if (cpu->Num == 0 && cpu->DataRegion == Mem9_ITCM) cpu->NDS.ARM9Timestamp += 2; + cpu->AddCycles_CD(); + } // handle data aborts if (dabort) [[unlikely]] @@ -919,13 +934,18 @@ void T_PUSH(ARM* cpu) : cpu->DataWrite32S(base, cpu->R[14])); } - cpu->AddCycles_CD(); - if (__builtin_popcount(cpu->CurInstr & 0x1FF) == 1) [[unlikely]] // single reg { + if (cpu->Num == 0 && cpu->DataRegion == Mem9_ITCM) cpu->NDS.ARM9Timestamp += 1; + cpu->AddCycles_CD(); if (cpu->Num == 0) cpu->NDS.ARM9Timestamp = ((ARMv5*)cpu)->TimestampActual; // on arm9 single reg ldm/stm cannot overlap memory and fetch stages else; // CHECKME: ARM7 timing behavior? } + else + { + if (cpu->Num == 0 && cpu->DataRegion == Mem9_ITCM) cpu->NDS.ARM9Timestamp += 2; + cpu->AddCycles_CD(); + } if (dabort) [[unlikely]] { @@ -970,30 +990,47 @@ void T_POP(ARM* cpu) dabort |= !(first ? cpu->DataRead32 (base, &pc) : cpu->DataRead32S(base, &pc)); + if (__builtin_popcount(cpu->CurInstr & 0x1FF) == 1) [[unlikely]] // single reg + { + if (cpu->Num == 0 && cpu->DataRegion == Mem9_ITCM) cpu->NDS.ARM9Timestamp += 1; + cpu->AddCycles_CDI(); + if (cpu->Num == 0) cpu->NDS.ARM9Timestamp = ((ARMv5*)cpu)->TimestampActual; // on arm9 single reg ldm/stm cannot overlap memory and fetch stages + else; // CHECKME: ARM7 timing behavior? 
+ } + else + { + if (cpu->Num == 0 && cpu->DataRegion == Mem9_ITCM) cpu->NDS.ARM9Timestamp += 2; + cpu->AddCycles_CDI(); + } + if (!dabort) [[likely]] { - cpu->AddCycles_CDI(); - - if (__builtin_popcount(cpu->CurInstr & 0x1FF) == 1) [[unlikely]] // single reg - { - if (cpu->Num == 0) cpu->NDS.ARM9Timestamp = ((ARMv5*)cpu)->TimestampActual; // on arm9 single reg ldm/stm cannot overlap memory and fetch stages - else; // CHECKME: ARM7 timing behavior? - } - if (cpu->Num==1 || (((ARMv5*)cpu)->CP15Control & (1<<15))) pc |= 0x1; cpu->JumpTo(pc); base += 4; } + else [[unlikely]] + { + ((ARMv5*)cpu)->DataAbort(); + return; + } } else { - cpu->AddCycles_CDI(); + if (__builtin_popcount(cpu->CurInstr & 0x1FF) == 1) [[unlikely]] // single reg { + if (cpu->Num == 0 && cpu->DataRegion == Mem9_ITCM) cpu->NDS.ARM9Timestamp += 1; + cpu->AddCycles_CDI(); if (cpu->Num == 0) cpu->NDS.ARM9Timestamp = ((ARMv5*)cpu)->TimestampActual; // on arm9 single reg ldm/stm cannot overlap memory and fetch stages else; // CHECKME: ARM7 timing behavior? } + else + { + if (cpu->Num == 0 && cpu->DataRegion == Mem9_ITCM) cpu->NDS.ARM9Timestamp += 2; + cpu->AddCycles_CDI(); + } if (cpu->Num == 0) { @@ -1045,13 +1082,19 @@ void T_STMIA(ARM* cpu) } } - cpu->AddCycles_CD(); if (__builtin_popcount(cpu->CurInstr & 0xFF) == 1) [[unlikely]] // single reg { + if (cpu->Num == 0 && cpu->DataRegion == Mem9_ITCM) cpu->NDS.ARM9Timestamp += 1; + cpu->AddCycles_CD(); if (cpu->Num == 0) cpu->NDS.ARM9Timestamp = ((ARMv5*)cpu)->TimestampActual; // on arm9 single reg ldm/stm cannot overlap memory and fetch stages else; // CHECKME: ARM7 timing behavior? 
} + else + { + if (cpu->Num == 0 && cpu->DataRegion == Mem9_ITCM) cpu->NDS.ARM9Timestamp += 2; + cpu->AddCycles_CD(); + } if (dabort) [[unlikely]] { @@ -1090,13 +1133,18 @@ void T_LDMIA(ARM* cpu) } } - cpu->AddCycles_CDI(); - if (__builtin_popcount(cpu->CurInstr & 0xFF) == 1) [[unlikely]] // single reg { + if (cpu->Num == 0 && cpu->DataRegion == Mem9_ITCM) cpu->NDS.ARM9Timestamp += 1; + cpu->AddCycles_CDI(); if (cpu->Num == 0) cpu->NDS.ARM9Timestamp = ((ARMv5*)cpu)->TimestampActual; // on arm9 single reg ldm/stm cannot overlap memory and fetch stages else; // CHECKME: ARM7 timing behavior? } + else + { + if (cpu->Num == 0 && cpu->DataRegion == Mem9_ITCM) cpu->NDS.ARM9Timestamp += 2; + cpu->AddCycles_CDI(); + } if (dabort) [[unlikely]] { diff --git a/src/CP15.cpp b/src/CP15.cpp index 6c9c7951..c9f184d6 100644 --- a/src/CP15.cpp +++ b/src/CP15.cpp @@ -926,7 +926,8 @@ u32 ARMv5::CodeRead32(u32 addr, bool branch) if (NDS.ARM9Timestamp < MainRAMTimestamp) NDS.ARM9Timestamp = MainRAMTimestamp + ((1<>14] == DataRegion && Store) NDS.ARM9Timestamp += (1<>12] & 0x01)) [[unlikely]] @@ -984,7 +984,6 @@ bool ARMv5::DataRead8(u32 addr, u32* val) bool ARMv5::DataRead16(u32 addr, u32* val) { - Store = false; // Data Aborts // Exception is handled in the actual instruction implementation if (!(PU_Map[addr>>12] & 0x01)) [[unlikely]] @@ -1032,7 +1031,6 @@ bool ARMv5::DataRead16(u32 addr, u32* val) bool ARMv5::DataRead32(u32 addr, u32* val) { - Store = false; // Data Aborts // Exception is handled in the actual instruction implementation if (!(PU_Map[addr>>12] & 0x01)) [[unlikely]] @@ -1084,7 +1082,8 @@ bool ARMv5::DataRead32S(u32 addr, u32* val) // Exception is handled in the actual instruction implementation if (!(PU_Map[addr>>12] & 0x01)) [[unlikely]] { - DataCycles += 1; + NDS.ARM9Timestamp += DataCycles; + DataCycles = 1; return false; } @@ -1092,15 +1091,17 @@ bool ARMv5::DataRead32S(u32 addr, u32* val) if (addr < ITCMSize) { - DataCycles += 1; - ITCMTimestamp = 
NDS.ARM9Timestamp + DataCycles; + NDS.ARM9Timestamp += DataCycles; + DataCycles = 1; + // we update the timestamp during the actual function, as a sequential itcm access can only occur during instructions with strange itcm wait cycles DataRegion = Mem9_ITCM; *val = *(u32*)&ITCM[addr & (ITCMPhysicalSize - 1)]; return true; } if ((addr & DTCMMask) == DTCMBase) { - DataCycles += 1; + NDS.ARM9Timestamp += DataCycles; + DataCycles = 1; DataRegion = Mem9_DTCM; *val = *(u32*)&DTCM[addr & (DTCMPhysicalSize - 1)]; return true; @@ -1111,9 +1112,9 @@ bool ARMv5::DataRead32S(u32 addr, u32* val) NDS.ARM9Timestamp += DataCycles; - if (!(addr & 0x3FF)) return DataRead32(addr, val); // bursts cannot cross a 1kb boundary - - DataCycles = MemTimings[addr >> 12][3]; + // bursts cannot cross a 1kb boundary + if (addr & 0x3FF) DataCycles = MemTimings[addr >> 12][3]; //s + else DataCycles = MemTimings[addr >> 12][2]; // ns NDS.ARM9Timestamp = NDS.ARM9Timestamp + ((1<>12] & 0x02)) [[unlikely]] @@ -1143,7 +1143,7 @@ bool ARMv5::DataWrite8(u32 addr, u8 val) if (addr < ITCMSize) { DataCycles = 1; - ITCMTimestamp = NDS.ARM9Timestamp + DataCycles; + // does not stall (for some reason?) DataRegion = Mem9_ITCM; *(u8*)&ITCM[addr & (ITCMPhysicalSize - 1)] = val; #ifdef JIT_ENABLED @@ -1170,7 +1170,7 @@ bool ARMv5::DataWrite8(u32 addr, u8 val) if (NDS.ARM9Timestamp < MainRAMTimestamp) NDS.ARM9Timestamp = MainRAMTimestamp; DataRegion = Mem9_MainRAM; MainRAMTimestamp = NDS.ARM9Timestamp + DataCycles; - DataCycles -= (2<>14]; @@ -1196,7 +1196,6 @@ bool ARMv5::DataWrite8(u32 addr, u8 val) bool ARMv5::DataWrite16(u32 addr, u16 val) { - Store = true; // Data Aborts // Exception is handled in the actual instruction implementation if (!(PU_Map[addr>>12] & 0x02)) [[unlikely]] @@ -1210,7 +1209,7 @@ bool ARMv5::DataWrite16(u32 addr, u16 val) if (addr < ITCMSize) { DataCycles = 1; - ITCMTimestamp = NDS.ARM9Timestamp + DataCycles; + // does not stall (for some reason?) 
DataRegion = Mem9_ITCM; *(u16*)&ITCM[addr & (ITCMPhysicalSize - 1)] = val; #ifdef JIT_ENABLED @@ -1237,7 +1236,7 @@ bool ARMv5::DataWrite16(u32 addr, u16 val) if (NDS.ARM9Timestamp < MainRAMTimestamp) NDS.ARM9Timestamp = MainRAMTimestamp; DataRegion = Mem9_MainRAM; MainRAMTimestamp = NDS.ARM9Timestamp + DataCycles; - DataCycles -= (2<>14]; @@ -1263,7 +1262,6 @@ bool ARMv5::DataWrite16(u32 addr, u16 val) bool ARMv5::DataWrite32(u32 addr, u32 val) { - Store = true; // Data Aborts // Exception is handled in the actual instruction implementation if (!(PU_Map[addr>>12] & 0x02)) [[unlikely]] @@ -1277,7 +1275,7 @@ bool ARMv5::DataWrite32(u32 addr, u32 val) if (addr < ITCMSize) { DataCycles = 1; - ITCMTimestamp = NDS.ARM9Timestamp + DataCycles; + // does not stall (for some reason?) DataRegion = Mem9_ITCM; *(u32*)&ITCM[addr & (ITCMPhysicalSize - 1)] = val; #ifdef JIT_ENABLED @@ -1304,7 +1302,7 @@ bool ARMv5::DataWrite32(u32 addr, u32 val) if (NDS.ARM9Timestamp < MainRAMTimestamp) NDS.ARM9Timestamp = MainRAMTimestamp; DataRegion = Mem9_MainRAM; MainRAMTimestamp = NDS.ARM9Timestamp + DataCycles; - DataCycles -= (2<>14]; @@ -1343,7 +1341,7 @@ bool ARMv5::DataWrite32S(u32 addr, u32 val) if (addr < ITCMSize) { DataCycles += 1; - ITCMTimestamp = NDS.ARM9Timestamp + DataCycles; + // we update the timestamp during the actual function, as a sequential itcm access can only occur during instructions with strange itcm wait cycles DataRegion = Mem9_ITCM; *(u32*)&ITCM[addr & (ITCMPhysicalSize - 1)] = val; #ifdef JIT_ENABLED @@ -1362,13 +1360,16 @@ bool ARMv5::DataWrite32S(u32 addr, u32 val) if (!(PU_Map[addr>>12] & 0x30)) { DataCycles += (((NDS.ARM9Timestamp + DataCycles) + ((1<> 12][3]; //s + else DataCycles += MemTimings[addr >> 12][2]; // ns if ((addr >> 24) == 0x02) { - if ((DataRegion != Mem9_MainRAM) && ((NDS.ARM9Timestamp + DataCycles) < MainRAMTimestamp)) DataCycles += MainRAMTimestamp - NDS.ARM9Timestamp; - MainRAMTimestamp = NDS.ARM9Timestamp + DataCycles; + //if ((DataRegion 
!= Mem9_MainRAM) && ((NDS.ARM9Timestamp + DataCycles) < MainRAMTimestamp)) DataCycles += MainRAMTimestamp - NDS.ARM9Timestamp; // what the hell was i smoking here? + MainRAMTimestamp = NDS.ARM9Timestamp + DataCycles + (2 << NDS.ARM9ClockShift); DataRegion = Mem9_MainRAM; } else DataRegion = NDS.ARM9Regions[addr>>14]; @@ -1378,7 +1379,12 @@ bool ARMv5::DataWrite32S(u32 addr, u32 val) } else { - u8 cycles = NDS.ARM9MemTimings[addr>>14][3]; + u8 cycles; + // bursts cannot cross a 1kb boundary + // CHECKME: does this actually apply to the write buffer too? it should + if (addr & 0x3FF) cycles = NDS.ARM9MemTimings[addr>>14][3]; //s + else cycles = NDS.ARM9MemTimings[addr>>14][2]; // ns + if ((addr >> 24) == 0x02) { cycles = (cycles - 2) | 0x80; From d4216309a24fa7befeabbd2f19e1fee8be164a14 Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Sat, 19 Oct 2024 11:00:51 -0400 Subject: [PATCH 183/306] hdfg --- src/CP15.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/CP15.cpp b/src/CP15.cpp index 914a4e12..bdee27a6 100644 --- a/src/CP15.cpp +++ b/src/CP15.cpp @@ -2143,8 +2143,8 @@ bool ARMv5::DataRead32S(u32 addr, u32* val) NDS.ARM9Timestamp += DataCycles; // bursts cannot cross a 1kb boundary - if (addr & 0x3FF) DataCycles = MemTimings[addr >> 12][3]; //s - else DataCycles = MemTimings[addr >> 12][2]; // ns + if (addr & 0x3FF) DataCycles = MemTimings[addr >> 12][2]; //s + else DataCycles = MemTimings[addr >> 12][1]; // ns NDS.ARM9Timestamp = NDS.ARM9Timestamp + ((1<> 12][3]; //s - else DataCycles += MemTimings[addr >> 12][2]; // ns + if (addr & 0x3FF) DataCycles += MemTimings[addr >> 12][2]; //s + else DataCycles += MemTimings[addr >> 12][1]; // ns if ((addr >> 24) == 0x02) { From 8fff17f03f2d13be9f74af9607c048ab738fe3ac Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Sat, 19 Oct 2024 11:01:16 -0400 Subject: [PATCH 184/306] fix resets --- src/ARM.cpp | 1 + 1 file 
changed, 1 insertion(+) diff --git a/src/ARM.cpp b/src/ARM.cpp index 8bcdd452..a67ef56e 100644 --- a/src/ARM.cpp +++ b/src/ARM.cpp @@ -202,6 +202,7 @@ void ARMv5::Reset() PU_Map = PU_PrivMap; Store = false; + ITCMTimestamp = 0; TimestampActual = 0; ILCurrReg = 16; ILPrevReg = 16; From e254ac3240a503213ca784779dfe32333a6ea803 Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Sat, 19 Oct 2024 12:35:27 -0400 Subject: [PATCH 185/306] fix ldrd/strd itcm timings --- src/ARMInterpreter_LoadStore.cpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/ARMInterpreter_LoadStore.cpp b/src/ARMInterpreter_LoadStore.cpp index 28256207..adbd1121 100644 --- a/src/ARMInterpreter_LoadStore.cpp +++ b/src/ARMInterpreter_LoadStore.cpp @@ -317,6 +317,7 @@ A_IMPLEMENT_WB_LDRSTR(LDRB) ExecuteStage(cpu, ilmask | (1 << ((cpu->CurInstr>>16) & 0xF))); \ bool dabort = !cpu->DataRead32(offset, &cpu->R[r]); \ u32 val; dabort |= !cpu->DataRead32S(offset+4, &val); \ + if (cpu->DataRegion == Mem9_ITCM) cpu->NDS.ARM9Timestamp += 2; \ cpu->AddCycles_CDI(); \ if (dabort) { \ ((ARMv5*)cpu)->DataAbort(); \ @@ -337,6 +338,7 @@ A_IMPLEMENT_WB_LDRSTR(LDRB) ExecuteStage(cpu, ilmask | (1 << ((cpu->CurInstr>>16) & 0xF))); \ bool dabort = !cpu->DataRead32(addr, &cpu->R[r]); \ u32 val; dabort |= !cpu->DataRead32S(addr+4, &val); \ + if (cpu->DataRegion == Mem9_ITCM) cpu->NDS.ARM9Timestamp += 2; \ cpu->AddCycles_CDI(); \ if (dabort) { \ ((ARMv5*)cpu)->DataAbort(); \ @@ -359,6 +361,7 @@ A_IMPLEMENT_WB_LDRSTR(LDRB) bool dabort = !cpu->DataWrite32(offset, cpu->R[r]); /* yes, this data abort behavior is on purpose */ \ u32 storeval = cpu->R[r+1]; if (r == 14) storeval+=4; \ dabort |= !cpu->DataWrite32S (offset+4, storeval); /* no, i dont understand it either */ \ + if (cpu->DataRegion == Mem9_ITCM) cpu->NDS.ARM9Timestamp += 2; \ cpu->AddCycles_CD(); \ if (dabort) [[unlikely]] { \ ((ARMv5*)cpu)->DataAbort(); \ @@ -375,6 +378,7 @@ A_IMPLEMENT_WB_LDRSTR(LDRB) bool dabort = 
!cpu->DataWrite32(addr, cpu->R[r]); \ u32 storeval = cpu->R[r+1]; if (r == 14) storeval+=4; \ dabort |= !cpu->DataWrite32S (addr+4, storeval); \ + if (cpu->DataRegion == Mem9_ITCM) cpu->NDS.ARM9Timestamp += 2; \ cpu->AddCycles_CD(); \ if (dabort) [[unlikely]] { \ ((ARMv5*)cpu)->DataAbort(); \ From 744f5c9fcdd3339889abbbd0b81e15f2de464153 Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Sat, 19 Oct 2024 16:51:40 -0400 Subject: [PATCH 186/306] small fixes --- src/ARM.cpp | 4 ++-- src/CP15.cpp | 10 +++++++--- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/src/ARM.cpp b/src/ARM.cpp index 64d697a8..b3d3e9aa 100644 --- a/src/ARM.cpp +++ b/src/ARM.cpp @@ -1160,6 +1160,7 @@ void ARMv5::CodeFetch() NextInstr[1] >>= 16; NDS.ARM9Timestamp++; if (NDS.ARM9Timestamp < TimestampActual) NDS.ARM9Timestamp = TimestampActual; + Store = false; DataRegion = Mem9_Null; } else NextInstr[1] = CodeRead32(PC, false); @@ -1218,8 +1219,7 @@ void ARMv5::HandleInterlocksMemory(u8 reg) u64 diff = ILPrevTime - NDS.ARM9Timestamp; // should always be 1? 
NDS.ARM9Timestamp = ILPrevTime; - MainRAMTimestamp += diff; - ITCMTimestamp += diff; + ITCMTimestamp += diff; // checkme ILPrevTime = 16; } diff --git a/src/CP15.cpp b/src/CP15.cpp index bdee27a6..9603692e 100644 --- a/src/CP15.cpp +++ b/src/CP15.cpp @@ -413,6 +413,7 @@ u32 ARMv5::ICacheLookup(const u32 addr) NDS.ARM9Timestamp += 1; if (NDS.ARM9Timestamp < TimestampActual) NDS.ARM9Timestamp = TimestampActual; DataRegion = Mem9_Null; + Store = false; return cacheLine[(addr & (ICACHE_LINELENGTH -1)) >> 2]; } } @@ -489,11 +490,12 @@ u32 ARMv5::ICacheLookup(const u32 addr) if (NDS.ARM9Timestamp < MainRAMTimestamp) NDS.ARM9Timestamp = MainRAMTimestamp + ((1<>14] == DataRegion && Store) NDS.ARM9Timestamp += (1<> 14][2] + (NDS.ARM9MemTimings[tag >> 14][3] * ((DCACHE_LINELENGTH / 4) - 1)) - 1) << NDS.ARM9ClockShift) + 1; + NDS.ARM9Timestamp += CodeCycles; + if (NDS.ARM9Timestamp < TimestampActual) NDS.ARM9Timestamp = TimestampActual; + DataRegion = Mem9_Null; return ptr[(addr & (ICACHE_LINELENGTH-1)) >> 2]; } @@ -1863,6 +1865,7 @@ u32 ARMv5::CodeRead32(u32 addr, bool branch) NDS.ARM9Timestamp += CodeCycles; if (NDS.ARM9Timestamp < TimestampActual) NDS.ARM9Timestamp = TimestampActual; DataRegion = Mem9_Null; + Store = false; return 0; } @@ -1874,6 +1877,7 @@ u32 ARMv5::CodeRead32(u32 addr, bool branch) NDS.ARM9Timestamp += CodeCycles; if (NDS.ARM9Timestamp < TimestampActual) NDS.ARM9Timestamp = TimestampActual; DataRegion = Mem9_Null; + Store = false; return *(u32*)&ITCM[addr & (ITCMPhysicalSize - 1)]; } From af09e37d5347b658e73b2f78d6851d5ed5e05458 Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Mon, 21 Oct 2024 11:38:26 -0400 Subject: [PATCH 187/306] attempt at twl timings also rework main ram's early finish handling --- src/CP15.cpp | 76 +++++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 61 insertions(+), 15 deletions(-) diff --git a/src/CP15.cpp b/src/CP15.cpp index c9f184d6..f2384459 100644 --- a/src/CP15.cpp 
+++ b/src/CP15.cpp @@ -308,7 +308,21 @@ void ARMv5::UpdateRegionTimings(u32 addrstart, u32 addrend) } else { - MemTimings[i][0] = ((bustimings[2] - 1) << NDS.ARM9ClockShift) + 1; + if (NDS.ARM9ClockShift == 1) // ntr + { + MemTimings[i][0] = (bustimings[2] << NDS.ARM9ClockShift) - 1; + } + else // twl + { + if (NDS.ARM9Regions[i>>2] != Mem9_MainRAM) + { + MemTimings[i][0] = ((bustimings[2] - 1) << NDS.ARM9ClockShift) - 1; + } + else + { + MemTimings[i][0] = (bustimings[2] << NDS.ARM9ClockShift) - 1; + } + } } if (pu & 0x10) @@ -319,9 +333,27 @@ void ARMv5::UpdateRegionTimings(u32 addrstart, u32 addrend) } else { - MemTimings[i][1] = ((bustimings[0] - 1) << NDS.ARM9ClockShift) + 1; - MemTimings[i][2] = ((bustimings[2] - 1) << NDS.ARM9ClockShift) + 1; - MemTimings[i][3] = ((bustimings[3] - 1) << NDS.ARM9ClockShift) + 1;; // inaccurate but ehgh + if (NDS.ARM9ClockShift == 1) // ntr + { + MemTimings[i][1] = (bustimings[0] << NDS.ARM9ClockShift) - 1; + MemTimings[i][2] = (bustimings[2] << NDS.ARM9ClockShift) - 1; + MemTimings[i][3] = (bustimings[3] << NDS.ARM9ClockShift) - 1; + } + else // twl + { + if (NDS.ARM9Regions[i>>2] != Mem9_MainRAM) + { + MemTimings[i][1] = ((bustimings[0] - 1) << NDS.ARM9ClockShift) - 1; // 8/16 bit + MemTimings[i][2] = ((bustimings[2] - 1) << NDS.ARM9ClockShift) - 1; // 32 bit ns + MemTimings[i][3] = (bustimings[3] << NDS.ARM9ClockShift) - 1; // s + } + else // main ram; timing adjustments are handled elsewhere + { + MemTimings[i][1] = (bustimings[0] << NDS.ARM9ClockShift) - 1; + MemTimings[i][2] = (bustimings[2] << NDS.ARM9ClockShift) - 1; + MemTimings[i][3] = (bustimings[3] << NDS.ARM9ClockShift) - 1; + } + } } } } @@ -924,11 +956,20 @@ u32 ARMv5::CodeRead32(u32 addr, bool branch) if ((addr >> 24) == 0x02) { if (NDS.ARM9Timestamp < MainRAMTimestamp) NDS.ARM9Timestamp = MainRAMTimestamp + ((1<>14] == DataRegion && Store) NDS.ARM9Timestamp += (1<>14] == DataRegion && Store) NDS.ARM9Timestamp += (1<>14]; @@ -1021,6 +1063,7 @@ bool 
ARMv5::DataRead16(u32 addr, u32* val) { if (NDS.ARM9Timestamp < MainRAMTimestamp) NDS.ARM9Timestamp = MainRAMTimestamp; MainRAMTimestamp = NDS.ARM9Timestamp + DataCycles; + if (NDS.ARM9ClockShift == 2) DataCycles -= 4; DataRegion = Mem9_MainRAM; } else DataRegion = NDS.ARM9Regions[addr>>14]; @@ -1068,6 +1111,7 @@ bool ARMv5::DataRead32(u32 addr, u32* val) { if (NDS.ARM9Timestamp < MainRAMTimestamp) NDS.ARM9Timestamp = MainRAMTimestamp; MainRAMTimestamp = NDS.ARM9Timestamp + DataCycles; + if (NDS.ARM9ClockShift == 2) DataCycles -= 4; DataRegion = Mem9_MainRAM; } else DataRegion = NDS.ARM9Regions[addr>>14]; @@ -1115,13 +1159,14 @@ bool ARMv5::DataRead32S(u32 addr, u32* val) // bursts cannot cross a 1kb boundary if (addr & 0x3FF) DataCycles = MemTimings[addr >> 12][3]; //s else DataCycles = MemTimings[addr >> 12][2]; // ns - - NDS.ARM9Timestamp = NDS.ARM9Timestamp + ((1<> 24) == 0x02) { - if (NDS.ARM9Timestamp < MainRAMTimestamp) NDS.ARM9Timestamp = MainRAMTimestamp; + if ((NDS.ARM9Timestamp + DataCycles) < MainRAMTimestamp) DataCycles = MainRAMTimestamp - NDS.ARM9Timestamp; MainRAMTimestamp = NDS.ARM9Timestamp + DataCycles; + if (NDS.ARM9ClockShift == 2) DataCycles -= 4; DataRegion = Mem9_MainRAM; } else DataRegion = NDS.ARM9Regions[addr>>14]; @@ -1170,7 +1215,7 @@ bool ARMv5::DataWrite8(u32 addr, u8 val) if (NDS.ARM9Timestamp < MainRAMTimestamp) NDS.ARM9Timestamp = MainRAMTimestamp; DataRegion = Mem9_MainRAM; MainRAMTimestamp = NDS.ARM9Timestamp + DataCycles; - DataCycles -= (3<>14]; @@ -1236,7 +1281,7 @@ bool ARMv5::DataWrite16(u32 addr, u16 val) if (NDS.ARM9Timestamp < MainRAMTimestamp) NDS.ARM9Timestamp = MainRAMTimestamp; DataRegion = Mem9_MainRAM; MainRAMTimestamp = NDS.ARM9Timestamp + DataCycles; - DataCycles -= (3<>14]; @@ -1302,7 +1347,7 @@ bool ARMv5::DataWrite32(u32 addr, u32 val) if (NDS.ARM9Timestamp < MainRAMTimestamp) NDS.ARM9Timestamp = MainRAMTimestamp; DataRegion = Mem9_MainRAM; MainRAMTimestamp = NDS.ARM9Timestamp + DataCycles; - DataCycles -= 
(3<>14]; @@ -1368,8 +1413,9 @@ bool ARMv5::DataWrite32S(u32 addr, u32 val) if ((addr >> 24) == 0x02) { - //if ((DataRegion != Mem9_MainRAM) && ((NDS.ARM9Timestamp + DataCycles) < MainRAMTimestamp)) DataCycles += MainRAMTimestamp - NDS.ARM9Timestamp; // what the hell was i smoking here? - MainRAMTimestamp = NDS.ARM9Timestamp + DataCycles + (2 << NDS.ARM9ClockShift); + if ((NDS.ARM9Timestamp + DataCycles) < MainRAMTimestamp) DataCycles = MainRAMTimestamp - NDS.ARM9Timestamp; + MainRAMTimestamp = NDS.ARM9Timestamp + DataCycles; + DataCycles -= 3 << NDS.ARM9ClockShift; // checkme: are sequentials actually - 3? DataRegion = Mem9_MainRAM; } else DataRegion = NDS.ARM9Regions[addr>>14]; @@ -1387,7 +1433,7 @@ bool ARMv5::DataWrite32S(u32 addr, u32 val) if ((addr >> 24) == 0x02) { - cycles = (cycles - 2) | 0x80; + cycles = (cycles - 3) | 0x80; } WriteBufferWrite(val, 2, cycles, addr); From 54dd4e591365e1d9ca718d2772edfb933b976451 Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Mon, 21 Oct 2024 13:15:15 -0400 Subject: [PATCH 188/306] git hates me --- src/CP15.cpp | 23 ++++++++++++++++++++--- 1 file changed, 20 insertions(+), 3 deletions(-) diff --git a/src/CP15.cpp b/src/CP15.cpp index 9a87e620..19bc1339 100644 --- a/src/CP15.cpp +++ b/src/CP15.cpp @@ -332,9 +332,26 @@ void ARMv5::UpdateRegionTimings(u32 addrstart, u32 addrend) // checkme: should these be (bus timings shifted) - 1 or ((bustimings - 1) shifted) + 1 // should the last cycle be halved...? 
- MemTimings[i][0] = ((bustimings[0] - 1) << NDS.ARM9ClockShift) + 1; - MemTimings[i][1] = ((bustimings[2] - 1) << NDS.ARM9ClockShift) + 1; - MemTimings[i][2] = ((bustimings[3] - 1) << NDS.ARM9ClockShift) + 1; + if (NDS.ARM9ClockShift == 1) + { + MemTimings[i][0] = (bustimings[0] << NDS.ARM9ClockShift) - 1; + MemTimings[i][1] = (bustimings[2] << NDS.ARM9ClockShift) - 1; + MemTimings[i][2] = (bustimings[3] << NDS.ARM9ClockShift) - 1; + } + else + { + if (NDS.ARM9Regions[i] != Mem9_MainRAM) + { + MemTimings[i][0] = ((bustimings[0] - 1) << NDS.ARM9ClockShift) - 1; + MemTimings[i][1] = ((bustimings[2] - 1) << NDS.ARM9ClockShift) - 1; + } + else + { + MemTimings[i][0] = (bustimings[0] << NDS.ARM9ClockShift) - 1; + MemTimings[i][1] = (bustimings[2] << NDS.ARM9ClockShift) - 1; + } + MemTimings[i][2] = (bustimings[3] << NDS.ARM9ClockShift) - 1; + } } } From d31f652fc816cfafc2156e773033ef1d150f5426 Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Fri, 25 Oct 2024 18:15:54 -0400 Subject: [PATCH 189/306] implement icache streaming --- src/ARM.cpp | 20 +++---- src/ARM.h | 5 ++ src/CP15.cpp | 150 +++++++++++++++++++++++++++++++++++++++------------ 3 files changed, 130 insertions(+), 45 deletions(-) diff --git a/src/ARM.cpp b/src/ARM.cpp index b3d3e9aa..b2211173 100644 --- a/src/ARM.cpp +++ b/src/ARM.cpp @@ -207,6 +207,8 @@ void ARMv5::Reset() ILCurrReg = 16; ILPrevReg = 16; + ICacheFillPtr = 7; + WBWritePointer = 16; WBFillPointer = 0; WBDelay = 0; @@ -320,30 +322,29 @@ void ARMv5::JumpTo(u32 addr, bool restorecpsr) //if (addr == 0x0201764C) printf("capture test %d: R1=%08X\n", R[6], R[1]); //if (addr == 0x020175D8) printf("capture test %d: res=%08X\n", R[6], R[0]); - u32 oldregion = R[15] >> 24; - u32 newregion = addr >> 24; + if (ICacheFillPtr != 7) + { + u64 fillend = ICacheFillTimes[6] + 1; + if (NDS.ARM9Timestamp < fillend) NDS.ARM9Timestamp = fillend; + ICacheFillPtr = 7; + } if (addr & 0x1) { addr &= ~0x1; R[15] = addr+2; - if 
(newregion != oldregion) SetupCodeMem(addr); - // two-opcodes-at-once fetch // doesn't matter if we put garbage in the MSbs there if (addr & 0x2) { NextInstr[0] = CodeRead32(addr-2, true) >> 16; - Cycles += CodeCycles; NextInstr[1] = CodeRead32(addr+2, false); - Cycles += CodeCycles; } else { NextInstr[0] = CodeRead32(addr, true); NextInstr[1] = NextInstr[0] >> 16; - Cycles += CodeCycles; } CPSR |= 0x20; @@ -352,13 +353,8 @@ void ARMv5::JumpTo(u32 addr, bool restorecpsr) { addr &= ~0x3; R[15] = addr+4; - - if (newregion != oldregion) SetupCodeMem(addr); - NextInstr[0] = CodeRead32(addr, true); - Cycles += CodeCycles; NextInstr[1] = CodeRead32(addr+4, false); - Cycles += CodeCycles; CPSR &= ~0x20; } diff --git a/src/ARM.h b/src/ARM.h index ad97261b..657e2069 100644 --- a/src/ARM.h +++ b/src/ARM.h @@ -673,6 +673,11 @@ public: u64 ILCurrTime; u64 ILPrevTime; + u8 ICacheFillPtr; + u8 DCacheFillPtr; + u64 ICacheFillTimes[7]; + u64 DCacheFillTimes[7]; + u8 WBWritePointer; // which entry to attempt to write next; should always be ANDed with 0xF after incrementing u8 WBFillPointer; // where the next entry should be added; should always be ANDed with 0xF after incrementing bool WBWriting; // whether the buffer is actively trying to perform a write diff --git a/src/CP15.cpp b/src/CP15.cpp index 19bc1339..1278fb6c 100644 --- a/src/CP15.cpp +++ b/src/CP15.cpp @@ -418,16 +418,29 @@ u32 ARMv5::ICacheLookup(const u32 addr) // retreive the data from memory, even if the data was cached // See arm946e-s Rev 1 technical manual, 2.3.15 "Register 15, test State Register") WriteBufferDrain(); - CodeCycles = NDS.ARM9MemTimings[tag >> 14][2]; - if (CodeMem.Mem) - { - return *(u32*)&CodeMem.Mem[(addr & CodeMem.Mask) & ~3]; - } else + CodeCycles = NDS.ARM9MemTimings[tag >> 14][2]; { return NDS.ARM9Read32(addr & ~3); } } - NDS.ARM9Timestamp += 1; + + if (ICacheFillPtr == 7) NDS.ARM9Timestamp++; + else + { + u64 nextfill = ICacheFillTimes[ICacheFillPtr++]; + if (NDS.ARM9Timestamp < nextfill) 
+ { + NDS.ARM9Timestamp = nextfill; + } + else + { + u64 fillend = ICacheFillTimes[6] + 2; + if (NDS.ARM9Timestamp < fillend) NDS.ARM9Timestamp = fillend; + else NDS.ARM9Timestamp++; + ICacheFillPtr = 7; + } + } + if (NDS.ARM9Timestamp < TimestampActual) NDS.ARM9Timestamp = TimestampActual; DataRegion = Mem9_Null; Store = false; @@ -442,11 +455,7 @@ u32 ARMv5::ICacheLookup(const u32 addr) if (CP15BISTTestStateRegister & CP15_BIST_TR_DISABLE_ICACHE_LINEFILL) [[unlikely]] { WriteBufferDrain(); - CodeCycles = NDS.ARM9MemTimings[tag >> 14][2]; - if (CodeMem.Mem) - { - return *(u32*)&CodeMem.Mem[(addr & CodeMem.Mask) & ~3]; - } else + CodeCycles = NDS.ARM9MemTimings[tag >> 14][2]; { return NDS.ARM9Read32(addr & ~3); } @@ -481,14 +490,17 @@ u32 ARMv5::ICacheLookup(const u32 addr) line += id; u32* ptr = (u32 *)&ICache[line << ICACHE_LINELENGTH_LOG2]; - + + // bus reads can only overlap with icache streaming by 6 cycles + // checkme: does cache trigger this? + if (ICacheFillPtr != 7) + { + u64 time = ICacheFillTimes[6] - 6; // checkme: minus 6? 
+ if (NDS.ARM9Timestamp < time) NDS.ARM9Timestamp = time; + } + WriteBufferDrain(); - if (CodeMem.Mem) - { - memcpy(ptr, &CodeMem.Mem[tag & CodeMem.Mask], ICACHE_LINELENGTH); - } - else { for (int i = 0; i < ICACHE_LINELENGTH; i+=sizeof(u32)) ptr[i >> 2] = NDS.ARM9Read32(tag+i); @@ -509,10 +521,23 @@ u32 ARMv5::ICacheLookup(const u32 addr) else if (NDS.ARM9Regions[addr>>14] == DataRegion && Store) NDS.ARM9Timestamp += (1<> 14][2] + (NDS.ARM9MemTimings[tag >> 14][3] * ((DCACHE_LINELENGTH / 4) - 1)) - 1) << NDS.ARM9ClockShift) + 1; - NDS.ARM9Timestamp += CodeCycles; + u8 ns = MemTimings[addr>>14][1]; + u8 seq = MemTimings[addr>>14][2] + 1; + + u8 linepos = (addr & 0x1F) >> 2; // technically this is one too low, but we want that actually + + u32 cycles = ns + (seq * linepos); + NDS.ARM9Timestamp = cycles += NDS.ARM9Timestamp; if (NDS.ARM9Timestamp < TimestampActual) NDS.ARM9Timestamp = TimestampActual; + ICacheFillPtr = linepos; + for (int i = linepos; i < 7; i++) + { + cycles += seq; + ICacheFillTimes[i] = cycles; + } + if ((addr >> 24) == 0x02) MainRAMTimestamp = ICacheFillTimes[6]; + DataRegion = Mem9_Null; return ptr[(addr & (ICACHE_LINELENGTH-1)) >> 2]; } @@ -1876,10 +1901,8 @@ u32 ARMv5::CodeRead32(u32 addr, bool branch) // prefetch abort // the actual exception is not raised until the aborted instruction is executed if (!(PU_Map[addr>>12] & 0x04)) [[unlikely]] - { - CodeCycles = 1; - - NDS.ARM9Timestamp += CodeCycles; + { + NDS.ARM9Timestamp += 1; if (NDS.ARM9Timestamp < TimestampActual) NDS.ARM9Timestamp = TimestampActual; DataRegion = Mem9_Null; Store = false; @@ -1888,10 +1911,8 @@ u32 ARMv5::CodeRead32(u32 addr, bool branch) if (addr < ITCMSize) { - CodeCycles = 1; - if (NDS.ARM9Timestamp < ITCMTimestamp) NDS.ARM9Timestamp = ITCMTimestamp; - NDS.ARM9Timestamp += CodeCycles; + NDS.ARM9Timestamp += 1; if (NDS.ARM9Timestamp < TimestampActual) NDS.ARM9Timestamp = TimestampActual; DataRegion = Mem9_Null; Store = false; @@ -1910,7 +1931,7 @@ u32 
ARMv5::CodeRead32(u32 addr, bool branch) #endif } - CodeCycles = MemTimings[addr >> 14][1]; + u8 cycles = MemTimings[addr >> 14][1]; if (PU_Map[addr>>12] & 0x30) WriteBufferDrain(); @@ -1920,7 +1941,7 @@ u32 ARMv5::CodeRead32(u32 addr, bool branch) if ((addr >> 24) == 0x02) { if (NDS.ARM9Timestamp < MainRAMTimestamp) NDS.ARM9Timestamp = MainRAMTimestamp + ((1<>14] == DataRegion && Store) NDS.ARM9Timestamp += (1<>12] & 0x30) WriteBufferDrain(); @@ -2044,6 +2072,14 @@ bool ARMv5::DataRead16(u32 addr, u32* val) } #endif + // bus reads can only overlap with icache streaming by 6 cycles + // checkme: does cache trigger this? + if (ICacheFillPtr != 7) + { + u64 time = ICacheFillTimes[6] - 6; // checkme: minus 6? + if (NDS.ARM9Timestamp < time) NDS.ARM9Timestamp = time; + } + if (PU_Map[addr>>12] & 0x30) WriteBufferDrain(); @@ -2106,6 +2142,14 @@ bool ARMv5::DataRead32(u32 addr, u32* val) } #endif + // bus reads can only overlap with icache streaming by 6 cycles + // checkme: does cache trigger this? + if (ICacheFillPtr != 7) + { + u64 time = ICacheFillTimes[6] - 6; // checkme: minus 6? + if (NDS.ARM9Timestamp < time) NDS.ARM9Timestamp = time; + } + if (PU_Map[addr>>12] & 0x30) WriteBufferDrain(); @@ -2170,14 +2214,22 @@ bool ARMv5::DataRead32S(u32 addr, u32* val) } #endif + // bus reads can only overlap with icache streaming by 6 cycles + // checkme: does cache trigger this? + if (ICacheFillPtr != 7) + { + u64 time = ICacheFillTimes[6] - 6; // checkme: minus 6? 
+ if (NDS.ARM9Timestamp < time) NDS.ARM9Timestamp = time; + } + if (PU_Map[addr>>12] & 0x30) WriteBufferDrain(); NDS.ARM9Timestamp += DataCycles; // bursts cannot cross a 1kb boundary - if (addr & 0x3FF) DataCycles = MemTimings[addr >> 12][3]; //s - else DataCycles = MemTimings[addr >> 12][2]; // ns + if (addr & 0x3FF) DataCycles = MemTimings[addr >> 12][2]; //s + else DataCycles = MemTimings[addr >> 12][1]; // ns DataCycles += (((NDS.ARM9Timestamp + DataCycles) + ((1<>12] & (0x30))) { @@ -2314,6 +2374,14 @@ bool ARMv5::DataWrite16(u32 addr, u16 val) } } #endif + + // bus reads can only overlap with icache streaming by 6 cycles + // checkme: does cache trigger this? + if (ICacheFillPtr != 7) + { + u64 time = ICacheFillTimes[6] - 6; // checkme: minus 6? + if (NDS.ARM9Timestamp < time) NDS.ARM9Timestamp = time; + } if (!(PU_Map[addr>>12] & 0x30)) { @@ -2394,6 +2462,14 @@ bool ARMv5::DataWrite32(u32 addr, u32 val) } } #endif + + // bus reads can only overlap with icache streaming by 6 cycles + // checkme: does cache trigger this? + if (ICacheFillPtr != 7) + { + u64 time = ICacheFillTimes[6] - 6; // checkme: minus 6? + if (NDS.ARM9Timestamp < time) NDS.ARM9Timestamp = time; + } if (!(PU_Map[addr>>12] & 0x30)) { @@ -2473,6 +2549,14 @@ bool ARMv5::DataWrite32S(u32 addr, u32 val) } } #endif + + // bus reads can only overlap with icache streaming by 6 cycles + // checkme: does cache trigger this? + if (ICacheFillPtr != 7) + { + u64 time = ICacheFillTimes[6] - 6; // checkme: minus 6? 
+ if (NDS.ARM9Timestamp < time) NDS.ARM9Timestamp = time; + } if (!(PU_Map[addr>>12] & 0x30)) { From ebb63dcdb203b2b42fe96572fbc67eb9d9f5cdbb Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Fri, 25 Oct 2024 18:53:22 -0400 Subject: [PATCH 190/306] implement dcache streaming --- src/ARM.cpp | 1 + src/CP15.cpp | 135 +++++++++++++++++++++++++++++++++++++++------------ 2 files changed, 106 insertions(+), 30 deletions(-) diff --git a/src/ARM.cpp b/src/ARM.cpp index b2211173..fe01ea4f 100644 --- a/src/ARM.cpp +++ b/src/ARM.cpp @@ -208,6 +208,7 @@ void ARMv5::Reset() ILPrevReg = 16; ICacheFillPtr = 7; + DCacheFillPtr = 7; WBWritePointer = 16; WBFillPointer = 0; diff --git a/src/CP15.cpp b/src/CP15.cpp index 1278fb6c..463208d0 100644 --- a/src/CP15.cpp +++ b/src/CP15.cpp @@ -637,10 +637,29 @@ u32 ARMv5::DCacheLookup(const u32 addr) DataCycles = NDS.ARM9MemTimings[tag >> 14][2]; return BusRead32(addr & ~3); } - DataCycles += 1; + + NDS.ARM9Timestamp += DataCycles; + DataCycles = 0; + + if (DCacheFillPtr == 7) DataCycles = 1; + else + { + u64 nextfill = DCacheFillTimes[DCacheFillPtr++]; + if (NDS.ARM9Timestamp < nextfill) + { + DataCycles = nextfill - NDS.ARM9Timestamp; + } + else + { + u64 fillend = DCacheFillTimes[6] + 2; + if (NDS.ARM9Timestamp < fillend) DataCycles = fillend - NDS.ARM9Timestamp; + else DataCycles = 1; + DCacheFillPtr = 7; + } + } DataRegion = Mem9_DCache; - //Log(LogLevel::Debug, "DCache hit at %08lx returned %08x from set %i, line %i\n", addr, cacheLine[(addr & (ICACHE_LINELENGTH -1)) >> 2], set, id>>2); - return cacheLine[(addr & (ICACHE_LINELENGTH -1)) >> 2]; + //Log(LogLevel::Debug, "DCache hit at %08lx returned %08x from set %i, line %i\n", addr, cacheLine[(addr & (DCACHE_LINELENGTH -1)) >> 2], set, id>>2); + return cacheLine[(addr & (DCACHE_LINELENGTH -1)) >> 2]; } } @@ -683,39 +702,53 @@ u32 ARMv5::DCacheLookup(const u32 addr) line += id; u32* ptr = (u32 *)&DCache[line << DCACHE_LINELENGTH_LOG2]; - - 
DataCycles = 0; + + NDS.ARM9Timestamp += DataCycles; + //DataCycles = 0; #if !DISABLE_CACHEWRITEBACK // Before we fill the cacheline, we need to write back dirty content // Datacycles will be incremented by the required cycles to do so DCacheClearByASetAndWay(line & (DCACHE_SETS-1), line >> DCACHE_SETS_LOG2); #endif - WriteBufferDrain(); - //Log(LogLevel::Debug,"DCache miss, load @ %08x\n", tag); + WriteBufferDrain(); // checkme? + for (int i = 0; i < DCACHE_LINELENGTH; i+=sizeof(u32)) { - ptr[i >> 2] = BusRead32(tag+i); - //Log(LogLevel::Debug,"DCache store @ %08x: %08x in set %i, line %i\n", tag+i, *(u32*)&ptr[i >> 2], line & 3, line >> 2); + ptr[i >> 2] = BusRead32(tag+i); } DCacheTags[line] = tag | (line & (DCACHE_SETS-1)) | CACHE_FLAG_VALID; - - // ouch :/ - //printf("cache miss %08X: %d/%d\n", addr, NDS::ARM9MemTimings[addr >> 14][2], NDS::ARM9MemTimings[addr >> 14][3]); - // first N32 remaining S32 - NDS.ARM9Timestamp = NDS.ARM9Timestamp + ((1<> 14][2] + (NDS.ARM9MemTimings[tag >> 14][3] * ((DCACHE_LINELENGTH / 4) - 2)) - 1) << NDS.ARM9ClockShift) + 1; - DataCycles = NDS.ARM9MemTimings[tag>>14][3] << NDS.ARM9ClockShift; - if ((addr >> 24) == 0x02) { if (NDS.ARM9Timestamp < MainRAMTimestamp) NDS.ARM9Timestamp = MainRAMTimestamp; - MainRAMTimestamp = NDS.ARM9Timestamp + DataCycles; - DataRegion = Mem9_MainRAM; } - else DataRegion = NDS.ARM9Regions[addr>>14]; + + NDS.ARM9Timestamp = NDS.ARM9Timestamp + ((1<>14][1]; + u8 seq = MemTimings[addr>>14][2] + 1; + + u8 linepos = (addr & 0x1F) >> 2; // technically this is one too low, but we want that actually + + u32 cycles = ns + (seq * linepos); + DataCycles = cycles; + + cycles += NDS.ARM9Timestamp; + + DCacheFillPtr = linepos; + for (int i = linepos; i < 7; i++) + { + cycles += seq; + DCacheFillTimes[i] = cycles; + } + if ((addr >> 24) == 0x02) MainRAMTimestamp = DCacheFillTimes[6]; + + DataRegion = NDS.ARM9Regions[addr>>14]; + + //NDS.ARM9Timestamp += ((NDS.ARM9MemTimings[tag >> 14][2] + (NDS.ARM9MemTimings[tag >> 
14][3] * ((DCACHE_LINELENGTH / 4) - 2)) - 1) << NDS.ARM9ClockShift) + 1; + //DataCycles = NDS.ARM9MemTimings[tag>>14][3] << NDS.ARM9ClockShift; return ptr[(addr & (DCACHE_LINELENGTH-1)) >> 2]; } @@ -1048,7 +1081,7 @@ void ARMv5::DCacheClearByASetAndWay(const u8 cacheSet, const u8 cacheLine) WriteBufferWrite(ptr[1], 2, cycless, tag+0x04); WriteBufferWrite(ptr[2], 2, cycless, tag+0x08); WriteBufferWrite(ptr[3], 2, cycless, tag+0x0C); - DataCycles += 5; + NDS.ARM9Timestamp += 5; //DataCycles += 5; CHECKME: does this function like a write does but with mcr? } if (DCacheTags[index] & CACHE_FLAG_DIRTY_UPPERHALF) // todo: check how this behaves when both fields need to be written { @@ -1065,7 +1098,7 @@ void ARMv5::DCacheClearByASetAndWay(const u8 cacheSet, const u8 cacheLine) WriteBufferWrite(ptr[5], 2, cycless, tag+0x14); WriteBufferWrite(ptr[6], 2, cycless, tag+0x18); WriteBufferWrite(ptr[7], 2, cycless, tag+0x1C); - DataCycles += 5; + NDS.ARM9Timestamp += 5; //DataCycles += 5; CHECKME: does this function like a write does but with mcr? } DCacheTags[index] &= ~(CACHE_FLAG_DIRTY_LOWERHALF | CACHE_FLAG_DIRTY_UPPERHALF); #endif @@ -1964,6 +1997,13 @@ u32 ARMv5::CodeRead32(u32 addr, bool branch) bool ARMv5::DataRead8(u32 addr, u32* val) { + if (DCacheFillPtr != 7) + { + u64 fillend = DCacheFillTimes[6] + 1; + if (NDS.ARM9Timestamp < fillend) NDS.ARM9Timestamp = fillend; // checkme: should this be data cycles? 
+ DCacheFillPtr = 7; + } + // Data Aborts // Exception is handled in the actual instruction implementation if (!(PU_Map[addr>>12] & 0x01)) [[unlikely]] @@ -2019,7 +2059,7 @@ bool ARMv5::DataRead8(u32 addr, u32* val) if ((addr >> 24) == 0x02) { - if (NDS.ARM9Timestamp < MainRAMTimestamp) NDS.ARM9Timestamp = MainRAMTimestamp; + if (NDS.ARM9Timestamp < MainRAMTimestamp) NDS.ARM9Timestamp = MainRAMTimestamp + ((1<>12] & 0x01)) [[unlikely]] @@ -2089,7 +2136,7 @@ bool ARMv5::DataRead16(u32 addr, u32* val) if ((addr >> 24) == 0x02) { - if (NDS.ARM9Timestamp < MainRAMTimestamp) NDS.ARM9Timestamp = MainRAMTimestamp; + if (NDS.ARM9Timestamp < MainRAMTimestamp) NDS.ARM9Timestamp = MainRAMTimestamp + ((1<>12] & 0x01)) [[unlikely]] @@ -2159,7 +2213,7 @@ bool ARMv5::DataRead32(u32 addr, u32* val) if ((addr >> 24) == 0x02) { - if (NDS.ARM9Timestamp < MainRAMTimestamp) NDS.ARM9Timestamp = MainRAMTimestamp; + if (NDS.ARM9Timestamp < MainRAMTimestamp) NDS.ARM9Timestamp = MainRAMTimestamp + ((1<> 24) == 0x02) { - if ((NDS.ARM9Timestamp + DataCycles) < MainRAMTimestamp) DataCycles = MainRAMTimestamp - NDS.ARM9Timestamp; + if ((NDS.ARM9Timestamp + DataCycles) < MainRAMTimestamp) DataCycles = (MainRAMTimestamp - NDS.ARM9Timestamp) + ((1<>12] & 0x02)) [[unlikely]] @@ -2304,7 +2365,7 @@ bool ARMv5::DataWrite8(u32 addr, u8 val) if ((addr >> 24) == 0x02) { - if (NDS.ARM9Timestamp < MainRAMTimestamp) NDS.ARM9Timestamp = MainRAMTimestamp; + if (NDS.ARM9Timestamp < MainRAMTimestamp) NDS.ARM9Timestamp = MainRAMTimestamp + ((1<>12] & 0x02)) [[unlikely]] @@ -2391,7 +2459,7 @@ bool ARMv5::DataWrite16(u32 addr, u16 val) if ((addr >> 24) == 0x02) { - if (NDS.ARM9Timestamp < MainRAMTimestamp) NDS.ARM9Timestamp = MainRAMTimestamp; + if (NDS.ARM9Timestamp < MainRAMTimestamp) NDS.ARM9Timestamp = MainRAMTimestamp + ((1<>12] & 0x02)) [[unlikely]] @@ -2479,7 +2554,7 @@ bool ARMv5::DataWrite32(u32 addr, u32 val) if ((addr >> 24) == 0x02) { - if (NDS.ARM9Timestamp < MainRAMTimestamp) NDS.ARM9Timestamp = 
MainRAMTimestamp; + if (NDS.ARM9Timestamp < MainRAMTimestamp) NDS.ARM9Timestamp = MainRAMTimestamp + ((1<> 24) == 0x02) { - if ((NDS.ARM9Timestamp + DataCycles) < MainRAMTimestamp) DataCycles = MainRAMTimestamp - NDS.ARM9Timestamp; + if ((NDS.ARM9Timestamp + DataCycles) < MainRAMTimestamp) DataCycles = (MainRAMTimestamp - NDS.ARM9Timestamp) + ((1< Date: Fri, 25 Oct 2024 19:35:58 -0400 Subject: [PATCH 191/306] fix an oopsie 3-2 (unsigned) equals "oh no" --- src/CP15.cpp | 36 +++++++++++++++++------------------- 1 file changed, 17 insertions(+), 19 deletions(-) diff --git a/src/CP15.cpp b/src/CP15.cpp index 463208d0..c9b809d5 100644 --- a/src/CP15.cpp +++ b/src/CP15.cpp @@ -330,8 +330,6 @@ void ARMv5::UpdateRegionTimings(u32 addrstart, u32 addrend) { u8* bustimings = NDS.ARM9MemTimings[i]; - // checkme: should these be (bus timings shifted) - 1 or ((bustimings - 1) shifted) + 1 - // should the last cycle be halved...? if (NDS.ARM9ClockShift == 1) { MemTimings[i][0] = (bustimings[0] << NDS.ARM9ClockShift) - 1; @@ -2053,13 +2051,13 @@ bool ARMv5::DataRead8(u32 addr, u32* val) if (PU_Map[addr>>12] & 0x30) WriteBufferDrain(); - NDS.ARM9Timestamp = NDS.ARM9Timestamp + ((1<> 14][0]; if ((addr >> 24) == 0x02) { - if (NDS.ARM9Timestamp < MainRAMTimestamp) NDS.ARM9Timestamp = MainRAMTimestamp + ((1<>12] & 0x30) WriteBufferDrain(); - NDS.ARM9Timestamp = NDS.ARM9Timestamp + ((1<> 14][0]; if ((addr >> 24) == 0x02) { - if (NDS.ARM9Timestamp < MainRAMTimestamp) NDS.ARM9Timestamp = MainRAMTimestamp + ((1<>12] & 0x30) WriteBufferDrain(); - NDS.ARM9Timestamp = NDS.ARM9Timestamp + ((1<> 14][1]; if ((addr >> 24) == 0x02) { - if (NDS.ARM9Timestamp < MainRAMTimestamp) NDS.ARM9Timestamp = MainRAMTimestamp + ((1<> 12][2]; //s else DataCycles = MemTimings[addr >> 12][1]; // ns - DataCycles += (((NDS.ARM9Timestamp + DataCycles) + ((1<> 24) == 0x02) { - if ((NDS.ARM9Timestamp + DataCycles) < MainRAMTimestamp) DataCycles = (MainRAMTimestamp - NDS.ARM9Timestamp) + ((1<>12] & (0x30))) { - 
NDS.ARM9Timestamp = NDS.ARM9Timestamp + ((1<> 14][0]; if ((addr >> 24) == 0x02) { - if (NDS.ARM9Timestamp < MainRAMTimestamp) NDS.ARM9Timestamp = MainRAMTimestamp + ((1<>12] & 0x30)) { - NDS.ARM9Timestamp = NDS.ARM9Timestamp + ((1<> 14][0]; if ((addr >> 24) == 0x02) { - if (NDS.ARM9Timestamp < MainRAMTimestamp) NDS.ARM9Timestamp = MainRAMTimestamp + ((1<>12] & 0x30)) { - NDS.ARM9Timestamp = NDS.ARM9Timestamp + ((1<> 14][1]; if ((addr >> 24) == 0x02) { - if (NDS.ARM9Timestamp < MainRAMTimestamp) NDS.ARM9Timestamp = MainRAMTimestamp + ((1<>12] & 0x30)) { - DataCycles += (((NDS.ARM9Timestamp + DataCycles) + ((1<> 24) == 0x02) { - if ((NDS.ARM9Timestamp + DataCycles) < MainRAMTimestamp) DataCycles = (MainRAMTimestamp - NDS.ARM9Timestamp) + ((1<> 24) == 0x02) { - cycles = (cycles - 3) | 0x80; + cycles = (cycles - 2) | 0x80; } WriteBufferWrite(val, 2, cycles, addr); From e1f22bd5117f8bcbae9cda9ef530c3c3b33d54e4 Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Fri, 25 Oct 2024 23:58:18 -0400 Subject: [PATCH 192/306] writing back dirty cache lines should be done in one burst if both halves are dirty --- src/CP15.cpp | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/src/CP15.cpp b/src/CP15.cpp index c9b809d5..8f3f6fb3 100644 --- a/src/CP15.cpp +++ b/src/CP15.cpp @@ -1091,12 +1091,20 @@ void ARMv5::DCacheClearByASetAndWay(const u8 cacheSet, const u8 cacheLine) u8 cycless = NDS.ARM9MemTimings[tag>>14][3]; if ((tag >> 24) == 0x02) cycless = (cycless - 2) | 0x80; - WriteBufferWrite(tag+0x10, 3, 0); + if (DCacheTags[index] & CACHE_FLAG_DIRTY_LOWERHALF) + { + cyclesn = cycless; // write back is done in one burst if both halves are dirty + } + else + { + WriteBufferWrite(tag+0x10, 3, 0); + NDS.ARM9Timestamp += 1; + } WriteBufferWrite(ptr[4], 2, cyclesn, tag+0x10); WriteBufferWrite(ptr[5], 2, cycless, tag+0x14); WriteBufferWrite(ptr[6], 2, cycless, tag+0x18); WriteBufferWrite(ptr[7], 2, cycless, tag+0x1C); - 
NDS.ARM9Timestamp += 5; //DataCycles += 5; CHECKME: does this function like a write does but with mcr? + NDS.ARM9Timestamp += 4; } DCacheTags[index] &= ~(CACHE_FLAG_DIRTY_LOWERHALF | CACHE_FLAG_DIRTY_UPPERHALF); #endif @@ -2274,7 +2282,7 @@ bool ARMv5::DataRead32S(u32 addr, u32* val) if (NDS.ARM9Timestamp < time) NDS.ARM9Timestamp = time; } - if (PU_Map[addr>>12] & 0x30) + if (PU_Map[addr>>12] & 0x30) // checkme WriteBufferDrain(); NDS.ARM9Timestamp += DataCycles; From ba904b4d81cb73f010c211d49ace07e2451c8267 Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Sat, 26 Oct 2024 11:20:55 -0400 Subject: [PATCH 193/306] redo cache streaming disable implementation --- src/CP15.cpp | 133 +++++++++++++++++++++++++++------------------------ 1 file changed, 70 insertions(+), 63 deletions(-) diff --git a/src/CP15.cpp b/src/CP15.cpp index 8f3f6fb3..301a010d 100644 --- a/src/CP15.cpp +++ b/src/CP15.cpp @@ -410,17 +410,6 @@ u32 ARMv5::ICacheLookup(const u32 addr) #endif { u32 *cacheLine = (u32 *)&ICache[(id+set) << ICACHE_LINELENGTH_LOG2]; - if (CP15BISTTestStateRegister & CP15_BIST_TR_DISABLE_ICACHE_STREAMING) [[unlikely]] - { - // Disabled ICACHE Streaming: - // retreive the data from memory, even if the data was cached - // See arm946e-s Rev 1 technical manual, 2.3.15 "Register 15, test State Register") - WriteBufferDrain(); - CodeCycles = NDS.ARM9MemTimings[tag >> 14][2]; - { - return NDS.ARM9Read32(addr & ~3); - } - } if (ICacheFillPtr == 7) NDS.ARM9Timestamp++; else @@ -505,11 +494,8 @@ u32 ARMv5::ICacheLookup(const u32 addr) } ICacheTags[line] = tag | (line & (ICACHE_SETS-1)) | CACHE_FLAG_VALID; - - // ouch :/ - //printf("cache miss %08X: %d/%d\n", addr, NDS::ARM9MemTimings[addr >> 14][2], NDS::ARM9MemTimings[addr >> 14][3]); - // first N32 remaining S32 - + + // timing logic NDS.ARM9Timestamp = NDS.ARM9Timestamp + ((1<> 24) == 0x02) @@ -519,22 +505,32 @@ u32 ARMv5::ICacheLookup(const u32 addr) else if (NDS.ARM9Regions[addr>>14] == 
DataRegion && Store) NDS.ARM9Timestamp += (1<>14][1]; - u8 seq = MemTimings[addr>>14][2] + 1; - - u8 linepos = (addr & 0x1F) >> 2; // technically this is one too low, but we want that actually - - u32 cycles = ns + (seq * linepos); - NDS.ARM9Timestamp = cycles += NDS.ARM9Timestamp; - if (NDS.ARM9Timestamp < TimestampActual) NDS.ARM9Timestamp = TimestampActual; - - ICacheFillPtr = linepos; - for (int i = linepos; i < 7; i++) + // Disabled ICACHE Streaming: + // Wait until the entire cache line is filled before continuing with execution + if (CP15BISTTestStateRegister & CP15_BIST_TR_DISABLE_ICACHE_STREAMING) [[unlikely]] { - cycles += seq; - ICacheFillTimes[i] = cycles; + NDS.ARM9Timestamp += MemTimings[tag >> 14][1] + ((MemTimings[tag >> 14][2] + 1) * ((DCACHE_LINELENGTH / 4) - 1)); + if (NDS.ARM9Timestamp < TimestampActual) NDS.ARM9Timestamp = TimestampActual; + } + else // ICache Streaming logic + { + u8 ns = MemTimings[addr>>14][1]; + u8 seq = MemTimings[addr>>14][2] + 1; + + u8 linepos = (addr & 0x1F) >> 2; // technically this is one too low, but we want that actually + + u32 cycles = ns + (seq * linepos); + NDS.ARM9Timestamp = cycles += NDS.ARM9Timestamp; + if (NDS.ARM9Timestamp < TimestampActual) NDS.ARM9Timestamp = TimestampActual; + + ICacheFillPtr = linepos; + for (int i = linepos; i < 7; i++) + { + cycles += seq; + ICacheFillTimes[i] = cycles; + } + if ((addr >> 24) == 0x02) MainRAMTimestamp = ICacheFillTimes[6]; } - if ((addr >> 24) == 0x02) MainRAMTimestamp = ICacheFillTimes[6]; DataRegion = Mem9_Null; return ptr[(addr & (ICACHE_LINELENGTH-1)) >> 2]; @@ -626,15 +622,6 @@ u32 ARMv5::DCacheLookup(const u32 addr) #endif { u32 *cacheLine = (u32 *)&DCache[(id+set) << DCACHE_LINELENGTH_LOG2]; - if (CP15BISTTestStateRegister & CP15_BIST_TR_DISABLE_DCACHE_STREAMING) [[unlikely]] - { - // Disabled DCACHE Streaming: - // retreive the data from memory, even if the data was cached - // See arm946e-s Rev 1 technical manual, 2.3.15 "Register 15, test State Register") 
- WriteBufferDrain(); - DataCycles = NDS.ARM9MemTimings[tag >> 14][2]; - return BusRead32(addr & ~3); - } NDS.ARM9Timestamp += DataCycles; DataCycles = 0; @@ -702,14 +689,14 @@ u32 ARMv5::DCacheLookup(const u32 addr) u32* ptr = (u32 *)&DCache[line << DCACHE_LINELENGTH_LOG2]; NDS.ARM9Timestamp += DataCycles; + + WriteBufferDrain(); // checkme? //DataCycles = 0; #if !DISABLE_CACHEWRITEBACK // Before we fill the cacheline, we need to write back dirty content // Datacycles will be incremented by the required cycles to do so DCacheClearByASetAndWay(line & (DCACHE_SETS-1), line >> DCACHE_SETS_LOG2); #endif - - WriteBufferDrain(); // checkme? for (int i = 0; i < DCACHE_LINELENGTH; i+=sizeof(u32)) { @@ -718,36 +705,54 @@ u32 ARMv5::DCacheLookup(const u32 addr) DCacheTags[line] = tag | (line & (DCACHE_SETS-1)) | CACHE_FLAG_VALID; - if ((addr >> 24) == 0x02) + // timing logic + + // Disabled DCACHE Streaming: + // Wait until the entire cache line is filled before continuing with execution + if (CP15BISTTestStateRegister & CP15_BIST_TR_DISABLE_DCACHE_STREAMING) [[unlikely]] { - if (NDS.ARM9Timestamp < MainRAMTimestamp) NDS.ARM9Timestamp = MainRAMTimestamp; + NDS.ARM9Timestamp = NDS.ARM9Timestamp + ((1<> 14][1] + ((MemTimings[tag >> 14][2] + 1) * ((DCACHE_LINELENGTH / 4) - 2)); + DataCycles = MemTimings[tag>>14][2] + 1; + + if ((addr >> 24) == 0x02) + { + if (NDS.ARM9Timestamp < MainRAMTimestamp) NDS.ARM9Timestamp = MainRAMTimestamp; + MainRAMTimestamp = NDS.ARM9Timestamp + DataCycles; + DataRegion = Mem9_MainRAM; + } + else DataRegion = NDS.ARM9Regions[addr>>14]; } + else // DCache Streaming logic + { + if ((addr >> 24) == 0x02) + { + if (NDS.ARM9Timestamp < MainRAMTimestamp) NDS.ARM9Timestamp = MainRAMTimestamp; + } - NDS.ARM9Timestamp = NDS.ARM9Timestamp + ((1<>14][1]; - u8 seq = MemTimings[addr>>14][2] + 1; + u8 ns = MemTimings[addr>>14][1]; + u8 seq = MemTimings[addr>>14][2] + 1; - u8 linepos = (addr & 0x1F) >> 2; // technically this is one too low, but we want that 
actually + u8 linepos = (addr & 0x1F) >> 2; // technically this is one too low, but we want that actually - u32 cycles = ns + (seq * linepos); - DataCycles = cycles; + u32 cycles = ns + (seq * linepos); + DataCycles = cycles; - cycles += NDS.ARM9Timestamp; + cycles += NDS.ARM9Timestamp; - DCacheFillPtr = linepos; - for (int i = linepos; i < 7; i++) - { - cycles += seq; - DCacheFillTimes[i] = cycles; + DCacheFillPtr = linepos; + for (int i = linepos; i < 7; i++) + { + cycles += seq; + DCacheFillTimes[i] = cycles; + } + if ((addr >> 24) == 0x02) MainRAMTimestamp = DCacheFillTimes[6]; + + DataRegion = NDS.ARM9Regions[addr>>14]; } - if ((addr >> 24) == 0x02) MainRAMTimestamp = DCacheFillTimes[6]; - - DataRegion = NDS.ARM9Regions[addr>>14]; - - //NDS.ARM9Timestamp += ((NDS.ARM9MemTimings[tag >> 14][2] + (NDS.ARM9MemTimings[tag >> 14][3] * ((DCACHE_LINELENGTH / 4) - 2)) - 1) << NDS.ARM9ClockShift) + 1; - //DataCycles = NDS.ARM9MemTimings[tag>>14][3] << NDS.ARM9ClockShift; - return ptr[(addr & (DCACHE_LINELENGTH-1)) >> 2]; } @@ -1124,6 +1129,8 @@ inline bool ARMv5::WriteBufferHandle() bool mainram = (WBCurCycles >= 0x80); u64 ts; + u64 mrts = (MainRAMTimestamp + ((1<> NDS.ARM9ClockShift; + if (WBMainRAMDelay < mrts) WBMainRAMDelay = mrts; if (mainram) ts = std::max(WBTimestamp, WBMainRAMDelay) + (WBCurCycles & 0x7F); else ts = WBTimestamp + (WBCurCycles & 0x7F); @@ -1206,7 +1213,7 @@ void ARMv5::WriteBufferWrite(u32 val, u8 flag, u8 cycles, u32 addr) else if (WBWritePointer == 16) // indicates empty write buffer { WBWritePointer = 0; - WBTimestamp = (((NDS.ARM9Timestamp + DataCycles + 1) + ((1<> NDS.ARM9ClockShift; + WBTimestamp = ((NDS.ARM9Timestamp + DataCycles + 1) + ((1<> NDS.ARM9ClockShift; } WriteBufferFifo[WBFillPointer] = val | (u64)flag << 62; From ca674b63724f514e1553612111e76732d440f63f Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Sat, 26 Oct 2024 18:42:31 -0400 Subject: [PATCH 194/306] "fix" icache linefill disable 
timings --- src/ARM.cpp | 2 ++ src/CP15.cpp | 74 +++++++++++++++++++++++++++++++++++++--------------- 2 files changed, 55 insertions(+), 21 deletions(-) diff --git a/src/ARM.cpp b/src/ARM.cpp index fe01ea4f..4de2e8ad 100644 --- a/src/ARM.cpp +++ b/src/ARM.cpp @@ -323,6 +323,8 @@ void ARMv5::JumpTo(u32 addr, bool restorecpsr) //if (addr == 0x0201764C) printf("capture test %d: R1=%08X\n", R[6], R[1]); //if (addr == 0x020175D8) printf("capture test %d: res=%08X\n", R[6], R[0]); + // jumps count as nonsequential accesses on the instruction bus on the arm9 + // thus it requires waiting for the current ICache line fill to complete before continuing if (ICacheFillPtr != 7) { u64 fillend = ICacheFillTimes[6] + 1; diff --git a/src/CP15.cpp b/src/CP15.cpp index 301a010d..ea3c5f46 100644 --- a/src/CP15.cpp +++ b/src/CP15.cpp @@ -441,11 +441,33 @@ u32 ARMv5::ICacheLookup(const u32 addr) // BIST test State register (See arm946e-s Rev 1 technical manual, 2.3.15 "Register 15, test State Register") if (CP15BISTTestStateRegister & CP15_BIST_TR_DISABLE_ICACHE_LINEFILL) [[unlikely]] { + u8 cycles = MemTimings[addr >> 14][1]; + WriteBufferDrain(); - CodeCycles = NDS.ARM9MemTimings[tag >> 14][2]; + + NDS.ARM9Timestamp = NDS.ARM9Timestamp + ((1<> 24) == 0x02) { - return NDS.ARM9Read32(addr & ~3); - } + if (NDS.ARM9Timestamp < MainRAMTimestamp) NDS.ARM9Timestamp = MainRAMTimestamp + ((1<>14] == DataRegion && Store) NDS.ARM9Timestamp += (1<> 14][2]; + + NDS.ARM9Timestamp = (NDS.ARM9Timestamp + ((1<> 14][1]; // CHECKME: can this do sequential accesses? + + if ((addr >> 24) == 0x02) + { + if (NDS.ARM9Timestamp < MainRAMTimestamp) NDS.ARM9Timestamp = (MainRAMTimestamp + ((1<>14]; + return BusRead32(addr & ~3); } @@ -688,10 +728,8 @@ u32 ARMv5::DCacheLookup(const u32 addr) u32* ptr = (u32 *)&DCache[line << DCACHE_LINELENGTH_LOG2]; - NDS.ARM9Timestamp += DataCycles; - WriteBufferDrain(); // checkme? 
- //DataCycles = 0; + #if !DISABLE_CACHEWRITEBACK // Before we fill the cacheline, we need to write back dirty content // Datacycles will be incremented by the required cycles to do so @@ -2048,7 +2086,6 @@ bool ARMv5::DataRead8(u32 addr, u32* val) { if (IsAddressDCachable(addr)) { - DataCycles = 0; *val = (DCacheLookup(addr) >> (8 * (addr & 3))) & 0xff; return true; } @@ -2125,7 +2162,6 @@ bool ARMv5::DataRead16(u32 addr, u32* val) { if (IsAddressDCachable(addr)) { - DataCycles = 0; *val = (DCacheLookup(addr) >> (8* (addr & 2))) & 0xffff; return true; } @@ -2202,7 +2238,6 @@ bool ARMv5::DataRead32(u32 addr, u32* val) { if (IsAddressDCachable(addr)) { - DataCycles = 0; *val = DCacheLookup(addr); return true; } @@ -2239,11 +2274,12 @@ bool ARMv5::DataRead32(u32 addr, u32* val) bool ARMv5::DataRead32S(u32 addr, u32* val) { + NDS.ARM9Timestamp += DataCycles; + // Data Aborts // Exception is handled in the actual instruction implementation if (!(PU_Map[addr>>12] & 0x01)) [[unlikely]] { - NDS.ARM9Timestamp += DataCycles; DataCycles = 1; return false; } @@ -2252,7 +2288,6 @@ bool ARMv5::DataRead32S(u32 addr, u32* val) if (addr < ITCMSize) { - NDS.ARM9Timestamp += DataCycles; DataCycles = 1; // we update the timestamp during the actual function, as a sequential itcm access can only occur during instructions with strange itcm wait cycles DataRegion = Mem9_ITCM; @@ -2261,7 +2296,6 @@ bool ARMv5::DataRead32S(u32 addr, u32* val) } if ((addr & DTCMMask) == DTCMBase) { - NDS.ARM9Timestamp += DataCycles; DataCycles = 1; DataRegion = Mem9_DTCM; *val = *(u32*)&DTCM[addr & (DTCMPhysicalSize - 1)]; @@ -2292,8 +2326,6 @@ bool ARMv5::DataRead32S(u32 addr, u32* val) if (PU_Map[addr>>12] & 0x30) // checkme WriteBufferDrain(); - NDS.ARM9Timestamp += DataCycles; - // bursts cannot cross a 1kb boundary if (addr & 0x3FF) DataCycles = MemTimings[addr >> 12][2]; //s else DataCycles = MemTimings[addr >> 12][1]; // ns @@ -2381,7 +2413,7 @@ bool ARMv5::DataWrite8(u32 addr, u8 val) if 
(NDS.ARM9Timestamp < MainRAMTimestamp) NDS.ARM9Timestamp = (MainRAMTimestamp + ((1<>14]; @@ -2475,7 +2507,7 @@ bool ARMv5::DataWrite16(u32 addr, u16 val) if (NDS.ARM9Timestamp < MainRAMTimestamp) NDS.ARM9Timestamp = (MainRAMTimestamp + ((1<>14]; @@ -2570,7 +2602,7 @@ bool ARMv5::DataWrite32(u32 addr, u32 val) if (NDS.ARM9Timestamp < MainRAMTimestamp) NDS.ARM9Timestamp = (MainRAMTimestamp + ((1<>14]; From d88b46e6d974ccc9d7c4f434c7ebb4ed32b47b55 Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Sun, 27 Oct 2024 14:30:29 -0400 Subject: [PATCH 195/306] rework and fix bursts --- src/CP15.cpp | 95 ++++++++++++++++++++++++++++++++++------------------ 1 file changed, 63 insertions(+), 32 deletions(-) diff --git a/src/CP15.cpp b/src/CP15.cpp index ea3c5f46..a3365674 100644 --- a/src/CP15.cpp +++ b/src/CP15.cpp @@ -840,7 +840,7 @@ bool ARMv5::DCacheWrite32(const u32 addr, const u32 val) { u32 *cacheLine = (u32 *)&DCache[(id+set) << DCACHE_LINELENGTH_LOG2]; cacheLine[(addr & (DCACHE_LINELENGTH-1)) >> 2] = val; - DataCycles += 1; + DataCycles = 1; DataRegion = Mem9_DCache; #if !DISABLE_CACHEWRITEBACK if (PU_Map[addr >> CP15_MAP_ENTRYSIZE_LOG2] & CP15_MAP_DCACHEWRITEBACK) @@ -2326,20 +2326,34 @@ bool ARMv5::DataRead32S(u32 addr, u32* val) if (PU_Map[addr>>12] & 0x30) // checkme WriteBufferDrain(); - // bursts cannot cross a 1kb boundary - if (addr & 0x3FF) DataCycles = MemTimings[addr >> 12][2]; //s - else DataCycles = MemTimings[addr >> 12][1]; // ns - - DataCycles += (((NDS.ARM9Timestamp + DataCycles) + ((1<> 24) == 0x02) + // bursts cannot cross a 1kb boundary + if (addr & 0x3FF) // s { - if ((NDS.ARM9Timestamp + DataCycles) < MainRAMTimestamp) DataCycles = ((MainRAMTimestamp - NDS.ARM9Timestamp) + ((1<>14][2]; + + if ((addr >> 24) == 0x02) + { + MainRAMTimestamp = NDS.ARM9Timestamp + DataCycles; + if (NDS.ARM9ClockShift == 2) MainRAMTimestamp += 4; + DataRegion = Mem9_MainRAM; + } + else DataRegion = NDS.ARM9Regions[addr>>14]; + } + 
else // ns + { + DataCycles = MemTimings[addr>>14][1]; + + if ((addr >> 24) == 0x02) + { + if (NDS.ARM9Timestamp < MainRAMTimestamp) NDS.ARM9Timestamp = (MainRAMTimestamp + ((1<>14]; } - else DataRegion = NDS.ARM9Regions[addr>>14]; *val = BusRead32(addr); return true; @@ -2576,7 +2590,6 @@ bool ARMv5::DataWrite32(u32 addr, u32 val) { if (IsAddressDCachable(addr)) { - DataCycles = 0; if (DCacheWrite32(addr, val)) return true; } @@ -2605,7 +2618,7 @@ bool ARMv5::DataWrite32(u32 addr, u32 val) DataCycles -= 2<>14]; - + BusWrite32(addr, val); } else @@ -2628,11 +2641,13 @@ bool ARMv5::DataWrite32(u32 addr, u32 val) bool ARMv5::DataWrite32S(u32 addr, u32 val) { + NDS.ARM9Timestamp += DataCycles; + // Data Aborts // Exception is handled in the actual instruction implementation if (!(PU_Map[addr>>12] & 0x02)) [[unlikely]] { - DataCycles += 1; + DataCycles = 1; return false; } @@ -2640,7 +2655,7 @@ bool ARMv5::DataWrite32S(u32 addr, u32 val) if (addr < ITCMSize) { - DataCycles += 1; + DataCycles = 1; // we update the timestamp during the actual function, as a sequential itcm access can only occur during instructions with strange itcm wait cycles DataRegion = Mem9_ITCM; *(u32*)&ITCM[addr & (ITCMPhysicalSize - 1)] = val; @@ -2651,7 +2666,7 @@ bool ARMv5::DataWrite32S(u32 addr, u32 val) } if ((addr & DTCMMask) == DTCMBase) { - DataCycles += 1; + DataCycles = 1; DataRegion = Mem9_DTCM; *(u32*)&DTCM[addr & (DTCMPhysicalSize - 1)] = val; return true; @@ -2678,26 +2693,42 @@ bool ARMv5::DataWrite32S(u32 addr, u32 val) if (NDS.ARM9Timestamp < time) NDS.ARM9Timestamp = time; } - if (!(PU_Map[addr>>12] & 0x30)) + if (!(PU_Map[addr>>12] & 0x30)) // non-bufferable { - DataCycles += (((NDS.ARM9Timestamp + DataCycles) + ((1<> 12][2]; //s - else DataCycles += MemTimings[addr >> 12][1]; // ns - - if ((addr >> 24) == 0x02) + if (addr & 0x3FF) // s { - if ((NDS.ARM9Timestamp + DataCycles) < MainRAMTimestamp) DataCycles = ((MainRAMTimestamp - NDS.ARM9Timestamp) + ((1<>14][2]; + + if ((addr 
>> 24) == 0x02) + { + MainRAMTimestamp = NDS.ARM9Timestamp + DataCycles; + MainRAMTimestamp += 2<>14]; + + // burst stores seem to process the extra delay cycles at the end of the burst + // this means that we end up *always* able to begin code fetches 3 cycles early when accessing the bus + // this is a weird way of implemeting this but it should work fine....? + NDS.ARM9Timestamp -= 3<>14]; - + else // ns + { + DataCycles = MemTimings[addr>>14][1]; + + if ((addr >> 24) == 0x02) + { + if (NDS.ARM9Timestamp < MainRAMTimestamp) NDS.ARM9Timestamp = (MainRAMTimestamp + ((1<>14]; + } + BusWrite32(addr, val); - DataCycles += MemTimings[addr >> 14][2]; } else { From ce55f29d9d2cc9eb63669472f864cbeb80500cd7 Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Mon, 28 Oct 2024 12:47:05 -0400 Subject: [PATCH 196/306] loads to r15 force an interlock --- src/ARMInterpreter_LoadStore.cpp | 19 +++++++++++++++---- src/CP15.cpp | 2 +- 2 files changed, 16 insertions(+), 5 deletions(-) diff --git a/src/ARMInterpreter_LoadStore.cpp b/src/ARMInterpreter_LoadStore.cpp index adbd1121..3e3841de 100644 --- a/src/ARMInterpreter_LoadStore.cpp +++ b/src/ARMInterpreter_LoadStore.cpp @@ -123,6 +123,8 @@ void LoadSingle(ARM* cpu, u8 rd, u8 rn, s32 offset, u16 ilmask) if (rd == 15) { if (cpu->Num==1 || (((ARMv5*)cpu)->CP15Control & (1<<15))) val &= ~0x1; + if (cpu->Num==0) cpu->NDS.ARM9Timestamp = ((ARMv5*)cpu)->TimestampActual + ((size<32) || (signror && (addr&0x3))); // force an interlock + cpu->JumpTo(val); } else @@ -322,7 +324,9 @@ A_IMPLEMENT_WB_LDRSTR(LDRB) if (dabort) { \ ((ARMv5*)cpu)->DataAbort(); \ return; } \ - if (r == 14) cpu->JumpTo(((((ARMv5*)cpu)->CP15Control & (1<<15)) ? (val & ~0x1) : val), cpu->CurInstr & (1<<22)); /* restores cpsr presumably due to shared dna with ldm */ \ + if (r+1 == 15) { \ + if (cpu->Num==0) cpu->NDS.ARM9Timestamp = ((ARMv5*)cpu)->TimestampActual; \ + cpu->JumpTo(((((ARMv5*)cpu)->CP15Control & (1<<15)) ? 
(val & ~0x1) : val), cpu->CurInstr & (1<<22)); } /* restores cpsr presumably due to shared dna with ldm */ \ else { \ cpu->R[r+1] = val; \ if (cpu->Num == 0) { \ @@ -343,7 +347,9 @@ A_IMPLEMENT_WB_LDRSTR(LDRB) if (dabort) { \ ((ARMv5*)cpu)->DataAbort(); \ return; } \ - if (r == 14) cpu->JumpTo(((((ARMv5*)cpu)->CP15Control & (1<<15)) ? (val & ~0x1) : val), cpu->CurInstr & (1<<22)); /* restores cpsr presumably due to shared dna with ldm */ \ + if (r+1 == 15) { \ + if (cpu->Num==0) cpu->NDS.ARM9Timestamp = ((ARMv5*)cpu)->TimestampActual; \ + cpu->JumpTo(((((ARMv5*)cpu)->CP15Control & (1<<15)) ? (val & ~0x1) : val), cpu->CurInstr & (1<<22)); } /* restores cpsr presumably due to shared dna with ldm */ \ else { \ cpu->R[r+1] = val; \ if (cpu->Num == 0) { \ @@ -359,7 +365,7 @@ A_IMPLEMENT_WB_LDRSTR(LDRB) ExecuteStage(cpu, ilmask | (1 << ((cpu->CurInstr>>16) & 0xF))); \ ((ARMv5*)cpu)->HandleInterlocksMemory(r); \ bool dabort = !cpu->DataWrite32(offset, cpu->R[r]); /* yes, this data abort behavior is on purpose */ \ - u32 storeval = cpu->R[r+1]; if (r == 14) storeval+=4; \ + u32 storeval = cpu->R[r+1]; if (r+1 == 15) storeval+=4; \ dabort |= !cpu->DataWrite32S (offset+4, storeval); /* no, i dont understand it either */ \ if (cpu->DataRegion == Mem9_ITCM) cpu->NDS.ARM9Timestamp += 2; \ cpu->AddCycles_CD(); \ @@ -376,7 +382,7 @@ A_IMPLEMENT_WB_LDRSTR(LDRB) ExecuteStage(cpu, ilmask | (1 << ((cpu->CurInstr>>16) & 0xF))); \ ((ARMv5*)cpu)->HandleInterlocksMemory(r); \ bool dabort = !cpu->DataWrite32(addr, cpu->R[r]); \ - u32 storeval = cpu->R[r+1]; if (r == 14) storeval+=4; \ + u32 storeval = cpu->R[r+1]; if (r+1 == 15) storeval+=4; \ dabort |= !cpu->DataWrite32S (addr+4, storeval); \ if (cpu->DataRegion == Mem9_ITCM) cpu->NDS.ARM9Timestamp += 2; \ cpu->AddCycles_CD(); \ @@ -666,7 +672,10 @@ void A_LDM(ARM* cpu) // jump if pc got written if (cpu->CurInstr & (1<<15)) + { + if (cpu->Num==0) cpu->NDS.ARM9Timestamp = ((ARMv5*)cpu)->TimestampActual; // force an interlock 
cpu->JumpTo(pc, cpu->CurInstr & (1<<22)); + } else if (cpu->Num == 0) { u8 lastreg = 31 - __builtin_clz(cpu->CurInstr & 0x7FFF); @@ -1010,6 +1019,8 @@ void T_POP(ARM* cpu) if (!dabort) [[likely]] { if (cpu->Num==1 || (((ARMv5*)cpu)->CP15Control & (1<<15))) pc |= 0x1; + if (cpu->Num==0) cpu->NDS.ARM9Timestamp = ((ARMv5*)cpu)->TimestampActual; // force an interlock + cpu->JumpTo(pc); base += 4; } diff --git a/src/CP15.cpp b/src/CP15.cpp index a3365674..2d31ed71 100644 --- a/src/CP15.cpp +++ b/src/CP15.cpp @@ -2710,7 +2710,7 @@ bool ARMv5::DataWrite32S(u32 addr, u32 val) // burst stores seem to process the extra delay cycles at the end of the burst // this means that we end up *always* able to begin code fetches 3 cycles early when accessing the bus - // this is a weird way of implemeting this but it should work fine....? + // this is a weird way of implementing this but it should work fine....? NDS.ARM9Timestamp -= 3< Date: Tue, 29 Oct 2024 19:56:18 -0400 Subject: [PATCH 197/306] code reads should trigger an edge case with dcache streaming also itcm and icache behave similarly with itcm fetches and apparently i forgot to commit the fix to stm too oops-- --- src/ARM.h | 1 + src/ARMInterpreter_LoadStore.cpp | 3 --- src/CP15.cpp | 22 +++++++++++++++++++--- 3 files changed, 20 insertions(+), 6 deletions(-) diff --git a/src/ARM.h b/src/ARM.h index 657e2069..8795d4c5 100644 --- a/src/ARM.h +++ b/src/ARM.h @@ -281,6 +281,7 @@ public: void AddCycles_CD() override { + Store = true; AddCycles_MW(DataCycles); DataCycles = 0; } diff --git a/src/ARMInterpreter_LoadStore.cpp b/src/ARMInterpreter_LoadStore.cpp index 3e3841de..c5b25eb2 100644 --- a/src/ARMInterpreter_LoadStore.cpp +++ b/src/ARMInterpreter_LoadStore.cpp @@ -160,10 +160,7 @@ void StoreSingle(ARM* cpu, u8 rd, u8 rn, s32 offset, u16 ilmask) } if (cpu->Num == 0) - { ((ARMv5*)cpu)->HandleInterlocksMemory(rd); - ((ARMv5*)cpu)->Store = true; - } bool dabort; if constexpr (size == 8) dabort = !cpu->DataWrite8 (addr, 
storeval); diff --git a/src/CP15.cpp b/src/CP15.cpp index 2d31ed71..0145e9df 100644 --- a/src/CP15.cpp +++ b/src/CP15.cpp @@ -411,7 +411,11 @@ u32 ARMv5::ICacheLookup(const u32 addr) { u32 *cacheLine = (u32 *)&ICache[(id+set) << ICACHE_LINELENGTH_LOG2]; - if (ICacheFillPtr == 7) NDS.ARM9Timestamp++; + if (ICacheFillPtr == 7) + { + if (NDS.ARM9Timestamp < ITCMTimestamp) NDS.ARM9Timestamp = ITCMTimestamp; // does this apply to streamed fetches? + NDS.ARM9Timestamp++; + } else { u64 nextfill = ICacheFillTimes[ICacheFillPtr++]; @@ -423,7 +427,11 @@ u32 ARMv5::ICacheLookup(const u32 addr) { u64 fillend = ICacheFillTimes[6] + 2; if (NDS.ARM9Timestamp < fillend) NDS.ARM9Timestamp = fillend; - else NDS.ARM9Timestamp++; + else // checkme + { + if (NDS.ARM9Timestamp < ITCMTimestamp) NDS.ARM9Timestamp = ITCMTimestamp; + NDS.ARM9Timestamp++; + } ICacheFillPtr = 7; } } @@ -2014,6 +2022,13 @@ u32 ARMv5::CodeRead32(u32 addr, bool branch) } #endif } + + // bus reads can only overlap with dcache streaming by 6 cycles + if (DCacheFillPtr != 7) + { + u64 time = DCacheFillTimes[6] - 6; // checkme: minus 6? + if (NDS.ARM9Timestamp < time) NDS.ARM9Timestamp = time; + } u8 cycles = MemTimings[addr >> 14][1]; @@ -2093,7 +2108,7 @@ bool ARMv5::DataRead8(u32 addr, u32* val) #endif // bus reads can only overlap with icache streaming by 6 cycles - // checkme: does cache trigger this? + // checkme: does dcache trigger this? if (ICacheFillPtr != 7) { u64 time = ICacheFillTimes[6] - 6; // checkme: minus 6? 
@@ -2695,6 +2710,7 @@ bool ARMv5::DataWrite32S(u32 addr, u32 val) if (!(PU_Map[addr>>12] & 0x30)) // non-bufferable { + NDS.ARM9Timestamp = (NDS.ARM9Timestamp + ((1< Date: Sat, 2 Nov 2024 00:15:38 -0400 Subject: [PATCH 198/306] fix 8 bit main ram write timing --- src/CP15.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/CP15.cpp b/src/CP15.cpp index 0145e9df..dd9057ac 100644 --- a/src/CP15.cpp +++ b/src/CP15.cpp @@ -282,7 +282,7 @@ void ARMv5::UpdatePURegion(const u32 n) "PU region %d: %08X-%08X, user=%02X priv=%02X, %08X/%08X\n", n, start << CP15_MAP_ENTRYSIZE_LOG2, - end << CP15_MAP_ENTRYSIZE_LOG2 - 1, + (end << CP15_MAP_ENTRYSIZE_LOG2) - 1, usermask, privmask, PU_DataRW, @@ -2442,7 +2442,7 @@ bool ARMv5::DataWrite8(u32 addr, u8 val) if (NDS.ARM9Timestamp < MainRAMTimestamp) NDS.ARM9Timestamp = (MainRAMTimestamp + ((1<>14]; From ca1fb2bc9e5a1f32aef2ee54cc9b60b54cdf774b Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Sat, 2 Nov 2024 23:33:44 -0400 Subject: [PATCH 199/306] write buffer mk3 now with actually passing some hardware tests included! 
--- src/ARM.cpp | 5 +- src/ARM.h | 15 ++- src/CP15.cpp | 265 +++++++++++++++++++++++++++++++-------------------- 3 files changed, 169 insertions(+), 116 deletions(-) diff --git a/src/ARM.cpp b/src/ARM.cpp index 4de2e8ad..33131713 100644 --- a/src/ARM.cpp +++ b/src/ARM.cpp @@ -214,7 +214,6 @@ void ARMv5::Reset() WBFillPointer = 0; WBDelay = 0; WBWriting = false; - WBMainRAMDelay = 0; ARM::Reset(); } @@ -618,7 +617,7 @@ void ARMv5::Execute() else { NDS.ARM9Timestamp = NDS.ARM9Target; - WriteBufferCheck(); + WriteBufferCheck(); return; } } @@ -745,7 +744,7 @@ void ARMv5::Execute() //NDS.ARM9Timestamp += Cycles; //Cycles = 0; } - WriteBufferCheck(); + WriteBufferCheck(); if (Halted == 2) Halted = 0; diff --git a/src/ARM.h b/src/ARM.h index 8795d4c5..1a5e6ca5 100644 --- a/src/ARM.h +++ b/src/ARM.h @@ -393,8 +393,8 @@ public: void ICacheInvalidateAll(); template inline bool WriteBufferHandle(); - void WriteBufferCheck(); - void WriteBufferWrite(u32 val, u8 flag, u8 cycles, u32 addr = 0); + template void WriteBufferCheck(); + void WriteBufferWrite(u32 val, u8 flag, u32 addr = 0); void WriteBufferDrain(); /** @@ -682,15 +682,14 @@ public: u8 WBWritePointer; // which entry to attempt to write next; should always be ANDed with 0xF after incrementing u8 WBFillPointer; // where the next entry should be added; should always be ANDed with 0xF after incrementing bool WBWriting; // whether the buffer is actively trying to perform a write - u8 WBCurCycles; // how long the current write will take; bit 7 is a flag used to indicate main ram - u64 WBCurVal; // current value being written; 0-31: val | 62-32: flag; 0 = byte; 1 = halfword; 2 = word; 3 = address (invalid in this variable) u32 WBCurAddr; // address the write buffer is currently writing to + u64 WBCurVal; // current value being written; 0-31: val | 61-63: flag; 0 = byte ns; 1 = halfword ns; 2 = word ns; 3 = word s; 4 = address (invalid in this variable) u32 storeaddr[16]; // temp until i figure out why using the fifo address 
entries directly didn't work - u8 WBCycles[16]; // num cycles for each write; bit 7 is a flag used to indicate main ram - u64 WriteBufferFifo[16]; // 0-31: val | 62-32: flag; 0 = byte; 1 = halfword; 2 = word; 3 = address - u64 WBTimestamp; // current timestamp in bus cycles - u64 WBMainRAMDelay; // timestamp in bus cycles used to emulate the delay before the next main ram write can begin + u64 WriteBufferFifo[16]; // 0-31: val | 61-63: flag; 0 = byte ns; 1 = halfword ns; 2 = word ns; 3 = word s; 4 = address + u64 WBTimestamp; // current timestamp + //u64 WBMainRAMDelay; // timestamp used to emulate the delay before the next main ram write can begin u64 WBDelay; // timestamp in bus cycles use for the delay before next write to the write buffer can occur (seems to be a 1 cycle delay after a write to it) + u32 WBLastRegion; // the last region written to by the write buffer #ifdef GDBSTUB_ENABLED u32 ReadMem(u32 addr, int size) override; diff --git a/src/CP15.cpp b/src/CP15.cpp index dd9057ac..978d923d 100644 --- a/src/CP15.cpp +++ b/src/CP15.cpp @@ -334,21 +334,24 @@ void ARMv5::UpdateRegionTimings(u32 addrstart, u32 addrend) { MemTimings[i][0] = (bustimings[0] << NDS.ARM9ClockShift) - 1; MemTimings[i][1] = (bustimings[2] << NDS.ARM9ClockShift) - 1; - MemTimings[i][2] = (bustimings[3] << NDS.ARM9ClockShift) - 1; + MemTimings[i][2] = bustimings[3] << NDS.ARM9ClockShift; // sequentials technically should probably be -1 as well? + // but it doesn't really matter as long as i also dont force align the start of sequential accesses, now does it? } else { if (NDS.ARM9Regions[i] != Mem9_MainRAM) { + // 133MHz clock has 1 less bus cycle penalty on ns accesses MemTimings[i][0] = ((bustimings[0] - 1) << NDS.ARM9ClockShift) - 1; MemTimings[i][1] = ((bustimings[2] - 1) << NDS.ARM9ClockShift) - 1; } else { + // we handle the different timings for main ram in the read/write functions (they're slightly more complicated...) 
MemTimings[i][0] = (bustimings[0] << NDS.ARM9ClockShift) - 1; MemTimings[i][1] = (bustimings[2] << NDS.ARM9ClockShift) - 1; } - MemTimings[i][2] = (bustimings[3] << NDS.ARM9ClockShift) - 1; + MemTimings[i][2] = bustimings[3] << NDS.ARM9ClockShift; } } } @@ -1119,42 +1122,29 @@ void ARMv5::DCacheClearByASetAndWay(const u8 cacheSet, const u8 cacheLine) { if (WBDelay > NDS.ARM9Timestamp) NDS.ARM9Timestamp = WBDelay; - u8 cyclesn = NDS.ARM9MemTimings[tag>>14][2]; - if ((tag >> 24) == 0x02) cyclesn = (cyclesn - 2) | 0x80; - - u8 cycless = NDS.ARM9MemTimings[tag>>14][3]; - if ((tag >> 24) == 0x02) cycless = (cycless - 2) | 0x80; - - WriteBufferWrite(tag, 3, 0); - WriteBufferWrite(ptr[0], 2, cyclesn, tag+0x00); - WriteBufferWrite(ptr[1], 2, cycless, tag+0x04); - WriteBufferWrite(ptr[2], 2, cycless, tag+0x08); - WriteBufferWrite(ptr[3], 2, cycless, tag+0x0C); - NDS.ARM9Timestamp += 5; //DataCycles += 5; CHECKME: does this function like a write does but with mcr? + WriteBufferWrite(tag, 4); + WriteBufferWrite(ptr[0], 2, tag+0x00); + WriteBufferWrite(ptr[1], 3, tag+0x04); + WriteBufferWrite(ptr[2], 3, tag+0x08); + WriteBufferWrite(ptr[3], 3, tag+0x0C); + NDS.ARM9Timestamp += 4; //DataCycles += 5; CHECKME: does this function like a write does but with mcr? 
} if (DCacheTags[index] & CACHE_FLAG_DIRTY_UPPERHALF) // todo: check how this behaves when both fields need to be written { if (WBDelay > NDS.ARM9Timestamp) NDS.ARM9Timestamp = WBDelay; - u8 cyclesn = NDS.ARM9MemTimings[tag>>14][2]; - if ((tag >> 24) == 0x02) cyclesn = (cyclesn - 2) | 0x80; - - u8 cycless = NDS.ARM9MemTimings[tag>>14][3]; - if ((tag >> 24) == 0x02) cycless = (cycless - 2) | 0x80; - if (DCacheTags[index] & CACHE_FLAG_DIRTY_LOWERHALF) { - cyclesn = cycless; // write back is done in one burst if both halves are dirty + WriteBufferWrite(ptr[4], 3, tag+0x10); } else { - WriteBufferWrite(tag+0x10, 3, 0); - NDS.ARM9Timestamp += 1; + WriteBufferWrite(tag+0x10, 4); + WriteBufferWrite(ptr[4], 2, tag+0x10); } - WriteBufferWrite(ptr[4], 2, cyclesn, tag+0x10); - WriteBufferWrite(ptr[5], 2, cycless, tag+0x14); - WriteBufferWrite(ptr[6], 2, cycless, tag+0x18); - WriteBufferWrite(ptr[7], 2, cycless, tag+0x1C); + WriteBufferWrite(ptr[5], 3, tag+0x14); + WriteBufferWrite(ptr[6], 3, tag+0x18); + WriteBufferWrite(ptr[7], 3, tag+0x1C); NDS.ARM9Timestamp += 4; } DCacheTags[index] &= ~(CACHE_FLAG_DIRTY_LOWERHALF | CACHE_FLAG_DIRTY_UPPERHALF); @@ -1172,25 +1162,67 @@ inline bool ARMv5::WriteBufferHandle() // handle write buffer writes if (WBWriting) { - bool mainram = (WBCurCycles >= 0x80); - - u64 ts; - u64 mrts = (MainRAMTimestamp + ((1<> NDS.ARM9ClockShift; - if (WBMainRAMDelay < mrts) WBMainRAMDelay = mrts; - if (mainram) ts = std::max(WBTimestamp, WBMainRAMDelay) + (WBCurCycles & 0x7F); - else ts = WBTimestamp + (WBCurCycles & 0x7F); - - if (!force && ts > ((NDS.ARM9Timestamp + DataCycles) >> NDS.ARM9ClockShift)) return true; - if ( force && ts > ((NDS.ARM9Timestamp + DataCycles) >> NDS.ARM9ClockShift)) + // look up timings + u32 cycles; + switch (WBCurVal >> 61) { - NDS.ARM9Timestamp = ((ts - 1) << NDS.ARM9ClockShift) + 1; + case 0: + { + if (NDS.ARM9Regions[WBCurAddr>>14] == Mem9_MainRAM) + { + cycles = (4 << NDS.ARM9ClockShift) - 1; + } + else cycles = 
MemTimings[WBCurAddr>>14][0] - 6; // todo: twl timings + break; + } + case 1: + { + if (NDS.ARM9Regions[WBCurAddr>>14] == Mem9_MainRAM) + { + cycles = (3 << NDS.ARM9ClockShift) - 1; + } + else cycles = MemTimings[WBCurAddr>>14][0] - 6; // todo: twl timings + break; + } + case 2: + { + if (NDS.ARM9Regions[WBCurAddr>>14] == Mem9_MainRAM) + { + cycles = (4 << NDS.ARM9ClockShift) - 1; + } + else cycles = MemTimings[WBCurAddr>>14][1] - 6; // todo: twl timings + break; + } + case 3: + { + cycles = MemTimings[WBCurAddr>>14][2]; + break; + } + } + + // get the current timestamp + u64 ts; + if (NDS.ARM9Regions[WBCurAddr>>14] == Mem9_MainRAM) + ts = std::max(WBTimestamp, MainRAMTimestamp); + else + ts = WBTimestamp; + + ts = (ts + ((1< NDS.ARM9Timestamp) return true; + if ( force && ts > NDS.ARM9Timestamp) + { + NDS.ARM9Timestamp = ts; DataCycles = 0; // checkme } WBTimestamp = ts; - if (mainram) WBMainRAMDelay = WBTimestamp + 2; + if (NDS.ARM9Regions[WBCurAddr>>14] == Mem9_MainRAM) MainRAMTimestamp = ts + ((((WBCurVal >> 61) == 0) ? 4 : 5) << NDS.ARM9ClockShift); + else WBTimestamp += 2; // todo: twl timings - switch (WBCurVal >> 62) + switch (WBCurVal >> 61) { case 0: // byte BusWrite8 (WBCurAddr, WBCurVal); @@ -1199,13 +1231,15 @@ inline bool ARMv5::WriteBufferHandle() BusWrite16(WBCurAddr, WBCurVal); break; case 2: // word + case 3: BusWrite32(WBCurAddr, WBCurVal); break; - default: // address ie. invalid - Platform::Log(Platform::LogLevel::Warn, "WHY ARE WE TRYING TO WRITE AN ADDRESS VIA THE WRITE BUFFER! PANIC!!!\n", (u8)(WBCurVal >> 62)); + default: // invalid + Platform::Log(Platform::LogLevel::Warn, "WHY ARE WE TRYING TO WRITE AN ADDRESS VIA THE WRITE BUFFER! 
PANIC!!!\n", (u8)(WBCurVal >> 61)); break; } + WBLastRegion = NDS.ARM9Regions[WBCurAddr>>14]; //printf("writing: adr: %i, val: %lli, cyl: %i", WBCurAddr, WBCurVal, WBCurCycles); WBWriting = false; } @@ -1213,21 +1247,16 @@ inline bool ARMv5::WriteBufferHandle() // check if write buffer is empty if (WBWritePointer == 16) return true; // attempt to drain write buffer - if ((WriteBufferFifo[WBWritePointer] >> 62) != 3) // not an address + if ((WriteBufferFifo[WBWritePointer] >> 61) != 4) // not an address { - if (WBCycles[WBWritePointer] >= 0x80) // main ram handling + if (NDS.ARM9Regions[storeaddr[WBWritePointer]>>14] == Mem9_MainRAM) // main ram handling { - u64 ts = ((NDS.ARM9Timestamp + DataCycles) >> NDS.ARM9ClockShift); - if (!force && (WBMainRAMDelay > ts)) return true; - if ( force && (WBMainRAMDelay > ts)) - { - NDS.ARM9Timestamp = ((WBMainRAMDelay - 1) << NDS.ARM9ClockShift) + 1; - DataCycles = 0; - } + if (!force && (MainRAMTimestamp > NDS.ARM9Timestamp)) return true; + if ( force && (MainRAMTimestamp > NDS.ARM9Timestamp)) + NDS.ARM9Timestamp = MainRAMTimestamp; } WBCurVal = WriteBufferFifo[WBWritePointer]; - WBCurCycles = WBCycles[WBWritePointer]; WBCurAddr = storeaddr[WBWritePointer]; WBWriting = true; } @@ -1245,25 +1274,35 @@ inline bool ARMv5::WriteBufferHandle() return false; } +template void ARMv5::WriteBufferCheck() { while (!WriteBufferHandle()); // loop until we've cleared out all writeable entries -} -void ARMv5::WriteBufferWrite(u32 val, u8 flag, u8 cycles, u32 addr) + if constexpr (next) // check if the next write is occuring + { + if (NDS.ARM9Timestamp > WBTimestamp) + { + WriteBufferHandle(); + } + } +} +template void ARMv5::WriteBufferCheck(); +template void ARMv5::WriteBufferCheck(); + +void ARMv5::WriteBufferWrite(u32 val, u8 flag, u32 addr) { - WriteBufferCheck(); + WriteBufferCheck(); if (WBFillPointer == WBWritePointer) // if the write buffer is full then we stall the cpu until room is made WriteBufferHandle(); else if (WBWritePointer 
== 16) // indicates empty write buffer { WBWritePointer = 0; - WBTimestamp = ((NDS.ARM9Timestamp + DataCycles + 1) + ((1<> NDS.ARM9ClockShift; + WBTimestamp = NDS.ARM9Timestamp + 1; } - WriteBufferFifo[WBFillPointer] = val | (u64)flag << 62; - WBCycles[WBFillPointer] = cycles; + WriteBufferFifo[WBFillPointer] = val | (u64)flag << 61; storeaddr[WBFillPointer] = addr; WBFillPointer = (WBFillPointer + 1) & 0xF; } @@ -2034,6 +2073,8 @@ u32 ARMv5::CodeRead32(u32 addr, bool branch) if (PU_Map[addr>>12] & 0x30) WriteBufferDrain(); + else + WriteBufferCheck(); NDS.ARM9Timestamp = NDS.ARM9Timestamp + ((1<>14] == DataRegion && Store) NDS.ARM9Timestamp += (1<>14] == WBLastRegion)) // check write buffer + || (Store && (NDS.ARM9Regions[addr>>14] == DataRegion))) //check the actual store + NDS.ARM9Timestamp += 1<>12] & 0x30) WriteBufferDrain(); + else + WriteBufferCheck(); NDS.ARM9Timestamp = (NDS.ARM9Timestamp + ((1<>14]; + else + { + DataRegion = NDS.ARM9Regions[addr>>14]; + if ((NDS.ARM9Timestamp <= WBTimestamp-1) && (DataRegion == WBLastRegion)) // check write buffer + NDS.ARM9Timestamp += 1<>12] & 0x30) WriteBufferDrain(); + else + WriteBufferCheck(); NDS.ARM9Timestamp = (NDS.ARM9Timestamp + ((1<>14]; + else + { + DataRegion = NDS.ARM9Regions[addr>>14]; + if ((NDS.ARM9Timestamp <= WBTimestamp-1) && (DataRegion == WBLastRegion)) // check write buffer + NDS.ARM9Timestamp += 1<>12] & 0x30) WriteBufferDrain(); + else + WriteBufferCheck(); NDS.ARM9Timestamp = (NDS.ARM9Timestamp + ((1<>14]; + else + { + DataRegion = NDS.ARM9Regions[addr>>14]; + if ((NDS.ARM9Timestamp <= WBTimestamp-1) && (DataRegion == WBLastRegion)) // check write buffer + NDS.ARM9Timestamp += 1<>12] & 0x30) // checkme WriteBufferDrain(); - - NDS.ARM9Timestamp = (NDS.ARM9Timestamp + ((1<(); // bursts cannot cross a 1kb boundary if (addr & 0x3FF) // s @@ -2354,10 +2419,17 @@ bool ARMv5::DataRead32S(u32 addr, u32* val) if (NDS.ARM9ClockShift == 2) MainRAMTimestamp += 4; DataRegion = Mem9_MainRAM; } - else 
DataRegion = NDS.ARM9Regions[addr>>14]; + else + { + DataRegion = NDS.ARM9Regions[addr>>14]; + if ((NDS.ARM9Timestamp <= WBTimestamp-1) && (DataRegion == WBLastRegion)) // check write buffer + NDS.ARM9Timestamp += 1<>14][1]; if ((addr >> 24) == 0x02) @@ -2367,7 +2439,12 @@ bool ARMv5::DataRead32S(u32 addr, u32* val) if (NDS.ARM9ClockShift == 2) DataCycles -= 4; DataRegion = Mem9_MainRAM; } - else DataRegion = NDS.ARM9Regions[addr>>14]; + else + { + DataRegion = NDS.ARM9Regions[addr>>14]; + if ((NDS.ARM9Timestamp <= WBTimestamp-1) && (DataRegion == WBLastRegion)) // check write buffer + NDS.ARM9Timestamp += 1<>12] & (0x30))) { + WriteBufferCheck(); + NDS.ARM9Timestamp = (NDS.ARM9Timestamp + ((1<> 14][0]; @@ -2452,14 +2531,8 @@ bool ARMv5::DataWrite8(u32 addr, u8 val) { if (WBDelay > NDS.ARM9Timestamp) NDS.ARM9Timestamp = WBDelay; - u8 cycles = NDS.ARM9MemTimings[addr>>14][0]; - if ((addr >> 24) == 0x02) - { - cycles = (cycles - 2) | 0x80; - } - - WriteBufferWrite(addr, 3, 0); - WriteBufferWrite(val, 0, cycles, addr); + WriteBufferWrite(addr, 4); + WriteBufferWrite(val, 0, addr); DataCycles = 1; WBDelay = NDS.ARM9Timestamp + 2; } @@ -2527,6 +2600,8 @@ bool ARMv5::DataWrite16(u32 addr, u16 val) if (!(PU_Map[addr>>12] & 0x30)) { + WriteBufferCheck(); + NDS.ARM9Timestamp = (NDS.ARM9Timestamp + ((1<> 14][0]; @@ -2546,14 +2621,8 @@ bool ARMv5::DataWrite16(u32 addr, u16 val) { if (WBDelay > NDS.ARM9Timestamp) NDS.ARM9Timestamp = WBDelay; - u8 cycles = NDS.ARM9MemTimings[addr>>14][0]; - if ((addr >> 24) == 0x02) - { - cycles = (cycles - 2) | 0x80; - } - - WriteBufferWrite(addr, 3, 0); - WriteBufferWrite(val, 1, cycles, addr); + WriteBufferWrite(addr, 4); + WriteBufferWrite(val, 1, addr); DataCycles = 1; WBDelay = NDS.ARM9Timestamp + 2; } @@ -2621,6 +2690,8 @@ bool ARMv5::DataWrite32(u32 addr, u32 val) if (!(PU_Map[addr>>12] & 0x30)) { + WriteBufferCheck(); + NDS.ARM9Timestamp = (NDS.ARM9Timestamp + ((1<> 14][1]; @@ -2640,14 +2711,8 @@ bool ARMv5::DataWrite32(u32 addr, u32 
val) { if (WBDelay > NDS.ARM9Timestamp) NDS.ARM9Timestamp = WBDelay; - u8 cycles = NDS.ARM9MemTimings[addr>>14][2]; - if ((addr >> 24) == 0x02) - { - cycles = (cycles - 2) | 0x80; - } - - WriteBufferWrite(addr, 3, 0); - WriteBufferWrite(val, 2, cycles, addr); + WriteBufferWrite(addr, 4); + WriteBufferWrite(val, 2, addr); DataCycles = 1; WBDelay = NDS.ARM9Timestamp + 2; } @@ -2710,7 +2775,8 @@ bool ARMv5::DataWrite32S(u32 addr, u32 val) if (!(PU_Map[addr>>12] & 0x30)) // non-bufferable { - NDS.ARM9Timestamp = (NDS.ARM9Timestamp + ((1<(); + // bursts cannot cross a 1kb boundary if (addr & 0x3FF) // s { @@ -2748,20 +2814,9 @@ bool ARMv5::DataWrite32S(u32 addr, u32 val) } else { - u8 cycles; - // bursts cannot cross a 1kb boundary - // CHECKME: does this actually apply to the write buffer too? it should - if (addr & 0x3FF) cycles = NDS.ARM9MemTimings[addr>>14][3]; //s - else cycles = NDS.ARM9MemTimings[addr>>14][2]; // ns - - if ((addr >> 24) == 0x02) - { - cycles = (cycles - 2) | 0x80; - } - - WriteBufferWrite(val, 2, cycles, addr); - DataCycles += 1; - WBDelay = NDS.ARM9Timestamp + DataCycles + 1; + WriteBufferWrite(val, 3, addr); + DataCycles = 1; + WBDelay = NDS.ARM9Timestamp + 2; } return true; } From dfd2512c2021ef0f35443f9c38a720e0295d7986 Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Sun, 3 Nov 2024 09:50:58 -0500 Subject: [PATCH 200/306] fix some more tests --- src/ARM.cpp | 2 ++ src/ARM.h | 2 +- src/CP15.cpp | 10 +++++----- 3 files changed, 8 insertions(+), 6 deletions(-) diff --git a/src/ARM.cpp b/src/ARM.cpp index 33131713..5a08bce4 100644 --- a/src/ARM.cpp +++ b/src/ARM.cpp @@ -213,6 +213,8 @@ void ARMv5::Reset() WBWritePointer = 16; WBFillPointer = 0; WBDelay = 0; + WBTimestamp = 0; + WBLastRegion = Mem9_Null; WBWriting = false; ARM::Reset(); diff --git a/src/ARM.h b/src/ARM.h index 1a5e6ca5..a36009fb 100644 --- a/src/ARM.h +++ b/src/ARM.h @@ -392,7 +392,7 @@ public: */ void ICacheInvalidateAll(); - template 
inline bool WriteBufferHandle(); + template inline bool WriteBufferHandle(); template void WriteBufferCheck(); void WriteBufferWrite(u32 val, u8 flag, u32 addr = 0); void WriteBufferDrain(); diff --git a/src/CP15.cpp b/src/CP15.cpp index 978d923d..8bff685c 100644 --- a/src/CP15.cpp +++ b/src/CP15.cpp @@ -1156,7 +1156,7 @@ bool ARMv5::IsAddressDCachable(const u32 addr) const return PU_Map[addr >> CP15_MAP_ENTRYSIZE_LOG2] & CP15_MAP_DCACHEABLE; } -template +template inline bool ARMv5::WriteBufferHandle() { // handle write buffer writes @@ -1215,7 +1215,6 @@ inline bool ARMv5::WriteBufferHandle() if ( force && ts > NDS.ARM9Timestamp) { NDS.ARM9Timestamp = ts; - DataCycles = 0; // checkme } WBTimestamp = ts; @@ -1242,6 +1241,7 @@ inline bool ARMv5::WriteBufferHandle() WBLastRegion = NDS.ARM9Regions[WBCurAddr>>14]; //printf("writing: adr: %i, val: %lli, cyl: %i", WBCurAddr, WBCurVal, WBCurCycles); WBWriting = false; + if constexpr (force == 2) return true; } // check if write buffer is empty @@ -1281,9 +1281,9 @@ void ARMv5::WriteBufferCheck() if constexpr (next) // check if the next write is occuring { - if (NDS.ARM9Timestamp > WBTimestamp) + if (NDS.ARM9Timestamp >= WBTimestamp) { - WriteBufferHandle(); + while(!WriteBufferHandle<2>()); } } } @@ -1299,7 +1299,7 @@ void ARMv5::WriteBufferWrite(u32 val, u8 flag, u32 addr) else if (WBWritePointer == 16) // indicates empty write buffer { WBWritePointer = 0; - WBTimestamp = NDS.ARM9Timestamp + 1; + if (WBTimestamp < (NDS.ARM9Timestamp + 1)) WBTimestamp = NDS.ARM9Timestamp + 1; } WriteBufferFifo[WBFillPointer] = val | (u64)flag << 61; From 8a857f1257f2750383980813bdd664fb458af76f Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Sun, 3 Nov 2024 09:55:14 -0500 Subject: [PATCH 201/306] why so many spaces --- src/ARMInterpreter_LoadStore.cpp | 3 --- 1 file changed, 3 deletions(-) diff --git a/src/ARMInterpreter_LoadStore.cpp b/src/ARMInterpreter_LoadStore.cpp index 7b34d5af..bd6a4e8d 
100644 --- a/src/ARMInterpreter_LoadStore.cpp +++ b/src/ARMInterpreter_LoadStore.cpp @@ -1028,8 +1028,6 @@ void T_POP(ARM* cpu) } else { - - if (__builtin_popcount(cpu->CurInstr & 0x1FF) == 1) [[unlikely]] // single reg { if (cpu->Num == 0 && cpu->DataRegion == Mem9_ITCM) cpu->NDS.ARM9Timestamp += 1; @@ -1093,7 +1091,6 @@ void T_STMIA(ARM* cpu) } } - if (__builtin_popcount(cpu->CurInstr & 0xFF) == 1) [[unlikely]] // single reg { if (cpu->Num == 0 && cpu->DataRegion == Mem9_ITCM) cpu->NDS.ARM9Timestamp += 1; From a662af908df04e48f0bd041b26958b5905629fdd Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Sun, 3 Nov 2024 11:24:58 -0500 Subject: [PATCH 202/306] improved...? --- src/ARM.h | 2 +- src/CP15.cpp | 43 ++++++++++++++++++++++++------------------- 2 files changed, 25 insertions(+), 20 deletions(-) diff --git a/src/ARM.h b/src/ARM.h index a36009fb..efdf9eba 100644 --- a/src/ARM.h +++ b/src/ARM.h @@ -393,7 +393,7 @@ public: void ICacheInvalidateAll(); template inline bool WriteBufferHandle(); - template void WriteBufferCheck(); + template void WriteBufferCheck(); void WriteBufferWrite(u32 val, u8 flag, u32 addr = 0); void WriteBufferDrain(); diff --git a/src/CP15.cpp b/src/CP15.cpp index 8bff685c..e0019e7e 100644 --- a/src/CP15.cpp +++ b/src/CP15.cpp @@ -1217,7 +1217,7 @@ inline bool ARMv5::WriteBufferHandle() NDS.ARM9Timestamp = ts; } - WBTimestamp = ts; + WBTimestamp = (ts + ((1<>14] == Mem9_MainRAM) MainRAMTimestamp = ts + ((((WBCurVal >> 61) == 0) ? 
4 : 5) << NDS.ARM9ClockShift); else WBTimestamp += 2; // todo: twl timings @@ -1274,32 +1274,37 @@ inline bool ARMv5::WriteBufferHandle() return false; } -template +template void ARMv5::WriteBufferCheck() { - while (!WriteBufferHandle()); // loop until we've cleared out all writeable entries + while (!WriteBufferHandle<0>()); // loop until we've cleared out all writeable entries - if constexpr (next) // check if the next write is occuring + if constexpr (next == 1) // check if the next write is occuring { if (NDS.ARM9Timestamp >= WBTimestamp) { while(!WriteBufferHandle<2>()); } } + else if constexpr (next == 2) + { + while(!WriteBufferHandle<2>()); + } } -template void ARMv5::WriteBufferCheck(); -template void ARMv5::WriteBufferCheck(); +template void ARMv5::WriteBufferCheck<2>(); +template void ARMv5::WriteBufferCheck<1>(); +template void ARMv5::WriteBufferCheck<0>(); void ARMv5::WriteBufferWrite(u32 val, u8 flag, u32 addr) { - WriteBufferCheck(); + WriteBufferCheck<0>(); if (WBFillPointer == WBWritePointer) // if the write buffer is full then we stall the cpu until room is made - WriteBufferHandle(); + WriteBufferHandle<1>(); else if (WBWritePointer == 16) // indicates empty write buffer { WBWritePointer = 0; - if (WBTimestamp < (NDS.ARM9Timestamp + 1)) WBTimestamp = NDS.ARM9Timestamp + 1; + if (WBTimestamp < (NDS.ARM9Timestamp + 1)) WBTimestamp = (NDS.ARM9Timestamp + 1 + ((1<()); // loop until drained fully + while (!WriteBufferHandle<1>()); // loop until drained fully } void ARMv5::CP15Write(u32 id, u32 val) @@ -2074,7 +2079,7 @@ u32 ARMv5::CodeRead32(u32 addr, bool branch) if (PU_Map[addr>>12] & 0x30) WriteBufferDrain(); else - WriteBufferCheck(); + WriteBufferCheck<1>(); NDS.ARM9Timestamp = NDS.ARM9Timestamp + ((1<>12] & 0x30) WriteBufferDrain(); else - WriteBufferCheck(); + WriteBufferCheck<1>(); NDS.ARM9Timestamp = (NDS.ARM9Timestamp + ((1<>12] & 0x30) WriteBufferDrain(); else - WriteBufferCheck(); + WriteBufferCheck<1>(); NDS.ARM9Timestamp = 
(NDS.ARM9Timestamp + ((1<>12] & 0x30) WriteBufferDrain(); else - WriteBufferCheck(); + WriteBufferCheck<1>(); NDS.ARM9Timestamp = (NDS.ARM9Timestamp + ((1<>12] & 0x30) // checkme WriteBufferDrain(); else - WriteBufferCheck(); + WriteBufferCheck<1>(); // bursts cannot cross a 1kb boundary if (addr & 0x3FF) // s @@ -2510,7 +2515,7 @@ bool ARMv5::DataWrite8(u32 addr, u8 val) if (!(PU_Map[addr>>12] & (0x30))) { - WriteBufferCheck(); + WriteBufferCheck<2>(); NDS.ARM9Timestamp = (NDS.ARM9Timestamp + ((1<>12] & 0x30)) { - WriteBufferCheck(); + WriteBufferCheck<2>(); NDS.ARM9Timestamp = (NDS.ARM9Timestamp + ((1<>12] & 0x30)) { - WriteBufferCheck(); + WriteBufferCheck<2>(); NDS.ARM9Timestamp = (NDS.ARM9Timestamp + ((1<>12] & 0x30)) // non-bufferable { - WriteBufferCheck(); + WriteBufferCheck<2>(); // bursts cannot cross a 1kb boundary if (addr & 0x3FF) // s From d929587577e5a540725061a2eb15c6d8c590f999 Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Sun, 3 Nov 2024 11:35:02 -0500 Subject: [PATCH 203/306] fix cache fill --- src/CP15.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/CP15.cpp b/src/CP15.cpp index e0019e7e..fb130745 100644 --- a/src/CP15.cpp +++ b/src/CP15.cpp @@ -542,13 +542,13 @@ u32 ARMv5::ICacheLookup(const u32 addr) // Wait until the entire cache line is filled before continuing with execution if (CP15BISTTestStateRegister & CP15_BIST_TR_DISABLE_ICACHE_STREAMING) [[unlikely]] { - NDS.ARM9Timestamp += MemTimings[tag >> 14][1] + ((MemTimings[tag >> 14][2] + 1) * ((DCACHE_LINELENGTH / 4) - 1)); + NDS.ARM9Timestamp += MemTimings[tag >> 14][1] + (MemTimings[tag >> 14][2] * ((DCACHE_LINELENGTH / 4) - 1)); if (NDS.ARM9Timestamp < TimestampActual) NDS.ARM9Timestamp = TimestampActual; } else // ICache Streaming logic { u8 ns = MemTimings[addr>>14][1]; - u8 seq = MemTimings[addr>>14][2] + 1; + u8 seq = MemTimings[addr>>14][2]; u8 linepos = (addr & 0x1F) >> 2; // technically this is one too low, 
but we want that actually @@ -762,7 +762,7 @@ u32 ARMv5::DCacheLookup(const u32 addr) { NDS.ARM9Timestamp = NDS.ARM9Timestamp + ((1<> 14][1] + ((MemTimings[tag >> 14][2] + 1) * ((DCACHE_LINELENGTH / 4) - 2)); + NDS.ARM9Timestamp += MemTimings[tag >> 14][1] + (MemTimings[tag >> 14][2] * ((DCACHE_LINELENGTH / 4) - 2)); DataCycles = MemTimings[tag>>14][2] + 1; if ((addr >> 24) == 0x02) @@ -783,7 +783,7 @@ u32 ARMv5::DCacheLookup(const u32 addr) NDS.ARM9Timestamp = NDS.ARM9Timestamp + ((1<>14][1]; - u8 seq = MemTimings[addr>>14][2] + 1; + u8 seq = MemTimings[addr>>14][2]; u8 linepos = (addr & 0x1F) >> 2; // technically this is one too low, but we want that actually From d6d54fd9137891346b235633288cc3263eb9d423 Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Sun, 3 Nov 2024 15:30:07 -0500 Subject: [PATCH 204/306] handle wb writes followed immediately by reads better --- src/ARM.h | 1 + src/CP15.cpp | 48 ++++++++++++++++++++++++++++++++++++------------ 2 files changed, 37 insertions(+), 12 deletions(-) diff --git a/src/ARM.h b/src/ARM.h index efdf9eba..c714035e 100644 --- a/src/ARM.h +++ b/src/ARM.h @@ -690,6 +690,7 @@ public: //u64 WBMainRAMDelay; // timestamp used to emulate the delay before the next main ram write can begin u64 WBDelay; // timestamp in bus cycles use for the delay before next write to the write buffer can occur (seems to be a 1 cycle delay after a write to it) u32 WBLastRegion; // the last region written to by the write buffer + u64 WBReleaseTS; // the timestamp on which the write buffer relinquished control of the bus back #ifdef GDBSTUB_ENABLED u32 ReadMem(u32 addr, int size) override; diff --git a/src/CP15.cpp b/src/CP15.cpp index fb130745..d650a6cc 100644 --- a/src/CP15.cpp +++ b/src/CP15.cpp @@ -554,6 +554,11 @@ u32 ARMv5::ICacheLookup(const u32 addr) u32 cycles = ns + (seq * linepos); NDS.ARM9Timestamp = cycles += NDS.ARM9Timestamp; + + if (((NDS.ARM9Timestamp <= WBReleaseTS) && (NDS.ARM9Regions[addr>>14] 
== WBLastRegion)) // check write buffer + || (Store && (NDS.ARM9Regions[addr>>14] == DataRegion))) //check the actual store + NDS.ARM9Timestamp += 1<> 24) == 0x02) MainRAMTimestamp = ICacheFillTimes[6]; } @@ -771,14 +777,25 @@ u32 ARMv5::DCacheLookup(const u32 addr) MainRAMTimestamp = NDS.ARM9Timestamp + DataCycles; DataRegion = Mem9_MainRAM; } - else DataRegion = NDS.ARM9Regions[addr>>14]; + else + { + DataRegion = NDS.ARM9Regions[addr>>14]; + if ((NDS.ARM9Timestamp <= WBTimestamp-1) && (DataRegion == WBLastRegion)) // check write buffer + NDS.ARM9Timestamp += 1<>14]; if ((addr >> 24) == 0x02) { if (NDS.ARM9Timestamp < MainRAMTimestamp) NDS.ARM9Timestamp = MainRAMTimestamp; } + else + { + if ((NDS.ARM9Timestamp <= WBReleaseTS) && (DataRegion == WBLastRegion)) // check write buffer + NDS.ARM9Timestamp += 1<> 24) == 0x02) MainRAMTimestamp = DCacheFillTimes[6]; - - DataRegion = NDS.ARM9Regions[addr>>14]; } return ptr[(addr & (DCACHE_LINELENGTH-1)) >> 2]; } @@ -1217,9 +1232,17 @@ inline bool ARMv5::WriteBufferHandle() NDS.ARM9Timestamp = ts; } - WBTimestamp = (ts + ((1<>14] == Mem9_MainRAM) MainRAMTimestamp = ts + ((((WBCurVal >> 61) == 0) ? 4 : 5) << NDS.ARM9ClockShift); - else WBTimestamp += 2; // todo: twl timings + if ((WBCurVal >> 61) != 3) + { + WBReleaseTS = WBTimestamp = (ts + ((1<>14] == Mem9_MainRAM) MainRAMTimestamp = ts + ((((WBCurVal >> 61) == 0) ? 
4 : 5) << NDS.ARM9ClockShift); + else WBTimestamp += 2; // todo: twl timings + } + else + { + WBReleaseTS = WBTimestamp = ts; + if (NDS.ARM9Regions[WBCurAddr>>14] == Mem9_MainRAM) MainRAMTimestamp = ts; + } switch (WBCurVal >> 61) { @@ -1288,6 +1311,7 @@ void ARMv5::WriteBufferCheck() } else if constexpr (next == 2) { + if (WBWriting) while(!WriteBufferHandle<2>()); } } @@ -2095,7 +2119,7 @@ u32 ARMv5::CodeRead32(u32 addr, bool branch) } else { - if (((NDS.ARM9Timestamp <= WBTimestamp-1) && (NDS.ARM9Regions[addr>>14] == WBLastRegion)) // check write buffer + if (((NDS.ARM9Timestamp <= WBReleaseTS) && (NDS.ARM9Regions[addr>>14] == WBLastRegion)) // check write buffer || (Store && (NDS.ARM9Regions[addr>>14] == DataRegion))) //check the actual store NDS.ARM9Timestamp += 1<>14]; - if ((NDS.ARM9Timestamp <= WBTimestamp-1) && (DataRegion == WBLastRegion)) // check write buffer + if ((NDS.ARM9Timestamp <= WBReleaseTS) && (DataRegion == WBLastRegion)) // check write buffer NDS.ARM9Timestamp += 1<>14]; - if ((NDS.ARM9Timestamp <= WBTimestamp-1) && (DataRegion == WBLastRegion)) // check write buffer + if ((NDS.ARM9Timestamp <= WBReleaseTS) && (DataRegion == WBLastRegion)) // check write buffer NDS.ARM9Timestamp += 1<>14]; - if ((NDS.ARM9Timestamp <= WBTimestamp-1) && (DataRegion == WBLastRegion)) // check write buffer + if ((NDS.ARM9Timestamp <= WBReleaseTS) && (DataRegion == WBLastRegion)) // check write buffer NDS.ARM9Timestamp += 1<>14]; - if ((NDS.ARM9Timestamp <= WBTimestamp-1) && (DataRegion == WBLastRegion)) // check write buffer + if ((NDS.ARM9Timestamp <= WBReleaseTS) && (DataRegion == WBLastRegion)) // check write buffer NDS.ARM9Timestamp += 1<>14]; - if ((NDS.ARM9Timestamp <= WBTimestamp-1) && (DataRegion == WBLastRegion)) // check write buffer + if ((NDS.ARM9Timestamp <= WBReleaseTS) && (DataRegion == WBLastRegion)) // check write buffer NDS.ARM9Timestamp += 1< Date: Sun, 3 Nov 2024 16:35:24 -0500 Subject: [PATCH 205/306] improve interlock timings still 
imperfect. using the same reg for multiple inputs can result in incorrect timings --- src/ARM.cpp | 29 +++++---- src/ARM.h | 2 +- src/ARMInterpreter_ALU.cpp | 122 +++++++++++++++++++++++++------------ 3 files changed, 99 insertions(+), 54 deletions(-) diff --git a/src/ARM.cpp b/src/ARM.cpp index abb864d6..f8ba8d57 100644 --- a/src/ARM.cpp +++ b/src/ARM.cpp @@ -1183,26 +1183,29 @@ void ARMv5::AddCycles_MW(s32 numM) } template -void ARMv5::HandleInterlocksExecute(u16 ilmask) +void ARMv5::HandleInterlocksExecute(u16 ilmask, u8* times) { if ((bitfield && (ilmask & (1<(u16 ilmask); -template void ARMv5::HandleInterlocksExecute(u16 ilmask); +template void ARMv5::HandleInterlocksExecute(u16 ilmask, u8* times); +template void ARMv5::HandleInterlocksExecute(u16 ilmask, u8* times); void ARMv5::HandleInterlocksMemory(u8 reg) { diff --git a/src/ARM.h b/src/ARM.h index c714035e..15f44023 100644 --- a/src/ARM.h +++ b/src/ARM.h @@ -287,7 +287,7 @@ public: } template - void HandleInterlocksExecute(u16 ilmask); + void HandleInterlocksExecute(u16 ilmask, u8* times = NULL); void HandleInterlocksMemory(u8 reg); void GetCodeMemRegion(const u32 addr, MemRegion* region); diff --git a/src/ARMInterpreter_ALU.cpp b/src/ARMInterpreter_ALU.cpp index ce2a0cd7..c493b9ac 100644 --- a/src/ARMInterpreter_ALU.cpp +++ b/src/ARMInterpreter_ALU.cpp @@ -153,25 +153,30 @@ inline bool OverflowSbc(u32 a, u32 b, u32 carry) #define A_CALC_OP2_IMM \ u32 b = ROR(cpu->CurInstr&0xFF, (cpu->CurInstr>>7)&0x1E); \ - u16 ilmask = 0; + u16 ilmask = 0; \ + u8 iltime[16]; #define A_CALC_OP2_IMM_S \ u32 b = ROR(cpu->CurInstr&0xFF, (cpu->CurInstr>>7)&0x1E); \ if ((cpu->CurInstr>>7)&0x1E) \ cpu->SetC(b & 0x80000000); \ - u16 ilmask = 0; + u16 ilmask = 0; \ + u8 iltime[16]; #define A_CALC_OP2_REG_SHIFT_IMM(shiftop) \ u32 b = cpu->R[cpu->CurInstr&0xF]; \ u32 s = (cpu->CurInstr>>7)&0x1F; \ shiftop(b, s); \ - u16 ilmask = 1 << (cpu->CurInstr&0xF); + u16 ilmask = 1 << (cpu->CurInstr&0xF); \ + u8 iltime[16]; 
iltime[cpu->CurInstr&0xF] = 0; #define A_CALC_OP2_REG_SHIFT_REG(shiftop) \ u32 b = cpu->R[cpu->CurInstr&0xF]; \ if ((cpu->CurInstr&0xF)==15) b += 4; \ shiftop(b, (cpu->R[(cpu->CurInstr>>8)&0xF] & 0xFF)); \ - u16 ilmask = 1 << (cpu->CurInstr&0xF); + u16 ilmask = (1 << (cpu->CurInstr&0xF)) | (1 << ((cpu->CurInstr>>8)&0xF)); \ + u8 iltime[16]; iltime[(cpu->CurInstr>>8)&0xF] = 0; \ + iltime[cpu->CurInstr&0xF] = 1; // REMINDER: THIS IS WRONG, THIS CAN OVERWRITE LOWER VALUES. #define A_IMPLEMENT_ALU_OP(x,s) \ @@ -318,7 +323,9 @@ void A_##x##_REG_ROR_REG(ARM* cpu) \ #define A_AND(c) \ u32 a = cpu->R[(cpu->CurInstr>>16) & 0xF]; \ + iltime[(cpu->CurInstr>>16)&0xF] = c; \ u32 res = a & b; \ + if (cpu->Num == 0) ((ARMv5*)cpu)->HandleInterlocksExecute(ilmask | (1 <<((cpu->CurInstr>>16) & 0xF)), iltime); \ if (c) cpu->AddCycles_CI(c); else cpu->AddCycles_C(); \ if (((cpu->CurInstr>>12) & 0xF) == 15) \ { \ @@ -331,9 +338,11 @@ void A_##x##_REG_ROR_REG(ARM* cpu) \ #define A_AND_S(c) \ u32 a = cpu->R[(cpu->CurInstr>>16) & 0xF]; \ + iltime[(cpu->CurInstr>>16)&0xF] = c; \ u32 res = a & b; \ cpu->SetNZ(res & 0x80000000, \ !res); \ + if (cpu->Num == 0) ((ARMv5*)cpu)->HandleInterlocksExecute(ilmask | (1 <<((cpu->CurInstr>>16) & 0xF)), iltime); \ if (c) cpu->AddCycles_CI(c); else cpu->AddCycles_C(); \ if (((cpu->CurInstr>>12) & 0xF) == 15) \ { \ @@ -349,7 +358,9 @@ A_IMPLEMENT_ALU_OP(AND,_S) #define A_EOR(c) \ u32 a = cpu->R[(cpu->CurInstr>>16) & 0xF]; \ + iltime[(cpu->CurInstr>>16)&0xF] = c; \ u32 res = a ^ b; \ + if (cpu->Num == 0) ((ARMv5*)cpu)->HandleInterlocksExecute(ilmask | (1 <<((cpu->CurInstr>>16) & 0xF)), iltime); \ if (c) cpu->AddCycles_CI(c); else cpu->AddCycles_C(); \ if (((cpu->CurInstr>>12) & 0xF) == 15) \ { \ @@ -362,9 +373,11 @@ A_IMPLEMENT_ALU_OP(AND,_S) #define A_EOR_S(c) \ u32 a = cpu->R[(cpu->CurInstr>>16) & 0xF]; \ + iltime[(cpu->CurInstr>>16)&0xF] = c; \ u32 res = a ^ b; \ cpu->SetNZ(res & 0x80000000, \ !res); \ + if (cpu->Num == 0) 
((ARMv5*)cpu)->HandleInterlocksExecute(ilmask | (1 <<((cpu->CurInstr>>16) & 0xF)), iltime); \ if (c) cpu->AddCycles_CI(c); else cpu->AddCycles_C(); \ if (((cpu->CurInstr>>12) & 0xF) == 15) \ { \ @@ -380,8 +393,9 @@ A_IMPLEMENT_ALU_OP(EOR,_S) #define A_SUB(c) \ u32 a = cpu->R[(cpu->CurInstr>>16) & 0xF]; \ + iltime[(cpu->CurInstr>>16)&0xF] = c; \ u32 res = a - b; \ - if (cpu->Num == 0) ((ARMv5*)cpu)->HandleInterlocksExecute(ilmask | (1 <<((cpu->CurInstr>>16) & 0xF))); \ + if (cpu->Num == 0) ((ARMv5*)cpu)->HandleInterlocksExecute(ilmask | (1 <<((cpu->CurInstr>>16) & 0xF)), iltime); \ if (c) cpu->AddCycles_CI(c); else cpu->AddCycles_C(); \ if (((cpu->CurInstr>>12) & 0xF) == 15) \ { \ @@ -394,12 +408,13 @@ A_IMPLEMENT_ALU_OP(EOR,_S) #define A_SUB_S(c) \ u32 a = cpu->R[(cpu->CurInstr>>16) & 0xF]; \ + iltime[(cpu->CurInstr>>16)&0xF] = c; \ u32 res = a - b; \ cpu->SetNZCV(res & 0x80000000, \ !res, \ CarrySub(a, b), \ OverflowSub(a, b)); \ - if (cpu->Num == 0) ((ARMv5*)cpu)->HandleInterlocksExecute(ilmask | (1 <<((cpu->CurInstr>>16) & 0xF))); \ + if (cpu->Num == 0) ((ARMv5*)cpu)->HandleInterlocksExecute(ilmask | (1 <<((cpu->CurInstr>>16) & 0xF)), iltime); \ if (c) cpu->AddCycles_CI(c); else cpu->AddCycles_C(); \ if (((cpu->CurInstr>>12) & 0xF) == 15) \ { \ @@ -415,8 +430,9 @@ A_IMPLEMENT_ALU_OP(SUB,) #define A_RSB(c) \ u32 a = cpu->R[(cpu->CurInstr>>16) & 0xF]; \ + iltime[(cpu->CurInstr>>16)&0xF] = c; \ u32 res = b - a; \ - if (cpu->Num == 0) ((ARMv5*)cpu)->HandleInterlocksExecute(ilmask | (1 <<((cpu->CurInstr>>16) & 0xF))); \ + if (cpu->Num == 0) ((ARMv5*)cpu)->HandleInterlocksExecute(ilmask | (1 <<((cpu->CurInstr>>16) & 0xF)), iltime); \ if (c) cpu->AddCycles_CI(c); else cpu->AddCycles_C(); \ if (((cpu->CurInstr>>12) & 0xF) == 15) \ { \ @@ -429,12 +445,13 @@ A_IMPLEMENT_ALU_OP(SUB,) #define A_RSB_S(c) \ u32 a = cpu->R[(cpu->CurInstr>>16) & 0xF]; \ + iltime[(cpu->CurInstr>>16)&0xF] = c; \ u32 res = b - a; \ cpu->SetNZCV(res & 0x80000000, \ !res, \ CarrySub(b, a), \ 
OverflowSub(b, a)); \ - if (cpu->Num == 0) ((ARMv5*)cpu)->HandleInterlocksExecute(ilmask | (1 <<((cpu->CurInstr>>16) & 0xF))); \ + if (cpu->Num == 0) ((ARMv5*)cpu)->HandleInterlocksExecute(ilmask | (1 <<((cpu->CurInstr>>16) & 0xF)), iltime); \ if (c) cpu->AddCycles_CI(c); else cpu->AddCycles_C(); \ if (((cpu->CurInstr>>12) & 0xF) == 15) \ { \ @@ -450,8 +467,9 @@ A_IMPLEMENT_ALU_OP(RSB,) #define A_ADD(c) \ u32 a = cpu->R[(cpu->CurInstr>>16) & 0xF]; \ + iltime[(cpu->CurInstr>>16)&0xF] = c; \ u32 res = a + b; \ - if (cpu->Num == 0) ((ARMv5*)cpu)->HandleInterlocksExecute(ilmask | (1 <<((cpu->CurInstr>>16) & 0xF))); \ + if (cpu->Num == 0) ((ARMv5*)cpu)->HandleInterlocksExecute(ilmask | (1 <<((cpu->CurInstr>>16) & 0xF)), iltime); \ if (c) cpu->AddCycles_CI(c); else cpu->AddCycles_C(); \ if (((cpu->CurInstr>>12) & 0xF) == 15) \ { \ @@ -469,7 +487,7 @@ A_IMPLEMENT_ALU_OP(RSB,) !res, \ CarryAdd(a, b), \ OverflowAdd(a, b)); \ - if (cpu->Num == 0) ((ARMv5*)cpu)->HandleInterlocksExecute(ilmask | (1 <<((cpu->CurInstr>>16) & 0xF))); \ + if (cpu->Num == 0) ((ARMv5*)cpu)->HandleInterlocksExecute(ilmask | (1 <<((cpu->CurInstr>>16) & 0xF)), iltime); \ if (c) cpu->AddCycles_CI(c); else cpu->AddCycles_C(); \ if (((cpu->CurInstr>>12) & 0xF) == 15) \ { \ @@ -485,8 +503,9 @@ A_IMPLEMENT_ALU_OP(ADD,) #define A_ADC(c) \ u32 a = cpu->R[(cpu->CurInstr>>16) & 0xF]; \ + iltime[(cpu->CurInstr>>16)&0xF] = c; \ u32 res = a + b + (cpu->CPSR&0x20000000 ? 1:0); \ - if (cpu->Num == 0) ((ARMv5*)cpu)->HandleInterlocksExecute(ilmask | (1 <<((cpu->CurInstr>>16) & 0xF))); \ + if (cpu->Num == 0) ((ARMv5*)cpu)->HandleInterlocksExecute(ilmask | (1 <<((cpu->CurInstr>>16) & 0xF)), iltime); \ if (c) cpu->AddCycles_CI(c); else cpu->AddCycles_C(); \ if (((cpu->CurInstr>>12) & 0xF) == 15) \ { \ @@ -499,6 +518,7 @@ A_IMPLEMENT_ALU_OP(ADD,) #define A_ADC_S(c) \ u32 a = cpu->R[(cpu->CurInstr>>16) & 0xF]; \ + iltime[(cpu->CurInstr>>16)&0xF] = c; \ u32 res_tmp = a + b; \ u32 carry = (cpu->CPSR&0x20000000 ? 
1:0); \ u32 res = res_tmp + carry; \ @@ -506,7 +526,7 @@ A_IMPLEMENT_ALU_OP(ADD,) !res, \ CarryAdd(a, b) | CarryAdd(res_tmp, carry), \ OverflowAdc(a, b, carry)); \ - if (cpu->Num == 0) ((ARMv5*)cpu)->HandleInterlocksExecute(ilmask | (1 <<((cpu->CurInstr>>16) & 0xF))); \ + if (cpu->Num == 0) ((ARMv5*)cpu)->HandleInterlocksExecute(ilmask | (1 <<((cpu->CurInstr>>16) & 0xF)), iltime); \ if (c) cpu->AddCycles_CI(c); else cpu->AddCycles_C(); \ if (((cpu->CurInstr>>12) & 0xF) == 15) \ { \ @@ -522,8 +542,9 @@ A_IMPLEMENT_ALU_OP(ADC,) #define A_SBC(c) \ u32 a = cpu->R[(cpu->CurInstr>>16) & 0xF]; \ + iltime[(cpu->CurInstr>>16)&0xF] = c; \ u32 res = a - b - (cpu->CPSR&0x20000000 ? 0:1); \ - if (cpu->Num == 0) ((ARMv5*)cpu)->HandleInterlocksExecute(ilmask | (1 <<((cpu->CurInstr>>16) & 0xF))); \ + if (cpu->Num == 0) ((ARMv5*)cpu)->HandleInterlocksExecute(ilmask | (1 <<((cpu->CurInstr>>16) & 0xF)), iltime); \ if (c) cpu->AddCycles_CI(c); else cpu->AddCycles_C(); \ if (((cpu->CurInstr>>12) & 0xF) == 15) \ { \ @@ -536,6 +557,7 @@ A_IMPLEMENT_ALU_OP(ADC,) #define A_SBC_S(c) \ u32 a = cpu->R[(cpu->CurInstr>>16) & 0xF]; \ + iltime[(cpu->CurInstr>>16)&0xF] = c; \ u32 res_tmp = a - b; \ u32 carry = (cpu->CPSR&0x20000000 ? 0:1); \ u32 res = res_tmp - carry; \ @@ -543,7 +565,7 @@ A_IMPLEMENT_ALU_OP(ADC,) !res, \ CarrySub(a, b) & CarrySub(res_tmp, carry), \ OverflowSbc(a, b, carry)); \ - if (cpu->Num == 0) ((ARMv5*)cpu)->HandleInterlocksExecute(ilmask | (1 <<((cpu->CurInstr>>16) & 0xF))); \ + if (cpu->Num == 0) ((ARMv5*)cpu)->HandleInterlocksExecute(ilmask | (1 <<((cpu->CurInstr>>16) & 0xF)), iltime); \ if (c) cpu->AddCycles_CI(c); else cpu->AddCycles_C(); \ if (((cpu->CurInstr>>12) & 0xF) == 15) \ { \ @@ -559,8 +581,9 @@ A_IMPLEMENT_ALU_OP(SBC,) #define A_RSC(c) \ u32 a = cpu->R[(cpu->CurInstr>>16) & 0xF]; \ + iltime[(cpu->CurInstr>>16)&0xF] = c; \ u32 res = b - a - (cpu->CPSR&0x20000000 ? 
0:1); \ - if (cpu->Num == 0) ((ARMv5*)cpu)->HandleInterlocksExecute(ilmask | (1 <<((cpu->CurInstr>>16) & 0xF))); \ + if (cpu->Num == 0) ((ARMv5*)cpu)->HandleInterlocksExecute(ilmask | (1 <<((cpu->CurInstr>>16) & 0xF)), iltime); \ if (c) cpu->AddCycles_CI(c); else cpu->AddCycles_C(); \ if (((cpu->CurInstr>>12) & 0xF) == 15) \ { \ @@ -573,6 +596,7 @@ A_IMPLEMENT_ALU_OP(SBC,) #define A_RSC_S(c) \ u32 a = cpu->R[(cpu->CurInstr>>16) & 0xF]; \ + iltime[(cpu->CurInstr>>16)&0xF] = c; \ u32 res_tmp = b - a; \ u32 carry = (cpu->CPSR&0x20000000 ? 0:1); \ u32 res = res_tmp - carry; \ @@ -580,7 +604,7 @@ A_IMPLEMENT_ALU_OP(SBC,) !res, \ CarrySub(b, a) & CarrySub(res_tmp, carry), \ OverflowSbc(b, a, carry)); \ - if (cpu->Num == 0) ((ARMv5*)cpu)->HandleInterlocksExecute(ilmask | (1 <<((cpu->CurInstr>>16) & 0xF))); \ + if (cpu->Num == 0) ((ARMv5*)cpu)->HandleInterlocksExecute(ilmask | (1 <<((cpu->CurInstr>>16) & 0xF)), iltime); \ if (c) cpu->AddCycles_CI(c); else cpu->AddCycles_C(); \ if (((cpu->CurInstr>>12) & 0xF) == 15) \ { \ @@ -596,8 +620,9 @@ A_IMPLEMENT_ALU_OP(RSC,) #define A_TST(c) \ u32 a = cpu->R[(cpu->CurInstr>>16) & 0xF]; \ + iltime[(cpu->CurInstr>>16)&0xF] = c; \ u32 res = a & b; \ - if (cpu->Num == 0) ((ARMv5*)cpu)->HandleInterlocksExecute(ilmask | (1 <<((cpu->CurInstr>>16) & 0xF))); \ + if (cpu->Num == 0) ((ARMv5*)cpu)->HandleInterlocksExecute(ilmask | (1 <<((cpu->CurInstr>>16) & 0xF)), iltime); \ if (c) cpu->AddCycles_CI(c); else cpu->AddCycles_C(); \ if (((cpu->CurInstr>>12) & 0xF) == 15) [[unlikely]] /* this seems to trigger alu rd==15 behavior for arm7 and legacy instruction behavior for arm9 */ \ { \ @@ -626,8 +651,9 @@ A_IMPLEMENT_ALU_TEST(TST,_S) #define A_TEQ(c) \ u32 a = cpu->R[(cpu->CurInstr>>16) & 0xF]; \ + iltime[(cpu->CurInstr>>16)&0xF] = c; \ u32 res = a ^ b; \ - if (cpu->Num == 0) ((ARMv5*)cpu)->HandleInterlocksExecute(ilmask | (1 <<((cpu->CurInstr>>16) & 0xF))); \ + if (cpu->Num == 0) ((ARMv5*)cpu)->HandleInterlocksExecute(ilmask | (1 
<<((cpu->CurInstr>>16) & 0xF)), iltime); \ if (c) cpu->AddCycles_CI(c); else cpu->AddCycles_C(); \ if (((cpu->CurInstr>>12) & 0xF) == 15) [[unlikely]] /* this seems to trigger alu rd==15 behavior for arm7 and legacy instruction behavior for arm9 */ \ { \ @@ -656,8 +682,9 @@ A_IMPLEMENT_ALU_TEST(TEQ,_S) #define A_CMP(c) \ u32 a = cpu->R[(cpu->CurInstr>>16) & 0xF]; \ + iltime[(cpu->CurInstr>>16)&0xF] = c; \ u32 res = a - b; \ - if (cpu->Num == 0) ((ARMv5*)cpu)->HandleInterlocksExecute(ilmask | (1 <<((cpu->CurInstr>>16) & 0xF))); \ + if (cpu->Num == 0) ((ARMv5*)cpu)->HandleInterlocksExecute(ilmask | (1 <<((cpu->CurInstr>>16) & 0xF)), iltime); \ if (c) cpu->AddCycles_CI(c); else cpu->AddCycles_C(); \ if (((cpu->CurInstr>>12) & 0xF) == 15) [[unlikely]] /* this seems to trigger alu rd==15 behavior for arm7 and legacy instruction behavior for arm9 */ \ { \ @@ -690,8 +717,9 @@ A_IMPLEMENT_ALU_TEST(CMP,) #define A_CMN(c) \ u32 a = cpu->R[(cpu->CurInstr>>16) & 0xF]; \ + iltime[(cpu->CurInstr>>16)&0xF] = c; \ u32 res = a + b; \ - if (cpu->Num == 0) ((ARMv5*)cpu)->HandleInterlocksExecute(ilmask | (1 <<((cpu->CurInstr>>16) & 0xF))); \ + if (cpu->Num == 0) ((ARMv5*)cpu)->HandleInterlocksExecute(ilmask | (1 <<((cpu->CurInstr>>16) & 0xF)), iltime); \ if (c) cpu->AddCycles_CI(c); else cpu->AddCycles_C(); \ if (((cpu->CurInstr>>12) & 0xF) == 15) [[unlikely]] /* this seems to trigger alu rd==15 behavior for arm7 and legacy instruction behavior for arm9 */ \ { \ @@ -724,8 +752,9 @@ A_IMPLEMENT_ALU_TEST(CMN,) #define A_ORR(c) \ u32 a = cpu->R[(cpu->CurInstr>>16) & 0xF]; \ + iltime[(cpu->CurInstr>>16)&0xF] = c; \ u32 res = a | b; \ - if (cpu->Num == 0) ((ARMv5*)cpu)->HandleInterlocksExecute(ilmask | (1 <<((cpu->CurInstr>>16) & 0xF))); \ + if (cpu->Num == 0) ((ARMv5*)cpu)->HandleInterlocksExecute(ilmask | (1 <<((cpu->CurInstr>>16) & 0xF)), iltime); \ if (c) cpu->AddCycles_CI(c); else cpu->AddCycles_C(); \ if (((cpu->CurInstr>>12) & 0xF) == 15) \ { \ @@ -738,10 +767,11 @@ 
A_IMPLEMENT_ALU_TEST(CMN,) #define A_ORR_S(c) \ u32 a = cpu->R[(cpu->CurInstr>>16) & 0xF]; \ + iltime[(cpu->CurInstr>>16)&0xF] = c; \ u32 res = a | b; \ cpu->SetNZ(res & 0x80000000, \ !res); \ - if (cpu->Num == 0) ((ARMv5*)cpu)->HandleInterlocksExecute(ilmask | (1 <<((cpu->CurInstr>>16) & 0xF))); \ + if (cpu->Num == 0) ((ARMv5*)cpu)->HandleInterlocksExecute(ilmask | (1 <<((cpu->CurInstr>>16) & 0xF)), iltime); \ if (c) cpu->AddCycles_CI(c); else cpu->AddCycles_C(); \ if (((cpu->CurInstr>>12) & 0xF) == 15) \ { \ @@ -756,7 +786,7 @@ A_IMPLEMENT_ALU_OP(ORR,_S) #define A_MOV(c) \ - if (cpu->Num == 0) ((ARMv5*)cpu)->HandleInterlocksExecute(ilmask); \ + if (cpu->Num == 0) ((ARMv5*)cpu)->HandleInterlocksExecute(ilmask, iltime); \ if (c) cpu->AddCycles_CI(c); else cpu->AddCycles_C(); \ if (((cpu->CurInstr>>12) & 0xF) == 15) \ { \ @@ -770,7 +800,7 @@ A_IMPLEMENT_ALU_OP(ORR,_S) #define A_MOV_S(c) \ cpu->SetNZ(b & 0x80000000, \ !b); \ - if (cpu->Num == 0) ((ARMv5*)cpu)->HandleInterlocksExecute(ilmask); \ + if (cpu->Num == 0) ((ARMv5*)cpu)->HandleInterlocksExecute(ilmask, iltime); \ if (c) cpu->AddCycles_CI(c); else cpu->AddCycles_C(); \ if (((cpu->CurInstr>>12) & 0xF) == 15) \ { \ @@ -804,8 +834,9 @@ void A_MOV_REG_LSL_IMM_DBG(ARM* cpu) #define A_BIC(c) \ u32 a = cpu->R[(cpu->CurInstr>>16) & 0xF]; \ + iltime[(cpu->CurInstr>>16)&0xF] = c; \ u32 res = a & ~b; \ - if (cpu->Num == 0) ((ARMv5*)cpu)->HandleInterlocksExecute(ilmask | (1 <<((cpu->CurInstr>>16) & 0xF))); \ + if (cpu->Num == 0) ((ARMv5*)cpu)->HandleInterlocksExecute(ilmask | (1 <<((cpu->CurInstr>>16) & 0xF)), iltime); \ if (c) cpu->AddCycles_CI(c); else cpu->AddCycles_C(); \ if (((cpu->CurInstr>>12) & 0xF) == 15) \ { \ @@ -818,10 +849,11 @@ void A_MOV_REG_LSL_IMM_DBG(ARM* cpu) #define A_BIC_S(c) \ u32 a = cpu->R[(cpu->CurInstr>>16) & 0xF]; \ + iltime[(cpu->CurInstr>>16)&0xF] = c; \ u32 res = a & ~b; \ cpu->SetNZ(res & 0x80000000, \ !res); \ - if (cpu->Num == 0) ((ARMv5*)cpu)->HandleInterlocksExecute(ilmask | (1 
<<((cpu->CurInstr>>16) & 0xF))); \ + if (cpu->Num == 0) ((ARMv5*)cpu)->HandleInterlocksExecute(ilmask | (1 <<((cpu->CurInstr>>16) & 0xF)), iltime); \ if (c) cpu->AddCycles_CI(c); else cpu->AddCycles_C(); \ if (((cpu->CurInstr>>12) & 0xF) == 15) \ { \ @@ -837,7 +869,7 @@ A_IMPLEMENT_ALU_OP(BIC,_S) #define A_MVN(c) \ b = ~b; \ - if (cpu->Num == 0) ((ARMv5*)cpu)->HandleInterlocksExecute(ilmask); \ + if (cpu->Num == 0) ((ARMv5*)cpu)->HandleInterlocksExecute(ilmask, iltime); \ if (c) cpu->AddCycles_CI(c); else cpu->AddCycles_C(); \ if (((cpu->CurInstr>>12) & 0xF) == 15) \ { \ @@ -852,7 +884,7 @@ A_IMPLEMENT_ALU_OP(BIC,_S) b = ~b; \ cpu->SetNZ(b & 0x80000000, \ !b); \ - if (cpu->Num == 0) ((ARMv5*)cpu)->HandleInterlocksExecute(ilmask); \ + if (cpu->Num == 0) ((ARMv5*)cpu)->HandleInterlocksExecute(ilmask, iltime); \ if (c) cpu->AddCycles_CI(c); else cpu->AddCycles_C(); \ if (((cpu->CurInstr>>12) & 0xF) == 15) \ { \ @@ -932,9 +964,11 @@ void A_MLA(ARM* cpu) if (cpu->Num == 0) { + u8 iltime[16] = {}; + iltime[(cpu->CurInstr>>12)&0xF] = 1; ((ARMv5*)cpu)->HandleInterlocksExecute((1 << (cpu->CurInstr & 0xF)) | (1 << ((cpu->CurInstr >> 8) & 0xF)) | - (1 << ((cpu->CurInstr >> 12) & 0xF))); + (1 << ((cpu->CurInstr >> 12) & 0xF)), iltime); if (cpu->CurInstr & (1<<20)) cpu->AddCycles_CI(3); else { @@ -1028,10 +1062,12 @@ void A_UMLAL(ARM* cpu) if (cpu->Num == 0) { + u8 iltime[16] = {}; + iltime[(cpu->CurInstr>>12)&0xF] = 1; ((ARMv5*)cpu)->HandleInterlocksExecute((1 << (cpu->CurInstr & 0xF)) | (1 << ((cpu->CurInstr >> 8) & 0xF)) | - (1 << ((cpu->CurInstr >> 12) & 0xF)) | - (1 << ((cpu->CurInstr >> 16) & 0xF))); + (1 << ((cpu->CurInstr >> 12) & 0xF))/* | + (1 << ((cpu->CurInstr >> 16) & 0xF))*/, iltime); if (cpu->CurInstr & (1<<20)) cpu->AddCycles_CI(4); else { @@ -1124,10 +1160,12 @@ void A_SMLAL(ARM* cpu) if (cpu->Num == 0) { + u8 iltime[16] {}; + iltime[(cpu->CurInstr>>12)&0xF] = 1; ((ARMv5*)cpu)->HandleInterlocksExecute((1 << (cpu->CurInstr & 0xF)) | (1 << ((cpu->CurInstr >> 8) & 
0xF)) | - (1 << ((cpu->CurInstr >> 12) & 0xF)) | - (1 << ((cpu->CurInstr >> 16) & 0xF))); + (1 << ((cpu->CurInstr >> 12) & 0xF)) /*| + (1 << ((cpu->CurInstr >> 16) & 0xF))*/, iltime); if (cpu->CurInstr & (1<<20)) cpu->AddCycles_CI(4); else { @@ -1171,11 +1209,13 @@ void A_SMLAxy(ARM* cpu) if (OverflowAdd(res_mul, rn)) cpu->CPSR |= 0x08000000; - + + u8 iltime[16] {}; + iltime[(cpu->CurInstr>>12)&0xF] = 1; ((ARMv5*)cpu)->HandleInterlocksExecute((1 << (cpu->CurInstr & 0xF)) | (1 << ((cpu->CurInstr >> 8) & 0xF)) | - (1 << ((cpu->CurInstr >> 12) & 0xF))); + (1 << ((cpu->CurInstr >> 12) & 0xF)), iltime); cpu->AddCycles_C(); cpu->DataRegion = Mem9_Null; @@ -1203,10 +1243,11 @@ void A_SMLAWy(ARM* cpu) if (OverflowAdd(res_mul, rn)) cpu->CPSR |= 0x08000000; - + u8 iltime[16] = {}; + iltime[(cpu->CurInstr>>12)&0xF] = 1; ((ARMv5*)cpu)->HandleInterlocksExecute((1 << (cpu->CurInstr & 0xF)) | (1 << ((cpu->CurInstr >> 8) & 0xF)) | - (1 << ((cpu->CurInstr >> 12) & 0xF))); + (1 << ((cpu->CurInstr >> 12) & 0xF)), iltime); cpu->AddCycles_C(); cpu->DataRegion = Mem9_Null; @@ -1293,11 +1334,12 @@ void A_SMLALxy(ARM* cpu) cpu->R[(cpu->CurInstr >> 16) & 0xF] = (u32)(res >> 32ULL); - + u8 iltime[16] {}; + iltime[(cpu->CurInstr>>12)&0xF] = 1; ((ARMv5*)cpu)->HandleInterlocksExecute((1 << (cpu->CurInstr & 0xF)) | (1 << ((cpu->CurInstr >> 8) & 0xF)) | - (1 << ((cpu->CurInstr >> 12) & 0xF)) | - (1 << ((cpu->CurInstr >> 16) & 0xF))); + (1 << ((cpu->CurInstr >> 12) & 0xF))/* | + (1 << ((cpu->CurInstr >> 16) & 0xF))*/, iltime); cpu->AddCycles_C(); // 1 X cpu->DataRegion = Mem9_Null; ((ARMv5*)cpu)->AddCycles_MW(2); // 2 M From 604b21c85adb7d313565446b911c2e58ba2d3307 Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Mon, 4 Nov 2024 09:23:58 -0500 Subject: [PATCH 206/306] this fixes stuff --- src/CP15.cpp | 42 +++++++++++++++++++++++++++--------------- 1 file changed, 27 insertions(+), 15 deletions(-) diff --git a/src/CP15.cpp b/src/CP15.cpp index 
d650a6cc..993e60d3 100644 --- a/src/CP15.cpp +++ b/src/CP15.cpp @@ -535,7 +535,9 @@ u32 ARMv5::ICacheLookup(const u32 addr) { if (NDS.ARM9Timestamp < MainRAMTimestamp) NDS.ARM9Timestamp = MainRAMTimestamp + ((1<>14] == DataRegion && Store) NDS.ARM9Timestamp += (1<>14] == WBLastRegion)) // check write buffer + || (Store && (NDS.ARM9Regions[addr>>14] == DataRegion))) //check the actual store + NDS.ARM9Timestamp += 1<>14] == WBLastRegion)) // check write buffer - || (Store && (NDS.ARM9Regions[addr>>14] == DataRegion))) //check the actual store - NDS.ARM9Timestamp += 1<> 61) { @@ -1217,12 +1216,12 @@ inline bool ARMv5::WriteBufferHandle() // get the current timestamp u64 ts; - if (NDS.ARM9Regions[WBCurAddr>>14] == Mem9_MainRAM) + if (((WBCurVal >> 61) != 3) && (NDS.ARM9Regions[WBCurAddr>>14] == Mem9_MainRAM)) ts = std::max(WBTimestamp, MainRAMTimestamp); else ts = WBTimestamp; - ts = (ts + ((1<> 61) != 3) ts = (ts + ((1<>14] == Mem9_MainRAM) MainRAMTimestamp = ts; + WBTimestamp = ts; + if (NDS.ARM9Regions[WBCurAddr>>14] == Mem9_MainRAM) MainRAMTimestamp += 2 << NDS.ARM9ClockShift; } switch (WBCurVal >> 61) @@ -1264,7 +1263,7 @@ inline bool ARMv5::WriteBufferHandle() WBLastRegion = NDS.ARM9Regions[WBCurAddr>>14]; //printf("writing: adr: %i, val: %lli, cyl: %i", WBCurAddr, WBCurVal, WBCurCycles); WBWriting = false; - if constexpr (force == 2) return true; + if ((force == 2) && ((WriteBufferFifo[WBWritePointer] >> 61) != 3)) return true; } // check if write buffer is empty @@ -1304,7 +1303,7 @@ void ARMv5::WriteBufferCheck() if constexpr (next == 1) // check if the next write is occuring { - if (NDS.ARM9Timestamp >= WBTimestamp) + if (WBWriting) { while(!WriteBufferHandle<2>()); } @@ -1328,7 +1327,7 @@ void ARMv5::WriteBufferWrite(u32 val, u8 flag, u32 addr) else if (WBWritePointer == 16) // indicates empty write buffer { WBWritePointer = 0; - if (WBTimestamp < (NDS.ARM9Timestamp + 1)) WBTimestamp = (NDS.ARM9Timestamp + 1 + ((1<>14]; + WBTimestamp = NDS.ARM9Timestamp; 
BusWrite8(addr, val); } else @@ -2562,6 +2567,7 @@ bool ARMv5::DataWrite8(u32 addr, u8 val) WriteBufferWrite(addr, 4); WriteBufferWrite(val, 0, addr); + DataRegion = Mem9_Null; DataCycles = 1; WBDelay = NDS.ARM9Timestamp + 2; } @@ -2644,6 +2650,7 @@ bool ARMv5::DataWrite16(u32 addr, u16 val) } else DataRegion = NDS.ARM9Regions[addr>>14]; + WBTimestamp = NDS.ARM9Timestamp; BusWrite16(addr, val); } else @@ -2652,6 +2659,7 @@ bool ARMv5::DataWrite16(u32 addr, u16 val) WriteBufferWrite(addr, 4); WriteBufferWrite(val, 1, addr); + DataRegion = Mem9_Null; DataCycles = 1; WBDelay = NDS.ARM9Timestamp + 2; } @@ -2733,7 +2741,8 @@ bool ARMv5::DataWrite32(u32 addr, u32 val) DataCycles -= 2<>14]; - + + WBTimestamp = NDS.ARM9Timestamp; BusWrite32(addr, val); } else @@ -2742,6 +2751,7 @@ bool ARMv5::DataWrite32(u32 addr, u32 val) WriteBufferWrite(addr, 4); WriteBufferWrite(val, 2, addr); + DataRegion = Mem9_Null; DataCycles = 1; WBDelay = NDS.ARM9Timestamp + 2; } @@ -2839,11 +2849,13 @@ bool ARMv5::DataWrite32S(u32 addr, u32 val) else DataRegion = NDS.ARM9Regions[addr>>14]; } + WBTimestamp = NDS.ARM9Timestamp; BusWrite32(addr, val); } else { WriteBufferWrite(val, 3, addr); + DataRegion = Mem9_Null; DataCycles = 1; WBDelay = NDS.ARM9Timestamp + 2; } From 24ed8832a9887ae9af22b4b108a4eeada05d0699 Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Mon, 4 Nov 2024 12:16:56 -0500 Subject: [PATCH 207/306] fix resetting under certain circumstances cache streaming could use a stale value for setting the main ram timestamp under certain circumstances, which ofc games triggered....... 
--- src/ARM.cpp | 7 +++--- src/CP15.cpp | 60 +++++++++++++++++++++++++++------------------------- 2 files changed, 35 insertions(+), 32 deletions(-) diff --git a/src/ARM.cpp b/src/ARM.cpp index f8ba8d57..c1fc1898 100644 --- a/src/ARM.cpp +++ b/src/ARM.cpp @@ -215,6 +215,7 @@ void ARMv5::Reset() WBFillPointer = 0; WBDelay = 0; WBTimestamp = 0; + WBReleaseTS = 0; WBLastRegion = Mem9_Null; WBWriting = false; @@ -327,7 +328,7 @@ void ARMv5::JumpTo(u32 addr, bool restorecpsr) // jumps count as nonsequential accesses on the instruction bus on the arm9 // thus it requires waiting for the current ICache line fill to complete before continuing - if (ICacheFillPtr != 7) + if (ICacheFillPtr < 7) { u64 fillend = ICacheFillTimes[6] + 1; if (NDS.ARM9Timestamp < fillend) NDS.ARM9Timestamp = fillend; @@ -1156,8 +1157,8 @@ void ARMv5::CodeFetch() { if (NullFetch) { - // no fetch is performed. - // in practice it doesn't matter though. + // the value we need is cached by the bus + // in practice we can treat this as a 1 cycle fetch, with no penalties NextInstr[1] >>= 16; NDS.ARM9Timestamp++; if (NDS.ARM9Timestamp < TimestampActual) NDS.ARM9Timestamp = TimestampActual; diff --git a/src/CP15.cpp b/src/CP15.cpp index 993e60d3..06e160de 100644 --- a/src/CP15.cpp +++ b/src/CP15.cpp @@ -414,7 +414,7 @@ u32 ARMv5::ICacheLookup(const u32 addr) { u32 *cacheLine = (u32 *)&ICache[(id+set) << ICACHE_LINELENGTH_LOG2]; - if (ICacheFillPtr == 7) + if (ICacheFillPtr >= 7) { if (NDS.ARM9Timestamp < ITCMTimestamp) NDS.ARM9Timestamp = ITCMTimestamp; // does this apply to streamed fetches? NDS.ARM9Timestamp++; @@ -480,7 +480,6 @@ u32 ARMv5::ICacheLookup(const u32 addr) DataRegion = Mem9_Null; return NDS.ARM9Read32(addr & ~3); } - u32 line; if (CP15Control & CP15_CACHE_CR_ROUNDROBIN) [[likely]] @@ -513,7 +512,7 @@ u32 ARMv5::ICacheLookup(const u32 addr) // bus reads can only overlap with icache streaming by 6 cycles // checkme: does cache trigger this? 
- if (ICacheFillPtr != 7) + if (ICacheFillPtr < 7) { u64 time = ICacheFillTimes[6] - 6; // checkme: minus 6? if (NDS.ARM9Timestamp < time) NDS.ARM9Timestamp = time; @@ -545,7 +544,7 @@ u32 ARMv5::ICacheLookup(const u32 addr) if (CP15BISTTestStateRegister & CP15_BIST_TR_DISABLE_ICACHE_STREAMING) [[unlikely]] { NDS.ARM9Timestamp += MemTimings[tag >> 14][1] + (MemTimings[tag >> 14][2] * ((DCACHE_LINELENGTH / 4) - 1)); - if (NDS.ARM9Timestamp < TimestampActual) NDS.ARM9Timestamp = TimestampActual; + if (NDS.ARM9Timestamp < TimestampActual) NDS.ARM9Timestamp = TimestampActual; // this should never trigger in practice } else // ICache Streaming logic { @@ -558,7 +557,7 @@ u32 ARMv5::ICacheLookup(const u32 addr) NDS.ARM9Timestamp = cycles += NDS.ARM9Timestamp; if (NDS.ARM9Timestamp < TimestampActual) NDS.ARM9Timestamp = TimestampActual; - + ICacheFillPtr = linepos; for (int i = linepos; i < 7; i++) { @@ -566,7 +565,7 @@ u32 ARMv5::ICacheLookup(const u32 addr) ICacheFillTimes[i] = cycles; } - if ((addr >> 24) == 0x02) MainRAMTimestamp = ICacheFillTimes[6]; + if ((addr >> 24) == 0x02) MainRAMTimestamp = ((linepos < 7) ? ICacheFillTimes[6] : NDS.ARM9Timestamp); } DataRegion = Mem9_Null; @@ -660,21 +659,24 @@ u32 ARMv5::DCacheLookup(const u32 addr) { u32 *cacheLine = (u32 *)&DCache[(id+set) << DCACHE_LINELENGTH_LOG2]; - if (DCacheFillPtr == 7) DataCycles = 1; + if (DCacheFillPtr >= 7) + { + DataCycles = 1; + } else { u64 nextfill = DCacheFillTimes[DCacheFillPtr++]; - if (NDS.ARM9Timestamp < nextfill) + //if (NDS.ARM9Timestamp < nextfill) // can this ever really fail? 
{ DataCycles = nextfill - NDS.ARM9Timestamp; } - else + /*else { u64 fillend = DCacheFillTimes[6] + 2; if (NDS.ARM9Timestamp < fillend) DataCycles = fillend - NDS.ARM9Timestamp; else DataCycles = 1; DCacheFillPtr = 7; - } + }*/ } DataRegion = Mem9_DCache; //Log(LogLevel::Debug, "DCache hit at %08lx returned %08x from set %i, line %i\n", addr, cacheLine[(addr & (DCACHE_LINELENGTH -1)) >> 2], set, id>>2); @@ -690,7 +692,7 @@ u32 ARMv5::DCacheLookup(const u32 addr) { // bus reads can only overlap with icache streaming by 6 cycles // checkme: does cache trigger this? - if (ICacheFillPtr != 7) + if (ICacheFillPtr < 7) { u64 time = ICacheFillTimes[6] - 6; // checkme: minus 6? if (NDS.ARM9Timestamp < time) NDS.ARM9Timestamp = time; @@ -767,7 +769,7 @@ u32 ARMv5::DCacheLookup(const u32 addr) NDS.ARM9Timestamp = NDS.ARM9Timestamp + ((1<> 14][1] + (MemTimings[tag >> 14][2] * ((DCACHE_LINELENGTH / 4) - 2)); - DataCycles = MemTimings[tag>>14][2] + 1; + DataCycles = MemTimings[tag>>14][2]; if ((addr >> 24) == 0x02) { @@ -813,7 +815,8 @@ u32 ARMv5::DCacheLookup(const u32 addr) cycles += seq; DCacheFillTimes[i] = cycles; } - if ((addr >> 24) == 0x02) MainRAMTimestamp = DCacheFillTimes[6]; + + if ((addr >> 24) == 0x02) MainRAMTimestamp = ((linepos < 7) ? ICacheFillTimes[6] : NDS.ARM9Timestamp); } return ptr[(addr & (DCACHE_LINELENGTH-1)) >> 2]; } @@ -1261,7 +1264,6 @@ inline bool ARMv5::WriteBufferHandle() } WBLastRegion = NDS.ARM9Regions[WBCurAddr>>14]; - //printf("writing: adr: %i, val: %lli, cyl: %i", WBCurAddr, WBCurVal, WBCurCycles); WBWriting = false; if ((force == 2) && ((WriteBufferFifo[WBWritePointer] >> 61) != 3)) return true; } @@ -2091,7 +2093,7 @@ u32 ARMv5::CodeRead32(u32 addr, bool branch) } // bus reads can only overlap with dcache streaming by 6 cycles - if (DCacheFillPtr != 7) + if (DCacheFillPtr < 7) { u64 time = DCacheFillTimes[6] - 6; // checkme: minus 6? 
if (NDS.ARM9Timestamp < time) NDS.ARM9Timestamp = time; @@ -2136,7 +2138,7 @@ u32 ARMv5::CodeRead32(u32 addr, bool branch) bool ARMv5::DataRead8(u32 addr, u32* val) { - if (DCacheFillPtr != 7) + if (DCacheFillPtr < 7) { u64 fillend = DCacheFillTimes[6] + 1; if (NDS.ARM9Timestamp < fillend) NDS.ARM9Timestamp = fillend; // checkme: should this be data cycles? @@ -2182,7 +2184,7 @@ bool ARMv5::DataRead8(u32 addr, u32* val) // bus reads can only overlap with icache streaming by 6 cycles // checkme: does dcache trigger this? - if (ICacheFillPtr != 7) + if (ICacheFillPtr < 7) { u64 time = ICacheFillTimes[6] - 6; // checkme: minus 6? if (NDS.ARM9Timestamp < time) NDS.ARM9Timestamp = time; @@ -2218,7 +2220,7 @@ bool ARMv5::DataRead8(u32 addr, u32* val) bool ARMv5::DataRead16(u32 addr, u32* val) { - if (DCacheFillPtr != 7) + if (DCacheFillPtr < 7) { u64 fillend = DCacheFillTimes[6] + 1; if (NDS.ARM9Timestamp < fillend) NDS.ARM9Timestamp = fillend; // checkme: should this be data cycles? @@ -2266,7 +2268,7 @@ bool ARMv5::DataRead16(u32 addr, u32* val) // bus reads can only overlap with icache streaming by 6 cycles // checkme: does cache trigger this? - if (ICacheFillPtr != 7) + if (ICacheFillPtr < 7) { u64 time = ICacheFillTimes[6] - 6; // checkme: minus 6? if (NDS.ARM9Timestamp < time) NDS.ARM9Timestamp = time; @@ -2302,7 +2304,7 @@ bool ARMv5::DataRead16(u32 addr, u32* val) bool ARMv5::DataRead32(u32 addr, u32* val) { - if (DCacheFillPtr != 7) + if (DCacheFillPtr < 7) { u64 fillend = DCacheFillTimes[6] + 1; if (NDS.ARM9Timestamp < fillend) NDS.ARM9Timestamp = fillend; // checkme: should this be data cycles? @@ -2350,7 +2352,7 @@ bool ARMv5::DataRead32(u32 addr, u32* val) // bus reads can only overlap with icache streaming by 6 cycles // checkme: does cache trigger this? - if (ICacheFillPtr != 7) + if (ICacheFillPtr < 7) { u64 time = ICacheFillTimes[6] - 6; // checkme: minus 6? 
if (NDS.ARM9Timestamp < time) NDS.ARM9Timestamp = time; @@ -2429,7 +2431,7 @@ bool ARMv5::DataRead32S(u32 addr, u32* val) // bus reads can only overlap with icache streaming by 6 cycles // checkme: does cache trigger this? - if (ICacheFillPtr != 7) + if (ICacheFillPtr < 7) { u64 time = ICacheFillTimes[6] - 6; // checkme: minus 6? if (NDS.ARM9Timestamp < time) NDS.ARM9Timestamp = time; @@ -2486,7 +2488,7 @@ bool ARMv5::DataRead32S(u32 addr, u32* val) bool ARMv5::DataWrite8(u32 addr, u8 val) { - if (DCacheFillPtr != 7) + if (DCacheFillPtr < 7) { u64 fillend = DCacheFillTimes[6] + 1; if (NDS.ARM9Timestamp < fillend) NDS.ARM9Timestamp = fillend; // checkme: should this be data cycles? @@ -2535,7 +2537,7 @@ bool ARMv5::DataWrite8(u32 addr, u8 val) // bus reads can only overlap with icache streaming by 6 cycles // checkme: does cache trigger this? - if (ICacheFillPtr != 7) + if (ICacheFillPtr < 7) { u64 time = ICacheFillTimes[6] - 6; // checkme: minus 6? if (NDS.ARM9Timestamp < time) NDS.ARM9Timestamp = time; @@ -2576,7 +2578,7 @@ bool ARMv5::DataWrite8(u32 addr, u8 val) bool ARMv5::DataWrite16(u32 addr, u16 val) { - if (DCacheFillPtr != 7) + if (DCacheFillPtr < 7) { u64 fillend = DCacheFillTimes[6] + 1; if (NDS.ARM9Timestamp < fillend) NDS.ARM9Timestamp = fillend; // checkme: should this be data cycles? @@ -2627,7 +2629,7 @@ bool ARMv5::DataWrite16(u32 addr, u16 val) // bus reads can only overlap with icache streaming by 6 cycles // checkme: does cache trigger this? - if (ICacheFillPtr != 7) + if (ICacheFillPtr < 7) { u64 time = ICacheFillTimes[6] - 6; // checkme: minus 6? if (NDS.ARM9Timestamp < time) NDS.ARM9Timestamp = time; @@ -2668,7 +2670,7 @@ bool ARMv5::DataWrite16(u32 addr, u16 val) bool ARMv5::DataWrite32(u32 addr, u32 val) { - if (DCacheFillPtr != 7) + if (DCacheFillPtr < 7) { u64 fillend = DCacheFillTimes[6] + 1; if (NDS.ARM9Timestamp < fillend) NDS.ARM9Timestamp = fillend; // checkme: should this be data cycles? 
@@ -2719,7 +2721,7 @@ bool ARMv5::DataWrite32(u32 addr, u32 val) // bus reads can only overlap with icache streaming by 6 cycles // checkme: does cache trigger this? - if (ICacheFillPtr != 7) + if (ICacheFillPtr < 7) { u64 time = ICacheFillTimes[6] - 6; // checkme: minus 6? if (NDS.ARM9Timestamp < time) NDS.ARM9Timestamp = time; @@ -2806,7 +2808,7 @@ bool ARMv5::DataWrite32S(u32 addr, u32 val) // bus reads can only overlap with icache streaming by 6 cycles // checkme: does cache trigger this? - if (ICacheFillPtr != 7) + if (ICacheFillPtr < 7) { u64 time = ICacheFillTimes[6] - 6; // checkme: minus 6? if (NDS.ARM9Timestamp < time) NDS.ARM9Timestamp = time; From 3c7db9b21f232f8dece502d20a9dba546fea217c Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Tue, 5 Nov 2024 21:56:19 -0500 Subject: [PATCH 208/306] correct thumb multiply timings --- src/ARMInterpreter_ALU.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/ARMInterpreter_ALU.cpp b/src/ARMInterpreter_ALU.cpp index 83fc1944..504a9c21 100644 --- a/src/ARMInterpreter_ALU.cpp +++ b/src/ARMInterpreter_ALU.cpp @@ -1583,10 +1583,10 @@ void T_MUL_REG(ARM* cpu) else { cpu->SetC(0); // carry flag destroyed, they say. whatever that means... - if (a & 0xFF000000) cycles += 4; - else if (a & 0x00FF0000) cycles += 3; - else if (a & 0x0000FF00) cycles += 2; - else cycles += 1; + if ((a & 0xFFFFFF00) == 0x00000000 || (a & 0xFFFFFF00) == 0xFFFFFF00) cycles = 1; + else if ((a & 0xFFFF0000) == 0x00000000 || (a & 0xFFFF0000) == 0xFFFF0000) cycles = 2; + else if ((a & 0xFF000000) == 0x00000000 || (a & 0xFF000000) == 0xFF000000) cycles = 3; + else cycles = 4; } cpu->AddCycles_CI(cycles); } From 3d49f5f2560084dbc70b1df780ad12cf24e5b97f Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Wed, 6 Nov 2024 00:18:29 -0500 Subject: [PATCH 209/306] arm7 muls carry flag emulation. 
--- src/ARMInterpreter_ALU.cpp | 19 ++-- src/ARMInterpreter_MultiplySuperLLE.h | 136 ++++++++++++++++++++++++++ 2 files changed, 146 insertions(+), 9 deletions(-) create mode 100644 src/ARMInterpreter_MultiplySuperLLE.h diff --git a/src/ARMInterpreter_ALU.cpp b/src/ARMInterpreter_ALU.cpp index 504a9c21..72992f0f 100644 --- a/src/ARMInterpreter_ALU.cpp +++ b/src/ARMInterpreter_ALU.cpp @@ -19,6 +19,7 @@ #include #include "ARM.h" #include "NDS.h" +#include "ARMInterpreter_MultiplySuperLLE.h" namespace melonDS::ARMInterpreter { @@ -854,7 +855,6 @@ void A_MUL(ARM* cpu) { cpu->SetNZ(res & 0x80000000, !res); - if (cpu->Num==1) cpu->SetC(0); } u32 cycles; @@ -866,6 +866,7 @@ void A_MUL(ARM* cpu) else if ((rs & 0xFFFF0000) == 0x00000000 || (rs & 0xFFFF0000) == 0xFFFF0000) cycles = 2; else if ((rs & 0xFF000000) == 0x00000000 || (rs & 0xFF000000) == 0xFF000000) cycles = 3; else cycles = 4; + if (cpu->CurInstr & (1<<20)) cpu->SetC(MULSCarry(rm, rs, 0, cycles==4)); } cpu->AddCycles_CI(cycles); @@ -886,7 +887,6 @@ void A_MLA(ARM* cpu) { cpu->SetNZ(res & 0x80000000, !res); - if (cpu->Num==1) cpu->SetC(0); } u32 cycles; @@ -898,6 +898,7 @@ void A_MLA(ARM* cpu) else if ((rs & 0xFFFF0000) == 0x00000000 || (rs & 0xFFFF0000) == 0xFFFF0000) cycles = 3; else if ((rs & 0xFF000000) == 0x00000000 || (rs & 0xFF000000) == 0xFF000000) cycles = 4; else cycles = 5; + if (cpu->CurInstr & (1<<20)) cpu->SetC(MULSCarry(rm, rs, rn, cycles==5)); } cpu->AddCycles_CI(cycles); @@ -919,7 +920,6 @@ void A_UMULL(ARM* cpu) { cpu->SetNZ((u32)(res >> 63ULL), !res); - if (cpu->Num==1) cpu->SetC(0); } u32 cycles; @@ -931,6 +931,7 @@ void A_UMULL(ARM* cpu) else if ((rs & 0xFFFF0000) == 0x00000000) cycles = 3; else if ((rs & 0xFF000000) == 0x00000000) cycles = 4; else cycles = 5; + if (cpu->CurInstr & (1<<20)) cpu->SetC(UMULLSCarry(0, rm, rs, cycles==5)); } cpu->AddCycles_CI(cycles); @@ -955,7 +956,6 @@ void A_UMLAL(ARM* cpu) { cpu->SetNZ((u32)(res >> 63ULL), !res); - if (cpu->Num==1) cpu->SetC(0); } u32 cycles; 
@@ -967,6 +967,7 @@ void A_UMLAL(ARM* cpu) else if ((rs & 0xFFFF0000) == 0x00000000) cycles = 3; else if ((rs & 0xFF000000) == 0x00000000) cycles = 4; else cycles = 5; + if (cpu->CurInstr & (1<<20)) cpu->SetC(UMULLSCarry(rd, rm, rs, cycles==5)); } cpu->AddCycles_CI(cycles); @@ -988,7 +989,6 @@ void A_SMULL(ARM* cpu) { cpu->SetNZ((u32)(res >> 63ULL), !res); - if (cpu->Num==1) cpu->SetC(0); } u32 cycles; @@ -1000,6 +1000,7 @@ void A_SMULL(ARM* cpu) else if ((rs & 0xFFFF0000) == 0x00000000 || (rs & 0xFFFF0000) == 0xFFFF0000) cycles = 3; else if ((rs & 0xFF000000) == 0x00000000 || (rs & 0xFF000000) == 0xFF000000) cycles = 4; else cycles = 5; + if (cpu->CurInstr & (1<<20)) cpu->SetC(SMULLSCarry(0, rm, rs, cycles==5)); } cpu->AddCycles_CI(cycles); @@ -1024,7 +1025,6 @@ void A_SMLAL(ARM* cpu) { cpu->SetNZ((u32)(res >> 63ULL), !res); - if (cpu->Num==1) cpu->SetC(0); } u32 cycles; @@ -1036,6 +1036,7 @@ void A_SMLAL(ARM* cpu) else if ((rs & 0xFFFF0000) == 0x00000000 || (rs & 0xFFFF0000) == 0xFFFF0000) cycles = 3; else if ((rs & 0xFF000000) == 0x00000000 || (rs & 0xFF000000) == 0xFF000000) cycles = 4; else cycles = 5; + if (cpu->CurInstr & (1<<20)) cpu->SetC(SMULLSCarry(rd, rm, rs, cycles==5)); } cpu->AddCycles_CI(cycles); @@ -1575,18 +1576,18 @@ void T_MUL_REG(ARM* cpu) cpu->SetNZ(res & 0x80000000, !res); - s32 cycles = 0; + s32 cycles; if (cpu->Num == 0) { - cycles += 3; + cycles = 3; } else { - cpu->SetC(0); // carry flag destroyed, they say. whatever that means... if ((a & 0xFFFFFF00) == 0x00000000 || (a & 0xFFFFFF00) == 0xFFFFFF00) cycles = 1; else if ((a & 0xFFFF0000) == 0x00000000 || (a & 0xFFFF0000) == 0xFFFF0000) cycles = 2; else if ((a & 0xFF000000) == 0x00000000 || (a & 0xFF000000) == 0xFF000000) cycles = 3; else cycles = 4; + cpu->SetC(MULSCarry(b, a, 0, cycles==4)); // carry flag destroyed, they say. whatever that means... 
} cpu->AddCycles_CI(cycles); } diff --git a/src/ARMInterpreter_MultiplySuperLLE.h b/src/ARMInterpreter_MultiplySuperLLE.h new file mode 100644 index 00000000..21b17bbc --- /dev/null +++ b/src/ARMInterpreter_MultiplySuperLLE.h @@ -0,0 +1,136 @@ +#ifndef ARMINTERPRETER_MULTIPLYSUPERLLE_H +#define ARMINTERPRETER_MULTIPLYSUPERLLE_H + +#include "types.h" + +using namespace melonDS; + +/* + Copyright (c) 2024 zaydlang + + This software is provided 'as-is', without any express or implied warranty. In no event will the authors be held liable for any damages arising from the use of this software. + + Permission is granted to anyone to use this software for any purpose, including commercial applications, and to alter it and redistribute it freely, subject to the following restrictions: + + 1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. + If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required. + 2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software. + 3. This notice may not be removed or altered from any source distribution. +*/ + + + + +// code taken from: (also features a few alternative implementations that could maybe be worth looking at?) +// https://github.com/calc84maniac/multiplication-algorithm/blob/master/impl_opt.h +// based on research that can be found here: https://bmchtech.github.io/post/multiply/ + +// the code in this file is dedicated to handling the calculation of the carry flag for multiplication (S variant) instructions on the ARM7TDMI. 
+ + +// Takes a multiplier between -0x01000000 and 0x00FFFFFF, cycles between 0 and 2 +static inline bool booths_multiplication32_opt(u32 multiplicand, u32 multiplier, u32 accumulator) { + // Set the low bit of the multiplicand to cause negation to invert the upper bits, this bit can't propagate to bit 31 + multiplicand |= 1; + + // Optimized first iteration + u32 booth = (s32)(multiplier << 31) >> 31; + u32 carry = booth * multiplicand; + // Pre-populate accumulator for output + u32 output = accumulator; + + u32 sum = output + carry; + int shift = 29; + do { + for (int i = 0; i < 4; i++, shift -= 2) { + // Get next booth factor (-2 to 2, shifted left by 30-shift) + u32 next_booth = (s32)(multiplier << shift) >> shift; + u32 factor = next_booth - booth; + booth = next_booth; + // Get scaled value of booth addend + u32 addend = multiplicand * factor; + // Combine the addend with the CSA + // Not performing any masking seems to work because the lower carries can't propagate to bit 31 + output ^= carry ^ addend; + sum += addend; + carry = sum - output; + } + } while (booth != multiplier); + + return carry >> 31; +} + +// Takes a multiplicand shifted right by 6 and a multiplier shifted right by 26 (zero or sign extended) +static inline bool booths_multiplication64_opt(u32 multiplicand, u32 multiplier, u32 accum_hi) { + // Skipping the first 14 iterations seems to work because the lower carries can't propagate to bit 63 + // This means only magic bits 62-61 are needed (which requires decoding 3 booth chunks), + // and only the last two booth iterations are needed + + // Set the low bit of the multiplicand to cause negation to invert the upper bits + multiplicand |= 1; + + // Pre-populate magic bit 61 for carry + u32 carry = ~accum_hi & UINT32_C(0x20000000); + // Pre-populate magic bits 63-60 for output (with carry magic pre-added in) + u32 output = accum_hi - UINT32_C(0x08000000); + + // Get factors from the top 3 booth chunks + u32 booth0 = (s32)(multiplier << 27) >> 
27; + u32 booth1 = (s32)(multiplier << 29) >> 29; + u32 booth2 = (s32)(multiplier << 31) >> 31; + u32 factor0 = multiplier - booth0; + u32 factor1 = booth0 - booth1; + u32 factor2 = booth1 - booth2; + + // Get scaled value of the 3rd top booth addend + u32 addend = multiplicand * factor2; + // Finalize bits 61-60 of output magic using its sign + output -= addend & UINT32_C(0x10000000); + // Get scaled value of the 2nd top booth addend + addend = multiplicand * factor1; + // Finalize bits 63-62 of output magic using its sign + output -= addend & UINT32_C(0x40000000); + + // Get the carry from the CSA in bit 61 and propagate it to bit 62, which is not processed in this iteration + u32 sum = output + (addend & UINT32_C(0x20000000)); + // Subtract out the carry magic to get the actual output magic + output -= carry; + + // Get scaled value of the 1st top booth addend + addend = multiplicand * factor0; + // Add to bit 62 and propagate the carry + sum += addend & UINT32_C(0x40000000); + + // Cancel out the output magic bit 63 to get the carry bit 63 + return (sum ^ output) >> 31; +} + + +// also for MLAS and MUL (thumb ver.) 
+inline bool MULSCarry(s32 rm, s32 rs, u32 rn, bool lastcycle) +{ + if (lastcycle) + return (rs >> 30) == -2; + else + return booths_multiplication32_opt(rm, rs, rn); +} + +// also for UMLALS +inline bool UMULLSCarry(u64 rd, u32 rm, u32 rs, bool lastcycle) +{ + if (lastcycle) + return booths_multiplication64_opt(rm >> 6, rs >> 26, rd >> 32); + else + return booths_multiplication32_opt(rm, rs, rd & 0xFFFFFFFF); +} + +// also for SMLALS +inline bool SMULLSCarry(u64 rd, s32 rm, s32 rs, bool lastcycle) +{ + if (lastcycle) + return booths_multiplication64_opt(rm >> 6, rs >> 26, rd >> 32); + else + return booths_multiplication32_opt(rm, rs, rd & 0xFFFFFFFF); +} + +#endif From ef5de6091b903bac7a0d4f34673a69efbea27906 Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Thu, 7 Nov 2024 13:16:39 -0500 Subject: [PATCH 210/306] t blx long with bit 0 set should raise an exception fixes a bug with gbarunner3 --- src/ARMInterpreter_Branch.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/ARMInterpreter_Branch.cpp b/src/ARMInterpreter_Branch.cpp index 623be41a..88b14ab7 100644 --- a/src/ARMInterpreter_Branch.cpp +++ b/src/ARMInterpreter_Branch.cpp @@ -104,6 +104,9 @@ void T_BL_LONG_1(ARM* cpu) void T_BL_LONG_2(ARM* cpu) { + if ((cpu->CurInstr & 0x1801) == 0x0801) // "BLX" with bit 0 set is an unvalid instruction. 
+ return T_UNK(cpu); // TODO: Check ARM7 for exceptions + s32 offset = (cpu->CurInstr & 0x7FF) << 1; u32 pc = cpu->R[14] + offset; cpu->R[14] = (cpu->R[15] - 2) | 1; From 5091061a39d307f9dd92ef0aa5d808fb0900121b Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Thu, 7 Nov 2024 20:16:19 -0500 Subject: [PATCH 211/306] improve accuracy of prefetch abort handling slightly prefetch aborts should be handled on executing an instruction by a flag set when the instruction is fetched --- src/ARM.cpp | 8 ++++---- src/ARM.h | 6 +++--- src/ARMJIT.cpp | 2 +- src/CP15.cpp | 4 ++-- 4 files changed, 10 insertions(+), 10 deletions(-) diff --git a/src/ARM.cpp b/src/ARM.cpp index 682ce9ff..7f5d2e86 100644 --- a/src/ARM.cpp +++ b/src/ARM.cpp @@ -222,7 +222,7 @@ void ARM::DoSavestate(Savestate* file) file->VarArray(R_ABT, 3*sizeof(u32)); file->VarArray(R_IRQ, 3*sizeof(u32)); file->VarArray(R_UND, 3*sizeof(u32)); - file->Var32(&CurInstr); + file->Var64(&CurInstr); #ifdef JIT_ENABLED if (file->Saving && NDS.IsJITEnabled()) { @@ -232,7 +232,7 @@ void ARM::DoSavestate(Savestate* file) FillPipeline(); } #endif - file->VarArray(NextInstr, 2*sizeof(u32)); + file->VarArray(NextInstr, 2*sizeof(u64)); file->Var32(&ExceptionBase); @@ -667,7 +667,7 @@ void ARMv5::Execute() if (IRQ && !(CPSR & 0x80)) TriggerIRQ(); - else if (!(PU_Map[(R[15]-4)>>12] & 0x04)) [[unlikely]] // handle aborted instructions + else if (CurInstr & ((u64)1<<63)) [[unlikely]] // handle aborted instructions { PrefetchAbort(); } @@ -690,7 +690,7 @@ void ARMv5::Execute() if (IRQ && !(CPSR & 0x80)) TriggerIRQ(); - else if (!(PU_Map[(R[15]-8)>>12] & 0x04)) [[unlikely]] // handle aborted instructions + else if (CurInstr & ((u64)1<<63)) [[unlikely]] // handle aborted instructions { PrefetchAbort(); } diff --git a/src/ARM.h b/src/ARM.h index e7156d72..f4b3b53f 100644 --- a/src/ARM.h +++ b/src/ARM.h @@ -177,8 +177,8 @@ public: u32 R_ABT[3]; u32 R_IRQ[3]; u32 R_UND[3]; - u32 CurInstr; - u32 
NextInstr[2]; + u64 CurInstr; + u64 NextInstr[2]; u32 ExceptionBase; @@ -251,7 +251,7 @@ public: void Execute(); // all code accesses are forced nonseq 32bit - u32 CodeRead32(u32 addr, bool branch); + u64 CodeRead32(u32 addr, bool branch); bool DataRead8(u32 addr, u32* val) override; bool DataRead16(u32 addr, u32* val) override; diff --git a/src/ARMJIT.cpp b/src/ARMJIT.cpp index 1ebcce8e..8bf509e9 100644 --- a/src/ARMJIT.cpp +++ b/src/ARMJIT.cpp @@ -588,7 +588,7 @@ void ARMJIT::CompileBlock(ARM* cpu) noexcept u32 numWriteAddrs = 0, writeAddrsTranslated = 0; cpu->FillPipeline(); - u32 nextInstr[2] = {cpu->NextInstr[0], cpu->NextInstr[1]}; + u32 nextInstr[2] = {(u32)cpu->NextInstr[0], (u32)cpu->NextInstr[1]}; u32 nextInstrAddr[2] = {blockAddr, r15}; JIT_DEBUGPRINT("start block %x %08x (%x)\n", blockAddr, cpu->CPSR, localAddr); diff --git a/src/CP15.cpp b/src/CP15.cpp index 5bffb185..fba73bda 100644 --- a/src/CP15.cpp +++ b/src/CP15.cpp @@ -771,14 +771,14 @@ u32 ARMv5::CP15Read(u32 id) const // TCM are handled here. 
// TODO: later on, handle PU, and maybe caches -u32 ARMv5::CodeRead32(u32 addr, bool branch) +u64 ARMv5::CodeRead32(u32 addr, bool branch) { // prefetch abort // the actual exception is not raised until the aborted instruction is executed if (!(PU_Map[addr>>12] & 0x04)) [[unlikely]] { CodeCycles = 1; - return 0; + return ((u64)1<<63); } if (addr < ITCMSize) From 60a819c1ed993aaf1d9ab16386d29f70596935ef Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Fri, 8 Nov 2024 01:02:36 -0500 Subject: [PATCH 212/306] correct handling of T bit changes w/o pipeline flush on arm9 --- src/ARMInterpreter.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/ARMInterpreter.cpp b/src/ARMInterpreter.cpp index 82dc6876..ff79597e 100644 --- a/src/ARMInterpreter.cpp +++ b/src/ARMInterpreter.cpp @@ -129,7 +129,7 @@ void A_MSR_IMM(ARM* cpu) if (cpu->CPSR & 0x20) [[unlikely]] { - if (cpu->Num == 0) cpu->NextInstr[1] &= 0xFFFF; // checkme: probably not the right way to handle this + if (cpu->Num == 0) cpu->R[15] += 2; // pc should actually increment by 4 one more time after switching to thumb mode without a pipeline flush, this gets the same effect. else { Platform::Log(Platform::LogLevel::Warn, "UNIMPLEMENTED: MSR REG T bit change on ARM7\n"); @@ -199,7 +199,7 @@ void A_MSR_REG(ARM* cpu) if (cpu->CPSR & 0x20) [[unlikely]] { - if (cpu->Num == 0) cpu->NextInstr[1] &= 0xFFFF; // checkme: probably not the right way to handle this + if (cpu->Num == 0) cpu->R[15] += 2; // pc should actually increment by 4 one more time after switching to thumb mode without a pipeline flush, this gets the same effect. 
else { Platform::Log(Platform::LogLevel::Warn, "UNIMPLEMENTED: MSR REG T bit change on ARM7\n"); From 676f471ebe788120efebc69271f58d5e2af2564b Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Fri, 8 Nov 2024 01:36:14 -0500 Subject: [PATCH 213/306] fix edge case with thumb prefetch aborts --- src/ARM.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ARM.cpp b/src/ARM.cpp index 7f5d2e86..beefc132 100644 --- a/src/ARM.cpp +++ b/src/ARM.cpp @@ -667,7 +667,7 @@ void ARMv5::Execute() if (IRQ && !(CPSR & 0x80)) TriggerIRQ(); - else if (CurInstr & ((u64)1<<63)) [[unlikely]] // handle aborted instructions + else if (CurInstr > 0xFFFFFFFF) [[unlikely]] // handle aborted instructions { PrefetchAbort(); } From 9f8cf8dad20d9c65fbf458ff610ec37361e6f3fc Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Sat, 9 Nov 2024 14:49:34 -0500 Subject: [PATCH 214/306] ldm base writeback fails with r15 --- src/ARMInterpreter_LoadStore.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/ARMInterpreter_LoadStore.cpp b/src/ARMInterpreter_LoadStore.cpp index 9dc14ea4..77628d7d 100644 --- a/src/ARMInterpreter_LoadStore.cpp +++ b/src/ARMInterpreter_LoadStore.cpp @@ -568,7 +568,7 @@ void A_LDM(ARM* cpu) } // writeback to base - if (cpu->CurInstr & (1<<21)) + if (cpu->CurInstr & (1<<21) && (baseid != 15)) { // post writeback if (cpu->CurInstr & (1<<23)) @@ -624,7 +624,7 @@ void A_STM(ARM* cpu) base -= 4; } - if (cpu->CurInstr & (1<<21)) + if ((cpu->CurInstr & (1<<21)) && (baseid != 15)) cpu->R[baseid] = base; preinc = !preinc; @@ -681,7 +681,7 @@ void A_STM(ARM* cpu) return; } - if ((cpu->CurInstr & (1<<23)) && (cpu->CurInstr & (1<<21))) + if ((cpu->CurInstr & (1<<23)) && (cpu->CurInstr & (1<<21)) && (baseid != 15)) cpu->R[baseid] = base; From e4dd913ba3a1151fca3cba1ab76ad386a85eef58 Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: 
Sat, 9 Nov 2024 15:38:08 -0500 Subject: [PATCH 215/306] arm7 RORs unaligned ldr(s)h ty mgba discord --- src/ARMInterpreter_LoadStore.cpp | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/ARMInterpreter_LoadStore.cpp b/src/ARMInterpreter_LoadStore.cpp index 77628d7d..80f82755 100644 --- a/src/ARMInterpreter_LoadStore.cpp +++ b/src/ARMInterpreter_LoadStore.cpp @@ -100,7 +100,10 @@ void LoadSingle(ARM* cpu, u8 rd, u8 rn, s32 offset) return; } if constexpr (size == 8 && signror) val = (s32)(s8)val; - if constexpr (size == 16 && signror) val = (s32)(s16)val; + + if constexpr (size == 16) if (cpu->Num == 1) val = ROR(val, ((addr&0x1)<<3)); // unaligned 16 bit loads are ROR'd on arm7 + if constexpr (size == 16 && signror) val = (s32)(((cpu->Num == 1) && (addr & 1)) ? (s8)val : (s16)val); // sign extend like a ldrsb if we ror'd the value. + if constexpr (size == 32 && signror) val = ROR(val, ((addr&0x3)<<3)); if constexpr (writeback != Writeback::None) cpu->R[rn] += offset; From bdc315198f302ed03fdaf435e58e42b1095e4366 Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Sat, 9 Nov 2024 16:12:19 -0500 Subject: [PATCH 216/306] T_LDR_SPREL does ROR + misc cleanup --- src/ARMInterpreter_LoadStore.cpp | 31 +++++++++++++++++++------------ 1 file changed, 19 insertions(+), 12 deletions(-) diff --git a/src/ARMInterpreter_LoadStore.cpp b/src/ARMInterpreter_LoadStore.cpp index 80f82755..4cd9a8fb 100644 --- a/src/ARMInterpreter_LoadStore.cpp +++ b/src/ARMInterpreter_LoadStore.cpp @@ -66,7 +66,7 @@ enum class Writeback Trans, }; -template +template void LoadSingle(ARM* cpu, u8 rd, u8 rn, s32 offset) { static_assert((size == 8) || (size == 16) || (size == 32), "dummy this function only takes 8/16/32 for size!!!"); @@ -99,14 +99,21 @@ void LoadSingle(ARM* cpu, u8 rd, u8 rn, s32 offset) ((ARMv5*)cpu)->DataAbort(); return; } - if constexpr (size == 8 && signror) val = (s32)(s8)val; + if constexpr (size == 8 && signextend) val = 
(s32)(s8)val; - if constexpr (size == 16) if (cpu->Num == 1) val = ROR(val, ((addr&0x1)<<3)); // unaligned 16 bit loads are ROR'd on arm7 - if constexpr (size == 16 && signror) val = (s32)(((cpu->Num == 1) && (addr & 1)) ? (s8)val : (s16)val); // sign extend like a ldrsb if we ror'd the value. + if constexpr (size == 16) + { + if (cpu->Num == 1) val = ROR(val, ((addr&0x1)<<3)); // unaligned 16 bit loads are ROR'd on arm7 + { + if constexpr (signextend) val = (s32)((addr&0x1) ? (s8)val : (s16)val); // sign extend like a ldrsb if we ror'd the value. + } + else if constexpr (signextend) val = (s32)(s16)val; + } - if constexpr (size == 32 && signror) val = ROR(val, ((addr&0x3)<<3)); + if constexpr (size == 32) val = ROR(val, ((addr&0x3)<<3)); - if constexpr (writeback != Writeback::None) cpu->R[rn] += offset; + + if constexpr (writeback != Writeback::None) cpu->R[rn] = addr; if (rd == 15) { @@ -173,12 +180,12 @@ void StoreSingle(ARM* cpu, u8 rd, u8 rn, s32 offset) else StoreSingle<8, Writeback::Post>(cpu, ((cpu->CurInstr>>12) & 0xF), ((cpu->CurInstr>>16) & 0xF), offset); #define A_LDR \ - if (cpu->CurInstr & (1<<21)) LoadSingle(cpu, ((cpu->CurInstr>>12) & 0xF), ((cpu->CurInstr>>16) & 0xF), offset); \ - else LoadSingle(cpu, ((cpu->CurInstr>>12) & 0xF), ((cpu->CurInstr>>16) & 0xF), offset); + if (cpu->CurInstr & (1<<21)) LoadSingle(cpu, ((cpu->CurInstr>>12) & 0xF), ((cpu->CurInstr>>16) & 0xF), offset); \ + else LoadSingle(cpu, ((cpu->CurInstr>>12) & 0xF), ((cpu->CurInstr>>16) & 0xF), offset); #define A_LDR_POST \ - if (cpu->CurInstr & (1<<21)) LoadSingle(cpu, ((cpu->CurInstr>>12) & 0xF), ((cpu->CurInstr>>16) & 0xF), offset); \ - else LoadSingle(cpu, ((cpu->CurInstr>>12) & 0xF), ((cpu->CurInstr>>16) & 0xF), offset); + if (cpu->CurInstr & (1<<21)) LoadSingle(cpu, ((cpu->CurInstr>>12) & 0xF), ((cpu->CurInstr>>16) & 0xF), offset); \ + else LoadSingle(cpu, ((cpu->CurInstr>>12) & 0xF), ((cpu->CurInstr>>16) & 0xF), offset); #define A_LDRB \ if (cpu->CurInstr & (1<<21)) 
LoadSingle(cpu, ((cpu->CurInstr>>12) & 0xF), ((cpu->CurInstr>>16) & 0xF), offset); \ @@ -723,7 +730,7 @@ void T_STRB_REG(ARM* cpu) void T_LDR_REG(ARM* cpu) { - LoadSingle(cpu, (cpu->CurInstr & 0x7), ((cpu->CurInstr >> 3) & 0x7), cpu->R[(cpu->CurInstr >> 6) & 0x7]); + LoadSingle(cpu, (cpu->CurInstr & 0x7), ((cpu->CurInstr >> 3) & 0x7), cpu->R[(cpu->CurInstr >> 6) & 0x7]); } void T_LDRB_REG(ARM* cpu) @@ -760,7 +767,7 @@ void T_STR_IMM(ARM* cpu) void T_LDR_IMM(ARM* cpu) { - LoadSingle(cpu, (cpu->CurInstr & 0x7), ((cpu->CurInstr >> 3) & 0x7), ((cpu->CurInstr >> 4) & 0x7C)); + LoadSingle(cpu, (cpu->CurInstr & 0x7), ((cpu->CurInstr >> 3) & 0x7), ((cpu->CurInstr >> 4) & 0x7C)); } void T_STRB_IMM(ARM* cpu) From ec241a822428392d4558245b9fc3bdc6aed148e5 Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Sat, 9 Nov 2024 16:18:48 -0500 Subject: [PATCH 217/306] im smrat :D --- src/ARMInterpreter_LoadStore.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/ARMInterpreter_LoadStore.cpp b/src/ARMInterpreter_LoadStore.cpp index 4cd9a8fb..97bef0b0 100644 --- a/src/ARMInterpreter_LoadStore.cpp +++ b/src/ARMInterpreter_LoadStore.cpp @@ -99,12 +99,14 @@ void LoadSingle(ARM* cpu, u8 rd, u8 rn, s32 offset) ((ARMv5*)cpu)->DataAbort(); return; } + if constexpr (size == 8 && signextend) val = (s32)(s8)val; if constexpr (size == 16) { - if (cpu->Num == 1) val = ROR(val, ((addr&0x1)<<3)); // unaligned 16 bit loads are ROR'd on arm7 + if (cpu->Num == 1) { + val = ROR(val, ((addr&0x1)<<3)); // unaligned 16 bit loads are ROR'd on arm7 if constexpr (signextend) val = (s32)((addr&0x1) ? (s8)val : (s16)val); // sign extend like a ldrsb if we ror'd the value. 
} else if constexpr (signextend) val = (s32)(s16)val; From fce0555a09283f7d5fdf6f195b4a9d8d2088b484 Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Sat, 9 Nov 2024 22:07:17 -0500 Subject: [PATCH 218/306] slightly fix error in writeback handling --- src/ARMInterpreter_LoadStore.cpp | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/ARMInterpreter_LoadStore.cpp b/src/ARMInterpreter_LoadStore.cpp index 97bef0b0..159fc86f 100644 --- a/src/ARMInterpreter_LoadStore.cpp +++ b/src/ARMInterpreter_LoadStore.cpp @@ -115,6 +115,7 @@ void LoadSingle(ARM* cpu, u8 rd, u8 rn, s32 offset) if constexpr (size == 32) val = ROR(val, ((addr&0x3)<<3)); + if constexpr (writeback >= Writeback::Post) addr += offset; if constexpr (writeback != Writeback::None) cpu->R[rn] = addr; if (rd == 15) @@ -160,8 +161,9 @@ void StoreSingle(ARM* cpu, u8 rd, u8 rn, s32 offset) ((ARMv5*)cpu)->DataAbort(); return; } - - if constexpr (writeback != Writeback::None) cpu->R[rn] += offset; + + if constexpr (writeback >= Writeback::Post) addr += offset; + if constexpr (writeback != Writeback::None) cpu->R[rn] = addr; } From 9d92b8708a2b805dbefe75fbf59612ec996d9f8d Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Sun, 10 Nov 2024 02:56:16 -0500 Subject: [PATCH 219/306] r15 writeback is very weird with ldr/str --- src/ARMInterpreter_Branch.cpp | 2 +- src/ARMInterpreter_LoadStore.cpp | 30 ++++++++++++++++++++++++++---- 2 files changed, 27 insertions(+), 5 deletions(-) diff --git a/src/ARMInterpreter_Branch.cpp b/src/ARMInterpreter_Branch.cpp index 88b14ab7..5731a0b6 100644 --- a/src/ARMInterpreter_Branch.cpp +++ b/src/ARMInterpreter_Branch.cpp @@ -104,7 +104,7 @@ void T_BL_LONG_1(ARM* cpu) void T_BL_LONG_2(ARM* cpu) { - if ((cpu->CurInstr & 0x1801) == 0x0801) // "BLX" with bit 0 set is an unvalid instruction. + if ((cpu->CurInstr & 0x1801) == 0x0801) // "BLX" with bit 0 set is an undefined instruction. 
return T_UNK(cpu); // TODO: Check ARM7 for exceptions s32 offset = (cpu->CurInstr & 0x7FF) << 1; diff --git a/src/ARMInterpreter_LoadStore.cpp b/src/ARMInterpreter_LoadStore.cpp index 159fc86f..a2c9d7cc 100644 --- a/src/ARMInterpreter_LoadStore.cpp +++ b/src/ARMInterpreter_LoadStore.cpp @@ -67,7 +67,7 @@ enum class Writeback }; template -void LoadSingle(ARM* cpu, u8 rd, u8 rn, s32 offset) +void LoadSingle(ARM* cpu, const u8 rd, const u8 rn, const s32 offset) { static_assert((size == 8) || (size == 16) || (size == 32), "dummy this function only takes 8/16/32 for size!!!"); @@ -116,7 +116,19 @@ void LoadSingle(ARM* cpu, u8 rd, u8 rn, s32 offset) if constexpr (writeback >= Writeback::Post) addr += offset; - if constexpr (writeback != Writeback::None) cpu->R[rn] = addr; + if constexpr (writeback != Writeback::None) + { + if (rn != 15) [[likely]] // r15 writeback fails on arm9 + { + cpu->R[rn] = addr; + } + else if (cpu->Num == 1) // arm 7 + { + // note that at no point does it actually write the value it loaded to a register... 
+ cpu->JumpTo((addr+4) & ~1); + return; + } + } if (rd == 15) { @@ -127,7 +139,7 @@ void LoadSingle(ARM* cpu, u8 rd, u8 rn, s32 offset) } template -void StoreSingle(ARM* cpu, u8 rd, u8 rn, s32 offset) +void StoreSingle(ARM* cpu, const u8 rd, const u8 rn, const s32 offset) { static_assert((size == 8) || (size == 16) || (size == 32), "dummy this function only takes 8/16/32 for size!!!"); @@ -163,7 +175,17 @@ void StoreSingle(ARM* cpu, u8 rd, u8 rn, s32 offset) } if constexpr (writeback >= Writeback::Post) addr += offset; - if constexpr (writeback != Writeback::None) cpu->R[rn] = addr; + if constexpr (writeback != Writeback::None) + { + if (rn != 15) [[likely]] // r15 writeback fails on arm9 + { + cpu->R[rn] = addr; + } + else if (cpu->Num == 1) // arm 7 + { + cpu->JumpTo(addr & ~1); + } + } } From 28d788f26d03e1893008dcc1098419752b842dda Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Tue, 12 Nov 2024 10:56:54 -0500 Subject: [PATCH 220/306] fix build --- src/ARMInterpreter_LoadStore.cpp | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/src/ARMInterpreter_LoadStore.cpp b/src/ARMInterpreter_LoadStore.cpp index 4beb027f..0a4f7224 100644 --- a/src/ARMInterpreter_LoadStore.cpp +++ b/src/ARMInterpreter_LoadStore.cpp @@ -148,7 +148,7 @@ void LoadSingle(ARM* cpu, const u8 rd, const u8 rn, const s32 offset, const u16 if (rd == 15) { if (cpu->Num==1 || (((ARMv5*)cpu)->CP15Control & (1<<15))) val &= ~0x1; - if (cpu->Num==0) cpu->NDS.ARM9Timestamp = ((ARMv5*)cpu)->TimestampActual + ((size<32) || (signror && (addr&0x3))); // force an interlock + if (cpu->Num==0) cpu->NDS.ARM9Timestamp = ((ARMv5*)cpu)->TimestampActual + ((size<32) || (addr&0x3)); // force an interlock cpu->JumpTo(val); } @@ -158,7 +158,7 @@ void LoadSingle(ARM* cpu, const u8 rd, const u8 rn, const s32 offset, const u16 if (cpu->Num == 0) { ((ARMv5*)cpu)->ILCurrReg = rd; - bool extra = ((size < 32) || (signror && (addr&0x3))); + bool extra = 
((size < 32) || (addr&0x3)); ((ARMv5*)cpu)->ILCurrTime = ((ARMv5*)cpu)->TimestampActual + extra; } } @@ -237,12 +237,12 @@ void StoreSingle(ARM* cpu, const u8 rd, const u8 rn, const s32 offset, const u16 else StoreSingle<8, Writeback::Post, true>(cpu, ((cpu->CurInstr>>12) & 0xF), ((cpu->CurInstr>>16) & 0xF), offset, ilmask); #define A_LDR \ - if (cpu->CurInstr & (1<<21)) LoadSingle(cpu, ((cpu->CurInstr>>12) & 0xF), ((cpu->CurInstr>>16) & 0xF), offset); \ - else LoadSingle(cpu, ((cpu->CurInstr>>12) & 0xF), ((cpu->CurInstr>>16) & 0xF), offset); + if (cpu->CurInstr & (1<<21)) LoadSingle(cpu, ((cpu->CurInstr>>12) & 0xF), ((cpu->CurInstr>>16) & 0xF), offset, ilmask); \ + else LoadSingle(cpu, ((cpu->CurInstr>>12) & 0xF), ((cpu->CurInstr>>16) & 0xF), offset, ilmask); #define A_LDR_POST \ - if (cpu->CurInstr & (1<<21)) LoadSingle(cpu, ((cpu->CurInstr>>12) & 0xF), ((cpu->CurInstr>>16) & 0xF), offset); \ - else LoadSingle(cpu, ((cpu->CurInstr>>12) & 0xF), ((cpu->CurInstr>>16) & 0xF), offset); + if (cpu->CurInstr & (1<<21)) LoadSingle(cpu, ((cpu->CurInstr>>12) & 0xF), ((cpu->CurInstr>>16) & 0xF), offset, ilmask); \ + else LoadSingle(cpu, ((cpu->CurInstr>>12) & 0xF), ((cpu->CurInstr>>16) & 0xF), offset, ilmask); #define A_LDRB \ if (cpu->CurInstr & (1<<21)) LoadSingle(cpu, ((cpu->CurInstr>>12) & 0xF), ((cpu->CurInstr>>16) & 0xF), offset, ilmask); \ @@ -858,7 +858,7 @@ void T_STRB_REG(ARM* cpu) void T_LDR_REG(ARM* cpu) { - LoadSingle(cpu, (cpu->CurInstr & 0x7), ((cpu->CurInstr >> 3) & 0x7), cpu->R[(cpu->CurInstr >> 6) & 0x7]); + LoadSingle(cpu, (cpu->CurInstr & 0x7), ((cpu->CurInstr >> 3) & 0x7), cpu->R[(cpu->CurInstr >> 6) & 0x7], (1 << ((cpu->CurInstr >> 6) & 0x7))); } void T_LDRB_REG(ARM* cpu) @@ -895,7 +895,7 @@ void T_STR_IMM(ARM* cpu) void T_LDR_IMM(ARM* cpu) { - LoadSingle(cpu, (cpu->CurInstr & 0x7), ((cpu->CurInstr >> 3) & 0x7), ((cpu->CurInstr >> 4) & 0x7C)); + LoadSingle(cpu, (cpu->CurInstr & 0x7), ((cpu->CurInstr >> 3) & 0x7), ((cpu->CurInstr >> 4) & 0x7C), 0); } 
void T_STRB_IMM(ARM* cpu) From 4099823af7862f6c79592174ab3615311b563409 Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Tue, 12 Nov 2024 16:58:10 -0500 Subject: [PATCH 221/306] more write buffer work --- src/ARM.cpp | 1 + src/ARM.h | 3 +- src/ARMInterpreter.cpp | 3 +- src/CP15.cpp | 85 ++++++++++++++++++++++++++---------------- 4 files changed, 58 insertions(+), 34 deletions(-) diff --git a/src/ARM.cpp b/src/ARM.cpp index 83d58217..75dde763 100644 --- a/src/ARM.cpp +++ b/src/ARM.cpp @@ -218,6 +218,7 @@ void ARMv5::Reset() WBReleaseTS = 0; WBLastRegion = Mem9_Null; WBWriting = false; + WBInitialTS = 0; ARM::Reset(); } diff --git a/src/ARM.h b/src/ARM.h index d45d87b6..2a002df9 100644 --- a/src/ARM.h +++ b/src/ARM.h @@ -681,7 +681,7 @@ public: u8 WBWritePointer; // which entry to attempt to write next; should always be ANDed with 0xF after incrementing u8 WBFillPointer; // where the next entry should be added; should always be ANDed with 0xF after incrementing - bool WBWriting; // whether the buffer is actively trying to perform a write + u8 WBWriting; // whether the buffer is actively trying to perform a write u32 WBCurAddr; // address the write buffer is currently writing to u64 WBCurVal; // current value being written; 0-31: val | 61-63: flag; 0 = byte ns; 1 = halfword ns; 2 = word ns; 3 = word s; 4 = address (invalid in this variable) u32 storeaddr[16]; // temp until i figure out why using the fifo address entries directly didn't work @@ -691,6 +691,7 @@ public: u64 WBDelay; // timestamp in bus cycles use for the delay before next write to the write buffer can occur (seems to be a 1 cycle delay after a write to it) u32 WBLastRegion; // the last region written to by the write buffer u64 WBReleaseTS; // the timestamp on which the write buffer relinquished control of the bus back + u64 WBInitialTS; // what cycle the entry was first sent in #ifdef GDBSTUB_ENABLED u32 ReadMem(u32 addr, int size) override; diff --git 
a/src/ARMInterpreter.cpp b/src/ARMInterpreter.cpp index d2a8a9c7..7b8f93b0 100644 --- a/src/ARMInterpreter.cpp +++ b/src/ARMInterpreter.cpp @@ -293,7 +293,8 @@ void A_MCR(ARM* cpu) return A_UNK(cpu); // TODO: check what kind of exception it really is } - cpu->AddCycles_CI(1 + 1); // TODO: checkme + if (cpu->Num==0) cpu->AddCycles_CI(6); // checkme + else /* ARM7 */ cpu->AddCycles_CI(1 + 1); // TODO: checkme } void A_MRC(ARM* cpu) diff --git a/src/CP15.cpp b/src/CP15.cpp index fe79bad8..6c4af0ac 100644 --- a/src/CP15.cpp +++ b/src/CP15.cpp @@ -780,7 +780,8 @@ u32 ARMv5::DCacheLookup(const u32 addr) else { DataRegion = NDS.ARM9Regions[addr>>14]; - if ((NDS.ARM9Timestamp <= WBTimestamp-1) && (DataRegion == WBLastRegion)) // check write buffer + if (((NDS.ARM9Timestamp <= WBReleaseTS) && (NDS.ARM9Regions[addr>>14] == WBLastRegion)) // check write buffer + || (Store && (NDS.ARM9Regions[addr>>14] == DataRegion))) //check the actual store NDS.ARM9Timestamp += 1<>14] == Mem9_MainRAM) && (((MainRAMTimestamp + ((1< WBTimestamp)) + WBTimestamp = (MainRAMTimestamp + ((1<> 61) != 3) && (NDS.ARM9Regions[WBCurAddr>>14] == Mem9_MainRAM)) - ts = std::max(WBTimestamp, MainRAMTimestamp); - else - ts = WBTimestamp; - - if ((WBCurVal >> 61) != 3) ts = (ts + ((1< NDS.ARM9Timestamp) return true; if ( force && ts > NDS.ARM9Timestamp) @@ -1245,6 +1241,7 @@ inline bool ARMv5::WriteBufferHandle() WBTimestamp = ts; if (NDS.ARM9Regions[WBCurAddr>>14] == Mem9_MainRAM) MainRAMTimestamp += 2 << NDS.ARM9ClockShift; } + WBInitialTS = WBTimestamp; switch (WBCurVal >> 61) { @@ -1275,11 +1272,15 @@ inline bool ARMv5::WriteBufferHandle() { if (NDS.ARM9Regions[storeaddr[WBWritePointer]>>14] == Mem9_MainRAM) // main ram handling { - if (!force && (MainRAMTimestamp > NDS.ARM9Timestamp)) return true; - if ( force && (MainRAMTimestamp > NDS.ARM9Timestamp)) - NDS.ARM9Timestamp = MainRAMTimestamp; + if (!force && (WBTimestamp > NDS.ARM9Timestamp)) return true; + if ( force && (WBTimestamp > 
NDS.ARM9Timestamp)) + NDS.ARM9Timestamp = WBTimestamp; + + WBTimestamp = std::max(MainRAMTimestamp, WBTimestamp); } + WBTimestamp = (WBTimestamp + ((1<()); // loop until we've cleared out all writeable entries - if constexpr (next == 1) // check if the next write is occuring + if constexpr (next == 1 || next == 3) // check if the next write is occuring { - if (WBWriting) + if (NDS.ARM9Timestamp >= WBInitialTS)// + (NDS.ARM9Regions[WBCurAddr>>14] == Mem9_MainRAM)))// || ((NDS.ARM9Regions[WBCurAddr>>14] == Mem9_MainRAM) && WBWriting)) { + u64 tsold = NDS.ARM9Timestamp; while(!WriteBufferHandle<2>()); + + //if constexpr (next == 3) NDS.ARM9Timestamp = std::max(tsold, NDS.ARM9Timestamp - (2<()); + //if (NDS.ARM9Timestamp >= WBInitialTS) + while(!WriteBufferHandle<2>()); } } +template void ARMv5::WriteBufferCheck<3>(); template void ARMv5::WriteBufferCheck<2>(); template void ARMv5::WriteBufferCheck<1>(); template void ARMv5::WriteBufferCheck<0>(); @@ -1326,10 +1331,14 @@ void ARMv5::WriteBufferWrite(u32 val, u8 flag, u32 addr) if (WBFillPointer == WBWritePointer) // if the write buffer is full then we stall the cpu until room is made WriteBufferHandle<1>(); - else if (WBWritePointer == 16) // indicates empty write buffer + else if (WBWritePointer == 16 && (flag != 4)) // indicates empty write buffer { WBWritePointer = 0; - if (!WBWriting && (WBTimestamp < (NDS.ARM9Timestamp+1))) WBTimestamp = (NDS.ARM9Timestamp+1); + u64 ts = (NDS.ARM9Regions[addr>>14] == Mem9_MainRAM) ? 
std::max(MainRAMTimestamp, NDS.ARM9Timestamp + 1) : NDS.ARM9Timestamp + 1; + + if (!WBWriting && (WBTimestamp < ((ts + ((1<>12] & 0x30) WriteBufferDrain(); else - WriteBufferCheck<1>(); + WriteBufferCheck<3>(); NDS.ARM9Timestamp = NDS.ARM9Timestamp + ((1<>14]; - WBTimestamp = NDS.ARM9Timestamp; + if (WBTimestamp < ((NDS.ARM9Timestamp + DataCycles + ((1<>14]; - WBTimestamp = NDS.ARM9Timestamp; + if (WBTimestamp < ((NDS.ARM9Timestamp + DataCycles + ((1<>14]; - WBTimestamp = NDS.ARM9Timestamp; + if (WBTimestamp < ((NDS.ARM9Timestamp + DataCycles + ((1<>14][1]; if ((addr >> 24) == 0x02) @@ -2851,7 +2871,8 @@ bool ARMv5::DataWrite32S(u32 addr, u32 val) else DataRegion = NDS.ARM9Regions[addr>>14]; } - WBTimestamp = NDS.ARM9Timestamp; + if (WBTimestamp < ((NDS.ARM9Timestamp + DataCycles + ((1< Date: Wed, 13 Nov 2024 20:57:00 -0500 Subject: [PATCH 222/306] some fixes to write buffer and icache prefetch --- src/CP15.cpp | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/src/CP15.cpp b/src/CP15.cpp index 6c4af0ac..085ba6fa 100644 --- a/src/CP15.cpp +++ b/src/CP15.cpp @@ -1331,14 +1331,18 @@ void ARMv5::WriteBufferWrite(u32 val, u8 flag, u32 addr) if (WBFillPointer == WBWritePointer) // if the write buffer is full then we stall the cpu until room is made WriteBufferHandle<1>(); - else if (WBWritePointer == 16 && (flag != 4)) // indicates empty write buffer + else if (WBWritePointer == 16) // indicates empty write buffer { WBWritePointer = 0; - u64 ts = (NDS.ARM9Regions[addr>>14] == Mem9_MainRAM) ? std::max(MainRAMTimestamp, NDS.ARM9Timestamp + 1) : NDS.ARM9Timestamp + 1; + if (!WBWriting) + { + u64 ts = ((NDS.ARM9Regions[addr>>14] == Mem9_MainRAM) ? 
std::max(MainRAMTimestamp, (NDS.ARM9Timestamp + 1)) : (NDS.ARM9Timestamp + 1)); - if (!WBWriting && (WBTimestamp < ((ts + ((1< Date: Sun, 17 Nov 2024 08:55:39 -0500 Subject: [PATCH 223/306] improvements to dma --- src/DMA.cpp | 62 ++++++++++++++++++++++++++++----------------- src/DMA.h | 9 +++++-- src/DMA_Timings.cpp | 12 ++++----- src/NDS.h | 2 +- 4 files changed, 53 insertions(+), 32 deletions(-) diff --git a/src/DMA.cpp b/src/DMA.cpp index 80cd592c..56ec1564 100644 --- a/src/DMA.cpp +++ b/src/DMA.cpp @@ -187,16 +187,20 @@ void DMA::Start() // TODO eventually: not stop if we're running code in ITCM - Running = 2; + Running = 3; // safety measure MRAMBurstTable = DMATiming::MRAMDummy; InProgress = true; NDS.StopCPU(CPU, 1<> 14; u32 dst_id = CurDstAddr >> 14; @@ -213,11 +217,13 @@ u32 DMA::UnitTimings9_16(bool burststart) if (src_rgn == Mem9_MainRAM) { if (dst_rgn == Mem9_MainRAM) - return 16; + { + return (burststart == 2) ? 11 : 16; + } if (SrcAddrInc > 0) { - if (burststart || MRAMBurstTable[MRAMBurstCount] == 0) + if ((burststart == 2) || MRAMBurstTable[MRAMBurstCount] == 0) { MRAMBurstCount = 0; @@ -239,14 +245,14 @@ u32 DMA::UnitTimings9_16(bool burststart) { // TODO: not quite right for GBA slot return (((CurSrcAddr & 0x1F) == 0x1E) ? 7 : 8) + - (burststart ? dst_n : dst_s); + ((burststart == 2) ? dst_n : dst_s); } } else if (dst_rgn == Mem9_MainRAM) { if (DstAddrInc > 0) { - if (burststart || MRAMBurstTable[MRAMBurstCount] == 0) + if ((burststart == 2) || MRAMBurstTable[MRAMBurstCount] == 0) { MRAMBurstCount = 0; @@ -266,23 +272,26 @@ u32 DMA::UnitTimings9_16(bool burststart) } else { - return (burststart ? src_n : src_s) + 7; + return ((burststart == 2) ? 
src_n : src_s) + 7; } } else if (src_rgn & dst_rgn) { - return src_n + dst_n + 1; + if (burststart != 1) + return src_n + dst_n + (src_n == 1 || burststart <= 0); + else + return src_n + dst_n + (src_n != 1); } else { - if (burststart) - return src_n + dst_n; + if (burststart == 2) + return src_n + dst_n + (src_n == 1); else return src_s + dst_s; } } -u32 DMA::UnitTimings9_32(bool burststart) +u32 DMA::UnitTimings9_32(u8 burststart) { u32 src_id = CurSrcAddr >> 14; u32 dst_id = CurDstAddr >> 14; @@ -299,11 +308,11 @@ u32 DMA::UnitTimings9_32(bool burststart) if (src_rgn == Mem9_MainRAM) { if (dst_rgn == Mem9_MainRAM) - return 18; + return (burststart == 2) ? 13 : 18; if (SrcAddrInc > 0) { - if (burststart || MRAMBurstTable[MRAMBurstCount] == 0) + if ((burststart == 2) || MRAMBurstTable[MRAMBurstCount] == 0) { MRAMBurstCount = 0; @@ -327,14 +336,14 @@ u32 DMA::UnitTimings9_32(bool burststart) { // TODO: not quite right for GBA slot return (((CurSrcAddr & 0x1F) == 0x1C) ? (dst_n==2 ? 7:8) : 9) + - (burststart ? dst_n : dst_s); + ((burststart == 2) ? dst_n : dst_s); } } else if (dst_rgn == Mem9_MainRAM) { if (DstAddrInc > 0) { - if (burststart || MRAMBurstTable[MRAMBurstCount] == 0) + if ((burststart == 2) || MRAMBurstTable[MRAMBurstCount] == 0) { MRAMBurstCount = 0; @@ -356,17 +365,20 @@ u32 DMA::UnitTimings9_32(bool burststart) } else { - return (burststart ? src_n : src_s) + 8; + return ((burststart == 2) ? 
src_n : src_s) + 8; } } else if (src_rgn & dst_rgn) { - return src_n + dst_n + 1; + if (burststart != 1) + return src_n + dst_n + (src_n == 1 || burststart <= 0); + else + return src_n + dst_n + (src_n != 1); } else { - if (burststart) - return src_n + dst_n; + if (burststart == 2) + return src_n + dst_n + (src_n == 1); else return src_s + dst_s; } @@ -557,15 +569,17 @@ void DMA::Run9() Executing = true; // add NS penalty for first accesses in burst - bool burststart = (Running == 2); - Running = 1; + int burststart = Running-1; + Running = 2; + + NDS.ARM9Timestamp = (NDS.ARM9Timestamp + ((1< 0 && !Stall) { NDS.ARM9Timestamp += (UnitTimings9_16(burststart) << NDS.ARM9ClockShift); - burststart = false; + burststart -= 1; NDS.ARM9Write16(CurDstAddr, NDS.ARM9Read16(CurSrcAddr)); @@ -582,7 +596,7 @@ void DMA::Run9() while (IterCount > 0 && !Stall) { NDS.ARM9Timestamp += (UnitTimings9_32(burststart) << NDS.ARM9ClockShift); - burststart = false; + burststart -= 1; NDS.ARM9Write32(CurDstAddr, NDS.ARM9Read32(CurSrcAddr)); @@ -595,6 +609,8 @@ void DMA::Run9() } } + if (burststart == 1) Running = 1; + Executing = false; Stall = false; diff --git a/src/DMA.h b/src/DMA.h index 354f4495..64d5647f 100644 --- a/src/DMA.h +++ b/src/DMA.h @@ -40,8 +40,8 @@ public: void WriteCnt(u32 val); void Start(); - u32 UnitTimings9_16(bool burststart); - u32 UnitTimings9_32(bool burststart); + u32 UnitTimings9_16(u8 burststart); + u32 UnitTimings9_32(u8 burststart); u32 UnitTimings7_16(bool burststart); u32 UnitTimings7_32(bool burststart); @@ -73,6 +73,11 @@ public: if (Executing) Stall = true; } + void ResetBurst() + { + if (Running > 0) Running = (CPU ? 
2 : 3); + } + u32 SrcAddr {}; u32 DstAddr {}; u32 Cnt {}; diff --git a/src/DMA_Timings.cpp b/src/DMA_Timings.cpp index a51fedfb..02539a62 100644 --- a/src/DMA_Timings.cpp +++ b/src/DMA_Timings.cpp @@ -48,7 +48,7 @@ extern const std::array MRAMDummy = {0}; extern const std::array MRAMRead16Bursts[] = { // main RAM to regular 16bit or 32bit bus (similar) - {7, 3, 2, 2, 2, 2, 2, 2, 2, 2, + {6, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, @@ -60,7 +60,7 @@ extern const std::array MRAMRead16Bursts[] = 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, - 7, 3, 2, 2, 2, 2, 2, 2, 2, 2, + 6, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, @@ -72,7 +72,7 @@ extern const std::array MRAMRead16Bursts[] = 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, - 7, 3, + 6, 3, 0}, // main RAM to GBA/wifi, seq=4 {8, 6, 5, 5, 5, 5, 5, 5, 5, 5, @@ -181,7 +181,7 @@ extern const std::array MRAMRead32Bursts[] = extern const std::array MRAMWrite16Bursts[] = { // regular 16bit or 32bit bus to main RAM (similar) - {8, 2, 2, 2, 2, 2, 2, 2, 2, 2, + {5, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, @@ -212,7 +212,7 @@ extern const std::array MRAMWrite16Bursts[] = extern const std::array MRAMWrite32Bursts[4] = { // regular 16bit bus to main RAM - {9, 4, 4, 4, 4, 4, 4, 4, 4, 4, + {6, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, @@ -220,7 +220,7 @@ extern const std::array MRAMWrite32Bursts[4] = 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 0}, // regular 32bit bus to main RAM - {9, 3, 3, 3, 3, 3, 3, 3, 3, 3, + {6, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, diff --git a/src/NDS.h b/src/NDS.h 
index e23b1f27..c1f0ff88 100644 --- a/src/NDS.h +++ b/src/NDS.h @@ -309,6 +309,7 @@ public: // TODO: Encapsulate the rest of these members GBACart::GBACartSlot GBACartSlot; melonDS::GPU GPU; melonDS::AREngine AREngine; + DMA DMAs[8]; #ifdef JIT_ENABLED bool IsJITEnabled(){return EnableJIT;}; @@ -494,7 +495,6 @@ private: u16 WifiWaitCnt; u8 TimerCheckMask[2]; u64 TimerTimestamp[2]; - DMA DMAs[8]; u32 DMA9Fill[4]; u16 IPCSync9, IPCSync7; u16 IPCFIFOCnt9, IPCFIFOCnt7; From 172a79cda86ca3bb60fdc06d6dc7109209f84fa1 Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Sun, 17 Nov 2024 09:12:32 -0500 Subject: [PATCH 224/306] hacky stall for cache streaming+wb during dmas --- src/NDS.cpp | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/src/NDS.cpp b/src/NDS.cpp index 591c22a0..df487ff6 100644 --- a/src/NDS.cpp +++ b/src/NDS.cpp @@ -957,6 +957,7 @@ u32 NDS::RunFrame() } else if (CPUStop & CPUStop_DMA9) { + u64 ts = ARM9Timestamp; DMAs[0].Run(); if (!(CPUStop & CPUStop_GXStall)) DMAs[1].Run(); if (!(CPUStop & CPUStop_GXStall)) DMAs[2].Run(); @@ -966,6 +967,14 @@ u32 NDS::RunFrame() auto& dsi = dynamic_cast(*this); dsi.RunNDMAs(0); } + ts = ARM9Timestamp - ts; + for (int i = 0; i < 7; i++) + { + ARM9.ICacheFillTimes[i] += ts; + ARM9.DCacheFillTimes[i] += ts; + } + ARM9.WBTimestamp += ts; + } else { From 0f20230a90c3150194b8ffc71960dd154546eac6 Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Sun, 1 Dec 2024 15:03:14 -0500 Subject: [PATCH 225/306] fix cache streaming --- src/CP15.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/CP15.cpp b/src/CP15.cpp index 085ba6fa..92b41e25 100644 --- a/src/CP15.cpp +++ b/src/CP15.cpp @@ -553,7 +553,7 @@ u32 ARMv5::ICacheLookup(const u32 addr) u8 linepos = (addr & 0x1F) >> 2; // technically this is one too low, but we want that actually - u32 cycles = ns + (seq * linepos); + u64 cycles = ns + (seq * linepos); NDS.ARM9Timestamp = 
cycles += NDS.ARM9Timestamp; if (NDS.ARM9Timestamp < TimestampActual) NDS.ARM9Timestamp = TimestampActual; @@ -805,7 +805,7 @@ u32 ARMv5::DCacheLookup(const u32 addr) u8 linepos = (addr & 0x1F) >> 2; // technically this is one too low, but we want that actually - u32 cycles = ns + (seq * linepos); + u64 cycles = ns + (seq * linepos); DataCycles = cycles; cycles += NDS.ARM9Timestamp; From 9f049056727b5517a8fe2364c206179c60f68cb9 Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Sun, 1 Dec 2024 15:28:46 -0500 Subject: [PATCH 226/306] fix mcr --- src/ARMInterpreter.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ARMInterpreter.cpp b/src/ARMInterpreter.cpp index 7b8f93b0..0400c40a 100644 --- a/src/ARMInterpreter.cpp +++ b/src/ARMInterpreter.cpp @@ -293,7 +293,7 @@ void A_MCR(ARM* cpu) return A_UNK(cpu); // TODO: check what kind of exception it really is } - if (cpu->Num==0) cpu->AddCycles_CI(6); // checkme + if (cpu->Num==0) cpu->AddCycles_CI(5); // checkme else /* ARM7 */ cpu->AddCycles_CI(1 + 1); // TODO: checkme } From 5698cf18620278de245d11ddb568ee6f20db02cd Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Thu, 5 Dec 2024 13:29:32 -0500 Subject: [PATCH 227/306] lay ground work for main ram contention TAKE 2 alt title: ITS WORKING! ITS WORKING!!! 
--- src/ARM.cpp | 358 ++++++++++++----- src/ARM.h | 143 +++++-- src/ARMInterpreter.cpp | 61 +-- src/ARMInterpreter_ALU.cpp | 26 +- src/ARMInterpreter_LoadStore.cpp | 274 +++++++------ src/ARMJIT.cpp | 12 +- src/ARMJIT_A64/ARMJIT_Branch.cpp | 10 +- src/ARMJIT_A64/ARMJIT_LoadStore.cpp | 6 +- src/ARMJIT_x64/ARMJIT_Branch.cpp | 10 +- src/ARMJIT_x64/ARMJIT_LoadStore.cpp | 6 +- src/CP15.cpp | 599 +++++++++++++++++----------- src/CP15_Constants.h | 2 +- src/NDS.cpp | 117 +++++- src/NDS.h | 7 + 14 files changed, 1062 insertions(+), 569 deletions(-) diff --git a/src/ARM.cpp b/src/ARM.cpp index 75dde763..0bc138c2 100644 --- a/src/ARM.cpp +++ b/src/ARM.cpp @@ -194,12 +194,22 @@ void ARM::Reset() MainRAMTimestamp = 0; + memset(&MRTrack, 0, sizeof(MRTrack)); + + FuncQueueFill = 0; + FuncQueueEnd = 0; + FuncQueueProg = 0; + FuncQueueActive = false; + ExecuteCycles = 0; + // zorp JumpTo(ExceptionBase); } void ARMv5::Reset() { + FuncQueue[0] = &ARMv5::StartExec; + PU_Map = PU_PrivMap; Store = false; @@ -208,8 +218,8 @@ void ARMv5::Reset() ILCurrReg = 16; ILPrevReg = 16; - ICacheFillPtr = 7; - DCacheFillPtr = 7; + ICacheStreamPtr = 7; + DCacheStreamPtr = 7; WBWritePointer = 16; WBFillPointer = 0; @@ -313,14 +323,33 @@ void ARM::SetupCodeMem(u32 addr) } } -void ARMv5::JumpTo(u32 addr, bool restorecpsr) +void ARMv5::JumpTo(u32 addr, bool restorecpsr, u8 R15) { - if (restorecpsr) + //printf("JUMP! 
%08X %i %i\n", addr, restorecpsr, R15); + NDS.MonitorARM9Jump(addr); + + BranchRestore = restorecpsr; + BranchUpdate = R15; + BranchAddr = addr; + if (MRTrack.Type != MainRAMType::Null) FuncQueue[FuncQueueFill++] = &ARMv5::JumpTo_2; + else JumpTo_2(); +} + +void ARMv5::JumpTo_2() +{ + if (CP15Control & (1<<15)) + { + if (BranchUpdate == 1) BranchAddr = R[15] & ~1; + else if (BranchUpdate == 2) BranchAddr = R[15] | 1; + } + else if (BranchUpdate) BranchAddr = R[15]; + + if (BranchRestore) { RestoreCPSR(); - if (CPSR & 0x20) addr |= 0x1; - else addr &= ~0x1; + if (CPSR & 0x20) BranchAddr |= 0x1; + else BranchAddr &= ~0x1; } // aging cart debug crap @@ -329,47 +358,81 @@ void ARMv5::JumpTo(u32 addr, bool restorecpsr) // jumps count as nonsequential accesses on the instruction bus on the arm9 // thus it requires waiting for the current ICache line fill to complete before continuing - if (ICacheFillPtr < 7) + if (ICacheStreamPtr < 7) { - u64 fillend = ICacheFillTimes[6] + 1; + u64 fillend = ICacheStreamTimes[6] + 1; if (NDS.ARM9Timestamp < fillend) NDS.ARM9Timestamp = fillend; - ICacheFillPtr = 7; + ICacheStreamPtr = 7; } - if (addr & 0x1) + if (BranchAddr & 0x1) { - addr &= ~0x1; - R[15] = addr+2; + BranchAddr &= ~0x1; + R[15] = BranchAddr+2; + + CPSR |= 0x20; // two-opcodes-at-once fetch // doesn't matter if we put garbage in the MSbs there - if (addr & 0x2) + if (BranchAddr & 0x2) { - NextInstr[0] = CodeRead32(addr-2, true) >> 16; - NextInstr[1] = CodeRead32(addr+2, false); + CodeRead32(BranchAddr-2); + + if (MRTrack.Type != MainRAMType::Null) FuncQueue[FuncQueueFill++] = &ARMv5::JumpTo_3A; + else JumpTo_3A(); } else { - NextInstr[0] = CodeRead32(addr, true); - NextInstr[1] = NextInstr[0] >> 16; - } + CodeRead32(BranchAddr); - CPSR |= 0x20; + if (MRTrack.Type != MainRAMType::Null) FuncQueue[FuncQueueFill++] = &ARMv5::JumpTo_3B; + else JumpTo_3B(); + } } else { - addr &= ~0x3; - R[15] = addr+4; - NextInstr[0] = CodeRead32(addr, true); - NextInstr[1] = 
CodeRead32(addr+4, false); + BranchAddr &= ~0x3; + R[15] = BranchAddr+4; CPSR &= ~0x20; - } - NDS.MonitorARM9Jump(addr); + CodeRead32(BranchAddr); + + if (MRTrack.Type != MainRAMType::Null) FuncQueue[FuncQueueFill++] = &ARMv5::JumpTo_3C; + else JumpTo_3C(); + } } -void ARMv4::JumpTo(u32 addr, bool restorecpsr) +void ARMv5::JumpTo_3A() +{ + NextInstr[0] = RetVal >> 16; + CodeRead32(BranchAddr+2); + + if (MRTrack.Type != MainRAMType::Null) FuncQueue[FuncQueueFill++] = &ARMv5::JumpTo_4; + else JumpTo_4(); +} + +void ARMv5::JumpTo_3B() +{ + NextInstr[0] = RetVal; + NextInstr[1] = NextInstr[0] >> 16; +} + +void ARMv5::JumpTo_3C() +{ + NextInstr[0] = RetVal; + CodeRead32(BranchAddr+4); + + if (MRTrack.Type != MainRAMType::Null) FuncQueue[FuncQueueFill++] = &ARMv5::JumpTo_4; + else JumpTo_4(); +} + +void ARMv5::JumpTo_4() +{ + NextInstr[1] = RetVal; +} + +void ARMv4::JumpTo(u32 addr, bool restorecpsr, u8 R15) { if (restorecpsr) { @@ -447,6 +510,11 @@ void ARM::RestoreCPSR() UpdateMode(oldcpsr, CPSR); } +void ARMv5::QueueUpdateMode() +{ + UpdateMode(QueueMode[0], QueueMode[1], true); +} + void ARM::UpdateMode(u32 oldmode, u32 newmode, bool phony) { if ((oldmode & 0x1F) == (newmode & 0x1F)) return; @@ -563,6 +631,7 @@ template void ARM::TriggerIRQ(); void ARMv5::PrefetchAbort() { + abt = true; AddCycles_C(); Log(LogLevel::Warn, "ARM9: prefetch abort (%08X)\n", R[15]); @@ -578,7 +647,8 @@ void ARMv5::PrefetchAbort() void ARMv5::DataAbort() { - Log(LogLevel::Warn, "ARM9: data abort (%08X)\n", R[15]); + abt = true; + Log(LogLevel::Warn, "ARM9: data abort (%08X) %08llX\n", R[15], CurInstr); u32 oldcpsr = CPSR; CPSR &= ~0xBF; @@ -595,6 +665,63 @@ void ARM::CheckGdbIncoming() GdbCheckA(); } +void ARMv5::StartExec() +{ + if (CPSR & 0x20) // THUMB + { + // prefetch + R[15] += 2; + CurInstr = NextInstr[0]; + NextInstr[0] = NextInstr[1]; + // code fetch is done during the execute stage cycle handling + if (R[15] & 0x2) NullFetch = true; + else NullFetch = false; + PC = R[15]; + + if 
(IRQ && !(CPSR & 0x80)) TriggerIRQ(); + else if (CurInstr > 0xFFFFFFFF) [[unlikely]] // handle aborted instructions + { + PrefetchAbort(); + } + else [[likely]] // actually execute + { + u32 icode = (CurInstr >> 6) & 0x3FF; + ARMInterpreter::THUMBInstrTable[icode](this); + } + } + else + { + // prefetch + R[15] += 4; + CurInstr = NextInstr[0]; + NextInstr[0] = NextInstr[1]; + // code fetch is done during the execute stage cycle handling + NullFetch = false; + PC = R[15]; + + if (IRQ && !(CPSR & 0x80)) TriggerIRQ(); + else if (CurInstr & ((u64)1<<63)) [[unlikely]] // handle aborted instructions + { + PrefetchAbort(); + } + else if (CheckCondition(CurInstr >> 28)) [[likely]] // actually execute + { + u32 icode = ((CurInstr >> 4) & 0xF) | ((CurInstr >> 16) & 0xFF0); + ARMInterpreter::ARMInstrTable[icode](this); + } + else if ((CurInstr & 0xFE000000) == 0xFA000000) + { + ARMInterpreter::A_BLX_IMM(this); + } + else if ((CurInstr & 0x0FF000F0) == 0x01200070) + { + ARMInterpreter::A_BKPT(this); // always passes regardless of condition code + } + else + AddCycles_C(); + } +} + template void ARMv5::Execute() { @@ -670,65 +797,81 @@ void ARMv5::Execute() else #endif { - if (CPSR & 0x20) // THUMB + if constexpr (mode == CPUExecuteMode::InterpreterGDB) + GdbCheckC(); // gdb might throw a hissy fit about this change but idc + + //printf("A:%i, F:%i, P:%i, E:%i, I:%08llX, P:%08X, 15:%08X\n", FuncQueueActive, FuncQueueFill, FuncQueueProg, FuncQueueEnd, CurInstr, PC, R[15]); + + (this->*FuncQueue[FuncQueueProg])(); + + if (FuncQueueActive) { - if constexpr (mode == CPUExecuteMode::InterpreterGDB) - GdbCheckC(); - - // prefetch - R[15] += 2; - CurInstr = NextInstr[0]; - NextInstr[0] = NextInstr[1]; - // code fetch is done during the execute stage cycle handling - if (R[15] & 0x2) NullFetch = true; - else NullFetch = false; - PC = R[15]; - - if (IRQ && !(CPSR & 0x80)) TriggerIRQ(); - else if (CurInstr > 0xFFFFFFFF) [[unlikely]] // handle aborted instructions + if (FuncQueueFill == 
FuncQueueProg) { - PrefetchAbort(); + // we did not get a new addition to the queue; increment and reset ptrs + FuncQueueFill = ++FuncQueueProg; + + // check if we're done with the queue, if so, reset everything + if (FuncQueueProg >= FuncQueueEnd) + { + + FuncQueueFill = 0; + FuncQueueProg = 0; + FuncQueueEnd = 0; + FuncQueueActive = false; + FuncQueue[0] = &ARMv5::StartExec; + /* + Platform::FileHandle* file = Platform::OpenFile("REGLOG.bin", Platform::FileMode::Read); + Platform::FileSeek(file, iter*16*4, Platform::FileSeekOrigin::Start); + u32 Regs[16]; + Platform::FileRead(Regs, 4, 16, file); + if (memcmp(Regs, R, 16*4)) + { + printf("MISMATCH ON ITERATION %lli! %08llX", iter, CurInstr); + for (int i = 0; i < 16; i++) + { + printf(" %i: %08X vs %08X", i, R[i], Regs[i]); + } + printf("\n"); + abt=1; + } + Platform::CloseFile(file); + iter++;*/ + } } - else [[likely]] // actually execute + else { - u32 icode = (CurInstr >> 6) & 0x3FF; - ARMInterpreter::THUMBInstrTable[icode](this); + // we got a new addition to the list; redo the current entry + FuncQueueFill = FuncQueueProg; } } + else if (FuncQueueFill > 0) // check if we started the queue up + { + FuncQueueEnd = FuncQueueFill; + FuncQueueFill = 0; + FuncQueueActive = true; + } else { - if constexpr (mode == CPUExecuteMode::InterpreterGDB) - GdbCheckC(); - - // prefetch - R[15] += 4; - CurInstr = NextInstr[0]; - NextInstr[0] = NextInstr[1]; - // code fetch is done during the execute stage cycle handling - NullFetch = false; - PC = R[15]; - - if (IRQ && !(CPSR & 0x80)) TriggerIRQ(); - else if (CurInstr & ((u64)1<<63)) [[unlikely]] // handle aborted instructions + /* + Platform::FileHandle* file = Platform::OpenFile("REGLOG.bin", Platform::FileMode::Read); + Platform::FileSeek(file, iter*16*4, Platform::FileSeekOrigin::Start); + u32 Regs[16]; + Platform::FileRead(Regs, 4, 16, file); + if (memcmp(Regs, R, 16*4)) { - PrefetchAbort(); + printf("MISMATCH ON ITERATION %lli! 
%08llX", iter, CurInstr); + for (int i = 0; i < 16; i++) + { + printf(" %i: %08X vs %08X", i, R[i], Regs[i]); + } + printf("\n"); + abt=1; } - else if (CheckCondition(CurInstr >> 28)) [[likely]] // actually execute - { - u32 icode = ((CurInstr >> 4) & 0xF) | ((CurInstr >> 16) & 0xFF0); - ARMInterpreter::ARMInstrTable[icode](this); - } - else if ((CurInstr & 0xFE000000) == 0xFA000000) - { - ARMInterpreter::A_BLX_IMM(this); - } - else if ((CurInstr & 0x0FF000F0) == 0x01200070) - { - ARMInterpreter::A_BKPT(this); // always passes regardless of condition code - } - else - AddCycles_C(); + Platform::CloseFile(file); + iter++;*/ } + if (MRTrack.Type != MainRAMType::Null) break; // check if we need to resolve main ram // TODO optimize this shit!!! if (Halted) @@ -907,26 +1050,26 @@ template void ARMv4::Execute(); void ARMv5::FillPipeline() { - SetupCodeMem(R[15]); + /*SetupCodeMem(R[15]); if (CPSR & 0x20) { if ((R[15] - 2) & 0x2) { - NextInstr[0] = CodeRead32(R[15] - 4, false) >> 16; - NextInstr[1] = CodeRead32(R[15], false); + NextInstr[0] = CodeRead32(R[15] - 4) >> 16; + NextInstr[1] = CodeRead32(R[15]); } else { - NextInstr[0] = CodeRead32(R[15] - 2, false); + NextInstr[0] = CodeRead32(R[15] - 2); NextInstr[1] = NextInstr[0] >> 16; } } else { - NextInstr[0] = CodeRead32(R[15] - 4, false); - NextInstr[1] = CodeRead32(R[15], false); - } + NextInstr[0] = CodeRead32(R[15] - 4); + NextInstr[1] = CodeRead32(R[15]); + }*/ } void ARMv4::FillPipeline() @@ -1160,23 +1303,37 @@ void ARMv5::CodeFetch() { // the value we need is cached by the bus // in practice we can treat this as a 1 cycle fetch, with no penalties - NextInstr[1] >>= 16; + RetVal = NextInstr[1] >> 16; NDS.ARM9Timestamp++; if (NDS.ARM9Timestamp < TimestampActual) NDS.ARM9Timestamp = TimestampActual; Store = false; DataRegion = Mem9_Null; } - else NextInstr[1] = CodeRead32(PC, false); + else + { + CodeRead32(PC); + } + if (MRTrack.Type != MainRAMType::Null) FuncQueue[FuncQueueFill++] = &ARMv5::AddExecute; + else 
AddExecute(); } -void ARMv5::AddCycles_CI(s32 numX) +void ARMv5::AddExecute() { - CodeFetch(); - NDS.ARM9Timestamp += numX; + NextInstr[1] = RetVal; + + NDS.ARM9Timestamp += ExecuteCycles; } void ARMv5::AddCycles_MW(s32 numM) { + DataCycles = numM; + if (MRTrack.Type != MainRAMType::Null) FuncQueue[FuncQueueFill++] = &ARMv5::AddCycles_MW_2; + else AddCycles_MW_2(); +} + +void ARMv5::AddCycles_MW_2() +{ + s32 numM = DataCycles; TimestampActual = numM + NDS.ARM9Timestamp; numM -= 3< void ARMv5::HandleInterlocksExecute(u16 ilmask, u8* times) { + /* if ((bitfield && (ilmask & (1<(u16 ilmask, u8* times); template void ARMv5::HandleInterlocksExecute(u16 ilmask, u8* times); void ARMv5::HandleInterlocksMemory(u8 reg) { + /* if ((reg != ILPrevReg) || (NDS.ARM9Timestamp >= ILPrevTime)) return; u64 diff = ILPrevTime - NDS.ARM9Timestamp; // should always be 1? NDS.ARM9Timestamp = ILPrevTime; ITCMTimestamp += diff; // checkme - ILPrevTime = 16; + ILPrevTime = 16;*/ } u16 ARMv4::CodeRead16(u32 addr) @@ -1265,8 +1424,10 @@ u32 ARMv4::CodeRead32(u32 addr) return BusRead32(addr); } -bool ARMv4::DataRead8(u32 addr, u32* val) +bool ARMv4::DataRead8(u32 addr, u8 reg) { + u32* val = &R[reg]; + if ((addr >> 24) == 0x02) { if (NDS.ARM7Timestamp < MainRAMTimestamp) NDS.ARM7Timestamp = MainRAMTimestamp; @@ -1284,8 +1445,9 @@ bool ARMv4::DataRead8(u32 addr, u32* val) return true; } -bool ARMv4::DataRead16(u32 addr, u32* val) +bool ARMv4::DataRead16(u32 addr, u8 reg) { + u32* val = &R[reg]; addr &= ~1; if ((addr >> 24) == 0x02) @@ -1305,8 +1467,9 @@ bool ARMv4::DataRead16(u32 addr, u32* val) return true; } -bool ARMv4::DataRead32(u32 addr, u32* val) +bool ARMv4::DataRead32(u32 addr, u8 reg) { + u32* val = &R[reg]; addr &= ~3; if ((addr >> 24) == 0x02) @@ -1326,8 +1489,9 @@ bool ARMv4::DataRead32(u32 addr, u32* val) return true; } -bool ARMv4::DataRead32S(u32 addr, u32* val) +bool ARMv4::DataRead32S(u32 addr, u8 reg) { + u32* val = &R[reg]; addr &= ~3; if ((addr >> 24) == 0x02) @@ -1347,7 
+1511,7 @@ bool ARMv4::DataRead32S(u32 addr, u32* val) return true; } -bool ARMv4::DataWrite8(u32 addr, u8 val) +bool ARMv4::DataWrite8(u32 addr, u8 val, u8 reg) { if ((addr >> 24) == 0x02) { @@ -1366,7 +1530,7 @@ bool ARMv4::DataWrite8(u32 addr, u8 val) return true; } -bool ARMv4::DataWrite16(u32 addr, u16 val) +bool ARMv4::DataWrite16(u32 addr, u16 val, u8 reg) { addr &= ~1; @@ -1387,7 +1551,7 @@ bool ARMv4::DataWrite16(u32 addr, u16 val) return true; } -bool ARMv4::DataWrite32(u32 addr, u32 val) +bool ARMv4::DataWrite32(u32 addr, u32 val, u8 reg) { addr &= ~3; @@ -1408,7 +1572,7 @@ bool ARMv4::DataWrite32(u32 addr, u32 val) return true; } -bool ARMv4::DataWrite32S(u32 addr, u32 val) +bool ARMv4::DataWrite32S(u32 addr, u32 val, u8 reg) { addr &= ~3; diff --git a/src/ARM.h b/src/ARM.h index 2a002df9..9fb195f4 100644 --- a/src/ARM.h +++ b/src/ARM.h @@ -53,6 +53,19 @@ enum class CPUExecuteMode : u32 #endif }; +enum class MainRAMType : u8 +{ + Null = 0, + ICacheStream, +}; + +struct MainRAMTrackers +{ + MainRAMType Type; + u8 Var; + u8 Progress; +}; + struct GDBArgs; class ARMJIT; class GPU; @@ -75,7 +88,7 @@ public: virtual void FillPipeline() = 0; - virtual void JumpTo(u32 addr, bool restorecpsr = false) = 0; + virtual void JumpTo(u32 addr, bool restorecpsr = false, u8 R15 = 0) = 0; void RestoreCPSR(); void Halt(u32 halt) @@ -135,14 +148,14 @@ public: void SetupCodeMem(u32 addr); - virtual bool DataRead8(u32 addr, u32* val) = 0; - virtual bool DataRead16(u32 addr, u32* val) = 0; - virtual bool DataRead32(u32 addr, u32* val) = 0; - virtual bool DataRead32S(u32 addr, u32* val) = 0; - virtual bool DataWrite8(u32 addr, u8 val) = 0; - virtual bool DataWrite16(u32 addr, u16 val) = 0; - virtual bool DataWrite32(u32 addr, u32 val) = 0; - virtual bool DataWrite32S(u32 addr, u32 val) = 0; + virtual bool DataRead8(u32 addr, u8 reg) = 0; + virtual bool DataRead16(u32 addr, u8 reg) = 0; + virtual bool DataRead32(u32 addr, u8 reg) = 0; + virtual bool DataRead32S(u32 addr, u8 
reg) = 0; + virtual bool DataWrite8(u32 addr, u8 val, u8 reg) = 0; + virtual bool DataWrite16(u32 addr, u16 val, u8 reg) = 0; + virtual bool DataWrite32(u32 addr, u32 val, u8 reg) = 0; + virtual bool DataWrite32S(u32 addr, u32 val, u8 reg) = 0; virtual void AddCycles_C() = 0; virtual void AddCycles_CI(s32 numI) = 0; @@ -186,6 +199,29 @@ public: MemRegion CodeMem; u64 MainRAMTimestamp; + MainRAMTrackers MRTrack; + + u32 BranchAddr; + u8 BranchUpdate; + bool BranchRestore; + + u32 QueueMode[2]; + + u64 RetVal; + + u16 LDRRegs; + u16 LDRFailedRegs; + u16 STRRegs; + u32 FetchAddr[17]; + u32 STRVal[16]; + + u64 iter; + + u8 FuncQueueFill; + u8 FuncQueueEnd; + u8 FuncQueueProg; + u8 ExecuteCycles; + bool FuncQueueActive; #ifdef JIT_ENABLED u32 FastBlockLookupStart, FastBlockLookupSize; @@ -245,7 +281,7 @@ public: void FillPipeline() override; - void JumpTo(u32 addr, bool restorecpsr = false) override; + void JumpTo(u32 addr, bool restorecpsr = false, u8 R15 = 0) override; void PrefetchAbort(); void DataAbort(); @@ -254,36 +290,42 @@ public: void Execute(); // all code accesses are forced nonseq 32bit - u64 CodeRead32(const u32 addr, const bool branch); + void CodeRead32(const u32 addr); - bool DataRead8(u32 addr, u32* val) override; - bool DataRead16(u32 addr, u32* val) override; - bool DataRead32(u32 addr, u32* val) override; - bool DataRead32S(u32 addr, u32* val) override; - bool DataWrite8(u32 addr, u8 val) override; - bool DataWrite16(u32 addr, u16 val) override; - bool DataWrite32(u32 addr, u32 val) override; - bool DataWrite32S(u32 addr, u32 val) override; + bool DataRead8(u32 addr, u8 reg) override; + bool DataRead16(u32 addr, u8 reg) override; + bool DataRead32(u32 addr, u8 reg) override; + bool DataRead32S(u32 addr, u8 reg) override; + bool DataWrite8(u32 addr, u8 val, u8 reg) override; + bool DataWrite16(u32 addr, u16 val, u8 reg) override; + bool DataWrite32(u32 addr, u32 val, u8 reg) override; + bool DataWrite32S(u32 addr, u32 val, u8 reg) override; void 
CodeFetch(); - void AddCycles_C() override { CodeFetch(); } + void AddCycles_C() override + { + ExecuteCycles = 0; + CodeFetch(); + } - void AddCycles_CI(s32 numX) override; + void AddCycles_CI(s32 numX) override + { + ExecuteCycles = numX; + CodeFetch(); + } void AddCycles_MW(s32 numM); void AddCycles_CDI() override { AddCycles_MW(DataCycles); - DataCycles = 0; } void AddCycles_CD() override { - Store = true; + Store = true; // todo: queue this AddCycles_MW(DataCycles); - DataCycles = 0; } template @@ -366,7 +408,7 @@ public: * cache. The address is internally aligned to an word boundary * @return Value of the word at addr */ - u32 ICacheLookup(const u32 addr); + bool ICacheLookup(const u32 addr); /** * @brief Check if an address is within a instruction cachable @@ -604,6 +646,26 @@ public: * @return Value of the cp15 register */ u32 CP15Read(const u32 id) const; + + void StartExec(); + void AddExecute(); + void AddCycles_MW_2(); + void JumpTo_2(); + void JumpTo_3A(); + void JumpTo_3B(); + void JumpTo_3C(); + void JumpTo_4(); + void DAbortHandle(); + void DAbortHandleS(); + void DRead8_2(); + void DRead16_2(); + void DRead32_2(); + void DRead32S_2(); + void DWrite8_2(); + void DWrite16_2(); + void DWrite32_2(); + void DWrite32S_2(); + void QueueUpdateMode(); u32 CP15Control; //! CP15 Register 1: Control Register @@ -652,7 +714,7 @@ public: * 1 - CP15_MAP_WRITEABLE * 2 - CP15_MAP_EXECUTABLE * 4 - CP15_MAP_DCACHEABLE - * 5 - CP15_MAP_DCACHEWRITEBACK + * 5 - CP15_MAP_BUFFERABLE * 6 - CP15_MAP_ICACHEABLE */ u8 PU_UserMap[CP15_MAP_ENTRYCOUNT]; //! 
Memory mapping flags for User Mode @@ -665,6 +727,7 @@ public: u64 ITCMTimestamp; u64 TimestampActual; + void (ARMv5::*FuncQueue[31])(void); u32 PC; bool NullFetch; bool Store; @@ -674,10 +737,12 @@ public: u64 ILCurrTime; u64 ILPrevTime; - u8 ICacheFillPtr; - u8 DCacheFillPtr; - u64 ICacheFillTimes[7]; - u64 DCacheFillTimes[7]; + u8 ICacheStreamPtr; + u8 DCacheStreamPtr; + u64 ICacheStreamTimes[7]; + u64 DCacheStreamTimes[7]; + + bool abt; u8 WBWritePointer; // which entry to attempt to write next; should always be ANDed with 0xF after incrementing u8 WBFillPointer; // where the next entry should be added; should always be ANDed with 0xF after incrementing @@ -716,7 +781,7 @@ public: void FillPipeline() override; - void JumpTo(u32 addr, bool restorecpsr = false) override; + void JumpTo(u32 addr, bool restorecpsr = false, u8 R15 = 0) override; template void Execute(); @@ -726,14 +791,14 @@ public: u16 CodeRead16(u32 addr); u32 CodeRead32(u32 addr); - bool DataRead8(u32 addr, u32* val) override; - bool DataRead16(u32 addr, u32* val) override; - bool DataRead32(u32 addr, u32* val) override; - bool DataRead32S(u32 addr, u32* val) override; - bool DataWrite8(u32 addr, u8 val) override; - bool DataWrite16(u32 addr, u16 val) override; - bool DataWrite32(u32 addr, u32 val) override; - bool DataWrite32S(u32 addr, u32 val) override; + bool DataRead8(u32 addr, u8 reg) override; + bool DataRead16(u32 addr, u8 reg) override; + bool DataRead32(u32 addr, u8 reg) override; + bool DataRead32S(u32 addr, u8 reg) override; + bool DataWrite8(u32 addr, u8 val, u8 reg) override; + bool DataWrite16(u32 addr, u16 val, u8 reg) override; + bool DataWrite32(u32 addr, u32 val, u8 reg) override; + bool DataWrite32S(u32 addr, u32 val, u8 reg) override; void AddCycles_C() override; void AddCycles_CI(s32 num) override; void AddCycles_CDI() override; diff --git a/src/ARMInterpreter.cpp b/src/ARMInterpreter.cpp index 0400c40a..64249fac 100644 --- a/src/ARMInterpreter.cpp +++ 
b/src/ARMInterpreter.cpp @@ -50,6 +50,7 @@ void A_UNK(ARM* cpu) cpu->R_UND[2] = oldcpsr; cpu->R[14] = cpu->R[15] - 4; + cpu->JumpTo(cpu->ExceptionBase + 0x04); } @@ -68,12 +69,13 @@ void T_UNK(ARM* cpu) cpu->R_UND[2] = oldcpsr; cpu->R[14] = cpu->R[15] - 2; + cpu->JumpTo(cpu->ExceptionBase + 0x04); } void A_BKPT(ARM* cpu) { - if (cpu->Num == 1) A_UNK(cpu); // checkme + if (cpu->Num == 1) return A_UNK(cpu); // checkme Log(LogLevel::Warn, "BKPT: "); // combine with the prefetch abort warning message ((ARMv5*)cpu)->PrefetchAbort(); @@ -83,6 +85,9 @@ void A_BKPT(ARM* cpu) void A_MSR_IMM(ARM* cpu) { + if ((cpu->Num != 1) && (cpu->CurInstr & ((0x7<<16)|(1<<22)))) cpu->AddCycles_CI(2); // arm9 cpsr_sxc & spsr + else cpu->AddCycles_C(); + u32* psr; if (cpu->CurInstr & (1<<22)) { @@ -100,8 +105,6 @@ void A_MSR_IMM(ARM* cpu) case 0x1A: case 0x1B: psr = &cpu->R_UND[2]; break; default: - if (cpu->Num != 1) cpu->AddCycles_C(); // arm 7 - else cpu->AddCycles_CI(2); // arm 9 return; } } @@ -138,23 +141,15 @@ void A_MSR_IMM(ARM* cpu) cpu->CPSR &= ~0x20; // keep it from crashing the emulator at least } } - - if (cpu->Num != 1) - { - if (cpu->CurInstr & (1<<22)) - { - cpu->AddCycles_CI(2); // spsr - } - else if (cpu->CurInstr & (0x7<<16)) cpu->AddCycles_CI(2); // cpsr_sxc - else cpu->AddCycles_C(); - } - else cpu->AddCycles_C(); } void A_MSR_REG(ARM* cpu) { if (cpu->Num == 0) ((ARMv5*)cpu)->HandleInterlocksExecute(cpu->CurInstr & 0xF); + if ((cpu->Num != 1) && (cpu->CurInstr & ((0x7<<16)|(1<<22)))) cpu->AddCycles_CI(2); // arm9 cpsr_sxc & spsr + else cpu->AddCycles_C(); + u32* psr; if (cpu->CurInstr & (1<<22)) { @@ -172,8 +167,6 @@ void A_MSR_REG(ARM* cpu) case 0x1A: case 0x1B: psr = &cpu->R_UND[2]; break; default: - if (cpu->Num != 1) cpu->AddCycles_C(); // arm 7 - else cpu->AddCycles_CI(2); // arm 9 return; } } @@ -210,17 +203,6 @@ void A_MSR_REG(ARM* cpu) cpu->CPSR &= ~0x20; // keep it from crashing the emulator at least } } - - if (cpu->Num != 1) - { - if (cpu->CurInstr & 
(1<<22)) - { - cpu->AddCycles_CI(2); // spsr - } - else if (cpu->CurInstr & (0x7<<16)) cpu->AddCycles_CI(2); // cpsr_sxc - else cpu->AddCycles_C(); - } - else cpu->AddCycles_C(); } void A_MRS(ARM* cpu) @@ -247,20 +229,19 @@ void A_MRS(ARM* cpu) else psr = cpu->CPSR; + if (cpu->Num != 1) // arm9 + { + cpu->AddCycles_C(); // 1 X + ((ARMv5*)cpu)->AddCycles_MW(2); // 2 M + } + else cpu->AddCycles_C(); // arm7 + if (((cpu->CurInstr>>12) & 0xF) == 15) { if (cpu->Num == 1) // doesn't seem to jump on the arm9? checkme cpu->JumpTo(psr & ~0x1); // checkme: this shouldn't be able to switch to thumb? } else cpu->R[(cpu->CurInstr>>12) & 0xF] = psr; - - if (cpu->Num != 1) // arm9 - { - cpu->AddCycles_C(); // 1 X - cpu->DataRegion = Mem9_Null; - ((ARMv5*)cpu)->AddCycles_MW(2); // 2 M - } - else cpu->AddCycles_C(); // arm7 } @@ -281,7 +262,7 @@ void A_MCR(ARM* cpu) if (cpu->Num==0 && cp==15) { - ((ARMv5*)cpu)->CP15Write((cn<<8)|(cm<<4)|cpinfo|(op<<12), val); + ((ARMv5*)cpu)->CP15Write((cn<<8)|(cm<<4)|cpinfo|(op<<12), val); // TODO: IF THIS RAISES AN EXCEPTION WE DO A DOUBLE CODE FETCH; FIX THAT } else if (cpu->Num==1 && cp==14) { @@ -292,7 +273,8 @@ void A_MCR(ARM* cpu) Log(LogLevel::Warn, "bad MCR opcode p%d, %d, reg, c%d, c%d, %d on ARM%d\n", cp, op, cn, cm, cpinfo, cpu->Num?7:9); return A_UNK(cpu); // TODO: check what kind of exception it really is } - + + // TODO: SINCE THIS DOES A CODE FETCH WE NEED TO DELAY ANY MPU UPDATES UNTIL *AFTER* THE CODE FETCH if (cpu->Num==0) cpu->AddCycles_CI(5); // checkme else /* ARM7 */ cpu->AddCycles_CI(1 + 1); // TODO: checkme } @@ -315,7 +297,7 @@ void A_MRC(ARM* cpu) else { // r15 updates the top 4 bits of the cpsr, done to "allow for conditional branching based on coprocessor status" - u32 flags = ((ARMv5*)cpu)->CP15Read((cn<<8)|(cm<<4)|cpinfo|(op<<12)) & 0xF0000000; + u32 flags = ((ARMv5*)cpu)->CP15Read((cn<<8)|(cm<<4)|cpinfo|(op<<12)) & 0xF0000000; // TODO: IF THIS RAISES AN EXCEPTION WE DO A DOUBLE CODE FETCH; FIX THAT cpu->CPSR = 
(cpu->CPSR & ~0xF0000000) | flags; } } @@ -332,7 +314,6 @@ void A_MRC(ARM* cpu) if (cpu->Num != 1) { cpu->AddCycles_C(); // 1 Execute cycle - cpu->DataRegion = Mem9_Null; ((ARMv5*)cpu)->AddCycles_MW(2); // 2 Memory cycles ((ARMv5*)cpu)->ILCurrReg = (cpu->CurInstr >> 12) & 0xF; // only one rd interlocks ((ARMv5*)cpu)->ILCurrTime = ((ARMv5*)cpu)->TimestampActual; @@ -352,6 +333,7 @@ void A_SVC(ARM* cpu) // A_SWI cpu->R_SVC[2] = oldcpsr; cpu->R[14] = cpu->R[15] - 4; + cpu->JumpTo(cpu->ExceptionBase + 0x08); } @@ -365,6 +347,7 @@ void T_SVC(ARM* cpu) // T_SWI cpu->R_SVC[2] = oldcpsr; cpu->R[14] = cpu->R[15] - 2; + cpu->JumpTo(cpu->ExceptionBase + 0x08); } diff --git a/src/ARMInterpreter_ALU.cpp b/src/ARMInterpreter_ALU.cpp index 410a78e1..5edf5a39 100644 --- a/src/ARMInterpreter_ALU.cpp +++ b/src/ARMInterpreter_ALU.cpp @@ -926,7 +926,6 @@ void A_MUL(ARM* cpu) { cpu->AddCycles_C(); // 1 X - cpu->DataRegion = Mem9_Null; ((ARMv5*)cpu)->AddCycles_MW(2); // 2 M ((ARMv5*)cpu)->ILCurrReg = (cpu->CurInstr >> 16) & 0xF; ((ARMv5*)cpu)->ILCurrTime = ((ARMv5*)cpu)->TimestampActual; @@ -974,7 +973,6 @@ void A_MLA(ARM* cpu) { cpu->AddCycles_C(); // 1 X - cpu->DataRegion = Mem9_Null; ((ARMv5*)cpu)->AddCycles_MW(2); // 2 M ((ARMv5*)cpu)->ILCurrReg = (cpu->CurInstr >> 16) & 0xF; ((ARMv5*)cpu)->ILCurrTime = ((ARMv5*)cpu)->TimestampActual; @@ -1020,7 +1018,6 @@ void A_UMULL(ARM* cpu) { cpu->AddCycles_CI(2); - cpu->DataRegion = Mem9_Null; ((ARMv5*)cpu)->AddCycles_MW(1); // dummy memory stage for interlock handling ((ARMv5*)cpu)->ILCurrReg = (cpu->CurInstr >> 16) & 0xF; // only one rd interlocks ((ARMv5*)cpu)->ILCurrTime = ((ARMv5*)cpu)->TimestampActual; @@ -1073,7 +1070,6 @@ void A_UMLAL(ARM* cpu) { cpu->AddCycles_CI(2); - cpu->DataRegion = Mem9_Null; ((ARMv5*)cpu)->AddCycles_MW(1); // dummy memory stage for interlock handling ((ARMv5*)cpu)->ILCurrReg = (cpu->CurInstr >> 16) & 0xF; // only one rd interlocks ((ARMv5*)cpu)->ILCurrTime = ((ARMv5*)cpu)->TimestampActual; @@ -1119,7 +1115,6 @@ 
void A_SMULL(ARM* cpu) { cpu->AddCycles_CI(2); - cpu->DataRegion = Mem9_Null; ((ARMv5*)cpu)->AddCycles_MW(1); // dummy memory stage for interlock handling ((ARMv5*)cpu)->ILCurrReg = (cpu->CurInstr >> 16) & 0xF; // only one rd interlocks ((ARMv5*)cpu)->ILCurrTime = ((ARMv5*)cpu)->TimestampActual; @@ -1171,7 +1166,6 @@ void A_SMLAL(ARM* cpu) { cpu->AddCycles_CI(2); - cpu->DataRegion = Mem9_Null; ((ARMv5*)cpu)->AddCycles_MW(1); // dummy memory stage for interlock handling ((ARMv5*)cpu)->ILCurrReg = (cpu->CurInstr >> 16) & 0xF; // only one rd interlocks ((ARMv5*)cpu)->ILCurrTime = ((ARMv5*)cpu)->TimestampActual; @@ -1219,7 +1213,6 @@ void A_SMLAxy(ARM* cpu) (1 << ((cpu->CurInstr >> 12) & 0xF)), iltime); cpu->AddCycles_C(); - cpu->DataRegion = Mem9_Null; ((ARMv5*)cpu)->AddCycles_MW(1); // dummy memory stage for interlock handling ((ARMv5*)cpu)->ILCurrReg = (cpu->CurInstr >> 16) & 0xF; // only one rd interlocks ((ARMv5*)cpu)->ILCurrTime = ((ARMv5*)cpu)->TimestampActual; @@ -1251,7 +1244,6 @@ void A_SMLAWy(ARM* cpu) (1 << ((cpu->CurInstr >> 12) & 0xF)), iltime); cpu->AddCycles_C(); - cpu->DataRegion = Mem9_Null; ((ARMv5*)cpu)->AddCycles_MW(1); // dummy memory stage for interlock handling ((ARMv5*)cpu)->ILCurrReg = (cpu->CurInstr >> 16) & 0xF; // only one rd interlocks ((ARMv5*)cpu)->ILCurrTime = ((ARMv5*)cpu)->TimestampActual; @@ -1279,7 +1271,6 @@ void A_SMULxy(ARM* cpu) (1 << ((cpu->CurInstr >> 8) & 0xF))); cpu->AddCycles_C(); - cpu->DataRegion = Mem9_Null; ((ARMv5*)cpu)->AddCycles_MW(1); // dummy memory stage for interlock handling ((ARMv5*)cpu)->ILCurrReg = (cpu->CurInstr >> 16) & 0xF; // only one rd interlocks ((ARMv5*)cpu)->ILCurrTime = ((ARMv5*)cpu)->TimestampActual; @@ -1305,7 +1296,6 @@ void A_SMULWy(ARM* cpu) (1 << ((cpu->CurInstr >> 8) & 0xF))); cpu->AddCycles_C(); - cpu->DataRegion = Mem9_Null; ((ARMv5*)cpu)->AddCycles_MW(1); // dummy memory stage for interlock handling ((ARMv5*)cpu)->ILCurrReg = (cpu->CurInstr >> 16) & 0xF; // only one rd interlocks 
((ARMv5*)cpu)->ILCurrTime = ((ARMv5*)cpu)->TimestampActual; @@ -1342,7 +1332,7 @@ void A_SMLALxy(ARM* cpu) (1 << ((cpu->CurInstr >> 12) & 0xF))/* | (1 << ((cpu->CurInstr >> 16) & 0xF))*/, iltime); cpu->AddCycles_C(); // 1 X - cpu->DataRegion = Mem9_Null; + ((ARMv5*)cpu)->AddCycles_MW(2); // 2 M ((ARMv5*)cpu)->ILCurrReg = (cpu->CurInstr >> 16) & 0xF; // only one rd interlocks ((ARMv5*)cpu)->ILCurrTime = ((ARMv5*)cpu)->TimestampActual; @@ -1370,11 +1360,11 @@ void A_CLZ(ARM* cpu) val |= 0x1; } - if (((cpu->CurInstr >> 12) & 0xF) == 15) cpu->JumpTo(res & ~1); - else cpu->R[(cpu->CurInstr >> 12) & 0xF] = res; - ((ARMv5*)cpu)->HandleInterlocksExecute(cpu->CurInstr & 0xF); cpu->AddCycles_C(); + + if (((cpu->CurInstr >> 12) & 0xF) == 15) cpu->JumpTo(res & ~1); + else cpu->R[(cpu->CurInstr >> 12) & 0xF] = res; } void A_QADD(ARM* cpu) @@ -1398,7 +1388,6 @@ void A_QADD(ARM* cpu) ((ARMv5*)cpu)->HandleInterlocksExecute((1 << (cpu->CurInstr & 0xF)) | (1 << ((cpu->CurInstr >> 16) & 0xF))); cpu->AddCycles_C(); - cpu->DataRegion = Mem9_Null; ((ARMv5*)cpu)->AddCycles_MW(1); // dummy memory stage for interlock handling ((ARMv5*)cpu)->ILCurrReg = (cpu->CurInstr >> 12) & 0xF; // only one rd interlocks ((ARMv5*)cpu)->ILCurrTime = ((ARMv5*)cpu)->TimestampActual; @@ -1424,7 +1413,6 @@ void A_QSUB(ARM* cpu) ((ARMv5*)cpu)->HandleInterlocksExecute((1 << (cpu->CurInstr & 0xF)) | (1 << ((cpu->CurInstr >> 16) & 0xF))); cpu->AddCycles_C(); - cpu->DataRegion = Mem9_Null; ((ARMv5*)cpu)->AddCycles_MW(1); // dummy memory stage for interlock handling ((ARMv5*)cpu)->ILCurrReg = (cpu->CurInstr >> 12) & 0xF; // only one rd interlocks ((ARMv5*)cpu)->ILCurrTime = ((ARMv5*)cpu)->TimestampActual; @@ -1458,7 +1446,6 @@ void A_QDADD(ARM* cpu) ((ARMv5*)cpu)->HandleInterlocksExecute((1 << (cpu->CurInstr & 0xF)) | (1 << ((cpu->CurInstr >> 16) & 0xF))); cpu->AddCycles_C(); - cpu->DataRegion = Mem9_Null; ((ARMv5*)cpu)->AddCycles_MW(1); // dummy memory stage for interlock handling ((ARMv5*)cpu)->ILCurrReg = 
(cpu->CurInstr >> 12) & 0xF; // only one rd interlocks ((ARMv5*)cpu)->ILCurrTime = ((ARMv5*)cpu)->TimestampActual; @@ -1492,7 +1479,6 @@ void A_QDSUB(ARM* cpu) ((ARMv5*)cpu)->HandleInterlocksExecute((1 << (cpu->CurInstr & 0xF)) | (1 << ((cpu->CurInstr >> 16) & 0xF))); cpu->AddCycles_C(); - cpu->DataRegion = Mem9_Null; ((ARMv5*)cpu)->AddCycles_MW(1); // dummy memory stage for interlock handling ((ARMv5*)cpu)->ILCurrReg = (cpu->CurInstr >> 12) & 0xF; // only one rd interlocks ((ARMv5*)cpu)->ILCurrTime = ((ARMv5*)cpu)->TimestampActual; @@ -1902,6 +1888,8 @@ void T_CMP_HIREG(ARM* cpu) CarrySub(a, b), OverflowSub(a, b)); + cpu->AddCycles_C(); + if ((cpu->Num == 1) && (rd == 15)) { u32 oldpsr = cpu->CPSR; @@ -1913,8 +1901,6 @@ void T_CMP_HIREG(ARM* cpu) } } else if (cpu->Num == 0) ((ARMv5*)cpu)->HandleInterlocksExecute((1 << rd) | (1 << rs)); - - cpu->AddCycles_C(); } void T_MOV_HIREG(ARM* cpu) diff --git a/src/ARMInterpreter_LoadStore.cpp b/src/ARMInterpreter_LoadStore.cpp index 0a4f7224..56380e6c 100644 --- a/src/ARMInterpreter_LoadStore.cpp +++ b/src/ARMInterpreter_LoadStore.cpp @@ -82,6 +82,7 @@ enum class Writeback template void LoadSingle(ARM* cpu, const u8 rd, const u8 rn, const s32 offset, const u16 ilmask) { + cpu->LDRFailedRegs = 0; static_assert((size == 8) || (size == 16) || (size == 32), "dummy this function only takes 8/16/32 for size!!!"); ExecuteStage(cpu, (ilmask | (1<PU_Map = ((ARMv5*)cpu)->PU_UserMap; } - u32 val; + u32 oldrd = cpu->R[rd]; bool dabort; - if constexpr (size == 8) dabort = !cpu->DataRead8 (addr, &val); - if constexpr (size == 16) dabort = !cpu->DataRead16(addr, &val); - if constexpr (size == 32) dabort = !cpu->DataRead32(addr, &val); + if constexpr (size == 8) dabort = !cpu->DataRead8 (addr, rd); + if constexpr (size == 16) dabort = !cpu->DataRead16(addr, rd); + if constexpr (size == 32) dabort = !cpu->DataRead32(addr, rd); if constexpr (writeback == Writeback::Trans) { @@ -114,21 +115,21 @@ void LoadSingle(ARM* cpu, const u8 rd, const 
u8 rn, const s32 offset, const u16 ((ARMv5*)cpu)->DataAbort(); return; } + if ((cpu->MRTrack.Type != MainRAMType::Null) && signextend && cpu->Num == 0) printf("ARGH ME BONES"); - if constexpr (size == 8 && signextend) val = (s32)(s8)val; + if constexpr (size == 8 && signextend) cpu->R[rd] = (s32)(s8)cpu->R[rd]; if constexpr (size == 16) { if (cpu->Num == 1) { - val = ROR(val, ((addr&0x1)<<3)); // unaligned 16 bit loads are ROR'd on arm7 - if constexpr (signextend) val = (s32)((addr&0x1) ? (s8)val : (s16)val); // sign extend like a ldrsb if we ror'd the value. + cpu->R[rd] = ROR(cpu->R[rd], ((addr&0x1)<<3)); // unaligned 16 bit loads are ROR'd on arm7 + if constexpr (signextend) cpu->R[rd] = (s32)((addr&0x1) ? (s8)cpu->R[rd] : (s16)cpu->R[rd]); // sign extend like a ldrsb if we ror'd the value. } - else if constexpr (signextend) val = (s32)(s16)val; + else if constexpr (signextend) cpu->R[rd] = (s32)(s16)cpu->R[rd]; } - if constexpr (size == 32) val = ROR(val, ((addr&0x3)<<3)); - + if constexpr (size == 32) cpu->R[rd] = ROR(cpu->R[rd], ((addr&0x3)<<3)); if constexpr (writeback >= Writeback::Post) addr += offset; if constexpr (writeback != Writeback::None) @@ -139,22 +140,23 @@ void LoadSingle(ARM* cpu, const u8 rd, const u8 rn, const s32 offset, const u16 } else if (cpu->Num == 1) // arm 7 { - // note that at no point does it actually write the value it loaded to a register... - cpu->JumpTo((addr+4) & ~1); + cpu->R[rd] = oldrd; // note that at no point does it actually write the value it loaded into a register... 
+ cpu->LDRFailedRegs = 1<JumpTo((addr+4) & ~1); // +4 cause reasons return; } } if (rd == 15) { - if (cpu->Num==1 || (((ARMv5*)cpu)->CP15Control & (1<<15))) val &= ~0x1; - if (cpu->Num==0) cpu->NDS.ARM9Timestamp = ((ARMv5*)cpu)->TimestampActual + ((size<32) || (addr&0x3)); // force an interlock + if (cpu->Num==1) cpu->R[15] &= ~0x1; - cpu->JumpTo(val); + //if (cpu->Num==0) cpu->NDS.ARM9Timestamp = ((ARMv5*)cpu)->TimestampActual + ((size<32) || (addr&0x3)); // force an interlock + + cpu->JumpTo(cpu->R[15], false, 1); } else { - cpu->R[rd] = val; if (cpu->Num == 0) { ((ARMv5*)cpu)->ILCurrReg = rd; @@ -188,9 +190,9 @@ void StoreSingle(ARM* cpu, const u8 rd, const u8 rn, const s32 offset, const u16 ((ARMv5*)cpu)->HandleInterlocksMemory(rd); bool dabort; - if constexpr (size == 8) dabort = !cpu->DataWrite8 (addr, storeval); - if constexpr (size == 16) dabort = !cpu->DataWrite16(addr, storeval); - if constexpr (size == 32) dabort = !cpu->DataWrite32(addr, storeval); + if constexpr (size == 8) dabort = !cpu->DataWrite8 (addr, storeval, rd); + if constexpr (size == 16) dabort = !cpu->DataWrite16(addr, storeval, rd); + if constexpr (size == 32) dabort = !cpu->DataWrite32(addr, storeval, rd); if constexpr (writeback == Writeback::Trans) { @@ -208,11 +210,11 @@ void StoreSingle(ARM* cpu, const u8 rd, const u8 rn, const s32 offset, const u16 if constexpr (writeback >= Writeback::Post) addr += offset; if constexpr (writeback != Writeback::None) { - if (rn != 15) [[likely]] // r15 writeback fails on arm9 + if (rn != 15) [[likely]] { cpu->R[rn] = addr; } - else if (cpu->Num == 1) // arm 7 + else if (cpu->Num == 1) // r15 writeback fails on arm9 { cpu->JumpTo(addr & ~1); } @@ -349,19 +351,20 @@ A_IMPLEMENT_WB_LDRSTR(LDRB) offset += cpu->R[(cpu->CurInstr>>16) & 0xF]; \ u32 r = (cpu->CurInstr>>12) & 0xF; \ if (r&1) { A_UNK(cpu); return; } \ + cpu->LDRFailedRegs = 0; \ ExecuteStage(cpu, ilmask | (1 << ((cpu->CurInstr>>16) & 0xF))); \ - bool dabort = !cpu->DataRead32(offset, 
&cpu->R[r]); \ - u32 val; dabort |= !cpu->DataRead32S(offset+4, &val); \ - if (cpu->DataRegion == Mem9_ITCM) cpu->NDS.ARM9Timestamp += 2; \ + bool dabort = !cpu->DataRead32(offset, r); \ + u32 oldval = cpu->R[r+1]; dabort |= !cpu->DataRead32S(offset+4, r+1); \ + /*if (cpu->DataRegion == Mem9_ITCM) cpu->NDS.ARM9Timestamp += 2;*/ \ cpu->AddCycles_CDI(); \ if (dabort) { \ + cpu->R[r+1] = oldval; \ ((ARMv5*)cpu)->DataAbort(); \ return; } \ if (r+1 == 15) { \ - if (cpu->Num==0) cpu->NDS.ARM9Timestamp = ((ARMv5*)cpu)->TimestampActual; \ - cpu->JumpTo(((((ARMv5*)cpu)->CP15Control & (1<<15)) ? (val & ~0x1) : val), cpu->CurInstr & (1<<22)); } /* restores cpsr presumably due to shared dna with ldm */ \ + /*if (cpu->Num==0) cpu->NDS.ARM9Timestamp = ((ARMv5*)cpu)->TimestampActual;*/ \ + cpu->JumpTo(cpu->R[15], cpu->CurInstr & (1<<22), 1); } /* restores cpsr presumably due to shared dna with ldm */ \ else { \ - cpu->R[r+1] = val; \ if (cpu->Num == 0) { \ ((ARMv5*)cpu)->ILCurrReg = r+1; \ ((ARMv5*)cpu)->ILCurrTime = ((ARMv5*)cpu)->TimestampActual; } } \ @@ -372,19 +375,20 @@ A_IMPLEMENT_WB_LDRSTR(LDRB) u32 addr = cpu->R[(cpu->CurInstr>>16) & 0xF]; \ u32 r = (cpu->CurInstr>>12) & 0xF; \ if (r&1) { A_UNK(cpu); return; } \ + cpu->LDRFailedRegs = 0; \ ExecuteStage(cpu, ilmask | (1 << ((cpu->CurInstr>>16) & 0xF))); \ - bool dabort = !cpu->DataRead32(addr, &cpu->R[r]); \ - u32 val; dabort |= !cpu->DataRead32S(addr+4, &val); \ - if (cpu->DataRegion == Mem9_ITCM) cpu->NDS.ARM9Timestamp += 2; \ + bool dabort = !cpu->DataRead32(addr, r); \ + u32 oldval = cpu->R[r+1]; dabort |= !cpu->DataRead32S(addr+4, r+1); \ + /*if (cpu->DataRegion == Mem9_ITCM) cpu->NDS.ARM9Timestamp += 2;*/ \ cpu->AddCycles_CDI(); \ if (dabort) { \ + cpu->R[r+1] = oldval; \ ((ARMv5*)cpu)->DataAbort(); \ return; } \ if (r+1 == 15) { \ - if (cpu->Num==0) cpu->NDS.ARM9Timestamp = ((ARMv5*)cpu)->TimestampActual; \ - cpu->JumpTo(((((ARMv5*)cpu)->CP15Control & (1<<15)) ? 
(val & ~0x1) : val), cpu->CurInstr & (1<<22)); } /* restores cpsr presumably due to shared dna with ldm */ \ + /*if (cpu->Num==0) cpu->NDS.ARM9Timestamp = ((ARMv5*)cpu)->TimestampActual;*/ \ + cpu->JumpTo(cpu->R[15], cpu->CurInstr & (1<<22), 1); } /* restores cpsr presumably due to shared dna with ldm */ \ else { \ - cpu->R[r+1] = val; \ if (cpu->Num == 0) { \ ((ARMv5*)cpu)->ILCurrReg = r+1; \ ((ARMv5*)cpu)->ILCurrTime = ((ARMv5*)cpu)->TimestampActual; } } \ @@ -397,10 +401,10 @@ A_IMPLEMENT_WB_LDRSTR(LDRB) if (r&1) { A_UNK(cpu); return; } \ ExecuteStage(cpu, ilmask | (1 << ((cpu->CurInstr>>16) & 0xF))); \ ((ARMv5*)cpu)->HandleInterlocksMemory(r); \ - bool dabort = !cpu->DataWrite32(offset, cpu->R[r]); /* yes, this data abort behavior is on purpose */ \ + bool dabort = !cpu->DataWrite32(offset, cpu->R[r], r); \ u32 storeval = cpu->R[r+1]; if (r+1 == 15) storeval+=4; \ - dabort |= !cpu->DataWrite32S (offset+4, storeval); /* no, i dont understand it either */ \ - if (cpu->DataRegion == Mem9_ITCM) cpu->NDS.ARM9Timestamp += 2; \ + dabort |= !cpu->DataWrite32S (offset+4, storeval, r+1); \ + /*if (cpu->DataRegion == Mem9_ITCM) cpu->NDS.ARM9Timestamp += 2;*/ \ cpu->AddCycles_CD(); \ if (dabort) [[unlikely]] { \ ((ARMv5*)cpu)->DataAbort(); \ @@ -414,10 +418,10 @@ A_IMPLEMENT_WB_LDRSTR(LDRB) if (r&1) { A_UNK(cpu); return; } \ ExecuteStage(cpu, ilmask | (1 << ((cpu->CurInstr>>16) & 0xF))); \ ((ARMv5*)cpu)->HandleInterlocksMemory(r); \ - bool dabort = !cpu->DataWrite32(addr, cpu->R[r]); \ + bool dabort = !cpu->DataWrite32(addr, cpu->R[r], r); \ u32 storeval = cpu->R[r+1]; if (r+1 == 15) storeval+=4; \ - dabort |= !cpu->DataWrite32S (addr+4, storeval); \ - if (cpu->DataRegion == Mem9_ITCM) cpu->NDS.ARM9Timestamp += 2; \ + dabort |= !cpu->DataWrite32S (addr+4, storeval, r+1); \ + /*if (cpu->DataRegion == Mem9_ITCM) cpu->NDS.ARM9Timestamp += 2;*/ \ cpu->AddCycles_CD(); \ if (dabort) [[unlikely]] { \ ((ARMv5*)cpu)->DataAbort(); \ @@ -484,29 +488,32 @@ template inline void 
SWP(ARM* cpu) { ExecuteStage(cpu, ((cpu->CurInstr >> 16) & 0xF)); + cpu->LDRFailedRegs = 0; u32 base = cpu->R[(cpu->CurInstr >> 16) & 0xF]; - u32 rm = cpu->R[cpu->CurInstr & 0xF]; - if ((cpu->CurInstr & 0xF) == 15) rm += 4; + u32 rd = (cpu->CurInstr >> 12) & 0xF; + u32 rm = cpu->CurInstr & 0xF; + u32 storeval = cpu->R[rm]; + if (rm == 15) storeval += 4; + - u32 val; - if ((byte ? cpu->DataRead8 (base, &val) - : cpu->DataRead32(base, &val))) [[likely]] + u32 oldrd = cpu->R[rd]; + + if ((byte ? cpu->DataRead8 (base, rd) + : cpu->DataRead32(base, rd))) [[likely]] { - cpu->NDS.ARM9Timestamp += cpu->DataCycles; // checkme + //cpu->NDS.ARM9Timestamp += cpu->DataCycles; // checkme - if ((byte ? cpu->DataWrite8 (base, rm) - : cpu->DataWrite32(base, rm))) [[likely]] + if ((byte ? cpu->DataWrite8 (base, storeval, rm) + : cpu->DataWrite32(base, storeval, rm))) [[likely]] { // rd only gets updated if both read and write succeed - u32 rd = (cpu->CurInstr >> 12) & 0xF; - if constexpr (!byte) val = ROR(val, 8*(base&0x3)); + if constexpr (!byte) cpu->R[rd] = ROR(cpu->R[rd], 8*(base&0x3)); cpu->AddCycles_CDI(); if (rd != 15) { - cpu->R[rd] = val; if (cpu->Num == 0) { ((ARMv5*)cpu)->ILCurrReg = rd; @@ -514,12 +521,18 @@ inline void SWP(ARM* cpu) ((ARMv5*)cpu)->ILCurrTime = ((ARMv5*)cpu)->TimestampActual + extra; } } - else if (cpu->Num==1) cpu->JumpTo(val & ~1); // for some reason these jumps don't seem to work on the arm 9? + else if (cpu->Num==1) // for some reason these jumps don't seem to work on the arm 9? 
+ { + cpu->R[rd] = cpu->R[rd] & ~1; + cpu->JumpTo(cpu->R[rd], false, 1); + } return; } } // data abort handling + cpu->R[rd] = oldrd; + cpu->LDRFailedRegs = 1<AddCycles_CDI(); ((ARMv5*)cpu)->DataAbort(); } @@ -559,15 +572,15 @@ void EmptyRListLDMSTM(ARM* cpu, const u8 baseid, const u8 flags) if (flags & load) { - u32 pc; - cpu->DataRead32(base, &pc); + cpu->DataRead32(base, 15); cpu->AddCycles_CDI(); - cpu->JumpTo(pc, flags & restoreorthumb); + + cpu->JumpTo(cpu->R[15] & ~1, flags & restoreorthumb, 1); // TODO: fix this not maintaining current mode properly } else { - cpu->DataWrite32(base, cpu->R[15] + ((flags & restoreorthumb) ? 2 : 4)); + cpu->DataWrite32(base, cpu->R[15] + ((flags & restoreorthumb) ? 2 : 4), 15); cpu->AddCycles_CD(); } @@ -586,6 +599,7 @@ void EmptyRListLDMSTM(ARM* cpu, const u8 baseid, const u8 flags) void A_LDM(ARM* cpu) { + cpu->LDRFailedRegs = 0; u32 baseid = (cpu->CurInstr >> 16) & 0xF; u32 base = cpu->R[baseid]; u32 wbbase; @@ -626,48 +640,49 @@ void A_LDM(ARM* cpu) // switch to user mode regs if ((cpu->CurInstr & (1<<22)) && !(cpu->CurInstr & (1<<15))) + { cpu->UpdateMode(cpu->CPSR, (cpu->CPSR&~0x1F)|0x10, true); + //if (cpu->MRTrack.Type != MainRAMType::Null) printf("AHA, DERES THE PROBLEM\n"); + } for (int i = 0; i < 15; i++) { if (cpu->CurInstr & (1<DataRead32 (base, &val) - : cpu->DataRead32S(base, &val)); - - // remaining loads still occur but are not written to a reg after a data abort is raised - if (!dabort) [[likely]] cpu->R[i] = val; + u32 oldval = cpu->R[i]; + dabort |= !(first ? cpu->DataRead32 (base, i) + : cpu->DataRead32S(base, i)); + if (dabort) [[unlikely]] { cpu->R[i] = oldval; cpu->LDRFailedRegs |= (1<CurInstr & (1<<15)) { if (preinc) base += 4; - dabort |= !(first ? cpu->DataRead32 (base, &pc) - : cpu->DataRead32S(base, &pc)); + u32 oldval = cpu->R[15]; + dabort |= !(first ? 
cpu->DataRead32 (base, 15) + : cpu->DataRead32S(base, 15)); + if (dabort) [[unlikely]] { cpu->R[15] = oldval; cpu->LDRFailedRegs |= (1<<15); } + else if (cpu->Num == 1) + cpu->R[15] &= ~0x1; if (!preinc) base += 4; - - if (cpu->Num == 1 || (((ARMv5*)cpu)->CP15Control & (1<<15))) - pc &= ~0x1; } if (__builtin_popcount(cpu->CurInstr & 0xFFFF) == 1) [[unlikely]] // single reg { - if (cpu->Num == 0 && cpu->DataRegion == Mem9_ITCM) cpu->NDS.ARM9Timestamp += 1; + //if (cpu->Num == 0 && cpu->DataRegion == Mem9_ITCM) cpu->NDS.ARM9Timestamp += 1; cpu->AddCycles_CDI(); - if (cpu->Num == 0) cpu->NDS.ARM9Timestamp = ((ARMv5*)cpu)->TimestampActual; // on arm9 single reg ldm/stm cannot overlap memory and fetch stages + if (cpu->Num == 0) ;//cpu->NDS.ARM9Timestamp = ((ARMv5*)cpu)->TimestampActual; // on arm9 single reg ldm/stm cannot overlap memory and fetch stages else; // CHECKME: ARM7 timing behavior? } else { - if (cpu->Num == 0 && cpu->DataRegion == Mem9_ITCM) cpu->NDS.ARM9Timestamp += 2; + //if (cpu->Num == 0 && cpu->DataRegion == Mem9_ITCM) cpu->NDS.ARM9Timestamp += 2; cpu->AddCycles_CDI(); } @@ -675,7 +690,17 @@ void A_LDM(ARM* cpu) if (dabort) [[unlikely]] { if ((cpu->CurInstr & (1<<22)) && !(cpu->CurInstr & (1<<15))) - cpu->UpdateMode((cpu->CPSR&~0x1F)|0x10, cpu->CPSR, true); + { + if (cpu->Num == 0) + { + cpu->QueueMode[0] = (cpu->CPSR&~0x1F)|0x10; + cpu->QueueMode[1] = cpu->CPSR; + + if (cpu->MRTrack.Type != MainRAMType::Null) ((ARMv5*)cpu)->FuncQueue[cpu->FuncQueueFill++] = &ARMv5::QueueUpdateMode; + else ((ARMv5*)cpu)->QueueUpdateMode(); + } + else cpu->UpdateMode((cpu->CPSR&~0x1F)|0x10, cpu->CPSR, true); + } ((ARMv5*)cpu)->DataAbort(); return; @@ -694,7 +719,7 @@ void A_LDM(ARM* cpu) { u32 rlist = cpu->CurInstr & 0xFFFF; if ((!(rlist & ~(1 << baseid))) || (rlist & ~((2 << baseid) - 1))) - cpu->R[baseid] = wbbase; + { cpu->R[baseid] = wbbase; cpu->LDRFailedRegs = 1<CurInstr & (1<<22)) && !(cpu->CurInstr & (1<<15))) - cpu->UpdateMode((cpu->CPSR&~0x1F)|0x10, 
cpu->CPSR, true); + { + if (cpu->Num == 0) + { + cpu->QueueMode[0] = (cpu->CPSR&~0x1F)|0x10; + cpu->QueueMode[1] = cpu->CPSR; + + if (cpu->MRTrack.Type != MainRAMType::Null) ((ARMv5*)cpu)->FuncQueue[cpu->FuncQueueFill++] = &ARMv5::QueueUpdateMode; + else ((ARMv5*)cpu)->QueueUpdateMode(); + } + else cpu->UpdateMode((cpu->CPSR&~0x1F)|0x10, cpu->CPSR, true); + } // jump if pc got written if (cpu->CurInstr & (1<<15)) { - if (cpu->Num==0) cpu->NDS.ARM9Timestamp = ((ARMv5*)cpu)->TimestampActual; // force an interlock - cpu->JumpTo(pc, cpu->CurInstr & (1<<22)); + //if (cpu->Num==0) cpu->NDS.ARM9Timestamp = ((ARMv5*)cpu)->TimestampActual; // force an interlock + cpu->JumpTo(cpu->R[15], cpu->CurInstr & (1<<22), 1); } else if (cpu->Num == 0) { @@ -785,8 +820,8 @@ void A_STM(ARM* cpu) if (i == 15) val+=4; - dabort |= !(first ? cpu->DataWrite32 (base, val) - : cpu->DataWrite32S(base, val)); + dabort |= !(first ? cpu->DataWrite32 (base, val, i) + : cpu->DataWrite32S(base, val, i)); first = false; @@ -799,14 +834,14 @@ void A_STM(ARM* cpu) if (__builtin_popcount(cpu->CurInstr & 0xFFFF) == 1) [[unlikely]] // single reg { - if (cpu->Num == 0 && cpu->DataRegion == Mem9_ITCM) cpu->NDS.ARM9Timestamp += 1; + //if (cpu->Num == 0 && cpu->DataRegion == Mem9_ITCM) cpu->NDS.ARM9Timestamp += 1; cpu->AddCycles_CD(); - if (cpu->Num == 0) cpu->NDS.ARM9Timestamp = ((ARMv5*)cpu)->TimestampActual; // on arm9 single reg ldm/stm cannot overlap memory and fetch stages + if (cpu->Num == 0);// cpu->NDS.ARM9Timestamp = ((ARMv5*)cpu)->TimestampActual; // on arm9 single reg ldm/stm cannot overlap memory and fetch stages else; // CHECKME: ARM7 timing behavior? 
} else { - if (cpu->Num == 0 && cpu->DataRegion == Mem9_ITCM) cpu->NDS.ARM9Timestamp += 2; + //if (cpu->Num == 0 && cpu->DataRegion == Mem9_ITCM) cpu->NDS.ARM9Timestamp += 2; cpu->AddCycles_CD(); } @@ -833,8 +868,9 @@ void A_STM(ARM* cpu) void T_LDR_PCREL(ARM* cpu) { ExecuteStage(cpu, 15); + cpu->LDRFailedRegs = 0; u32 addr = (cpu->R[15] & ~0x2) + ((cpu->CurInstr & 0xFF) << 2); - bool dabort = !cpu->DataRead32(addr, &cpu->R[(cpu->CurInstr >> 8) & 0x7]); + bool dabort = !cpu->DataRead32(addr, (cpu->CurInstr >> 8) & 0x7); cpu->AddCycles_CDI(); if (dabort) [[unlikely]] ((ARMv5*)cpu)->DataAbort(); @@ -968,9 +1004,8 @@ void T_PUSH(ARM* cpu) { if (cpu->CurInstr & (1<DataWrite32 (base, cpu->R[i]) - : cpu->DataWrite32S(base, cpu->R[i])); - + dabort |= !(first ? cpu->DataWrite32 (base, cpu->R[i], i) + : cpu->DataWrite32S(base, cpu->R[i], i)); first = false; base += 4; } @@ -978,20 +1013,20 @@ void T_PUSH(ARM* cpu) if (cpu->CurInstr & (1<<8)) { - dabort |= !(first ? cpu->DataWrite32 (base, cpu->R[14]) - : cpu->DataWrite32S(base, cpu->R[14])); + dabort |= !(first ? cpu->DataWrite32 (base, cpu->R[14], 14) + : cpu->DataWrite32S(base, cpu->R[14], 14)); } if (__builtin_popcount(cpu->CurInstr & 0x1FF) == 1) [[unlikely]] // single reg { - if (cpu->Num == 0 && cpu->DataRegion == Mem9_ITCM) cpu->NDS.ARM9Timestamp += 1; + //if (cpu->Num == 0 && cpu->DataRegion == Mem9_ITCM) cpu->NDS.ARM9Timestamp += 1; cpu->AddCycles_CD(); - if (cpu->Num == 0) cpu->NDS.ARM9Timestamp = ((ARMv5*)cpu)->TimestampActual; // on arm9 single reg ldm/stm cannot overlap memory and fetch stages + if (cpu->Num == 0);// cpu->NDS.ARM9Timestamp = ((ARMv5*)cpu)->TimestampActual; // on arm9 single reg ldm/stm cannot overlap memory and fetch stages else; // CHECKME: ARM7 timing behavior? 
} else { - if (cpu->Num == 0 && cpu->DataRegion == Mem9_ITCM) cpu->NDS.ARM9Timestamp += 2; + //if (cpu->Num == 0 && cpu->DataRegion == Mem9_ITCM) cpu->NDS.ARM9Timestamp += 2; cpu->AddCycles_CD(); } @@ -1007,6 +1042,7 @@ void T_PUSH(ARM* cpu) void T_POP(ARM* cpu) { ExecuteStage(cpu, 13); + cpu->LDRFailedRegs = 0; u32 base = cpu->R[13]; bool first = true; bool dabort = false; @@ -1021,11 +1057,10 @@ void T_POP(ARM* cpu) { if (cpu->CurInstr & (1<DataRead32 (base, &val) - : cpu->DataRead32S(base, &val)); - - if (!dabort) [[likely]] cpu->R[i] = val; + u32 oldval = cpu->R[i]; + dabort |= !(first ? cpu->DataRead32 (base, i) + : cpu->DataRead32S(base, i)); + if (dabort) [[unlikely]] { cpu->R[i] = oldval; cpu->LDRFailedRegs |= (1<CurInstr & (1<<8)) { - u32 pc; - dabort |= !(first ? cpu->DataRead32 (base, &pc) - : cpu->DataRead32S(base, &pc)); + u32 oldval = cpu->R[15]; + dabort |= !(first ? cpu->DataRead32 (base, 15) + : cpu->DataRead32S(base, 15)); if (__builtin_popcount(cpu->CurInstr & 0x1FF) == 1) [[unlikely]] // single reg { - if (cpu->Num == 0 && cpu->DataRegion == Mem9_ITCM) cpu->NDS.ARM9Timestamp += 1; + //if (cpu->Num == 0 && cpu->DataRegion == Mem9_ITCM) cpu->NDS.ARM9Timestamp += 1; cpu->AddCycles_CDI(); - if (cpu->Num == 0) cpu->NDS.ARM9Timestamp = ((ARMv5*)cpu)->TimestampActual; // on arm9 single reg ldm/stm cannot overlap memory and fetch stages + if (cpu->Num == 0);// cpu->NDS.ARM9Timestamp = ((ARMv5*)cpu)->TimestampActual; // on arm9 single reg ldm/stm cannot overlap memory and fetch stages else; // CHECKME: ARM7 timing behavior? 
} else { - if (cpu->Num == 0 && cpu->DataRegion == Mem9_ITCM) cpu->NDS.ARM9Timestamp += 2; + //if (cpu->Num == 0 && cpu->DataRegion == Mem9_ITCM) cpu->NDS.ARM9Timestamp += 2; cpu->AddCycles_CDI(); } if (!dabort) [[likely]] { - if (cpu->Num==1 || (((ARMv5*)cpu)->CP15Control & (1<<15))) pc |= 0x1; - if (cpu->Num==0) cpu->NDS.ARM9Timestamp = ((ARMv5*)cpu)->TimestampActual; // force an interlock + if (cpu->Num==1) cpu->R[15] |= 0x1; + //if (cpu->Num==0) cpu->NDS.ARM9Timestamp = ((ARMv5*)cpu)->TimestampActual; // force an interlock - cpu->JumpTo(pc); + cpu->JumpTo(cpu->R[15], false, 2); base += 4; } else [[unlikely]] { + cpu->R[15] = oldval; + cpu->LDRFailedRegs |= (1<<15); ((ARMv5*)cpu)->DataAbort(); return; } @@ -1069,14 +1106,14 @@ void T_POP(ARM* cpu) { if (__builtin_popcount(cpu->CurInstr & 0x1FF) == 1) [[unlikely]] // single reg { - if (cpu->Num == 0 && cpu->DataRegion == Mem9_ITCM) cpu->NDS.ARM9Timestamp += 1; + //if (cpu->Num == 0 && cpu->DataRegion == Mem9_ITCM) cpu->NDS.ARM9Timestamp += 1; cpu->AddCycles_CDI(); - if (cpu->Num == 0) cpu->NDS.ARM9Timestamp = ((ARMv5*)cpu)->TimestampActual; // on arm9 single reg ldm/stm cannot overlap memory and fetch stages + if (cpu->Num == 0);// cpu->NDS.ARM9Timestamp = ((ARMv5*)cpu)->TimestampActual; // on arm9 single reg ldm/stm cannot overlap memory and fetch stages else; // CHECKME: ARM7 timing behavior? } else { - if (cpu->Num == 0 && cpu->DataRegion == Mem9_ITCM) cpu->NDS.ARM9Timestamp += 2; + //if (cpu->Num == 0 && cpu->DataRegion == Mem9_ITCM) cpu->NDS.ARM9Timestamp += 2; cpu->AddCycles_CDI(); } @@ -1122,9 +1159,8 @@ void T_STMIA(ARM* cpu) { if (cpu->CurInstr & (1<DataWrite32 (base, cpu->R[i]) - : cpu->DataWrite32S(base, cpu->R[i])); - + dabort |= !(first ? 
cpu->DataWrite32 (base, cpu->R[i], i) + : cpu->DataWrite32S(base, cpu->R[i], i)); first = false; base += 4; } @@ -1132,14 +1168,14 @@ void T_STMIA(ARM* cpu) if (__builtin_popcount(cpu->CurInstr & 0xFF) == 1) [[unlikely]] // single reg { - if (cpu->Num == 0 && cpu->DataRegion == Mem9_ITCM) cpu->NDS.ARM9Timestamp += 1; + //if (cpu->Num == 0 && cpu->DataRegion == Mem9_ITCM) cpu->NDS.ARM9Timestamp += 1; cpu->AddCycles_CD(); - if (cpu->Num == 0) cpu->NDS.ARM9Timestamp = ((ARMv5*)cpu)->TimestampActual; // on arm9 single reg ldm/stm cannot overlap memory and fetch stages + if (cpu->Num == 0);// cpu->NDS.ARM9Timestamp = ((ARMv5*)cpu)->TimestampActual; // on arm9 single reg ldm/stm cannot overlap memory and fetch stages else; // CHECKME: ARM7 timing behavior? } else { - if (cpu->Num == 0 && cpu->DataRegion == Mem9_ITCM) cpu->NDS.ARM9Timestamp += 2; + //if (cpu->Num == 0 && cpu->DataRegion == Mem9_ITCM) cpu->NDS.ARM9Timestamp += 2; cpu->AddCycles_CD(); } @@ -1170,11 +1206,11 @@ void T_LDMIA(ARM* cpu) { if (cpu->CurInstr & (1<DataRead32 (base, &val) - : cpu->DataRead32S(base, &val)); + u32 oldval = cpu->R[i]; + dabort |= !(first ? cpu->DataRead32 (base, i) + : cpu->DataRead32S(base, i)); + if (dabort) [[unlikely]] { cpu->R[i] = oldval; cpu->LDRFailedRegs |= (1<R[i] = val; first = false; base += 4; } @@ -1182,14 +1218,14 @@ void T_LDMIA(ARM* cpu) if (__builtin_popcount(cpu->CurInstr & 0xFF) == 1) [[unlikely]] // single reg { - if (cpu->Num == 0 && cpu->DataRegion == Mem9_ITCM) cpu->NDS.ARM9Timestamp += 1; + //if (cpu->Num == 0 && cpu->DataRegion == Mem9_ITCM) cpu->NDS.ARM9Timestamp += 1; cpu->AddCycles_CDI(); - if (cpu->Num == 0) cpu->NDS.ARM9Timestamp = ((ARMv5*)cpu)->TimestampActual; // on arm9 single reg ldm/stm cannot overlap memory and fetch stages + if (cpu->Num == 0);// cpu->NDS.ARM9Timestamp = ((ARMv5*)cpu)->TimestampActual; // on arm9 single reg ldm/stm cannot overlap memory and fetch stages else; // CHECKME: ARM7 timing behavior? 
} else { - if (cpu->Num == 0 && cpu->DataRegion == Mem9_ITCM) cpu->NDS.ARM9Timestamp += 2; + //if (cpu->Num == 0 && cpu->DataRegion == Mem9_ITCM) cpu->NDS.ARM9Timestamp += 2; cpu->AddCycles_CDI(); } diff --git a/src/ARMJIT.cpp b/src/ARMJIT.cpp index 8bf509e9..498f84c3 100644 --- a/src/ARMJIT.cpp +++ b/src/ARMJIT.cpp @@ -646,17 +646,17 @@ void ARMJIT::CompileBlock(ARM* cpu) noexcept } else { - nextInstr[1] = cpuv5->CodeRead32(r15, false); + //nextInstr[1] = cpuv5->CodeRead32(r15, false); instrs[i].CodeCycles = cpu->CodeCycles; } } else { ARMv4* cpuv4 = (ARMv4*)cpu; - if (thumb) - nextInstr[1] = cpuv4->CodeRead16(r15); - else - nextInstr[1] = cpuv4->CodeRead32(r15); + if (thumb); + //nextInstr[1] = cpuv4->CodeRead16(r15); + else; + // nextInstr[1] = cpuv4->CodeRead32(r15); instrs[i].CodeCycles = cpu->CodeCycles; } instrs[i].Info = ARMInstrInfo::Decode(thumb, cpu->Num, instrs[i].Instr, LiteralOptimizations); @@ -724,7 +724,7 @@ void ARMJIT::CompileBlock(ARM* cpu) noexcept addressRanges[numAddressRanges++] = translatedAddrRounded; addressMasks[j] |= 1 << ((translatedAddr & 0x1FF) / 16); JIT_DEBUGPRINT("literal loading %08x %08x %08x %08x\n", literalAddr, translatedAddr, addressMasks[j], addressRanges[j]); - cpu->DataRead32(literalAddr, &literalValues[numLiterals]); + //cpu->DataRead32(literalAddr, &literalValues[numLiterals]); literalLoadAddrs[numLiterals++] = translatedAddr; } } diff --git a/src/ARMJIT_A64/ARMJIT_Branch.cpp b/src/ARMJIT_A64/ARMJIT_Branch.cpp index f9c2e0c5..fc08c661 100644 --- a/src/ARMJIT_A64/ARMJIT_Branch.cpp +++ b/src/ARMJIT_A64/ARMJIT_Branch.cpp @@ -83,14 +83,14 @@ void Compiler::Comp_JumpTo(u32 addr, bool forceNonConstantCycles) // doesn't matter if we put garbage in the MSbs there if (addr & 0x2) { - cpu9->CodeRead32(addr-2, true) >> 16; + //cpu9->CodeRead32(addr-2, true) >> 16; cycles += cpu9->CodeCycles; - cpu9->CodeRead32(addr+2, false); + //cpu9->CodeRead32(addr+2, false); cycles += CurCPU->CodeCycles; } else { - cpu9->CodeRead32(addr, 
true); + //cpu9->CodeRead32(addr, true); cycles += cpu9->CodeCycles; } } @@ -99,9 +99,9 @@ void Compiler::Comp_JumpTo(u32 addr, bool forceNonConstantCycles) addr &= ~0x3; newPC = addr+4; - cpu9->CodeRead32(addr, true); + //cpu9->CodeRead32(addr, true); cycles += cpu9->CodeCycles; - cpu9->CodeRead32(addr+4, false); + //cpu9->CodeRead32(addr+4, false); cycles += cpu9->CodeCycles; } diff --git a/src/ARMJIT_A64/ARMJIT_LoadStore.cpp b/src/ARMJIT_A64/ARMJIT_LoadStore.cpp index 6d2c4276..37d6c332 100644 --- a/src/ARMJIT_A64/ARMJIT_LoadStore.cpp +++ b/src/ARMJIT_A64/ARMJIT_LoadStore.cpp @@ -79,18 +79,18 @@ bool Compiler::Comp_MemLoadLiteral(int size, bool signExtend, int rd, u32 addr) CurCPU->R[15] = R15; if (size == 32) { - CurCPU->DataRead32(addr & ~0x3, &val); + //CurCPU->DataRead32(addr & ~0x3, &val); val = melonDS::ROR(val, (addr & 0x3) << 3); } else if (size == 16) { - CurCPU->DataRead16(addr & ~0x1, &val); + //CurCPU->DataRead16(addr & ~0x1, &val); if (signExtend) val = ((s32)val << 16) >> 16; } else { - CurCPU->DataRead8(addr, &val); + // CurCPU->DataRead8(addr, &val); if (signExtend) val = ((s32)val << 24) >> 24; } diff --git a/src/ARMJIT_x64/ARMJIT_Branch.cpp b/src/ARMJIT_x64/ARMJIT_Branch.cpp index c32e2b73..0b5317d0 100644 --- a/src/ARMJIT_x64/ARMJIT_Branch.cpp +++ b/src/ARMJIT_x64/ARMJIT_Branch.cpp @@ -72,14 +72,14 @@ void Compiler::Comp_JumpTo(u32 addr, bool forceNonConstantCycles) // doesn't matter if we put garbage in the MSbs there if (addr & 0x2) { - cpu9->CodeRead32(addr-2, true); + //cpu9->CodeRead32(addr-2, true); cycles += cpu9->CodeCycles; - cpu9->CodeRead32(addr+2, false); + //cpu9->CodeRead32(addr+2, false); cycles += CurCPU->CodeCycles; } else { - cpu9->CodeRead32(addr, true); + //cpu9->CodeRead32(addr, true); cycles += cpu9->CodeCycles; } } @@ -88,9 +88,9 @@ void Compiler::Comp_JumpTo(u32 addr, bool forceNonConstantCycles) addr &= ~0x3; newPC = addr+4; - cpu9->CodeRead32(addr, true); + //cpu9->CodeRead32(addr, true); cycles += cpu9->CodeCycles; - 
cpu9->CodeRead32(addr+4, false); + //cpu9->CodeRead32(addr+4, false); cycles += cpu9->CodeCycles; } diff --git a/src/ARMJIT_x64/ARMJIT_LoadStore.cpp b/src/ARMJIT_x64/ARMJIT_LoadStore.cpp index 219c7271..fd1fe5af 100644 --- a/src/ARMJIT_x64/ARMJIT_LoadStore.cpp +++ b/src/ARMJIT_x64/ARMJIT_LoadStore.cpp @@ -85,18 +85,18 @@ bool Compiler::Comp_MemLoadLiteral(int size, bool signExtend, int rd, u32 addr) CurCPU->R[15] = R15; if (size == 32) { - CurCPU->DataRead32(addr & ~0x3, &val); + //CurCPU->DataRead32(addr & ~0x3, &val); val = melonDS::ROR(val, (addr & 0x3) << 3); } else if (size == 16) { - CurCPU->DataRead16(addr & ~0x1, &val); + //CurCPU->DataRead16(addr & ~0x1, &val); if (signExtend) val = ((s32)val << 16) >> 16; } else { - CurCPU->DataRead8(addr, &val); + //CurCPU->DataRead8(addr, &val); if (signExtend) val = ((s32)val << 24) >> 24; } diff --git a/src/CP15.cpp b/src/CP15.cpp index 92b41e25..cf5617c3 100644 --- a/src/CP15.cpp +++ b/src/CP15.cpp @@ -367,7 +367,7 @@ u32 ARMv5::RandomLineIndex() return (RNGSeed >> 17) & 0x3; } -u32 ARMv5::ICacheLookup(const u32 addr) +bool ARMv5::ICacheLookup(const u32 addr) { const u32 tag = (addr & ~(ICACHE_LINELENGTH - 1)); const u32 id = ((addr >> ICACHE_LINELENGTH_LOG2) & (ICACHE_LINESPERSET-1)) << ICACHE_SETS_LOG2; @@ -414,35 +414,36 @@ u32 ARMv5::ICacheLookup(const u32 addr) { u32 *cacheLine = (u32 *)&ICache[(id+set) << ICACHE_LINELENGTH_LOG2]; - if (ICacheFillPtr >= 7) + if (ICacheStreamPtr >= 7) { if (NDS.ARM9Timestamp < ITCMTimestamp) NDS.ARM9Timestamp = ITCMTimestamp; // does this apply to streamed fetches? 
NDS.ARM9Timestamp++; } else { - u64 nextfill = ICacheFillTimes[ICacheFillPtr++]; + u64 nextfill = ICacheStreamTimes[ICacheStreamPtr++]; if (NDS.ARM9Timestamp < nextfill) { NDS.ARM9Timestamp = nextfill; } else { - u64 fillend = ICacheFillTimes[6] + 2; + u64 fillend = ICacheStreamTimes[6] + 2; if (NDS.ARM9Timestamp < fillend) NDS.ARM9Timestamp = fillend; else // checkme { if (NDS.ARM9Timestamp < ITCMTimestamp) NDS.ARM9Timestamp = ITCMTimestamp; NDS.ARM9Timestamp++; } - ICacheFillPtr = 7; + ICacheStreamPtr = 7; } } - if (NDS.ARM9Timestamp < TimestampActual) NDS.ARM9Timestamp = TimestampActual; DataRegion = Mem9_Null; Store = false; - return cacheLine[(addr & (ICACHE_LINELENGTH -1)) >> 2]; + + RetVal = cacheLine[(addr & (ICACHE_LINELENGTH -1)) / 4]; + return true; } } @@ -451,35 +452,8 @@ u32 ARMv5::ICacheLookup(const u32 addr) // We do not fill the cacheline if it is disabled in the // BIST test State register (See arm946e-s Rev 1 technical manual, 2.3.15 "Register 15, test State Register") if (CP15BISTTestStateRegister & CP15_BIST_TR_DISABLE_ICACHE_LINEFILL) [[unlikely]] - { - u8 cycles = MemTimings[addr >> 14][1]; + return false; - WriteBufferDrain(); - - NDS.ARM9Timestamp = NDS.ARM9Timestamp + ((1<> 24) == 0x02) - { - if (NDS.ARM9Timestamp < MainRAMTimestamp) NDS.ARM9Timestamp = MainRAMTimestamp + ((1<>14] == DataRegion && Store) NDS.ARM9Timestamp += (1<> 2] = NDS.ARM9Read32(tag+i); - } - ICacheTags[line] = tag | (line & (ICACHE_SETS-1)) | CACHE_FLAG_VALID; // timing logic NDS.ARM9Timestamp = NDS.ARM9Timestamp + ((1<> 24) == 0x02) + if (NDS.ARM9Regions[addr>>14] == Mem9_MainRAM) { - if (NDS.ARM9Timestamp < MainRAMTimestamp) NDS.ARM9Timestamp = MainRAMTimestamp + ((1<>14] == WBLastRegion)) // check write buffer - || (Store && (NDS.ARM9Regions[addr>>14] == DataRegion))) //check the actual store - NDS.ARM9Timestamp += 1<> 14][1] + (MemTimings[tag >> 14][2] * ((DCACHE_LINELENGTH / 4) - 1)); - if (NDS.ARM9Timestamp < TimestampActual) NDS.ARM9Timestamp = TimestampActual; 
// this should never trigger in practice - } - else // ICache Streaming logic - { - u8 ns = MemTimings[addr>>14][1]; - u8 seq = MemTimings[addr>>14][2]; + for (int i = 0; i < ICACHE_LINELENGTH; i+=sizeof(u32)) + ptr[i/4] = NDS.ARM9Read32(tag+i); - u8 linepos = (addr & 0x1F) >> 2; // technically this is one too low, but we want that actually + if (((NDS.ARM9Timestamp <= WBReleaseTS) && (NDS.ARM9Regions[addr>>14] == WBLastRegion)) // check write buffer + || (Store && (NDS.ARM9Regions[addr>>14] == DataRegion))) //check the actual store + NDS.ARM9Timestamp += 1<> 14][1] + (MemTimings[tag >> 14][2] * ((DCACHE_LINELENGTH / 4) - 1)); + if (NDS.ARM9Timestamp < TimestampActual) NDS.ARM9Timestamp = TimestampActual; // this should never trigger in practice } + else // ICache Streaming logic + { + u8 ns = MemTimings[addr>>14][1]; + u8 seq = MemTimings[addr>>14][2]; + + u8 linepos = (addr & 0x1F) / 4; // technically this is one too low, but we want that actually - if ((addr >> 24) == 0x02) MainRAMTimestamp = ((linepos < 7) ? ICacheFillTimes[6] : NDS.ARM9Timestamp); + u64 cycles = ns + (seq * linepos); + NDS.ARM9Timestamp = cycles += NDS.ARM9Timestamp; + + ICacheStreamPtr = linepos; + for (int i = linepos; i < 7; i++) + { + cycles += seq; + ICacheStreamTimes[i] = cycles; + } + } + RetVal = ptr[(addr & (ICACHE_LINELENGTH-1)) / 4]; } - + Store = false; DataRegion = Mem9_Null; - return ptr[(addr & (ICACHE_LINELENGTH-1)) >> 2]; + return true; } void ARMv5::ICacheInvalidateByAddr(const u32 addr) @@ -659,23 +634,23 @@ u32 ARMv5::DCacheLookup(const u32 addr) { u32 *cacheLine = (u32 *)&DCache[(id+set) << DCACHE_LINELENGTH_LOG2]; - if (DCacheFillPtr >= 7) + if (DCacheStreamPtr >= 7) { DataCycles = 1; } else { - u64 nextfill = DCacheFillTimes[DCacheFillPtr++]; + u64 nextfill = DCacheStreamTimes[DCacheStreamPtr++]; //if (NDS.ARM9Timestamp < nextfill) // can this ever really fail? 
{ DataCycles = nextfill - NDS.ARM9Timestamp; } /*else { - u64 fillend = DCacheFillTimes[6] + 2; + u64 fillend = DCacheStreamTimes[6] + 2; if (NDS.ARM9Timestamp < fillend) DataCycles = fillend - NDS.ARM9Timestamp; else DataCycles = 1; - DCacheFillPtr = 7; + DCacheStreamPtr = 7; }*/ } DataRegion = Mem9_DCache; @@ -683,6 +658,14 @@ u32 ARMv5::DCacheLookup(const u32 addr) return cacheLine[(addr & (DCACHE_LINELENGTH -1)) >> 2]; } } + + // bus reads can only overlap with icache streaming by 6 cycles + // checkme: does cache trigger this? + if (ICacheStreamPtr < 7) + { + u64 time = ICacheStreamTimes[6] - 6; // checkme: minus 6? + if (NDS.ARM9Timestamp < time) NDS.ARM9Timestamp = time; + } // cache miss miss: @@ -690,14 +673,6 @@ u32 ARMv5::DCacheLookup(const u32 addr) // BIST test State register (See arm946e-s Rev 1 technical manual, 2.3.15 "Register 15, test State Register") if (CP15BISTTestStateRegister & CP15_BIST_TR_DISABLE_DCACHE_LINEFILL) [[unlikely]] { - // bus reads can only overlap with icache streaming by 6 cycles - // checkme: does cache trigger this? - if (ICacheFillPtr < 7) - { - u64 time = ICacheFillTimes[6] - 6; // checkme: minus 6? - if (NDS.ARM9Timestamp < time) NDS.ARM9Timestamp = time; - } - WriteBufferDrain(); NDS.ARM9Timestamp = (NDS.ARM9Timestamp + ((1<> 24) == 0x02) MainRAMTimestamp = ((linepos < 7) ? ICacheFillTimes[6] : NDS.ARM9Timestamp); + if ((addr >> 24) == 0x02) MainRAMTimestamp = ((linepos < 7) ? 
ICacheStreamTimes[6] : NDS.ARM9Timestamp); } return ptr[(addr & (DCACHE_LINELENGTH-1)) >> 2]; } @@ -871,7 +846,7 @@ bool ARMv5::DCacheWrite32(const u32 addr, const u32 val) DataCycles = 1; DataRegion = Mem9_DCache; #if !DISABLE_CACHEWRITEBACK - if (PU_Map[addr >> CP15_MAP_ENTRYSIZE_LOG2] & CP15_MAP_DCACHEWRITEBACK) + if (PU_Map[addr >> CP15_MAP_ENTRYSIZE_LOG2] & CP15_MAP_BUFFERABLE) { if (addr & (DCACHE_LINELENGTH / 2)) { @@ -939,7 +914,7 @@ bool ARMv5::DCacheWrite16(const u32 addr, const u16 val) DataCycles = 1; DataRegion = Mem9_DCache; #if !DISABLE_CACHEWRITEBACK - if (PU_Map[addr >> CP15_MAP_ENTRYSIZE_LOG2] & CP15_MAP_DCACHEWRITEBACK) + if (PU_Map[addr >> CP15_MAP_ENTRYSIZE_LOG2] & CP15_MAP_BUFFERABLE) { if (addr & (DCACHE_LINELENGTH / 2)) { @@ -1008,7 +983,7 @@ bool ARMv5::DCacheWrite8(const u32 addr, const u8 val) DataCycles = 1; DataRegion = Mem9_DCache; #if !DISABLE_CACHEWRITEBACK - if (PU_Map[addr >> CP15_MAP_ENTRYSIZE_LOG2] & CP15_MAP_DCACHEWRITEBACK) + if (PU_Map[addr >> CP15_MAP_ENTRYSIZE_LOG2] & CP15_MAP_BUFFERABLE) { if (addr & (DCACHE_LINELENGTH / 2)) { @@ -1736,7 +1711,8 @@ void ARMv5::CP15Write(u32 id, u32 val) // we force a fill by looking up the value from cache // if it wasn't cached yet, it will be loaded into cache // low bits are set to 0x1C to trick cache streaming - ICacheLookup((val & ~0x03) | 0x1C); + printf("PREFETCH ICACHE\n"); + //ICacheLookup((val & ~0x03) | 0x1C); TODO: REIMPLEMENT WITH DEFERENCE return; case 0x7E0: @@ -2071,17 +2047,18 @@ u32 ARMv5::CP15Read(const u32 id) const // TCM are handled here. 
// TODO: later on, handle PU -u64 ARMv5::CodeRead32(u32 addr, bool branch) +void ARMv5::CodeRead32(u32 addr) { // prefetch abort // the actual exception is not raised until the aborted instruction is executed - if (!(PU_Map[addr>>12] & 0x04)) [[unlikely]] + if (!(PU_Map[addr>>12] & CP15_MAP_EXECUTABLE)) [[unlikely]] { NDS.ARM9Timestamp += 1; if (NDS.ARM9Timestamp < TimestampActual) NDS.ARM9Timestamp = TimestampActual; DataRegion = Mem9_Null; Store = false; - return ((u64)1<<63); + RetVal = ((u64)1<<63); + return; } if (addr < ITCMSize) @@ -2091,7 +2068,8 @@ u64 ARMv5::CodeRead32(u32 addr, bool branch) if (NDS.ARM9Timestamp < TimestampActual) NDS.ARM9Timestamp = TimestampActual; DataRegion = Mem9_Null; Store = false; - return *(u32*)&ITCM[addr & (ITCMPhysicalSize - 1)]; + RetVal = *(u32*)&ITCM[addr & (ITCMPhysicalSize - 1)]; + return; } #if !DISABLE_ICACHE @@ -2101,15 +2079,15 @@ u64 ARMv5::CodeRead32(u32 addr, bool branch) { if (IsAddressICachable(addr)) { - return ICacheLookup(addr); + if (ICacheLookup(addr)) return; } #endif } // bus reads can only overlap with dcache streaming by 6 cycles - if (DCacheFillPtr < 7) + if (DCacheStreamPtr < 7) { - u64 time = DCacheFillTimes[6] - 6; // checkme: minus 6? + u64 time = DCacheStreamTimes[6] - 6; // checkme: minus 6? if (NDS.ARM9Timestamp < time) NDS.ARM9Timestamp = time; } @@ -2148,41 +2126,84 @@ u64 ARMv5::CodeRead32(u32 addr, bool branch) if (NDS.ARM9Timestamp < TimestampActual) NDS.ARM9Timestamp = TimestampActual; DataRegion = Mem9_Null; - return BusRead32(addr); + RetVal = BusRead32(addr); + return; } -bool ARMv5::DataRead8(u32 addr, u32* val) +void ARMv5::DAbortHandle() { - if (DCacheFillPtr < 7) + if (DCacheStreamPtr < 7) { - u64 fillend = DCacheFillTimes[6] + 1; + u64 fillend = DCacheStreamTimes[6] + 1; if (NDS.ARM9Timestamp < fillend) NDS.ARM9Timestamp = fillend; // checkme: should this be data cycles? 
- DCacheFillPtr = 7; + DCacheStreamPtr = 7; } + + DataCycles = 1; +} +void ARMv5::DAbortHandleS() +{ + NDS.ARM9Timestamp += DataCycles; + + if (DCacheStreamPtr < 7) + { + u64 fillend = DCacheStreamTimes[6] + 1; + if (NDS.ARM9Timestamp < fillend) NDS.ARM9Timestamp = fillend; // checkme: should this be data cycles? + DCacheStreamPtr = 7; + } + + DataCycles = 1; +} + +bool ARMv5::DataRead8(u32 addr, u8 reg) +{ // Data Aborts // Exception is handled in the actual instruction implementation - if (!(PU_Map[addr>>12] & 0x01)) [[unlikely]] + if (!(PU_Map[addr>>12] & CP15_MAP_READABLE)) [[unlikely]] { - DataCycles = 1; + if (MRTrack.Type != MainRAMType::Null) FuncQueue[FuncQueueFill++] = &ARMv5::DAbortHandle; + else DAbortHandle(); return false; } + FetchAddr[reg] = addr; + LDRRegs = 1<> (8 * (addr & 3))) & 0xff; - return true; + return; } } #endif // bus reads can only overlap with icache streaming by 6 cycles // checkme: does dcache trigger this? - if (ICacheFillPtr < 7) + if (ICacheStreamPtr < 7) { - u64 time = ICacheFillTimes[6] - 6; // checkme: minus 6? + u64 time = ICacheStreamTimes[6] - 6; // checkme: minus 6? if (NDS.ARM9Timestamp < time) NDS.ARM9Timestamp = time; } @@ -2232,24 +2253,39 @@ bool ARMv5::DataRead8(u32 addr, u32* val) if (WBTimestamp < ((NDS.ARM9Timestamp + DataCycles - (3<>12] & CP15_MAP_READABLE)) [[unlikely]] + { + if (MRTrack.Type != MainRAMType::Null) FuncQueue[FuncQueueFill++] = &ARMv5::DAbortHandle; + else DAbortHandle(); + return false; + } + + FetchAddr[reg] = addr; + LDRRegs = 1<>12] & 0x01)) [[unlikely]] + if (DCacheStreamPtr < 7) { - DataCycles = 1; - return false; + u64 fillend = DCacheStreamTimes[6] + 1; + if (NDS.ARM9Timestamp < fillend) NDS.ARM9Timestamp = fillend; // checkme: should this be data cycles? 
+ DCacheStreamPtr = 7; } addr &= ~1; @@ -2260,14 +2296,14 @@ bool ARMv5::DataRead16(u32 addr, u32* val) ITCMTimestamp = NDS.ARM9Timestamp + DataCycles; DataRegion = Mem9_ITCM; *val = *(u16*)&ITCM[addr & (ITCMPhysicalSize - 1)]; - return true; + return; } if ((addr & DTCMMask) == DTCMBase) { DataCycles = 1; DataRegion = Mem9_DTCM; *val = *(u16*)&DTCM[addr & (DTCMPhysicalSize - 1)]; - return true; + return; } #if !DISABLE_DCACHE @@ -2278,16 +2314,16 @@ bool ARMv5::DataRead16(u32 addr, u32* val) if (IsAddressDCachable(addr)) { *val = (DCacheLookup(addr) >> (8* (addr & 2))) & 0xffff; - return true; + return; } } #endif // bus reads can only overlap with icache streaming by 6 cycles // checkme: does cache trigger this? - if (ICacheFillPtr < 7) + if (ICacheStreamPtr < 7) { - u64 time = ICacheFillTimes[6] - 6; // checkme: minus 6? + u64 time = ICacheStreamTimes[6] - 6; // checkme: minus 6? if (NDS.ARM9Timestamp < time) NDS.ARM9Timestamp = time; } @@ -2317,24 +2353,39 @@ bool ARMv5::DataRead16(u32 addr, u32* val) if (WBTimestamp < ((NDS.ARM9Timestamp + DataCycles - (3<>12] & CP15_MAP_READABLE)) [[unlikely]] + { + if (MRTrack.Type != MainRAMType::Null) FuncQueue[FuncQueueFill++] = &ARMv5::DAbortHandle; + else DAbortHandle(); + return false; + } + + FetchAddr[reg] = addr; + LDRRegs = 1<>12] & 0x01)) [[unlikely]] + if (DCacheStreamPtr < 7) { - DataCycles = 1; - return false; + u64 fillend = DCacheStreamTimes[6] + 1; + if (NDS.ARM9Timestamp < fillend) NDS.ARM9Timestamp = fillend; // checkme: should this be data cycles? 
+ DCacheStreamPtr = 7; } addr &= ~3; @@ -2345,14 +2396,16 @@ bool ARMv5::DataRead32(u32 addr, u32* val) ITCMTimestamp = NDS.ARM9Timestamp + DataCycles; DataRegion = Mem9_ITCM; *val = *(u32*)&ITCM[addr & (ITCMPhysicalSize - 1)]; - return true; + LDRRegs &= ~1<>12] & CP15_MAP_READABLE)) [[unlikely]] + { + if (MRTrack.Type != MainRAMType::Null) FuncQueue[FuncQueueFill] = &ARMv5::DAbortHandleS; + else DAbortHandleS(); + return false; + } + + FetchAddr[reg] = addr; + LDRRegs |= 1<>12] & 0x01)) [[unlikely]] - { - DataCycles = 1; - return false; - } + NDS.ARM9Timestamp += DataCycles; addr &= ~3; @@ -2425,14 +2495,16 @@ bool ARMv5::DataRead32S(u32 addr, u32* val) // we update the timestamp during the actual function, as a sequential itcm access can only occur during instructions with strange itcm wait cycles DataRegion = Mem9_ITCM; *val = *(u32*)&ITCM[addr & (ITCMPhysicalSize - 1)]; - return true; + LDRRegs &= ~1<>12] & CP15_MAP_WRITEABLE)) [[unlikely]] + { + if (MRTrack.Type != MainRAMType::Null) FuncQueue[FuncQueueFill++] = &ARMv5::DAbortHandle; + else DAbortHandle(); + return false; + } + + FetchAddr[reg] = addr; + STRRegs = 1<>12] & 0x02)) [[unlikely]] + if (DCacheStreamPtr < 7) { - DataCycles = 1; - return false; + u64 fillend = DCacheStreamTimes[6] + 1; + if (NDS.ARM9Timestamp < fillend) NDS.ARM9Timestamp = fillend; // checkme: should this be data cycles? 
+ DCacheStreamPtr = 7; } if (addr < ITCMSize) @@ -2532,14 +2621,14 @@ bool ARMv5::DataWrite8(u32 addr, u8 val) #ifdef JIT_ENABLED NDS.JIT.CheckAndInvalidate<0, ARMJIT_Memory::memregion_ITCM>(addr); #endif - return true; + return; } if ((addr & DTCMMask) == DTCMBase) { DataCycles = 1; DataRegion = Mem9_DTCM; *(u8*)&DTCM[addr & (DTCMPhysicalSize - 1)] = val; - return true; + return; } #if !DISABLE_DCACHE @@ -2550,16 +2639,16 @@ bool ARMv5::DataWrite8(u32 addr, u8 val) if (IsAddressDCachable(addr)) { if (DCacheWrite8(addr, val)) - return true; + return; } } #endif // bus reads can only overlap with icache streaming by 6 cycles // checkme: does cache trigger this? - if (ICacheFillPtr < 7) + if (ICacheStreamPtr < 7) { - u64 time = ICacheFillTimes[6] - 6; // checkme: minus 6? + u64 time = ICacheStreamTimes[6] - 6; // checkme: minus 6? if (NDS.ARM9Timestamp < time) NDS.ARM9Timestamp = time; } @@ -2594,24 +2683,39 @@ bool ARMv5::DataWrite8(u32 addr, u8 val) DataCycles = 1; WBDelay = NDS.ARM9Timestamp + 2; } +} + +bool ARMv5::DataWrite16(u32 addr, u16 val, u8 reg) +{ + // Data Aborts + // Exception is handled in the actual instruction implementation + if (!(PU_Map[addr>>12] & CP15_MAP_WRITEABLE)) [[unlikely]] + { + if (MRTrack.Type != MainRAMType::Null) FuncQueue[FuncQueueFill++] = &ARMv5::DAbortHandle; + else DAbortHandle(); + return false; + } + + FetchAddr[reg] = addr; + STRRegs = 1<>12] & 0x02)) [[unlikely]] + if (DCacheStreamPtr < 7) { - DataCycles = 1; - return false; + u64 fillend = DCacheStreamTimes[6] + 1; + if (NDS.ARM9Timestamp < fillend) NDS.ARM9Timestamp = fillend; // checkme: should this be data cycles? 
+ DCacheStreamPtr = 7; } addr &= ~1; @@ -2625,14 +2729,14 @@ bool ARMv5::DataWrite16(u32 addr, u16 val) #ifdef JIT_ENABLED NDS.JIT.CheckAndInvalidate<0, ARMJIT_Memory::memregion_ITCM>(addr); #endif - return true; + return; } if ((addr & DTCMMask) == DTCMBase) { DataCycles = 1; DataRegion = Mem9_DTCM; *(u16*)&DTCM[addr & (DTCMPhysicalSize - 1)] = val; - return true; + return; } #if !DISABLE_DCACHE @@ -2643,16 +2747,16 @@ bool ARMv5::DataWrite16(u32 addr, u16 val) if (IsAddressDCachable(addr)) { if (DCacheWrite16(addr, val)) - return true; + return; } } #endif // bus reads can only overlap with icache streaming by 6 cycles // checkme: does cache trigger this? - if (ICacheFillPtr < 7) + if (ICacheStreamPtr < 7) { - u64 time = ICacheFillTimes[6] - 6; // checkme: minus 6? + u64 time = ICacheStreamTimes[6] - 6; // checkme: minus 6? if (NDS.ARM9Timestamp < time) NDS.ARM9Timestamp = time; } @@ -2687,24 +2791,39 @@ bool ARMv5::DataWrite16(u32 addr, u16 val) DataCycles = 1; WBDelay = NDS.ARM9Timestamp + 2; } +} + +bool ARMv5::DataWrite32(u32 addr, u32 val, u8 reg) +{ + // Data Aborts + // Exception is handled in the actual instruction implementation + if (!(PU_Map[addr>>12] & CP15_MAP_WRITEABLE)) [[unlikely]] + { + if (MRTrack.Type != MainRAMType::Null) FuncQueue[FuncQueueFill++] = &ARMv5::DAbortHandle; + else DAbortHandle(); + return false; + } + + FetchAddr[reg] = addr; + STRRegs = 1<>12] & 0x02)) [[unlikely]] + if (DCacheStreamPtr < 7) { - DataCycles = 1; - return false; + u64 fillend = DCacheStreamTimes[6] + 1; + if (NDS.ARM9Timestamp < fillend) NDS.ARM9Timestamp = fillend; // checkme: should this be data cycles? 
+ DCacheStreamPtr = 7; } addr &= ~3; @@ -2718,14 +2837,16 @@ bool ARMv5::DataWrite32(u32 addr, u32 val) #ifdef JIT_ENABLED NDS.JIT.CheckAndInvalidate<0, ARMJIT_Memory::memregion_ITCM>(addr); #endif - return true; + STRRegs &= ~1<>12] & CP15_MAP_WRITEABLE)) [[unlikely]] + { + if (MRTrack.Type != MainRAMType::Null) FuncQueue[FuncQueueFill++] = &ARMv5::DAbortHandleS; + else DAbortHandleS(); + return false; + } + + FetchAddr[reg] = addr; + STRRegs |= 1<>12] & 0x02)) [[unlikely]] - { - DataCycles = 1; - return false; - } + NDS.ARM9Timestamp += DataCycles; addr &= ~3; @@ -2806,14 +2946,16 @@ bool ARMv5::DataWrite32S(u32 addr, u32 val) #ifdef JIT_ENABLED NDS.JIT.CheckAndInvalidate<0, ARMJIT_Memory::memregion_ITCM>(addr); #endif - return true; + STRRegs &= ~1< 0) && A9WENTLAST) + { + MainRAMTimestamp += 2; + A9ContentionTS += 2; + } + else + { + MainRAMTimestamp = A9ContentionTS + 9; + A9ContentionTS += (ARM9ClockShift == 1) ? 9 : 8; + MainRAMLastAccess = A9LAST; + } + + if (*prog == ARM9.ICacheStreamPtr) ARM9Timestamp = (A9ContentionTS << ARM9ClockShift) - 1; + else if (*prog > ARM9.ICacheStreamPtr) ARM9.ICacheStreamTimes[*prog-1] = (A9ContentionTS << ARM9ClockShift) - 1; + + (*prog)++; + if (*prog >= 8) + { + ARM9.RetVal = icache[(ARM9.FetchAddr[16] & 0x1F) / 4]; + memset(&ARM9.MRTrack, 0, sizeof(ARM9.MRTrack)); + A9ContentionTS = 0; + } + break; + } + } +} + +void NDS::MainRAMHandle() +{ + if (!A9ContentionTS) + { + A9ContentionTS = (ARM9Timestamp + ((1<> ARM9ClockShift; + if ((ARM9.MRTrack.Type != MainRAMType::Null) && (A9ContentionTS < MainRAMTimestamp)) A9ContentionTS = MainRAMTimestamp; + } + + bool A7Priority = ExMemCnt[0] & 0x8000; + if (A7Priority) + { + while (true) + { + if (A9ContentionTS < ARM7Timestamp) + { + if (ARM9.MRTrack.Type == MainRAMType::Null) { A9ContentionTS = 0; return; } + MainRAMHandleARM9(); + } + else + { + if (true) return; + } + } + } + else + { + while (true) + { + if (A9ContentionTS <= ARM7Timestamp) + { + if (ARM9.MRTrack.Type == 
MainRAMType::Null) { A9ContentionTS = 0; return; } + MainRAMHandleARM9(); + } + else + { + if (true) return; + } + } + } +} + +#undef A9WENTLAST +#undef A7WENTLAST +#undef A9LAST +#undef A7LAST + template u32 NDS::RunFrame() { @@ -970,16 +1068,21 @@ u32 NDS::RunFrame() ts = ARM9Timestamp - ts; for (int i = 0; i < 7; i++) { - ARM9.ICacheFillTimes[i] += ts; - ARM9.DCacheFillTimes[i] += ts; + ARM9.ICacheStreamTimes[i] += ts; + ARM9.DCacheStreamTimes[i] += ts; } ARM9.WBTimestamp += ts; } - else + else if (ARM9.MRTrack.Type == MainRAMType::Null) { + if (ARM9.abt) ARM9Timestamp = ARM9Target; ARM9.Execute(); } + + //printf("MAIN LOOP: %lli %lli\n", ARM9Timestamp>>ARM9ClockShift, ARM7Timestamp); + + MainRAMHandle(); RunTimers(0); GPU.GPU3D.Run(); @@ -987,9 +1090,11 @@ u32 NDS::RunFrame() target = ARM9Timestamp >> ARM9ClockShift; CurCPU = 1; - while (ARM7Timestamp < target) + while ((ARM7Timestamp < target) || (ARM9.MRTrack.Type != MainRAMType::Null)) { - ARM7Target = target; // might be changed by a reschedule + ARM7Target = (ARM9.MRTrack.Type != MainRAMType::Null) ? 
(ARM7Timestamp+1) : target; // might be changed by a reschedule + + //printf("A7 LOOP: %lli %lli\n", ARM9Timestamp>>ARM9ClockShift, ARM7Timestamp); if (CPUStop & CPUStop_DMA7) { @@ -1008,6 +1113,8 @@ u32 NDS::RunFrame() ARM7.Execute(); } + MainRAMHandle(); + RunTimers(1); } diff --git a/src/NDS.h b/src/NDS.h index c1f0ff88..8afbdf2c 100644 --- a/src/NDS.h +++ b/src/NDS.h @@ -253,6 +253,8 @@ public: // TODO: Encapsulate the rest of these members // no need to worry about those overflowing, they can keep going for atleast 4350 years u64 ARM9Timestamp, ARM9Target; u64 ARM7Timestamp, ARM7Target; + u64 MainRAMTimestamp; + u64 A9ContentionTS; u32 ARM9ClockShift; u32 IME[2]; @@ -270,6 +272,8 @@ public: // TODO: Encapsulate the rest of these members alignas(u32) u8 ROMSeed0[2*8]; alignas(u32) u8 ROMSeed1[2*8]; + bool MainRAMLastAccess; // 0 == ARM9 | 1 == ARM7 + protected: // These BIOS arrays should be declared *before* the component objects (JIT, SPI, etc.) // so that they're initialized before the component objects' constructors run. 
@@ -394,6 +398,9 @@ public: // TODO: Encapsulate the rest of these members void LoadGBAAddon(int type); std::unique_ptr EjectGBACart() { return GBACartSlot.EjectCart(); } + void MainRAMHandleARM9(); + void MainRAMHandle(); + u32 RunFrame(); bool IsRunning() const noexcept { return Running; } From ebc1168b605bdbe84a5d3167afcc0a7278b83c68 Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Thu, 5 Dec 2024 15:47:53 -0500 Subject: [PATCH 228/306] implement queueing ldr RORs and sign extension --- src/ARM.cpp | 5 ---- src/ARM.h | 11 +++++++- src/ARMInterpreter_LoadStore.cpp | 44 +++++++++++++++++++++++++++----- 3 files changed, 48 insertions(+), 12 deletions(-) diff --git a/src/ARM.cpp b/src/ARM.cpp index 0bc138c2..68770b19 100644 --- a/src/ARM.cpp +++ b/src/ARM.cpp @@ -510,11 +510,6 @@ void ARM::RestoreCPSR() UpdateMode(oldcpsr, CPSR); } -void ARMv5::QueueUpdateMode() -{ - UpdateMode(QueueMode[0], QueueMode[1], true); -} - void ARM::UpdateMode(u32 oldmode, u32 newmode, bool phony) { if ((oldmode & 0x1F) == (newmode & 0x1F)) return; diff --git a/src/ARM.h b/src/ARM.h index 9fb195f4..92052674 100644 --- a/src/ARM.h +++ b/src/ARM.h @@ -206,6 +206,8 @@ public: bool BranchRestore; u32 QueueMode[2]; + u8 ExtReg; + u8 ExtROROffs; u64 RetVal; @@ -665,7 +667,14 @@ public: void DWrite16_2(); void DWrite32_2(); void DWrite32S_2(); - void QueueUpdateMode(); + + void QueueUpdateMode() { UpdateMode(QueueMode[0], QueueMode[1], true); } + + void SignExtend8() { R[ExtReg] = (s32)(s8)R[ExtReg]; } + + void SignExtend16() { R[ExtReg] = (s32)(s16)R[ExtReg]; } + + void ROR32() { R[ExtReg] = ROR(R[ExtReg], ExtROROffs); } u32 CP15Control; //! 
CP15 Register 1: Control Register diff --git a/src/ARMInterpreter_LoadStore.cpp b/src/ARMInterpreter_LoadStore.cpp index 56380e6c..6fb39f74 100644 --- a/src/ARMInterpreter_LoadStore.cpp +++ b/src/ARMInterpreter_LoadStore.cpp @@ -115,9 +115,17 @@ void LoadSingle(ARM* cpu, const u8 rd, const u8 rn, const s32 offset, const u16 ((ARMv5*)cpu)->DataAbort(); return; } - if ((cpu->MRTrack.Type != MainRAMType::Null) && signextend && cpu->Num == 0) printf("ARGH ME BONES"); - if constexpr (size == 8 && signextend) cpu->R[rd] = (s32)(s8)cpu->R[rd]; + if constexpr (size == 8 && signextend) + { + if (cpu->Num == 0) + { + cpu->ExtReg = rd; + if (cpu->MRTrack.Type != MainRAMType::Null) ((ARMv5*)cpu)->FuncQueue[cpu->FuncQueueFill++] = &ARMv5::SignExtend8; + else ((ARMv5*)cpu)->SignExtend8(); + } + else cpu->R[rd] = (s32)(s8)cpu->R[rd]; + } if constexpr (size == 16) { @@ -126,10 +134,25 @@ void LoadSingle(ARM* cpu, const u8 rd, const u8 rn, const s32 offset, const u16 cpu->R[rd] = ROR(cpu->R[rd], ((addr&0x1)<<3)); // unaligned 16 bit loads are ROR'd on arm7 if constexpr (signextend) cpu->R[rd] = (s32)((addr&0x1) ? (s8)cpu->R[rd] : (s16)cpu->R[rd]); // sign extend like a ldrsb if we ror'd the value. 
} - else if constexpr (signextend) cpu->R[rd] = (s32)(s16)cpu->R[rd]; + else if constexpr (signextend) + { + cpu->ExtReg = rd; + if (cpu->MRTrack.Type != MainRAMType::Null) ((ARMv5*)cpu)->FuncQueue[cpu->FuncQueueFill++] = &ARMv5::SignExtend16; + else ((ARMv5*)cpu)->SignExtend16(); + } } - if constexpr (size == 32) cpu->R[rd] = ROR(cpu->R[rd], ((addr&0x3)<<3)); + if constexpr (size == 32) + { + if (cpu->Num == 0) + { + cpu->ExtReg = rd; + cpu->ExtROROffs = (addr & 0x3) * 8; + if (cpu->MRTrack.Type != MainRAMType::Null) ((ARMv5*)cpu)->FuncQueue[cpu->FuncQueueFill++] = &ARMv5::ROR32; + else ((ARMv5*)cpu)->ROR32(); + } + else cpu->R[rd] = ROR(cpu->R[rd], ((addr&0x3)*8)); + } if constexpr (writeback >= Writeback::Post) addr += offset; if constexpr (writeback != Writeback::None) @@ -508,8 +531,17 @@ inline void SWP(ARM* cpu) { // rd only gets updated if both read and write succeed - if constexpr (!byte) cpu->R[rd] = ROR(cpu->R[rd], 8*(base&0x3)); - + if constexpr (!byte) + { + if (cpu->Num == 0) + { + cpu->ExtReg = rd; + cpu->ExtROROffs = (base & 0x3) * 8; + if (cpu->MRTrack.Type != MainRAMType::Null) ((ARMv5*)cpu)->FuncQueue[cpu->FuncQueueFill++] = &ARMv5::ROR32; + else ((ARMv5*)cpu)->ROR32(); + } + else cpu->R[rd] = ROR(cpu->R[rd], ((base&0x3)*8)); + } cpu->AddCycles_CDI(); if (rd != 15) From 08435d2272373d88cdf099f82ad327981ff3b6e1 Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Fri, 6 Dec 2024 17:01:02 -0500 Subject: [PATCH 229/306] implement arm7 code fetches --- src/ARM.cpp | 465 +++++++++++++++++++++---------- src/ARM.h | 67 ++++- src/ARMInterpreter.cpp | 1 + src/ARMInterpreter_LoadStore.cpp | 80 ++---- src/CP15.cpp | 48 ++-- src/NDS.cpp | 79 +++++- src/NDS.h | 1 + 7 files changed, 489 insertions(+), 252 deletions(-) diff --git a/src/ARM.cpp b/src/ARM.cpp index 487724dd..c4655969 100644 --- a/src/ARM.cpp +++ b/src/ARM.cpp @@ -241,6 +241,7 @@ void ARMv5::Reset() void ARMv4::Reset() { + FuncQueue[0] = &ARMv4::StartExec; Nonseq 
= true; ARM::Reset(); @@ -337,24 +338,26 @@ void ARMv5::JumpTo(u32 addr, bool restorecpsr, u8 R15) BranchRestore = restorecpsr; BranchUpdate = R15; BranchAddr = addr; - if (MRTrack.Type != MainRAMType::Null) FuncQueue[FuncQueueFill++] = &ARMv5::JumpTo_2; - else JumpTo_2(); + QueueFunction(&ARMv5::JumpTo_2); } void ARMv5::JumpTo_2() { - if (CP15Control & (1<<15)) + if (BranchUpdate) { - if (BranchUpdate == 1) BranchAddr = R[15] & ~1; - else if (BranchUpdate == 2) BranchAddr = R[15] | 1; + if (CP15Control & (1<<15)) + { + if (BranchUpdate == 1) BranchAddr = R[15] & ~1; + else BranchAddr = R[15] | 1; + } + else BranchAddr = R[15]; } - else if (BranchUpdate) BranchAddr = R[15]; if (BranchRestore) { RestoreCPSR(); - if (CPSR & 0x20) BranchAddr |= 0x1; + if (CPSR & 0x20) BranchAddr |= 0x1; else BranchAddr &= ~0x1; } @@ -384,15 +387,13 @@ void ARMv5::JumpTo_2() { CodeRead32(BranchAddr-2); - if (MRTrack.Type != MainRAMType::Null) FuncQueue[FuncQueueFill++] = &ARMv5::JumpTo_3A; - else JumpTo_3A(); + QueueFunction(&ARMv5::JumpTo_3A); } else { CodeRead32(BranchAddr); - if (MRTrack.Type != MainRAMType::Null) FuncQueue[FuncQueueFill++] = &ARMv5::JumpTo_3B; - else JumpTo_3B(); + QueueFunction(&ARMv5::JumpTo_3B); } } else @@ -404,8 +405,7 @@ void ARMv5::JumpTo_2() CodeRead32(BranchAddr); - if (MRTrack.Type != MainRAMType::Null) FuncQueue[FuncQueueFill++] = &ARMv5::JumpTo_3C; - else JumpTo_3C(); + QueueFunction(&ARMv5::JumpTo_3C); } } @@ -414,8 +414,7 @@ void ARMv5::JumpTo_3A() NextInstr[0] = RetVal >> 16; CodeRead32(BranchAddr+2); - if (MRTrack.Type != MainRAMType::Null) FuncQueue[FuncQueueFill++] = &ARMv5::JumpTo_4; - else JumpTo_4(); + QueueFunction(&ARMv5::JumpTo_4); } void ARMv5::JumpTo_3B() @@ -429,8 +428,7 @@ void ARMv5::JumpTo_3C() NextInstr[0] = RetVal; CodeRead32(BranchAddr+4); - if (MRTrack.Type != MainRAMType::Null) FuncQueue[FuncQueueFill++] = &ARMv5::JumpTo_4; - else JumpTo_4(); + QueueFunction(&ARMv5::JumpTo_4); } void ARMv5::JumpTo_4() @@ -440,40 +438,71 @@ void 
ARMv5::JumpTo_4() void ARMv4::JumpTo(u32 addr, bool restorecpsr, u8 R15) { - if (restorecpsr) + //printf("JUMP! %08X %08X %i %i\n", addr, R[15], restorecpsr, R15); + BranchRestore = restorecpsr; + BranchUpdate = R15; + BranchAddr = addr; + QueueFunction(&ARMv4::JumpTo_2); +} + +void ARMv4::JumpTo_2() +{ + if (BranchUpdate) + { + if (BranchUpdate == 1) BranchAddr = R[15] & ~1; + else BranchAddr = R[15] | 1; + } + + if (BranchRestore) { RestoreCPSR(); - if (CPSR & 0x20) addr |= 0x1; - else addr &= ~0x1; + if (CPSR & 0x20) BranchAddr |= 0x1; + else BranchAddr &= ~0x1; } + + //printf("JUMP2! %08X\n", BranchAddr); - if (addr & 0x1) + if (BranchAddr & 0x1) { - addr &= ~0x1; - R[15] = addr+2; - - Nonseq = true; - NextInstr[0] = CodeRead16(addr); - Nonseq = false; - NextInstr[1] = CodeRead16(addr+2); + BranchAddr &= ~0x1; + R[15] = BranchAddr+2; CPSR |= 0x20; + + Nonseq = true; + CodeRead16(BranchAddr); + QueueFunction(&ARMv4::JumpTo_3A); } else { - addr &= ~0x3; - R[15] = addr+4; - - Nonseq = true; - NextInstr[0] = CodeRead32(addr); - Nonseq = false; - NextInstr[1] = CodeRead32(addr+4); + BranchAddr &= ~0x3; + R[15] = BranchAddr+4; CPSR &= ~0x20; + + Nonseq = true; + CodeRead32(BranchAddr); + QueueFunction(&ARMv4::JumpTo_3B); } } +void ARMv4::JumpTo_3A() +{ + NextInstr[0] = RetVal; + Nonseq = false; + CodeRead16(BranchAddr+2); + QueueFunction(&ARMv4::UpdateNextInstr1); +} + +void ARMv4::JumpTo_3B() +{ + NextInstr[0] = RetVal; + Nonseq = false; + CodeRead32(BranchAddr+4); + QueueFunction(&ARMv4::UpdateNextInstr1); +} + void ARM::RestoreCPSR() { u32 oldcpsr = CPSR; @@ -632,7 +661,6 @@ template void ARM::TriggerIRQ(); void ARMv5::PrefetchAbort() { - abt = true; AddCycles_C(); Log(LogLevel::Warn, "ARM9: prefetch abort (%08X)\n", R[15]); @@ -648,7 +676,6 @@ void ARMv5::PrefetchAbort() void ARMv5::DataAbort() { - abt = true; Log(LogLevel::Warn, "ARM9: data abort (%08X) %08llX\n", R[15], CurInstr); u32 oldcpsr = CPSR; @@ -815,29 +842,11 @@ void ARMv5::Execute() // check if we're 
done with the queue, if so, reset everything if (FuncQueueProg >= FuncQueueEnd) { - FuncQueueFill = 0; FuncQueueProg = 0; FuncQueueEnd = 0; FuncQueueActive = false; FuncQueue[0] = &ARMv5::StartExec; - /* - Platform::FileHandle* file = Platform::OpenFile("REGLOG.bin", Platform::FileMode::Read); - Platform::FileSeek(file, iter*16*4, Platform::FileSeekOrigin::Start); - u32 Regs[16]; - Platform::FileRead(Regs, 4, 16, file); - if (memcmp(Regs, R, 16*4)) - { - printf("MISMATCH ON ITERATION %lli! %08llX", iter, CurInstr); - for (int i = 0; i < 16; i++) - { - printf(" %i: %08X vs %08X", i, R[i], Regs[i]); - } - printf("\n"); - abt=1; - } - Platform::CloseFile(file); - iter++;*/ } } else @@ -852,30 +861,10 @@ void ARMv5::Execute() FuncQueueFill = 0; FuncQueueActive = true; } - else - { - /* - Platform::FileHandle* file = Platform::OpenFile("REGLOG.bin", Platform::FileMode::Read); - Platform::FileSeek(file, iter*16*4, Platform::FileSeekOrigin::Start); - u32 Regs[16]; - Platform::FileRead(Regs, 4, 16, file); - if (memcmp(Regs, R, 16*4)) - { - printf("MISMATCH ON ITERATION %lli! %08llX", iter, CurInstr); - for (int i = 0; i < 16; i++) - { - printf(" %i: %08X vs %08X", i, R[i], Regs[i]); - } - printf("\n"); - abt=1; - } - Platform::CloseFile(file); - iter++;*/ - } - if (MRTrack.Type != MainRAMType::Null) break; // check if we need to resolve main ram + if (MRTrack.Type != MainRAMType::Null) return; // check if we need to resolve main ram // TODO optimize this shit!!! 
- if (Halted) + if (!FuncQueueActive && Halted) { if (Halted == 1 && NDS.ARM9Timestamp < NDS.ARM9Target) { @@ -904,6 +893,45 @@ template void ARMv5::Execute(); template void ARMv5::Execute(); #endif +void ARMv4::StartExec() +{ + if (CPSR & 0x20) // THUMB + { + // prefetch + R[15] += 2; + CurInstr = NextInstr[0]; + NextInstr[0] = NextInstr[1]; + CodeRead16(R[15]); + QueueFunction(&ARMv4::UpdateNextInstr1); + + if (IRQ && !(CPSR & 0x80)) TriggerIRQ(); + else + { + // actually execute + u32 icode = (CurInstr >> 6); + ARMInterpreter::THUMBInstrTable[icode](this); + } + } + else + { + // prefetch + R[15] += 4; + CurInstr = NextInstr[0]; + NextInstr[0] = NextInstr[1]; + CodeRead32(R[15]); + QueueFunction(&ARMv4::UpdateNextInstr1); + + if (IRQ && !(CPSR & 0x80)) TriggerIRQ(); + else if (CheckCondition(CurInstr >> 28)) // actually execute + { + u32 icode = ((CurInstr >> 4) & 0xF) | ((CurInstr >> 16) & 0xFF0); + ARMInterpreter::ARMInstrTable[icode](this); + } + else + AddCycles_C(); + } +} + template void ARMv4::Execute() { @@ -921,8 +949,11 @@ void ARMv4::Execute() Halted = 0; if (NDS.IME[1] & 0x1) { +#ifdef JIT_ENABLED if constexpr (mode == CPUExecuteMode::JIT) TriggerIRQ(); - else IRQ = 1; + else +#endif + IRQ = 1; } } else @@ -974,48 +1005,84 @@ void ARMv4::Execute() else #endif { - if (CPSR & 0x20) // THUMB + if constexpr (mode == CPUExecuteMode::InterpreterGDB) + GdbCheckC(); + + //printf("A:%i, F:%i, P:%i, E:%i, I:%08llX, 15:%08X\n", FuncQueueActive, FuncQueueFill, FuncQueueProg, FuncQueueEnd, CurInstr, R[15]); + + (this->*FuncQueue[FuncQueueProg])(); + + if (FuncQueueActive) { - if constexpr (mode == CPUExecuteMode::InterpreterGDB) - GdbCheckC(); + if (FuncQueueFill == FuncQueueProg) + { + // we did not get a new addition to the queue; increment and reset ptrs + FuncQueueFill = ++FuncQueueProg; - // prefetch - R[15] += 2; - CurInstr = NextInstr[0]; - NextInstr[0] = NextInstr[1]; - NextInstr[1] = CodeRead16(R[15]); + // check if we're done with the queue, if so, 
reset everything + if (FuncQueueProg >= FuncQueueEnd) + { + FuncQueueFill = 0; + FuncQueueProg = 0; + FuncQueueEnd = 0; + FuncQueueActive = false; + FuncQueue[0] = &ARMv4::StartExec; - if (IRQ && !(CPSR & 0x80)) TriggerIRQ(); + /* + if (filey == NULL) filey = Platform::OpenFile("REGLOG.bin", Platform::FileMode::Read); + else + { + u32 regscmp[16]; + Platform::FileRead(regscmp, 4, 16, filey); + if (iter > 471000 && memcmp(regscmp, R, 4*16)) + { + printf("MISMATCH on iter: %lli!!!! %08llX\n", iter, CurInstr); + for (int i = 0; i < 16; i++) + { + printf("R%i :%08X vs CMP:%08X\n", i, R[i], regscmp[i]); + } + //abt++; + } + iter++; + }*/ + } + } else { - // actually execute - u32 icode = (CurInstr >> 6); - ARMInterpreter::THUMBInstrTable[icode](this); + // we got a new addition to the list; redo the current entry + FuncQueueFill = FuncQueueProg; } } + else if (FuncQueueFill > 0) // check if we started the queue up + { + FuncQueueEnd = FuncQueueFill; + FuncQueueFill = 0; + FuncQueueActive = true; + } else { - if constexpr (mode == CPUExecuteMode::InterpreterGDB) - GdbCheckC(); - - // prefetch - R[15] += 4; - CurInstr = NextInstr[0]; - NextInstr[0] = NextInstr[1]; - NextInstr[1] = CodeRead32(R[15]); - - if (IRQ && !(CPSR & 0x80)) TriggerIRQ(); - else if (CheckCondition(CurInstr >> 28)) // actually execute - { - u32 icode = ((CurInstr >> 4) & 0xF) | ((CurInstr >> 16) & 0xFF0); - ARMInterpreter::ARMInstrTable[icode](this); - } + /* + if (filey == NULL) Platform::OpenFile("REGLOG.bin", Platform::FileMode::Read); else - AddCycles_C(); + { + u32 regscmp[16]; + Platform::FileRead(regscmp, 4, 16, filey); + if (iter > 471000 && memcmp(regscmp, R, 4*16)) + { + printf("MISMATCH on iter: %lli!!!! %08llX\n", iter, CurInstr); + for (int i = 0; i < 16; i++) + { + printf("R%i :%08X vs CMP:%08X\n", i, R[i], regscmp[i]); + } + //abt++; + iter++; + } + }*/ } + if (MRTrack.Type != MainRAMType::Null) return; // check if we need to resolve main ram // TODO optimize this shit!!! 
- if (Halted) + if (!FuncQueueActive && Halted) { if (Halted == 1 && NDS.ARM7Timestamp < NDS.ARM7Target) { @@ -1075,7 +1142,7 @@ void ARMv5::FillPipeline() void ARMv4::FillPipeline() { - SetupCodeMem(R[15]); + /*SetupCodeMem(R[15]); if (CPSR & 0x20) { @@ -1086,7 +1153,7 @@ void ARMv4::FillPipeline() { NextInstr[0] = CodeRead32(R[15] - 4); NextInstr[1] = CodeRead32(R[15]); - } + }*/ } #ifdef GDBSTUB_ENABLED @@ -1314,8 +1381,7 @@ void ARMv5::CodeFetch() { CodeRead32(PC); } - if (MRTrack.Type != MainRAMType::Null) FuncQueue[FuncQueueFill++] = &ARMv5::AddExecute; - else AddExecute(); + QueueFunction(&ARMv5::AddExecute); } void ARMv5::AddExecute() @@ -1328,8 +1394,7 @@ void ARMv5::AddExecute() void ARMv5::AddCycles_MW(s32 numM) { DataCycles = numM; - if (MRTrack.Type != MainRAMType::Null) FuncQueue[FuncQueueFill++] = &ARMv5::AddCycles_MW_2; - else AddCycles_MW_2(); + QueueFunction(&ARMv5::AddCycles_MW_2); } void ARMv5::AddCycles_MW_2() @@ -1389,45 +1454,53 @@ void ARMv5::HandleInterlocksMemory(u8 reg) ILPrevTime = 16;*/ } -u16 ARMv4::CodeRead16(u32 addr) +void ARMv4::CodeRead16(u32 addr) { if ((addr >> 24) == 0x02) { - if (NDS.ARM7Timestamp < MainRAMTimestamp) NDS.ARM7Timestamp = MainRAMTimestamp; + FetchAddr[16] = addr; + MRTrack.Type = MainRAMType::Fetch; + MRTrack.Var = MRCodeFetch | MR16; + if (!Nonseq) MRTrack.Var |= MRSequential; } - - NDS.ARM7Timestamp += NDS.ARM7MemTimings[addr>>15][Nonseq?0:1]; - - if ((addr >> 24) == 0x02) + else { - MainRAMTimestamp = NDS.ARM7Timestamp; - NDS.ARM7Timestamp -= 3; + NDS.ARM7Timestamp += NDS.ARM7MemTimings[addr>>15][Nonseq?0:1]; + RetVal = BusRead16(addr); } - - return BusRead16(addr); } -u32 ARMv4::CodeRead32(u32 addr) +void ARMv4::CodeRead32(u32 addr) { if ((addr >> 24) == 0x02) { - if (NDS.ARM7Timestamp < MainRAMTimestamp) NDS.ARM7Timestamp = MainRAMTimestamp; + FetchAddr[16] = addr; + MRTrack.Type = MainRAMType::Fetch; + MRTrack.Var = MRCodeFetch | MR32; + if (!Nonseq) MRTrack.Var |= MRSequential; } - - NDS.ARM7Timestamp += 
NDS.ARM7MemTimings[addr>>15][Nonseq?2:3]; - - if ((addr >> 24) == 0x02) + else { - MainRAMTimestamp = NDS.ARM7Timestamp; - NDS.ARM7Timestamp -= 3; + NDS.ARM7Timestamp += NDS.ARM7MemTimings[addr>>15][Nonseq?2:3]; + RetVal = BusRead32(addr); } - - return BusRead32(addr); } bool ARMv4::DataRead8(u32 addr, u8 reg) { - u32* val = &R[reg]; + FetchAddr[reg] = addr; + LDRRegs = 1<> 24) == 0x02) { @@ -1443,12 +1516,24 @@ bool ARMv4::DataRead8(u32 addr, u8 reg) } *val = BusRead8(addr); - return true; } bool ARMv4::DataRead16(u32 addr, u8 reg) { - u32* val = &R[reg]; + FetchAddr[reg] = addr; + LDRRegs = 1<> 24) == 0x02) @@ -1465,12 +1550,24 @@ bool ARMv4::DataRead16(u32 addr, u8 reg) } *val = BusRead16(addr); - return true; } bool ARMv4::DataRead32(u32 addr, u8 reg) { - u32* val = &R[reg]; + FetchAddr[reg] = addr; + LDRRegs = 1<> 24) == 0x02) @@ -1487,12 +1584,25 @@ bool ARMv4::DataRead32(u32 addr, u8 reg) } *val = BusRead32(addr); - return true; + LDRRegs &= ~1<> 24) == 0x02) @@ -1509,11 +1619,24 @@ bool ARMv4::DataRead32S(u32 addr, u8 reg) } *val = BusRead32(addr); - return true; + LDRRegs &= ~1<> 24) == 0x02) { if (NDS.ARM7Timestamp < MainRAMTimestamp) NDS.ARM7Timestamp = MainRAMTimestamp; @@ -1528,11 +1651,23 @@ bool ARMv4::DataWrite8(u32 addr, u8 val, u8 reg) } BusWrite8(addr, val); - return true; } bool ARMv4::DataWrite16(u32 addr, u16 val, u8 reg) { + FetchAddr[reg] = addr; + STRRegs = 1<> 24) == 0x02) @@ -1549,11 +1684,23 @@ bool ARMv4::DataWrite16(u32 addr, u16 val, u8 reg) } BusWrite16(addr, val); - return true; } bool ARMv4::DataWrite32(u32 addr, u32 val, u8 reg) { + FetchAddr[reg] = addr; + STRRegs = 1<> 24) == 0x02) @@ -1570,11 +1717,24 @@ bool ARMv4::DataWrite32(u32 addr, u32 val, u8 reg) } BusWrite32(addr, val); - return true; + STRRegs &= ~1<> 24) == 0x02) @@ -1591,7 +1751,7 @@ bool ARMv4::DataWrite32S(u32 addr, u32 val, u8 reg) } BusWrite32(addr, val); - return true; + STRRegs &= ~1<*QueueEntry)(); + } + void StartExec(); void AddExecute(); void 
AddCycles_MW_2(); @@ -669,15 +691,12 @@ public: void DWrite16_2(); void DWrite32_2(); void DWrite32S_2(); - void QueueUpdateMode() { UpdateMode(QueueMode[0], QueueMode[1], true); } - void SignExtend8() { R[ExtReg] = (s32)(s8)R[ExtReg]; } - void SignExtend16() { R[ExtReg] = (s32)(s16)R[ExtReg]; } - void ROR32() { R[ExtReg] = ROR(R[ExtReg], ExtROROffs); } + u32 CP15Control; //! CP15 Register 1: Control Register u32 RNGSeed; //! Global cache line fill seed. Used for pseudo random replacement strategy with the instruction and data cache @@ -753,8 +772,6 @@ public: u64 ICacheStreamTimes[7]; u64 DCacheStreamTimes[7]; - bool abt; - u8 WBWritePointer; // which entry to attempt to write next; should always be ANDed with 0xF after incrementing u8 WBFillPointer; // where the next entry should be added; should always be ANDed with 0xF after incrementing u8 WBWriting; // whether the buffer is actively trying to perform a write @@ -796,11 +813,13 @@ public: template void Execute(); - + + Platform::FileHandle* filey; + void (ARMv4::*FuncQueue[31])(void); bool Nonseq; - u16 CodeRead16(u32 addr); - u32 CodeRead32(u32 addr); + void CodeRead16(u32 addr); + void CodeRead32(u32 addr); bool DataRead8(u32 addr, u8 reg) override; bool DataRead16(u32 addr, u8 reg) override; @@ -814,6 +833,34 @@ public: void AddCycles_CI(s32 num) override; void AddCycles_CDI() override; void AddCycles_CD() override; + + inline void QueueFunction(void (ARMv4::*QueueEntry)(void)) + { + if (MRTrack.Type != MainRAMType::Null) + FuncQueue[FuncQueueFill++] = QueueEntry; + else + (this->*QueueEntry)(); + } + + void StartExec(); + void UpdateNextInstr1() { NextInstr[1] = RetVal; } + void JumpTo_2(); + void JumpTo_3A(); + void JumpTo_3B(); + void DRead8_2(); + void DRead16_2(); + void DRead32_2(); + void DRead32S_2(); + void DWrite8_2(); + void DWrite16_2(); + void DWrite32_2(); + void DWrite32S_2(); + void AddExecute(); + void AddExtraCycle(); + void QueueUpdateMode() { UpdateMode(QueueMode[0], QueueMode[1], true); 
} + void SignExtend8() { R[ExtReg] = (s32)(s8)R[ExtReg]; } + void SignExtend16() { R[ExtReg] = (s32)(s16)R[ExtReg]; } + void ROR32() { R[ExtReg] = ROR(R[ExtReg], ExtROROffs); } protected: u8 BusRead8(u32 addr) override; diff --git a/src/ARMInterpreter.cpp b/src/ARMInterpreter.cpp index 64249fac..671b3d8b 100644 --- a/src/ARMInterpreter.cpp +++ b/src/ARMInterpreter.cpp @@ -37,6 +37,7 @@ namespace melonDS::ARMInterpreter void A_UNK(ARM* cpu) { cpu->AddCycles_C(); + cpu->abt=1; Log(LogLevel::Warn, "undefined ARM%d instruction %08X @ %08X\n", cpu->Num?7:9, cpu->CurInstr, cpu->R[15]-8); #ifdef GDBSTUB_ENABLED cpu->GdbStub.Enter(cpu->GdbStub.IsConnected(), Gdb::TgtStatus::FaultInsn, cpu->R[15]-8); diff --git a/src/ARMInterpreter_LoadStore.cpp b/src/ARMInterpreter_LoadStore.cpp index 6fb39f74..4bf4984e 100644 --- a/src/ARMInterpreter_LoadStore.cpp +++ b/src/ARMInterpreter_LoadStore.cpp @@ -118,40 +118,38 @@ void LoadSingle(ARM* cpu, const u8 rd, const u8 rn, const s32 offset, const u16 if constexpr (size == 8 && signextend) { - if (cpu->Num == 0) - { - cpu->ExtReg = rd; - if (cpu->MRTrack.Type != MainRAMType::Null) ((ARMv5*)cpu)->FuncQueue[cpu->FuncQueueFill++] = &ARMv5::SignExtend8; - else ((ARMv5*)cpu)->SignExtend8(); - } - else cpu->R[rd] = (s32)(s8)cpu->R[rd]; + cpu->ExtReg = rd; + if (cpu->Num == 0) ((ARMv5*)cpu)->QueueFunction(&ARMv5::SignExtend8); + else ((ARMv4*)cpu)->QueueFunction(&ARMv4::SignExtend8); } if constexpr (size == 16) { if (cpu->Num == 1) { - cpu->R[rd] = ROR(cpu->R[rd], ((addr&0x1)<<3)); // unaligned 16 bit loads are ROR'd on arm7 - if constexpr (signextend) cpu->R[rd] = (s32)((addr&0x1) ? (s8)cpu->R[rd] : (s16)cpu->R[rd]); // sign extend like a ldrsb if we ror'd the value. 
+ cpu->ExtReg = rd; + cpu->ExtROROffs = (addr & 0x1) * 8; + ((ARMv4*)cpu)->QueueFunction(&ARMv4::ROR32); // unaligned 16 bit loads are ROR'd on arm7 + + if constexpr (signextend) + { + if (addr&0x1) ((ARMv4*)cpu)->QueueFunction(&ARMv4::SignExtend8); // sign extend like an ldrsb if we ror'd the value. + else ((ARMv4*)cpu)->QueueFunction(&ARMv4::SignExtend16); + } } else if constexpr (signextend) { cpu->ExtReg = rd; - if (cpu->MRTrack.Type != MainRAMType::Null) ((ARMv5*)cpu)->FuncQueue[cpu->FuncQueueFill++] = &ARMv5::SignExtend16; - else ((ARMv5*)cpu)->SignExtend16(); + ((ARMv5*)cpu)->QueueFunction(&ARMv5::SignExtend16); } } if constexpr (size == 32) { - if (cpu->Num == 0) - { - cpu->ExtReg = rd; - cpu->ExtROROffs = (addr & 0x3) * 8; - if (cpu->MRTrack.Type != MainRAMType::Null) ((ARMv5*)cpu)->FuncQueue[cpu->FuncQueueFill++] = &ARMv5::ROR32; - else ((ARMv5*)cpu)->ROR32(); - } - else cpu->R[rd] = ROR(cpu->R[rd], ((addr&0x3)*8)); + cpu->ExtReg = rd; + cpu->ExtROROffs = (addr & 0x3) * 8; + if (cpu->Num == 0) ((ARMv5*)cpu)->QueueFunction(&ARMv5::ROR32); + else ((ARMv4*)cpu)->QueueFunction(&ARMv4::ROR32); } if constexpr (writeback >= Writeback::Post) addr += offset; @@ -172,8 +170,6 @@ void LoadSingle(ARM* cpu, const u8 rd, const u8 rn, const s32 offset, const u16 if (rd == 15) { - if (cpu->Num==1) cpu->R[15] &= ~0x1; - //if (cpu->Num==0) cpu->NDS.ARM9Timestamp = ((ARMv5*)cpu)->TimestampActual + ((size<32) || (addr&0x3)); // force an interlock cpu->JumpTo(cpu->R[15], false, 1); @@ -533,14 +529,10 @@ inline void SWP(ARM* cpu) if constexpr (!byte) { - if (cpu->Num == 0) - { - cpu->ExtReg = rd; - cpu->ExtROROffs = (base & 0x3) * 8; - if (cpu->MRTrack.Type != MainRAMType::Null) ((ARMv5*)cpu)->FuncQueue[cpu->FuncQueueFill++] = &ARMv5::ROR32; - else ((ARMv5*)cpu)->ROR32(); - } - else cpu->R[rd] = ROR(cpu->R[rd], ((base&0x3)*8)); + cpu->ExtReg = rd; + cpu->ExtROROffs = (base & 0x3) * 8; + if (cpu->Num == 0) ((ARMv5*)cpu)->QueueFunction(&ARMv5::ROR32); + else 
((ARMv4*)cpu)->QueueFunction(&ARMv4::ROR32); } cpu->AddCycles_CDI(); @@ -555,7 +547,6 @@ inline void SWP(ARM* cpu) } else if (cpu->Num==1) // for some reason these jumps don't seem to work on the arm 9? { - cpu->R[rd] = cpu->R[rd] & ~1; cpu->JumpTo(cpu->R[rd], false, 1); } return; @@ -608,7 +599,7 @@ void EmptyRListLDMSTM(ARM* cpu, const u8 baseid, const u8 flags) cpu->AddCycles_CDI(); - cpu->JumpTo(cpu->R[15] & ~1, flags & restoreorthumb, 1); // TODO: fix this not maintaining current mode properly + cpu->JumpTo(cpu->R[15], flags & restoreorthumb, 1); // TODO: fix this not maintaining current mode properly } else { @@ -699,8 +690,6 @@ void A_LDM(ARM* cpu) dabort |= !(first ? cpu->DataRead32 (base, 15) : cpu->DataRead32S(base, 15)); if (dabort) [[unlikely]] { cpu->R[15] = oldval; cpu->LDRFailedRegs |= (1<<15); } - else if (cpu->Num == 1) - cpu->R[15] &= ~0x1; if (!preinc) base += 4; } @@ -723,15 +712,11 @@ void A_LDM(ARM* cpu) { if ((cpu->CurInstr & (1<<22)) && !(cpu->CurInstr & (1<<15))) { - if (cpu->Num == 0) - { - cpu->QueueMode[0] = (cpu->CPSR&~0x1F)|0x10; - cpu->QueueMode[1] = cpu->CPSR; + cpu->QueueMode[0] = (cpu->CPSR&~0x1F)|0x10; + cpu->QueueMode[1] = cpu->CPSR; - if (cpu->MRTrack.Type != MainRAMType::Null) ((ARMv5*)cpu)->FuncQueue[cpu->FuncQueueFill++] = &ARMv5::QueueUpdateMode; - else ((ARMv5*)cpu)->QueueUpdateMode(); - } - else cpu->UpdateMode((cpu->CPSR&~0x1F)|0x10, cpu->CPSR, true); + if (cpu->Num == 0) ((ARMv5*)cpu)->QueueFunction(&ARMv5::QueueUpdateMode); + else ((ARMv4*)cpu)->QueueFunction(&ARMv4::QueueUpdateMode); } ((ARMv5*)cpu)->DataAbort(); @@ -761,15 +746,11 @@ void A_LDM(ARM* cpu) // switch back to previous regs if ((cpu->CurInstr & (1<<22)) && !(cpu->CurInstr & (1<<15))) { - if (cpu->Num == 0) - { - cpu->QueueMode[0] = (cpu->CPSR&~0x1F)|0x10; - cpu->QueueMode[1] = cpu->CPSR; + cpu->QueueMode[0] = (cpu->CPSR&~0x1F)|0x10; + cpu->QueueMode[1] = cpu->CPSR; - if (cpu->MRTrack.Type != MainRAMType::Null) ((ARMv5*)cpu)->FuncQueue[cpu->FuncQueueFill++] 
= &ARMv5::QueueUpdateMode; - else ((ARMv5*)cpu)->QueueUpdateMode(); - } - else cpu->UpdateMode((cpu->CPSR&~0x1F)|0x10, cpu->CPSR, true); + if (cpu->Num == 0) ((ARMv5*)cpu)->QueueFunction(&ARMv5::QueueUpdateMode); + else ((ARMv4*)cpu)->QueueFunction(&ARMv4::QueueUpdateMode); } // jump if pc got written @@ -1120,7 +1101,6 @@ void T_POP(ARM* cpu) if (!dabort) [[likely]] { - if (cpu->Num==1) cpu->R[15] |= 0x1; //if (cpu->Num==0) cpu->NDS.ARM9Timestamp = ((ARMv5*)cpu)->TimestampActual; // force an interlock cpu->JumpTo(cpu->R[15], false, 2); diff --git a/src/CP15.cpp b/src/CP15.cpp index cf5617c3..538c66a4 100644 --- a/src/CP15.cpp +++ b/src/CP15.cpp @@ -2163,16 +2163,14 @@ bool ARMv5::DataRead8(u32 addr, u8 reg) // Exception is handled in the actual instruction implementation if (!(PU_Map[addr>>12] & CP15_MAP_READABLE)) [[unlikely]] { - if (MRTrack.Type != MainRAMType::Null) FuncQueue[FuncQueueFill++] = &ARMv5::DAbortHandle; - else DAbortHandle(); + QueueFunction(&ARMv5::DAbortHandle); return false; } FetchAddr[reg] = addr; LDRRegs = 1<>12] & CP15_MAP_READABLE)) [[unlikely]] { - if (MRTrack.Type != MainRAMType::Null) FuncQueue[FuncQueueFill++] = &ARMv5::DAbortHandle; - else DAbortHandle(); + QueueFunction(&ARMv5::DAbortHandle); return false; } FetchAddr[reg] = addr; LDRRegs = 1<>12] & CP15_MAP_READABLE)) [[unlikely]] { - if (MRTrack.Type != MainRAMType::Null) FuncQueue[FuncQueueFill++] = &ARMv5::DAbortHandle; - else DAbortHandle(); + QueueFunction(&ARMv5::DAbortHandle); return false; } FetchAddr[reg] = addr; LDRRegs = 1<>12] & CP15_MAP_READABLE)) [[unlikely]] { - if (MRTrack.Type != MainRAMType::Null) FuncQueue[FuncQueueFill] = &ARMv5::DAbortHandleS; - else DAbortHandleS(); + QueueFunction(&ARMv5::DAbortHandleS); return false; } FetchAddr[reg] = addr; LDRRegs |= 1<>12] & CP15_MAP_WRITEABLE)) [[unlikely]] { - if (MRTrack.Type != MainRAMType::Null) FuncQueue[FuncQueueFill++] = &ARMv5::DAbortHandle; - else DAbortHandle(); + QueueFunction(&ARMv5::DAbortHandle); return 
false; } @@ -2594,8 +2585,7 @@ bool ARMv5::DataWrite8(u32 addr, u8 val, u8 reg) STRRegs = 1<>12] & CP15_MAP_WRITEABLE)) [[unlikely]] { - if (MRTrack.Type != MainRAMType::Null) FuncQueue[FuncQueueFill++] = &ARMv5::DAbortHandle; - else DAbortHandle(); + QueueFunction(&ARMv5::DAbortHandle); return false; } @@ -2700,8 +2689,7 @@ bool ARMv5::DataWrite16(u32 addr, u16 val, u8 reg) STRRegs = 1<>12] & CP15_MAP_WRITEABLE)) [[unlikely]] { - if (MRTrack.Type != MainRAMType::Null) FuncQueue[FuncQueueFill++] = &ARMv5::DAbortHandle; - else DAbortHandle(); + QueueFunction(&ARMv5::DAbortHandle); return false; } @@ -2808,8 +2795,7 @@ bool ARMv5::DataWrite32(u32 addr, u32 val, u8 reg) STRRegs = 1<>12] & CP15_MAP_WRITEABLE)) [[unlikely]] { - if (MRTrack.Type != MainRAMType::Null) FuncQueue[FuncQueueFill++] = &ARMv5::DAbortHandleS; - else DAbortHandleS(); + QueueFunction(&ARMv5::DAbortHandleS); return false; } @@ -2922,8 +2907,7 @@ bool ARMv5::DataWrite32S(u32 addr, u32 val, u8 reg) STRRegs |= 1< 0) // check if we started the queue up + { + ARM9.FuncQueueEnd = ARM9.FuncQueueFill; + ARM9.FuncQueueFill = 0; + ARM9.FuncQueueActive = true; + } + if (ARM7.FuncQueueFill > 0) // check if we started the queue up + { + ARM7.FuncQueueEnd = ARM7.FuncQueueFill; + ARM7.FuncQueueFill = 0; + ARM7.FuncQueueActive = true; + } PostFlag9 = 0x01; PostFlag7 = 0x01; @@ -902,19 +914,17 @@ void NDS::MainRAMHandleARM9() { switch (ARM9.MRTrack.Type) { - case MainRAMType::Null: - Platform::Log(Platform::LogLevel::Error, "NULL MAIN RAM TYPE ARM9"); + default: + { + Platform::Log(Platform::LogLevel::Error, "INVALID MAIN RAM TYPE ARM9"); break; + } + case MainRAMType::ICacheStream: { - if (A9ContentionTS < MainRAMTimestamp) { A9ContentionTS = MainRAMTimestamp; return; } - - //printf("ICACHEHANDLER\n"); - u8* prog = &ARM9.MRTrack.Progress; u32 addr = (ARM9.FetchAddr[16] & ~0x1F) | (*prog * 4); u32* icache = (u32*)&ARM9.ICache[ARM9.MRTrack.Var << 5]; - icache[*prog] = ARM9Read32(addr); if ((*prog > 0) && A9WENTLAST) 
{ @@ -923,11 +933,15 @@ void NDS::MainRAMHandleARM9() } else { + if (A9ContentionTS < MainRAMTimestamp) { A9ContentionTS = MainRAMTimestamp; return; } + MainRAMTimestamp = A9ContentionTS + 9; A9ContentionTS += (ARM9ClockShift == 1) ? 9 : 8; MainRAMLastAccess = A9LAST; } + icache[*prog] = ARM9Read32(addr); + if (*prog == ARM9.ICacheStreamPtr) ARM9Timestamp = (A9ContentionTS << ARM9ClockShift) - 1; else if (*prog > ARM9.ICacheStreamPtr) ARM9.ICacheStreamTimes[*prog-1] = (A9ContentionTS << ARM9ClockShift) - 1; @@ -943,6 +957,42 @@ void NDS::MainRAMHandleARM9() } } +void NDS::MainRAMHandleARM7() +{ + switch (ARM7.MRTrack.Type) + { + default: + { + Platform::Log(Platform::LogLevel::Error, "INVALID MAIN RAM TYPE ARM7"); + break; + } + + case MainRAMType::Fetch: + { + u32 addr = ARM7.FetchAddr[16]; + u8 var = ARM7.MRTrack.Var; + + if ((var & MRSequential) && A7WENTLAST) + { + int cycles = (var & MR32) ? 2 : 1; + MainRAMTimestamp = ARM7Timestamp += cycles; + } + else + { + if (ARM7Timestamp < MainRAMTimestamp) { ARM7Timestamp = MainRAMTimestamp; return; } + + MainRAMTimestamp = ARM7Timestamp + (var & MR16) ? 8 : 9; + ARM7Timestamp += (var & MR16) ? 5 : 6; + } + + if (var & MRCodeFetch) ARM7.RetVal = (var & MR32) ? 
ARM7Read32(addr) : ARM7Read16(addr); + + memset(&ARM7.MRTrack, 0, sizeof(ARM7.MRTrack)); + break; + } + } +} + void NDS::MainRAMHandle() { if (!A9ContentionTS) @@ -963,7 +1013,8 @@ void NDS::MainRAMHandle() } else { - if (true) return; + if (ARM7.MRTrack.Type == MainRAMType::Null) return; + MainRAMHandleARM7(); } } } @@ -978,7 +1029,8 @@ void NDS::MainRAMHandle() } else { - if (true) return; + if (ARM7.MRTrack.Type == MainRAMType::Null) return; + MainRAMHandleARM7(); } } } @@ -1080,11 +1132,11 @@ u32 NDS::RunFrame() } else if (ARM9.MRTrack.Type == MainRAMType::Null) { - if (ARM9.abt) ARM9Timestamp = ARM9Target; + //if (ARM9.abt) ARM9Timestamp = ARM9Target; ARM9.Execute(); } - //printf("MAIN LOOP: %lli %lli\n", ARM9Timestamp>>ARM9ClockShift, ARM7Timestamp); + //printf("MAIN LOOP: 9 %lli %08X %08llX 7 %lli %08X %08llX %i %08X\n", ARM9Timestamp>>ARM9ClockShift, ARM9.PC, ARM9.CurInstr, ARM7Timestamp, ARM7.R[15], ARM7.CurInstr, IME[1], IE[1]); MainRAMHandle(); @@ -1094,7 +1146,7 @@ u32 NDS::RunFrame() target = ARM9Timestamp >> ARM9ClockShift; CurCPU = 1; - while ((ARM7Timestamp < target) || (ARM9.MRTrack.Type != MainRAMType::Null)) + while (((ARM7Timestamp < target) && (ARM7.MRTrack.Type == MainRAMType::Null)) || (ARM9.MRTrack.Type != MainRAMType::Null)) { ARM7Target = (ARM9.MRTrack.Type != MainRAMType::Null) ? 
(ARM7Timestamp+1) : target; // might be changed by a reschedule @@ -1112,8 +1164,9 @@ u32 NDS::RunFrame() dsi.RunNDMAs(1); } } - else + else if (ARM7.MRTrack.Type == MainRAMType::Null) { + //if (ARM7.abt > 16) ARM7Timestamp = ARM7Target; ARM7.Execute(); } diff --git a/src/NDS.h b/src/NDS.h index 38d14e88..6263247e 100644 --- a/src/NDS.h +++ b/src/NDS.h @@ -402,6 +402,7 @@ public: // TODO: Encapsulate the rest of these members std::unique_ptr EjectGBACart() { return GBACartSlot.EjectCart(); } void MainRAMHandleARM9(); + void MainRAMHandleARM7(); void MainRAMHandle(); u32 RunFrame(); From a049c43e27ff2469d818e7cfca1e53117c1e388c Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Fri, 6 Dec 2024 17:45:54 -0500 Subject: [PATCH 230/306] finish arm7 contention --- src/ARM.cpp | 162 ++++++++++++++++++++++------------------------------ src/ARM.h | 1 + src/NDS.cpp | 33 +++++++++-- 3 files changed, 96 insertions(+), 100 deletions(-) diff --git a/src/ARM.cpp b/src/ARM.cpp index c4655969..31cc5028 100644 --- a/src/ARM.cpp +++ b/src/ARM.cpp @@ -1499,23 +1499,21 @@ void ARMv4::DRead8_2() { u8 reg = __builtin_ctz(LDRRegs); u32 addr = FetchAddr[reg]; - u32 dummy; - u32* val = (LDRFailedRegs & (1<> 24) == 0x02) { - if (NDS.ARM7Timestamp < MainRAMTimestamp) NDS.ARM7Timestamp = MainRAMTimestamp; + MRTrack.Type = MainRAMType::Fetch; + MRTrack.Var = MR8; + MRTrack.Progress = reg; } - - NDS.ARM7Timestamp += NDS.ARM7MemTimings[addr >> 15][0]; - - if ((addr >> 24) == 0x02) + else { - MainRAMTimestamp = NDS.ARM7Timestamp; - NDS.ARM7Timestamp -= 3; + u32 dummy; u32* val = (LDRFailedRegs & (1<> 15][0]; + *val = BusRead8(addr); } - *val = BusRead8(addr); } bool ARMv4::DataRead16(u32 addr, u8 reg) @@ -1531,25 +1529,21 @@ void ARMv4::DRead16_2() { u8 reg = __builtin_ctz(LDRRegs); u32 addr = FetchAddr[reg]; - u32 dummy; - u32* val = (LDRFailedRegs & (1<> 24) == 0x02) { - if (NDS.ARM7Timestamp < MainRAMTimestamp) NDS.ARM7Timestamp = MainRAMTimestamp; + 
MRTrack.Type = MainRAMType::Fetch; + MRTrack.Var = MR16; + MRTrack.Progress = reg; } - - NDS.ARM7Timestamp += NDS.ARM7MemTimings[addr >> 15][0]; - - if ((addr >> 24) == 0x02) + else { - MainRAMTimestamp = NDS.ARM7Timestamp; - NDS.ARM7Timestamp -= 3; - } + u32 dummy; + u32* val = (LDRFailedRegs & (1<> 15][0]; + *val = BusRead16(addr); + } } bool ARMv4::DataRead32(u32 addr, u8 reg) @@ -1565,25 +1559,21 @@ void ARMv4::DRead32_2() { u8 reg = __builtin_ctz(LDRRegs); u32 addr = FetchAddr[reg]; - u32 dummy; - u32* val = (LDRFailedRegs & (1<> 24) == 0x02) { - if (NDS.ARM7Timestamp < MainRAMTimestamp) NDS.ARM7Timestamp = MainRAMTimestamp; + MRTrack.Type = MainRAMType::Fetch; + MRTrack.Var = MR32; + MRTrack.Progress = reg; } - - NDS.ARM7Timestamp += NDS.ARM7MemTimings[addr >> 15][2]; - - if ((addr >> 24) == 0x02) + else { - MainRAMTimestamp = NDS.ARM7Timestamp; - NDS.ARM7Timestamp -= 3; + u32 dummy; + u32* val = (LDRFailedRegs & (1<> 15][2]; + *val = BusRead32(addr); } - - *val = BusRead32(addr); LDRRegs &= ~1<> 24) == 0x02) { - if (NDS.ARM7Timestamp < MainRAMTimestamp) NDS.ARM7Timestamp = MainRAMTimestamp; + MRTrack.Type = MainRAMType::Fetch; + MRTrack.Var = MR32 | MRSequential; + MRTrack.Progress = reg; } - - NDS.ARM7Timestamp += NDS.ARM7MemTimings[addr >> 15][3]; - - if ((addr >> 24) == 0x02) + else { - MainRAMTimestamp = NDS.ARM7Timestamp; - NDS.ARM7Timestamp -= 3; - } + u32 dummy; + u32* val = (LDRFailedRegs & (1<> 15][3]; + *val = BusRead32(addr); + } LDRRegs &= ~1<> 24) == 0x02) { - if (NDS.ARM7Timestamp < MainRAMTimestamp) NDS.ARM7Timestamp = MainRAMTimestamp; + MRTrack.Type = MainRAMType::Fetch; + MRTrack.Var = MRWrite | MR8; + MRTrack.Progress = reg; } - - NDS.ARM7Timestamp += NDS.ARM7MemTimings[addr >> 15][0]; - - if ((addr >> 24) == 0x02) + else { - MainRAMTimestamp = NDS.ARM7Timestamp; - NDS.ARM7Timestamp -= 5; + u8 val = STRVal[reg]; + + NDS.ARM7Timestamp += NDS.ARM7MemTimings[addr >> 15][0]; + BusWrite8(addr, val); } - - BusWrite8(addr, val); } bool 
ARMv4::DataWrite16(u32 addr, u16 val, u8 reg) @@ -1666,24 +1650,20 @@ void ARMv4::DWrite16_2() { u8 reg = __builtin_ctz(STRRegs); u32 addr = FetchAddr[reg]; - u16 val = STRVal[reg]; - addr &= ~1; - if ((addr >> 24) == 0x02) { - if (NDS.ARM7Timestamp < MainRAMTimestamp) NDS.ARM7Timestamp = MainRAMTimestamp; + MRTrack.Type = MainRAMType::Fetch; + MRTrack.Var = MRWrite | MR16; + MRTrack.Progress = reg; } - - NDS.ARM7Timestamp += NDS.ARM7MemTimings[addr >> 15][0]; - - if ((addr >> 24) == 0x02) + else { - MainRAMTimestamp = NDS.ARM7Timestamp; - NDS.ARM7Timestamp -= 5; - } + u16 val = STRVal[reg]; - BusWrite16(addr, val); + NDS.ARM7Timestamp += NDS.ARM7MemTimings[addr >> 15][0]; + BusWrite16(addr, val); + } } bool ARMv4::DataWrite32(u32 addr, u32 val, u8 reg) @@ -1699,24 +1679,20 @@ void ARMv4::DWrite32_2() { u8 reg = __builtin_ctz(STRRegs); u32 addr = FetchAddr[reg]; - u32 val = STRVal[reg]; - - addr &= ~3; if ((addr >> 24) == 0x02) { - if (NDS.ARM7Timestamp < MainRAMTimestamp) NDS.ARM7Timestamp = MainRAMTimestamp; + MRTrack.Type = MainRAMType::Fetch; + MRTrack.Var = MRWrite | MR32; + MRTrack.Progress = reg; } - - NDS.ARM7Timestamp += NDS.ARM7MemTimings[addr >> 15][2]; - - if ((addr >> 24) == 0x02) + else { - MainRAMTimestamp = NDS.ARM7Timestamp; - NDS.ARM7Timestamp -= 5; - } + u32 val = STRVal[reg]; - BusWrite32(addr, val); + NDS.ARM7Timestamp += NDS.ARM7MemTimings[addr >> 15][2]; + BusWrite32(addr, val); + } STRRegs &= ~1<> 24) == 0x02) { - if (NDS.ARM7Timestamp < MainRAMTimestamp) NDS.ARM7Timestamp = MainRAMTimestamp; + MRTrack.Type = MainRAMType::Fetch; + MRTrack.Var = MRWrite | MR32 | MRSequential; + MRTrack.Progress = reg; } - - NDS.ARM7Timestamp += NDS.ARM7MemTimings[addr >> 15][3]; - - if ((addr >> 24) == 0x02) + else { - MainRAMTimestamp = NDS.ARM7Timestamp; - NDS.ARM7Timestamp -= 5; - } + u32 val = STRVal[reg]; - BusWrite32(addr, val); + NDS.ARM7Timestamp += NDS.ARM7MemTimings[addr >> 15][3]; + BusWrite32(addr, val); + } STRRegs &= ~1< Date: Fri, 6 Dec 2024 
18:14:25 -0500 Subject: [PATCH 231/306] improve accuracy of contention resolution --- src/NDS.cpp | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/src/NDS.cpp b/src/NDS.cpp index ea9eb76a..42eac35e 100644 --- a/src/NDS.cpp +++ b/src/NDS.cpp @@ -909,6 +909,8 @@ void NDS::RunSystemSleep(u64 timestamp) #define A7WENTLAST ( MainRAMLastAccess) #define A9LAST false #define A7LAST true +#define A9PRIORITY !(ExMemCnt[0] & 0x8000) +#define A7PRIORITY (ExMemCnt[0] & 0x8000) void NDS::MainRAMHandleARM9() { @@ -933,7 +935,7 @@ void NDS::MainRAMHandleARM9() } else { - if (A9ContentionTS < MainRAMTimestamp) { A9ContentionTS = MainRAMTimestamp; return; } + if (A9ContentionTS < MainRAMTimestamp) { A9ContentionTS = MainRAMTimestamp; if (A7PRIORITY) return; } MainRAMTimestamp = A9ContentionTS + 9; A9ContentionTS += (ARM9ClockShift == 1) ? 9 : 8; @@ -978,11 +980,12 @@ void NDS::MainRAMHandleARM7() } else { - if (ARM7Timestamp < MainRAMTimestamp) { ARM7Timestamp = MainRAMTimestamp; return; } + if (ARM7Timestamp < MainRAMTimestamp) { ARM7Timestamp = MainRAMTimestamp; if (A9PRIORITY) return; } MainRAMTimestamp = ARM7Timestamp + (var & MR16) ? 8 : 9; // checkme: are these correct for 8bit? if (var & MRWrite) ARM7Timestamp += (var & MR16) ? 3 : 4; else ARM7Timestamp += (var & MR16) ? 
5 : 6; + MainRAMLastAccess = A7LAST; } if (var & MRCodeFetch) @@ -1024,8 +1027,7 @@ void NDS::MainRAMHandle() if ((ARM9.MRTrack.Type != MainRAMType::Null) && (A9ContentionTS < MainRAMTimestamp)) A9ContentionTS = MainRAMTimestamp; } - bool A7Priority = ExMemCnt[0] & 0x8000; - if (A7Priority) + if (A7PRIORITY) { while (true) { @@ -1063,6 +1065,8 @@ void NDS::MainRAMHandle() #undef A7WENTLAST #undef A9LAST #undef A7LAST +#undef A9PRIORITY +#undef A7PRIORITY template u32 NDS::RunFrame() From db7eb564f088cfab2565bbe336ae948b5e6f47ee Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Fri, 6 Dec 2024 19:22:59 -0500 Subject: [PATCH 232/306] handle uncached/buffered accesses for arm9 --- src/CP15.cpp | 182 +++++++++++++++++++++++++++------------------------ src/NDS.cpp | 61 +++++++++++++++++ 2 files changed, 157 insertions(+), 86 deletions(-) diff --git a/src/CP15.cpp b/src/CP15.cpp index 538c66a4..0f8d180f 100644 --- a/src/CP15.cpp +++ b/src/CP15.cpp @@ -2102,13 +2102,9 @@ void ARMv5::CodeRead32(u32 addr) if ((addr >> 24) == 0x02) { - if (NDS.ARM9Timestamp < MainRAMTimestamp) NDS.ARM9Timestamp = MainRAMTimestamp + ((1<> 24) == 0x02) { - if (NDS.ARM9Timestamp < MainRAMTimestamp) NDS.ARM9Timestamp = (MainRAMTimestamp + ((1<>14]; if ((NDS.ARM9Timestamp <= WBReleaseTS) && (DataRegion == WBLastRegion)) // check write buffer NDS.ARM9Timestamp += 1<> 24) == 0x02) { - if (NDS.ARM9Timestamp < MainRAMTimestamp) NDS.ARM9Timestamp = (MainRAMTimestamp + ((1<>14]; if ((NDS.ARM9Timestamp <= WBReleaseTS) && (DataRegion == WBLastRegion)) // check write buffer NDS.ARM9Timestamp += 1<> 24) == 0x02) { - if (NDS.ARM9Timestamp < MainRAMTimestamp) NDS.ARM9Timestamp = (MainRAMTimestamp + ((1<>14]; if ((NDS.ARM9Timestamp <= WBReleaseTS) && (DataRegion == WBLastRegion)) // check write buffer NDS.ARM9Timestamp += 1<> 24) == 0x02) { - MainRAMTimestamp = NDS.ARM9Timestamp + DataCycles; - if (NDS.ARM9ClockShift == 2) MainRAMTimestamp += 4; - DataRegion = 
Mem9_MainRAM; + MRTrack.Type = MainRAMType::Fetch; + MRTrack.Var = MR32 | MRSequential; + MRTrack.Progress = reg; } else { DataRegion = NDS.ARM9Regions[addr>>14]; if ((NDS.ARM9Timestamp <= WBReleaseTS) && (DataRegion == WBLastRegion)) // check write buffer NDS.ARM9Timestamp += 1<> 24) == 0x02) { - if (NDS.ARM9Timestamp < MainRAMTimestamp) NDS.ARM9Timestamp = (MainRAMTimestamp + ((1<>14]; if ((NDS.ARM9Timestamp <= WBReleaseTS) && (DataRegion == WBLastRegion)) // check write buffer NDS.ARM9Timestamp += 1<> 24) == 0x02) { - if (NDS.ARM9Timestamp < MainRAMTimestamp) NDS.ARM9Timestamp = (MainRAMTimestamp + ((1<>14]; + else + { + DataRegion = NDS.ARM9Regions[addr>>14]; - if (WBTimestamp < ((NDS.ARM9Timestamp + DataCycles + ((1<> 24) == 0x02) { - if (NDS.ARM9Timestamp < MainRAMTimestamp) NDS.ARM9Timestamp = (MainRAMTimestamp + ((1<>14]; + else + { + DataRegion = NDS.ARM9Regions[addr>>14]; - if (WBTimestamp < ((NDS.ARM9Timestamp + DataCycles + ((1<> 24) == 0x02) { - if (NDS.ARM9Timestamp < MainRAMTimestamp) NDS.ARM9Timestamp = (MainRAMTimestamp + ((1<>14]; + else + { + DataRegion = NDS.ARM9Regions[addr>>14]; - if (WBTimestamp < ((NDS.ARM9Timestamp + DataCycles + ((1<> 24) == 0x02) { - MainRAMTimestamp = NDS.ARM9Timestamp + DataCycles; - MainRAMTimestamp += 2<>14]; + + if (WBTimestamp < ((NDS.ARM9Timestamp + DataCycles + ((1<>14]; // burst stores seem to process the extra delay cycles at the end of the burst // this means that we end up *always* able to begin code fetches 3 cycles early when accessing the bus @@ -2997,17 +3005,19 @@ void ARMv5::DWrite32S_2() if ((addr >> 24) == 0x02) { - if (NDS.ARM9Timestamp < MainRAMTimestamp) NDS.ARM9Timestamp = (MainRAMTimestamp + ((1<>14]; - } + else + { + DataRegion = NDS.ARM9Regions[addr>>14]; - if (WBTimestamp < ((NDS.ARM9Timestamp + DataCycles + ((1< Date: Fri, 6 Dec 2024 21:55:21 -0500 Subject: [PATCH 233/306] do dcache; tweak some contention handling logic --- src/ARM.h | 8 ++- src/CP15.cpp | 176 
+++++++++++++++++++++++++++------------------------ src/NDS.cpp | 56 +++++++++++++--- src/NDS.h | 2 +- 4 files changed, 148 insertions(+), 94 deletions(-) diff --git a/src/ARM.h b/src/ARM.h index e9d99184..41b99882 100644 --- a/src/ARM.h +++ b/src/ARM.h @@ -57,8 +57,9 @@ enum class CPUExecuteMode : u32 enum class MainRAMType : u8 { Null = 0, + Fetch, ICacheStream, - Fetch + DCacheStream, }; // each one represents a bit in the field @@ -501,7 +502,7 @@ public: * cache. The address is internally aligned to an word boundary * @return Value of the word at addr */ - u32 DCacheLookup(const u32 addr); + bool DCacheLookup(const u32 addr); /** * @brief Updates a word in the data cache if present @@ -684,8 +685,11 @@ public: void JumpTo_4(); void DAbortHandle(); void DAbortHandleS(); + void DCacheFin8(); void DRead8_2(); + void DCacheFin16(); void DRead16_2(); + void DCacheFin32(); void DRead32_2(); void DRead32S_2(); void DWrite8_2(); diff --git a/src/CP15.cpp b/src/CP15.cpp index 0f8d180f..2496fa68 100644 --- a/src/CP15.cpp +++ b/src/CP15.cpp @@ -586,7 +586,7 @@ bool ARMv5::IsAddressICachable(const u32 addr) const return PU_Map[addr >> CP15_MAP_ENTRYSIZE_LOG2] & CP15_MAP_ICACHEABLE; } -u32 ARMv5::DCacheLookup(const u32 addr) +bool ARMv5::DCacheLookup(const u32 addr) { //Log(LogLevel::Debug,"DCache load @ %08x\n", addr); const u32 tag = (addr & ~(DCACHE_LINELENGTH - 1)); @@ -655,7 +655,8 @@ u32 ARMv5::DCacheLookup(const u32 addr) } DataRegion = Mem9_DCache; //Log(LogLevel::Debug, "DCache hit at %08lx returned %08x from set %i, line %i\n", addr, cacheLine[(addr & (DCACHE_LINELENGTH -1)) >> 2], set, id>>2); - return cacheLine[(addr & (DCACHE_LINELENGTH -1)) >> 2]; + RetVal = cacheLine[(addr & (DCACHE_LINELENGTH -1)) >> 2]; + return true; } } @@ -672,24 +673,7 @@ u32 ARMv5::DCacheLookup(const u32 addr) // We do not fill the cacheline if it is disabled in the // BIST test State register (See arm946e-s Rev 1 technical manual, 2.3.15 "Register 15, test State Register") if 
(CP15BISTTestStateRegister & CP15_BIST_TR_DISABLE_DCACHE_LINEFILL) [[unlikely]] - { - WriteBufferDrain(); - - NDS.ARM9Timestamp = (NDS.ARM9Timestamp + ((1<> 14][1]; // CHECKME: can this do sequential accesses? - - if ((addr >> 24) == 0x02) - { - if (NDS.ARM9Timestamp < MainRAMTimestamp) NDS.ARM9Timestamp = (MainRAMTimestamp + ((1<>14]; - - return BusRead32(addr & ~3); - } + return false; u32 line; @@ -728,73 +712,67 @@ u32 ARMv5::DCacheLookup(const u32 addr) DCacheClearByASetAndWay(line & (DCACHE_SETS-1), line >> DCACHE_SETS_LOG2); #endif - for (int i = 0; i < DCACHE_LINELENGTH; i+=sizeof(u32)) - { - ptr[i >> 2] = BusRead32(tag+i); - } - DCacheTags[line] = tag | (line & (DCACHE_SETS-1)) | CACHE_FLAG_VALID; // timing logic - - // Disabled DCACHE Streaming: - // Wait until the entire cache line is filled before continuing with execution - if (CP15BISTTestStateRegister & CP15_BIST_TR_DISABLE_DCACHE_STREAMING) [[unlikely]] + + if (NDS.ARM9Regions[addr>>14] == Mem9_MainRAM) { - NDS.ARM9Timestamp = NDS.ARM9Timestamp + ((1<> 14][1] + (MemTimings[tag >> 14][2] * ((DCACHE_LINELENGTH / 4) - 2)); - DataCycles = MemTimings[tag>>14][2]; - - if ((addr >> 24) == 0x02) + MRTrack.Type = MainRAMType::DCacheStream; + MRTrack.Var = line; + FetchAddr[16] = addr; + if (CP15BISTTestStateRegister & CP15_BIST_TR_DISABLE_DCACHE_STREAMING) [[unlikely]] + DCacheStreamPtr = 7; + else DCacheStreamPtr = (addr & 0x1F) / 4; + } + else + { + for (int i = 0; i < DCACHE_LINELENGTH; i+=sizeof(u32)) { - if (NDS.ARM9Timestamp < MainRAMTimestamp) NDS.ARM9Timestamp = MainRAMTimestamp; - MainRAMTimestamp = NDS.ARM9Timestamp + DataCycles; - DataRegion = Mem9_MainRAM; + ptr[i >> 2] = BusRead32(tag+i); } - else + // Disabled DCACHE Streaming: + // Wait until the entire cache line is filled before continuing with execution + if (CP15BISTTestStateRegister & CP15_BIST_TR_DISABLE_DCACHE_STREAMING) [[unlikely]] { + NDS.ARM9Timestamp = NDS.ARM9Timestamp + ((1<> 14][1] + (MemTimings[tag >> 14][2] * 
((DCACHE_LINELENGTH / 4) - 2)); + DataCycles = MemTimings[tag>>14][2]; + DataRegion = NDS.ARM9Regions[addr>>14]; if (((NDS.ARM9Timestamp <= WBReleaseTS) && (NDS.ARM9Regions[addr>>14] == WBLastRegion)) // check write buffer - || (Store && (NDS.ARM9Regions[addr>>14] == DataRegion))) //check the actual store + || (Store && (NDS.ARM9Regions[addr>>14] == DataRegion))) //check the actual store NDS.ARM9Timestamp += 1<>14]; - if ((addr >> 24) == 0x02) - { - if (NDS.ARM9Timestamp < MainRAMTimestamp) NDS.ARM9Timestamp = MainRAMTimestamp; - } - else + else // DCache Streaming logic { + DataRegion = NDS.ARM9Regions[addr>>14]; if ((NDS.ARM9Timestamp <= WBReleaseTS) && (DataRegion == WBLastRegion)) // check write buffer NDS.ARM9Timestamp += 1<>14][1]; - u8 seq = MemTimings[addr>>14][2]; + u8 ns = MemTimings[addr>>14][1]; + u8 seq = MemTimings[addr>>14][2]; - u8 linepos = (addr & 0x1F) >> 2; // technically this is one too low, but we want that actually + u8 linepos = (addr & 0x1F) >> 2; // technically this is one too low, but we want that actually - u64 cycles = ns + (seq * linepos); - DataCycles = cycles; + u64 cycles = ns + (seq * linepos); + DataCycles = cycles; - cycles += NDS.ARM9Timestamp; + cycles += NDS.ARM9Timestamp; - DCacheStreamPtr = linepos; - for (int i = linepos; i < 7; i++) - { - cycles += seq; - DCacheStreamTimes[i] = cycles; + DCacheStreamPtr = linepos; + for (int i = linepos; i < 7; i++) + { + cycles += seq; + DCacheStreamTimes[i] = cycles; + } } - - if ((addr >> 24) == 0x02) MainRAMTimestamp = ((linepos < 7) ? 
ICacheStreamTimes[6] : NDS.ARM9Timestamp); + RetVal = ptr[(addr & (DCACHE_LINELENGTH-1)) >> 2]; } - return ptr[(addr & (DCACHE_LINELENGTH-1)) >> 2]; + return true; } bool ARMv5::DCacheWrite32(const u32 addr, const u32 val) @@ -2152,6 +2130,15 @@ void ARMv5::DAbortHandleS() DataCycles = 1; } +void ARMv5::DCacheFin8() +{ + u8 reg = __builtin_ctz(LDRRegs); + u32 addr = FetchAddr[reg]; + u32 dummy; u32* val = (LDRFailedRegs & (1<> (8 * (addr & 3))) & 0xff; +} + bool ARMv5::DataRead8(u32 addr, u8 reg) { // Data Aborts @@ -2173,8 +2160,7 @@ void ARMv5::DRead8_2() { u8 reg = __builtin_ctz(LDRRegs); u32 addr = FetchAddr[reg]; - u32 dummy; - u32* val = (LDRFailedRegs & (1<> (8 * (addr & 3))) & 0xff; - return; + if (DCacheLookup(addr)) + { + QueueFunction(&ARMv5::DCacheFin8); + return; + } } } #endif @@ -2247,6 +2236,15 @@ void ARMv5::DRead8_2() } } +void ARMv5::DCacheFin16() +{ + u8 reg = __builtin_ctz(LDRRegs); + u32 addr = FetchAddr[reg]; + u32 dummy; u32* val = (LDRFailedRegs & (1<> (8 * (addr & 2))) & 0xffff; +} + bool ARMv5::DataRead16(u32 addr, u8 reg) { // Data Aborts @@ -2268,8 +2266,7 @@ void ARMv5::DRead16_2() { u8 reg = __builtin_ctz(LDRRegs); u32 addr = FetchAddr[reg]; - u32 dummy; - u32* val = (LDRFailedRegs & (1<> (8* (addr & 2))) & 0xffff; - return; + if (DCacheLookup(addr)) + { + QueueFunction(&ARMv5::DCacheFin16); + return; + } } } #endif @@ -2344,6 +2344,14 @@ void ARMv5::DRead16_2() } } +void ARMv5::DCacheFin32() +{ + u8 reg = __builtin_ctz(LDRRegs); + u32 dummy; u32* val = (LDRFailedRegs & (1< 0) && A9WENTLAST) + { + MainRAMTimestamp += 2; + A9ContentionTS += 2; + } + else + { + if (A9ContentionTS < MainRAMTimestamp) { A9ContentionTS = MainRAMTimestamp; if (A7PRIORITY) return; } + + MainRAMTimestamp = A9ContentionTS + 9; + A9ContentionTS += (ARM9ClockShift == 1) ? 
9 : 8; + MainRAMLastAccess = A9LAST; + } + + dcache[*prog] = ARM9Read32(addr); + + if (*prog == ARM9.DCacheStreamPtr) ARM9Timestamp = (A9ContentionTS << ARM9ClockShift) - 1; + else if (*prog > ARM9.DCacheStreamPtr) ARM9.DCacheStreamTimes[*prog-1] = (A9ContentionTS << ARM9ClockShift) - 1; + + (*prog)++; + if (*prog >= 8) + { + ARM9.RetVal = dcache[(ARM9.FetchAddr[16] & 0x1F) / 4]; + memset(&ARM9.MRTrack, 0, sizeof(ARM9.MRTrack)); + ConTSLock = false; } break; } @@ -1082,10 +1118,14 @@ void NDS::MainRAMHandleARM7() void NDS::MainRAMHandle() { - if (!A9ContentionTS) + if (!ConTSLock) { A9ContentionTS = (ARM9Timestamp + ((1<> ARM9ClockShift; - if ((ARM9.MRTrack.Type != MainRAMType::Null) && (A9ContentionTS < MainRAMTimestamp)) A9ContentionTS = MainRAMTimestamp; + if (ARM9.MRTrack.Type != MainRAMType::Null) + { + ConTSLock = true; + if (A9ContentionTS < MainRAMTimestamp) A9ContentionTS = MainRAMTimestamp; + } } if (A7PRIORITY) @@ -1094,7 +1134,7 @@ void NDS::MainRAMHandle() { if (A9ContentionTS < ARM7Timestamp) { - if (ARM9.MRTrack.Type == MainRAMType::Null) { A9ContentionTS = 0; return; } + if (ARM9.MRTrack.Type == MainRAMType::Null) return; MainRAMHandleARM9(); } else @@ -1110,7 +1150,7 @@ void NDS::MainRAMHandle() { if (A9ContentionTS <= ARM7Timestamp) { - if (ARM9.MRTrack.Type == MainRAMType::Null) { A9ContentionTS = 0; return; } + if (ARM9.MRTrack.Type == MainRAMType::Null) return; MainRAMHandleARM9(); } else @@ -1220,7 +1260,7 @@ u32 NDS::RunFrame() } else if (ARM9.MRTrack.Type == MainRAMType::Null) { - //if (ARM9.abt) ARM9Timestamp = ARM9Target; + if (ARM9.abt) ARM9Timestamp = ARM9Target; ARM9.Execute(); } diff --git a/src/NDS.h b/src/NDS.h index 6263247e..d0377fb1 100644 --- a/src/NDS.h +++ b/src/NDS.h @@ -258,7 +258,7 @@ public: // TODO: Encapsulate the rest of these members u64 ARM9Timestamp, ARM9Target; u64 ARM7Timestamp, ARM7Target; u64 MainRAMTimestamp; - u64 A9ContentionTS; + u64 A9ContentionTS; bool ConTSLock; u32 ARM9ClockShift; u32 IME[2]; From 
3d6ebc1d2b6be2a69e678c7d969a1f5f60025d4a Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Sat, 7 Dec 2024 00:43:36 -0500 Subject: [PATCH 234/306] rework tracking of overlap --- src/ARM.cpp | 7 +- src/ARM.h | 1 - src/ARMInterpreter.cpp | 5 +- src/ARMInterpreter_ALU.cpp | 6 +- src/CP15.cpp | 179 ++++++++++++++++--------------------- src/DSi.cpp | 23 ++++- src/NDS.cpp | 3 +- 7 files changed, 108 insertions(+), 116 deletions(-) diff --git a/src/ARM.cpp b/src/ARM.cpp index 31cc5028..3a0ffcf1 100644 --- a/src/ARM.cpp +++ b/src/ARM.cpp @@ -1399,12 +1399,9 @@ void ARMv5::AddCycles_MW(s32 numM) void ARMv5::AddCycles_MW_2() { - s32 numM = DataCycles; - TimestampActual = numM + NDS.ARM9Timestamp; + TimestampActual = NDS.ARM9Timestamp; - numM -= 3< 0) NDS.ARM9Timestamp += numM; + NDS.ARM9Timestamp -= DataCycles; } template diff --git a/src/ARM.h b/src/ARM.h index 41b99882..6add4d29 100644 --- a/src/ARM.h +++ b/src/ARM.h @@ -684,7 +684,6 @@ public: void JumpTo_3C(); void JumpTo_4(); void DAbortHandle(); - void DAbortHandleS(); void DCacheFin8(); void DRead8_2(); void DCacheFin16(); diff --git a/src/ARMInterpreter.cpp b/src/ARMInterpreter.cpp index 671b3d8b..cfe5ef92 100644 --- a/src/ARMInterpreter.cpp +++ b/src/ARMInterpreter.cpp @@ -37,7 +37,6 @@ namespace melonDS::ARMInterpreter void A_UNK(ARM* cpu) { cpu->AddCycles_C(); - cpu->abt=1; Log(LogLevel::Warn, "undefined ARM%d instruction %08X @ %08X\n", cpu->Num?7:9, cpu->CurInstr, cpu->R[15]-8); #ifdef GDBSTUB_ENABLED cpu->GdbStub.Enter(cpu->GdbStub.IsConnected(), Gdb::TgtStatus::FaultInsn, cpu->R[15]-8); @@ -232,7 +231,7 @@ void A_MRS(ARM* cpu) if (cpu->Num != 1) // arm9 { - cpu->AddCycles_C(); // 1 X + cpu->AddCycles_CI(2); // 1 X ((ARMv5*)cpu)->AddCycles_MW(2); // 2 M } else cpu->AddCycles_C(); // arm7 @@ -314,7 +313,7 @@ void A_MRC(ARM* cpu) if (cpu->Num != 1) { - cpu->AddCycles_C(); // 1 Execute cycle + cpu->AddCycles_CI(2); // 1 Execute cycle ((ARMv5*)cpu)->AddCycles_MW(2); // 2 Memory 
cycles ((ARMv5*)cpu)->ILCurrReg = (cpu->CurInstr >> 12) & 0xF; // only one rd interlocks ((ARMv5*)cpu)->ILCurrTime = ((ARMv5*)cpu)->TimestampActual; diff --git a/src/ARMInterpreter_ALU.cpp b/src/ARMInterpreter_ALU.cpp index 5edf5a39..073d3530 100644 --- a/src/ARMInterpreter_ALU.cpp +++ b/src/ARMInterpreter_ALU.cpp @@ -924,7 +924,7 @@ void A_MUL(ARM* cpu) if (cpu->CurInstr & (1<<20)) cpu->AddCycles_CI(3); // S else { - cpu->AddCycles_C(); // 1 X + cpu->AddCycles_CI(2); // 1 X ((ARMv5*)cpu)->AddCycles_MW(2); // 2 M ((ARMv5*)cpu)->ILCurrReg = (cpu->CurInstr >> 16) & 0xF; @@ -971,7 +971,7 @@ void A_MLA(ARM* cpu) if (cpu->CurInstr & (1<<20)) cpu->AddCycles_CI(3); else { - cpu->AddCycles_C(); // 1 X + cpu->AddCycles_CI(2); // 1 X ((ARMv5*)cpu)->AddCycles_MW(2); // 2 M ((ARMv5*)cpu)->ILCurrReg = (cpu->CurInstr >> 16) & 0xF; @@ -1331,7 +1331,7 @@ void A_SMLALxy(ARM* cpu) (1 << ((cpu->CurInstr >> 8) & 0xF)) | (1 << ((cpu->CurInstr >> 12) & 0xF))/* | (1 << ((cpu->CurInstr >> 16) & 0xF))*/, iltime); - cpu->AddCycles_C(); // 1 X + cpu->AddCycles_CI(2); // 1 X ((ARMv5*)cpu)->AddCycles_MW(2); // 2 M ((ARMv5*)cpu)->ILCurrReg = (cpu->CurInstr >> 16) & 0xF; // only one rd interlocks diff --git a/src/CP15.cpp b/src/CP15.cpp index 2496fa68..27705262 100644 --- a/src/CP15.cpp +++ b/src/CP15.cpp @@ -636,7 +636,7 @@ bool ARMv5::DCacheLookup(const u32 addr) if (DCacheStreamPtr >= 7) { - DataCycles = 1; + NDS.ARM9Timestamp += DataCycles = 1; } else { @@ -644,6 +644,8 @@ bool ARMv5::DCacheLookup(const u32 addr) //if (NDS.ARM9Timestamp < nextfill) // can this ever really fail? 
{ DataCycles = nextfill - NDS.ARM9Timestamp; + if (DataCycles > (3<> 14][1] + (MemTimings[tag >> 14][2] * ((DCACHE_LINELENGTH / 4) - 2)); + NDS.ARM9Timestamp += MemTimings[tag >> 14][1] + (MemTimings[tag >> 14][2] * ((DCACHE_LINELENGTH / 4) - 1)); DataCycles = MemTimings[tag>>14][2]; DataRegion = NDS.ARM9Regions[addr>>14]; @@ -759,9 +761,9 @@ bool ARMv5::DCacheLookup(const u32 addr) u8 linepos = (addr & 0x1F) >> 2; // technically this is one too low, but we want that actually u64 cycles = ns + (seq * linepos); - DataCycles = cycles; - - cycles += NDS.ARM9Timestamp; + DataCycles = 3<> 2] = val; - DataCycles = 1; + NDS.ARM9Timestamp += DataCycles = 1; DataRegion = Mem9_DCache; #if !DISABLE_CACHEWRITEBACK if (PU_Map[addr >> CP15_MAP_ENTRYSIZE_LOG2] & CP15_MAP_BUFFERABLE) @@ -889,7 +891,7 @@ bool ARMv5::DCacheWrite16(const u32 addr, const u16 val) { u16 *cacheLine = (u16 *)&DCache[(id+set) << DCACHE_LINELENGTH_LOG2]; cacheLine[(addr & (DCACHE_LINELENGTH-1)) >> 1] = val; - DataCycles = 1; + NDS.ARM9Timestamp += DataCycles = 1; DataRegion = Mem9_DCache; #if !DISABLE_CACHEWRITEBACK if (PU_Map[addr >> CP15_MAP_ENTRYSIZE_LOG2] & CP15_MAP_BUFFERABLE) @@ -958,7 +960,7 @@ bool ARMv5::DCacheWrite8(const u32 addr, const u8 val) { u8 *cacheLine = &DCache[(id+set) << DCACHE_LINELENGTH_LOG2]; cacheLine[addr & (DCACHE_LINELENGTH-1)] = val; - DataCycles = 1; + NDS.ARM9Timestamp += DataCycles = 1; DataRegion = Mem9_DCache; #if !DISABLE_CACHEWRITEBACK if (PU_Map[addr >> CP15_MAP_ENTRYSIZE_LOG2] & CP15_MAP_BUFFERABLE) @@ -2113,21 +2115,7 @@ void ARMv5::DAbortHandle() DCacheStreamPtr = 7; } - DataCycles = 1; -} - -void ARMv5::DAbortHandleS() -{ - NDS.ARM9Timestamp += DataCycles; - - if (DCacheStreamPtr < 7) - { - u64 fillend = DCacheStreamTimes[6] + 1; - if (NDS.ARM9Timestamp < fillend) NDS.ARM9Timestamp = fillend; // checkme: should this be data cycles? 
- DCacheStreamPtr = 7; - } - - DataCycles = 1; + NDS.ARM9Timestamp += DataCycles = 1; } void ARMv5::DCacheFin8() @@ -2171,15 +2159,15 @@ void ARMv5::DRead8_2() if (addr < ITCMSize) { - DataCycles = 1; - ITCMTimestamp = NDS.ARM9Timestamp + DataCycles; + NDS.ARM9Timestamp += DataCycles = 1; + ITCMTimestamp = NDS.ARM9Timestamp; DataRegion = Mem9_ITCM; *val = *(u8*)&ITCM[addr & (ITCMPhysicalSize - 1)]; return; } if ((addr & DTCMMask) == DTCMBase) { - DataCycles = 1; + NDS.ARM9Timestamp += DataCycles = 1; DataRegion = Mem9_DTCM; *val = *(u8*)&DTCM[addr & (DTCMPhysicalSize - 1)]; return; @@ -2216,8 +2204,6 @@ void ARMv5::DRead8_2() NDS.ARM9Timestamp = (NDS.ARM9Timestamp + ((1<> 14][0]; - if ((addr >> 24) == 0x02) { MRTrack.Type = MainRAMType::Fetch; @@ -2226,12 +2212,14 @@ void ARMv5::DRead8_2() } else { + NDS.ARM9Timestamp += MemTimings[addr >> 14][0]; + DataCycles = 3<>14]; if ((NDS.ARM9Timestamp <= WBReleaseTS) && (DataRegion == WBLastRegion)) // check write buffer NDS.ARM9Timestamp += 1<> 14][0]; - if ((addr >> 24) == 0x02) { MRTrack.Type = MainRAMType::Fetch; @@ -2334,12 +2320,14 @@ void ARMv5::DRead16_2() } else { + NDS.ARM9Timestamp += MemTimings[addr >> 14][0]; + DataCycles = 3<>14]; if ((NDS.ARM9Timestamp <= WBReleaseTS) && (DataRegion == WBLastRegion)) // check write buffer NDS.ARM9Timestamp += 1<> 14][1]; - if ((addr >> 24) == 0x02) { MRTrack.Type = MainRAMType::Fetch; @@ -2443,12 +2429,14 @@ void ARMv5::DRead32_2() } else { + NDS.ARM9Timestamp += MemTimings[addr >> 14][1]; + DataCycles = 3<>14]; if ((NDS.ARM9Timestamp <= WBReleaseTS) && (DataRegion == WBLastRegion)) // check write buffer NDS.ARM9Timestamp += 1<>12] & CP15_MAP_READABLE)) [[unlikely]] { - QueueFunction(&ARMv5::DAbortHandleS); + QueueFunction(&ARMv5::DAbortHandle); return false; } @@ -2477,13 +2465,11 @@ void ARMv5::DRead32S_2() u32 addr = FetchAddr[reg]; u32 dummy; u32* val = (LDRFailedRegs & (1<>14][2]; - if ((addr >> 24) == 0x02) { MRTrack.Type = MainRAMType::Fetch; @@ -2541,12 +2525,14 @@ 
void ARMv5::DRead32S_2() } else { + NDS.ARM9Timestamp += MemTimings[addr>>14][2]; + DataCycles = MemTimings[addr>>14][2]; DataRegion = NDS.ARM9Regions[addr>>14]; if ((NDS.ARM9Timestamp <= WBReleaseTS) && (DataRegion == WBLastRegion)) // check write buffer NDS.ARM9Timestamp += 1<>14][1]; - if ((addr >> 24) == 0x02) { MRTrack.Type = MainRAMType::Fetch; @@ -2564,12 +2548,14 @@ void ARMv5::DRead32S_2() } else { + NDS.ARM9Timestamp += MemTimings[addr>>14][1]; + DataCycles = 3<>14]; if ((NDS.ARM9Timestamp <= WBReleaseTS) && (DataRegion == WBLastRegion)) // check write buffer NDS.ARM9Timestamp += 1<> 14][0]; - if ((addr >> 24) == 0x02) { MRTrack.Type = MainRAMType::Fetch; @@ -2663,10 +2647,12 @@ void ARMv5::DWrite8_2() } else { + NDS.ARM9Timestamp += MemTimings[addr >> 14][0]; + DataCycles = 3<>14]; - if (WBTimestamp < ((NDS.ARM9Timestamp + DataCycles + ((1<> 14][0]; - if ((addr >> 24) == 0x02) { MRTrack.Type = MainRAMType::Fetch; @@ -2771,10 +2755,12 @@ void ARMv5::DWrite16_2() } else { + NDS.ARM9Timestamp += MemTimings[addr >> 14][0]; + DataCycles = NDS.ARM9ClockShift; DataRegion = NDS.ARM9Regions[addr>>14]; - if (WBTimestamp < ((NDS.ARM9Timestamp + DataCycles + ((1<> 14][1]; - if ((addr >> 24) == 0x02) { MRTrack.Type = MainRAMType::Fetch; @@ -2884,10 +2868,12 @@ void ARMv5::DWrite32_2() } else { + NDS.ARM9Timestamp += MemTimings[addr >> 14][1]; + DataCycles = 3<>14]; - if (WBTimestamp < ((NDS.ARM9Timestamp + DataCycles + ((1<>12] & CP15_MAP_WRITEABLE)) [[unlikely]] { - QueueFunction(&ARMv5::DAbortHandleS); + QueueFunction(&ARMv5::DAbortHandle); return false; } @@ -2928,13 +2914,11 @@ void ARMv5::DWrite32S_2() u32 addr = FetchAddr[reg]; u32 val = STRVal[reg]; - NDS.ARM9Timestamp += DataCycles; - addr &= ~3; if (addr < ITCMSize) { - DataCycles = 1; + NDS.ARM9Timestamp += DataCycles = 1; // we update the timestamp during the actual function, as a sequential itcm access can only occur during instructions with strange itcm wait cycles DataRegion = Mem9_ITCM; 
*(u32*)&ITCM[addr & (ITCMPhysicalSize - 1)] = val; @@ -2946,7 +2930,7 @@ void ARMv5::DWrite32S_2() } if ((addr & DTCMMask) == DTCMBase) { - DataCycles = 1; + NDS.ARM9Timestamp += DataCycles = 1; DataRegion = Mem9_DTCM; *(u32*)&DTCM[addr & (DTCMPhysicalSize - 1)] = val; STRRegs &= ~1<>14][2]; - if ((addr >> 24) == 0x02) { MRTrack.Type = MainRAMType::Fetch; @@ -2994,25 +2976,18 @@ void ARMv5::DWrite32S_2() } else { + NDS.ARM9Timestamp += DataCycles = MemTimings[addr>>14][2]; DataRegion = NDS.ARM9Regions[addr>>14]; - if (WBTimestamp < ((NDS.ARM9Timestamp + DataCycles + ((1<>14][1]; - if ((addr >> 24) == 0x02) { MRTrack.Type = MainRAMType::Fetch; @@ -3021,10 +2996,12 @@ void ARMv5::DWrite32S_2() } else { + NDS.ARM9Timestamp += MemTimings[addr>>14][1]; + DataCycles = 3 << NDS.ARM9ClockShift; DataRegion = NDS.ARM9Regions[addr>>14]; - if (WBTimestamp < ((NDS.ARM9Timestamp + DataCycles + ((1<>= ARM9ClockShift; ARM9Target >>= ARM9ClockShift; + for (int i = 0; i < 7; i++) + { + ARM9.ICacheStreamTimes[i] >>= ARM9ClockShift; + ARM9.DCacheStreamTimes[i] >>= ARM9ClockShift; + } + ARM9.WBTimestamp >>= ARM9ClockShift; + ARM9.WBDelay >>= ARM9ClockShift; + ARM9.WBReleaseTS >>= ARM9ClockShift; + ARM9.WBInitialTS >>= ARM9ClockShift; Log(LogLevel::Debug, "CLOCK9=%04X\n", val); SCFG_Clock9 = val & 0x0187; @@ -1286,6 +1295,16 @@ void DSi::Set_SCFG_Clock9(u16 val) ARM9Timestamp <<= ARM9ClockShift; ARM9Target <<= ARM9ClockShift; + for (int i = 0; i < 7; i++) + { + ARM9.ICacheStreamTimes[i] <<= ARM9ClockShift; + ARM9.DCacheStreamTimes[i] <<= ARM9ClockShift; + } + ARM9.WBTimestamp <<= ARM9ClockShift; + ARM9.WBDelay <<= ARM9ClockShift; + ARM9.WBReleaseTS <<= ARM9ClockShift; + ARM9.WBInitialTS <<= ARM9ClockShift; + ARM9.UpdateRegionTimings(0x00000, 0x40000); } @@ -2562,7 +2581,7 @@ void DSi::ARM9IOWrite32(u32 addr, u32 val) if (oldvram != newvram) SetVRAMTimings(newvram); - /*switch ((SCFG_EXT[0] >> 14) & 0x3) + switch ((SCFG_EXT[0] >> 14) & 0x3) { case 0: case 1: @@ -2575,7 +2594,7 @@ void 
DSi::ARM9IOWrite32(u32 addr, u32 val) NDS::MainRAMMask = 0xFFFFFF; printf("RAM: 16MB\n"); break; - }*/ + } // HAX!! // a change to the RAM size setting is supposed to apply immediately (it does so on hardware) // however, doing so will cause DS-mode app startup to break, because the change happens while the ARM7 diff --git a/src/NDS.cpp b/src/NDS.cpp index 2436b926..034a2dcc 100644 --- a/src/NDS.cpp +++ b/src/NDS.cpp @@ -928,7 +928,8 @@ void NDS::MainRAMHandleARM9() if ((var & MRSequential) && A9WENTLAST) { - MainRAMTimestamp = A9ContentionTS += 2; + A9ContentionTS += 2; + MainRAMTimestamp += 2; ARM9.DataCycles = 2 << ARM9ClockShift; } else From 9a4dc9491010aeada8899a30b978e16730eb6420 Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Sat, 7 Dec 2024 09:36:33 -0500 Subject: [PATCH 235/306] reimplement interlocks --- src/ARM.cpp | 42 +++++++++---------- src/ARM.h | 46 ++++++++++++++++++--- src/ARMInterpreter.cpp | 3 +- src/ARMInterpreter_ALU.cpp | 70 +++++++++++++------------------- src/ARMInterpreter_LoadStore.cpp | 61 +++++++++------------------- src/CP15.cpp | 8 ++-- src/DSi.cpp | 8 ++++ 7 files changed, 121 insertions(+), 117 deletions(-) diff --git a/src/ARM.cpp b/src/ARM.cpp index 3a0ffcf1..942fff91 100644 --- a/src/ARM.cpp +++ b/src/ARM.cpp @@ -220,7 +220,7 @@ void ARMv5::Reset() Store = false; ITCMTimestamp = 0; - TimestampActual = 0; + TimestampMemory = 0; ILCurrReg = 16; ILPrevReg = 16; @@ -1373,7 +1373,7 @@ void ARMv5::CodeFetch() // in practice we can treat this as a 1 cycle fetch, with no penalties RetVal = NextInstr[1] >> 16; NDS.ARM9Timestamp++; - if (NDS.ARM9Timestamp < TimestampActual) NDS.ARM9Timestamp = TimestampActual; + if (NDS.ARM9Timestamp < TimestampMemory) NDS.ARM9Timestamp = TimestampMemory; Store = false; DataRegion = Mem9_Null; } @@ -1391,26 +1391,24 @@ void ARMv5::AddExecute() NDS.ARM9Timestamp += ExecuteCycles; } -void ARMv5::AddCycles_MW(s32 numM) -{ - DataCycles = numM; - 
QueueFunction(&ARMv5::AddCycles_MW_2); -} - void ARMv5::AddCycles_MW_2() { - TimestampActual = NDS.ARM9Timestamp; + TimestampMemory = NDS.ARM9Timestamp; NDS.ARM9Timestamp -= DataCycles; } -template -void ARMv5::HandleInterlocksExecute(u16 ilmask, u8* times) +void ARMv5::SetupInterlock_2() { - /* - if ((bitfield && (ilmask & (1<(u16 ilmask, u8* times); -template void ARMv5::HandleInterlocksExecute(u16 ilmask, u8* times); -void ARMv5::HandleInterlocksMemory(u8 reg) +void ARMv5::HandleInterlocksMemory_2() { - /* - if ((reg != ILPrevReg) || (NDS.ARM9Timestamp >= ILPrevTime)) return; + if ((ILQueueMemReg != ILPrevReg) || (NDS.ARM9Timestamp >= ILPrevTime)) return; u64 diff = ILPrevTime - NDS.ARM9Timestamp; // should always be 1? NDS.ARM9Timestamp = ILPrevTime; ITCMTimestamp += diff; // checkme - ILPrevTime = 16;*/ + ILPrevTime = 16; } void ARMv4::CodeRead16(u32 addr) diff --git a/src/ARM.h b/src/ARM.h index 6add4d29..c5a530cf 100644 --- a/src/ARM.h +++ b/src/ARM.h @@ -335,7 +335,11 @@ public: CodeFetch(); } - void AddCycles_MW(s32 numM); + void AddCycles_MW(s32 numM) + { + DataCycles = numM; + QueueFunction(&ARMv5::AddCycles_MW_2); + } void AddCycles_CDI() override { @@ -347,10 +351,33 @@ public: Store = true; // todo: queue this AddCycles_MW(DataCycles); } - + + inline void SetupInterlock(u8 reg, s8 delay = 0) + { + ILQueueReg = reg; + ILQueueDelay = delay; + + QueueFunction(&ARMv5::SetupInterlock_2); + } + template - void HandleInterlocksExecute(u16 ilmask, u8* times = NULL); - void HandleInterlocksMemory(u8 reg); + inline void HandleInterlocksExecute(u16 ilmask, u8* times = NULL) + { + if constexpr (bitfield) ILQueueMask = ilmask; + else ILQueueMask = 1<*QueueEntry)(); } + // Queue Functions void StartExec(); void AddExecute(); void AddCycles_MW_2(); @@ -695,6 +723,9 @@ public: void DWrite16_2(); void DWrite32_2(); void DWrite32S_2(); + void SetupInterlock_2(); + void HandleInterlocksExecute_2(); + void HandleInterlocksMemory_2(); void QueueUpdateMode() { 
UpdateMode(QueueMode[0], QueueMode[1], true); } void SignExtend8() { R[ExtReg] = (s32)(s8)R[ExtReg]; } void SignExtend16() { R[ExtReg] = (s32)(s16)R[ExtReg]; } @@ -760,7 +791,7 @@ public: bool (*GetMemRegion)(u32 addr, bool write, MemRegion* region); u64 ITCMTimestamp; - u64 TimestampActual; + u64 TimestampMemory; void (ARMv5::*FuncQueue[31])(void); u32 PC; bool NullFetch; @@ -770,6 +801,11 @@ public: u8 ILPrevReg; u64 ILCurrTime; u64 ILPrevTime; + u8 ILQueueReg; + s8 ILQueueDelay; + u8 ILQueueMemReg; + u8 ILQueueTimes[16]; + u16 ILQueueMask; u8 ICacheStreamPtr; u8 DCacheStreamPtr; diff --git a/src/ARMInterpreter.cpp b/src/ARMInterpreter.cpp index cfe5ef92..5c8b2b20 100644 --- a/src/ARMInterpreter.cpp +++ b/src/ARMInterpreter.cpp @@ -315,8 +315,7 @@ void A_MRC(ARM* cpu) { cpu->AddCycles_CI(2); // 1 Execute cycle ((ARMv5*)cpu)->AddCycles_MW(2); // 2 Memory cycles - ((ARMv5*)cpu)->ILCurrReg = (cpu->CurInstr >> 12) & 0xF; // only one rd interlocks - ((ARMv5*)cpu)->ILCurrTime = ((ARMv5*)cpu)->TimestampActual; + ((ARMv5*)cpu)->SetupInterlock((cpu->CurInstr >> 12) & 0xF); } else cpu->AddCycles_CI(2 + 1); // TODO: checkme } diff --git a/src/ARMInterpreter_ALU.cpp b/src/ARMInterpreter_ALU.cpp index 073d3530..66386274 100644 --- a/src/ARMInterpreter_ALU.cpp +++ b/src/ARMInterpreter_ALU.cpp @@ -927,8 +927,7 @@ void A_MUL(ARM* cpu) cpu->AddCycles_CI(2); // 1 X ((ARMv5*)cpu)->AddCycles_MW(2); // 2 M - ((ARMv5*)cpu)->ILCurrReg = (cpu->CurInstr >> 16) & 0xF; - ((ARMv5*)cpu)->ILCurrTime = ((ARMv5*)cpu)->TimestampActual; + ((ARMv5*)cpu)->SetupInterlock((cpu->CurInstr >> 16) & 0xF); } } else @@ -974,8 +973,7 @@ void A_MLA(ARM* cpu) cpu->AddCycles_CI(2); // 1 X ((ARMv5*)cpu)->AddCycles_MW(2); // 2 M - ((ARMv5*)cpu)->ILCurrReg = (cpu->CurInstr >> 16) & 0xF; - ((ARMv5*)cpu)->ILCurrTime = ((ARMv5*)cpu)->TimestampActual; + ((ARMv5*)cpu)->SetupInterlock((cpu->CurInstr >> 16) & 0xF); } } else @@ -1018,9 +1016,8 @@ void A_UMULL(ARM* cpu) { cpu->AddCycles_CI(2); - 
((ARMv5*)cpu)->AddCycles_MW(1); // dummy memory stage for interlock handling - ((ARMv5*)cpu)->ILCurrReg = (cpu->CurInstr >> 16) & 0xF; // only one rd interlocks - ((ARMv5*)cpu)->ILCurrTime = ((ARMv5*)cpu)->TimestampActual; + ((ARMv5*)cpu)->AddCycles_MW(1); // normally 1 length memory stages should be implicit, but we need one here explicitly for interlocks to work + ((ARMv5*)cpu)->SetupInterlock((cpu->CurInstr >> 16) & 0xF); // only one rd interlocks } } else @@ -1070,9 +1067,8 @@ void A_UMLAL(ARM* cpu) { cpu->AddCycles_CI(2); - ((ARMv5*)cpu)->AddCycles_MW(1); // dummy memory stage for interlock handling - ((ARMv5*)cpu)->ILCurrReg = (cpu->CurInstr >> 16) & 0xF; // only one rd interlocks - ((ARMv5*)cpu)->ILCurrTime = ((ARMv5*)cpu)->TimestampActual; + ((ARMv5*)cpu)->AddCycles_MW(1); // normally 1 length memory stages should be implicit, but we need one here explicitly for interlocks to work + ((ARMv5*)cpu)->SetupInterlock((cpu->CurInstr >> 16) & 0xF); // only one rd interlocks } } else @@ -1115,9 +1111,8 @@ void A_SMULL(ARM* cpu) { cpu->AddCycles_CI(2); - ((ARMv5*)cpu)->AddCycles_MW(1); // dummy memory stage for interlock handling - ((ARMv5*)cpu)->ILCurrReg = (cpu->CurInstr >> 16) & 0xF; // only one rd interlocks - ((ARMv5*)cpu)->ILCurrTime = ((ARMv5*)cpu)->TimestampActual; + ((ARMv5*)cpu)->AddCycles_MW(1); // normally 1 length memory stages should be implicit, but we need one here explicitly for interlocks to work + ((ARMv5*)cpu)->SetupInterlock((cpu->CurInstr >> 16) & 0xF); // only one rd interlocks } } else @@ -1166,9 +1161,8 @@ void A_SMLAL(ARM* cpu) { cpu->AddCycles_CI(2); - ((ARMv5*)cpu)->AddCycles_MW(1); // dummy memory stage for interlock handling - ((ARMv5*)cpu)->ILCurrReg = (cpu->CurInstr >> 16) & 0xF; // only one rd interlocks - ((ARMv5*)cpu)->ILCurrTime = ((ARMv5*)cpu)->TimestampActual; + ((ARMv5*)cpu)->AddCycles_MW(1); // normally 1 length memory stages should be implicit, but we need one here explicitly for interlocks to work + 
((ARMv5*)cpu)->SetupInterlock((cpu->CurInstr >> 16) & 0xF); // only one rd interlocks } } else @@ -1213,9 +1207,8 @@ void A_SMLAxy(ARM* cpu) (1 << ((cpu->CurInstr >> 12) & 0xF)), iltime); cpu->AddCycles_C(); - ((ARMv5*)cpu)->AddCycles_MW(1); // dummy memory stage for interlock handling - ((ARMv5*)cpu)->ILCurrReg = (cpu->CurInstr >> 16) & 0xF; // only one rd interlocks - ((ARMv5*)cpu)->ILCurrTime = ((ARMv5*)cpu)->TimestampActual; + ((ARMv5*)cpu)->AddCycles_MW(1); // normally 1 length memory stages should be implicit, but we need one here explicitly for interlocks to work + ((ARMv5*)cpu)->SetupInterlock((cpu->CurInstr >> 16) & 0xF); } void A_SMLAWy(ARM* cpu) @@ -1234,6 +1227,7 @@ void A_SMLAWy(ARM* cpu) if (((cpu->CurInstr >> 16) & 0xF) != 15) cpu->R[(cpu->CurInstr >> 16) & 0xF] = res; + if (OverflowAdd(res_mul, rn)) cpu->CPSR |= 0x08000000; @@ -1244,9 +1238,8 @@ void A_SMLAWy(ARM* cpu) (1 << ((cpu->CurInstr >> 12) & 0xF)), iltime); cpu->AddCycles_C(); - ((ARMv5*)cpu)->AddCycles_MW(1); // dummy memory stage for interlock handling - ((ARMv5*)cpu)->ILCurrReg = (cpu->CurInstr >> 16) & 0xF; // only one rd interlocks - ((ARMv5*)cpu)->ILCurrTime = ((ARMv5*)cpu)->TimestampActual; + ((ARMv5*)cpu)->AddCycles_MW(1); // normally 1 length memory stages should be implicit, but we need one here explicitly for interlocks to work + ((ARMv5*)cpu)->SetupInterlock((cpu->CurInstr >> 16) & 0xF); } void A_SMULxy(ARM* cpu) @@ -1271,9 +1264,8 @@ void A_SMULxy(ARM* cpu) (1 << ((cpu->CurInstr >> 8) & 0xF))); cpu->AddCycles_C(); - ((ARMv5*)cpu)->AddCycles_MW(1); // dummy memory stage for interlock handling - ((ARMv5*)cpu)->ILCurrReg = (cpu->CurInstr >> 16) & 0xF; // only one rd interlocks - ((ARMv5*)cpu)->ILCurrTime = ((ARMv5*)cpu)->TimestampActual; + ((ARMv5*)cpu)->AddCycles_MW(1); // normally 1 length memory stages should be implicit, but we need one here explicitly for interlocks to work + ((ARMv5*)cpu)->SetupInterlock((cpu->CurInstr >> 16) & 0xF); } void A_SMULWy(ARM* cpu) @@ -1296,9 
+1288,8 @@ void A_SMULWy(ARM* cpu) (1 << ((cpu->CurInstr >> 8) & 0xF))); cpu->AddCycles_C(); - ((ARMv5*)cpu)->AddCycles_MW(1); // dummy memory stage for interlock handling - ((ARMv5*)cpu)->ILCurrReg = (cpu->CurInstr >> 16) & 0xF; // only one rd interlocks - ((ARMv5*)cpu)->ILCurrTime = ((ARMv5*)cpu)->TimestampActual; + ((ARMv5*)cpu)->AddCycles_MW(1); // normally 1 length memory stages should be implicit, but we need one here explicitly for interlocks to work + ((ARMv5*)cpu)->SetupInterlock((cpu->CurInstr >> 16) & 0xF); } void A_SMLALxy(ARM* cpu) @@ -1334,8 +1325,7 @@ void A_SMLALxy(ARM* cpu) cpu->AddCycles_CI(2); // 1 X ((ARMv5*)cpu)->AddCycles_MW(2); // 2 M - ((ARMv5*)cpu)->ILCurrReg = (cpu->CurInstr >> 16) & 0xF; // only one rd interlocks - ((ARMv5*)cpu)->ILCurrTime = ((ARMv5*)cpu)->TimestampActual; + ((ARMv5*)cpu)->SetupInterlock((cpu->CurInstr >> 16) & 0xF); } @@ -1388,9 +1378,8 @@ void A_QADD(ARM* cpu) ((ARMv5*)cpu)->HandleInterlocksExecute((1 << (cpu->CurInstr & 0xF)) | (1 << ((cpu->CurInstr >> 16) & 0xF))); cpu->AddCycles_C(); - ((ARMv5*)cpu)->AddCycles_MW(1); // dummy memory stage for interlock handling - ((ARMv5*)cpu)->ILCurrReg = (cpu->CurInstr >> 12) & 0xF; // only one rd interlocks - ((ARMv5*)cpu)->ILCurrTime = ((ARMv5*)cpu)->TimestampActual; + ((ARMv5*)cpu)->AddCycles_MW(1); // normally 1 length memory stages should be implicit, but we need one here explicitly for interlocks to work + ((ARMv5*)cpu)->SetupInterlock((cpu->CurInstr >> 12) & 0xF); } void A_QSUB(ARM* cpu) @@ -1413,9 +1402,8 @@ void A_QSUB(ARM* cpu) ((ARMv5*)cpu)->HandleInterlocksExecute((1 << (cpu->CurInstr & 0xF)) | (1 << ((cpu->CurInstr >> 16) & 0xF))); cpu->AddCycles_C(); - ((ARMv5*)cpu)->AddCycles_MW(1); // dummy memory stage for interlock handling - ((ARMv5*)cpu)->ILCurrReg = (cpu->CurInstr >> 12) & 0xF; // only one rd interlocks - ((ARMv5*)cpu)->ILCurrTime = ((ARMv5*)cpu)->TimestampActual; + ((ARMv5*)cpu)->AddCycles_MW(1); // normally 1 length memory stages should be implicit, but we 
need one here explicitly for interlocks to work + ((ARMv5*)cpu)->SetupInterlock((cpu->CurInstr >> 12) & 0xF); } void A_QDADD(ARM* cpu) @@ -1446,9 +1434,8 @@ void A_QDADD(ARM* cpu) ((ARMv5*)cpu)->HandleInterlocksExecute((1 << (cpu->CurInstr & 0xF)) | (1 << ((cpu->CurInstr >> 16) & 0xF))); cpu->AddCycles_C(); - ((ARMv5*)cpu)->AddCycles_MW(1); // dummy memory stage for interlock handling - ((ARMv5*)cpu)->ILCurrReg = (cpu->CurInstr >> 12) & 0xF; // only one rd interlocks - ((ARMv5*)cpu)->ILCurrTime = ((ARMv5*)cpu)->TimestampActual; + ((ARMv5*)cpu)->AddCycles_MW(1); // normally 1 length memory stages should be implicit, but we need one here explicitly for interlocks to work + ((ARMv5*)cpu)->SetupInterlock((cpu->CurInstr >> 12) & 0xF); } void A_QDSUB(ARM* cpu) @@ -1479,9 +1466,8 @@ void A_QDSUB(ARM* cpu) ((ARMv5*)cpu)->HandleInterlocksExecute((1 << (cpu->CurInstr & 0xF)) | (1 << ((cpu->CurInstr >> 16) & 0xF))); cpu->AddCycles_C(); - ((ARMv5*)cpu)->AddCycles_MW(1); // dummy memory stage for interlock handling - ((ARMv5*)cpu)->ILCurrReg = (cpu->CurInstr >> 12) & 0xF; // only one rd interlocks - ((ARMv5*)cpu)->ILCurrTime = ((ARMv5*)cpu)->TimestampActual; + ((ARMv5*)cpu)->AddCycles_MW(1); // normally 1 length memory stages should be implicit, but we need one here explicitly for interlocks to work + ((ARMv5*)cpu)->SetupInterlock((cpu->CurInstr >> 12) & 0xF); } diff --git a/src/ARMInterpreter_LoadStore.cpp b/src/ARMInterpreter_LoadStore.cpp index 4bf4984e..697d9a6e 100644 --- a/src/ARMInterpreter_LoadStore.cpp +++ b/src/ARMInterpreter_LoadStore.cpp @@ -170,19 +170,11 @@ void LoadSingle(ARM* cpu, const u8 rd, const u8 rn, const s32 offset, const u16 if (rd == 15) { - //if (cpu->Num==0) cpu->NDS.ARM9Timestamp = ((ARMv5*)cpu)->TimestampActual + ((size<32) || (addr&0x3)); // force an interlock + //if (cpu->Num==0) cpu->NDS.ARM9Timestamp = ((ARMv5*)cpu)->TimestampMemory + ((size<32) || (addr&0x3)); // force an interlock cpu->JumpTo(cpu->R[15], false, 1); } - else - { - if (cpu->Num 
== 0) - { - ((ARMv5*)cpu)->ILCurrReg = rd; - bool extra = ((size < 32) || (addr&0x3)); - ((ARMv5*)cpu)->ILCurrTime = ((ARMv5*)cpu)->TimestampActual + extra; - } - } + else if (cpu->Num == 0) ((ARMv5*)cpu)->SetupInterlock(rd, (size < 32) || (addr&0x3)); } template @@ -381,12 +373,10 @@ A_IMPLEMENT_WB_LDRSTR(LDRB) ((ARMv5*)cpu)->DataAbort(); \ return; } \ if (r+1 == 15) { \ - /*if (cpu->Num==0) cpu->NDS.ARM9Timestamp = ((ARMv5*)cpu)->TimestampActual;*/ \ + /*if (cpu->Num==0) cpu->NDS.ARM9Timestamp = ((ARMv5*)cpu)->TimestampMemory;*/ \ cpu->JumpTo(cpu->R[15], cpu->CurInstr & (1<<22), 1); } /* restores cpsr presumably due to shared dna with ldm */ \ else { \ - if (cpu->Num == 0) { \ - ((ARMv5*)cpu)->ILCurrReg = r+1; \ - ((ARMv5*)cpu)->ILCurrTime = ((ARMv5*)cpu)->TimestampActual; } } \ + if (cpu->Num == 0) ((ARMv5*)cpu)->SetupInterlock(r+1); } \ if (cpu->CurInstr & (1<<21)) cpu->R[(cpu->CurInstr>>16) & 0xF] = offset; #define A_LDRD_POST \ @@ -405,12 +395,10 @@ A_IMPLEMENT_WB_LDRSTR(LDRB) ((ARMv5*)cpu)->DataAbort(); \ return; } \ if (r+1 == 15) { \ - /*if (cpu->Num==0) cpu->NDS.ARM9Timestamp = ((ARMv5*)cpu)->TimestampActual;*/ \ + /*if (cpu->Num==0) cpu->NDS.ARM9Timestamp = ((ARMv5*)cpu)->TimestampMemory;*/ \ cpu->JumpTo(cpu->R[15], cpu->CurInstr & (1<<22), 1); } /* restores cpsr presumably due to shared dna with ldm */ \ else { \ - if (cpu->Num == 0) { \ - ((ARMv5*)cpu)->ILCurrReg = r+1; \ - ((ARMv5*)cpu)->ILCurrTime = ((ARMv5*)cpu)->TimestampActual; } } \ + if (cpu->Num == 0) ((ARMv5*)cpu)->SetupInterlock(r+1); } \ cpu->R[(cpu->CurInstr>>16) & 0xF] += offset; #define A_STRD \ @@ -538,12 +526,7 @@ inline void SWP(ARM* cpu) if (rd != 15) { - if (cpu->Num == 0) - { - ((ARMv5*)cpu)->ILCurrReg = rd; - bool extra = (byte || (base&0x3)); - ((ARMv5*)cpu)->ILCurrTime = ((ARMv5*)cpu)->TimestampActual + extra; - } + if (cpu->Num == 0) ((ARMv5*)cpu)->SetupInterlock(rd, byte || (base&0x3)); } else if (cpu->Num==1) // for some reason these jumps don't seem to work on the arm 9? 
{ @@ -698,7 +681,7 @@ void A_LDM(ARM* cpu) { //if (cpu->Num == 0 && cpu->DataRegion == Mem9_ITCM) cpu->NDS.ARM9Timestamp += 1; cpu->AddCycles_CDI(); - if (cpu->Num == 0) ;//cpu->NDS.ARM9Timestamp = ((ARMv5*)cpu)->TimestampActual; // on arm9 single reg ldm/stm cannot overlap memory and fetch stages + if (cpu->Num == 0) ;//cpu->NDS.ARM9Timestamp = ((ARMv5*)cpu)->TimestampMemory; // on arm9 single reg ldm/stm cannot overlap memory and fetch stages else; // CHECKME: ARM7 timing behavior? } else @@ -756,14 +739,13 @@ void A_LDM(ARM* cpu) // jump if pc got written if (cpu->CurInstr & (1<<15)) { - //if (cpu->Num==0) cpu->NDS.ARM9Timestamp = ((ARMv5*)cpu)->TimestampActual; // force an interlock + //if (cpu->Num==0) cpu->NDS.ARM9Timestamp = ((ARMv5*)cpu)->TimestampMemory; // force an interlock cpu->JumpTo(cpu->R[15], cpu->CurInstr & (1<<22), 1); } else if (cpu->Num == 0) { u8 lastreg = 31 - __builtin_clz(cpu->CurInstr & 0x7FFF); - ((ARMv5*)cpu)->ILCurrReg = lastreg; - ((ARMv5*)cpu)->ILCurrTime = ((ARMv5*)cpu)->TimestampActual; + ((ARMv5*)cpu)->SetupInterlock(lastreg); } } @@ -849,7 +831,7 @@ void A_STM(ARM* cpu) { //if (cpu->Num == 0 && cpu->DataRegion == Mem9_ITCM) cpu->NDS.ARM9Timestamp += 1; cpu->AddCycles_CD(); - if (cpu->Num == 0);// cpu->NDS.ARM9Timestamp = ((ARMv5*)cpu)->TimestampActual; // on arm9 single reg ldm/stm cannot overlap memory and fetch stages + if (cpu->Num == 0);// cpu->NDS.ARM9Timestamp = ((ARMv5*)cpu)->TimestampMemory; // on arm9 single reg ldm/stm cannot overlap memory and fetch stages else; // CHECKME: ARM7 timing behavior? 
} else @@ -889,8 +871,7 @@ void T_LDR_PCREL(ARM* cpu) if (dabort) [[unlikely]] ((ARMv5*)cpu)->DataAbort(); else if (cpu->Num == 0) { - ((ARMv5*)cpu)->ILCurrReg = (cpu->CurInstr >> 8) & 0x7; - ((ARMv5*)cpu)->ILCurrTime = ((ARMv5*)cpu)->TimestampActual; + ((ARMv5*)cpu)->SetupInterlock((cpu->CurInstr >> 8) & 0x7); } } @@ -1034,7 +1015,7 @@ void T_PUSH(ARM* cpu) { //if (cpu->Num == 0 && cpu->DataRegion == Mem9_ITCM) cpu->NDS.ARM9Timestamp += 1; cpu->AddCycles_CD(); - if (cpu->Num == 0);// cpu->NDS.ARM9Timestamp = ((ARMv5*)cpu)->TimestampActual; // on arm9 single reg ldm/stm cannot overlap memory and fetch stages + if (cpu->Num == 0);// cpu->NDS.ARM9Timestamp = ((ARMv5*)cpu)->TimestampMemory; // on arm9 single reg ldm/stm cannot overlap memory and fetch stages else; // CHECKME: ARM7 timing behavior? } else @@ -1090,7 +1071,7 @@ void T_POP(ARM* cpu) { //if (cpu->Num == 0 && cpu->DataRegion == Mem9_ITCM) cpu->NDS.ARM9Timestamp += 1; cpu->AddCycles_CDI(); - if (cpu->Num == 0);// cpu->NDS.ARM9Timestamp = ((ARMv5*)cpu)->TimestampActual; // on arm9 single reg ldm/stm cannot overlap memory and fetch stages + if (cpu->Num == 0);// cpu->NDS.ARM9Timestamp = ((ARMv5*)cpu)->TimestampMemory; // on arm9 single reg ldm/stm cannot overlap memory and fetch stages else; // CHECKME: ARM7 timing behavior? 
} else @@ -1101,7 +1082,7 @@ void T_POP(ARM* cpu) if (!dabort) [[likely]] { - //if (cpu->Num==0) cpu->NDS.ARM9Timestamp = ((ARMv5*)cpu)->TimestampActual; // force an interlock + //if (cpu->Num==0) cpu->NDS.ARM9Timestamp = ((ARMv5*)cpu)->TimestampMemory; // force an interlock cpu->JumpTo(cpu->R[15], false, 2); base += 4; @@ -1120,7 +1101,7 @@ void T_POP(ARM* cpu) { //if (cpu->Num == 0 && cpu->DataRegion == Mem9_ITCM) cpu->NDS.ARM9Timestamp += 1; cpu->AddCycles_CDI(); - if (cpu->Num == 0);// cpu->NDS.ARM9Timestamp = ((ARMv5*)cpu)->TimestampActual; // on arm9 single reg ldm/stm cannot overlap memory and fetch stages + if (cpu->Num == 0);// cpu->NDS.ARM9Timestamp = ((ARMv5*)cpu)->TimestampMemory; // on arm9 single reg ldm/stm cannot overlap memory and fetch stages else; // CHECKME: ARM7 timing behavior? } else @@ -1139,8 +1120,7 @@ void T_POP(ARM* cpu) else { u8 lastreg = 31 - __builtin_clz(cpu->CurInstr & 0xFF); - ((ARMv5*)cpu)->ILCurrReg = lastreg; - ((ARMv5*)cpu)->ILCurrTime = ((ARMv5*)cpu)->TimestampActual; + ((ARMv5*)cpu)->SetupInterlock(lastreg); } } } @@ -1182,7 +1162,7 @@ void T_STMIA(ARM* cpu) { //if (cpu->Num == 0 && cpu->DataRegion == Mem9_ITCM) cpu->NDS.ARM9Timestamp += 1; cpu->AddCycles_CD(); - if (cpu->Num == 0);// cpu->NDS.ARM9Timestamp = ((ARMv5*)cpu)->TimestampActual; // on arm9 single reg ldm/stm cannot overlap memory and fetch stages + if (cpu->Num == 0);// cpu->NDS.ARM9Timestamp = ((ARMv5*)cpu)->TimestampMemory; // on arm9 single reg ldm/stm cannot overlap memory and fetch stages else; // CHECKME: ARM7 timing behavior? 
} else @@ -1232,7 +1212,7 @@ void T_LDMIA(ARM* cpu) { //if (cpu->Num == 0 && cpu->DataRegion == Mem9_ITCM) cpu->NDS.ARM9Timestamp += 1; cpu->AddCycles_CDI(); - if (cpu->Num == 0);// cpu->NDS.ARM9Timestamp = ((ARMv5*)cpu)->TimestampActual; // on arm9 single reg ldm/stm cannot overlap memory and fetch stages + if (cpu->Num == 0);// cpu->NDS.ARM9Timestamp = ((ARMv5*)cpu)->TimestampMemory; // on arm9 single reg ldm/stm cannot overlap memory and fetch stages else; // CHECKME: ARM7 timing behavior? } else @@ -1250,8 +1230,7 @@ void T_LDMIA(ARM* cpu) if (cpu->Num == 0) { u8 lastreg = 31 - __builtin_clz(cpu->CurInstr & 0xFF); - ((ARMv5*)cpu)->ILCurrReg = lastreg; - ((ARMv5*)cpu)->ILCurrTime = ((ARMv5*)cpu)->TimestampActual; + ((ARMv5*)cpu)->SetupInterlock(lastreg); } if (!(cpu->CurInstr & (1<<((cpu->CurInstr >> 8) & 0x7)))) diff --git a/src/CP15.cpp b/src/CP15.cpp index 27705262..e15fb41b 100644 --- a/src/CP15.cpp +++ b/src/CP15.cpp @@ -438,7 +438,7 @@ bool ARMv5::ICacheLookup(const u32 addr) ICacheStreamPtr = 7; } } - if (NDS.ARM9Timestamp < TimestampActual) NDS.ARM9Timestamp = TimestampActual; + if (NDS.ARM9Timestamp < TimestampMemory) NDS.ARM9Timestamp = TimestampMemory; DataRegion = Mem9_Null; Store = false; @@ -521,7 +521,7 @@ bool ARMv5::ICacheLookup(const u32 addr) if (CP15BISTTestStateRegister & CP15_BIST_TR_DISABLE_ICACHE_STREAMING) [[unlikely]] { NDS.ARM9Timestamp += MemTimings[tag >> 14][1] + (MemTimings[tag >> 14][2] * ((DCACHE_LINELENGTH / 4) - 1)); - if (NDS.ARM9Timestamp < TimestampActual) NDS.ARM9Timestamp = TimestampActual; // this should never trigger in practice + if (NDS.ARM9Timestamp < TimestampMemory) NDS.ARM9Timestamp = TimestampMemory; // this should never trigger in practice } else // ICache Streaming logic { @@ -2034,7 +2034,7 @@ void ARMv5::CodeRead32(u32 addr) if (!(PU_Map[addr>>12] & CP15_MAP_EXECUTABLE)) [[unlikely]] { NDS.ARM9Timestamp += 1; - if (NDS.ARM9Timestamp < TimestampActual) NDS.ARM9Timestamp = TimestampActual; + if 
(NDS.ARM9Timestamp < TimestampMemory) NDS.ARM9Timestamp = TimestampMemory; DataRegion = Mem9_Null; Store = false; RetVal = ((u64)1<<63); @@ -2045,7 +2045,7 @@ void ARMv5::CodeRead32(u32 addr) { if (NDS.ARM9Timestamp < ITCMTimestamp) NDS.ARM9Timestamp = ITCMTimestamp; NDS.ARM9Timestamp += 1; - if (NDS.ARM9Timestamp < TimestampActual) NDS.ARM9Timestamp = TimestampActual; + if (NDS.ARM9Timestamp < TimestampMemory) NDS.ARM9Timestamp = TimestampMemory; DataRegion = Mem9_Null; Store = false; RetVal = *(u32*)&ITCM[addr & (ITCMPhysicalSize - 1)]; diff --git a/src/DSi.cpp b/src/DSi.cpp index 7ae969af..e1e6816e 100644 --- a/src/DSi.cpp +++ b/src/DSi.cpp @@ -1282,10 +1282,14 @@ void DSi::Set_SCFG_Clock9(u16 val) ARM9.ICacheStreamTimes[i] >>= ARM9ClockShift; ARM9.DCacheStreamTimes[i] >>= ARM9ClockShift; } + ARM9.TimestampMemory >>= ARM9ClockShift; + ARM9.ITCMTimestamp >>= ARM9ClockShift; ARM9.WBTimestamp >>= ARM9ClockShift; ARM9.WBDelay >>= ARM9ClockShift; ARM9.WBReleaseTS >>= ARM9ClockShift; ARM9.WBInitialTS >>= ARM9ClockShift; + ARM9.ILCurrTime >>= ARM9ClockShift; + ARM9.ILPrevTime >>= ARM9ClockShift; Log(LogLevel::Debug, "CLOCK9=%04X\n", val); SCFG_Clock9 = val & 0x0187; @@ -1300,10 +1304,14 @@ void DSi::Set_SCFG_Clock9(u16 val) ARM9.ICacheStreamTimes[i] <<= ARM9ClockShift; ARM9.DCacheStreamTimes[i] <<= ARM9ClockShift; } + ARM9.TimestampMemory <<= ARM9ClockShift; + ARM9.ITCMTimestamp <<= ARM9ClockShift; ARM9.WBTimestamp <<= ARM9ClockShift; ARM9.WBDelay <<= ARM9ClockShift; ARM9.WBReleaseTS <<= ARM9ClockShift; ARM9.WBInitialTS <<= ARM9ClockShift; + ARM9.ILCurrTime <<= ARM9ClockShift; + ARM9.ILPrevTime <<= ARM9ClockShift; ARM9.UpdateRegionTimings(0x00000, 0x40000); } From 98f24d05c7736aeeab3db118e0e14f3b3097715b Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Sat, 7 Dec 2024 09:55:55 -0500 Subject: [PATCH 236/306] reimplement forced interlocks --- src/ARM.cpp | 5 +++++ src/ARM.h | 8 ++++++++ src/ARMInterpreter_LoadStore.cpp | 24 
++++++++++++------------ 3 files changed, 25 insertions(+), 12 deletions(-) diff --git a/src/ARM.cpp b/src/ARM.cpp index 942fff91..040d8bfb 100644 --- a/src/ARM.cpp +++ b/src/ARM.cpp @@ -1447,6 +1447,11 @@ void ARMv5::HandleInterlocksMemory_2() ILPrevTime = 16; } +void ARMv5::ForceInterlock_2() +{ + NDS.ARM9Timestamp = TimestampMemory + ILForceDelay; +} + void ARMv4::CodeRead16(u32 addr) { if ((addr >> 24) == 0x02) diff --git a/src/ARM.h b/src/ARM.h index c5a530cf..b78c4130 100644 --- a/src/ARM.h +++ b/src/ARM.h @@ -372,6 +372,12 @@ public: QueueFunction(&ARMv5::HandleInterlocksExecute_2); } + inline void ForceInterlock(s8 delay = 0) + { + ILForceDelay = delay; + QueueFunction(&ARMv5::ForceInterlock_2); + } + inline void HandleInterlocksMemory(u8 reg) { ILQueueMemReg = reg; @@ -726,6 +732,7 @@ public: void SetupInterlock_2(); void HandleInterlocksExecute_2(); void HandleInterlocksMemory_2(); + void ForceInterlock_2(); void QueueUpdateMode() { UpdateMode(QueueMode[0], QueueMode[1], true); } void SignExtend8() { R[ExtReg] = (s32)(s8)R[ExtReg]; } void SignExtend16() { R[ExtReg] = (s32)(s16)R[ExtReg]; } @@ -812,6 +819,7 @@ public: u64 ICacheStreamTimes[7]; u64 DCacheStreamTimes[7]; + s8 ILForceDelay; u8 WBWritePointer; // which entry to attempt to write next; should always be ANDed with 0xF after incrementing u8 WBFillPointer; // where the next entry should be added; should always be ANDed with 0xF after incrementing u8 WBWriting; // whether the buffer is actively trying to perform a write diff --git a/src/ARMInterpreter_LoadStore.cpp b/src/ARMInterpreter_LoadStore.cpp index 697d9a6e..ff9d6e5c 100644 --- a/src/ARMInterpreter_LoadStore.cpp +++ b/src/ARMInterpreter_LoadStore.cpp @@ -170,7 +170,7 @@ void LoadSingle(ARM* cpu, const u8 rd, const u8 rn, const s32 offset, const u16 if (rd == 15) { - //if (cpu->Num==0) cpu->NDS.ARM9Timestamp = ((ARMv5*)cpu)->TimestampMemory + ((size<32) || (addr&0x3)); // force an interlock + if (cpu->Num==0) 
((ARMv5*)cpu)->ForceInterlock((size<32) || (addr&0x3)); cpu->JumpTo(cpu->R[15], false, 1); } @@ -373,7 +373,7 @@ A_IMPLEMENT_WB_LDRSTR(LDRB) ((ARMv5*)cpu)->DataAbort(); \ return; } \ if (r+1 == 15) { \ - /*if (cpu->Num==0) cpu->NDS.ARM9Timestamp = ((ARMv5*)cpu)->TimestampMemory;*/ \ + if (cpu->Num==0) ((ARMv5*)cpu)->ForceInterlock(); \ cpu->JumpTo(cpu->R[15], cpu->CurInstr & (1<<22), 1); } /* restores cpsr presumably due to shared dna with ldm */ \ else { \ if (cpu->Num == 0) ((ARMv5*)cpu)->SetupInterlock(r+1); } \ @@ -395,7 +395,7 @@ A_IMPLEMENT_WB_LDRSTR(LDRB) ((ARMv5*)cpu)->DataAbort(); \ return; } \ if (r+1 == 15) { \ - /*if (cpu->Num==0) cpu->NDS.ARM9Timestamp = ((ARMv5*)cpu)->TimestampMemory;*/ \ + if (cpu->Num==0) ((ARMv5*)cpu)->ForceInterlock(); ; \ cpu->JumpTo(cpu->R[15], cpu->CurInstr & (1<<22), 1); } /* restores cpsr presumably due to shared dna with ldm */ \ else { \ if (cpu->Num == 0) ((ARMv5*)cpu)->SetupInterlock(r+1); } \ @@ -681,7 +681,7 @@ void A_LDM(ARM* cpu) { //if (cpu->Num == 0 && cpu->DataRegion == Mem9_ITCM) cpu->NDS.ARM9Timestamp += 1; cpu->AddCycles_CDI(); - if (cpu->Num == 0) ;//cpu->NDS.ARM9Timestamp = ((ARMv5*)cpu)->TimestampMemory; // on arm9 single reg ldm/stm cannot overlap memory and fetch stages + if (cpu->Num == 0) ((ARMv5*)cpu)->ForceInterlock(); // on arm9 single reg ldm/stm cannot overlap memory and fetch stages else; // CHECKME: ARM7 timing behavior? 
} else @@ -739,7 +739,7 @@ void A_LDM(ARM* cpu) // jump if pc got written if (cpu->CurInstr & (1<<15)) { - //if (cpu->Num==0) cpu->NDS.ARM9Timestamp = ((ARMv5*)cpu)->TimestampMemory; // force an interlock + if (cpu->Num==0) ((ARMv5*)cpu)->ForceInterlock(); cpu->JumpTo(cpu->R[15], cpu->CurInstr & (1<<22), 1); } else if (cpu->Num == 0) @@ -831,7 +831,7 @@ void A_STM(ARM* cpu) { //if (cpu->Num == 0 && cpu->DataRegion == Mem9_ITCM) cpu->NDS.ARM9Timestamp += 1; cpu->AddCycles_CD(); - if (cpu->Num == 0);// cpu->NDS.ARM9Timestamp = ((ARMv5*)cpu)->TimestampMemory; // on arm9 single reg ldm/stm cannot overlap memory and fetch stages + if (cpu->Num == 0) ((ARMv5*)cpu)->ForceInterlock(); // on arm9 single reg ldm/stm cannot overlap memory and fetch stages else; // CHECKME: ARM7 timing behavior? } else @@ -1015,7 +1015,7 @@ void T_PUSH(ARM* cpu) { //if (cpu->Num == 0 && cpu->DataRegion == Mem9_ITCM) cpu->NDS.ARM9Timestamp += 1; cpu->AddCycles_CD(); - if (cpu->Num == 0);// cpu->NDS.ARM9Timestamp = ((ARMv5*)cpu)->TimestampMemory; // on arm9 single reg ldm/stm cannot overlap memory and fetch stages + if (cpu->Num == 0) ((ARMv5*)cpu)->ForceInterlock(); // on arm9 single reg ldm/stm cannot overlap memory and fetch stages else; // CHECKME: ARM7 timing behavior? } else @@ -1071,7 +1071,7 @@ void T_POP(ARM* cpu) { //if (cpu->Num == 0 && cpu->DataRegion == Mem9_ITCM) cpu->NDS.ARM9Timestamp += 1; cpu->AddCycles_CDI(); - if (cpu->Num == 0);// cpu->NDS.ARM9Timestamp = ((ARMv5*)cpu)->TimestampMemory; // on arm9 single reg ldm/stm cannot overlap memory and fetch stages + if (cpu->Num == 0) ((ARMv5*)cpu)->ForceInterlock(); // on arm9 single reg ldm/stm cannot overlap memory and fetch stages else; // CHECKME: ARM7 timing behavior? 
} else @@ -1082,7 +1082,7 @@ void T_POP(ARM* cpu) if (!dabort) [[likely]] { - //if (cpu->Num==0) cpu->NDS.ARM9Timestamp = ((ARMv5*)cpu)->TimestampMemory; // force an interlock + if (cpu->Num==0) ((ARMv5*)cpu)->ForceInterlock(); cpu->JumpTo(cpu->R[15], false, 2); base += 4; @@ -1101,7 +1101,7 @@ void T_POP(ARM* cpu) { //if (cpu->Num == 0 && cpu->DataRegion == Mem9_ITCM) cpu->NDS.ARM9Timestamp += 1; cpu->AddCycles_CDI(); - if (cpu->Num == 0);// cpu->NDS.ARM9Timestamp = ((ARMv5*)cpu)->TimestampMemory; // on arm9 single reg ldm/stm cannot overlap memory and fetch stages + if (cpu->Num == 0) ((ARMv5*)cpu)->ForceInterlock(); // on arm9 single reg ldm/stm cannot overlap memory and fetch stages else; // CHECKME: ARM7 timing behavior? } else @@ -1162,7 +1162,7 @@ void T_STMIA(ARM* cpu) { //if (cpu->Num == 0 && cpu->DataRegion == Mem9_ITCM) cpu->NDS.ARM9Timestamp += 1; cpu->AddCycles_CD(); - if (cpu->Num == 0);// cpu->NDS.ARM9Timestamp = ((ARMv5*)cpu)->TimestampMemory; // on arm9 single reg ldm/stm cannot overlap memory and fetch stages + if (cpu->Num == 0) ((ARMv5*)cpu)->ForceInterlock(); // on arm9 single reg ldm/stm cannot overlap memory and fetch stages else; // CHECKME: ARM7 timing behavior? } else @@ -1212,7 +1212,7 @@ void T_LDMIA(ARM* cpu) { //if (cpu->Num == 0 && cpu->DataRegion == Mem9_ITCM) cpu->NDS.ARM9Timestamp += 1; cpu->AddCycles_CDI(); - if (cpu->Num == 0);// cpu->NDS.ARM9Timestamp = ((ARMv5*)cpu)->TimestampMemory; // on arm9 single reg ldm/stm cannot overlap memory and fetch stages + if (cpu->Num == 0) ((ARMv5*)cpu)->ForceInterlock(); // on arm9 single reg ldm/stm cannot overlap memory and fetch stages else; // CHECKME: ARM7 timing behavior? 
} else From d14c5ea246179674fe9057a3076182fb3b9a17f0 Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Sat, 7 Dec 2024 10:07:57 -0500 Subject: [PATCH 237/306] re-add itcm delay for ldm/stm --- src/ARM.cpp | 5 +++++ src/ARM.h | 12 ++++++++-- src/ARMInterpreter_LoadStore.cpp | 38 +++++++++++++++----------------- 3 files changed, 33 insertions(+), 22 deletions(-) diff --git a/src/ARM.cpp b/src/ARM.cpp index 040d8bfb..a7f19414 100644 --- a/src/ARM.cpp +++ b/src/ARM.cpp @@ -1398,6 +1398,11 @@ void ARMv5::AddCycles_MW_2() NDS.ARM9Timestamp -= DataCycles; } +void ARMv5::DelayIfITCM_2() +{ + if (DataRegion == Mem9_ITCM) NDS.ARM9Timestamp += ITCMDelay; +} + void ARMv5::SetupInterlock_2() { ILCurrReg = ILQueueReg; diff --git a/src/ARM.h b/src/ARM.h index b78c4130..737b196f 100644 --- a/src/ARM.h +++ b/src/ARM.h @@ -352,6 +352,12 @@ public: AddCycles_MW(DataCycles); } + void DelayIfITCM(s8 delay) + { + ITCMDelay = delay; + QueueFunction(&ARMv5::DelayIfITCM_2); + } + inline void SetupInterlock(u8 reg, s8 delay = 0) { ILQueueReg = reg; @@ -712,6 +718,7 @@ public: void StartExec(); void AddExecute(); void AddCycles_MW_2(); + void DelayIfITCM_2(); void JumpTo_2(); void JumpTo_3A(); void JumpTo_3B(); @@ -761,11 +768,11 @@ public: u8 ITCM[ITCMPhysicalSize]; //! Content of the ITCM u8* DTCM; //! Content of the DTCM - u8 ICache[ICACHE_SIZE]; //! Instruction Cache Content organized in @ref ICACHE_LINESPERSET times @ref ICACHE_SETS times @ref ICACHE_LINELENGTH bytes + alignas(u32) u8 ICache[ICACHE_SIZE]; //! Instruction Cache Content organized in @ref ICACHE_LINESPERSET times @ref ICACHE_SETS times @ref ICACHE_LINELENGTH bytes u32 ICacheTags[ICACHE_LINESPERSET*ICACHE_SETS]; //! Instruction Cache Tags organized in @ref ICACHE_LINESPERSET times @ref ICACHE_SETS Tags u8 ICacheCount; //! Global instruction line fill counter. Used for round-robin replacement strategy with the instruction cache - u8 DCache[DCACHE_SIZE]; //! 
Data Cache Content organized in @ref DCACHE_LINESPERSET times @ref DCACHE_SETS times @ref DCACHE_LINELENGTH bytes + alignas(u32) u8 DCache[DCACHE_SIZE]; //! Data Cache Content organized in @ref DCACHE_LINESPERSET times @ref DCACHE_SETS times @ref DCACHE_LINELENGTH bytes u32 DCacheTags[DCACHE_LINESPERSET*DCACHE_SETS]; //! Data Cache Tags organized in @ref DCACHE_LINESPERSET times @ref DCACHE_SETS Tags u8 DCacheCount; //! Global data line fill counter. Used for round-robin replacement strategy with the instruction cache @@ -803,6 +810,7 @@ public: u32 PC; bool NullFetch; bool Store; + s8 ITCMDelay; u8 ILCurrReg; u8 ILPrevReg; diff --git a/src/ARMInterpreter_LoadStore.cpp b/src/ARMInterpreter_LoadStore.cpp index ff9d6e5c..658ab4c8 100644 --- a/src/ARMInterpreter_LoadStore.cpp +++ b/src/ARMInterpreter_LoadStore.cpp @@ -366,7 +366,7 @@ A_IMPLEMENT_WB_LDRSTR(LDRB) ExecuteStage(cpu, ilmask | (1 << ((cpu->CurInstr>>16) & 0xF))); \ bool dabort = !cpu->DataRead32(offset, r); \ u32 oldval = cpu->R[r+1]; dabort |= !cpu->DataRead32S(offset+4, r+1); \ - /*if (cpu->DataRegion == Mem9_ITCM) cpu->NDS.ARM9Timestamp += 2;*/ \ + ((ARMv5*)cpu)->DelayIfITCM(2); \ cpu->AddCycles_CDI(); \ if (dabort) { \ cpu->R[r+1] = oldval; \ @@ -388,7 +388,7 @@ A_IMPLEMENT_WB_LDRSTR(LDRB) ExecuteStage(cpu, ilmask | (1 << ((cpu->CurInstr>>16) & 0xF))); \ bool dabort = !cpu->DataRead32(addr, r); \ u32 oldval = cpu->R[r+1]; dabort |= !cpu->DataRead32S(addr+4, r+1); \ - /*if (cpu->DataRegion == Mem9_ITCM) cpu->NDS.ARM9Timestamp += 2;*/ \ + ((ARMv5*)cpu)->DelayIfITCM(2); \ cpu->AddCycles_CDI(); \ if (dabort) { \ cpu->R[r+1] = oldval; \ @@ -411,7 +411,7 @@ A_IMPLEMENT_WB_LDRSTR(LDRB) bool dabort = !cpu->DataWrite32(offset, cpu->R[r], r); \ u32 storeval = cpu->R[r+1]; if (r+1 == 15) storeval+=4; \ dabort |= !cpu->DataWrite32S (offset+4, storeval, r+1); \ - /*if (cpu->DataRegion == Mem9_ITCM) cpu->NDS.ARM9Timestamp += 2;*/ \ + ((ARMv5*)cpu)->DelayIfITCM(2); \ cpu->AddCycles_CD(); \ if (dabort) [[unlikely]] { \ 
((ARMv5*)cpu)->DataAbort(); \ @@ -428,7 +428,7 @@ A_IMPLEMENT_WB_LDRSTR(LDRB) bool dabort = !cpu->DataWrite32(addr, cpu->R[r], r); \ u32 storeval = cpu->R[r+1]; if (r+1 == 15) storeval+=4; \ dabort |= !cpu->DataWrite32S (addr+4, storeval, r+1); \ - /*if (cpu->DataRegion == Mem9_ITCM) cpu->NDS.ARM9Timestamp += 2;*/ \ + ((ARMv5*)cpu)->DelayIfITCM(2); \ cpu->AddCycles_CD(); \ if (dabort) [[unlikely]] { \ ((ARMv5*)cpu)->DataAbort(); \ @@ -508,8 +508,6 @@ inline void SWP(ARM* cpu) if ((byte ? cpu->DataRead8 (base, rd) : cpu->DataRead32(base, rd))) [[likely]] { - //cpu->NDS.ARM9Timestamp += cpu->DataCycles; // checkme - if ((byte ? cpu->DataWrite8 (base, storeval, rm) : cpu->DataWrite32(base, storeval, rm))) [[likely]] { @@ -679,14 +677,14 @@ void A_LDM(ARM* cpu) if (__builtin_popcount(cpu->CurInstr & 0xFFFF) == 1) [[unlikely]] // single reg { - //if (cpu->Num == 0 && cpu->DataRegion == Mem9_ITCM) cpu->NDS.ARM9Timestamp += 1; + if (cpu->Num == 0) ((ARMv5*)cpu)->DelayIfITCM(1); cpu->AddCycles_CDI(); if (cpu->Num == 0) ((ARMv5*)cpu)->ForceInterlock(); // on arm9 single reg ldm/stm cannot overlap memory and fetch stages else; // CHECKME: ARM7 timing behavior? } else { - //if (cpu->Num == 0 && cpu->DataRegion == Mem9_ITCM) cpu->NDS.ARM9Timestamp += 2; + if (cpu->Num == 0) ((ARMv5*)cpu)->DelayIfITCM(2); cpu->AddCycles_CDI(); } @@ -829,14 +827,14 @@ void A_STM(ARM* cpu) if (__builtin_popcount(cpu->CurInstr & 0xFFFF) == 1) [[unlikely]] // single reg { - //if (cpu->Num == 0 && cpu->DataRegion == Mem9_ITCM) cpu->NDS.ARM9Timestamp += 1; + if (cpu->Num == 0) ((ARMv5*)cpu)->DelayIfITCM(1); cpu->AddCycles_CD(); if (cpu->Num == 0) ((ARMv5*)cpu)->ForceInterlock(); // on arm9 single reg ldm/stm cannot overlap memory and fetch stages else; // CHECKME: ARM7 timing behavior? 
} else { - //if (cpu->Num == 0 && cpu->DataRegion == Mem9_ITCM) cpu->NDS.ARM9Timestamp += 2; + if (cpu->Num == 0) ((ARMv5*)cpu)->DelayIfITCM(2); cpu->AddCycles_CD(); } @@ -1013,14 +1011,14 @@ void T_PUSH(ARM* cpu) if (__builtin_popcount(cpu->CurInstr & 0x1FF) == 1) [[unlikely]] // single reg { - //if (cpu->Num == 0 && cpu->DataRegion == Mem9_ITCM) cpu->NDS.ARM9Timestamp += 1; + if (cpu->Num == 0) ((ARMv5*)cpu)->DelayIfITCM(1); cpu->AddCycles_CD(); if (cpu->Num == 0) ((ARMv5*)cpu)->ForceInterlock(); // on arm9 single reg ldm/stm cannot overlap memory and fetch stages else; // CHECKME: ARM7 timing behavior? } else { - //if (cpu->Num == 0 && cpu->DataRegion == Mem9_ITCM) cpu->NDS.ARM9Timestamp += 2; + if (cpu->Num == 0) ((ARMv5*)cpu)->DelayIfITCM(2); cpu->AddCycles_CD(); } @@ -1069,14 +1067,14 @@ void T_POP(ARM* cpu) if (__builtin_popcount(cpu->CurInstr & 0x1FF) == 1) [[unlikely]] // single reg { - //if (cpu->Num == 0 && cpu->DataRegion == Mem9_ITCM) cpu->NDS.ARM9Timestamp += 1; + if (cpu->Num == 0) ((ARMv5*)cpu)->DelayIfITCM(1); cpu->AddCycles_CDI(); if (cpu->Num == 0) ((ARMv5*)cpu)->ForceInterlock(); // on arm9 single reg ldm/stm cannot overlap memory and fetch stages else; // CHECKME: ARM7 timing behavior? } else { - //if (cpu->Num == 0 && cpu->DataRegion == Mem9_ITCM) cpu->NDS.ARM9Timestamp += 2; + if (cpu->Num == 0) ((ARMv5*)cpu)->DelayIfITCM(2); cpu->AddCycles_CDI(); } @@ -1099,14 +1097,14 @@ void T_POP(ARM* cpu) { if (__builtin_popcount(cpu->CurInstr & 0x1FF) == 1) [[unlikely]] // single reg { - //if (cpu->Num == 0 && cpu->DataRegion == Mem9_ITCM) cpu->NDS.ARM9Timestamp += 1; + if (cpu->Num == 0) ((ARMv5*)cpu)->DelayIfITCM(1); cpu->AddCycles_CDI(); if (cpu->Num == 0) ((ARMv5*)cpu)->ForceInterlock(); // on arm9 single reg ldm/stm cannot overlap memory and fetch stages else; // CHECKME: ARM7 timing behavior? 
} else { - //if (cpu->Num == 0 && cpu->DataRegion == Mem9_ITCM) cpu->NDS.ARM9Timestamp += 2; + if (cpu->Num == 0) ((ARMv5*)cpu)->DelayIfITCM(2); cpu->AddCycles_CDI(); } @@ -1160,14 +1158,14 @@ void T_STMIA(ARM* cpu) if (__builtin_popcount(cpu->CurInstr & 0xFF) == 1) [[unlikely]] // single reg { - //if (cpu->Num == 0 && cpu->DataRegion == Mem9_ITCM) cpu->NDS.ARM9Timestamp += 1; + if (cpu->Num == 0) ((ARMv5*)cpu)->DelayIfITCM(1); cpu->AddCycles_CD(); if (cpu->Num == 0) ((ARMv5*)cpu)->ForceInterlock(); // on arm9 single reg ldm/stm cannot overlap memory and fetch stages else; // CHECKME: ARM7 timing behavior? } else { - //if (cpu->Num == 0 && cpu->DataRegion == Mem9_ITCM) cpu->NDS.ARM9Timestamp += 2; + if (cpu->Num == 0) ((ARMv5*)cpu)->DelayIfITCM(2); cpu->AddCycles_CD(); } @@ -1210,14 +1208,14 @@ void T_LDMIA(ARM* cpu) if (__builtin_popcount(cpu->CurInstr & 0xFF) == 1) [[unlikely]] // single reg { - //if (cpu->Num == 0 && cpu->DataRegion == Mem9_ITCM) cpu->NDS.ARM9Timestamp += 1; + if (cpu->Num == 0) ((ARMv5*)cpu)->DelayIfITCM(1); cpu->AddCycles_CDI(); if (cpu->Num == 0) ((ARMv5*)cpu)->ForceInterlock(); // on arm9 single reg ldm/stm cannot overlap memory and fetch stages else; // CHECKME: ARM7 timing behavior? 
} else { - //if (cpu->Num == 0 && cpu->DataRegion == Mem9_ITCM) cpu->NDS.ARM9Timestamp += 2; + if (cpu->Num == 0) ((ARMv5*)cpu)->DelayIfITCM(2); cpu->AddCycles_CDI(); } From b40c6bc41d71714eedf96e000bedd66db302af19 Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Sun, 8 Dec 2024 00:19:43 -0500 Subject: [PATCH 238/306] implement write buffer --- src/ARM.cpp | 21 +- src/ARM.h | 45 +++- src/CP15.cpp | 718 ++++++++++++++++++++++++++++++++------------------- src/NDS.cpp | 104 +++++++- 4 files changed, 594 insertions(+), 294 deletions(-) diff --git a/src/ARM.cpp b/src/ARM.cpp index a7f19414..b2b10c63 100644 --- a/src/ARM.cpp +++ b/src/ARM.cpp @@ -198,15 +198,12 @@ void ARM::Reset() BreakReq = false; #endif - MainRAMTimestamp = 0; - memset(&MRTrack, 0, sizeof(MRTrack)); FuncQueueFill = 0; FuncQueueEnd = 0; FuncQueueProg = 0; FuncQueueActive = false; - ExecuteCycles = 0; // zorp JumpTo(ExceptionBase); @@ -748,6 +745,12 @@ void ARMv5::StartExec() else AddCycles_C(); } + QueueFunction(&ARMv5::WBCheck_2); +} + +void ARMv5::WBCheck_2() +{ + WriteBufferCheck(); } template @@ -756,7 +759,7 @@ void ARMv5::Execute() if constexpr (mode == CPUExecuteMode::InterpreterGDB) GdbCheckB(); - if (Halted) + if (!FuncQueueActive && Halted) { if (Halted == 2) { @@ -777,7 +780,6 @@ void ARMv5::Execute() else { NDS.ARM9Timestamp = NDS.ARM9Target; - WriteBufferCheck(); return; } } @@ -828,7 +830,7 @@ void ARMv5::Execute() if constexpr (mode == CPUExecuteMode::InterpreterGDB) GdbCheckC(); // gdb might throw a hissy fit about this change but idc - //printf("A:%i, F:%i, P:%i, E:%i, I:%08llX, P:%08X, 15:%08X\n", FuncQueueActive, FuncQueueFill, FuncQueueProg, FuncQueueEnd, CurInstr, PC, R[15]); + //printf("A9: A:%i, F:%i, P:%i, E:%i, I:%08llX, P:%08X, 15:%08X\n", FuncQueueActive, FuncQueueFill, FuncQueueProg, FuncQueueEnd, CurInstr, PC, R[15]); (this->*FuncQueue[FuncQueueProg])(); @@ -882,7 +884,6 @@ void ARMv5::Execute() //NDS.ARM9Timestamp += Cycles; //Cycles 
= 0; } - WriteBufferCheck(); if (Halted == 2) Halted = 0; @@ -938,7 +939,7 @@ void ARMv4::Execute() if constexpr (mode == CPUExecuteMode::InterpreterGDB) GdbCheckB(); - if (Halted) + if (!FuncQueueActive && Halted) { if (Halted == 2) { @@ -1008,8 +1009,8 @@ void ARMv4::Execute() if constexpr (mode == CPUExecuteMode::InterpreterGDB) GdbCheckC(); - //printf("A:%i, F:%i, P:%i, E:%i, I:%08llX, 15:%08X\n", FuncQueueActive, FuncQueueFill, FuncQueueProg, FuncQueueEnd, CurInstr, R[15]); - + //printf("A7: A:%i, F:%i, P:%i, E:%i, I:%08llX, 15:%08X\n", FuncQueueActive, FuncQueueFill, FuncQueueProg, FuncQueueEnd, CurInstr, R[15]); + (this->*FuncQueue[FuncQueueProg])(); if (FuncQueueActive) diff --git a/src/ARM.h b/src/ARM.h index 737b196f..7eb8e842 100644 --- a/src/ARM.h +++ b/src/ARM.h @@ -21,6 +21,7 @@ #include #include +#include #include "types.h" #include "MemRegion.h" @@ -54,12 +55,26 @@ enum class CPUExecuteMode : u32 #endif }; +enum class WBMode +{ + Check, + Force, + SingleBurst, + WaitEntry, +}; + enum class MainRAMType : u8 { Null = 0, Fetch, ICacheStream, DCacheStream, + WriteBufferCmds, // all write buffer commands must be above this one; wb cmds not strictly used for main ram + WBDrain, + WBWrite, + WBCheck, + WBWaitRead, + WBWaitWrite, }; // each one represents a bit in the field @@ -214,7 +229,6 @@ public: MemRegion CodeMem; - u64 MainRAMTimestamp; MainRAMTrackers MRTrack; u32 BranchAddr; @@ -493,7 +507,7 @@ public: */ void ICacheInvalidateAll(); - template inline bool WriteBufferHandle(); + template bool WriteBufferHandle(); template void WriteBufferCheck(); void WriteBufferWrite(u32 val, u8 flag, u32 addr = 0); void WriteBufferDrain(); @@ -724,18 +738,35 @@ public: void JumpTo_3B(); void JumpTo_3C(); void JumpTo_4(); + void CodeRead32_2(); + void ICacheLookup_2(); void DAbortHandle(); void DCacheFin8(); void DRead8_2(); + void DRead8_3(); void DCacheFin16(); void DRead16_2(); + void DRead16_3(); void DCacheFin32(); void DRead32_2(); + void DRead32_3(); void 
DRead32S_2(); + void DRead32S_3(); void DWrite8_2(); + void DWrite8_3(); void DWrite16_2(); + void DWrite16_3(); void DWrite32_2(); + void DWrite32_3(); void DWrite32S_2(); + void DWrite32S_3(); + void WBCheck_2(); + void DCacheLookup_2(); + void DCacheLookup_3(); + void DCClearAddr_2(); + void DCClearSetWay_2(); + void DCClearInvalidateAddr_2(); + void DCClearInvalidateSetWay_2(); void SetupInterlock_2(); void HandleInterlocksExecute_2(); void HandleInterlocksMemory_2(); @@ -806,11 +837,14 @@ public: u64 ITCMTimestamp; u64 TimestampMemory; - void (ARMv5::*FuncQueue[31])(void); + void (ARMv5::*FuncQueue[32])(void); + void (ARMv5::*DelayedQueue)(void); u32 PC; bool NullFetch; bool Store; s8 ITCMDelay; + u32 QueuedDCacheLine; + u32 CP15Queue; u8 ILCurrReg; u8 ILPrevReg; @@ -833,7 +867,9 @@ public: u8 WBWriting; // whether the buffer is actively trying to perform a write u32 WBCurAddr; // address the write buffer is currently writing to u64 WBCurVal; // current value being written; 0-31: val | 61-63: flag; 0 = byte ns; 1 = halfword ns; 2 = word ns; 3 = word s; 4 = address (invalid in this variable) + u32 WBAddrQueued[40]; u32 storeaddr[16]; // temp until i figure out why using the fifo address entries directly didn't work + u64 WBValQueued[40]; u64 WriteBufferFifo[16]; // 0-31: val | 61-63: flag; 0 = byte ns; 1 = halfword ns; 2 = word ns; 3 = word s; 4 = address u64 WBTimestamp; // current timestamp //u64 WBMainRAMDelay; // timestamp used to emulate the delay before the next main ram write can begin @@ -870,8 +906,7 @@ public: template void Execute(); - Platform::FileHandle* filey; - void (ARMv4::*FuncQueue[31])(void); + void (ARMv4::*FuncQueue[32])(void); bool Nonseq; void CodeRead16(u32 addr); diff --git a/src/CP15.cpp b/src/CP15.cpp index e15fb41b..979cbb38 100644 --- a/src/CP15.cpp +++ b/src/CP15.cpp @@ -454,6 +454,18 @@ bool ARMv5::ICacheLookup(const u32 addr) if (CP15BISTTestStateRegister & CP15_BIST_TR_DISABLE_ICACHE_LINEFILL) [[unlikely]] return false; + 
WriteBufferDrain(); + FetchAddr[16] = addr; + QueueFunction(&ARMv5::ICacheLookup_2); + return true; +} + +void ARMv5::ICacheLookup_2() +{ + u32 addr = FetchAddr[16]; + const u32 tag = (addr & ~(ICACHE_LINELENGTH - 1)); + const u32 id = ((addr >> ICACHE_LINELENGTH_LOG2) & (ICACHE_LINESPERSET-1)) << ICACHE_SETS_LOG2; + u32 line; if (CP15Control & CP15_CACHE_CR_ROUNDROBIN) [[likely]] @@ -491,8 +503,6 @@ bool ARMv5::ICacheLookup(const u32 addr) if (NDS.ARM9Timestamp < time) NDS.ARM9Timestamp = time; } - WriteBufferDrain(); - ICacheTags[line] = tag | (line & (ICACHE_SETS-1)) | CACHE_FLAG_VALID; // timing logic @@ -544,7 +554,6 @@ bool ARMv5::ICacheLookup(const u32 addr) } Store = false; DataRegion = Mem9_Null; - return true; } void ARMv5::ICacheInvalidateByAddr(const u32 addr) @@ -658,6 +667,7 @@ bool ARMv5::DCacheLookup(const u32 addr) DataRegion = Mem9_DCache; //Log(LogLevel::Debug, "DCache hit at %08lx returned %08x from set %i, line %i\n", addr, cacheLine[(addr & (DCACHE_LINELENGTH -1)) >> 2], set, id>>2); RetVal = cacheLine[(addr & (DCACHE_LINELENGTH -1)) >> 2]; + (this->*DelayedQueue)(); return true; } } @@ -676,7 +686,18 @@ bool ARMv5::DCacheLookup(const u32 addr) // BIST test State register (See arm946e-s Rev 1 technical manual, 2.3.15 "Register 15, test State Register") if (CP15BISTTestStateRegister & CP15_BIST_TR_DISABLE_DCACHE_LINEFILL) [[unlikely]] return false; + WriteBufferDrain(); // checkme? 
+ FetchAddr[16] = addr; + QueueFunction(&ARMv5::DCacheLookup_2); + return true; +} + +void ARMv5::DCacheLookup_2() +{ + u32 addr = FetchAddr[16]; + const u32 tag = (addr & ~(DCACHE_LINELENGTH - 1)); + const u32 id = ((addr >> DCACHE_LINELENGTH_LOG2) & (DCACHE_LINESPERSET-1)) << DCACHE_SETS_LOG2; u32 line; if (CP15Control & CP15_CACHE_CR_ROUNDROBIN) [[likely]] @@ -701,19 +722,26 @@ bool ARMv5::DCacheLookup(const u32 addr) u8 minSet = DCacheLockDown & (DCACHE_SETS-1); line = line | minSet; } - } + } line += id; - u32* ptr = (u32 *)&DCache[line << DCACHE_LINELENGTH_LOG2]; - - WriteBufferDrain(); // checkme? - #if !DISABLE_CACHEWRITEBACK // Before we fill the cacheline, we need to write back dirty content // Datacycles will be incremented by the required cycles to do so DCacheClearByASetAndWay(line & (DCACHE_SETS-1), line >> DCACHE_SETS_LOG2); #endif + + QueuedDCacheLine = line; + QueueFunction(&ARMv5::DCacheLookup_3); +} +void ARMv5::DCacheLookup_3() +{ + u32 addr = FetchAddr[16]; + const u32 tag = (addr & ~(DCACHE_LINELENGTH - 1)); + const u32 id = ((addr >> DCACHE_LINELENGTH_LOG2) & (DCACHE_LINESPERSET-1)) << DCACHE_SETS_LOG2; + u32 line = QueuedDCacheLine; + u32* ptr = (u32 *)&DCache[line << DCACHE_LINELENGTH_LOG2]; DCacheTags[line] = tag | (line & (DCACHE_SETS-1)) | CACHE_FLAG_VALID; // timing logic @@ -722,10 +750,12 @@ bool ARMv5::DCacheLookup(const u32 addr) { MRTrack.Type = MainRAMType::DCacheStream; MRTrack.Var = line; - FetchAddr[16] = addr; + if (CP15BISTTestStateRegister & CP15_BIST_TR_DISABLE_DCACHE_STREAMING) [[unlikely]] DCacheStreamPtr = 7; else DCacheStreamPtr = (addr & 0x1F) / 4; + + QueueFunction(DelayedQueue); } else { @@ -773,8 +803,8 @@ bool ARMv5::DCacheLookup(const u32 addr) } } RetVal = ptr[(addr & (DCACHE_LINELENGTH-1)) >> 2]; + (this->*DelayedQueue)(); } - return true; } bool ARMv5::DCacheWrite32(const u32 addr, const u32 val) @@ -1099,7 +1129,7 @@ void ARMv5::DCacheClearByASetAndWay(const u8 cacheSet, const u8 cacheLine) 
WriteBufferWrite(ptr[1], 3, tag+0x04); WriteBufferWrite(ptr[2], 3, tag+0x08); WriteBufferWrite(ptr[3], 3, tag+0x0C); - NDS.ARM9Timestamp += 4; //DataCycles += 5; CHECKME: does this function like a write does but with mcr? + //NDS.ARM9Timestamp += 4; //DataCycles += 5; CHECKME: does this function like a write does but with mcr? } if (DCacheTags[index] & CACHE_FLAG_DIRTY_UPPERHALF) // todo: check how this behaves when both fields need to be written { @@ -1117,7 +1147,7 @@ void ARMv5::DCacheClearByASetAndWay(const u8 cacheSet, const u8 cacheLine) WriteBufferWrite(ptr[5], 3, tag+0x14); WriteBufferWrite(ptr[6], 3, tag+0x18); WriteBufferWrite(ptr[7], 3, tag+0x1C); - NDS.ARM9Timestamp += 4; + //NDS.ARM9Timestamp += 4; } DCacheTags[index] &= ~(CACHE_FLAG_DIRTY_LOWERHALF | CACHE_FLAG_DIRTY_UPPERHALF); #endif @@ -1128,135 +1158,180 @@ bool ARMv5::IsAddressDCachable(const u32 addr) const return PU_Map[addr >> CP15_MAP_ENTRYSIZE_LOG2] & CP15_MAP_DCACHEABLE; } -template -inline bool ARMv5::WriteBufferHandle() +#define A9WENTLAST (!NDS.MainRAMLastAccess) +#define A7WENTLAST ( NDS.MainRAMLastAccess) +#define A9LAST false +#define A7LAST true +#define A9PRIORITY !(NDS.ExMemCnt[0] & 0x8000) +#define A7PRIORITY (NDS.ExMemCnt[0] & 0x8000) + +template +bool ARMv5::WriteBufferHandle() { - // handle write buffer writes - if (WBWriting) + while (true) { - // look up timings - // TODO: handle interrupted bursts? - u32 cycles; - switch (WBCurVal >> 61) + if (WBWriting) { - case 0: + if ((mode == WBMode::Check) && ((NDS.A9ContentionTS << NDS.ARM9ClockShift) > NDS.ARM9Timestamp)) return true; + // look up timings + // TODO: handle interrupted bursts? 
+ u32 cycles; + switch (WBCurVal >> 61) { - if (NDS.ARM9Regions[WBCurAddr>>14] == Mem9_MainRAM) + case 0: { - cycles = (4 << NDS.ARM9ClockShift) - 1; + if (NDS.ARM9Regions[WBCurAddr>>14] == Mem9_MainRAM) + { + if (NDS.A9ContentionTS < NDS.MainRAMTimestamp) { NDS.A9ContentionTS = NDS.MainRAMTimestamp; if (A7PRIORITY) return false; } + cycles = 4; + NDS.MainRAMTimestamp = NDS.A9ContentionTS + 9; + NDS.MainRAMLastAccess = A9LAST; + } + else cycles = (MemTimings[WBCurAddr>>14][0] - 5) >> NDS.ARM9ClockShift; // todo: twl timings + break; } - else cycles = MemTimings[WBCurAddr>>14][0] - 6; // todo: twl timings - break; - } - case 1: - { - if (NDS.ARM9Regions[WBCurAddr>>14] == Mem9_MainRAM) + case 1: { - cycles = (3 << NDS.ARM9ClockShift) - 1; + if (NDS.ARM9Regions[WBCurAddr>>14] == Mem9_MainRAM) + { + if (NDS.A9ContentionTS < NDS.MainRAMTimestamp) { NDS.A9ContentionTS = NDS.MainRAMTimestamp; if (A7PRIORITY) return false; } + NDS.MainRAMTimestamp = NDS.A9ContentionTS + 8; + cycles = 3; + NDS.MainRAMLastAccess = A9LAST; + } + else cycles = (MemTimings[WBCurAddr>>14][0] - 5) >> NDS.ARM9ClockShift; // todo: twl timings + break; } - else cycles = MemTimings[WBCurAddr>>14][0] - 6; // todo: twl timings - break; - } - case 2: - { - if (NDS.ARM9Regions[WBCurAddr>>14] == Mem9_MainRAM) + case 3: { - cycles = (4 << NDS.ARM9ClockShift) - 1; + if (NDS.ARM9Regions[WBCurAddr>>14] == Mem9_MainRAM) + { + if (A9WENTLAST) + { + NDS.MainRAMTimestamp += 2; + cycles = 2; + break; + } + } + else + { + cycles = MemTimings[WBCurAddr>>14][2] >> NDS.ARM9ClockShift; + break; + } + } + case 2: + { + if (NDS.ARM9Regions[WBCurAddr>>14] == Mem9_MainRAM) + { + if (NDS.A9ContentionTS < NDS.MainRAMTimestamp) { NDS.A9ContentionTS = NDS.MainRAMTimestamp; if (A7PRIORITY) return false; } + NDS.MainRAMTimestamp = NDS.A9ContentionTS + 9; + cycles = 4; + NDS.MainRAMLastAccess = A9LAST; + } + else cycles = (MemTimings[WBCurAddr>>14][1] - 5) >> NDS.ARM9ClockShift; // todo: twl timings + break; } - else cycles = 
MemTimings[WBCurAddr>>14][1] - 6; // todo: twl timings - break; } - case 3: + + NDS.A9ContentionTS += cycles; + WBReleaseTS = (NDS.A9ContentionTS << NDS.ARM9ClockShift) - 1; + if (NDS.ARM9Regions[WBCurAddr>>14] != Mem9_MainRAM && ((WBCurVal >> 61) != 3)) { - cycles = MemTimings[WBCurAddr>>14][2]; - break; + NDS.A9ContentionTS += 1; + WBTimestamp = WBReleaseTS + 2; // todo: twl timings } + else + { + WBTimestamp = WBReleaseTS; + } + if (WBWritePointer != 16 && (WriteBufferFifo[WBWritePointer] >> 61) != 3) WBInitialTS = WBTimestamp; + + switch (WBCurVal >> 61) + { + case 0: // byte + BusWrite8 (WBCurAddr, WBCurVal); + break; + case 1: // halfword + BusWrite16(WBCurAddr, WBCurVal); + break; + case 2: // word + case 3: + BusWrite32(WBCurAddr, WBCurVal); + break; + default: // invalid + Platform::Log(Platform::LogLevel::Warn, "WHY ARE WE TRYING TO WRITE NONSENSE VIA THE WRITE BUFFER! PANIC!!! Flag: %i\n", (u8)(WBCurVal >> 61)); + break; + } + + WBLastRegion = NDS.ARM9Regions[WBCurAddr>>14]; + WBWriting = false; + if ((mode == WBMode::SingleBurst) && ((WriteBufferFifo[WBWritePointer] >> 61) != 3)) return true; } - if ((NDS.ARM9Regions[WBCurAddr>>14] == Mem9_MainRAM) && (((MainRAMTimestamp + ((1< WBTimestamp)) - WBTimestamp = (MainRAMTimestamp + ((1< NDS.ARM9Timestamp) return true; - if ( force && ts > NDS.ARM9Timestamp) + // attempt to drain write buffer + if ((WriteBufferFifo[WBWritePointer] >> 61) != 4) // not an address { - NDS.ARM9Timestamp = ts; - } + if (WBInitialTS > NDS.ARM9Timestamp) + { + if (mode == WBMode::Check) return true; + else NDS.ARM9Timestamp = WBInitialTS; + } - if ((WBCurVal >> 61) != 3) - { - WBReleaseTS = WBTimestamp = (ts + ((1<>14] == Mem9_MainRAM) MainRAMTimestamp = ts + ((((WBCurVal >> 61) == 0) ? 
4 : 5) << NDS.ARM9ClockShift); - else WBTimestamp += 2; // todo: twl timings + //if ((WriteBufferFifo[WBWritePointer] >> 61) == 3) WBCurAddr+=4; // TODO + //if (storeaddr[WBWritePointer] != WBCurAddr) printf("MISMATCH: %08X %08X\n", storeaddr[WBWritePointer], WBCurAddr); + + WBCurAddr = storeaddr[WBWritePointer]; + WBCurVal = WriteBufferFifo[WBWritePointer]; + WBWriting = true; } else { - WBTimestamp = ts; - if (NDS.ARM9Regions[WBCurAddr>>14] == Mem9_MainRAM) MainRAMTimestamp += 2 << NDS.ARM9ClockShift; + //WBCurAddr = (u32)WriteBufferFifo[WBWritePointer]; // TODO } - WBInitialTS = WBTimestamp; - - switch (WBCurVal >> 61) + + WBWritePointer = (WBWritePointer + 1) & 0xF; + if (WBWritePointer == WBFillPointer) { - case 0: // byte - BusWrite8 (WBCurAddr, WBCurVal); - break; - case 1: // halfword - BusWrite16(WBCurAddr, WBCurVal); - break; - case 2: // word - case 3: - BusWrite32(WBCurAddr, WBCurVal); - break; - default: // invalid - Platform::Log(Platform::LogLevel::Warn, "WHY ARE WE TRYING TO WRITE AN ADDRESS VIA THE WRITE BUFFER! 
PANIC!!!\n", (u8)(WBCurVal >> 61)); - break; + WBWritePointer = 16; + WBFillPointer = 0; } - - WBLastRegion = NDS.ARM9Regions[WBCurAddr>>14]; - WBWriting = false; - if ((force == 2) && ((WriteBufferFifo[WBWritePointer] >> 61) != 3)) return true; + if ((mode == WBMode::WaitEntry) && (WBWritePointer != WBFillPointer)) return true; } - - // check if write buffer is empty - if (WBWritePointer == 16) return true; - // attempt to drain write buffer - if ((WriteBufferFifo[WBWritePointer] >> 61) != 4) // not an address - { - if (NDS.ARM9Regions[storeaddr[WBWritePointer]>>14] == Mem9_MainRAM) // main ram handling - { - if (!force && (WBTimestamp > NDS.ARM9Timestamp)) return true; - if ( force && (WBTimestamp > NDS.ARM9Timestamp)) - NDS.ARM9Timestamp = WBTimestamp; - - WBTimestamp = std::max(MainRAMTimestamp, WBTimestamp); - } - - WBTimestamp = (WBTimestamp + ((1<(); +template bool ARMv5::WriteBufferHandle(); +template bool ARMv5::WriteBufferHandle(); +template bool ARMv5::WriteBufferHandle(); + +#undef A9WENTLAST +#undef A7WENTLAST +#undef A9LAST +#undef A7LAST +#undef A9PRIORITY +#undef A7PRIORITY template void ARMv5::WriteBufferCheck() { + if ((WBWritePointer != 16) || WBWriting) + { + if constexpr (next == 0) + { + MRTrack.Type = MainRAMType::WBCheck; + } + else if constexpr (next == 2) + { + MRTrack.Type = MainRAMType::WBWaitWrite; + } + else + { + MRTrack.Type = MainRAMType::WBWaitRead; + } + } + /* while (!WriteBufferHandle<0>()); // loop until we've cleared out all writeable entries if constexpr (next == 1 || next == 3) // check if the next write is occuring @@ -1273,7 +1348,7 @@ void ARMv5::WriteBufferCheck() { //if (NDS.ARM9Timestamp >= WBInitialTS) while(!WriteBufferHandle<2>()); - } + }*/ } template void ARMv5::WriteBufferCheck<3>(); template void ARMv5::WriteBufferCheck<2>(); @@ -1282,7 +1357,26 @@ template void ARMv5::WriteBufferCheck<0>(); void ARMv5::WriteBufferWrite(u32 val, u8 flag, u32 addr) { - WriteBufferCheck<0>(); + MRTrack.Type = MainRAMType::WBWrite; 
+ WBAddrQueued[MRTrack.Var] = addr; + WBValQueued[MRTrack.Var++] = val | (u64)flag << 61; + /*switch (flag) + { + case 0: // byte + BusWrite8 (addr, val); + break; + case 1: // halfword + BusWrite16(addr, val); + break; + case 2: // word + case 3: + BusWrite32(addr, val); + break; + default: // invalid + //Platform::Log(Platform::LogLevel::Warn, "WHY ARE WE TRYING TO WRITE NONSENSE VIA THE WRITE BUFFER! PANIC!!! Flag: %i\n", (u8)(WBCurVal >> 61)); + break; + }*/ + /*WriteBufferCheck<0>(); if (WBFillPointer == WBWritePointer) // if the write buffer is full then we stall the cpu until room is made WriteBufferHandle<1>(); @@ -1302,12 +1396,14 @@ void ARMv5::WriteBufferWrite(u32 val, u8 flag, u32 addr) WriteBufferFifo[WBFillPointer] = val | (u64)flag << 61; storeaddr[WBFillPointer] = addr; - WBFillPointer = (WBFillPointer + 1) & 0xF; + WBFillPointer = (WBFillPointer + 1) & 0xF;*/ } void ARMv5::WriteBufferDrain() { - while (!WriteBufferHandle<1>()); // loop until drained fully + if ((WBWritePointer != 16) || WBWriting) + MRTrack.Type = MainRAMType::WBDrain; + //while (!WriteBufferHandle<1>()); // loop until drained fully } void ARMv5::CP15Write(u32 id, u32 val) @@ -1561,6 +1657,7 @@ void ARMv5::CP15Write(u32 id, u32 val) case 0x704: case 0x782: + //WriteBufferDrain(); // checkme Halt(1); return; @@ -1578,7 +1675,7 @@ void ARMv5::CP15Write(u32 id, u32 val) ICacheInvalidateByAddr(val); //Halt(255); return; - case 0x752: + /*case 0x752: // requires priv mode or causes UNKNOWN INSTRUCTION exception if (PU_Map != PU_PrivMap) { @@ -1592,7 +1689,7 @@ void ARMv5::CP15Write(u32 id, u32 val) } //Halt(255); return; - + */ case 0x760: // requires priv mode or causes UNKNOWN INSTRUCTION exception @@ -1612,7 +1709,7 @@ void ARMv5::CP15Write(u32 id, u32 val) DCacheInvalidateByAddr(val); //printf("inval data cache SI\n"); return; - case 0x762: + /*case 0x762: // requires priv mode or causes UNKNOWN INSTRUCTION exception if (PU_Map != PU_PrivMap) { @@ -1625,15 +1722,15 @@ void 
ARMv5::CP15Write(u32 id, u32 val) DCacheInvalidateBySetAndWay(cacheSet, cacheLine); } return; - - case 0x770: + */ + /*case 0x770: // invalidate both caches // can be called from user and privileged ICacheInvalidateAll(); DCacheInvalidateAll(); break; - - case 0x7A0: + */ + /*case 0x7A0: // requires priv mode or causes UNKNOWN INSTRUCTION exception if (PU_Map != PU_PrivMap) { @@ -1641,15 +1738,16 @@ void ARMv5::CP15Write(u32 id, u32 val) } //Log(LogLevel::Debug,"clean data cache\n"); DCacheClearAll(); - return; + return;*/ case 0x7A1: // requires priv mode or causes UNKNOWN INSTRUCTION exception if (PU_Map != PU_PrivMap) { return ARMInterpreter::A_UNK(this); } - //Log(LogLevel::Debug,"clean data cache MVA\n"); - DCacheClearByAddr(val); + //Log(LogLevel::Debug,"clean data cache MVA\n");= + CP15Queue = val; + QueueFunction(&ARMv5::DCClearAddr_2); return; case 0x7A2: //Log(LogLevel::Debug,"clean data cache SET/WAY\n"); @@ -1660,9 +1758,8 @@ void ARMv5::CP15Write(u32 id, u32 val) } else { // Cache invalidat by line number and set number - u8 cacheSet = val >> (32 - DCACHE_SETS_LOG2) & (DCACHE_SETS -1); - u8 cacheLine = (val >> DCACHE_LINELENGTH_LOG2) & (DCACHE_LINESPERSET -1); - DCacheClearByASetAndWay(cacheSet, cacheLine); + CP15Queue = val; + QueueFunction(&ARMv5::DCClearSetWay_2); } return; case 0x7A3: @@ -1678,7 +1775,7 @@ void ARMv5::CP15Write(u32 id, u32 val) case 0x7A4: // Can be used in user and privileged mode // Drain Write Buffer: Stall until all write back completed - WriteBufferDrain(); + QueueFunction(&ARMv5::WriteBufferDrain); return; case 0x7D1: @@ -1695,7 +1792,7 @@ void ARMv5::CP15Write(u32 id, u32 val) //ICacheLookup((val & ~0x03) | 0x1C); TODO: REIMPLEMENT WITH DEFERENCE return; - case 0x7E0: + /*case 0x7E0: //Log(LogLevel::Debug,"clean & invalidate data cache\n"); // requires priv mode or causes UNKNOWN INSTRUCTION exception if (PU_Map != PU_PrivMap) @@ -1704,7 +1801,7 @@ void ARMv5::CP15Write(u32 id, u32 val) } DCacheClearAll(); 
DCacheInvalidateAll(); - return; + return;*/ case 0x7E1: //Log(LogLevel::Debug,"clean & invalidate data cache MVA\n"); // requires priv mode or causes UNKNOWN INSTRUCTION exception @@ -1712,8 +1809,8 @@ void ARMv5::CP15Write(u32 id, u32 val) { return ARMInterpreter::A_UNK(this); } - DCacheClearByAddr(val); - DCacheInvalidateByAddr(val); + CP15Queue = val; + QueueFunction(&ARMv5::DCClearInvalidateAddr_2); return; case 0x7E2: //Log(LogLevel::Debug,"clean & invalidate data cache SET/WAY\n"); @@ -1724,13 +1821,11 @@ void ARMv5::CP15Write(u32 id, u32 val) } else { // Cache invalidat by line number and set number - u8 cacheSet = val >> (32 - DCACHE_SETS_LOG2) & (DCACHE_SETS -1); - u8 cacheLine = (val >> DCACHE_LINELENGTH_LOG2) & (DCACHE_LINESPERSET -1); - DCacheClearByASetAndWay(cacheSet, cacheLine); - DCacheInvalidateBySetAndWay(cacheSet, cacheLine); + CP15Queue = val; + QueueFunction(&ARMv5::DCClearInvalidateSetWay_2); } return; - + case 0x900: // requires priv mode or causes UNKNOWN INSTRUCTION exception if (PU_Map != PU_PrivMap) @@ -2023,6 +2118,35 @@ u32 ARMv5::CP15Read(const u32 id) const return 0; } +void ARMv5::DCClearAddr_2() +{ + u32 val = CP15Queue; + DCacheClearByAddr(val); +} + +void ARMv5::DCClearSetWay_2() +{ + u32 val = CP15Queue; + u8 cacheSet = val >> (32 - DCACHE_SETS_LOG2) & (DCACHE_SETS -1); + u8 cacheLine = (val >> DCACHE_LINELENGTH_LOG2) & (DCACHE_LINESPERSET -1); + DCacheClearByASetAndWay(cacheSet, cacheLine); +} + +void ARMv5::DCClearInvalidateAddr_2() +{ + u32 val = CP15Queue; + DCacheClearByAddr(val); + DCacheInvalidateByAddr(val); +} + +void ARMv5::DCClearInvalidateSetWay_2() +{ + u32 val = CP15Queue; + u8 cacheSet = val >> (32 - DCACHE_SETS_LOG2) & (DCACHE_SETS -1); + u8 cacheLine = (val >> DCACHE_LINELENGTH_LOG2) & (DCACHE_LINESPERSET -1); + DCacheClearByASetAndWay(cacheSet, cacheLine); + DCacheInvalidateBySetAndWay(cacheSet, cacheLine); +} // TCM are handled here. 
// TODO: later on, handle PU @@ -2070,16 +2194,24 @@ void ARMv5::CodeRead32(u32 addr) u64 time = DCacheStreamTimes[6] - 6; // checkme: minus 6? if (NDS.ARM9Timestamp < time) NDS.ARM9Timestamp = time; } - - u8 cycles = MemTimings[addr >> 14][1]; if (PU_Map[addr>>12] & 0x30) WriteBufferDrain(); else WriteBufferCheck<3>(); + FetchAddr[16] = addr; + QueueFunction(&ARMv5::CodeRead32_2); +} + +void ARMv5::CodeRead32_2() +{ + u32 addr = FetchAddr[16]; + NDS.ARM9Timestamp = NDS.ARM9Timestamp + ((1<> 14][1]; + if ((addr >> 24) == 0x02) { FetchAddr[16] = addr; @@ -2180,11 +2312,8 @@ void ARMv5::DRead8_2() { if (IsAddressDCachable(addr)) { - if (DCacheLookup(addr)) - { - QueueFunction(&ARMv5::DCacheFin8); - return; - } + DelayedQueue = &ARMv5::DCacheFin8; + if (DCacheLookup(addr)) return; } } #endif @@ -2202,6 +2331,15 @@ void ARMv5::DRead8_2() else WriteBufferCheck<1>(); + QueueFunction(&ARMv5::DRead8_3); +} + +void ARMv5::DRead8_3() +{ + u8 reg = __builtin_ctz(LDRRegs); + u32 addr = FetchAddr[reg]; + u32 dummy; u32* val = (LDRFailedRegs & (1<> 24) == 0x02) @@ -2288,11 +2426,8 @@ void ARMv5::DRead16_2() { if (IsAddressDCachable(addr)) { - if (DCacheLookup(addr)) - { - QueueFunction(&ARMv5::DCacheFin16); - return; - } + DelayedQueue = &ARMv5::DCacheFin16; + if (DCacheLookup(addr)) return; } } #endif @@ -2310,6 +2445,15 @@ void ARMv5::DRead16_2() else WriteBufferCheck<1>(); + QueueFunction(&ARMv5::DRead16_3); +} + +void ARMv5::DRead16_3() +{ + u8 reg = __builtin_ctz(LDRRegs); + u32 addr = FetchAddr[reg]; + u32 dummy; u32* val = (LDRFailedRegs & (1<> 24) == 0x02) @@ -2397,11 +2541,8 @@ void ARMv5::DRead32_2() { if (IsAddressDCachable(addr)) { - if (DCacheLookup(addr)) - { - QueueFunction(&ARMv5::DCacheFin32); - return; - } + DelayedQueue = &ARMv5::DCacheFin32; + if (DCacheLookup(addr)) return; } } #endif @@ -2419,6 +2560,15 @@ void ARMv5::DRead32_2() else WriteBufferCheck<1>(); + QueueFunction(&ARMv5::DRead32_3); +} + +void ARMv5::DRead32_3() +{ + u8 reg = 
__builtin_ctz(LDRRegs); + u32 addr = FetchAddr[reg]; + u32 dummy; u32* val = (LDRFailedRegs & (1<> 24) == 0x02) @@ -2492,11 +2642,8 @@ void ARMv5::DRead32S_2() { if (IsAddressDCachable(addr)) { - if (DCacheLookup(addr)) - { - QueueFunction(&ARMv5::DCacheFin32); - return; - } + DelayedQueue = &ARMv5::DCacheFin32; + if (DCacheLookup(addr)) return; } } #endif @@ -2514,6 +2661,15 @@ void ARMv5::DRead32S_2() else WriteBufferCheck<1>(); + QueueFunction(&ARMv5::DRead32S_3); +} + +void ARMv5::DRead32S_3() +{ + u8 reg = __builtin_ctz(LDRRegs); + u32 addr = FetchAddr[reg]; + u32 dummy; u32* val = (LDRFailedRegs & (1<>12] & (0x30))) { WriteBufferCheck<2>(); - - NDS.ARM9Timestamp = (NDS.ARM9Timestamp + ((1<> 24) == 0x02) - { - MRTrack.Type = MainRAMType::Fetch; - MRTrack.Var = MRWrite | MR8; - MRTrack.Progress = reg; - } - else - { - NDS.ARM9Timestamp += MemTimings[addr >> 14][0]; - DataCycles = 3<>14]; - - if (WBTimestamp < ((NDS.ARM9Timestamp + ((1<> 24) == 0x02) + { + MRTrack.Type = MainRAMType::Fetch; + MRTrack.Var = MRWrite | MR8; + MRTrack.Progress = reg; + } + else + { + NDS.ARM9Timestamp += MemTimings[addr >> 14][0]; + DataCycles = 3<>14]; + + if (WBTimestamp < ((NDS.ARM9Timestamp + ((1<>12] & 0x30)) { WriteBufferCheck<2>(); - - NDS.ARM9Timestamp = (NDS.ARM9Timestamp + ((1<> 24) == 0x02) - { - MRTrack.Type = MainRAMType::Fetch; - MRTrack.Var = MRWrite | MR16; - MRTrack.Progress = reg; - } - else - { - NDS.ARM9Timestamp += MemTimings[addr >> 14][0]; - DataCycles = NDS.ARM9ClockShift; - DataRegion = NDS.ARM9Regions[addr>>14]; - - if (WBTimestamp < ((NDS.ARM9Timestamp + ((1<> 24) == 0x02) + { + MRTrack.Type = MainRAMType::Fetch; + MRTrack.Var = MRWrite | MR16; + MRTrack.Progress = reg; + } + else + { + NDS.ARM9Timestamp += MemTimings[addr >> 14][0]; + DataCycles = NDS.ARM9ClockShift; + DataRegion = NDS.ARM9Regions[addr>>14]; + + if (WBTimestamp < ((NDS.ARM9Timestamp + ((1<>12] & 0x30)) { WriteBufferCheck<2>(); - - NDS.ARM9Timestamp = (NDS.ARM9Timestamp + ((1<> 24) == 
0x02) - { - MRTrack.Type = MainRAMType::Fetch; - MRTrack.Var = MRWrite | MR32; - MRTrack.Progress = reg; - } - else - { - NDS.ARM9Timestamp += MemTimings[addr >> 14][1]; - DataCycles = 3<>14]; - - if (WBTimestamp < ((NDS.ARM9Timestamp + ((1<> 24) == 0x02) + { + MRTrack.Type = MainRAMType::Fetch; + MRTrack.Var = MRWrite | MR32; + MRTrack.Progress = reg; + } + else + { + NDS.ARM9Timestamp += MemTimings[addr >> 14][1]; + DataCycles = 3<>14]; + + if (WBTimestamp < ((NDS.ARM9Timestamp + ((1<>12] & 0x30)) // non-bufferable { WriteBufferCheck<2>(); - - // bursts cannot cross a 1kb boundary - if (addr & 0x3FF) // s - { - if ((addr >> 24) == 0x02) - { - MRTrack.Type = MainRAMType::Fetch; - MRTrack.Var = MRWrite | MR32 | MRSequential; - MRTrack.Progress = reg; - } - else - { - NDS.ARM9Timestamp += DataCycles = MemTimings[addr>>14][2]; - DataRegion = NDS.ARM9Regions[addr>>14]; - - if (WBTimestamp < ((NDS.ARM9Timestamp + ((1<> 24) == 0x02) - { - MRTrack.Type = MainRAMType::Fetch; - MRTrack.Var = MRWrite | MR32; - MRTrack.Progress = reg; - } - else - { - NDS.ARM9Timestamp += MemTimings[addr>>14][1]; - DataCycles = 3 << NDS.ARM9ClockShift; - DataRegion = NDS.ARM9Regions[addr>>14]; - - if (WBTimestamp < ((NDS.ARM9Timestamp + ((1<> 24) == 0x02) + { + MRTrack.Type = MainRAMType::Fetch; + MRTrack.Var = MRWrite | MR32 | MRSequential; + MRTrack.Progress = reg; + } + else + { + NDS.ARM9Timestamp += DataCycles = MemTimings[addr>>14][2]; + DataRegion = NDS.ARM9Regions[addr>>14]; + + if (WBTimestamp < ((NDS.ARM9Timestamp + ((1<> 24) == 0x02) + { + MRTrack.Type = MainRAMType::Fetch; + MRTrack.Var = MRWrite | MR32; + MRTrack.Progress = reg; + } + else + { + NDS.ARM9Timestamp += MemTimings[addr>>14][1]; + DataCycles = 3 << NDS.ARM9ClockShift; + DataRegion = NDS.ARM9Regions[addr>>14]; + + if (WBTimestamp < ((NDS.ARM9Timestamp + ((1<()) return; + + if ((ARM9.WBWritePointer == 16) && !ARM9.WBWriting) + { + memset(&ARM9.MRTrack, 0, sizeof(ARM9.MRTrack)); + ConTSLock = false; + } + break; + } + + 
case MainRAMType::WBWrite: + { + if (!ARM9.WriteBufferHandle()) return; + + if (ARM9.WBWritePointer == ARM9.WBFillPointer) + { + if (!ARM9.WriteBufferHandle()) return; + } + else if (ARM9.WBWritePointer == 16) + { + ARM9.WBWritePointer = 0; + if (!ARM9.WBWriting) + { + u64 ts = (ARM9Timestamp + 1 + ((1<> 61) != 4) + { + ARM9Timestamp += ARM9.DataCycles = 1; + ARM9.WBDelay = ARM9Timestamp + 1; + } + + ARM9.MRTrack.Progress++; + if (ARM9.MRTrack.Progress >= ARM9.MRTrack.Var) + { + memset(&ARM9.MRTrack, 0, sizeof(ARM9.MRTrack)); + ConTSLock = false; + } + break; + } + + case MainRAMType::WBCheck: + { + if (!ARM9.WriteBufferHandle()) return; + + memset(&ARM9.MRTrack, 0, sizeof(ARM9.MRTrack)); + ConTSLock = false; + break; + } + + case MainRAMType::WBWaitRead: + { + if (!ARM9.WriteBufferHandle()) return; + + if (ARM9Timestamp >= ARM9.WBInitialTS) + { + if (!ARM9.WriteBufferHandle()) return; + if (ARM9Timestamp < ARM9.WBReleaseTS) ARM9Timestamp = ARM9.WBReleaseTS; + } + + memset(&ARM9.MRTrack, 0, sizeof(ARM9.MRTrack)); + ConTSLock = false; + break; + } + + case MainRAMType::WBWaitWrite: + { + if (!ARM9.WriteBufferHandle()) return; + + if (!ARM9.WriteBufferHandle()) return; + if (ARM9Timestamp < ARM9.WBReleaseTS) ARM9Timestamp = ARM9.WBReleaseTS; + + memset(&ARM9.MRTrack, 0, sizeof(ARM9.MRTrack)); + ConTSLock = false; + break; + } } } @@ -1121,12 +1206,12 @@ void NDS::MainRAMHandle() { if (!ConTSLock) { - A9ContentionTS = (ARM9Timestamp + ((1<> ARM9ClockShift; - if (ARM9.MRTrack.Type != MainRAMType::Null) - { - ConTSLock = true; - if (A9ContentionTS < MainRAMTimestamp) A9ContentionTS = MainRAMTimestamp; - } + if (ARM9.MRTrack.Type != MainRAMType::Null) ConTSLock = true; + + if (ARM9.MRTrack.Type > MainRAMType::WriteBufferCmds) + A9ContentionTS = (ARM9.WBTimestamp + ((1<> ARM9ClockShift; + else + A9ContentionTS = (ARM9Timestamp + ((1<> ARM9ClockShift; } if (A7PRIORITY) @@ -1261,14 +1346,14 @@ u32 NDS::RunFrame() } else if (ARM9.MRTrack.Type == MainRAMType::Null) { - if 
(ARM9.abt) ARM9Timestamp = ARM9Target; + //if (ARM9.abt) ARM9Timestamp = ARM9Target; ARM9.Execute(); } - //printf("MAIN LOOP: 9 %lli %08X %08llX 7 %lli %08X %08llX %i %08X\n", ARM9Timestamp>>ARM9ClockShift, ARM9.PC, ARM9.CurInstr, ARM7Timestamp, ARM7.R[15], ARM7.CurInstr, IME[1], IE[1]); + //printf("MAIN LOOP: 9 %lli %08X %08llX %i 7 %lli %08X %08llX %i %i %08X\n", ARM9Timestamp>>ARM9ClockShift, ARM9.PC, ARM9.CurInstr, (u8)ARM9.MRTrack.Type, ARM7Timestamp, ARM7.R[15], ARM7.CurInstr, (u8)ARM7.MRTrack.Type, IME[1], IE[1]); MainRAMHandle(); - + RunTimers(0); GPU.GPU3D.Run(); @@ -1326,6 +1411,7 @@ u32 NDS::RunFrame() SPU.TransferOutput(); break; } + //printf("MAIN LOOP: 9 %lli %08X %08llX %i 7 %lli %08X %08llX %i %i %08X\n", ARM9Timestamp>>ARM9ClockShift, ARM9.PC, ARM9.CurInstr, (u8)ARM9.MRTrack.Type, ARM7Timestamp, ARM7.R[15], ARM7.CurInstr, (u8)ARM7.MRTrack.Type, IME[1], IE[1]); // In the context of TASes, frame count is traditionally the primary measure of emulated time, // so it needs to be tracked even if NDS is powered off. 
From 68b4d96f0d2db03f5374ec3ec83e11a4f26fc3e2 Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Sun, 8 Dec 2024 00:25:44 -0500 Subject: [PATCH 239/306] Queue ICache Prefetch --- src/ARM.h | 1 + src/CP15.cpp | 9 +++++++-- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/src/ARM.h b/src/ARM.h index 7eb8e842..3629b7a8 100644 --- a/src/ARM.h +++ b/src/ARM.h @@ -761,6 +761,7 @@ public: void DWrite32S_2(); void DWrite32S_3(); void WBCheck_2(); + void ICachePrefetch_2(); void DCacheLookup_2(); void DCacheLookup_3(); void DCClearAddr_2(); diff --git a/src/CP15.cpp b/src/CP15.cpp index 979cbb38..a00a5c37 100644 --- a/src/CP15.cpp +++ b/src/CP15.cpp @@ -1788,8 +1788,8 @@ void ARMv5::CP15Write(u32 id, u32 val) // we force a fill by looking up the value from cache // if it wasn't cached yet, it will be loaded into cache // low bits are set to 0x1C to trick cache streaming - printf("PREFETCH ICACHE\n"); - //ICacheLookup((val & ~0x03) | 0x1C); TODO: REIMPLEMENT WITH DEFERENCE + CP15Queue = val; + QueueFunction(&ARMv5::ICachePrefetch_2); return; /*case 0x7E0: @@ -2117,6 +2117,11 @@ u32 ARMv5::CP15Read(const u32 id) const Log(LogLevel::Debug, "unknown CP15 read op %04X\n", id); return 0; } +void ARMv5::ICachePrefetch_2() +{ + u32 val = CP15Queue; + ICacheLookup((val & ~0x03) | 0x1C); +} void ARMv5::DCClearAddr_2() { From e69a2aa1b52e665696064fdaa1ee7da0aa48d959 Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Sun, 8 Dec 2024 09:05:33 -0500 Subject: [PATCH 240/306] write buffer shouldn't continue resolving main ram accesses if it passes the a7 ts --- src/CP15.cpp | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/CP15.cpp b/src/CP15.cpp index a00a5c37..841b1c9e 100644 --- a/src/CP15.cpp +++ b/src/CP15.cpp @@ -1182,6 +1182,8 @@ bool ARMv5::WriteBufferHandle() { if (NDS.ARM9Regions[WBCurAddr>>14] == Mem9_MainRAM) { + if (A7PRIORITY) { if (NDS.A9ContentionTS >= NDS.ARM7Timestamp) 
return false; } + else { if (NDS.A9ContentionTS > NDS.ARM7Timestamp) return false; } if (NDS.A9ContentionTS < NDS.MainRAMTimestamp) { NDS.A9ContentionTS = NDS.MainRAMTimestamp; if (A7PRIORITY) return false; } cycles = 4; NDS.MainRAMTimestamp = NDS.A9ContentionTS + 9; @@ -1194,6 +1196,8 @@ bool ARMv5::WriteBufferHandle() { if (NDS.ARM9Regions[WBCurAddr>>14] == Mem9_MainRAM) { + if (A7PRIORITY) { if (NDS.A9ContentionTS >= NDS.ARM7Timestamp) return false; } + else { if (NDS.A9ContentionTS > NDS.ARM7Timestamp) return false; } if (NDS.A9ContentionTS < NDS.MainRAMTimestamp) { NDS.A9ContentionTS = NDS.MainRAMTimestamp; if (A7PRIORITY) return false; } NDS.MainRAMTimestamp = NDS.A9ContentionTS + 8; cycles = 3; @@ -1206,6 +1210,8 @@ bool ARMv5::WriteBufferHandle() { if (NDS.ARM9Regions[WBCurAddr>>14] == Mem9_MainRAM) { + if (A7PRIORITY) { if (NDS.A9ContentionTS >= NDS.ARM7Timestamp) return false; } + else { if (NDS.A9ContentionTS > NDS.ARM7Timestamp) return false; } if (A9WENTLAST) { NDS.MainRAMTimestamp += 2; @@ -1223,6 +1229,8 @@ bool ARMv5::WriteBufferHandle() { if (NDS.ARM9Regions[WBCurAddr>>14] == Mem9_MainRAM) { + if (A7PRIORITY) { if (NDS.A9ContentionTS >= NDS.ARM7Timestamp) return false; } + else { if (NDS.A9ContentionTS > NDS.ARM7Timestamp) return false; } if (NDS.A9ContentionTS < NDS.MainRAMTimestamp) { NDS.A9ContentionTS = NDS.MainRAMTimestamp; if (A7PRIORITY) return false; } NDS.MainRAMTimestamp = NDS.A9ContentionTS + 9; cycles = 4; From 8209fdebb44479e6ed8020dfdf6b7215bfda9a3c Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Sun, 8 Dec 2024 10:02:25 -0500 Subject: [PATCH 241/306] fix main ram timestamp i hate order of operations --- src/NDS.cpp | 30 ++++++++++++++++-------------- 1 file changed, 16 insertions(+), 14 deletions(-) diff --git a/src/NDS.cpp b/src/NDS.cpp index e1daa7a0..de171a46 100644 --- a/src/NDS.cpp +++ b/src/NDS.cpp @@ -936,12 +936,12 @@ void NDS::MainRAMHandleARM9() { if (A9ContentionTS < 
MainRAMTimestamp) { A9ContentionTS = MainRAMTimestamp; if (A7PRIORITY) return; } - MainRAMTimestamp = A9ContentionTS + (var & MR16) ? 8 : 9; // checkme: are these correct for 8bit? - if (var & MRWrite) A9ContentionTS += (var & MR16) ? 5 : 6; // checkme: is this correct for 133mhz? + MainRAMTimestamp = A9ContentionTS + ((var & MR16) ? 8 : 9); // checkme: are these correct for 8bit? + if (var & MRWrite) A9ContentionTS += ((var & MR16) ? 5 : 6); // checkme: is this correct for 133mhz? else { - if (ARM9ClockShift == 1) A9ContentionTS += (var & MR16) ? 8 : 9; - else A9ContentionTS += (var & MR16) ? 7 : 8; + if (ARM9ClockShift == 1) A9ContentionTS += ((var & MR16) ? 8 : 9); + else A9ContentionTS += ((var & MR16) ? 7 : 8); ARM9.DataCycles = 3 << ARM9ClockShift; } MainRAMLastAccess = A9LAST; @@ -967,7 +967,7 @@ void NDS::MainRAMHandleARM9() else // read { u32 dummy; - u32* val = (ARM9.LDRFailedRegs & (1< Date: Sun, 8 Dec 2024 11:19:49 -0500 Subject: [PATCH 242/306] only recalc mpu lut if it changed --- src/CP15.cpp | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/src/CP15.cpp b/src/CP15.cpp index 841b1c9e..64ed0424 100644 --- a/src/CP15.cpp +++ b/src/CP15.cpp @@ -1459,7 +1459,7 @@ void ARMv5::CP15Write(u32 id, u32 val) if (diff & (1<> 4) & 0xF]; PU_Region[(id >> 4) & 0xF] = val & ~(0x3F<<6); + u32 diff = old ^ PU_Region[(id >> 4) & 0xF]; std::snprintf(log_output, sizeof(log_output), @@ -1659,7 +1663,7 @@ void ARMv5::CP15Write(u32 id, u32 val) (val & 0x3E) >> 1 ); // TODO: smarter region update for this? 
- UpdatePURegions(true); + if (diff) UpdatePURegions(true); return; From 8e6755ce2c1c8a9a34bfc8353f138624b4a811b9 Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Sun, 8 Dec 2024 11:20:36 -0500 Subject: [PATCH 243/306] jakly pls --- src/CP15.cpp | 34 ++++++++++++++++++---------------- 1 file changed, 18 insertions(+), 16 deletions(-) diff --git a/src/CP15.cpp b/src/CP15.cpp index 64ed0424..c19c32e9 100644 --- a/src/CP15.cpp +++ b/src/CP15.cpp @@ -1648,23 +1648,25 @@ void ARMv5::CP15Write(u32 id, u32 val) case 0x661: case 0x670: case 0x671: - char log_output[1024]; - u32 old = PU_Region[(id >> 4) & 0xF]; - PU_Region[(id >> 4) & 0xF] = val & ~(0x3F<<6); - u32 diff = old ^ PU_Region[(id >> 4) & 0xF]; + { + char log_output[1024]; + u32 old = PU_Region[(id >> 4) & 0xF]; + PU_Region[(id >> 4) & 0xF] = val & ~(0x3F<<6); + u32 diff = old ^ PU_Region[(id >> 4) & 0xF]; - std::snprintf(log_output, - sizeof(log_output), - "PU: region %d = %08X : %s, start: %08X size: %02X\n", - (id >> 4) & 0xF, - val, - val & 1 ? "enabled" : "disabled", - val & 0xFFFFF000, - (val & 0x3E) >> 1 - ); - // TODO: smarter region update for this? - if (diff) UpdatePURegions(true); - return; + std::snprintf(log_output, + sizeof(log_output), + "PU: region %d = %08X : %s, start: %08X size: %02X\n", + (id >> 4) & 0xF, + val, + val & 1 ? "enabled" : "disabled", + val & 0xFFFFF000, + (val & 0x3E) >> 1 + ); + // TODO: smarter region update for this? 
+ if (diff) UpdatePURegions(true); + return; + } case 0x704: From 91752c192523c87bfd7dacde15682ab11a577145 Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Sun, 8 Dec 2024 14:24:33 -0500 Subject: [PATCH 244/306] fix emulator hanging under certain circumstances --- src/NDS.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/NDS.cpp b/src/NDS.cpp index de171a46..53d9275a 100644 --- a/src/NDS.cpp +++ b/src/NDS.cpp @@ -1359,10 +1359,10 @@ u32 NDS::RunFrame() RunTimers(0); GPU.GPU3D.Run(); - target = ARM9Timestamp >> ARM9ClockShift; + target = (ARM9.MRTrack.Type == MainRAMType::Null) ? (ARM9Timestamp >> ARM9ClockShift) : ARM7Timestamp + 1; CurCPU = 1; - while (((ARM7Timestamp < target) && (ARM7.MRTrack.Type == MainRAMType::Null)) || (ARM9.MRTrack.Type != MainRAMType::Null)) + while ((ARM7Timestamp < target) && (ARM7.MRTrack.Type == MainRAMType::Null)) { ARM7Target = (ARM9.MRTrack.Type != MainRAMType::Null) ? (ARM7Timestamp+1) : target; // might be changed by a reschedule From 0df4369305c666f5ddd12c432cfffa630f1ae179 Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Sun, 8 Dec 2024 15:25:18 -0500 Subject: [PATCH 245/306] tweak scheduler for better performance might be less accurate --- src/NDS.cpp | 120 +++++++++++++++++++++++++++------------------------- src/NDS.h | 2 +- 2 files changed, 64 insertions(+), 58 deletions(-) diff --git a/src/NDS.cpp b/src/NDS.cpp index 53d9275a..d5a2d0d4 100644 --- a/src/NDS.cpp +++ b/src/NDS.cpp @@ -1204,7 +1204,7 @@ void NDS::MainRAMHandleARM7() } } -void NDS::MainRAMHandle() +bool NDS::MainRAMHandle() { if (!ConTSLock) { @@ -1222,12 +1222,12 @@ void NDS::MainRAMHandle() { if (A9ContentionTS < ARM7Timestamp) { - if (ARM9.MRTrack.Type == MainRAMType::Null) return; + if (ARM9.MRTrack.Type == MainRAMType::Null) return 0; MainRAMHandleARM9(); } else { - if (ARM7.MRTrack.Type == MainRAMType::Null) return; + if (ARM7.MRTrack.Type == 
MainRAMType::Null) return 1; MainRAMHandleARM7(); } } @@ -1238,12 +1238,12 @@ void NDS::MainRAMHandle() { if (A9ContentionTS <= ARM7Timestamp) { - if (ARM9.MRTrack.Type == MainRAMType::Null) return; + if (ARM9.MRTrack.Type == MainRAMType::Null) return 0; MainRAMHandleARM9(); } else { - if (ARM7.MRTrack.Type == MainRAMType::Null) return; + if (ARM7.MRTrack.Type == MainRAMType::Null) return 1; MainRAMHandleARM7(); } } @@ -1315,81 +1315,87 @@ u32 NDS::RunFrame() while (Running && GPU.TotalScanlines==0) { u64 target = NextTarget(); + ARM9Target = target << ARM9ClockShift; CurCPU = 0; - if (CPUStop & CPUStop_GXStall) + while (ARM9Timestamp < ARM9Target) { - // GXFIFO stall - s32 cycles = GPU.GPU3D.CyclesToRunFor(); - - ARM9Timestamp = std::min(ARM9Target, ARM9Timestamp+(cycles<(*this); - dsi.RunNDMAs(0); - } - ts = ARM9Timestamp - ts; - for (int i = 0; i < 7; i++) - { - ARM9.ICacheStreamTimes[i] += ts; - ARM9.DCacheStreamTimes[i] += ts; - } - ARM9.WBTimestamp += ts; + if (CPUStop & CPUStop_GXStall) + { + // GXFIFO stall + s32 cycles = GPU.GPU3D.CyclesToRunFor(); - } - else if (ARM9.MRTrack.Type == MainRAMType::Null) - { - //if (ARM9.abt) ARM9Timestamp = ARM9Target; - ARM9.Execute(); - } + ARM9Timestamp = std::min(ARM9Target, ARM9Timestamp+(cycles<(*this); + dsi.RunNDMAs(0); + } + ts = ARM9Timestamp - ts; + for (int i = 0; i < 7; i++) + { + ARM9.ICacheStreamTimes[i] += ts; + ARM9.DCacheStreamTimes[i] += ts; + } + ARM9.WBTimestamp += ts; + } + else + { + //if (ARM9.abt) ARM9Timestamp = ARM9Target; + ARM9.Execute(); + } + } - //printf("MAIN LOOP: 9 %lli %08X %08llX %i 7 %lli %08X %08llX %i %i %08X\n", ARM9Timestamp>>ARM9ClockShift, ARM9.PC, ARM9.CurInstr, (u8)ARM9.MRTrack.Type, ARM7Timestamp, ARM7.R[15], ARM7.CurInstr, (u8)ARM7.MRTrack.Type, IME[1], IE[1]); + //printf("MAIN LOOP: 9 %lli %08X %08llX %i 7 %lli %08X %08llX %i %i %08X\n", ARM9Timestamp>>ARM9ClockShift, ARM9.PC, ARM9.CurInstr, (u8)ARM9.MRTrack.Type, ARM7Timestamp, ARM7.R[15], ARM7.CurInstr, 
(u8)ARM7.MRTrack.Type, IME[1], IE[1]); - MainRAMHandle(); + if (MainRAMHandle()) break; + } RunTimers(0); GPU.GPU3D.Run(); - target = (ARM9.MRTrack.Type == MainRAMType::Null) ? (ARM9Timestamp >> ARM9ClockShift) : ARM7Timestamp + 1; + ARM7Target = target; CurCPU = 1; - while ((ARM7Timestamp < target) && (ARM7.MRTrack.Type == MainRAMType::Null)) + while (ARM7Timestamp < ARM7Target) { - ARM7Target = (ARM9.MRTrack.Type != MainRAMType::Null) ? (ARM7Timestamp+1) : target; // might be changed by a reschedule - //printf("A7 LOOP: %lli %lli\n", ARM9Timestamp>>ARM9ClockShift, ARM7Timestamp); - if (CPUStop & CPUStop_DMA7) + if (ARM7.MRTrack.Type == MainRAMType::Null) { - DMAs[4].Run(); - DMAs[5].Run(); - DMAs[6].Run(); - DMAs[7].Run(); - if (ConsoleType == 1) + if (CPUStop & CPUStop_DMA7) { - auto& dsi = dynamic_cast(*this); - dsi.RunNDMAs(1); + DMAs[4].Run(); + DMAs[5].Run(); + DMAs[6].Run(); + DMAs[7].Run(); + if (ConsoleType == 1) + { + auto& dsi = dynamic_cast(*this); + dsi.RunNDMAs(1); + } + } + else + { + //if (ARM7.abt > 16) ARM7Timestamp = ARM7Target; + ARM7.Execute(); } } - else if (ARM7.MRTrack.Type == MainRAMType::Null) - { - //if (ARM7.abt > 16) ARM7Timestamp = ARM7Target; - ARM7.Execute(); - } - MainRAMHandle(); - - RunTimers(1); + if (!MainRAMHandle()) break; } + RunTimers(1); RunSystem(target); diff --git a/src/NDS.h b/src/NDS.h index d0377fb1..1c1c487f 100644 --- a/src/NDS.h +++ b/src/NDS.h @@ -403,7 +403,7 @@ public: // TODO: Encapsulate the rest of these members void MainRAMHandleARM9(); void MainRAMHandleARM7(); - void MainRAMHandle(); + bool MainRAMHandle(); u32 RunFrame(); From 1a1934df00cb63a1969d386f0148482b836c72f9 Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Sun, 8 Dec 2024 19:24:19 -0500 Subject: [PATCH 246/306] ...removing the (s32) fixes sign extension? ig??? 
--- src/ARM.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/ARM.h b/src/ARM.h index 3629b7a8..80b492cb 100644 --- a/src/ARM.h +++ b/src/ARM.h @@ -773,8 +773,8 @@ public: void HandleInterlocksMemory_2(); void ForceInterlock_2(); void QueueUpdateMode() { UpdateMode(QueueMode[0], QueueMode[1], true); } - void SignExtend8() { R[ExtReg] = (s32)(s8)R[ExtReg]; } - void SignExtend16() { R[ExtReg] = (s32)(s16)R[ExtReg]; } + void SignExtend8() { R[ExtReg] = (s8)R[ExtReg]; } + void SignExtend16() { R[ExtReg] = (s16)R[ExtReg]; } void ROR32() { R[ExtReg] = ROR(R[ExtReg], ExtROROffs); } @@ -950,9 +950,9 @@ public: void AddExecute(); void AddExtraCycle(); void QueueUpdateMode() { UpdateMode(QueueMode[0], QueueMode[1], true); } - void SignExtend8() { R[ExtReg] = (s32)(s8)R[ExtReg]; } - void SignExtend16() { R[ExtReg] = (s32)(s16)R[ExtReg]; } - void ROR32() { R[ExtReg] = ROR(R[ExtReg], ExtROROffs); } + void SignExtend8() { if (!(LDRFailedRegs & 1< Date: Sun, 8 Dec 2024 19:48:46 -0500 Subject: [PATCH 247/306] fix writeback when rn is also rd in ldr something *has* to rely on this, as stupid as it seems --- src/ARMInterpreter_LoadStore.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ARMInterpreter_LoadStore.cpp b/src/ARMInterpreter_LoadStore.cpp index 658ab4c8..f4a8494f 100644 --- a/src/ARMInterpreter_LoadStore.cpp +++ b/src/ARMInterpreter_LoadStore.cpp @@ -157,7 +157,7 @@ void LoadSingle(ARM* cpu, const u8 rd, const u8 rn, const s32 offset, const u16 { if (rn != 15) [[likely]] // r15 writeback fails on arm9 { - cpu->R[rn] = addr; + if (rd != rn) cpu->R[rn] = addr; } else if (cpu->Num == 1) // arm 7 { From f823a920203eeea880c14d875b62d59881ff2431 Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Sun, 8 Dec 2024 22:41:32 -0500 Subject: [PATCH 248/306] fix branches being able to break the queue system fixes bw2 --- src/ARM.cpp | 22 +++++++++------------- src/ARM.h | 2 +- src/CP15.cpp | 5 
+++++ 3 files changed, 15 insertions(+), 14 deletions(-) diff --git a/src/ARM.cpp b/src/ARM.cpp index b2b10c63..96e97ce3 100644 --- a/src/ARM.cpp +++ b/src/ARM.cpp @@ -331,7 +331,7 @@ void ARMv5::JumpTo(u32 addr, bool restorecpsr, u8 R15) { //printf("JUMP! %08X %i %i\n", addr, restorecpsr, R15); NDS.MonitorARM9Jump(addr); - + BranchRestore = restorecpsr; BranchUpdate = R15; BranchAddr = addr; @@ -382,15 +382,13 @@ void ARMv5::JumpTo_2() // doesn't matter if we put garbage in the MSbs there if (BranchAddr & 0x2) { + DelayedQueue = &ARMv5::JumpTo_3A; CodeRead32(BranchAddr-2); - - QueueFunction(&ARMv5::JumpTo_3A); } else { + DelayedQueue = &ARMv5::JumpTo_3B; CodeRead32(BranchAddr); - - QueueFunction(&ARMv5::JumpTo_3B); } } else @@ -399,19 +397,17 @@ void ARMv5::JumpTo_2() R[15] = BranchAddr+4; CPSR &= ~0x20; - + + DelayedQueue = &ARMv5::JumpTo_3C; CodeRead32(BranchAddr); - - QueueFunction(&ARMv5::JumpTo_3C); } } void ARMv5::JumpTo_3A() { NextInstr[0] = RetVal >> 16; + DelayedQueue = &ARMv5::JumpTo_4; CodeRead32(BranchAddr+2); - - QueueFunction(&ARMv5::JumpTo_4); } void ARMv5::JumpTo_3B() @@ -423,9 +419,8 @@ void ARMv5::JumpTo_3B() void ARMv5::JumpTo_3C() { NextInstr[0] = RetVal; + DelayedQueue = &ARMv5::JumpTo_4; CodeRead32(BranchAddr+4); - - QueueFunction(&ARMv5::JumpTo_4); } void ARMv5::JumpTo_4() @@ -1377,12 +1372,13 @@ void ARMv5::CodeFetch() if (NDS.ARM9Timestamp < TimestampMemory) NDS.ARM9Timestamp = TimestampMemory; Store = false; DataRegion = Mem9_Null; + QueueFunction(&ARMv5::AddExecute); } else { + DelayedQueue = &ARMv5::AddExecute; CodeRead32(PC); } - QueueFunction(&ARMv5::AddExecute); } void ARMv5::AddExecute() diff --git a/src/ARM.h b/src/ARM.h index 80b492cb..c23c2254 100644 --- a/src/ARM.h +++ b/src/ARM.h @@ -839,7 +839,7 @@ public: u64 ITCMTimestamp; u64 TimestampMemory; void (ARMv5::*FuncQueue[32])(void); - void (ARMv5::*DelayedQueue)(void); + void (ARMv5::*DelayedQueue)(void); // adding more than one new entry to the queue while it's already active 
does not work. so uh. we use this to work around that. it's less than ideal... u32 PC; bool NullFetch; bool Store; diff --git a/src/CP15.cpp b/src/CP15.cpp index c19c32e9..c7b076cb 100644 --- a/src/CP15.cpp +++ b/src/CP15.cpp @@ -443,6 +443,7 @@ bool ARMv5::ICacheLookup(const u32 addr) Store = false; RetVal = cacheLine[(addr & (ICACHE_LINELENGTH -1)) / 4]; + QueueFunction(DelayedQueue); return true; } } @@ -554,6 +555,7 @@ void ARMv5::ICacheLookup_2() } Store = false; DataRegion = Mem9_Null; + QueueFunction(DelayedQueue); } void ARMv5::ICacheInvalidateByAddr(const u32 addr) @@ -2181,6 +2183,7 @@ void ARMv5::CodeRead32(u32 addr) DataRegion = Mem9_Null; Store = false; RetVal = ((u64)1<<63); + QueueFunction(DelayedQueue); return; } @@ -2192,6 +2195,7 @@ void ARMv5::CodeRead32(u32 addr) DataRegion = Mem9_Null; Store = false; RetVal = *(u32*)&ITCM[addr & (ITCMPhysicalSize - 1)]; + QueueFunction(DelayedQueue); return; } @@ -2253,6 +2257,7 @@ void ARMv5::CodeRead32_2() Store = false; DataRegion = Mem9_Null; + QueueFunction(DelayedQueue); return; } From aa2cdc37a181962fb159d10566f969d0444ccc31 Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Sun, 8 Dec 2024 23:10:53 -0500 Subject: [PATCH 249/306] optimize one of the main loops --- src/ARM.cpp | 219 ++++++++++++++++++++++++---------------------------- 1 file changed, 103 insertions(+), 116 deletions(-) diff --git a/src/ARM.cpp b/src/ARM.cpp index 96e97ce3..0a52c1d7 100644 --- a/src/ARM.cpp +++ b/src/ARM.cpp @@ -822,52 +822,65 @@ void ARMv5::Execute() else #endif { - if constexpr (mode == CPUExecuteMode::InterpreterGDB) - GdbCheckC(); // gdb might throw a hissy fit about this change but idc - - //printf("A9: A:%i, F:%i, P:%i, E:%i, I:%08llX, P:%08X, 15:%08X\n", FuncQueueActive, FuncQueueFill, FuncQueueProg, FuncQueueEnd, CurInstr, PC, R[15]); - - (this->*FuncQueue[FuncQueueProg])(); - if (FuncQueueActive) { - if (FuncQueueFill == FuncQueueProg) + while (FuncQueueActive) { - // we did 
not get a new addition to the queue; increment and reset ptrs - FuncQueueFill = ++FuncQueueProg; + (this->*FuncQueue[FuncQueueProg])(); - // check if we're done with the queue, if so, reset everything - if (FuncQueueProg >= FuncQueueEnd) + if (FuncQueueFill == FuncQueueProg) { + // we did not get a new addition to the queue; increment and reset ptrs + FuncQueueFill = ++FuncQueueProg; + + // check if we're done with the queue, if so, reset everything + if (FuncQueueProg >= FuncQueueEnd) + { + FuncQueueFill = 0; + FuncQueueProg = 0; + FuncQueueEnd = 0; + FuncQueueActive = false; + FuncQueue[0] = &ARMv5::StartExec; + } + } + else + { + // we got a new addition to the list; redo the current entry and exit to resolve main ram + FuncQueueFill = FuncQueueProg; + return; + } + if (MRTrack.Type != MainRAMType::Null) return; // check if we need to resolve main ram + } + } + else + { + while (NDS.ARM9Timestamp < NDS.ARM9Target) + { + if constexpr (mode == CPUExecuteMode::InterpreterGDB) + GdbCheckC(); // gdb might throw a hissy fit about this change but idc + + //printf("A9: A:%i, F:%i, P:%i, E:%i, I:%08llX, P:%08X, 15:%08X\n", FuncQueueActive, FuncQueueFill, FuncQueueProg, FuncQueueEnd, CurInstr, PC, R[15]); + (this->*FuncQueue[FuncQueueProg])(); + + if (FuncQueueFill > 0) // check if we started the queue up + { + FuncQueueEnd = FuncQueueFill; FuncQueueFill = 0; - FuncQueueProg = 0; - FuncQueueEnd = 0; - FuncQueueActive = false; - FuncQueue[0] = &ARMv5::StartExec; + FuncQueueActive = true; + return; // exit to resolve main ram + } + if (MRTrack.Type != MainRAMType::Null) return; // check if we need to resolve main ram + + // TODO optimize this shit!!! 
+ if (Halted) + { + if (Halted == 1 && NDS.ARM9Timestamp < NDS.ARM9Target) + { + NDS.ARM9Timestamp = NDS.ARM9Target; + } + goto exit; } } - else - { - // we got a new addition to the list; redo the current entry - FuncQueueFill = FuncQueueProg; - } - } - else if (FuncQueueFill > 0) // check if we started the queue up - { - FuncQueueEnd = FuncQueueFill; - FuncQueueFill = 0; - FuncQueueActive = true; - } - if (MRTrack.Type != MainRAMType::Null) return; // check if we need to resolve main ram - - // TODO optimize this shit!!! - if (!FuncQueueActive && Halted) - { - if (Halted == 1 && NDS.ARM9Timestamp < NDS.ARM9Target) - { - NDS.ARM9Timestamp = NDS.ARM9Target; - } - break; } /*if (NDS::IF[0] & NDS::IE[0]) { @@ -880,6 +893,8 @@ void ARMv5::Execute() //Cycles = 0; } + exit: + if (Halted == 2) Halted = 0; } @@ -1001,99 +1016,71 @@ void ARMv4::Execute() else #endif { - if constexpr (mode == CPUExecuteMode::InterpreterGDB) - GdbCheckC(); - - //printf("A7: A:%i, F:%i, P:%i, E:%i, I:%08llX, 15:%08X\n", FuncQueueActive, FuncQueueFill, FuncQueueProg, FuncQueueEnd, CurInstr, R[15]); - - (this->*FuncQueue[FuncQueueProg])(); - if (FuncQueueActive) { - if (FuncQueueFill == FuncQueueProg) + while (FuncQueueActive) { - // we did not get a new addition to the queue; increment and reset ptrs - FuncQueueFill = ++FuncQueueProg; + (this->*FuncQueue[FuncQueueProg])(); - // check if we're done with the queue, if so, reset everything - if (FuncQueueProg >= FuncQueueEnd) + if (FuncQueueFill == FuncQueueProg) { - FuncQueueFill = 0; - FuncQueueProg = 0; - FuncQueueEnd = 0; - FuncQueueActive = false; - FuncQueue[0] = &ARMv4::StartExec; + // we did not get a new addition to the queue; increment and reset ptrs + FuncQueueFill = ++FuncQueueProg; - /* - if (filey == NULL) filey = Platform::OpenFile("REGLOG.bin", Platform::FileMode::Read); - else + // check if we're done with the queue, if so, reset everything + if (FuncQueueProg >= FuncQueueEnd) { - u32 regscmp[16]; - Platform::FileRead(regscmp, 4, 
16, filey); - if (iter > 471000 && memcmp(regscmp, R, 4*16)) - { - printf("MISMATCH on iter: %lli!!!! %08llX\n", iter, CurInstr); - for (int i = 0; i < 16; i++) - { - printf("R%i :%08X vs CMP:%08X\n", i, R[i], regscmp[i]); - } - //abt++; - } - iter++; - }*/ - } - } - else - { - // we got a new addition to the list; redo the current entry - FuncQueueFill = FuncQueueProg; - } - } - else if (FuncQueueFill > 0) // check if we started the queue up - { - FuncQueueEnd = FuncQueueFill; - FuncQueueFill = 0; - FuncQueueActive = true; - } - else - { - /* - if (filey == NULL) Platform::OpenFile("REGLOG.bin", Platform::FileMode::Read); - else - { - u32 regscmp[16]; - Platform::FileRead(regscmp, 4, 16, filey); - if (iter > 471000 && memcmp(regscmp, R, 4*16)) - { - printf("MISMATCH on iter: %lli!!!! %08llX\n", iter, CurInstr); - for (int i = 0; i < 16; i++) - { - printf("R%i :%08X vs CMP:%08X\n", i, R[i], regscmp[i]); + FuncQueueFill = 0; + FuncQueueProg = 0; + FuncQueueEnd = 0; + FuncQueueActive = false; + FuncQueue[0] = &ARMv4::StartExec; } - //abt++; - iter++; } - }*/ - } - if (MRTrack.Type != MainRAMType::Null) return; // check if we need to resolve main ram - - // TODO optimize this shit!!! 
- if (!FuncQueueActive && Halted) - { - if (Halted == 1 && NDS.ARM7Timestamp < NDS.ARM7Target) - { - NDS.ARM7Timestamp = NDS.ARM7Target; + else + { + // we got a new addition to the list; redo the current entry and exit to resolve main ram + FuncQueueFill = FuncQueueProg; + return; + } + if (MRTrack.Type != MainRAMType::Null) return; // check if we need to resolve main ram } - break; } - /*if (NDS::IF[1] & NDS::IE[1]) + else { - if (NDS::IME[1] & 0x1) - TriggerIRQ(); - }*/ + while (NDS.ARM7Timestamp < NDS.ARM7Target) + { + if constexpr (mode == CPUExecuteMode::InterpreterGDB) + GdbCheckC(); + + //printf("A7: A:%i, F:%i, P:%i, E:%i, I:%08llX, 15:%08X\n", FuncQueueActive, FuncQueueFill, FuncQueueProg, FuncQueueEnd, CurInstr, R[15]); + (this->*FuncQueue[FuncQueueProg])(); + + if (FuncQueueFill > 0) // check if we started the queue up + { + FuncQueueEnd = FuncQueueFill; + FuncQueueFill = 0; + FuncQueueActive = true; + return; // exit to resolve main ram + } + if (MRTrack.Type != MainRAMType::Null) return; // check if we need to resolve main ram + + // TODO optimize this shit!!! 
+ if (Halted) + { + if (Halted == 1 && NDS.ARM7Timestamp < NDS.ARM7Target) + { + NDS.ARM7Timestamp = NDS.ARM7Target; + } + goto exit; + } + } + } } } + exit: + if (Halted == 2) Halted = 0; From 33f62189725f48ef42197b52b929331f9c724446 Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Mon, 9 Dec 2024 00:31:21 -0500 Subject: [PATCH 250/306] avoid checking T bit every instruction --- src/ARM.cpp | 182 +++++++++++++++++++++-------------------- src/ARM.h | 10 ++- src/ARMInterpreter.cpp | 14 +++- 3 files changed, 113 insertions(+), 93 deletions(-) diff --git a/src/ARM.cpp b/src/ARM.cpp index 0a52c1d7..07d7dd49 100644 --- a/src/ARM.cpp +++ b/src/ARM.cpp @@ -211,8 +211,6 @@ void ARM::Reset() void ARMv5::Reset() { - FuncQueue[0] = &ARMv5::StartExec; - PU_Map = PU_PrivMap; Store = false; @@ -238,7 +236,6 @@ void ARMv5::Reset() void ARMv4::Reset() { - FuncQueue[0] = &ARMv4::StartExec; Nonseq = true; ARM::Reset(); @@ -373,6 +370,9 @@ void ARMv5::JumpTo_2() if (BranchAddr & 0x1) { + StartExec = &ARMv5::StartExecTHUMB; + if (MRTrack.Type == MainRAMType::Null) FuncQueue[0] = StartExec; + BranchAddr &= ~0x1; R[15] = BranchAddr+2; @@ -393,6 +393,9 @@ void ARMv5::JumpTo_2() } else { + StartExec = &ARMv5::StartExecARM; + if (MRTrack.Type == MainRAMType::Null) FuncQueue[0] = StartExec; + BranchAddr &= ~0x3; R[15] = BranchAddr+4; @@ -457,6 +460,9 @@ void ARMv4::JumpTo_2() if (BranchAddr & 0x1) { + StartExec = &ARMv4::StartExecTHUMB; + if (MRTrack.Type == MainRAMType::Null) FuncQueue[0] = StartExec; + BranchAddr &= ~0x1; R[15] = BranchAddr+2; @@ -468,6 +474,9 @@ void ARMv4::JumpTo_2() } else { + StartExec = &ARMv4::StartExecARM; + if (MRTrack.Type == MainRAMType::Null) FuncQueue[0] = StartExec; + BranchAddr &= ~0x3; R[15] = BranchAddr+4; @@ -685,61 +694,60 @@ void ARM::CheckGdbIncoming() GdbCheckA(); } -void ARMv5::StartExec() +void ARMv5::StartExecTHUMB() { - if (CPSR & 0x20) // THUMB - { - // prefetch - R[15] += 2; - CurInstr = NextInstr[0]; - 
NextInstr[0] = NextInstr[1]; - // code fetch is done during the execute stage cycle handling - if (R[15] & 0x2) NullFetch = true; - else NullFetch = false; - PC = R[15]; + // prefetch + R[15] += 2; + CurInstr = NextInstr[0]; + NextInstr[0] = NextInstr[1]; + // code fetch is done during the execute stage cycle handling + if (R[15] & 0x2) NullFetch = true; + else NullFetch = false; + PC = R[15]; - if (IRQ && !(CPSR & 0x80)) TriggerIRQ(); - else if (CurInstr > 0xFFFFFFFF) [[unlikely]] // handle aborted instructions - { - PrefetchAbort(); - } - else [[likely]] // actually execute - { - u32 icode = (CurInstr >> 6) & 0x3FF; - ARMInterpreter::THUMBInstrTable[icode](this); - } + if (IRQ && !(CPSR & 0x80)) TriggerIRQ(); + else if (CurInstr > 0xFFFFFFFF) [[unlikely]] // handle aborted instructions + { + PrefetchAbort(); + } + else [[likely]] // actually execute + { + u32 icode = (CurInstr >> 6) & 0x3FF; + ARMInterpreter::THUMBInstrTable[icode](this); + } + QueueFunction(&ARMv5::WBCheck_2); +} + +void ARMv5::StartExecARM() +{ + // prefetch + R[15] += 4; + CurInstr = NextInstr[0]; + NextInstr[0] = NextInstr[1]; + // code fetch is done during the execute stage cycle handling + NullFetch = false; + PC = R[15]; + + if (IRQ && !(CPSR & 0x80)) TriggerIRQ(); + else if (CurInstr & ((u64)1<<63)) [[unlikely]] // handle aborted instructions + { + PrefetchAbort(); + } + else if (CheckCondition(CurInstr >> 28)) [[likely]] // actually execute + { + u32 icode = ((CurInstr >> 4) & 0xF) | ((CurInstr >> 16) & 0xFF0); + ARMInterpreter::ARMInstrTable[icode](this); + } + else if ((CurInstr & 0xFE000000) == 0xFA000000) + { + ARMInterpreter::A_BLX_IMM(this); + } + else if ((CurInstr & 0x0FF000F0) == 0x01200070) + { + ARMInterpreter::A_BKPT(this); // always passes regardless of condition code } else - { - // prefetch - R[15] += 4; - CurInstr = NextInstr[0]; - NextInstr[0] = NextInstr[1]; - // code fetch is done during the execute stage cycle handling - NullFetch = false; - PC = R[15]; - - if (IRQ && 
!(CPSR & 0x80)) TriggerIRQ(); - else if (CurInstr & ((u64)1<<63)) [[unlikely]] // handle aborted instructions - { - PrefetchAbort(); - } - else if (CheckCondition(CurInstr >> 28)) [[likely]] // actually execute - { - u32 icode = ((CurInstr >> 4) & 0xF) | ((CurInstr >> 16) & 0xFF0); - ARMInterpreter::ARMInstrTable[icode](this); - } - else if ((CurInstr & 0xFE000000) == 0xFA000000) - { - ARMInterpreter::A_BLX_IMM(this); - } - else if ((CurInstr & 0x0FF000F0) == 0x01200070) - { - ARMInterpreter::A_BKPT(this); // always passes regardless of condition code - } - else - AddCycles_C(); - } + AddCycles_C(); QueueFunction(&ARMv5::WBCheck_2); } @@ -840,7 +848,7 @@ void ARMv5::Execute() FuncQueueProg = 0; FuncQueueEnd = 0; FuncQueueActive = false; - FuncQueue[0] = &ARMv5::StartExec; + FuncQueue[0] = StartExec; } } else @@ -904,45 +912,43 @@ template void ARMv5::Execute(); template void ARMv5::Execute(); #endif -void ARMv4::StartExec() +void ARMv4::StartExecTHUMB() { - if (CPSR & 0x20) // THUMB - { - // prefetch - R[15] += 2; - CurInstr = NextInstr[0]; - NextInstr[0] = NextInstr[1]; - CodeRead16(R[15]); - QueueFunction(&ARMv4::UpdateNextInstr1); + // prefetch + R[15] += 2; + CurInstr = NextInstr[0]; + NextInstr[0] = NextInstr[1]; + CodeRead16(R[15]); + QueueFunction(&ARMv4::UpdateNextInstr1); - if (IRQ && !(CPSR & 0x80)) TriggerIRQ(); - else - { - // actually execute - u32 icode = (CurInstr >> 6); - ARMInterpreter::THUMBInstrTable[icode](this); - } - } + if (IRQ && !(CPSR & 0x80)) TriggerIRQ(); else { - // prefetch - R[15] += 4; - CurInstr = NextInstr[0]; - NextInstr[0] = NextInstr[1]; - CodeRead32(R[15]); - QueueFunction(&ARMv4::UpdateNextInstr1); - - if (IRQ && !(CPSR & 0x80)) TriggerIRQ(); - else if (CheckCondition(CurInstr >> 28)) // actually execute - { - u32 icode = ((CurInstr >> 4) & 0xF) | ((CurInstr >> 16) & 0xFF0); - ARMInterpreter::ARMInstrTable[icode](this); - } - else - AddCycles_C(); + // actually execute + u32 icode = (CurInstr >> 6); + 
ARMInterpreter::THUMBInstrTable[icode](this); } } +void ARMv4::StartExecARM() +{ + // prefetch + R[15] += 4; + CurInstr = NextInstr[0]; + NextInstr[0] = NextInstr[1]; + CodeRead32(R[15]); + QueueFunction(&ARMv4::UpdateNextInstr1); + + if (IRQ && !(CPSR & 0x80)) TriggerIRQ(); + else if (CheckCondition(CurInstr >> 28)) // actually execute + { + u32 icode = ((CurInstr >> 4) & 0xF) | ((CurInstr >> 16) & 0xFF0); + ARMInterpreter::ARMInstrTable[icode](this); + } + else + AddCycles_C(); +} + template void ARMv4::Execute() { @@ -1034,7 +1040,7 @@ void ARMv4::Execute() FuncQueueProg = 0; FuncQueueEnd = 0; FuncQueueActive = false; - FuncQueue[0] = &ARMv4::StartExec; + FuncQueue[0] = StartExec; } } else diff --git a/src/ARM.h b/src/ARM.h index c23c2254..64c921e9 100644 --- a/src/ARM.h +++ b/src/ARM.h @@ -729,7 +729,8 @@ public: } // Queue Functions - void StartExec(); + void StartExecARM(); + void StartExecTHUMB(); void AddExecute(); void AddCycles_MW_2(); void DelayIfITCM_2(); @@ -840,6 +841,7 @@ public: u64 TimestampMemory; void (ARMv5::*FuncQueue[32])(void); void (ARMv5::*DelayedQueue)(void); // adding more than one new entry to the queue while it's already active does not work. so uh. we use this to work around that. it's less than ideal... 
+ void (ARMv5::*StartExec)(void); u32 PC; bool NullFetch; bool Store; @@ -908,6 +910,7 @@ public: void Execute(); void (ARMv4::*FuncQueue[32])(void); + void (ARMv4::*StartExec)(void); bool Nonseq; void CodeRead16(u32 addr); @@ -933,8 +936,9 @@ public: else (this->*QueueEntry)(); } - - void StartExec(); + + void StartExecARM(); + void StartExecTHUMB(); void UpdateNextInstr1() { NextInstr[1] = RetVal; } void JumpTo_2(); void JumpTo_3A(); diff --git a/src/ARMInterpreter.cpp b/src/ARMInterpreter.cpp index 5c8b2b20..cd3346d0 100644 --- a/src/ARMInterpreter.cpp +++ b/src/ARMInterpreter.cpp @@ -134,7 +134,12 @@ void A_MSR_IMM(ARM* cpu) if (cpu->CPSR & 0x20) [[unlikely]] { - if (cpu->Num == 0) cpu->R[15] += 2; // pc should actually increment by 4 one more time after switching to thumb mode without a pipeline flush, this gets the same effect. + if (cpu->Num == 0) + { + cpu->R[15] += 2; // pc should actually increment by 4 one more time after switching to thumb mode without a pipeline flush, this gets the same effect. + ((ARMv5*)cpu)->StartExec = &ARMv5::StartExecTHUMB; + if (cpu->MRTrack.Type == MainRAMType::Null) ((ARMv5*)cpu)->FuncQueue[0] = ((ARMv5*)cpu)->StartExec; + } else { Platform::Log(Platform::LogLevel::Warn, "UNIMPLEMENTED: MSR REG T bit change on ARM7\n"); @@ -196,7 +201,12 @@ void A_MSR_REG(ARM* cpu) if (cpu->CPSR & 0x20) [[unlikely]] { - if (cpu->Num == 0) cpu->R[15] += 2; // pc should actually increment by 4 one more time after switching to thumb mode without a pipeline flush, this gets the same effect. + if (cpu->Num == 0) + { + cpu->R[15] += 2; // pc should actually increment by 4 one more time after switching to thumb mode without a pipeline flush, this gets the same effect. 
+ ((ARMv5*)cpu)->StartExec = &ARMv5::StartExecTHUMB; + if (cpu->MRTrack.Type == MainRAMType::Null) ((ARMv5*)cpu)->FuncQueue[0] = ((ARMv5*)cpu)->StartExec; + } else { Platform::Log(Platform::LogLevel::Warn, "UNIMPLEMENTED: MSR REG T bit change on ARM7\n"); From fe9a9ee27d55c4caa0082627c5de573e4a744876 Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Mon, 9 Dec 2024 00:39:24 -0500 Subject: [PATCH 251/306] actually those do literally nothing --- src/ARM.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/ARM.cpp b/src/ARM.cpp index 07d7dd49..0ca86283 100644 --- a/src/ARM.cpp +++ b/src/ARM.cpp @@ -371,7 +371,7 @@ void ARMv5::JumpTo_2() if (BranchAddr & 0x1) { StartExec = &ARMv5::StartExecTHUMB; - if (MRTrack.Type == MainRAMType::Null) FuncQueue[0] = StartExec; + FuncQueue[0] = StartExec; BranchAddr &= ~0x1; R[15] = BranchAddr+2; @@ -394,7 +394,7 @@ void ARMv5::JumpTo_2() else { StartExec = &ARMv5::StartExecARM; - if (MRTrack.Type == MainRAMType::Null) FuncQueue[0] = StartExec; + FuncQueue[0] = StartExec; BranchAddr &= ~0x3; R[15] = BranchAddr+4; @@ -461,7 +461,7 @@ void ARMv4::JumpTo_2() if (BranchAddr & 0x1) { StartExec = &ARMv4::StartExecTHUMB; - if (MRTrack.Type == MainRAMType::Null) FuncQueue[0] = StartExec; + FuncQueue[0] = StartExec; BranchAddr &= ~0x1; R[15] = BranchAddr+2; @@ -475,7 +475,7 @@ void ARMv4::JumpTo_2() else { StartExec = &ARMv4::StartExecARM; - if (MRTrack.Type == MainRAMType::Null) FuncQueue[0] = StartExec; + FuncQueue[0] = StartExec; BranchAddr &= ~0x3; R[15] = BranchAddr+4; From cbdd6a0faf5f347ef09d7fa1b5812a41689565f2 Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Mon, 9 Dec 2024 09:10:24 -0500 Subject: [PATCH 252/306] cacheline align register array IM SORRY GENERIC --- src/ARM.h | 2 +- src/ARMJIT.cpp | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/ARM.h b/src/ARM.h index 64c921e9..97974d21 100644 --- 
a/src/ARM.h +++ b/src/ARM.h @@ -215,7 +215,7 @@ public: u32 DataRegion; s32 DataCycles; - u32 R[16]; // heh + alignas(64) u32 R[16]; // heh u32 CPSR; u32 R_FIQ[8]; // holding SPSR too u32 R_SVC[3]; diff --git a/src/ARMJIT.cpp b/src/ARMJIT.cpp index b73116b9..a0afb5d4 100644 --- a/src/ARMJIT.cpp +++ b/src/ARMJIT.cpp @@ -51,10 +51,10 @@ namespace melonDS using Platform::Log; using Platform::LogLevel; -static_assert(offsetof(ARM, CPSR) == ARM_CPSR_offset, ""); +/*static_assert(offsetof(ARM, CPSR) == ARM_CPSR_offset, ""); static_assert(offsetof(ARM, Cycles) == ARM_Cycles_offset, ""); static_assert(offsetof(ARM, StopExecution) == ARM_StopExecution_offset, ""); - +*/ #define JIT_DEBUGPRINT(msg, ...) //#define JIT_DEBUGPRINT(msg, ...) Platform::Log(Platform::LogLevel::Debug, msg, ## __VA_ARGS__) From 918df047b81ffdaf50cf4547732908bce0598a4c Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Mon, 9 Dec 2024 10:03:47 -0500 Subject: [PATCH 253/306] cache line boundary align condition lut table --- src/ARM.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ARM.cpp b/src/ARM.cpp index 0ca86283..d4e49723 100644 --- a/src/ARM.cpp +++ b/src/ARM.cpp @@ -87,7 +87,7 @@ void ARM::GdbCheckC() {} -const u32 ARM::ConditionTable[16] = +alignas(64) const u32 ARM::ConditionTable[16] = { 0xF0F0, // EQ 0x0F0F, // NE From 0111ee7fac41414c89be8e982aa0da88704e8b56 Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Mon, 9 Dec 2024 11:11:24 -0500 Subject: [PATCH 254/306] micro-optimization --- src/ARM.h | 8 ++++---- src/NDS.h | 4 ++-- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/src/ARM.h b/src/ARM.h index 97974d21..44b7d259 100644 --- a/src/ARM.h +++ b/src/ARM.h @@ -798,7 +798,7 @@ public: u32 DTCMMask; //! Internal: DTCM Address Mask used in conjunction with @ref DTCMBase to check for DTCM access s32 RegionCodeCycles; //! 
Internal: Cached amount of cycles to fetch instruction from the current code region. - u8 ITCM[ITCMPhysicalSize]; //! Content of the ITCM + alignas(u32) u8 ITCM[ITCMPhysicalSize]; //! Content of the ITCM u8* DTCM; //! Content of the DTCM alignas(u32) u8 ICache[ICACHE_SIZE]; //! Instruction Cache Content organized in @ref ICACHE_LINESPERSET times @ref ICACHE_SETS times @ref ICACHE_LINELENGTH bytes @@ -837,11 +837,11 @@ public: bool (*GetMemRegion)(u32 addr, bool write, MemRegion* region); - u64 ITCMTimestamp; - u64 TimestampMemory; - void (ARMv5::*FuncQueue[32])(void); + alignas(64) void (ARMv5::*FuncQueue[32])(void); void (ARMv5::*DelayedQueue)(void); // adding more than one new entry to the queue while it's already active does not work. so uh. we use this to work around that. it's less than ideal... void (ARMv5::*StartExec)(void); + u64 ITCMTimestamp; + u64 TimestampMemory; u32 PC; bool NullFetch; bool Store; diff --git a/src/NDS.h b/src/NDS.h index 1c1c487f..6c430aa5 100644 --- a/src/NDS.h +++ b/src/NDS.h @@ -281,8 +281,8 @@ public: // TODO: Encapsulate the rest of these members protected: // These BIOS arrays should be declared *before* the component objects (JIT, SPI, etc.) // so that they're initialized before the component objects' constructors run. - std::array ARM9BIOS; - std::array ARM7BIOS; + alignas(u32) std::array ARM9BIOS; + alignas(u32) std::array ARM7BIOS; bool ARM9BIOSNative; bool ARM7BIOSNative; public: // TODO: Encapsulate the rest of these members From 52e14612b1b0f1e551b2f451d407545273bf8ac5 Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Mon, 9 Dec 2024 12:25:23 -0500 Subject: [PATCH 255/306] probably faster to directly access main ram? 
--- src/NDS.cpp | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/src/NDS.cpp b/src/NDS.cpp index d5a2d0d4..a2870801 100644 --- a/src/NDS.cpp +++ b/src/NDS.cpp @@ -951,7 +951,7 @@ void NDS::MainRAMHandleARM9() if (var & MRCodeFetch) { u32 addr = ARM9.FetchAddr[16]; - ARM9.RetVal = ARM9Read32(addr); + ARM9.RetVal = *(u32*)&MainRAM[addr&MainRAMMask]; } else { @@ -960,17 +960,17 @@ void NDS::MainRAMHandleARM9() if (var & MRWrite) // write { u32 val = ARM9.STRVal[reg]; - if (var & MR32) ARM9Write32(addr, val); - else if (var & MR16) ARM9Write16(addr, val); - else ARM9Write8 (addr, val); + if (var & MR32) *(u32*)&MainRAM[addr&MainRAMMask] = val; + else if (var & MR16) *(u16*)&MainRAM[addr&MainRAMMask] = val; + else *(u8 *)&MainRAM[addr&MainRAMMask] = val; } else // read { u32 dummy; u32* val = ((ARM9.LDRFailedRegs & (1< ARM9.ICacheStreamPtr) ARM9.ICacheStreamTimes[*prog-1] = (A9ContentionTS << ARM9ClockShift) - 1; @@ -1040,7 +1040,7 @@ void NDS::MainRAMHandleARM9() MainRAMLastAccess = A9LAST; } - dcache[*prog] = ARM9Read32(addr); + dcache[*prog] = *(u32*)&MainRAM[addr&MainRAMMask]; if (*prog == ARM9.DCacheStreamPtr) ARM9Timestamp = (A9ContentionTS << ARM9ClockShift) - 1; else if (*prog > ARM9.DCacheStreamPtr) ARM9.DCacheStreamTimes[*prog-1] = (A9ContentionTS << ARM9ClockShift) - 1; @@ -1176,7 +1176,7 @@ void NDS::MainRAMHandleARM7() if (var & MRCodeFetch) { u32 addr = ARM7.FetchAddr[16]; - ARM7.RetVal = (((var & MR32) ? ARM7Read32(addr) : ARM7Read16(addr))); + ARM7.RetVal = ((var & MR32) ? 
*(u32*)&MainRAM[addr&MainRAMMask] : *(u16*)&MainRAM[addr&MainRAMMask]); } else { @@ -1185,17 +1185,17 @@ void NDS::MainRAMHandleARM7() if (var & MRWrite) // write { u32 val = ARM7.STRVal[reg]; - if (var & MR32) ARM7Write32(addr, val); - else if (var & MR16) ARM7Write16(addr, val); - else ARM7Write8 (addr, val); + if (var & MR32) *(u32*)&MainRAM[addr&MainRAMMask] = val; + else if (var & MR16) *(u16*)&MainRAM[addr&MainRAMMask] = val; + else *(u8 *)&MainRAM[addr&MainRAMMask] = val; } else // read { u32 dummy; u32* val = ((ARM7.LDRFailedRegs & (1< Date: Mon, 9 Dec 2024 15:51:42 -0500 Subject: [PATCH 256/306] fix a main loop freeze; exmemcnt bit 15 starts set fixes twilight menu --- src/NDS.cpp | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/src/NDS.cpp b/src/NDS.cpp index a2870801..50a8d4e2 100644 --- a/src/NDS.cpp +++ b/src/NDS.cpp @@ -484,8 +484,8 @@ void NDS::Reset() MapSharedWRAM(0); - ExMemCnt[0] = 0x4000; - ExMemCnt[1] = 0x4000; + ExMemCnt[0] = 0xC000; // checkme: should bit 15 be set by default? 
+ ExMemCnt[1] = 0xC000; memset(ROMSeed0, 0, 2*8); memset(ROMSeed1, 0, 2*8); SetGBASlotTimings(); @@ -1358,12 +1358,13 @@ u32 NDS::RunFrame() } //printf("MAIN LOOP: 9 %lli %08X %08llX %i 7 %lli %08X %08llX %i %i %08X\n", ARM9Timestamp>>ARM9ClockShift, ARM9.PC, ARM9.CurInstr, (u8)ARM9.MRTrack.Type, ARM7Timestamp, ARM7.R[15], ARM7.CurInstr, (u8)ARM7.MRTrack.Type, IME[1], IE[1]); + + RunTimers(0); + GPU.GPU3D.Run(); if (MainRAMHandle()) break; } - RunTimers(0); - GPU.GPU3D.Run(); ARM7Target = target; CurCPU = 1; @@ -1393,10 +1394,10 @@ u32 NDS::RunFrame() } } + RunTimers(1); + if (!MainRAMHandle()) break; } - RunTimers(1); - RunSystem(target); if (CPUStop & CPUStop_Sleep) From b048e0cbfd8d69e0be8e85af3e8615fd74cd0c23 Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Mon, 9 Dec 2024 19:30:47 -0500 Subject: [PATCH 257/306] improve ExMemCnt handling and defaults --- src/NDS.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/NDS.cpp b/src/NDS.cpp index 50a8d4e2..9dcb2e71 100644 --- a/src/NDS.cpp +++ b/src/NDS.cpp @@ -484,8 +484,8 @@ void NDS::Reset() MapSharedWRAM(0); - ExMemCnt[0] = 0xC000; // checkme: should bit 15 be set by default? - ExMemCnt[1] = 0xC000; + ExMemCnt[0] = 0xE88C; // checkme: is this correct? ...and what does bit 10 do...? it can be set on 3ds it seems... 
+ ExMemCnt[1] = 0xE88C; // note: these should only matter for direct boot; bios sets these values fairly quickly during native boot memset(ROMSeed0, 0, 2*8); memset(ROMSeed1, 0, 2*8); SetGBASlotTimings(); @@ -3802,8 +3802,8 @@ void NDS::ARM9IOWrite16(u32 addr, u16 val) case 0x04000204: { u16 oldVal = ExMemCnt[0]; - ExMemCnt[0] = val; - ExMemCnt[1] = (ExMemCnt[1] & 0x007F) | (val & 0xFF80); + ExMemCnt[0] = (ExMemCnt[0] & 0x7700) | (val & 0x88FF); + ExMemCnt[1] = (ExMemCnt[1] & 0x777F) | (val & 0x8880); if ((oldVal ^ ExMemCnt[0]) & 0xFF) SetGBASlotTimings(); return; From 96c8f67d5fdd9525cc6b7fbd06a1edb56fd8285c Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Tue, 10 Dec 2024 08:04:00 -0500 Subject: [PATCH 258/306] implement bit 10 of exmemcnt --- src/DSi.cpp | 4 ++++ src/NDS.cpp | 11 +++++++---- 2 files changed, 11 insertions(+), 4 deletions(-) diff --git a/src/DSi.cpp b/src/DSi.cpp index e1e6816e..505fc411 100644 --- a/src/DSi.cpp +++ b/src/DSi.cpp @@ -129,6 +129,9 @@ void DSi::Reset() //ARM9.CP15Write(0x911, 0x00000020); //ARM9.CP15Write(0x100, ARM9.CP15Read(0x100) | 0x00050000); NDS::Reset(); + + ExMemCnt[0] = 0xEC8C; // checkme: bit 10 should be explicitly set? + ExMemCnt[1] = 0xEC8C; // The SOUNDBIAS register does nothing on DSi SPU.SetApplyBias(false); @@ -3114,6 +3117,7 @@ void DSi::ARM7IOWrite32(u32 addr, u32 val) SCFG_EXT[0] |= (val & 0x03000000); SCFG_EXT[1] &= ~0x93FF0F07; SCFG_EXT[1] |= (val & 0x93FF0F07); + if (!(val & (1<<24))) { ExMemCnt[0] &= ~(1<<10); ExMemCnt[1] &= ~(1<<10); } // bit 10 of exmemcnt is cleared when disabling second card slot access Log(LogLevel::Debug, "SCFG_EXT = %08X / %08X (val7 %08X)\n", SCFG_EXT[0], SCFG_EXT[1], val); return; case 0x04004010: diff --git a/src/NDS.cpp b/src/NDS.cpp index 9dcb2e71..3e124663 100644 --- a/src/NDS.cpp +++ b/src/NDS.cpp @@ -484,7 +484,7 @@ void NDS::Reset() MapSharedWRAM(0); - ExMemCnt[0] = 0xE88C; // checkme: is this correct? ...and what does bit 10 do...? 
it can be set on 3ds it seems... + ExMemCnt[0] = 0xE88C; // checkme: is this correct? ExMemCnt[1] = 0xE88C; // note: these should only matter for direct boot; bios sets these values fairly quickly during native boot memset(ROMSeed0, 0, 2*8); memset(ROMSeed1, 0, 2*8); @@ -3801,9 +3801,11 @@ void NDS::ARM9IOWrite16(u32 addr, u16 val) case 0x04000204: { + u16 settablemask = 0x88FF; + if ((ConsoleType == 1) && (((DSi*)this)->SCFG_EXT[1] & (1<<24))) settablemask |= 0x0400; // bit 10 can be set if SCFG_EXT bit 24 is set u16 oldVal = ExMemCnt[0]; - ExMemCnt[0] = (ExMemCnt[0] & 0x7700) | (val & 0x88FF); - ExMemCnt[1] = (ExMemCnt[1] & 0x777F) | (val & 0x8880); + ExMemCnt[0] = (ExMemCnt[0] & ~settablemask) | (val & settablemask); + ExMemCnt[1] = (ExMemCnt[1] & (~settablemask | 0x7F)) | (val & (settablemask & ~0x7F)); if ((oldVal ^ ExMemCnt[0]) & 0xFF) SetGBASlotTimings(); return; @@ -4596,8 +4598,9 @@ void NDS::ARM7IOWrite16(u32 addr, u16 val) case 0x04000204: { + u16 settablemask = 0x007F; u16 oldVal = ExMemCnt[1]; - ExMemCnt[1] = (ExMemCnt[1] & 0xFF80) | (val & 0x007F); + ExMemCnt[1] = (ExMemCnt[1] & ~settablemask) | (val & settablemask); if ((ExMemCnt[1] ^ oldVal) & 0xFF) SetGBASlotTimings(); return; From feb1cd562d7b6900da4855d923b8cf5740d2704e Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Tue, 10 Dec 2024 12:04:43 -0500 Subject: [PATCH 259/306] clarify some more write buffer details --- src/ARM.h | 2 +- src/CP15.cpp | 20 ++++++++++---------- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/src/ARM.h b/src/ARM.h index 44b7d259..04020540 100644 --- a/src/ARM.h +++ b/src/ARM.h @@ -811,7 +811,7 @@ public: u32 PU_CodeCacheable; //! CP15 Register 2 Opcode2 1: Code Cachable Bits u32 PU_DataCacheable; //! CP15 Register 2 Opcode2 0: Data Cachable Bits - u32 PU_DataCacheWrite; //! CP15 Register 3 Opcode2 0: WriteBuffer Control Register + u32 PU_WriteBufferability; //! 
CP15 Register 3 Opcode2 0: Write Buffer Control Register u32 PU_CodeRW; //! CP15 Register 5 Opcode2 3: Code Access Permission register u32 PU_DataRW; //! CP15 Register 5 Opcode2 2: Data Access Permission register diff --git a/src/CP15.cpp b/src/CP15.cpp index c7b076cb..747f2955 100644 --- a/src/CP15.cpp +++ b/src/CP15.cpp @@ -67,7 +67,7 @@ void ARMv5::CP15Reset() // Cache Settings PU_CodeCacheable = 0; PU_DataCacheable = 0; - PU_DataCacheWrite = 0; + PU_WriteBufferability = 0; ICacheLockDown = 0; DCacheLockDown = 0; @@ -119,7 +119,7 @@ void ARMv5::CP15DoSavestate(Savestate* file) file->Var32(&PU_CodeCacheable); file->Var32(&PU_DataCacheable); - file->Var32(&PU_DataCacheWrite); + file->Var32(&PU_WriteBufferability); file->Var32(&PU_CodeRW); file->Var32(&PU_DataRW); @@ -198,9 +198,9 @@ void ARMv5::UpdatePURegion(const u32 n) bool codecache, datacache, datawrite; // datacache/datawrite - // 0/0: goes to memory - // 0/1: goes to memory - // 1/0: goes to memory and cache + // 0/0: goes directly to memory + // 0/1: goes to write buffer + // 1/0: goes to write buffer and cache // 1/1: goes to cache if (CP15Control & CP15_CACHE_CR_ICACHEENABLE) @@ -217,7 +217,7 @@ void ARMv5::UpdatePURegion(const u32 n) datacache = false; } - datawrite = (PU_DataCacheWrite >> n) & 0x1; + datawrite = (PU_WriteBufferability >> n) & 0x1; u32 rgn = PU_Region[n]; if (!(rgn & CP15_REGION_ENABLE)) @@ -1493,10 +1493,10 @@ void ARMv5::CP15Write(u32 id, u32 val) return; - case 0x300: // data cache write-buffer + case 0x300: // write-buffer { - u32 diff = PU_DataCacheWrite ^ val; - PU_DataCacheWrite = val; + u32 diff = PU_WriteBufferability ^ val; + PU_WriteBufferability = val; #if 0 // This code just updates the PU_Map entries of the given region // this works fine, if the regions do not overlap @@ -1996,7 +1996,7 @@ u32 ARMv5::CP15Read(const u32 id) const case 0x201: return PU_CodeCacheable; case 0x300: - return PU_DataCacheWrite; + return PU_WriteBufferability; case 0x500: From 
d341260e5a39b926aa2b688edd68d07a0cd366d6 Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Tue, 10 Dec 2024 21:23:02 -0500 Subject: [PATCH 260/306] dma rewrite 1 --- src/ARM.h | 6 +- src/DMA.cpp | 34 +++++++-- src/DMA.h | 18 ++--- src/NDS.cpp | 216 ++++++++++++++++++++++++++++++++++++++++++++++++++-- src/NDS.h | 2 + 5 files changed, 252 insertions(+), 24 deletions(-) diff --git a/src/ARM.h b/src/ARM.h index 04020540..413510f8 100644 --- a/src/ARM.h +++ b/src/ARM.h @@ -69,7 +69,11 @@ enum class MainRAMType : u8 Fetch, ICacheStream, DCacheStream, - WriteBufferCmds, // all write buffer commands must be above this one; wb cmds not strictly used for main ram + DMA16, + DMA32, + + WriteBufferCmds, // all write buffer commands must be below this one; wb cmds are not strictly used for main ram + WBDrain, WBWrite, WBCheck, diff --git a/src/DMA.cpp b/src/DMA.cpp index 56ec1564..5269a974 100644 --- a/src/DMA.cpp +++ b/src/DMA.cpp @@ -21,6 +21,7 @@ #include "DSi.h" #include "DMA.h" #include "GPU.h" +#include "ARM.h" #include "GPU3D.h" #include "DMA_Timings.h" #include "Platform.h" @@ -213,8 +214,8 @@ u32 DMA::UnitTimings9_16(u8 burststart) src_s = NDS.ARM9MemTimings[src_id][5]; dst_n = NDS.ARM9MemTimings[dst_id][4]; dst_s = NDS.ARM9MemTimings[dst_id][5]; - - if (src_rgn == Mem9_MainRAM) + + /*if (src_rgn == Mem9_MainRAM) { if (dst_rgn == Mem9_MainRAM) { @@ -275,7 +276,7 @@ u32 DMA::UnitTimings9_16(u8 burststart) return ((burststart == 2) ? src_n : src_s) + 7; } } - else if (src_rgn & dst_rgn) + else*/ if (src_rgn & dst_rgn) { if (burststart != 1) return src_n + dst_n + (src_n == 1 || burststart <= 0); @@ -305,7 +306,7 @@ u32 DMA::UnitTimings9_32(u8 burststart) dst_n = NDS.ARM9MemTimings[dst_id][6]; dst_s = NDS.ARM9MemTimings[dst_id][7]; - if (src_rgn == Mem9_MainRAM) + /*if (src_rgn == Mem9_MainRAM) { if (dst_rgn == Mem9_MainRAM) return (burststart == 2) ? 
13 : 18; @@ -368,7 +369,7 @@ u32 DMA::UnitTimings9_32(u8 burststart) return ((burststart == 2) ? src_n : src_s) + 8; } } - else if (src_rgn & dst_rgn) + else*/ if (src_rgn & dst_rgn) { if (burststart != 1) return src_n + dst_n + (src_n == 1 || burststart <= 0); @@ -570,14 +571,22 @@ void DMA::Run9() // add NS penalty for first accesses in burst int burststart = Running-1; - Running = 2; NDS.ARM9Timestamp = (NDS.ARM9Timestamp + ((1< 0 && !Stall) { + u32 rgn = NDS.ARM9Regions[CurSrcAddr>>14] | NDS.ARM9Regions[CurDstAddr>>14]; + if (rgn & Mem9_MainRAM) + { + NDS.ARM9.MRTrack.Type = MainRAMType::DMA16; + NDS.ARM9.MRTrack.Var = Num; + return; + } + Running = 2; + NDS.ARM9Timestamp += (UnitTimings9_16(burststart) << NDS.ARM9ClockShift); burststart -= 1; @@ -595,6 +604,15 @@ void DMA::Run9() { while (IterCount > 0 && !Stall) { + u32 rgn = NDS.ARM9Regions[CurSrcAddr>>14] | NDS.ARM9Regions[CurDstAddr>>14]; + if (rgn & Mem9_MainRAM) + { + NDS.ARM9.MRTrack.Type = MainRAMType::DMA32; + NDS.ARM9.MRTrack.Var = Num; + return; + } + Running = 2; + NDS.ARM9Timestamp += (UnitTimings9_32(burststart) << NDS.ARM9ClockShift); burststart -= 1; @@ -609,7 +627,7 @@ void DMA::Run9() } } - if (burststart == 1) Running = 1; + if (burststart == 0) Running = 1; Executing = false; Stall = false; diff --git a/src/DMA.h b/src/DMA.h index 64d5647f..65aaff82 100644 --- a/src/DMA.h +++ b/src/DMA.h @@ -81,23 +81,23 @@ public: u32 SrcAddr {}; u32 DstAddr {}; u32 Cnt {}; - -private: - melonDS::NDS& NDS; - u32 CPU {}; - u32 Num {}; - - u32 StartMode {}; u32 CurSrcAddr {}; u32 CurDstAddr {}; u32 RemCount {}; u32 IterCount {}; s32 SrcAddrInc {}; s32 DstAddrInc {}; - u32 CountMask {}; - u32 Running {}; bool InProgress {}; + u32 Num {}; + u32 StartMode {}; + +private: + melonDS::NDS& NDS; + u32 CPU {}; + + u32 CountMask {}; + bool Executing {}; bool Stall {}; diff --git a/src/NDS.cpp b/src/NDS.cpp index 3e124663..2f97a9db 100644 --- a/src/NDS.cpp +++ b/src/NDS.cpp @@ -1055,6 +1055,210 @@ void 
NDS::MainRAMHandleARM9() break; } + case MainRAMType::DMA32: + { + DMA* dma = &DMAs[ARM9.MRTrack.Var]; + int burststart = dma->Running - 1; + + u32 srcaddr = dma->CurSrcAddr; + u32 srcrgn = ARM9Regions[srcaddr>>14]; + u32 dstaddr = dma->CurDstAddr; + u32 dstrgn = ARM9Regions[dstaddr>>14]; + if (!ARM9.MRTrack.Progress) + { + if (srcrgn == Mem9_MainRAM) + { + if (burststart == 2 || A7WENTLAST || DMALastWasMainRAM || dma->SrcAddrInc <= 0) + { + if (A9ContentionTS < MainRAMTimestamp) { A9ContentionTS = MainRAMTimestamp; if (A7PRIORITY) return; } + MainRAMTimestamp = A9ContentionTS + 9; + A9ContentionTS += 6; + MainRAMLastAccess = A9LAST; + } + else + { + A9ContentionTS += 2; + MainRAMTimestamp += 2; + } + DMALastWasMainRAM = true; + } + else + { + if (burststart == 2 || dma->SrcAddrInc <= 0) A9ContentionTS += ARM9MemTimings[srcaddr>>14][6] + (ARM9MemTimings[srcaddr>>14][6] == 1); + else A9ContentionTS += ARM9MemTimings[srcaddr>>14][7]; + DMALastWasMainRAM = false; + } + + DMAReadHold = ARM9Read32(srcaddr); + + ARM9.MRTrack.Progress = 1; + } + else + { + if (dstrgn == Mem9_MainRAM) + { + if (burststart == 2 || A7WENTLAST || DMALastWasMainRAM || dma->DstAddrInc <= 0) + { + if (A9ContentionTS < MainRAMTimestamp) { A9ContentionTS = MainRAMTimestamp; if (A7PRIORITY) return; } + MainRAMTimestamp = A9ContentionTS + 9; + A9ContentionTS += 4; + MainRAMLastAccess = A9LAST; + } + else + { + A9ContentionTS += 2; + MainRAMTimestamp += 2; + } + DMALastWasMainRAM = true; + } + else + { + if (burststart == 2 || dma->DstAddrInc <= 0) A9ContentionTS += ARM9MemTimings[dstaddr>>14][6]; + else A9ContentionTS += ARM9MemTimings[dstaddr>>14][7] - (burststart <= 0); + DMALastWasMainRAM = false; + } + + ARM9Write32(dstaddr, DMAReadHold); + + dma->CurSrcAddr += dma->SrcAddrInc<<2; + dma->CurDstAddr += dma->DstAddrInc<<2; + dma->IterCount--; + dma->RemCount--; + burststart -= 1; + if (burststart <= 0) dma->Running = 1; + else dma->Running = 2; + ARM9Timestamp = A9ContentionTS << ARM9ClockShift; + 
memset(&ARM9.MRTrack, 0, sizeof(ARM9.MRTrack)); + ConTSLock = false; + if (dma->RemCount) + { + if (dma->IterCount == 0) + { + dma->Running = 0; + ResumeCPU(0, 1<Num); + + if (dma->StartMode == 0x07) + GPU.GPU3D.CheckFIFODMA(); + } + + break; + } + + if (!(dma->Cnt & (1<<25))) + dma->Cnt &= ~(1<<31); + + if (dma->Cnt & (1<<30)) + SetIRQ(0, IRQ_DMA0 + dma->Num); + + dma->Running = 0; + dma->InProgress = false; + ResumeCPU(0, 1<Num); + } + break; + } + + case MainRAMType::DMA16: + { + DMA* dma = &DMAs[ARM9.MRTrack.Var]; + int burststart = dma->Running - 1; + + u32 srcaddr = dma->CurSrcAddr; + u32 srcrgn = ARM9Regions[srcaddr>>14]; + u32 dstaddr = dma->CurDstAddr; + u32 dstrgn = ARM9Regions[dstaddr>>14]; + if (!ARM9.MRTrack.Progress) + { + if (srcrgn == Mem9_MainRAM) + { + if (burststart == 2 || A7WENTLAST || DMALastWasMainRAM || dma->SrcAddrInc <= 0) + { + if (A9ContentionTS < MainRAMTimestamp) { A9ContentionTS = MainRAMTimestamp; if (A7PRIORITY) return; } + MainRAMTimestamp = A9ContentionTS + 8; + A9ContentionTS += 5; + MainRAMLastAccess = A9LAST; + } + else + { + A9ContentionTS += 1; + MainRAMTimestamp += 1; + } + DMALastWasMainRAM = true; + } + else + { + if (burststart == 2 || dma->SrcAddrInc <= 0) A9ContentionTS += ARM9MemTimings[srcaddr>>14][4] + (ARM9MemTimings[srcaddr>>14][4] == 1); + else A9ContentionTS += ARM9MemTimings[srcaddr>>14][5]; + DMALastWasMainRAM = false; + } + + DMAReadHold = ARM9Read16(srcaddr); + + ARM9.MRTrack.Progress = 1; + } + else + { + if (dstrgn == Mem9_MainRAM) + { + if (burststart == 2 || A7WENTLAST || DMALastWasMainRAM || dma->DstAddrInc <= 0) + { + if (A9ContentionTS < MainRAMTimestamp) { A9ContentionTS = MainRAMTimestamp; if (A7PRIORITY) return; } + MainRAMTimestamp = A9ContentionTS + 8; + A9ContentionTS += 3; + MainRAMLastAccess = A9LAST; + } + else + { + A9ContentionTS += 1; + MainRAMTimestamp += 1; + } + DMALastWasMainRAM = true; + } + else + { + if (burststart == 2 || dma->DstAddrInc <= 0) A9ContentionTS += 
ARM9MemTimings[dstaddr>>14][4]; + else A9ContentionTS += ARM9MemTimings[dstaddr>>14][5] - (burststart <= 0); + DMALastWasMainRAM = false; + } + + ARM9Write16(dstaddr, DMAReadHold); + + dma->CurSrcAddr += dma->SrcAddrInc<<1; + dma->CurDstAddr += dma->DstAddrInc<<1; + dma->IterCount--; + dma->RemCount--; + burststart -= 1; + if (burststart <= 0) Running = 1; + else dma->Running = 2; + ARM9Timestamp = A9ContentionTS << ARM9ClockShift; + memset(&ARM9.MRTrack, 0, sizeof(ARM9.MRTrack)); + ConTSLock = false; + if (dma->RemCount) + { + if (dma->IterCount == 0) + { + dma->Running = 0; + ResumeCPU(0, 1<Num); + + if (dma->StartMode == 0x07) + GPU.GPU3D.CheckFIFODMA(); + } + + break; + } + + if (!(dma->Cnt & (1<<25))) + dma->Cnt &= ~(1<<31); + + if (dma->Cnt & (1<<30)) + SetIRQ(0, IRQ_DMA0 + dma->Num); + + dma->Running = 0; + dma->InProgress = false; + ResumeCPU(0, 1<Num); + } + break; + } + case MainRAMType::WBDrain: { if (!ARM9.WriteBufferHandle()) return; @@ -1334,9 +1538,9 @@ u32 NDS::RunFrame() { u64 ts = ARM9Timestamp; DMAs[0].Run(); - if (!(CPUStop & CPUStop_GXStall)) DMAs[1].Run(); - if (!(CPUStop & CPUStop_GXStall)) DMAs[2].Run(); - if (!(CPUStop & CPUStop_GXStall)) DMAs[3].Run(); + if (!(CPUStop & CPUStop_GXStall) && (ARM9.MRTrack.Type == MainRAMType::Null)) DMAs[1].Run(); + if (!(CPUStop & CPUStop_GXStall) && (ARM9.MRTrack.Type == MainRAMType::Null)) DMAs[2].Run(); + if (!(CPUStop & CPUStop_GXStall) && (ARM9.MRTrack.Type == MainRAMType::Null)) DMAs[3].Run(); if (ConsoleType == 1) { auto& dsi = dynamic_cast(*this); @@ -1378,9 +1582,9 @@ u32 NDS::RunFrame() if (CPUStop & CPUStop_DMA7) { DMAs[4].Run(); - DMAs[5].Run(); - DMAs[6].Run(); - DMAs[7].Run(); + if (ARM7.MRTrack.Type == MainRAMType::Null) DMAs[5].Run(); + if (ARM7.MRTrack.Type == MainRAMType::Null) DMAs[6].Run(); + if (ARM7.MRTrack.Type == MainRAMType::Null) DMAs[7].Run(); if (ConsoleType == 1) { auto& dsi = dynamic_cast(*this); diff --git a/src/NDS.h b/src/NDS.h index 6c430aa5..b0b5a911 100644 --- a/src/NDS.h 
+++ b/src/NDS.h @@ -276,7 +276,9 @@ public: // TODO: Encapsulate the rest of these members alignas(u32) u8 ROMSeed0[2*8]; alignas(u32) u8 ROMSeed1[2*8]; + u32 DMAReadHold; bool MainRAMLastAccess; // 0 == ARM9 | 1 == ARM7 + bool DMALastWasMainRAM; protected: // These BIOS arrays should be declared *before* the component objects (JIT, SPI, etc.) From 73be2f3e01e22c7a695edf98b6d511b1f0a3eba7 Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Fri, 13 Dec 2024 13:09:42 -0500 Subject: [PATCH 261/306] tweak dmas to be more accurate (actually less?) --- src/CP15.cpp | 33 +++++++++++++++++++++++++++------ src/DMA.cpp | 17 ++++++++++------- src/NDS.cpp | 20 +++++++------------- src/NDS.h | 2 +- 4 files changed, 45 insertions(+), 27 deletions(-) diff --git a/src/CP15.cpp b/src/CP15.cpp index 747f2955..e89275c8 100644 --- a/src/CP15.cpp +++ b/src/CP15.cpp @@ -454,7 +454,8 @@ bool ARMv5::ICacheLookup(const u32 addr) // BIST test State register (See arm946e-s Rev 1 technical manual, 2.3.15 "Register 15, test State Register") if (CP15BISTTestStateRegister & CP15_BIST_TR_DISABLE_ICACHE_LINEFILL) [[unlikely]] return false; - + + if (NDS.ARM9Timestamp < NDS.DMA9Timestamp) NDS.ARM9Timestamp = NDS.DMA9Timestamp; WriteBufferDrain(); FetchAddr[16] = addr; QueueFunction(&ARMv5::ICacheLookup_2); @@ -688,6 +689,8 @@ bool ARMv5::DCacheLookup(const u32 addr) // BIST test State register (See arm946e-s Rev 1 technical manual, 2.3.15 "Register 15, test State Register") if (CP15BISTTestStateRegister & CP15_BIST_TR_DISABLE_DCACHE_LINEFILL) [[unlikely]] return false; + + if (NDS.ARM9Timestamp < NDS.DMA9Timestamp) NDS.ARM9Timestamp = NDS.DMA9Timestamp; WriteBufferDrain(); // checkme? 
FetchAddr[16] = addr; @@ -2214,10 +2217,12 @@ void ARMv5::CodeRead32(u32 addr) // bus reads can only overlap with dcache streaming by 6 cycles if (DCacheStreamPtr < 7) { + if (NDS.ARM9Timestamp < NDS.DMA9Timestamp) NDS.ARM9Timestamp = NDS.DMA9Timestamp; u64 time = DCacheStreamTimes[6] - 6; // checkme: minus 6? if (NDS.ARM9Timestamp < time) NDS.ARM9Timestamp = time; } + if (NDS.ARM9Timestamp < NDS.DMA9Timestamp) NDS.ARM9Timestamp = NDS.DMA9Timestamp; if (PU_Map[addr>>12] & 0x30) WriteBufferDrain(); else @@ -2346,10 +2351,12 @@ void ARMv5::DRead8_2() // checkme: does dcache trigger this? if (ICacheStreamPtr < 7) { + if (NDS.ARM9Timestamp < NDS.DMA9Timestamp) NDS.ARM9Timestamp = NDS.DMA9Timestamp; u64 time = ICacheStreamTimes[6] - 6; // checkme: minus 6? if (NDS.ARM9Timestamp < time) NDS.ARM9Timestamp = time; } - + + if (NDS.ARM9Timestamp < NDS.DMA9Timestamp) NDS.ARM9Timestamp = NDS.DMA9Timestamp; if (PU_Map[addr>>12] & 0x30) WriteBufferDrain(); else @@ -2460,10 +2467,12 @@ void ARMv5::DRead16_2() // checkme: does cache trigger this? if (ICacheStreamPtr < 7) { + if (NDS.ARM9Timestamp < NDS.DMA9Timestamp) NDS.ARM9Timestamp = NDS.DMA9Timestamp; u64 time = ICacheStreamTimes[6] - 6; // checkme: minus 6? if (NDS.ARM9Timestamp < time) NDS.ARM9Timestamp = time; } - + + if (NDS.ARM9Timestamp < NDS.DMA9Timestamp) NDS.ARM9Timestamp = NDS.DMA9Timestamp; if (PU_Map[addr>>12] & 0x30) WriteBufferDrain(); else @@ -2575,10 +2584,12 @@ void ARMv5::DRead32_2() // checkme: does cache trigger this? if (ICacheStreamPtr < 7) { + if (NDS.ARM9Timestamp < NDS.DMA9Timestamp) NDS.ARM9Timestamp = NDS.DMA9Timestamp; u64 time = ICacheStreamTimes[6] - 6; // checkme: minus 6? if (NDS.ARM9Timestamp < time) NDS.ARM9Timestamp = time; } - + + if (NDS.ARM9Timestamp < NDS.DMA9Timestamp) NDS.ARM9Timestamp = NDS.DMA9Timestamp; if (PU_Map[addr>>12] & 0x30) WriteBufferDrain(); else @@ -2676,15 +2687,17 @@ void ARMv5::DRead32S_2() // checkme: does cache trigger this? 
if (ICacheStreamPtr < 7) { + if (NDS.ARM9Timestamp < NDS.DMA9Timestamp) NDS.ARM9Timestamp = NDS.DMA9Timestamp; u64 time = ICacheStreamTimes[6] - 6; // checkme: minus 6? if (NDS.ARM9Timestamp < time) NDS.ARM9Timestamp = time; } - + + if (NDS.ARM9Timestamp < NDS.DMA9Timestamp) NDS.ARM9Timestamp = NDS.DMA9Timestamp; if (PU_Map[addr>>12] & 0x30) // checkme WriteBufferDrain(); else WriteBufferCheck<1>(); - + QueueFunction(&ARMv5::DRead32S_3); } @@ -2809,12 +2822,14 @@ void ARMv5::DWrite8_2() // checkme: does cache trigger this? if (ICacheStreamPtr < 7) { + if (NDS.ARM9Timestamp < NDS.DMA9Timestamp) NDS.ARM9Timestamp = NDS.DMA9Timestamp; u64 time = ICacheStreamTimes[6] - 6; // checkme: minus 6? if (NDS.ARM9Timestamp < time) NDS.ARM9Timestamp = time; } if (!(PU_Map[addr>>12] & (0x30))) { + if (NDS.ARM9Timestamp < NDS.DMA9Timestamp) NDS.ARM9Timestamp = NDS.DMA9Timestamp; WriteBufferCheck<2>(); QueueFunction(&ARMv5::DWrite8_3); } @@ -2922,12 +2937,14 @@ void ARMv5::DWrite16_2() // checkme: does cache trigger this? if (ICacheStreamPtr < 7) { + if (NDS.ARM9Timestamp < NDS.DMA9Timestamp) NDS.ARM9Timestamp = NDS.DMA9Timestamp; u64 time = ICacheStreamTimes[6] - 6; // checkme: minus 6? if (NDS.ARM9Timestamp < time) NDS.ARM9Timestamp = time; } if (!(PU_Map[addr>>12] & 0x30)) { + if (NDS.ARM9Timestamp < NDS.DMA9Timestamp) NDS.ARM9Timestamp = NDS.DMA9Timestamp; WriteBufferCheck<2>(); QueueFunction(&ARMv5::DWrite16_3); } @@ -3040,12 +3057,14 @@ void ARMv5::DWrite32_2() // checkme: does cache trigger this? if (ICacheStreamPtr < 7) { + if (NDS.ARM9Timestamp < NDS.DMA9Timestamp) NDS.ARM9Timestamp = NDS.DMA9Timestamp; u64 time = ICacheStreamTimes[6] - 6; // checkme: minus 6? 
if (NDS.ARM9Timestamp < time) NDS.ARM9Timestamp = time; } if (!(PU_Map[addr>>12] & 0x30)) { + if (NDS.ARM9Timestamp < NDS.DMA9Timestamp) NDS.ARM9Timestamp = NDS.DMA9Timestamp; WriteBufferCheck<2>(); QueueFunction(&ARMv5::DWrite32_3); } @@ -3153,12 +3172,14 @@ void ARMv5::DWrite32S_2() // checkme: does cache trigger this? if (ICacheStreamPtr < 7) { + if (NDS.ARM9Timestamp < NDS.DMA9Timestamp) NDS.ARM9Timestamp = NDS.DMA9Timestamp; u64 time = ICacheStreamTimes[6] - 6; // checkme: minus 6? if (NDS.ARM9Timestamp < time) NDS.ARM9Timestamp = time; } if (!(PU_Map[addr>>12] & 0x30)) // non-bufferable { + if (NDS.ARM9Timestamp < NDS.DMA9Timestamp) NDS.ARM9Timestamp = NDS.DMA9Timestamp; WriteBufferCheck<2>(); QueueFunction(&ARMv5::DWrite32S_3); } diff --git a/src/DMA.cpp b/src/DMA.cpp index 5269a974..4ea122f3 100644 --- a/src/DMA.cpp +++ b/src/DMA.cpp @@ -565,14 +565,15 @@ u32 DMA::UnitTimings7_32(bool burststart) void DMA::Run9() { - if (NDS.ARM9Timestamp >= NDS.ARM9Target) return; + NDS.DMA9Timestamp = std::max(NDS.DMA9Timestamp, NDS.ARM9Timestamp); + NDS.DMA9Timestamp = (NDS.DMA9Timestamp + ((1<= NDS.ARM9Target) return; Executing = true; // add NS penalty for first accesses in burst int burststart = Running-1; - - NDS.ARM9Timestamp = (NDS.ARM9Timestamp + ((1<= NDS.ARM9Target) break; + if (NDS.DMA9Timestamp-1 >= NDS.ARM9Target) break; } } else @@ -613,7 +614,7 @@ void DMA::Run9() } Running = 2; - NDS.ARM9Timestamp += (UnitTimings9_32(burststart) << NDS.ARM9ClockShift); + NDS.DMA9Timestamp += (UnitTimings9_32(burststart) << NDS.ARM9ClockShift); burststart -= 1; NDS.ARM9Write32(CurDstAddr, NDS.ARM9Read32(CurSrcAddr)); @@ -623,10 +624,12 @@ void DMA::Run9() IterCount--; RemCount--; - if (NDS.ARM9Timestamp >= NDS.ARM9Target) break; + if (NDS.DMA9Timestamp-1 >= NDS.ARM9Target) break; } } + NDS.DMA9Timestamp -= 1; + if (burststart == 0) Running = 1; Executing = false; diff --git a/src/NDS.cpp b/src/NDS.cpp index 2f97a9db..5d6badef 100644 --- a/src/NDS.cpp +++ b/src/NDS.cpp @@ 
-470,7 +470,7 @@ void NDS::Reset() // unitialised on the first run ARM9.CP15Reset(); - ARM9Timestamp = 0; ARM9Target = 0; + ARM9Timestamp = 0; DMA9Timestamp = 0; ARM9Target = 0; ARM7Timestamp = 0; ARM7Target = 0; MainRAMTimestamp = 0; A9ContentionTS = 0; ConTSLock = false; @@ -1127,7 +1127,7 @@ void NDS::MainRAMHandleARM9() burststart -= 1; if (burststart <= 0) dma->Running = 1; else dma->Running = 2; - ARM9Timestamp = A9ContentionTS << ARM9ClockShift; + DMA9Timestamp = (A9ContentionTS << ARM9ClockShift) - 1; memset(&ARM9.MRTrack, 0, sizeof(ARM9.MRTrack)); ConTSLock = false; if (dma->RemCount) @@ -1229,7 +1229,7 @@ void NDS::MainRAMHandleARM9() burststart -= 1; if (burststart <= 0) Running = 1; else dma->Running = 2; - ARM9Timestamp = A9ContentionTS << ARM9ClockShift; + DMA9Timestamp = (A9ContentionTS << ARM9ClockShift) - 1; memset(&ARM9.MRTrack, 0, sizeof(ARM9.MRTrack)); ConTSLock = false; if (dma->RemCount) @@ -1416,6 +1416,8 @@ bool NDS::MainRAMHandle() if (ARM9.MRTrack.Type > MainRAMType::WriteBufferCmds) A9ContentionTS = (ARM9.WBTimestamp + ((1<> ARM9ClockShift; + else if (ARM9.MRTrack.Type == MainRAMType::DMA16 || ARM9.MRTrack.Type == MainRAMType::DMA32) + A9ContentionTS = (DMA9Timestamp + ((1<> ARM9ClockShift; else A9ContentionTS = (ARM9Timestamp + ((1<> ARM9ClockShift; } @@ -1523,7 +1525,7 @@ u32 NDS::RunFrame() ARM9Target = target << ARM9ClockShift; CurCPU = 0; - while (ARM9Timestamp < ARM9Target) + while (std::max(ARM9Timestamp, DMA9Timestamp) < ARM9Target) { if (ARM9.MRTrack.Type == MainRAMType::Null) { @@ -1532,11 +1534,10 @@ u32 NDS::RunFrame() // GXFIFO stall s32 cycles = GPU.GPU3D.CyclesToRunFor(); - ARM9Timestamp = std::min(ARM9Target, ARM9Timestamp+(cycles<(*this); dsi.RunNDMAs(0); } - ts = ARM9Timestamp - ts; - for (int i = 0; i < 7; i++) - { - ARM9.ICacheStreamTimes[i] += ts; - ARM9.DCacheStreamTimes[i] += ts; - } - ARM9.WBTimestamp += ts; } else { diff --git a/src/NDS.h b/src/NDS.h index b0b5a911..939a7a67 100644 --- a/src/NDS.h +++ b/src/NDS.h 
@@ -255,7 +255,7 @@ public: // TODO: Encapsulate the rest of these members bool LagFrameFlag; // no need to worry about those overflowing, they can keep going for atleast 4350 years - u64 ARM9Timestamp, ARM9Target; + u64 ARM9Timestamp, DMA9Timestamp, ARM9Target; u64 ARM7Timestamp, ARM7Target; u64 MainRAMTimestamp; u64 A9ContentionTS; bool ConTSLock; From 642f085975ba4ec934a27e7f8813c3cfdae35aef Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Fri, 13 Dec 2024 13:15:38 -0500 Subject: [PATCH 262/306] probably unborks gxfifo stalls --- src/GPU3D.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/GPU3D.cpp b/src/GPU3D.cpp index 4a1426aa..953a2343 100644 --- a/src/GPU3D.cpp +++ b/src/GPU3D.cpp @@ -2378,13 +2378,13 @@ void GPU3D::Run() noexcept if (!GeometryEnabled || FlushRequest || (CmdPIPE.IsEmpty() && !(GXStat & (1<<27)))) { - Timestamp = NDS.ARM9Timestamp >> NDS.ARM9ClockShift; + Timestamp = std::max(NDS.ARM9Timestamp, NDS.DMA9Timestamp) >> NDS.ARM9ClockShift; return; } - s32 cycles = (NDS.ARM9Timestamp >> NDS.ARM9ClockShift) - Timestamp; + s32 cycles = (std::max(NDS.ARM9Timestamp, NDS.DMA9Timestamp) >> NDS.ARM9ClockShift) - Timestamp; CycleCount -= cycles; - Timestamp = NDS.ARM9Timestamp >> NDS.ARM9ClockShift; + Timestamp = std::max(NDS.ARM9Timestamp, NDS.DMA9Timestamp) >> NDS.ARM9ClockShift; if (CycleCount <= 0) { From 456d07da03b01e20a504d455feb1a909a87d16bd Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Fri, 13 Dec 2024 15:35:44 -0500 Subject: [PATCH 263/306] unbork gxfifo stalls --- src/NDS.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/NDS.cpp b/src/NDS.cpp index 5d6badef..bc094843 100644 --- a/src/NDS.cpp +++ b/src/NDS.cpp @@ -1533,8 +1533,7 @@ u32 NDS::RunFrame() { // GXFIFO stall s32 cycles = GPU.GPU3D.CyclesToRunFor(); - - DMA9Timestamp = std::min(ARM9Target, std::min(ARM9Timestamp+(cycles< Date: Fri, 13 Dec 2024 
21:32:15 -0500 Subject: [PATCH 264/306] probably not any faster --- src/ARM.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/ARM.h b/src/ARM.h index 413510f8..21c06813 100644 --- a/src/ARM.h +++ b/src/ARM.h @@ -841,9 +841,9 @@ public: bool (*GetMemRegion)(u32 addr, bool write, MemRegion* region); - alignas(64) void (ARMv5::*FuncQueue[32])(void); - void (ARMv5::*DelayedQueue)(void); // adding more than one new entry to the queue while it's already active does not work. so uh. we use this to work around that. it's less than ideal... + alignas(64) void (ARMv5::*DelayedQueue)(void); // adding more than one new entry to the queue while it's already active does not work. so uh. we use this to work around that. it's less than ideal... void (ARMv5::*StartExec)(void); + void (ARMv5::*FuncQueue[32])(void); u64 ITCMTimestamp; u64 TimestampMemory; u32 PC; @@ -912,9 +912,9 @@ public: template void Execute(); - + + alignas(64) void (ARMv4::*StartExec)(void); void (ARMv4::*FuncQueue[32])(void); - void (ARMv4::*StartExec)(void); bool Nonseq; void CodeRead16(u32 addr); From a445c0d32c43df9df31a342012f0209efe506567 Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Fri, 13 Dec 2024 22:08:34 -0500 Subject: [PATCH 265/306] this makes a bit more sense --- src/CP15.cpp | 90 +++++++++++++++++++++++----------------------------- 1 file changed, 40 insertions(+), 50 deletions(-) diff --git a/src/CP15.cpp b/src/CP15.cpp index e89275c8..bb7320cb 100644 --- a/src/CP15.cpp +++ b/src/CP15.cpp @@ -2214,15 +2214,14 @@ void ARMv5::CodeRead32(u32 addr) #endif } + if (NDS.ARM9Timestamp < NDS.DMA9Timestamp) NDS.ARM9Timestamp = NDS.DMA9Timestamp; // bus reads can only overlap with dcache streaming by 6 cycles if (DCacheStreamPtr < 7) { - if (NDS.ARM9Timestamp < NDS.DMA9Timestamp) NDS.ARM9Timestamp = NDS.DMA9Timestamp; u64 time = DCacheStreamTimes[6] - 6; // checkme: minus 6? 
if (NDS.ARM9Timestamp < time) NDS.ARM9Timestamp = time; } - if (NDS.ARM9Timestamp < NDS.DMA9Timestamp) NDS.ARM9Timestamp = NDS.DMA9Timestamp; if (PU_Map[addr>>12] & 0x30) WriteBufferDrain(); else @@ -2346,17 +2345,16 @@ void ARMv5::DRead8_2() } } #endif - + + if (NDS.ARM9Timestamp < NDS.DMA9Timestamp) NDS.ARM9Timestamp = NDS.DMA9Timestamp; // bus reads can only overlap with icache streaming by 6 cycles // checkme: does dcache trigger this? if (ICacheStreamPtr < 7) { - if (NDS.ARM9Timestamp < NDS.DMA9Timestamp) NDS.ARM9Timestamp = NDS.DMA9Timestamp; u64 time = ICacheStreamTimes[6] - 6; // checkme: minus 6? if (NDS.ARM9Timestamp < time) NDS.ARM9Timestamp = time; } - - if (NDS.ARM9Timestamp < NDS.DMA9Timestamp) NDS.ARM9Timestamp = NDS.DMA9Timestamp; + if (PU_Map[addr>>12] & 0x30) WriteBufferDrain(); else @@ -2462,17 +2460,16 @@ void ARMv5::DRead16_2() } } #endif - + + if (NDS.ARM9Timestamp < NDS.DMA9Timestamp) NDS.ARM9Timestamp = NDS.DMA9Timestamp; // bus reads can only overlap with icache streaming by 6 cycles // checkme: does cache trigger this? if (ICacheStreamPtr < 7) { - if (NDS.ARM9Timestamp < NDS.DMA9Timestamp) NDS.ARM9Timestamp = NDS.DMA9Timestamp; u64 time = ICacheStreamTimes[6] - 6; // checkme: minus 6? if (NDS.ARM9Timestamp < time) NDS.ARM9Timestamp = time; } - - if (NDS.ARM9Timestamp < NDS.DMA9Timestamp) NDS.ARM9Timestamp = NDS.DMA9Timestamp; + if (PU_Map[addr>>12] & 0x30) WriteBufferDrain(); else @@ -2580,16 +2577,15 @@ void ARMv5::DRead32_2() } #endif + if (NDS.ARM9Timestamp < NDS.DMA9Timestamp) NDS.ARM9Timestamp = NDS.DMA9Timestamp; // bus reads can only overlap with icache streaming by 6 cycles // checkme: does cache trigger this? if (ICacheStreamPtr < 7) { - if (NDS.ARM9Timestamp < NDS.DMA9Timestamp) NDS.ARM9Timestamp = NDS.DMA9Timestamp; u64 time = ICacheStreamTimes[6] - 6; // checkme: minus 6? 
if (NDS.ARM9Timestamp < time) NDS.ARM9Timestamp = time; } - if (NDS.ARM9Timestamp < NDS.DMA9Timestamp) NDS.ARM9Timestamp = NDS.DMA9Timestamp; if (PU_Map[addr>>12] & 0x30) WriteBufferDrain(); else @@ -2683,16 +2679,15 @@ void ARMv5::DRead32S_2() } #endif + if (NDS.ARM9Timestamp < NDS.DMA9Timestamp) NDS.ARM9Timestamp = NDS.DMA9Timestamp; // bus reads can only overlap with icache streaming by 6 cycles // checkme: does cache trigger this? if (ICacheStreamPtr < 7) { - if (NDS.ARM9Timestamp < NDS.DMA9Timestamp) NDS.ARM9Timestamp = NDS.DMA9Timestamp; u64 time = ICacheStreamTimes[6] - 6; // checkme: minus 6? if (NDS.ARM9Timestamp < time) NDS.ARM9Timestamp = time; } - if (NDS.ARM9Timestamp < NDS.DMA9Timestamp) NDS.ARM9Timestamp = NDS.DMA9Timestamp; if (PU_Map[addr>>12] & 0x30) // checkme WriteBufferDrain(); else @@ -2817,19 +2812,18 @@ void ARMv5::DWrite8_2() } } #endif - - // bus reads can only overlap with icache streaming by 6 cycles - // checkme: does cache trigger this? - if (ICacheStreamPtr < 7) - { - if (NDS.ARM9Timestamp < NDS.DMA9Timestamp) NDS.ARM9Timestamp = NDS.DMA9Timestamp; - u64 time = ICacheStreamTimes[6] - 6; // checkme: minus 6? - if (NDS.ARM9Timestamp < time) NDS.ARM9Timestamp = time; - } if (!(PU_Map[addr>>12] & (0x30))) { if (NDS.ARM9Timestamp < NDS.DMA9Timestamp) NDS.ARM9Timestamp = NDS.DMA9Timestamp; + // bus reads can only overlap with icache streaming by 6 cycles + // checkme: do buffered writes trigger this? + if (ICacheStreamPtr < 7) + { + u64 time = ICacheStreamTimes[6] - 6; // checkme: minus 6? + if (NDS.ARM9Timestamp < time) NDS.ARM9Timestamp = time; + } + WriteBufferCheck<2>(); QueueFunction(&ARMv5::DWrite8_3); } @@ -2932,19 +2926,18 @@ void ARMv5::DWrite16_2() } } #endif - - // bus reads can only overlap with icache streaming by 6 cycles - // checkme: does cache trigger this? 
- if (ICacheStreamPtr < 7) - { - if (NDS.ARM9Timestamp < NDS.DMA9Timestamp) NDS.ARM9Timestamp = NDS.DMA9Timestamp; - u64 time = ICacheStreamTimes[6] - 6; // checkme: minus 6? - if (NDS.ARM9Timestamp < time) NDS.ARM9Timestamp = time; - } if (!(PU_Map[addr>>12] & 0x30)) { if (NDS.ARM9Timestamp < NDS.DMA9Timestamp) NDS.ARM9Timestamp = NDS.DMA9Timestamp; + // bus reads can only overlap with icache streaming by 6 cycles + // checkme: do buffered writes trigger this? + if (ICacheStreamPtr < 7) + { + u64 time = ICacheStreamTimes[6] - 6; // checkme: minus 6? + if (NDS.ARM9Timestamp < time) NDS.ARM9Timestamp = time; + } + WriteBufferCheck<2>(); QueueFunction(&ARMv5::DWrite16_3); } @@ -3052,19 +3045,18 @@ void ARMv5::DWrite32_2() } } #endif - - // bus reads can only overlap with icache streaming by 6 cycles - // checkme: does cache trigger this? - if (ICacheStreamPtr < 7) - { - if (NDS.ARM9Timestamp < NDS.DMA9Timestamp) NDS.ARM9Timestamp = NDS.DMA9Timestamp; - u64 time = ICacheStreamTimes[6] - 6; // checkme: minus 6? - if (NDS.ARM9Timestamp < time) NDS.ARM9Timestamp = time; - } if (!(PU_Map[addr>>12] & 0x30)) { if (NDS.ARM9Timestamp < NDS.DMA9Timestamp) NDS.ARM9Timestamp = NDS.DMA9Timestamp; + // bus reads can only overlap with icache streaming by 6 cycles + // checkme: do buffered writes trigger this? + if (ICacheStreamPtr < 7) + { + u64 time = ICacheStreamTimes[6] - 6; // checkme: minus 6? + if (NDS.ARM9Timestamp < time) NDS.ARM9Timestamp = time; + } + WriteBufferCheck<2>(); QueueFunction(&ARMv5::DWrite32_3); } @@ -3167,19 +3159,17 @@ void ARMv5::DWrite32S_2() } } #endif - - // bus reads can only overlap with icache streaming by 6 cycles - // checkme: does cache trigger this? - if (ICacheStreamPtr < 7) - { - if (NDS.ARM9Timestamp < NDS.DMA9Timestamp) NDS.ARM9Timestamp = NDS.DMA9Timestamp; - u64 time = ICacheStreamTimes[6] - 6; // checkme: minus 6? 
- if (NDS.ARM9Timestamp < time) NDS.ARM9Timestamp = time; - } if (!(PU_Map[addr>>12] & 0x30)) // non-bufferable { if (NDS.ARM9Timestamp < NDS.DMA9Timestamp) NDS.ARM9Timestamp = NDS.DMA9Timestamp; + // bus reads can only overlap with icache streaming by 6 cycles + // checkme: do buffered writes trigger this? + if (ICacheStreamPtr < 7) + { + u64 time = ICacheStreamTimes[6] - 6; // checkme: minus 6? + if (NDS.ARM9Timestamp < time) NDS.ARM9Timestamp = time; + } WriteBufferCheck<2>(); QueueFunction(&ARMv5::DWrite32S_3); } From ac1d790d7e046dcb74ec42a10c4cd19876cc9d11 Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Sat, 14 Dec 2024 00:15:17 -0500 Subject: [PATCH 266/306] fix the system timestamp being run wayyyy too fast oh no that was covering up SO many bugs hhhhsdfghhg --- src/NDS.cpp | 131 ++++++++++++++++++++++++++-------------------------- 1 file changed, 65 insertions(+), 66 deletions(-) diff --git a/src/NDS.cpp b/src/NDS.cpp index bc094843..ad3fcf01 100644 --- a/src/NDS.cpp +++ b/src/NDS.cpp @@ -1523,79 +1523,83 @@ u32 NDS::RunFrame() u64 target = NextTarget(); ARM9Target = target << ARM9ClockShift; + ARM7Target = target; CurCPU = 0; - while (std::max(ARM9Timestamp, DMA9Timestamp) < ARM9Target) + while ((std::max(ARM9Timestamp, DMA9Timestamp) < ARM9Target) && (ARM7Timestamp < ARM7Target)) { - if (ARM9.MRTrack.Type == MainRAMType::Null) + while (std::max(ARM9Timestamp, DMA9Timestamp) < ARM9Target) { - if (CPUStop & CPUStop_GXStall) + if (ARM9.MRTrack.Type == MainRAMType::Null) { - // GXFIFO stall - s32 cycles = GPU.GPU3D.CyclesToRunFor(); - DMA9Timestamp = std::min(ARM9Target, std::max(ARM9Timestamp+(cycles<(*this); - dsi.RunNDMAs(0); + // GXFIFO stall + s32 cycles = GPU.GPU3D.CyclesToRunFor(); + DMA9Timestamp = std::min(ARM9Target, std::max(ARM9Timestamp+(cycles<(*this); + dsi.RunNDMAs(0); + } + } + else + { + //if (ARM9.abt) ARM9Timestamp = ARM9Target; + ARM9.Execute(); } } - else - { - //if (ARM9.abt) ARM9Timestamp = 
ARM9Target; - ARM9.Execute(); - } - } - //printf("MAIN LOOP: 9 %lli %08X %08llX %i 7 %lli %08X %08llX %i %i %08X\n", ARM9Timestamp>>ARM9ClockShift, ARM9.PC, ARM9.CurInstr, (u8)ARM9.MRTrack.Type, ARM7Timestamp, ARM7.R[15], ARM7.CurInstr, (u8)ARM7.MRTrack.Type, IME[1], IE[1]); + //printf("MAIN LOOP: 9 %lli %08X %08llX %i 7 %lli %08X %08llX %i %i %08X\n", ARM9Timestamp>>ARM9ClockShift, ARM9.PC, ARM9.CurInstr, (u8)ARM9.MRTrack.Type, ARM7Timestamp, ARM7.R[15], ARM7.CurInstr, (u8)ARM7.MRTrack.Type, IME[1], IE[1]); - RunTimers(0); - GPU.GPU3D.Run(); + RunTimers(0); + GPU.GPU3D.Run(); - if (MainRAMHandle()) break; + if (MainRAMHandle()) break; - } - - ARM7Target = target; - CurCPU = 1; - - while (ARM7Timestamp < ARM7Target) - { - //printf("A7 LOOP: %lli %lli\n", ARM9Timestamp>>ARM9ClockShift, ARM7Timestamp); - - if (ARM7.MRTrack.Type == MainRAMType::Null) - { - if (CPUStop & CPUStop_DMA7) - { - DMAs[4].Run(); - if (ARM7.MRTrack.Type == MainRAMType::Null) DMAs[5].Run(); - if (ARM7.MRTrack.Type == MainRAMType::Null) DMAs[6].Run(); - if (ARM7.MRTrack.Type == MainRAMType::Null) DMAs[7].Run(); - if (ConsoleType == 1) - { - auto& dsi = dynamic_cast(*this); - dsi.RunNDMAs(1); - } - } - else - { - //if (ARM7.abt > 16) ARM7Timestamp = ARM7Target; - ARM7.Execute(); - } } - RunTimers(1); + CurCPU = 1; - if (!MainRAMHandle()) break; + while (ARM7Timestamp < ARM7Target) + { + //printf("A7 LOOP: %lli %lli\n", ARM9Timestamp>>ARM9ClockShift, ARM7Timestamp); + + if (ARM7.MRTrack.Type == MainRAMType::Null) + { + if (CPUStop & CPUStop_DMA7) + { + DMAs[4].Run(); + if (ARM7.MRTrack.Type == MainRAMType::Null) DMAs[5].Run(); + if (ARM7.MRTrack.Type == MainRAMType::Null) DMAs[6].Run(); + if (ARM7.MRTrack.Type == MainRAMType::Null) DMAs[7].Run(); + if (ConsoleType == 1) + { + auto& dsi = dynamic_cast(*this); + dsi.RunNDMAs(1); + } + } + else + { + //if (ARM7.abt > 16) ARM7Timestamp = ARM7Target; + ARM7.Execute(); + } + } + + RunTimers(1); + + if (!MainRAMHandle()) break; + } } - RunSystem(target); 
+ + RunSystem(ARM7Target); if (CPUStop & CPUStop_Sleep) { @@ -1610,7 +1614,7 @@ u32 NDS::RunFrame() #ifdef DEBUG_CHECK_DESYNC Log(LogLevel::Debug, "[%08X%08X] ARM9=%ld, ARM7=%ld, GPU=%ld\n", (u32)(SysTimestamp>>32), (u32)SysTimestamp, - (ARM9Timestamp>>1)-SysTimestamp, + (std::max(ARM9Timestamp,DMA9Timestamp)>>ARM9ClockShift)-SysTimestamp, ARM7Timestamp-SysTimestamp, GPU.GPU3D.Timestamp-SysTimestamp); #endif @@ -1651,15 +1655,10 @@ u32 NDS::RunFrame() void NDS::Reschedule(u64 target) { - if (CurCPU == 0) + if (target < ARM7Target) { - if (target < (ARM9Target >> ARM9ClockShift)) - ARM9Target = (target << ARM9ClockShift); - } - else - { - if (target < ARM7Target) - ARM7Target = target; + ARM7Target = target; + ARM9Target = (target << ARM9ClockShift); } } From 610ac2491a33ccbae4b5b5754a621721ded62edc Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Sat, 14 Dec 2024 09:38:02 -0500 Subject: [PATCH 267/306] disable main ram contention for arm9 dma caused innumerable issues will need a more comprehensive rewrite later --- src/DMA.cpp | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/src/DMA.cpp b/src/DMA.cpp index 4ea122f3..5abc8ae7 100644 --- a/src/DMA.cpp +++ b/src/DMA.cpp @@ -215,7 +215,7 @@ u32 DMA::UnitTimings9_16(u8 burststart) dst_n = NDS.ARM9MemTimings[dst_id][4]; dst_s = NDS.ARM9MemTimings[dst_id][5]; - /*if (src_rgn == Mem9_MainRAM) + if (src_rgn == Mem9_MainRAM) { if (dst_rgn == Mem9_MainRAM) { @@ -276,7 +276,7 @@ u32 DMA::UnitTimings9_16(u8 burststart) return ((burststart == 2) ? 
src_n : src_s) + 7; } } - else*/ if (src_rgn & dst_rgn) + else if (src_rgn & dst_rgn) { if (burststart != 1) return src_n + dst_n + (src_n == 1 || burststart <= 0); @@ -306,7 +306,7 @@ u32 DMA::UnitTimings9_32(u8 burststart) dst_n = NDS.ARM9MemTimings[dst_id][6]; dst_s = NDS.ARM9MemTimings[dst_id][7]; - /*if (src_rgn == Mem9_MainRAM) + if (src_rgn == Mem9_MainRAM) { if (dst_rgn == Mem9_MainRAM) return (burststart == 2) ? 13 : 18; @@ -369,7 +369,7 @@ u32 DMA::UnitTimings9_32(u8 burststart) return ((burststart == 2) ? src_n : src_s) + 8; } } - else*/ if (src_rgn & dst_rgn) + else if (src_rgn & dst_rgn) { if (burststart != 1) return src_n + dst_n + (src_n == 1 || burststart <= 0); @@ -579,13 +579,14 @@ void DMA::Run9() { while (IterCount > 0 && !Stall) { + /* u32 rgn = NDS.ARM9Regions[CurSrcAddr>>14] | NDS.ARM9Regions[CurDstAddr>>14]; if (rgn & Mem9_MainRAM) { NDS.ARM9.MRTrack.Type = MainRAMType::DMA16; NDS.ARM9.MRTrack.Var = Num; return; - } + }*/ Running = 2; NDS.DMA9Timestamp += (UnitTimings9_16(burststart) << NDS.ARM9ClockShift); @@ -605,13 +606,14 @@ void DMA::Run9() { while (IterCount > 0 && !Stall) { + /* u32 rgn = NDS.ARM9Regions[CurSrcAddr>>14] | NDS.ARM9Regions[CurDstAddr>>14]; if (rgn & Mem9_MainRAM) { NDS.ARM9.MRTrack.Type = MainRAMType::DMA32; NDS.ARM9.MRTrack.Var = Num; return; - } + }*/ Running = 2; NDS.DMA9Timestamp += (UnitTimings9_32(burststart) << NDS.ARM9ClockShift); From 5e945669f5fae4ca0a7cf1ef765799055e71a8fd Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Sat, 14 Dec 2024 10:45:08 -0500 Subject: [PATCH 268/306] hopefully reduce desync potential a little? 
--- src/NDS.cpp | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/src/NDS.cpp b/src/NDS.cpp index ad3fcf01..0a842663 100644 --- a/src/NDS.cpp +++ b/src/NDS.cpp @@ -1524,12 +1524,15 @@ u32 NDS::RunFrame() ARM9Target = target << ARM9ClockShift; ARM7Target = target; - CurCPU = 0; while ((std::max(ARM9Timestamp, DMA9Timestamp) < ARM9Target) && (ARM7Timestamp < ARM7Target)) { + CurCPU = 0; while (std::max(ARM9Timestamp, DMA9Timestamp) < ARM9Target) { + RunTimers(0); + GPU.GPU3D.Run(); + if (ARM9.MRTrack.Type == MainRAMType::Null) { if (CPUStop & CPUStop_GXStall) @@ -1556,14 +1559,13 @@ u32 NDS::RunFrame() ARM9.Execute(); } } - + //printf("MAIN LOOP: 9 %lli %08X %08llX %i 7 %lli %08X %08llX %i %i %08X\n", ARM9Timestamp>>ARM9ClockShift, ARM9.PC, ARM9.CurInstr, (u8)ARM9.MRTrack.Type, ARM7Timestamp, ARM7.R[15], ARM7.CurInstr, (u8)ARM7.MRTrack.Type, IME[1], IE[1]); - + RunTimers(0); GPU.GPU3D.Run(); if (MainRAMHandle()) break; - } CurCPU = 1; @@ -1572,6 +1574,8 @@ u32 NDS::RunFrame() { //printf("A7 LOOP: %lli %lli\n", ARM9Timestamp>>ARM9ClockShift, ARM7Timestamp); + RunTimers(1); + if (ARM7.MRTrack.Type == MainRAMType::Null) { if (CPUStop & CPUStop_DMA7) @@ -1614,7 +1618,7 @@ u32 NDS::RunFrame() #ifdef DEBUG_CHECK_DESYNC Log(LogLevel::Debug, "[%08X%08X] ARM9=%ld, ARM7=%ld, GPU=%ld\n", (u32)(SysTimestamp>>32), (u32)SysTimestamp, - (std::max(ARM9Timestamp,DMA9Timestamp)>>ARM9ClockShift)-SysTimestamp, + std::max(std::max(ARM9Timestamp,DMA9Timestamp)>>ARM9ClockShift, A9ContentionTS)-SysTimestamp, ARM7Timestamp-SysTimestamp, GPU.GPU3D.Timestamp-SysTimestamp); #endif From 4ea0e60e185f21db88596ed29c6104a44d838c75 Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Sat, 14 Dec 2024 13:32:54 -0500 Subject: [PATCH 269/306] minor fix(?) this should fix something? 
--- src/NDS.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/NDS.cpp b/src/NDS.cpp index 0a842663..b8e4b51a 100644 --- a/src/NDS.cpp +++ b/src/NDS.cpp @@ -914,6 +914,7 @@ void NDS::RunSystemSleep(u64 timestamp) void NDS::MainRAMHandleARM9() { + CurCPU = 0; switch (ARM9.MRTrack.Type) { default: @@ -1348,6 +1349,7 @@ void NDS::MainRAMHandleARM9() void NDS::MainRAMHandleARM7() { + CurCPU = 1; switch (ARM7.MRTrack.Type) { default: @@ -1527,9 +1529,9 @@ u32 NDS::RunFrame() while ((std::max(ARM9Timestamp, DMA9Timestamp) < ARM9Target) && (ARM7Timestamp < ARM7Target)) { - CurCPU = 0; while (std::max(ARM9Timestamp, DMA9Timestamp) < ARM9Target) { + CurCPU = 0; RunTimers(0); GPU.GPU3D.Run(); @@ -1568,12 +1570,10 @@ u32 NDS::RunFrame() if (MainRAMHandle()) break; } - CurCPU = 1; - while (ARM7Timestamp < ARM7Target) { //printf("A7 LOOP: %lli %lli\n", ARM9Timestamp>>ARM9ClockShift, ARM7Timestamp); - + CurCPU = 1; RunTimers(1); if (ARM7.MRTrack.Type == MainRAMType::Null) From 2051d412d10a5f4f1c193605683627eeb6b9f9f4 Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Sat, 14 Dec 2024 17:15:06 -0500 Subject: [PATCH 270/306] implement MR cont. for arm7 dma; also a hack? 
the hack is to make arm9 dma contention work with prior improvements to synchronization --- src/DMA.cpp | 23 ++++-- src/NDS.cpp | 200 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 218 insertions(+), 5 deletions(-) diff --git a/src/DMA.cpp b/src/DMA.cpp index 5abc8ae7..650eec63 100644 --- a/src/DMA.cpp +++ b/src/DMA.cpp @@ -579,14 +579,13 @@ void DMA::Run9() { while (IterCount > 0 && !Stall) { - /* u32 rgn = NDS.ARM9Regions[CurSrcAddr>>14] | NDS.ARM9Regions[CurDstAddr>>14]; if (rgn & Mem9_MainRAM) { NDS.ARM9.MRTrack.Type = MainRAMType::DMA16; NDS.ARM9.MRTrack.Var = Num; return; - }*/ + } Running = 2; NDS.DMA9Timestamp += (UnitTimings9_16(burststart) << NDS.ARM9ClockShift); @@ -606,14 +605,13 @@ void DMA::Run9() { while (IterCount > 0 && !Stall) { - /* u32 rgn = NDS.ARM9Regions[CurSrcAddr>>14] | NDS.ARM9Regions[CurDstAddr>>14]; if (rgn & Mem9_MainRAM) { NDS.ARM9.MRTrack.Type = MainRAMType::DMA32; NDS.ARM9.MRTrack.Var = Num; return; - }*/ + } Running = 2; NDS.DMA9Timestamp += (UnitTimings9_32(burststart) << NDS.ARM9ClockShift); @@ -670,12 +668,19 @@ void DMA::Run7() // add NS penalty for first accesses in burst bool burststart = (Running == 2); - Running = 1; if (!(Cnt & (1<<26))) { while (IterCount > 0 && !Stall) { + u32 rgn = NDS.ARM7Regions[CurSrcAddr>>15] | NDS.ARM7Regions[CurDstAddr>>15]; + if (rgn & Mem7_MainRAM) + { + NDS.ARM7.MRTrack.Type = MainRAMType::DMA16; + NDS.ARM7.MRTrack.Var = Num; + return; + } + Running = 1; NDS.ARM7Timestamp += UnitTimings7_16(burststart); burststart = false; @@ -693,6 +698,14 @@ void DMA::Run7() { while (IterCount > 0 && !Stall) { + u32 rgn = NDS.ARM7Regions[CurSrcAddr>>15] | NDS.ARM7Regions[CurDstAddr>>15]; + if (rgn & Mem7_MainRAM) + { + NDS.ARM7.MRTrack.Type = MainRAMType::DMA32; + NDS.ARM7.MRTrack.Var = Num; + return; + } + Running = 1; NDS.ARM7Timestamp += UnitTimings7_32(burststart); burststart = false; diff --git a/src/NDS.cpp b/src/NDS.cpp index b8e4b51a..4842d482 100644 --- a/src/NDS.cpp +++ 
b/src/NDS.cpp @@ -1407,6 +1407,204 @@ void NDS::MainRAMHandleARM7() memset(&ARM7.MRTrack, 0, sizeof(ARM7.MRTrack)); break; } + + case MainRAMType::DMA32: + { + DMA* dma = &DMAs[ARM7.MRTrack.Var]; + int burststart = dma->Running - 1; + + u32 srcaddr = dma->CurSrcAddr; + u32 srcrgn = ARM7Regions[srcaddr>>15]; + u32 dstaddr = dma->CurDstAddr; + u32 dstrgn = ARM7Regions[dstaddr>>15]; + if (!ARM7.MRTrack.Progress) + { + if (srcrgn == Mem7_MainRAM) + { + if (burststart == 2 || A9WENTLAST || DMALastWasMainRAM || dma->SrcAddrInc <= 0) + { + if (ARM7Timestamp < MainRAMTimestamp) { ARM7Timestamp = MainRAMTimestamp; if (A9PRIORITY) return; } + MainRAMTimestamp = ARM7Timestamp + 9; + ARM7Timestamp += 6; + MainRAMLastAccess = A7LAST; + } + else + { + ARM7Timestamp += 2; + MainRAMTimestamp += 2; + } + DMALastWasMainRAM = true; + } + else + { + if (burststart == 2 || dma->SrcAddrInc <= 0) ARM7Timestamp += ARM7MemTimings[srcaddr>>15][2] + (ARM7MemTimings[srcaddr>>15][2] == 1); + else ARM7Timestamp += ARM7MemTimings[srcaddr>>15][3]; + DMALastWasMainRAM = false; + } + + DMAReadHold = ARM7Read32(srcaddr); + + ARM7.MRTrack.Progress = 1; + } + else + { + if (dstrgn == Mem7_MainRAM) + { + if (burststart == 2 || A9WENTLAST || DMALastWasMainRAM || dma->DstAddrInc <= 0) + { + if (ARM7Timestamp < MainRAMTimestamp) { ARM7Timestamp = MainRAMTimestamp; if (A9PRIORITY) return; } + MainRAMTimestamp = ARM7Timestamp + 9; + ARM7Timestamp += 4; + MainRAMLastAccess = A7LAST; + } + else + { + ARM7Timestamp += 2; + MainRAMTimestamp += 2; + } + DMALastWasMainRAM = true; + } + else + { + if (burststart == 2 || dma->DstAddrInc <= 0) ARM7Timestamp += ARM7MemTimings[dstaddr>>15][2]; + else ARM7Timestamp += ARM7MemTimings[dstaddr>>15][3] - (burststart <= 0); + DMALastWasMainRAM = false; + } + + ARM7Write32(dstaddr, DMAReadHold); + + dma->CurSrcAddr += dma->SrcAddrInc<<2; + dma->CurDstAddr += dma->DstAddrInc<<2; + dma->IterCount--; + dma->RemCount--; + burststart -= 1; + if (burststart <= 0) dma->Running = 1; 
+ else dma->Running = 2; + //DMA7Timestamp = ARM7Timestamp; + memset(&ARM7.MRTrack, 0, sizeof(ARM7.MRTrack)); + ConTSLock = false; + if (dma->RemCount) + { + if (dma->IterCount == 0) + { + dma->Running = 0; + ResumeCPU(1, 1<Num); + } + + break; + } + + if (!(dma->Cnt & (1<<25))) + dma->Cnt &= ~(1<<31); + + if (dma->Cnt & (1<<30)) + SetIRQ(1, IRQ_DMA0 + dma->Num); + + dma->Running = 0; + dma->InProgress = false; + ResumeCPU(1, 1<Num); + } + break; + } + + case MainRAMType::DMA16: + { + DMA* dma = &DMAs[ARM7.MRTrack.Var]; + int burststart = dma->Running - 1; + + u32 srcaddr = dma->CurSrcAddr; + u32 srcrgn = ARM7Regions[srcaddr>>15]; + u32 dstaddr = dma->CurDstAddr; + u32 dstrgn = ARM7Regions[dstaddr>>15]; + if (!ARM7.MRTrack.Progress) + { + if (srcrgn == Mem7_MainRAM) + { + if (burststart == 2 || A9WENTLAST || DMALastWasMainRAM || dma->SrcAddrInc <= 0) + { + if (ARM7Timestamp < MainRAMTimestamp) { ARM7Timestamp = MainRAMTimestamp; if (A9PRIORITY) return; } + MainRAMTimestamp = ARM7Timestamp + 8; + ARM7Timestamp += 5; + MainRAMLastAccess = A7LAST; + } + else + { + ARM7Timestamp += 1; + MainRAMTimestamp += 1; + } + DMALastWasMainRAM = true; + } + else + { + if (burststart == 2 || dma->SrcAddrInc <= 0) ARM7Timestamp += ARM7MemTimings[srcaddr>>15][0] + (ARM7MemTimings[srcaddr>>15][0] == 1); + else ARM7Timestamp += ARM7MemTimings[srcaddr>>15][1]; + DMALastWasMainRAM = false; + } + + DMAReadHold = ARM7Read16(srcaddr); + + ARM7.MRTrack.Progress = 1; + } + else + { + if (dstrgn == Mem7_MainRAM) + { + if (burststart == 2 || A9WENTLAST || DMALastWasMainRAM || dma->DstAddrInc <= 0) + { + if (ARM7Timestamp < MainRAMTimestamp) { ARM7Timestamp = MainRAMTimestamp; if (A9PRIORITY) return; } + MainRAMTimestamp = ARM7Timestamp + 8; + ARM7Timestamp += 3; + MainRAMLastAccess = A7LAST; + } + else + { + ARM7Timestamp += 1; + MainRAMTimestamp += 1; + } + DMALastWasMainRAM = true; + } + else + { + if (burststart == 2 || dma->DstAddrInc <= 0) ARM7Timestamp += ARM7MemTimings[dstaddr>>15][0]; 
+ else ARM7Timestamp += ARM7MemTimings[dstaddr>>15][1] - (burststart <= 0); + DMALastWasMainRAM = false; + } + + ARM7Write16(dstaddr, DMAReadHold); + + dma->CurSrcAddr += dma->SrcAddrInc<<1; + dma->CurDstAddr += dma->DstAddrInc<<1; + dma->IterCount--; + dma->RemCount--; + burststart -= 1; + if (burststart <= 0) Running = 1; + else dma->Running = 2; + //DMA9Timestamp = (A9ContentionTS << ARM9ClockShift) - 1; + memset(&ARM7.MRTrack, 0, sizeof(ARM7.MRTrack)); + ConTSLock = false; + if (dma->RemCount) + { + if (dma->IterCount == 0) + { + dma->Running = 0; + ResumeCPU(1, 1<Num); + } + + break; + } + + if (!(dma->Cnt & (1<<25))) + dma->Cnt &= ~(1<<31); + + if (dma->Cnt & (1<<30)) + SetIRQ(1, IRQ_DMA0 + dma->Num); + + dma->Running = 0; + dma->InProgress = false; + ResumeCPU(1, 1<Num); + } + break; + } } } @@ -1570,6 +1768,8 @@ u32 NDS::RunFrame() if (MainRAMHandle()) break; } + ARM7Target = target; // this line proooobably shouldn't be? but if i dont do it then several games wont work with main ram contention implemented for arm9 dmas??? 
+ while (ARM7Timestamp < ARM7Target) { //printf("A7 LOOP: %lli %lli\n", ARM9Timestamp>>ARM9ClockShift, ARM7Timestamp); From c902dcfc98ac645bedfd765e45e1fdc179ee2238 Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Sat, 14 Dec 2024 23:23:00 -0500 Subject: [PATCH 271/306] improve main ram dma timings --- src/NDS.cpp | 42 ++++++++++++++++++++++++++---------------- src/NDS.h | 4 ++-- 2 files changed, 28 insertions(+), 18 deletions(-) diff --git a/src/NDS.cpp b/src/NDS.cpp index 4842d482..547a9249 100644 --- a/src/NDS.cpp +++ b/src/NDS.cpp @@ -1069,9 +1069,10 @@ void NDS::MainRAMHandleARM9() { if (srcrgn == Mem9_MainRAM) { - if (burststart == 2 || A7WENTLAST || DMALastWasMainRAM || dma->SrcAddrInc <= 0) + if (burststart == 2 || A7WENTLAST || DMALastWasMainRAM || dma->SrcAddrInc <= 0 || ((A9ContentionTS - DMABurstStart) >= 242) || (DMABORK && ((dma->CurSrcAddr & 0x1F) == 0))) { if (A9ContentionTS < MainRAMTimestamp) { A9ContentionTS = MainRAMTimestamp; if (A7PRIORITY) return; } + DMABORK = ((dma->CurSrcAddr & 0x1F) >= 0x1A); MainRAMTimestamp = A9ContentionTS + 9; A9ContentionTS += 6; MainRAMLastAccess = A9LAST; @@ -1079,7 +1080,7 @@ void NDS::MainRAMHandleARM9() else { A9ContentionTS += 2; - MainRAMTimestamp += 2; + MainRAMTimestamp = A9ContentionTS + 3; } DMALastWasMainRAM = true; } @@ -1098,17 +1099,18 @@ void NDS::MainRAMHandleARM9() { if (dstrgn == Mem9_MainRAM) { - if (burststart == 2 || A7WENTLAST || DMALastWasMainRAM || dma->DstAddrInc <= 0) + if (burststart == 2 || A7WENTLAST || DMALastWasMainRAM || dma->DstAddrInc <= 0 || ((A9ContentionTS - DMABurstStart) >= 242)) { if (A9ContentionTS < MainRAMTimestamp) { A9ContentionTS = MainRAMTimestamp; if (A7PRIORITY) return; } MainRAMTimestamp = A9ContentionTS + 9; + DMABurstStart = A9ContentionTS; A9ContentionTS += 4; MainRAMLastAccess = A9LAST; } else { A9ContentionTS += 2; - MainRAMTimestamp += 2; + MainRAMTimestamp = A9ContentionTS + 5; } DMALastWasMainRAM = true; } @@ -1171,9 
+1173,11 @@ void NDS::MainRAMHandleARM9() { if (srcrgn == Mem9_MainRAM) { - if (burststart == 2 || A7WENTLAST || DMALastWasMainRAM || dma->SrcAddrInc <= 0) + if (burststart == 2 || A7WENTLAST || DMALastWasMainRAM || dma->SrcAddrInc <= 0 || ((A9ContentionTS - DMABurstStart) >= 242) || (DMABORK && ((dma->CurSrcAddr & 0x1F) == 0))) { if (A9ContentionTS < MainRAMTimestamp) { A9ContentionTS = MainRAMTimestamp; if (A7PRIORITY) return; } + DMABORK = ((dma->CurSrcAddr & 0x1F) >= 0x1A); + DMABurstStart = A9ContentionTS; MainRAMTimestamp = A9ContentionTS + 8; A9ContentionTS += 5; MainRAMLastAccess = A9LAST; @@ -1181,7 +1185,7 @@ void NDS::MainRAMHandleARM9() else { A9ContentionTS += 1; - MainRAMTimestamp += 1; + MainRAMTimestamp = A9ContentionTS + 3; } DMALastWasMainRAM = true; } @@ -1200,9 +1204,10 @@ void NDS::MainRAMHandleARM9() { if (dstrgn == Mem9_MainRAM) { - if (burststart == 2 || A7WENTLAST || DMALastWasMainRAM || dma->DstAddrInc <= 0) + if (burststart == 2 || A7WENTLAST || DMALastWasMainRAM || dma->DstAddrInc <= 0 || ((A9ContentionTS - DMABurstStart) >= 242)) { if (A9ContentionTS < MainRAMTimestamp) { A9ContentionTS = MainRAMTimestamp; if (A7PRIORITY) return; } + DMABurstStart = A9ContentionTS; MainRAMTimestamp = A9ContentionTS + 8; A9ContentionTS += 3; MainRAMLastAccess = A9LAST; @@ -1210,7 +1215,7 @@ void NDS::MainRAMHandleARM9() else { A9ContentionTS += 1; - MainRAMTimestamp += 1; + MainRAMTimestamp = A9ContentionTS + 5; } DMALastWasMainRAM = true; } @@ -1421,9 +1426,11 @@ void NDS::MainRAMHandleARM7() { if (srcrgn == Mem7_MainRAM) { - if (burststart == 2 || A9WENTLAST || DMALastWasMainRAM || dma->SrcAddrInc <= 0) + if (burststart == 2 || A9WENTLAST || DMALastWasMainRAM || dma->SrcAddrInc <= 0 || ((ARM7Timestamp - DMABurstStart) >= 242) || (DMABORK && ((dma->CurSrcAddr & 0x1F) == 0))) { if (ARM7Timestamp < MainRAMTimestamp) { ARM7Timestamp = MainRAMTimestamp; if (A9PRIORITY) return; } + DMABORK = ((dma->CurSrcAddr & 0x1F) >= 0x1A); + DMABurstStart = 
ARM7Timestamp; MainRAMTimestamp = ARM7Timestamp + 9; ARM7Timestamp += 6; MainRAMLastAccess = A7LAST; @@ -1431,7 +1438,7 @@ void NDS::MainRAMHandleARM7() else { ARM7Timestamp += 2; - MainRAMTimestamp += 2; + MainRAMTimestamp = ARM7Timestamp + 3; } DMALastWasMainRAM = true; } @@ -1450,9 +1457,10 @@ void NDS::MainRAMHandleARM7() { if (dstrgn == Mem7_MainRAM) { - if (burststart == 2 || A9WENTLAST || DMALastWasMainRAM || dma->DstAddrInc <= 0) + if (burststart == 2 || A9WENTLAST || DMALastWasMainRAM || dma->DstAddrInc <= 0 || ((ARM7Timestamp - DMABurstStart) >= 242)) { if (ARM7Timestamp < MainRAMTimestamp) { ARM7Timestamp = MainRAMTimestamp; if (A9PRIORITY) return; } + DMABurstStart = ARM7Timestamp; MainRAMTimestamp = ARM7Timestamp + 9; ARM7Timestamp += 4; MainRAMLastAccess = A7LAST; @@ -1460,7 +1468,7 @@ void NDS::MainRAMHandleARM7() else { ARM7Timestamp += 2; - MainRAMTimestamp += 2; + MainRAMTimestamp = ARM7Timestamp + 5; } DMALastWasMainRAM = true; } @@ -1520,9 +1528,11 @@ void NDS::MainRAMHandleARM7() { if (srcrgn == Mem7_MainRAM) { - if (burststart == 2 || A9WENTLAST || DMALastWasMainRAM || dma->SrcAddrInc <= 0) + if (burststart == 2 || A9WENTLAST || DMALastWasMainRAM || dma->SrcAddrInc <= 0 || ((ARM7Timestamp - DMABurstStart) >= 242) || (DMABORK && ((dma->CurSrcAddr & 0x1F) == 0))) { if (ARM7Timestamp < MainRAMTimestamp) { ARM7Timestamp = MainRAMTimestamp; if (A9PRIORITY) return; } + DMABORK = ((dma->CurSrcAddr & 0x1F) >= 0x1A); + DMABurstStart = ARM7Timestamp; MainRAMTimestamp = ARM7Timestamp + 8; ARM7Timestamp += 5; MainRAMLastAccess = A7LAST; @@ -1530,7 +1540,7 @@ void NDS::MainRAMHandleARM7() else { ARM7Timestamp += 1; - MainRAMTimestamp += 1; + MainRAMTimestamp = ARM7Timestamp + 3; } DMALastWasMainRAM = true; } @@ -1549,7 +1559,7 @@ void NDS::MainRAMHandleARM7() { if (dstrgn == Mem7_MainRAM) { - if (burststart == 2 || A9WENTLAST || DMALastWasMainRAM || dma->DstAddrInc <= 0) + if (burststart == 2 || A9WENTLAST || DMALastWasMainRAM || dma->DstAddrInc <= 0 || 
((ARM7Timestamp - DMABurstStart) >= 242)) { if (ARM7Timestamp < MainRAMTimestamp) { ARM7Timestamp = MainRAMTimestamp; if (A9PRIORITY) return; } MainRAMTimestamp = ARM7Timestamp + 8; @@ -1559,7 +1569,7 @@ void NDS::MainRAMHandleARM7() else { ARM7Timestamp += 1; - MainRAMTimestamp += 1; + MainRAMTimestamp = ARM7Timestamp + 5; } DMALastWasMainRAM = true; } diff --git a/src/NDS.h b/src/NDS.h index 939a7a67..5bae52b9 100644 --- a/src/NDS.h +++ b/src/NDS.h @@ -257,7 +257,7 @@ public: // TODO: Encapsulate the rest of these members // no need to worry about those overflowing, they can keep going for atleast 4350 years u64 ARM9Timestamp, DMA9Timestamp, ARM9Target; u64 ARM7Timestamp, ARM7Target; - u64 MainRAMTimestamp; + u64 MainRAMTimestamp, DMABurstStart; u64 A9ContentionTS; bool ConTSLock; u32 ARM9ClockShift; @@ -276,7 +276,7 @@ public: // TODO: Encapsulate the rest of these members alignas(u32) u8 ROMSeed0[2*8]; alignas(u32) u8 ROMSeed1[2*8]; - u32 DMAReadHold; + u32 DMAReadHold; bool DMABORK; bool MainRAMLastAccess; // 0 == ARM9 | 1 == ARM7 bool DMALastWasMainRAM; From d912429d8c98ff12d235d7e7fe6dbcccb65c1d2f Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Sat, 14 Dec 2024 23:36:28 -0500 Subject: [PATCH 272/306] comment out some replaced stuff --- src/DMA.cpp | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/src/DMA.cpp b/src/DMA.cpp index 650eec63..4b494248 100644 --- a/src/DMA.cpp +++ b/src/DMA.cpp @@ -215,7 +215,7 @@ u32 DMA::UnitTimings9_16(u8 burststart) dst_n = NDS.ARM9MemTimings[dst_id][4]; dst_s = NDS.ARM9MemTimings[dst_id][5]; - if (src_rgn == Mem9_MainRAM) + /*if (src_rgn == Mem9_MainRAM) { if (dst_rgn == Mem9_MainRAM) { @@ -276,7 +276,7 @@ u32 DMA::UnitTimings9_16(u8 burststart) return ((burststart == 2) ? 
src_n : src_s) + 7; } } - else if (src_rgn & dst_rgn) + else*/ if (src_rgn & dst_rgn) { if (burststart != 1) return src_n + dst_n + (src_n == 1 || burststart <= 0); @@ -306,7 +306,7 @@ u32 DMA::UnitTimings9_32(u8 burststart) dst_n = NDS.ARM9MemTimings[dst_id][6]; dst_s = NDS.ARM9MemTimings[dst_id][7]; - if (src_rgn == Mem9_MainRAM) + /*if (src_rgn == Mem9_MainRAM) { if (dst_rgn == Mem9_MainRAM) return (burststart == 2) ? 13 : 18; @@ -369,7 +369,7 @@ u32 DMA::UnitTimings9_32(u8 burststart) return ((burststart == 2) ? src_n : src_s) + 8; } } - else if (src_rgn & dst_rgn) + else*/ if (src_rgn & dst_rgn) { if (burststart != 1) return src_n + dst_n + (src_n == 1 || burststart <= 0); @@ -401,7 +401,7 @@ u32 DMA::UnitTimings7_16(bool burststart) dst_n = NDS.ARM7MemTimings[dst_id][0]; dst_s = NDS.ARM7MemTimings[dst_id][1]; - if (src_rgn == Mem7_MainRAM) + /*if (src_rgn == Mem7_MainRAM) { if (dst_rgn == Mem7_MainRAM) return 16; @@ -460,7 +460,7 @@ u32 DMA::UnitTimings7_16(bool burststart) return (burststart ? src_n : src_s) + 7; } } - else if (src_rgn & dst_rgn) + else*/ if (src_rgn & dst_rgn) { return src_n + dst_n + 1; } @@ -487,7 +487,7 @@ u32 DMA::UnitTimings7_32(bool burststart) dst_n = NDS.ARM7MemTimings[dst_id][2]; dst_s = NDS.ARM7MemTimings[dst_id][3]; - if (src_rgn == Mem7_MainRAM) + /*if (src_rgn == Mem7_MainRAM) { if (dst_rgn == Mem7_MainRAM) return 18; @@ -550,7 +550,7 @@ u32 DMA::UnitTimings7_32(bool burststart) return (burststart ? 
src_n : src_s) + 8; } } - else if (src_rgn & dst_rgn) + else*/ if (src_rgn & dst_rgn) { return src_n + dst_n + 1; } From 2247f17f4fc7d4628a81d2a13a7891a373f66a24 Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Sun, 15 Dec 2024 06:34:36 -0500 Subject: [PATCH 273/306] implement a main ram burst restart behavior --- src/NDS.cpp | 30 ++++++++++++++++-------------- src/NDS.h | 3 ++- 2 files changed, 18 insertions(+), 15 deletions(-) diff --git a/src/NDS.cpp b/src/NDS.cpp index 547a9249..afc7c029 100644 --- a/src/NDS.cpp +++ b/src/NDS.cpp @@ -926,8 +926,9 @@ void NDS::MainRAMHandleARM9() case MainRAMType::Fetch: { u8 var = ARM9.MRTrack.Var; + u32 addr = (var & MRCodeFetch) ? ARM9.FetchAddr[16] : ARM9.FetchAddr[ARM9.MRTrack.Progress]; - if ((var & MRSequential) && A9WENTLAST) + if ((var & MRSequential) && A9WENTLAST && !(MainRAMBork && ((addr & 0x1F) == 0))) { A9ContentionTS += 2; MainRAMTimestamp += 2; @@ -936,7 +937,8 @@ void NDS::MainRAMHandleARM9() else { if (A9ContentionTS < MainRAMTimestamp) { A9ContentionTS = MainRAMTimestamp; if (A7PRIORITY) return; } - + + MainRAMBork = !(var & MRWrite) && ((addr & 0x1F) >= 0x1A); MainRAMTimestamp = A9ContentionTS + ((var & MR16) ? 8 : 9); // checkme: are these correct for 8bit? if (var & MRWrite) A9ContentionTS += ((var & MR16) ? 5 : 6); // checkme: is this correct for 133mhz? 
else @@ -1069,10 +1071,10 @@ void NDS::MainRAMHandleARM9() { if (srcrgn == Mem9_MainRAM) { - if (burststart == 2 || A7WENTLAST || DMALastWasMainRAM || dma->SrcAddrInc <= 0 || ((A9ContentionTS - DMABurstStart) >= 242) || (DMABORK && ((dma->CurSrcAddr & 0x1F) == 0))) + if (burststart == 2 || A7WENTLAST || DMALastWasMainRAM || dma->SrcAddrInc <= 0 || ((A9ContentionTS - DMABurstStart) >= 242) || (MainRAMBork && ((dma->CurSrcAddr & 0x1F) == 0))) { if (A9ContentionTS < MainRAMTimestamp) { A9ContentionTS = MainRAMTimestamp; if (A7PRIORITY) return; } - DMABORK = ((dma->CurSrcAddr & 0x1F) >= 0x1A); + MainRAMBork = ((dma->CurSrcAddr & 0x1F) >= 0x1A); MainRAMTimestamp = A9ContentionTS + 9; A9ContentionTS += 6; MainRAMLastAccess = A9LAST; @@ -1173,10 +1175,10 @@ void NDS::MainRAMHandleARM9() { if (srcrgn == Mem9_MainRAM) { - if (burststart == 2 || A7WENTLAST || DMALastWasMainRAM || dma->SrcAddrInc <= 0 || ((A9ContentionTS - DMABurstStart) >= 242) || (DMABORK && ((dma->CurSrcAddr & 0x1F) == 0))) + if (burststart == 2 || A7WENTLAST || DMALastWasMainRAM || dma->SrcAddrInc <= 0 || ((A9ContentionTS - DMABurstStart) >= 242) || (MainRAMBork && ((dma->CurSrcAddr & 0x1F) == 0))) { if (A9ContentionTS < MainRAMTimestamp) { A9ContentionTS = MainRAMTimestamp; if (A7PRIORITY) return; } - DMABORK = ((dma->CurSrcAddr & 0x1F) >= 0x1A); + MainRAMBork = ((dma->CurSrcAddr & 0x1F) >= 0x1A); DMABurstStart = A9ContentionTS; MainRAMTimestamp = A9ContentionTS + 8; A9ContentionTS += 5; @@ -1366,8 +1368,9 @@ void NDS::MainRAMHandleARM7() case MainRAMType::Fetch: { u8 var = ARM7.MRTrack.Var; + u32 addr = (var & MRCodeFetch) ? ARM7.FetchAddr[16] : ARM7.FetchAddr[ARM7.MRTrack.Progress]; - if ((var & MRSequential) && A7WENTLAST) + if ((var & MRSequential) && A7WENTLAST && !(MainRAMBork && ((addr & 0x1F) == 0))) { int cycles = ((var & MR32) ? 
2 : 1); MainRAMTimestamp += cycles; @@ -1377,7 +1380,8 @@ void NDS::MainRAMHandleARM7() else { if (ARM7Timestamp < MainRAMTimestamp) { ARM7Timestamp = MainRAMTimestamp; if (A9PRIORITY) return; } - + + MainRAMBork = !(var & MRWrite) && ((addr & 0x1F) >= 0x1A); MainRAMTimestamp = ARM7Timestamp + ((var & MR16) ? 8 : 9); // checkme: are these correct for 8bit? if (var & MRWrite) ARM7Timestamp += ((var & MR16) ? 3 : 4); else ARM7Timestamp += ((var & MR16) ? 5 : 6); @@ -1386,13 +1390,11 @@ void NDS::MainRAMHandleARM7() if (var & MRCodeFetch) { - u32 addr = ARM7.FetchAddr[16]; ARM7.RetVal = ((var & MR32) ? *(u32*)&MainRAM[addr&MainRAMMask] : *(u16*)&MainRAM[addr&MainRAMMask]); } else { u8 reg = ARM7.MRTrack.Progress; - u32 addr = ARM7.FetchAddr[reg]; if (var & MRWrite) // write { u32 val = ARM7.STRVal[reg]; @@ -1426,10 +1428,10 @@ void NDS::MainRAMHandleARM7() { if (srcrgn == Mem7_MainRAM) { - if (burststart == 2 || A9WENTLAST || DMALastWasMainRAM || dma->SrcAddrInc <= 0 || ((ARM7Timestamp - DMABurstStart) >= 242) || (DMABORK && ((dma->CurSrcAddr & 0x1F) == 0))) + if (burststart == 2 || A9WENTLAST || DMALastWasMainRAM || dma->SrcAddrInc <= 0 || ((ARM7Timestamp - DMABurstStart) >= 242) || (MainRAMBork && ((dma->CurSrcAddr & 0x1F) == 0))) { if (ARM7Timestamp < MainRAMTimestamp) { ARM7Timestamp = MainRAMTimestamp; if (A9PRIORITY) return; } - DMABORK = ((dma->CurSrcAddr & 0x1F) >= 0x1A); + MainRAMBork = ((dma->CurSrcAddr & 0x1F) >= 0x1A); DMABurstStart = ARM7Timestamp; MainRAMTimestamp = ARM7Timestamp + 9; ARM7Timestamp += 6; @@ -1528,10 +1530,10 @@ void NDS::MainRAMHandleARM7() { if (srcrgn == Mem7_MainRAM) { - if (burststart == 2 || A9WENTLAST || DMALastWasMainRAM || dma->SrcAddrInc <= 0 || ((ARM7Timestamp - DMABurstStart) >= 242) || (DMABORK && ((dma->CurSrcAddr & 0x1F) == 0))) + if (burststart == 2 || A9WENTLAST || DMALastWasMainRAM || dma->SrcAddrInc <= 0 || ((ARM7Timestamp - DMABurstStart) >= 242) || (MainRAMBork && ((dma->CurSrcAddr & 0x1F) == 0))) { if (ARM7Timestamp 
< MainRAMTimestamp) { ARM7Timestamp = MainRAMTimestamp; if (A9PRIORITY) return; } - DMABORK = ((dma->CurSrcAddr & 0x1F) >= 0x1A); + MainRAMBork = ((dma->CurSrcAddr & 0x1F) >= 0x1A); DMABurstStart = ARM7Timestamp; MainRAMTimestamp = ARM7Timestamp + 8; ARM7Timestamp += 5; diff --git a/src/NDS.h b/src/NDS.h index 5bae52b9..82c21cc7 100644 --- a/src/NDS.h +++ b/src/NDS.h @@ -276,7 +276,8 @@ public: // TODO: Encapsulate the rest of these members alignas(u32) u8 ROMSeed0[2*8]; alignas(u32) u8 ROMSeed1[2*8]; - u32 DMAReadHold; bool DMABORK; + u32 DMAReadHold; + bool MainRAMBork; // if a main ram read burst starts in the last 6 bytes of a 32 byte block, and then crosses the 32 byte boundary, the burst forcibly restarts bool MainRAMLastAccess; // 0 == ARM9 | 1 == ARM7 bool DMALastWasMainRAM; From 5b07765eb5aa242bfd133baa6f0d804e144b8c7e Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Sun, 15 Dec 2024 10:24:36 -0500 Subject: [PATCH 274/306] misc tweaks to dma --- src/DMA.cpp | 44 ++++++++++++++++++++--------------- src/DMA.h | 14 +++++------- src/NDS.cpp | 66 ++++++++++++++++++++++++++++++++++------------------- src/NDS.h | 4 ++-- 4 files changed, 77 insertions(+), 51 deletions(-) diff --git a/src/DMA.cpp b/src/DMA.cpp index 4b494248..74666081 100644 --- a/src/DMA.cpp +++ b/src/DMA.cpp @@ -201,7 +201,7 @@ void DMA::Start() if (Num <= 2) NDS.DMAs[(CPU*4)+3].ResetBurst(); } -u32 DMA::UnitTimings9_16(u8 burststart) +u32 DMA::UnitTimings9_16(int burststart) { u32 src_id = CurSrcAddr >> 14; u32 dst_id = CurDstAddr >> 14; @@ -292,7 +292,7 @@ u32 DMA::UnitTimings9_16(u8 burststart) } } -u32 DMA::UnitTimings9_32(u8 burststart) +u32 DMA::UnitTimings9_32(int burststart) { u32 src_id = CurSrcAddr >> 14; u32 dst_id = CurDstAddr >> 14; @@ -387,7 +387,7 @@ u32 DMA::UnitTimings9_32(u8 burststart) // TODO: the ARM7 ones don't take into account that the two wifi regions have different timings -u32 DMA::UnitTimings7_16(bool burststart) +u32 
DMA::UnitTimings7_16(int burststart) { u32 src_id = CurSrcAddr >> 15; u32 dst_id = CurDstAddr >> 15; @@ -462,18 +462,21 @@ u32 DMA::UnitTimings7_16(bool burststart) } else*/ if (src_rgn & dst_rgn) { - return src_n + dst_n + 1; + if (burststart != 1) + return src_n + dst_n + (src_n == 1 || burststart <= 0); + else + return src_n + dst_n + (src_n != 1); } else { - if (burststart) - return src_n + dst_n; + if (burststart == 2) + return src_n + dst_n + (src_n == 1); else return src_s + dst_s; } } -u32 DMA::UnitTimings7_32(bool burststart) +u32 DMA::UnitTimings7_32(int burststart) { u32 src_id = CurSrcAddr >> 15; u32 dst_id = CurDstAddr >> 15; @@ -552,12 +555,15 @@ u32 DMA::UnitTimings7_32(bool burststart) } else*/ if (src_rgn & dst_rgn) { - return src_n + dst_n + 1; + if (burststart != 1) + return src_n + dst_n + (src_n == 1 || burststart <= 0); + else + return src_n + dst_n + (src_n != 1); } else { - if (burststart) - return src_n + dst_n; + if (burststart == 2) + return src_n + dst_n + (src_n == 1); else return src_s + dst_s; } @@ -586,7 +592,6 @@ void DMA::Run9() NDS.ARM9.MRTrack.Var = Num; return; } - Running = 2; NDS.DMA9Timestamp += (UnitTimings9_16(burststart) << NDS.ARM9ClockShift); burststart -= 1; @@ -612,7 +617,6 @@ void DMA::Run9() NDS.ARM9.MRTrack.Var = Num; return; } - Running = 2; NDS.DMA9Timestamp += (UnitTimings9_32(burststart) << NDS.ARM9ClockShift); burststart -= 1; @@ -630,7 +634,8 @@ void DMA::Run9() NDS.DMA9Timestamp -= 1; - if (burststart == 0) Running = 1; + if (burststart == 0) Running = 1; + else Running = 2; Executing = false; Stall = false; @@ -667,7 +672,7 @@ void DMA::Run7() Executing = true; // add NS penalty for first accesses in burst - bool burststart = (Running == 2); + int burststart = Running - 1; if (!(Cnt & (1<<26))) { @@ -677,10 +682,10 @@ void DMA::Run7() if (rgn & Mem7_MainRAM) { NDS.ARM7.MRTrack.Type = MainRAMType::DMA16; - NDS.ARM7.MRTrack.Var = Num; + NDS.ARM7.MRTrack.Var = Num+4; return; } - Running = 1; + NDS.ARM7Timestamp 
+= UnitTimings7_16(burststart); burststart = false; @@ -702,10 +707,10 @@ void DMA::Run7() if (rgn & Mem7_MainRAM) { NDS.ARM7.MRTrack.Type = MainRAMType::DMA32; - NDS.ARM7.MRTrack.Var = Num; + NDS.ARM7.MRTrack.Var = Num+4; return; } - Running = 1; + NDS.ARM7Timestamp += UnitTimings7_32(burststart); burststart = false; @@ -720,6 +725,9 @@ void DMA::Run7() } } + if (burststart == 0) Running = 1; + else Running = 2; + Executing = false; Stall = false; diff --git a/src/DMA.h b/src/DMA.h index 65aaff82..3ed0e02c 100644 --- a/src/DMA.h +++ b/src/DMA.h @@ -40,10 +40,10 @@ public: void WriteCnt(u32 val); void Start(); - u32 UnitTimings9_16(u8 burststart); - u32 UnitTimings9_32(u8 burststart); - u32 UnitTimings7_16(bool burststart); - u32 UnitTimings7_32(bool burststart); + u32 UnitTimings9_16(int burststart); + u32 UnitTimings9_32(int burststart); + u32 UnitTimings7_16(int burststart); + u32 UnitTimings7_32(int burststart); void Run(); void Run9(); @@ -91,6 +91,8 @@ public: bool InProgress {}; u32 Num {}; u32 StartMode {}; + bool Executing {}; + bool Stall {}; private: melonDS::NDS& NDS; @@ -98,10 +100,6 @@ private: u32 CountMask {}; - - bool Executing {}; - bool Stall {}; - bool IsGXFIFODMA {}; u32 MRAMBurstCount {}; diff --git a/src/NDS.cpp b/src/NDS.cpp index afc7c029..50a185fd 100644 --- a/src/NDS.cpp +++ b/src/NDS.cpp @@ -1071,10 +1071,11 @@ void NDS::MainRAMHandleARM9() { if (srcrgn == Mem9_MainRAM) { - if (burststart == 2 || A7WENTLAST || DMALastWasMainRAM || dma->SrcAddrInc <= 0 || ((A9ContentionTS - DMABurstStart) >= 242) || (MainRAMBork && ((dma->CurSrcAddr & 0x1F) == 0))) + if (burststart == 2 || A7WENTLAST || DMALastWasMainRAM || dma->SrcAddrInc <= 0 || ((A9ContentionTS - MainRAMBurstStart) >= 242) || (MainRAMBork && ((dma->CurSrcAddr & 0x1F) == 0))) { if (A9ContentionTS < MainRAMTimestamp) { A9ContentionTS = MainRAMTimestamp; if (A7PRIORITY) return; } MainRAMBork = ((dma->CurSrcAddr & 0x1F) >= 0x1A); + MainRAMBurstStart = A9ContentionTS; MainRAMTimestamp = 
A9ContentionTS + 9; A9ContentionTS += 6; MainRAMLastAccess = A9LAST; @@ -1093,7 +1094,7 @@ void NDS::MainRAMHandleARM9() DMALastWasMainRAM = false; } - DMAReadHold = ARM9Read32(srcaddr); + DMAReadHold[0] = ARM9Read32(srcaddr); ARM9.MRTrack.Progress = 1; } @@ -1101,11 +1102,11 @@ void NDS::MainRAMHandleARM9() { if (dstrgn == Mem9_MainRAM) { - if (burststart == 2 || A7WENTLAST || DMALastWasMainRAM || dma->DstAddrInc <= 0 || ((A9ContentionTS - DMABurstStart) >= 242)) + if (burststart == 2 || A7WENTLAST || DMALastWasMainRAM || dma->DstAddrInc <= 0 || ((A9ContentionTS - MainRAMBurstStart) >= 242)) { if (A9ContentionTS < MainRAMTimestamp) { A9ContentionTS = MainRAMTimestamp; if (A7PRIORITY) return; } MainRAMTimestamp = A9ContentionTS + 9; - DMABurstStart = A9ContentionTS; + MainRAMBurstStart = A9ContentionTS; A9ContentionTS += 4; MainRAMLastAccess = A9LAST; } @@ -1123,7 +1124,7 @@ void NDS::MainRAMHandleARM9() DMALastWasMainRAM = false; } - ARM9Write32(dstaddr, DMAReadHold); + ARM9Write32(dstaddr, DMAReadHold[0]); dma->CurSrcAddr += dma->SrcAddrInc<<2; dma->CurDstAddr += dma->DstAddrInc<<2; @@ -1132,6 +1133,10 @@ void NDS::MainRAMHandleARM9() burststart -= 1; if (burststart <= 0) dma->Running = 1; else dma->Running = 2; + + dma->Executing = false; + dma->Stall = false; + DMA9Timestamp = (A9ContentionTS << ARM9ClockShift) - 1; memset(&ARM9.MRTrack, 0, sizeof(ARM9.MRTrack)); ConTSLock = false; @@ -1175,11 +1180,11 @@ void NDS::MainRAMHandleARM9() { if (srcrgn == Mem9_MainRAM) { - if (burststart == 2 || A7WENTLAST || DMALastWasMainRAM || dma->SrcAddrInc <= 0 || ((A9ContentionTS - DMABurstStart) >= 242) || (MainRAMBork && ((dma->CurSrcAddr & 0x1F) == 0))) + if (burststart == 2 || A7WENTLAST || DMALastWasMainRAM || dma->SrcAddrInc <= 0 || ((A9ContentionTS - MainRAMBurstStart) >= 242) || (MainRAMBork && ((dma->CurSrcAddr & 0x1F) == 0))) { if (A9ContentionTS < MainRAMTimestamp) { A9ContentionTS = MainRAMTimestamp; if (A7PRIORITY) return; } MainRAMBork = ((dma->CurSrcAddr & 
0x1F) >= 0x1A); - DMABurstStart = A9ContentionTS; + MainRAMBurstStart = A9ContentionTS; MainRAMTimestamp = A9ContentionTS + 8; A9ContentionTS += 5; MainRAMLastAccess = A9LAST; @@ -1198,7 +1203,7 @@ void NDS::MainRAMHandleARM9() DMALastWasMainRAM = false; } - DMAReadHold = ARM9Read16(srcaddr); + DMAReadHold[0] = ARM9Read16(srcaddr); ARM9.MRTrack.Progress = 1; } @@ -1206,10 +1211,10 @@ void NDS::MainRAMHandleARM9() { if (dstrgn == Mem9_MainRAM) { - if (burststart == 2 || A7WENTLAST || DMALastWasMainRAM || dma->DstAddrInc <= 0 || ((A9ContentionTS - DMABurstStart) >= 242)) + if (burststart == 2 || A7WENTLAST || DMALastWasMainRAM || dma->DstAddrInc <= 0 || ((A9ContentionTS - MainRAMBurstStart) >= 242)) { if (A9ContentionTS < MainRAMTimestamp) { A9ContentionTS = MainRAMTimestamp; if (A7PRIORITY) return; } - DMABurstStart = A9ContentionTS; + MainRAMBurstStart = A9ContentionTS; MainRAMTimestamp = A9ContentionTS + 8; A9ContentionTS += 3; MainRAMLastAccess = A9LAST; @@ -1228,7 +1233,7 @@ void NDS::MainRAMHandleARM9() DMALastWasMainRAM = false; } - ARM9Write16(dstaddr, DMAReadHold); + ARM9Write16(dstaddr, DMAReadHold[0]); dma->CurSrcAddr += dma->SrcAddrInc<<1; dma->CurDstAddr += dma->DstAddrInc<<1; @@ -1237,6 +1242,10 @@ void NDS::MainRAMHandleARM9() burststart -= 1; if (burststart <= 0) Running = 1; else dma->Running = 2; + + dma->Executing = false; + dma->Stall = false; + DMA9Timestamp = (A9ContentionTS << ARM9ClockShift) - 1; memset(&ARM9.MRTrack, 0, sizeof(ARM9.MRTrack)); ConTSLock = false; @@ -1370,7 +1379,7 @@ void NDS::MainRAMHandleARM7() u8 var = ARM7.MRTrack.Var; u32 addr = (var & MRCodeFetch) ? ARM7.FetchAddr[16] : ARM7.FetchAddr[ARM7.MRTrack.Progress]; - if ((var & MRSequential) && A7WENTLAST && !(MainRAMBork && ((addr & 0x1F) == 0))) + if ((var & MRSequential) && A7WENTLAST && !(MainRAMBork && ((addr & 0x1F) == 0)) && ((ARM7Timestamp - MainRAMBurstStart) < 242)) { int cycles = ((var & MR32) ? 
2 : 1); MainRAMTimestamp += cycles; @@ -1382,6 +1391,8 @@ void NDS::MainRAMHandleARM7() if (ARM7Timestamp < MainRAMTimestamp) { ARM7Timestamp = MainRAMTimestamp; if (A9PRIORITY) return; } MainRAMBork = !(var & MRWrite) && ((addr & 0x1F) >= 0x1A); + MainRAMBurstStart = ARM7Timestamp; + MainRAMTimestamp = ARM7Timestamp + ((var & MR16) ? 8 : 9); // checkme: are these correct for 8bit? if (var & MRWrite) ARM7Timestamp += ((var & MR16) ? 3 : 4); else ARM7Timestamp += ((var & MR16) ? 5 : 6); @@ -1428,11 +1439,11 @@ void NDS::MainRAMHandleARM7() { if (srcrgn == Mem7_MainRAM) { - if (burststart == 2 || A9WENTLAST || DMALastWasMainRAM || dma->SrcAddrInc <= 0 || ((ARM7Timestamp - DMABurstStart) >= 242) || (MainRAMBork && ((dma->CurSrcAddr & 0x1F) == 0))) + if (burststart == 2 || A9WENTLAST || DMALastWasMainRAM || dma->SrcAddrInc <= 0 || ((ARM7Timestamp - MainRAMBurstStart) >= 242) || (MainRAMBork && ((dma->CurSrcAddr & 0x1F) == 0))) { if (ARM7Timestamp < MainRAMTimestamp) { ARM7Timestamp = MainRAMTimestamp; if (A9PRIORITY) return; } MainRAMBork = ((dma->CurSrcAddr & 0x1F) >= 0x1A); - DMABurstStart = ARM7Timestamp; + MainRAMBurstStart = ARM7Timestamp; MainRAMTimestamp = ARM7Timestamp + 9; ARM7Timestamp += 6; MainRAMLastAccess = A7LAST; @@ -1451,7 +1462,7 @@ void NDS::MainRAMHandleARM7() DMALastWasMainRAM = false; } - DMAReadHold = ARM7Read32(srcaddr); + DMAReadHold[1] = ARM7Read32(srcaddr); ARM7.MRTrack.Progress = 1; } @@ -1459,10 +1470,10 @@ void NDS::MainRAMHandleARM7() { if (dstrgn == Mem7_MainRAM) { - if (burststart == 2 || A9WENTLAST || DMALastWasMainRAM || dma->DstAddrInc <= 0 || ((ARM7Timestamp - DMABurstStart) >= 242)) + if (burststart == 2 || A9WENTLAST || DMALastWasMainRAM || dma->DstAddrInc <= 0 || ((ARM7Timestamp - MainRAMBurstStart) >= 242)) { if (ARM7Timestamp < MainRAMTimestamp) { ARM7Timestamp = MainRAMTimestamp; if (A9PRIORITY) return; } - DMABurstStart = ARM7Timestamp; + MainRAMBurstStart = ARM7Timestamp; MainRAMTimestamp = ARM7Timestamp + 9; ARM7Timestamp 
+= 4; MainRAMLastAccess = A7LAST; @@ -1481,7 +1492,7 @@ void NDS::MainRAMHandleARM7() DMALastWasMainRAM = false; } - ARM7Write32(dstaddr, DMAReadHold); + ARM7Write32(dstaddr, DMAReadHold[1]); dma->CurSrcAddr += dma->SrcAddrInc<<2; dma->CurDstAddr += dma->DstAddrInc<<2; @@ -1490,6 +1501,10 @@ void NDS::MainRAMHandleARM7() burststart -= 1; if (burststart <= 0) dma->Running = 1; else dma->Running = 2; + + dma->Executing = false; + dma->Stall = false; + //DMA7Timestamp = ARM7Timestamp; memset(&ARM7.MRTrack, 0, sizeof(ARM7.MRTrack)); ConTSLock = false; @@ -1530,11 +1545,11 @@ void NDS::MainRAMHandleARM7() { if (srcrgn == Mem7_MainRAM) { - if (burststart == 2 || A9WENTLAST || DMALastWasMainRAM || dma->SrcAddrInc <= 0 || ((ARM7Timestamp - DMABurstStart) >= 242) || (MainRAMBork && ((dma->CurSrcAddr & 0x1F) == 0))) + if (burststart == 2 || A9WENTLAST || DMALastWasMainRAM || dma->SrcAddrInc <= 0 || ((ARM7Timestamp - MainRAMBurstStart) >= 242) || (MainRAMBork && ((dma->CurSrcAddr & 0x1F) == 0))) { if (ARM7Timestamp < MainRAMTimestamp) { ARM7Timestamp = MainRAMTimestamp; if (A9PRIORITY) return; } MainRAMBork = ((dma->CurSrcAddr & 0x1F) >= 0x1A); - DMABurstStart = ARM7Timestamp; + MainRAMBurstStart = ARM7Timestamp; MainRAMTimestamp = ARM7Timestamp + 8; ARM7Timestamp += 5; MainRAMLastAccess = A7LAST; @@ -1553,7 +1568,7 @@ void NDS::MainRAMHandleARM7() DMALastWasMainRAM = false; } - DMAReadHold = ARM7Read16(srcaddr); + DMAReadHold[1] = ARM7Read16(srcaddr); ARM7.MRTrack.Progress = 1; } @@ -1561,9 +1576,10 @@ void NDS::MainRAMHandleARM7() { if (dstrgn == Mem7_MainRAM) { - if (burststart == 2 || A9WENTLAST || DMALastWasMainRAM || dma->DstAddrInc <= 0 || ((ARM7Timestamp - DMABurstStart) >= 242)) + if (burststart == 2 || A9WENTLAST || DMALastWasMainRAM || dma->DstAddrInc <= 0 || ((ARM7Timestamp - MainRAMBurstStart) >= 242)) { if (ARM7Timestamp < MainRAMTimestamp) { ARM7Timestamp = MainRAMTimestamp; if (A9PRIORITY) return; } + MainRAMBurstStart = ARM7Timestamp; MainRAMTimestamp = 
ARM7Timestamp + 8; ARM7Timestamp += 3; MainRAMLastAccess = A7LAST; @@ -1582,7 +1598,7 @@ void NDS::MainRAMHandleARM7() DMALastWasMainRAM = false; } - ARM7Write16(dstaddr, DMAReadHold); + ARM7Write16(dstaddr, DMAReadHold[1]); dma->CurSrcAddr += dma->SrcAddrInc<<1; dma->CurDstAddr += dma->DstAddrInc<<1; @@ -1591,6 +1607,10 @@ void NDS::MainRAMHandleARM7() burststart -= 1; if (burststart <= 0) Running = 1; else dma->Running = 2; + + dma->Executing = false; + dma->Stall = false; + //DMA9Timestamp = (A9ContentionTS << ARM9ClockShift) - 1; memset(&ARM7.MRTrack, 0, sizeof(ARM7.MRTrack)); ConTSLock = false; diff --git a/src/NDS.h b/src/NDS.h index 82c21cc7..e933d951 100644 --- a/src/NDS.h +++ b/src/NDS.h @@ -257,7 +257,7 @@ public: // TODO: Encapsulate the rest of these members // no need to worry about those overflowing, they can keep going for atleast 4350 years u64 ARM9Timestamp, DMA9Timestamp, ARM9Target; u64 ARM7Timestamp, ARM7Target; - u64 MainRAMTimestamp, DMABurstStart; + u64 MainRAMTimestamp, MainRAMBurstStart; u64 A9ContentionTS; bool ConTSLock; u32 ARM9ClockShift; @@ -276,7 +276,7 @@ public: // TODO: Encapsulate the rest of these members alignas(u32) u8 ROMSeed0[2*8]; alignas(u32) u8 ROMSeed1[2*8]; - u32 DMAReadHold; + u32 DMAReadHold[2]; bool MainRAMBork; // if a main ram read burst starts in the last 6 bytes of a 32 byte block, and then crosses the 32 byte boundary, the burst forcibly restarts bool MainRAMLastAccess; // 0 == ARM9 | 1 == ARM7 bool DMALastWasMainRAM; From db1991276a508052fc98c7d5e5d7d4e6103fe1e1 Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Sun, 15 Dec 2024 14:38:08 -0500 Subject: [PATCH 275/306] tweak scheduling to be a little less gross --- src/NDS.cpp | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/src/NDS.cpp b/src/NDS.cpp index 50a185fd..12a96cb3 100644 --- a/src/NDS.cpp +++ b/src/NDS.cpp @@ -1757,10 +1757,11 @@ u32 NDS::RunFrame() ARM9Target = target << 
ARM9ClockShift; ARM7Target = target; - while ((std::max(ARM9Timestamp, DMA9Timestamp) < ARM9Target) && (ARM7Timestamp < ARM7Target)) + while ((std::max(std::max(ARM9Timestamp, DMA9Timestamp), A9ContentionTS << ARM9ClockShift) < (target << ARM9ClockShift)) && (ARM7Timestamp < target)) { - while (std::max(ARM9Timestamp, DMA9Timestamp) < ARM9Target) + while (std::max(std::max(ARM9Timestamp, DMA9Timestamp), A9ContentionTS << ARM9ClockShift) < (target << ARM9ClockShift)) { + ARM9Target = target << ARM9ClockShift; CurCPU = 0; RunTimers(0); GPU.GPU3D.Run(); @@ -1800,10 +1801,9 @@ u32 NDS::RunFrame() if (MainRAMHandle()) break; } - ARM7Target = target; // this line proooobably shouldn't be? but if i dont do it then several games wont work with main ram contention implemented for arm9 dmas??? - - while (ARM7Timestamp < ARM7Target) + while (ARM7Timestamp < target) { + ARM7Target = target; //printf("A7 LOOP: %lli %lli\n", ARM9Timestamp>>ARM9ClockShift, ARM7Timestamp); CurCPU = 1; RunTimers(1); @@ -1835,7 +1835,7 @@ u32 NDS::RunFrame() } } - RunSystem(ARM7Target); + RunSystem(target); if (CPUStop & CPUStop_Sleep) { @@ -1891,11 +1891,13 @@ u32 NDS::RunFrame() void NDS::Reschedule(u64 target) { - if (target < ARM7Target) + if (CurCPU == 0) { - ARM7Target = target; - ARM9Target = (target << ARM9ClockShift); + if (target < (ARM9Target >> ARM9ClockShift)) + ARM9Target = (target << ARM9ClockShift); } + else if (target < ARM7Target) + ARM7Target = target; } void NDS::RegisterEventFuncs(u32 id, void* that, const std::initializer_list& funcs) From 45be951a0f164488d151d87ed6caa4b0633feba7 Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Sun, 15 Dec 2024 18:48:35 -0500 Subject: [PATCH 276/306] this should be smarter --- src/GPU3D.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/GPU3D.cpp b/src/GPU3D.cpp index 953a2343..4a1426aa 100644 --- a/src/GPU3D.cpp +++ b/src/GPU3D.cpp @@ -2378,13 +2378,13 @@ void GPU3D::Run() 
noexcept if (!GeometryEnabled || FlushRequest || (CmdPIPE.IsEmpty() && !(GXStat & (1<<27)))) { - Timestamp = std::max(NDS.ARM9Timestamp, NDS.DMA9Timestamp) >> NDS.ARM9ClockShift; + Timestamp = NDS.ARM9Timestamp >> NDS.ARM9ClockShift; return; } - s32 cycles = (std::max(NDS.ARM9Timestamp, NDS.DMA9Timestamp) >> NDS.ARM9ClockShift) - Timestamp; + s32 cycles = (NDS.ARM9Timestamp >> NDS.ARM9ClockShift) - Timestamp; CycleCount -= cycles; - Timestamp = std::max(NDS.ARM9Timestamp, NDS.DMA9Timestamp) >> NDS.ARM9ClockShift; + Timestamp = NDS.ARM9Timestamp >> NDS.ARM9ClockShift; if (CycleCount <= 0) { From 443ecb313d55c5ab9b256fa0176d560cc1b521ec Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Sun, 15 Dec 2024 19:57:04 -0500 Subject: [PATCH 277/306] improve(?) irq handling ...i dont think this fixes anything --- src/ARM.cpp | 23 +++++++++++++---------- src/ARM.h | 2 ++ src/NDS.cpp | 16 ++++++++++------ 3 files changed, 25 insertions(+), 16 deletions(-) diff --git a/src/ARM.cpp b/src/ARM.cpp index d4e49723..5e6ce172 100644 --- a/src/ARM.cpp +++ b/src/ARM.cpp @@ -160,6 +160,7 @@ void ARM::Reset() Halted = 0; DataCycles = 0; + IRQTimestamp = -1; IRQ = 0; for (int i = 0; i < 16; i++) @@ -705,7 +706,7 @@ void ARMv5::StartExecTHUMB() else NullFetch = false; PC = R[15]; - if (IRQ && !(CPSR & 0x80)) TriggerIRQ(); + if (!(CPSR & 0x80) && (NDS.ARM9Timestamp > IRQTimestamp)) TriggerIRQ(); else if (CurInstr > 0xFFFFFFFF) [[unlikely]] // handle aborted instructions { PrefetchAbort(); @@ -728,7 +729,7 @@ void ARMv5::StartExecARM() NullFetch = false; PC = R[15]; - if (IRQ && !(CPSR & 0x80)) TriggerIRQ(); + if (!(CPSR & 0x80) && (NDS.ARM9Timestamp > IRQTimestamp)) TriggerIRQ(); else if (CurInstr & ((u64)1<<63)) [[unlikely]] // handle aborted instructions { PrefetchAbort(); @@ -771,14 +772,13 @@ void ARMv5::Execute() else if (NDS.HaltInterrupted(0)) { Halted = 0; + NDS.ARM9Timestamp = IRQTimestamp; +#ifdef JIT_ENABLED if (NDS.IME[0] & 0x1) { -#ifdef 
JIT_ENABLED if constexpr (mode == CPUExecuteMode::JIT) TriggerIRQ(); - else -#endif - IRQ = 1; } +#endif } else { @@ -921,7 +921,7 @@ void ARMv4::StartExecTHUMB() CodeRead16(R[15]); QueueFunction(&ARMv4::UpdateNextInstr1); - if (IRQ && !(CPSR & 0x80)) TriggerIRQ(); + if (!(CPSR & 0x80) && (NDS.ARM7Timestamp > IRQTimestamp)) TriggerIRQ(); else { // actually execute @@ -939,7 +939,7 @@ void ARMv4::StartExecARM() CodeRead32(R[15]); QueueFunction(&ARMv4::UpdateNextInstr1); - if (IRQ && !(CPSR & 0x80)) TriggerIRQ(); + if (!(CPSR & 0x80) && (NDS.ARM7Timestamp > IRQTimestamp)) TriggerIRQ(); else if (CheckCondition(CurInstr >> 28)) // actually execute { u32 icode = ((CurInstr >> 4) & 0xF) | ((CurInstr >> 16) & 0xFF0); @@ -964,13 +964,16 @@ void ARMv4::Execute() else if (NDS.HaltInterrupted(1)) { Halted = 0; + NDS.ARM7Timestamp = IRQTimestamp; if (NDS.IME[1] & 0x1) { #ifdef JIT_ENABLED if constexpr (mode == CPUExecuteMode::JIT) TriggerIRQ(); - else #endif - IRQ = 1; + } + else + { + IRQTimestamp = -1; } } else diff --git a/src/ARM.h b/src/ARM.h index 21c06813..eabae95b 100644 --- a/src/ARM.h +++ b/src/ARM.h @@ -213,6 +213,8 @@ public: u32 StopExecution; }; + u64 IRQTimestamp; + u32 CodeRegion; s32 CodeCycles; diff --git a/src/NDS.cpp b/src/NDS.cpp index 12a96cb3..c4fb3bbe 100644 --- a/src/NDS.cpp +++ b/src/NDS.cpp @@ -1834,7 +1834,7 @@ u32 NDS::RunFrame() if (!MainRAMHandle()) break; } } - + CurCPU = 2; RunSystem(target); if (CPUStop & CPUStop_Sleep) @@ -2156,16 +2156,20 @@ void NDS::SetGBASlotTimings() void NDS::UpdateIRQ(u32 cpu) { ARM& arm = cpu ? (ARM&)ARM7 : (ARM&)ARM9; + u64 curtime = ((CurCPU == 2) ? SysTimestamp : ((CurCPU == 1) ? 
ARM7Timestamp : ((ARM9Timestamp + ((1<> ARM9ClockShift))); + if (!cpu) curtime <<= ARM9ClockShift; - if (IME[cpu] & 0x1) + if (IME[cpu] & 0x1 || (arm.Halted == 1)) { - arm.IRQ = !!(IE[cpu] & IF[cpu]); - if ((ConsoleType == 1) && cpu) - arm.IRQ |= !!(IE2 & IF2); + //arm.IRQ = !!(IE[cpu] & IF[cpu]); + if (IE[cpu] & IF[cpu]) { if (curtime < arm.IRQTimestamp) arm.IRQTimestamp = curtime; } + else arm.IRQTimestamp = -1; + if ((ConsoleType == 1) && cpu && (IE2 & IF2) && (curtime < arm.IRQTimestamp)) + arm.IRQTimestamp = curtime; } else { - arm.IRQ = 0; + arm.IRQTimestamp = -1; } } From 93242e11898753d0eea3572ce6fd1c890ad9eb46 Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Sun, 15 Dec 2024 21:48:22 -0500 Subject: [PATCH 278/306] revert most scheduler changes --- src/NDS.cpp | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/src/NDS.cpp b/src/NDS.cpp index c4fb3bbe..7ee9e3f5 100644 --- a/src/NDS.cpp +++ b/src/NDS.cpp @@ -1755,13 +1755,13 @@ u32 NDS::RunFrame() u64 target = NextTarget(); ARM9Target = target << ARM9ClockShift; - ARM7Target = target; + //ARM7Target = target; - while ((std::max(std::max(ARM9Timestamp, DMA9Timestamp), A9ContentionTS << ARM9ClockShift) < (target << ARM9ClockShift)) && (ARM7Timestamp < target)) + //while ((std::max(std::max(ARM9Timestamp, DMA9Timestamp), A9ContentionTS << ARM9ClockShift) < (target << ARM9ClockShift)) && (ARM7Timestamp < target)) { - while (std::max(std::max(ARM9Timestamp, DMA9Timestamp), A9ContentionTS << ARM9ClockShift) < (target << ARM9ClockShift)) + //while (std::max(std::max(ARM9Timestamp, DMA9Timestamp), A9ContentionTS << ARM9ClockShift) < (target << ARM9ClockShift)) { - ARM9Target = target << ARM9ClockShift; + //ARM9Target = target << ARM9ClockShift; CurCPU = 0; RunTimers(0); GPU.GPU3D.Run(); @@ -1798,9 +1798,13 @@ u32 NDS::RunFrame() RunTimers(0); GPU.GPU3D.Run(); - if (MainRAMHandle()) break; + //if (MainRAMHandle()) break; + MainRAMHandle(); } + target 
= std::max(std::max(ARM9Timestamp, DMA9Timestamp) >> ARM9ClockShift, A9ContentionTS); + if (target == ARM7Timestamp) target++; + while (ARM7Timestamp < target) { ARM7Target = target; From c96b49e9cd1d2fd4760a926f332359d502f46a57 Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Sun, 15 Dec 2024 21:48:27 -0500 Subject: [PATCH 279/306] Revert "improve(?) irq handling" This reverts commit 443ecb313d55c5ab9b256fa0176d560cc1b521ec. --- src/ARM.cpp | 23 ++++++++++------------- src/ARM.h | 2 -- src/NDS.cpp | 16 ++++++---------- 3 files changed, 16 insertions(+), 25 deletions(-) diff --git a/src/ARM.cpp b/src/ARM.cpp index 5e6ce172..d4e49723 100644 --- a/src/ARM.cpp +++ b/src/ARM.cpp @@ -160,7 +160,6 @@ void ARM::Reset() Halted = 0; DataCycles = 0; - IRQTimestamp = -1; IRQ = 0; for (int i = 0; i < 16; i++) @@ -706,7 +705,7 @@ void ARMv5::StartExecTHUMB() else NullFetch = false; PC = R[15]; - if (!(CPSR & 0x80) && (NDS.ARM9Timestamp > IRQTimestamp)) TriggerIRQ(); + if (IRQ && !(CPSR & 0x80)) TriggerIRQ(); else if (CurInstr > 0xFFFFFFFF) [[unlikely]] // handle aborted instructions { PrefetchAbort(); @@ -729,7 +728,7 @@ void ARMv5::StartExecARM() NullFetch = false; PC = R[15]; - if (!(CPSR & 0x80) && (NDS.ARM9Timestamp > IRQTimestamp)) TriggerIRQ(); + if (IRQ && !(CPSR & 0x80)) TriggerIRQ(); else if (CurInstr & ((u64)1<<63)) [[unlikely]] // handle aborted instructions { PrefetchAbort(); @@ -772,13 +771,14 @@ void ARMv5::Execute() else if (NDS.HaltInterrupted(0)) { Halted = 0; - NDS.ARM9Timestamp = IRQTimestamp; -#ifdef JIT_ENABLED if (NDS.IME[0] & 0x1) { +#ifdef JIT_ENABLED if constexpr (mode == CPUExecuteMode::JIT) TriggerIRQ(); - } + else #endif + IRQ = 1; + } } else { @@ -921,7 +921,7 @@ void ARMv4::StartExecTHUMB() CodeRead16(R[15]); QueueFunction(&ARMv4::UpdateNextInstr1); - if (!(CPSR & 0x80) && (NDS.ARM7Timestamp > IRQTimestamp)) TriggerIRQ(); + if (IRQ && !(CPSR & 0x80)) TriggerIRQ(); else { // actually execute @@ -939,7 +939,7 @@ 
void ARMv4::StartExecARM() CodeRead32(R[15]); QueueFunction(&ARMv4::UpdateNextInstr1); - if (!(CPSR & 0x80) && (NDS.ARM7Timestamp > IRQTimestamp)) TriggerIRQ(); + if (IRQ && !(CPSR & 0x80)) TriggerIRQ(); else if (CheckCondition(CurInstr >> 28)) // actually execute { u32 icode = ((CurInstr >> 4) & 0xF) | ((CurInstr >> 16) & 0xFF0); @@ -964,16 +964,13 @@ void ARMv4::Execute() else if (NDS.HaltInterrupted(1)) { Halted = 0; - NDS.ARM7Timestamp = IRQTimestamp; if (NDS.IME[1] & 0x1) { #ifdef JIT_ENABLED if constexpr (mode == CPUExecuteMode::JIT) TriggerIRQ(); + else #endif - } - else - { - IRQTimestamp = -1; + IRQ = 1; } } else diff --git a/src/ARM.h b/src/ARM.h index eabae95b..21c06813 100644 --- a/src/ARM.h +++ b/src/ARM.h @@ -213,8 +213,6 @@ public: u32 StopExecution; }; - u64 IRQTimestamp; - u32 CodeRegion; s32 CodeCycles; diff --git a/src/NDS.cpp b/src/NDS.cpp index 7ee9e3f5..d059b250 100644 --- a/src/NDS.cpp +++ b/src/NDS.cpp @@ -1838,7 +1838,7 @@ u32 NDS::RunFrame() if (!MainRAMHandle()) break; } } - CurCPU = 2; + RunSystem(target); if (CPUStop & CPUStop_Sleep) @@ -2160,20 +2160,16 @@ void NDS::SetGBASlotTimings() void NDS::UpdateIRQ(u32 cpu) { ARM& arm = cpu ? (ARM&)ARM7 : (ARM&)ARM9; - u64 curtime = ((CurCPU == 2) ? SysTimestamp : ((CurCPU == 1) ? 
ARM7Timestamp : ((ARM9Timestamp + ((1<> ARM9ClockShift))); - if (!cpu) curtime <<= ARM9ClockShift; - if (IME[cpu] & 0x1 || (arm.Halted == 1)) + if (IME[cpu] & 0x1) { - //arm.IRQ = !!(IE[cpu] & IF[cpu]); - if (IE[cpu] & IF[cpu]) { if (curtime < arm.IRQTimestamp) arm.IRQTimestamp = curtime; } - else arm.IRQTimestamp = -1; - if ((ConsoleType == 1) && cpu && (IE2 & IF2) && (curtime < arm.IRQTimestamp)) - arm.IRQTimestamp = curtime; + arm.IRQ = !!(IE[cpu] & IF[cpu]); + if ((ConsoleType == 1) && cpu) + arm.IRQ |= !!(IE2 & IF2); } else { - arm.IRQTimestamp = -1; + arm.IRQ = 0; } } From c40efab62f7606b7ec0770119fd5fec9c0a83635 Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Sun, 15 Dec 2024 22:01:44 -0500 Subject: [PATCH 280/306] revert main ram dma broke stuff --- src/DMA.cpp | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/src/DMA.cpp b/src/DMA.cpp index 74666081..2a55de1b 100644 --- a/src/DMA.cpp +++ b/src/DMA.cpp @@ -215,7 +215,7 @@ u32 DMA::UnitTimings9_16(int burststart) dst_n = NDS.ARM9MemTimings[dst_id][4]; dst_s = NDS.ARM9MemTimings[dst_id][5]; - /*if (src_rgn == Mem9_MainRAM) + if (src_rgn == Mem9_MainRAM) { if (dst_rgn == Mem9_MainRAM) { @@ -276,7 +276,7 @@ u32 DMA::UnitTimings9_16(int burststart) return ((burststart == 2) ? src_n : src_s) + 7; } } - else*/ if (src_rgn & dst_rgn) + else if (src_rgn & dst_rgn) { if (burststart != 1) return src_n + dst_n + (src_n == 1 || burststart <= 0); @@ -306,7 +306,7 @@ u32 DMA::UnitTimings9_32(int burststart) dst_n = NDS.ARM9MemTimings[dst_id][6]; dst_s = NDS.ARM9MemTimings[dst_id][7]; - /*if (src_rgn == Mem9_MainRAM) + if (src_rgn == Mem9_MainRAM) { if (dst_rgn == Mem9_MainRAM) return (burststart == 2) ? 13 : 18; @@ -369,7 +369,7 @@ u32 DMA::UnitTimings9_32(int burststart) return ((burststart == 2) ? 
src_n : src_s) + 8; } } - else*/ if (src_rgn & dst_rgn) + else if (src_rgn & dst_rgn) { if (burststart != 1) return src_n + dst_n + (src_n == 1 || burststart <= 0); @@ -401,7 +401,7 @@ u32 DMA::UnitTimings7_16(int burststart) dst_n = NDS.ARM7MemTimings[dst_id][0]; dst_s = NDS.ARM7MemTimings[dst_id][1]; - /*if (src_rgn == Mem7_MainRAM) + if (src_rgn == Mem7_MainRAM) { if (dst_rgn == Mem7_MainRAM) return 16; @@ -460,7 +460,7 @@ u32 DMA::UnitTimings7_16(int burststart) return (burststart ? src_n : src_s) + 7; } } - else*/ if (src_rgn & dst_rgn) + else if (src_rgn & dst_rgn) { if (burststart != 1) return src_n + dst_n + (src_n == 1 || burststart <= 0); @@ -490,7 +490,7 @@ u32 DMA::UnitTimings7_32(int burststart) dst_n = NDS.ARM7MemTimings[dst_id][2]; dst_s = NDS.ARM7MemTimings[dst_id][3]; - /*if (src_rgn == Mem7_MainRAM) + if (src_rgn == Mem7_MainRAM) { if (dst_rgn == Mem7_MainRAM) return 18; @@ -553,7 +553,7 @@ u32 DMA::UnitTimings7_32(int burststart) return (burststart ? src_n : src_s) + 8; } } - else*/ if (src_rgn & dst_rgn) + else if (src_rgn & dst_rgn) { if (burststart != 1) return src_n + dst_n + (src_n == 1 || burststart <= 0); @@ -585,13 +585,13 @@ void DMA::Run9() { while (IterCount > 0 && !Stall) { - u32 rgn = NDS.ARM9Regions[CurSrcAddr>>14] | NDS.ARM9Regions[CurDstAddr>>14]; + /*u32 rgn = NDS.ARM9Regions[CurSrcAddr>>14] | NDS.ARM9Regions[CurDstAddr>>14]; if (rgn & Mem9_MainRAM) { NDS.ARM9.MRTrack.Type = MainRAMType::DMA16; NDS.ARM9.MRTrack.Var = Num; return; - } + }*/ NDS.DMA9Timestamp += (UnitTimings9_16(burststart) << NDS.ARM9ClockShift); burststart -= 1; @@ -610,13 +610,13 @@ void DMA::Run9() { while (IterCount > 0 && !Stall) { - u32 rgn = NDS.ARM9Regions[CurSrcAddr>>14] | NDS.ARM9Regions[CurDstAddr>>14]; + /*u32 rgn = NDS.ARM9Regions[CurSrcAddr>>14] | NDS.ARM9Regions[CurDstAddr>>14]; if (rgn & Mem9_MainRAM) { NDS.ARM9.MRTrack.Type = MainRAMType::DMA32; NDS.ARM9.MRTrack.Var = Num; return; - } + }*/ NDS.DMA9Timestamp += (UnitTimings9_32(burststart) << 
NDS.ARM9ClockShift); burststart -= 1; @@ -678,13 +678,13 @@ void DMA::Run7() { while (IterCount > 0 && !Stall) { - u32 rgn = NDS.ARM7Regions[CurSrcAddr>>15] | NDS.ARM7Regions[CurDstAddr>>15]; + /*u32 rgn = NDS.ARM7Regions[CurSrcAddr>>15] | NDS.ARM7Regions[CurDstAddr>>15]; if (rgn & Mem7_MainRAM) { NDS.ARM7.MRTrack.Type = MainRAMType::DMA16; NDS.ARM7.MRTrack.Var = Num+4; return; - } + }*/ NDS.ARM7Timestamp += UnitTimings7_16(burststart); burststart = false; @@ -703,13 +703,13 @@ void DMA::Run7() { while (IterCount > 0 && !Stall) { - u32 rgn = NDS.ARM7Regions[CurSrcAddr>>15] | NDS.ARM7Regions[CurDstAddr>>15]; + /*u32 rgn = NDS.ARM7Regions[CurSrcAddr>>15] | NDS.ARM7Regions[CurDstAddr>>15]; if (rgn & Mem7_MainRAM) { NDS.ARM7.MRTrack.Type = MainRAMType::DMA32; NDS.ARM7.MRTrack.Var = Num+4; return; - } + }*/ NDS.ARM7Timestamp += UnitTimings7_32(burststart); burststart = false; From e77c2011bc9fd82008b1e000520e8474c9ef0163 Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Sun, 15 Dec 2024 22:02:41 -0500 Subject: [PATCH 281/306] slightly optimize main loop --- src/NDS.cpp | 70 +++++++++++++++++++++++++---------------------------- 1 file changed, 33 insertions(+), 37 deletions(-) diff --git a/src/NDS.cpp b/src/NDS.cpp index d059b250..615a574d 100644 --- a/src/NDS.cpp +++ b/src/NDS.cpp @@ -1757,51 +1757,47 @@ u32 NDS::RunFrame() ARM9Target = target << ARM9ClockShift; //ARM7Target = target; - //while ((std::max(std::max(ARM9Timestamp, DMA9Timestamp), A9ContentionTS << ARM9ClockShift) < (target << ARM9ClockShift)) && (ARM7Timestamp < target)) + while (std::max(std::max(ARM9Timestamp, DMA9Timestamp), A9ContentionTS << ARM9ClockShift) < ARM9Target) { - //while (std::max(std::max(ARM9Timestamp, DMA9Timestamp), A9ContentionTS << ARM9ClockShift) < (target << ARM9ClockShift)) - { - //ARM9Target = target << ARM9ClockShift; - CurCPU = 0; - RunTimers(0); - GPU.GPU3D.Run(); + CurCPU = 0; + RunTimers(0); + GPU.GPU3D.Run(); - if (ARM9.MRTrack.Type == 
MainRAMType::Null) + if (ARM9.MRTrack.Type == MainRAMType::Null) + { + if (CPUStop & CPUStop_GXStall) { - if (CPUStop & CPUStop_GXStall) + // GXFIFO stall + s32 cycles = GPU.GPU3D.CyclesToRunFor(); + DMA9Timestamp = std::min(ARM9Target, std::max(ARM9Timestamp+(cycles<(*this); - dsi.RunNDMAs(0); - } - } - else - { - //if (ARM9.abt) ARM9Timestamp = ARM9Target; - ARM9.Execute(); + auto& dsi = dynamic_cast(*this); + dsi.RunNDMAs(0); } } - - //printf("MAIN LOOP: 9 %lli %08X %08llX %i 7 %lli %08X %08llX %i %i %08X\n", ARM9Timestamp>>ARM9ClockShift, ARM9.PC, ARM9.CurInstr, (u8)ARM9.MRTrack.Type, ARM7Timestamp, ARM7.R[15], ARM7.CurInstr, (u8)ARM7.MRTrack.Type, IME[1], IE[1]); - - RunTimers(0); - GPU.GPU3D.Run(); - - //if (MainRAMHandle()) break; - MainRAMHandle(); + else + { + //if (ARM9.abt) ARM9Timestamp = ARM9Target; + ARM9.Execute(); + } } + //printf("MAIN LOOP: 9 %lli %08X %08llX %i 7 %lli %08X %08llX %i %i %08X\n", ARM9Timestamp>>ARM9ClockShift, ARM9.PC, ARM9.CurInstr, (u8)ARM9.MRTrack.Type, ARM7Timestamp, ARM7.R[15], ARM7.CurInstr, (u8)ARM7.MRTrack.Type, IME[1], IE[1]); + + RunTimers(0); + GPU.GPU3D.Run(); + + //if (MainRAMHandle()) break; + MainRAMHandle(); + target = std::max(std::max(ARM9Timestamp, DMA9Timestamp) >> ARM9ClockShift, A9ContentionTS); if (target == ARM7Timestamp) target++; From 29421f1d27fab69ca394abd0777ae346f751449b Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Mon, 16 Dec 2024 09:06:28 -0500 Subject: [PATCH 282/306] re-enable mainram dma it's not bugged, it's also not inaccurate. something else is the issue... 
--- src/DMA.cpp | 46 +++++++++---------- src/NDS.cpp | 126 +++++----------------------------------------------- 2 files changed, 35 insertions(+), 137 deletions(-) diff --git a/src/DMA.cpp b/src/DMA.cpp index 2a55de1b..fd961d89 100644 --- a/src/DMA.cpp +++ b/src/DMA.cpp @@ -215,7 +215,7 @@ u32 DMA::UnitTimings9_16(int burststart) dst_n = NDS.ARM9MemTimings[dst_id][4]; dst_s = NDS.ARM9MemTimings[dst_id][5]; - if (src_rgn == Mem9_MainRAM) + /*if (src_rgn == Mem9_MainRAM) { if (dst_rgn == Mem9_MainRAM) { @@ -276,7 +276,7 @@ u32 DMA::UnitTimings9_16(int burststart) return ((burststart == 2) ? src_n : src_s) + 7; } } - else if (src_rgn & dst_rgn) + else*/ if (src_rgn & dst_rgn) { if (burststart != 1) return src_n + dst_n + (src_n == 1 || burststart <= 0); @@ -306,7 +306,7 @@ u32 DMA::UnitTimings9_32(int burststart) dst_n = NDS.ARM9MemTimings[dst_id][6]; dst_s = NDS.ARM9MemTimings[dst_id][7]; - if (src_rgn == Mem9_MainRAM) + /*if (src_rgn == Mem9_MainRAM) { if (dst_rgn == Mem9_MainRAM) return (burststart == 2) ? 13 : 18; @@ -369,7 +369,7 @@ u32 DMA::UnitTimings9_32(int burststart) return ((burststart == 2) ? src_n : src_s) + 8; } } - else if (src_rgn & dst_rgn) + else*/ if (src_rgn & dst_rgn) { if (burststart != 1) return src_n + dst_n + (src_n == 1 || burststart <= 0); @@ -401,7 +401,7 @@ u32 DMA::UnitTimings7_16(int burststart) dst_n = NDS.ARM7MemTimings[dst_id][0]; dst_s = NDS.ARM7MemTimings[dst_id][1]; - if (src_rgn == Mem7_MainRAM) + /*if (src_rgn == Mem7_MainRAM) { if (dst_rgn == Mem7_MainRAM) return 16; @@ -460,7 +460,7 @@ u32 DMA::UnitTimings7_16(int burststart) return (burststart ? 
src_n : src_s) + 7; } } - else if (src_rgn & dst_rgn) + else*/ if (src_rgn & dst_rgn) { if (burststart != 1) return src_n + dst_n + (src_n == 1 || burststart <= 0); @@ -490,7 +490,7 @@ u32 DMA::UnitTimings7_32(int burststart) dst_n = NDS.ARM7MemTimings[dst_id][2]; dst_s = NDS.ARM7MemTimings[dst_id][3]; - if (src_rgn == Mem7_MainRAM) + /*if (src_rgn == Mem7_MainRAM) { if (dst_rgn == Mem7_MainRAM) return 18; @@ -553,7 +553,7 @@ u32 DMA::UnitTimings7_32(int burststart) return (burststart ? src_n : src_s) + 8; } } - else if (src_rgn & dst_rgn) + else*/ if (src_rgn & dst_rgn) { if (burststart != 1) return src_n + dst_n + (src_n == 1 || burststart <= 0); @@ -580,18 +580,18 @@ void DMA::Run9() // add NS penalty for first accesses in burst int burststart = Running-1; - + if (!(Cnt & (1<<26))) { while (IterCount > 0 && !Stall) { - /*u32 rgn = NDS.ARM9Regions[CurSrcAddr>>14] | NDS.ARM9Regions[CurDstAddr>>14]; + u32 rgn = NDS.ARM9Regions[CurSrcAddr>>14] | NDS.ARM9Regions[CurDstAddr>>14]; if (rgn & Mem9_MainRAM) { NDS.ARM9.MRTrack.Type = MainRAMType::DMA16; NDS.ARM9.MRTrack.Var = Num; - return; - }*/ + break; + } NDS.DMA9Timestamp += (UnitTimings9_16(burststart) << NDS.ARM9ClockShift); burststart -= 1; @@ -610,13 +610,13 @@ void DMA::Run9() { while (IterCount > 0 && !Stall) { - /*u32 rgn = NDS.ARM9Regions[CurSrcAddr>>14] | NDS.ARM9Regions[CurDstAddr>>14]; + u32 rgn = NDS.ARM9Regions[CurSrcAddr>>14] | NDS.ARM9Regions[CurDstAddr>>14]; if (rgn & Mem9_MainRAM) { NDS.ARM9.MRTrack.Type = MainRAMType::DMA32; NDS.ARM9.MRTrack.Var = Num; - return; - }*/ + break; + } NDS.DMA9Timestamp += (UnitTimings9_32(burststart) << NDS.ARM9ClockShift); burststart -= 1; @@ -634,7 +634,7 @@ void DMA::Run9() NDS.DMA9Timestamp -= 1; - if (burststart == 0) Running = 1; + if (burststart <= 0) Running = 1; else Running = 2; Executing = false; @@ -678,13 +678,13 @@ void DMA::Run7() { while (IterCount > 0 && !Stall) { - /*u32 rgn = NDS.ARM7Regions[CurSrcAddr>>15] | NDS.ARM7Regions[CurDstAddr>>15]; + u32 rgn 
= NDS.ARM7Regions[CurSrcAddr>>15] | NDS.ARM7Regions[CurDstAddr>>15]; if (rgn & Mem7_MainRAM) { NDS.ARM7.MRTrack.Type = MainRAMType::DMA16; NDS.ARM7.MRTrack.Var = Num+4; - return; - }*/ + break; + } NDS.ARM7Timestamp += UnitTimings7_16(burststart); burststart = false; @@ -703,13 +703,13 @@ void DMA::Run7() { while (IterCount > 0 && !Stall) { - /*u32 rgn = NDS.ARM7Regions[CurSrcAddr>>15] | NDS.ARM7Regions[CurDstAddr>>15]; + u32 rgn = NDS.ARM7Regions[CurSrcAddr>>15] | NDS.ARM7Regions[CurDstAddr>>15]; if (rgn & Mem7_MainRAM) { NDS.ARM7.MRTrack.Type = MainRAMType::DMA32; NDS.ARM7.MRTrack.Var = Num+4; - return; - }*/ + break; + } NDS.ARM7Timestamp += UnitTimings7_32(burststart); burststart = false; @@ -725,7 +725,7 @@ void DMA::Run7() } } - if (burststart == 0) Running = 1; + if (burststart <= 0) Running = 1; else Running = 2; Executing = false; diff --git a/src/NDS.cpp b/src/NDS.cpp index 615a574d..605c35c7 100644 --- a/src/NDS.cpp +++ b/src/NDS.cpp @@ -1130,39 +1130,13 @@ void NDS::MainRAMHandleARM9() dma->CurDstAddr += dma->DstAddrInc<<2; dma->IterCount--; dma->RemCount--; - burststart -= 1; - if (burststart <= 0) dma->Running = 1; + + if (burststart <= 1) dma->Running = 1; else dma->Running = 2; - - dma->Executing = false; - dma->Stall = false; - - DMA9Timestamp = (A9ContentionTS << ARM9ClockShift) - 1; + + DMA9Timestamp = (A9ContentionTS << ARM9ClockShift); memset(&ARM9.MRTrack, 0, sizeof(ARM9.MRTrack)); ConTSLock = false; - if (dma->RemCount) - { - if (dma->IterCount == 0) - { - dma->Running = 0; - ResumeCPU(0, 1<Num); - - if (dma->StartMode == 0x07) - GPU.GPU3D.CheckFIFODMA(); - } - - break; - } - - if (!(dma->Cnt & (1<<25))) - dma->Cnt &= ~(1<<31); - - if (dma->Cnt & (1<<30)) - SetIRQ(0, IRQ_DMA0 + dma->Num); - - dma->Running = 0; - dma->InProgress = false; - ResumeCPU(0, 1<Num); } break; } @@ -1239,39 +1213,13 @@ void NDS::MainRAMHandleARM9() dma->CurDstAddr += dma->DstAddrInc<<1; dma->IterCount--; dma->RemCount--; - burststart -= 1; - if (burststart <= 0) 
Running = 1; + + if (burststart <= 1) Running = 1; else dma->Running = 2; - - dma->Executing = false; - dma->Stall = false; - - DMA9Timestamp = (A9ContentionTS << ARM9ClockShift) - 1; + + DMA9Timestamp = (A9ContentionTS << ARM9ClockShift); memset(&ARM9.MRTrack, 0, sizeof(ARM9.MRTrack)); ConTSLock = false; - if (dma->RemCount) - { - if (dma->IterCount == 0) - { - dma->Running = 0; - ResumeCPU(0, 1<Num); - - if (dma->StartMode == 0x07) - GPU.GPU3D.CheckFIFODMA(); - } - - break; - } - - if (!(dma->Cnt & (1<<25))) - dma->Cnt &= ~(1<<31); - - if (dma->Cnt & (1<<30)) - SetIRQ(0, IRQ_DMA0 + dma->Num); - - dma->Running = 0; - dma->InProgress = false; - ResumeCPU(0, 1<Num); } break; } @@ -1498,36 +1446,11 @@ void NDS::MainRAMHandleARM7() dma->CurDstAddr += dma->DstAddrInc<<2; dma->IterCount--; dma->RemCount--; - burststart -= 1; - if (burststart <= 0) dma->Running = 1; + + if (burststart <= 1) dma->Running = 1; else dma->Running = 2; - dma->Executing = false; - dma->Stall = false; - - //DMA7Timestamp = ARM7Timestamp; memset(&ARM7.MRTrack, 0, sizeof(ARM7.MRTrack)); - ConTSLock = false; - if (dma->RemCount) - { - if (dma->IterCount == 0) - { - dma->Running = 0; - ResumeCPU(1, 1<Num); - } - - break; - } - - if (!(dma->Cnt & (1<<25))) - dma->Cnt &= ~(1<<31); - - if (dma->Cnt & (1<<30)) - SetIRQ(1, IRQ_DMA0 + dma->Num); - - dma->Running = 0; - dma->InProgress = false; - ResumeCPU(1, 1<Num); } break; } @@ -1604,36 +1527,11 @@ void NDS::MainRAMHandleARM7() dma->CurDstAddr += dma->DstAddrInc<<1; dma->IterCount--; dma->RemCount--; - burststart -= 1; - if (burststart <= 0) Running = 1; + + if (burststart <= 1) Running = 1; else dma->Running = 2; - dma->Executing = false; - dma->Stall = false; - - //DMA9Timestamp = (A9ContentionTS << ARM9ClockShift) - 1; memset(&ARM7.MRTrack, 0, sizeof(ARM7.MRTrack)); - ConTSLock = false; - if (dma->RemCount) - { - if (dma->IterCount == 0) - { - dma->Running = 0; - ResumeCPU(1, 1<Num); - } - - break; - } - - if (!(dma->Cnt & (1<<25))) - dma->Cnt &= 
~(1<<31); - - if (dma->Cnt & (1<<30)) - SetIRQ(1, IRQ_DMA0 + dma->Num); - - dma->Running = 0; - dma->InProgress = false; - ResumeCPU(1, 1<Num); } break; } From c90f10d056219130d0b481eea3b9e0a9586861e9 Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Mon, 16 Dec 2024 10:19:51 -0500 Subject: [PATCH 283/306] revert arm9 main ram dma again (again (again)) --- src/DMA.cpp | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/src/DMA.cpp b/src/DMA.cpp index fd961d89..dc78ee35 100644 --- a/src/DMA.cpp +++ b/src/DMA.cpp @@ -215,7 +215,7 @@ u32 DMA::UnitTimings9_16(int burststart) dst_n = NDS.ARM9MemTimings[dst_id][4]; dst_s = NDS.ARM9MemTimings[dst_id][5]; - /*if (src_rgn == Mem9_MainRAM) + if (src_rgn == Mem9_MainRAM) { if (dst_rgn == Mem9_MainRAM) { @@ -276,7 +276,7 @@ u32 DMA::UnitTimings9_16(int burststart) return ((burststart == 2) ? src_n : src_s) + 7; } } - else*/ if (src_rgn & dst_rgn) + else if (src_rgn & dst_rgn) { if (burststart != 1) return src_n + dst_n + (src_n == 1 || burststart <= 0); @@ -306,7 +306,7 @@ u32 DMA::UnitTimings9_32(int burststart) dst_n = NDS.ARM9MemTimings[dst_id][6]; dst_s = NDS.ARM9MemTimings[dst_id][7]; - /*if (src_rgn == Mem9_MainRAM) + if (src_rgn == Mem9_MainRAM) { if (dst_rgn == Mem9_MainRAM) return (burststart == 2) ? 13 : 18; @@ -369,7 +369,7 @@ u32 DMA::UnitTimings9_32(int burststart) return ((burststart == 2) ? 
src_n : src_s) + 8; } } - else*/ if (src_rgn & dst_rgn) + else if (src_rgn & dst_rgn) { if (burststart != 1) return src_n + dst_n + (src_n == 1 || burststart <= 0); @@ -584,14 +584,14 @@ void DMA::Run9() if (!(Cnt & (1<<26))) { while (IterCount > 0 && !Stall) - { + {/* u32 rgn = NDS.ARM9Regions[CurSrcAddr>>14] | NDS.ARM9Regions[CurDstAddr>>14]; if (rgn & Mem9_MainRAM) { NDS.ARM9.MRTrack.Type = MainRAMType::DMA16; NDS.ARM9.MRTrack.Var = Num; break; - } + }*/ NDS.DMA9Timestamp += (UnitTimings9_16(burststart) << NDS.ARM9ClockShift); burststart -= 1; @@ -609,14 +609,14 @@ void DMA::Run9() else { while (IterCount > 0 && !Stall) - { + {/* u32 rgn = NDS.ARM9Regions[CurSrcAddr>>14] | NDS.ARM9Regions[CurDstAddr>>14]; if (rgn & Mem9_MainRAM) { NDS.ARM9.MRTrack.Type = MainRAMType::DMA32; NDS.ARM9.MRTrack.Var = Num; break; - } + }*/ NDS.DMA9Timestamp += (UnitTimings9_32(burststart) << NDS.ARM9ClockShift); burststart -= 1; From 5c5f4364db867b6d658849efd45866ae32394f1d Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Mon, 16 Dec 2024 10:20:10 -0500 Subject: [PATCH 284/306] run arm 7 first? it's faster...? 
--- src/NDS.cpp | 87 ++++++++++++++++++++++++++--------------------------- 1 file changed, 43 insertions(+), 44 deletions(-) diff --git a/src/NDS.cpp b/src/NDS.cpp index 605c35c7..a6444573 100644 --- a/src/NDS.cpp +++ b/src/NDS.cpp @@ -1651,89 +1651,88 @@ u32 NDS::RunFrame() while (Running && GPU.TotalScanlines==0) { u64 target = NextTarget(); - - ARM9Target = target << ARM9ClockShift; + + ARM7Target = target; //ARM7Target = target; - while (std::max(std::max(ARM9Timestamp, DMA9Timestamp), A9ContentionTS << ARM9ClockShift) < ARM9Target) + //while (ARM7Timestamp < ARM7Target) { - CurCPU = 0; - RunTimers(0); - GPU.GPU3D.Run(); + //printf("A7 LOOP: %lli %lli\n", ARM9Timestamp>>ARM9ClockShift, ARM7Timestamp); + CurCPU = 1; + RunTimers(1); - if (ARM9.MRTrack.Type == MainRAMType::Null) + if (ARM7.MRTrack.Type == MainRAMType::Null) { - if (CPUStop & CPUStop_GXStall) + if (CPUStop & CPUStop_DMA7) { - // GXFIFO stall - s32 cycles = GPU.GPU3D.CyclesToRunFor(); - DMA9Timestamp = std::min(ARM9Target, std::max(ARM9Timestamp+(cycles<(*this); - dsi.RunNDMAs(0); + dsi.RunNDMAs(1); } } else { - //if (ARM9.abt) ARM9Timestamp = ARM9Target; - ARM9.Execute(); + //if (ARM7.abt > 16) ARM7Timestamp = ARM7Target; + ARM7.Execute(); } } - //printf("MAIN LOOP: 9 %lli %08X %08llX %i 7 %lli %08X %08llX %i %i %08X\n", ARM9Timestamp>>ARM9ClockShift, ARM9.PC, ARM9.CurInstr, (u8)ARM9.MRTrack.Type, ARM7Timestamp, ARM7.R[15], ARM7.CurInstr, (u8)ARM7.MRTrack.Type, IME[1], IE[1]); + RunTimers(1); - RunTimers(0); - GPU.GPU3D.Run(); + //printf("MAIN LOOP: 9 %lli %08X %08llX %i 7 %lli %08X %08llX %i %i %08X\n", ARM9Timestamp>>ARM9ClockShift, ARM9.PC, ARM9.CurInstr, (u8)ARM9.MRTrack.Type, ARM7Timestamp, ARM7.R[15], ARM7.CurInstr, (u8)ARM7.MRTrack.Type, IME[1], IE[1]); //if (MainRAMHandle()) break; MainRAMHandle(); - target = std::max(std::max(ARM9Timestamp, DMA9Timestamp) >> ARM9ClockShift, A9ContentionTS); - if (target == ARM7Timestamp) target++; + target = ARM7Timestamp << 
ARM9ClockShift;//std::max(std::max(ARM9Timestamp, DMA9Timestamp) >> ARM9ClockShift, A9ContentionTS); + if (target <= std::max(std::max(ARM9Timestamp, DMA9Timestamp), A9ContentionTS << ARM9ClockShift)) target = std::max(std::max(ARM9Timestamp, DMA9Timestamp), A9ContentionTS << ARM9ClockShift) + 1; - while (ARM7Timestamp < target) + while (std::max(std::max(ARM9Timestamp, DMA9Timestamp), A9ContentionTS << ARM9ClockShift) < target) { - ARM7Target = target; - //printf("A7 LOOP: %lli %lli\n", ARM9Timestamp>>ARM9ClockShift, ARM7Timestamp); - CurCPU = 1; - RunTimers(1); + ARM9Target = target; + CurCPU = 0; + RunTimers(0); + GPU.GPU3D.Run(); - if (ARM7.MRTrack.Type == MainRAMType::Null) + if (ARM9.MRTrack.Type == MainRAMType::Null) { - if (CPUStop & CPUStop_DMA7) + if (CPUStop & CPUStop_GXStall) { - DMAs[4].Run(); - if (ARM7.MRTrack.Type == MainRAMType::Null) DMAs[5].Run(); - if (ARM7.MRTrack.Type == MainRAMType::Null) DMAs[6].Run(); - if (ARM7.MRTrack.Type == MainRAMType::Null) DMAs[7].Run(); + // GXFIFO stall + s32 cycles = GPU.GPU3D.CyclesToRunFor(); + DMA9Timestamp = std::min(ARM9Target, std::max(ARM9Timestamp+(cycles<(*this); - dsi.RunNDMAs(1); + dsi.RunNDMAs(0); } } else { - //if (ARM7.abt > 16) ARM7Timestamp = ARM7Target; - ARM7.Execute(); + //if (ARM9.abt) ARM9Timestamp = ARM9Target; + ARM9.Execute(); } } + RunTimers(0); + GPU.GPU3D.Run(); - RunTimers(1); - - if (!MainRAMHandle()) break; + if (MainRAMHandle()) break; } } - RunSystem(target); + RunSystem(target>>ARM9ClockShift); if (CPUStop & CPUStop_Sleep) { From 67198a72bda723734398078b1f2195644ed28706 Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Mon, 16 Dec 2024 10:32:18 -0500 Subject: [PATCH 285/306] why did i remove that This reverts commit 45be951a0f164488d151d87ed6caa4b0633feba7. 
--- src/GPU3D.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/GPU3D.cpp b/src/GPU3D.cpp index 4a1426aa..953a2343 100644 --- a/src/GPU3D.cpp +++ b/src/GPU3D.cpp @@ -2378,13 +2378,13 @@ void GPU3D::Run() noexcept if (!GeometryEnabled || FlushRequest || (CmdPIPE.IsEmpty() && !(GXStat & (1<<27)))) { - Timestamp = NDS.ARM9Timestamp >> NDS.ARM9ClockShift; + Timestamp = std::max(NDS.ARM9Timestamp, NDS.DMA9Timestamp) >> NDS.ARM9ClockShift; return; } - s32 cycles = (NDS.ARM9Timestamp >> NDS.ARM9ClockShift) - Timestamp; + s32 cycles = (std::max(NDS.ARM9Timestamp, NDS.DMA9Timestamp) >> NDS.ARM9ClockShift) - Timestamp; CycleCount -= cycles; - Timestamp = NDS.ARM9Timestamp >> NDS.ARM9ClockShift; + Timestamp = std::max(NDS.ARM9Timestamp, NDS.DMA9Timestamp) >> NDS.ARM9ClockShift; if (CycleCount <= 0) { From 7590c48471be202d522a5b8674953e677ae143b1 Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Mon, 16 Dec 2024 11:45:14 -0500 Subject: [PATCH 286/306] Revert "revert arm9 main ram dma again (again (again))" This reverts commit c90f10d056219130d0b481eea3b9e0a9586861e9. --- src/DMA.cpp | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/src/DMA.cpp b/src/DMA.cpp index dc78ee35..fd961d89 100644 --- a/src/DMA.cpp +++ b/src/DMA.cpp @@ -215,7 +215,7 @@ u32 DMA::UnitTimings9_16(int burststart) dst_n = NDS.ARM9MemTimings[dst_id][4]; dst_s = NDS.ARM9MemTimings[dst_id][5]; - if (src_rgn == Mem9_MainRAM) + /*if (src_rgn == Mem9_MainRAM) { if (dst_rgn == Mem9_MainRAM) { @@ -276,7 +276,7 @@ u32 DMA::UnitTimings9_16(int burststart) return ((burststart == 2) ? 
src_n : src_s) + 7; } } - else if (src_rgn & dst_rgn) + else*/ if (src_rgn & dst_rgn) { if (burststart != 1) return src_n + dst_n + (src_n == 1 || burststart <= 0); @@ -306,7 +306,7 @@ u32 DMA::UnitTimings9_32(int burststart) dst_n = NDS.ARM9MemTimings[dst_id][6]; dst_s = NDS.ARM9MemTimings[dst_id][7]; - if (src_rgn == Mem9_MainRAM) + /*if (src_rgn == Mem9_MainRAM) { if (dst_rgn == Mem9_MainRAM) return (burststart == 2) ? 13 : 18; @@ -369,7 +369,7 @@ u32 DMA::UnitTimings9_32(int burststart) return ((burststart == 2) ? src_n : src_s) + 8; } } - else if (src_rgn & dst_rgn) + else*/ if (src_rgn & dst_rgn) { if (burststart != 1) return src_n + dst_n + (src_n == 1 || burststart <= 0); @@ -584,14 +584,14 @@ void DMA::Run9() if (!(Cnt & (1<<26))) { while (IterCount > 0 && !Stall) - {/* + { u32 rgn = NDS.ARM9Regions[CurSrcAddr>>14] | NDS.ARM9Regions[CurDstAddr>>14]; if (rgn & Mem9_MainRAM) { NDS.ARM9.MRTrack.Type = MainRAMType::DMA16; NDS.ARM9.MRTrack.Var = Num; break; - }*/ + } NDS.DMA9Timestamp += (UnitTimings9_16(burststart) << NDS.ARM9ClockShift); burststart -= 1; @@ -609,14 +609,14 @@ void DMA::Run9() else { while (IterCount > 0 && !Stall) - {/* + { u32 rgn = NDS.ARM9Regions[CurSrcAddr>>14] | NDS.ARM9Regions[CurDstAddr>>14]; if (rgn & Mem9_MainRAM) { NDS.ARM9.MRTrack.Type = MainRAMType::DMA32; NDS.ARM9.MRTrack.Var = Num; break; - }*/ + } NDS.DMA9Timestamp += (UnitTimings9_32(burststart) << NDS.ARM9ClockShift); burststart -= 1; From 6d0ab97612d0b05033197970cac5d9820b11fb6a Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Mon, 16 Dec 2024 12:33:13 -0500 Subject: [PATCH 287/306] fix gamecard dma breaking w/ main ram contention main ram dma on the arm9 is officially fully operational. 
--- src/DMA.cpp | 15 ++++++++++++++- src/DMA.h | 1 + 2 files changed, 15 insertions(+), 1 deletion(-) diff --git a/src/DMA.cpp b/src/DMA.cpp index fd961d89..6809a3dd 100644 --- a/src/DMA.cpp +++ b/src/DMA.cpp @@ -81,6 +81,7 @@ void DMA::Reset() Running = false; Executing = false; InProgress = false; + DMAQueued = false; MRAMBurstCount = 0; MRAMBurstTable = DMATiming::MRAMDummy; } @@ -156,7 +157,15 @@ void DMA::WriteCnt(u32 val) void DMA::Start() { - if (Running) return; + if (Running) + { + DMAQueued = true; + return; + } + else + { + DMAQueued = false; + } if (!InProgress) { @@ -663,6 +672,8 @@ void DMA::Run9() Running = 0; InProgress = false; NDS.ResumeCPU(0, 1< Date: Mon, 16 Dec 2024 13:02:56 -0500 Subject: [PATCH 288/306] ONLY queue missed gamecard dmas gxfifo dma did not like that prior fix --- src/DMA.cpp | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/src/DMA.cpp b/src/DMA.cpp index 6809a3dd..3b5a9047 100644 --- a/src/DMA.cpp +++ b/src/DMA.cpp @@ -139,7 +139,7 @@ void DMA::WriteCnt(u32 val) case 0x01000000: SrcAddrInc = 0; break; case 0x01800000: SrcAddrInc = 1; break; } - + u32 oldstartmode = StartMode; if (CPU == 0) StartMode = (Cnt >> 27) & 0x7; else @@ -159,7 +159,14 @@ void DMA::Start() { if (Running) { - DMAQueued = true; + if (CPU ? StartMode == 0x12 : StartMode == 0x05) + { + DMAQueued = true; + } + else + { + DMAQueued = false; + } return; } else From cacf8913307e8b3bf10c48935ba26aafa14cb55c Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Mon, 16 Dec 2024 13:13:24 -0500 Subject: [PATCH 289/306] Revert "run arm 7 first?" Caused performance issues in games??? 
--- src/NDS.cpp | 87 +++++++++++++++++++++++++++-------------------------- 1 file changed, 44 insertions(+), 43 deletions(-) diff --git a/src/NDS.cpp b/src/NDS.cpp index a6444573..605c35c7 100644 --- a/src/NDS.cpp +++ b/src/NDS.cpp @@ -1651,88 +1651,89 @@ u32 NDS::RunFrame() while (Running && GPU.TotalScanlines==0) { u64 target = NextTarget(); - - ARM7Target = target; + + ARM9Target = target << ARM9ClockShift; //ARM7Target = target; - //while (ARM7Timestamp < ARM7Target) + while (std::max(std::max(ARM9Timestamp, DMA9Timestamp), A9ContentionTS << ARM9ClockShift) < ARM9Target) { - //printf("A7 LOOP: %lli %lli\n", ARM9Timestamp>>ARM9ClockShift, ARM7Timestamp); - CurCPU = 1; - RunTimers(1); + CurCPU = 0; + RunTimers(0); + GPU.GPU3D.Run(); - if (ARM7.MRTrack.Type == MainRAMType::Null) + if (ARM9.MRTrack.Type == MainRAMType::Null) { - if (CPUStop & CPUStop_DMA7) + if (CPUStop & CPUStop_GXStall) { - DMAs[4].Run(); - if (ARM7.MRTrack.Type == MainRAMType::Null) DMAs[5].Run(); - if (ARM7.MRTrack.Type == MainRAMType::Null) DMAs[6].Run(); - if (ARM7.MRTrack.Type == MainRAMType::Null) DMAs[7].Run(); + // GXFIFO stall + s32 cycles = GPU.GPU3D.CyclesToRunFor(); + DMA9Timestamp = std::min(ARM9Target, std::max(ARM9Timestamp+(cycles<(*this); - dsi.RunNDMAs(1); + dsi.RunNDMAs(0); } } else { - //if (ARM7.abt > 16) ARM7Timestamp = ARM7Target; - ARM7.Execute(); + //if (ARM9.abt) ARM9Timestamp = ARM9Target; + ARM9.Execute(); } } - RunTimers(1); - //printf("MAIN LOOP: 9 %lli %08X %08llX %i 7 %lli %08X %08llX %i %i %08X\n", ARM9Timestamp>>ARM9ClockShift, ARM9.PC, ARM9.CurInstr, (u8)ARM9.MRTrack.Type, ARM7Timestamp, ARM7.R[15], ARM7.CurInstr, (u8)ARM7.MRTrack.Type, IME[1], IE[1]); + RunTimers(0); + GPU.GPU3D.Run(); + //if (MainRAMHandle()) break; MainRAMHandle(); - target = ARM7Timestamp << ARM9ClockShift;//std::max(std::max(ARM9Timestamp, DMA9Timestamp) >> ARM9ClockShift, A9ContentionTS); - if (target <= std::max(std::max(ARM9Timestamp, DMA9Timestamp), A9ContentionTS << ARM9ClockShift)) 
target = std::max(std::max(ARM9Timestamp, DMA9Timestamp), A9ContentionTS << ARM9ClockShift) + 1; + target = std::max(std::max(ARM9Timestamp, DMA9Timestamp) >> ARM9ClockShift, A9ContentionTS); + if (target == ARM7Timestamp) target++; - while (std::max(std::max(ARM9Timestamp, DMA9Timestamp), A9ContentionTS << ARM9ClockShift) < target) + while (ARM7Timestamp < target) { - ARM9Target = target; - CurCPU = 0; - RunTimers(0); - GPU.GPU3D.Run(); + ARM7Target = target; + //printf("A7 LOOP: %lli %lli\n", ARM9Timestamp>>ARM9ClockShift, ARM7Timestamp); + CurCPU = 1; + RunTimers(1); - if (ARM9.MRTrack.Type == MainRAMType::Null) + if (ARM7.MRTrack.Type == MainRAMType::Null) { - if (CPUStop & CPUStop_GXStall) + if (CPUStop & CPUStop_DMA7) { - // GXFIFO stall - s32 cycles = GPU.GPU3D.CyclesToRunFor(); - DMA9Timestamp = std::min(ARM9Target, std::max(ARM9Timestamp+(cycles<(*this); - dsi.RunNDMAs(0); + dsi.RunNDMAs(1); } } else { - //if (ARM9.abt) ARM9Timestamp = ARM9Target; - ARM9.Execute(); + //if (ARM7.abt > 16) ARM7Timestamp = ARM7Target; + ARM7.Execute(); } } - RunTimers(0); - GPU.GPU3D.Run(); - if (MainRAMHandle()) break; + RunTimers(1); + + if (!MainRAMHandle()) break; } } - RunSystem(target>>ARM9ClockShift); + RunSystem(target); if (CPUStop & CPUStop_Sleep) { From e3c874622926555babdf080a66f1f9d3a224760c Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Mon, 16 Dec 2024 20:50:39 -0500 Subject: [PATCH 290/306] fix a bug preventing main ram dmas from doing their first cycle properly --- src/DMA.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/DMA.cpp b/src/DMA.cpp index 3b5a9047..5c8aaaf2 100644 --- a/src/DMA.cpp +++ b/src/DMA.cpp @@ -606,7 +606,7 @@ void DMA::Run9() { NDS.ARM9.MRTrack.Type = MainRAMType::DMA16; NDS.ARM9.MRTrack.Var = Num; - break; + return; } NDS.DMA9Timestamp += (UnitTimings9_16(burststart) << NDS.ARM9ClockShift); @@ -631,7 +631,7 @@ void DMA::Run9() { NDS.ARM9.MRTrack.Type = 
MainRAMType::DMA32; NDS.ARM9.MRTrack.Var = Num; - break; + return; } NDS.DMA9Timestamp += (UnitTimings9_32(burststart) << NDS.ARM9ClockShift); @@ -701,7 +701,7 @@ void DMA::Run7() { NDS.ARM7.MRTrack.Type = MainRAMType::DMA16; NDS.ARM7.MRTrack.Var = Num+4; - break; + return; } NDS.ARM7Timestamp += UnitTimings7_16(burststart); @@ -726,7 +726,7 @@ void DMA::Run7() { NDS.ARM7.MRTrack.Type = MainRAMType::DMA32; NDS.ARM7.MRTrack.Var = Num+4; - break; + return; } NDS.ARM7Timestamp += UnitTimings7_32(burststart); From 0a5499cb1954243deb09a2c8cc61e8fd502653e3 Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Tue, 17 Dec 2024 11:02:14 -0500 Subject: [PATCH 291/306] fix wrong "running" variable being updated during 16 bit main ram dmas this probably fixes something --- src/NDS.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/NDS.cpp b/src/NDS.cpp index 605c35c7..0ac01e60 100644 --- a/src/NDS.cpp +++ b/src/NDS.cpp @@ -1214,7 +1214,7 @@ void NDS::MainRAMHandleARM9() dma->IterCount--; dma->RemCount--; - if (burststart <= 1) Running = 1; + if (burststart <= 1) dma->Running = 1; else dma->Running = 2; DMA9Timestamp = (A9ContentionTS << ARM9ClockShift); @@ -1528,7 +1528,7 @@ void NDS::MainRAMHandleARM7() dma->IterCount--; dma->RemCount--; - if (burststart <= 1) Running = 1; + if (burststart <= 1) dma->Running = 1; else dma->Running = 2; memset(&ARM7.MRTrack, 0, sizeof(ARM7.MRTrack)); From 6ef7a337cb9b8346437d218555391a14aa1084fe Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Tue, 17 Dec 2024 11:07:37 -0500 Subject: [PATCH 292/306] small optimization to main ram dma --- src/NDS.cpp | 40 ++++++++++++++++++++++++++++++---------- 1 file changed, 30 insertions(+), 10 deletions(-) diff --git a/src/NDS.cpp b/src/NDS.cpp index 0ac01e60..38648fdc 100644 --- a/src/NDS.cpp +++ b/src/NDS.cpp @@ -1093,6 +1093,9 @@ void NDS::MainRAMHandleARM9() else A9ContentionTS += 
ARM9MemTimings[srcaddr>>14][7]; DMALastWasMainRAM = false; } + + DMA9Timestamp = (A9ContentionTS << ARM9ClockShift); + ConTSLock = false; DMAReadHold[0] = ARM9Read32(srcaddr); @@ -1123,6 +1126,9 @@ void NDS::MainRAMHandleARM9() else A9ContentionTS += ARM9MemTimings[dstaddr>>14][7] - (burststart <= 0); DMALastWasMainRAM = false; } + + DMA9Timestamp = (A9ContentionTS << ARM9ClockShift); + ConTSLock = false; ARM9Write32(dstaddr, DMAReadHold[0]); @@ -1134,9 +1140,10 @@ void NDS::MainRAMHandleARM9() if (burststart <= 1) dma->Running = 1; else dma->Running = 2; - DMA9Timestamp = (A9ContentionTS << ARM9ClockShift); - memset(&ARM9.MRTrack, 0, sizeof(ARM9.MRTrack)); - ConTSLock = false; + if ((dma->IterCount == 0) || ((ARM9Regions[dma->CurSrcAddr>>14] != Mem9_MainRAM) && (ARM9Regions[dma->CurDstAddr>>14] != Mem9_MainRAM))) + memset(&ARM9.MRTrack, 0, sizeof(ARM9.MRTrack)); + else + ARM9.MRTrack.Progress = 0; } break; } @@ -1176,6 +1183,9 @@ void NDS::MainRAMHandleARM9() else A9ContentionTS += ARM9MemTimings[srcaddr>>14][5]; DMALastWasMainRAM = false; } + + DMA9Timestamp = (A9ContentionTS << ARM9ClockShift); + ConTSLock = false; DMAReadHold[0] = ARM9Read16(srcaddr); @@ -1206,6 +1216,9 @@ void NDS::MainRAMHandleARM9() else A9ContentionTS += ARM9MemTimings[dstaddr>>14][5] - (burststart <= 0); DMALastWasMainRAM = false; } + + DMA9Timestamp = (A9ContentionTS << ARM9ClockShift); + ConTSLock = false; ARM9Write16(dstaddr, DMAReadHold[0]); @@ -1217,9 +1230,10 @@ void NDS::MainRAMHandleARM9() if (burststart <= 1) dma->Running = 1; else dma->Running = 2; - DMA9Timestamp = (A9ContentionTS << ARM9ClockShift); - memset(&ARM9.MRTrack, 0, sizeof(ARM9.MRTrack)); - ConTSLock = false; + if ((dma->IterCount == 0) || ((ARM9Regions[dma->CurSrcAddr>>14] != Mem9_MainRAM) && (ARM9Regions[dma->CurDstAddr>>14] != Mem9_MainRAM))) + memset(&ARM9.MRTrack, 0, sizeof(ARM9.MRTrack)); + else + ARM9.MRTrack.Progress = 0; } break; } @@ -1449,8 +1463,11 @@ void NDS::MainRAMHandleARM7() if (burststart <= 1) 
dma->Running = 1; else dma->Running = 2; - - memset(&ARM7.MRTrack, 0, sizeof(ARM7.MRTrack)); + + if ((dma->IterCount == 0) || ((ARM7Regions[dma->CurSrcAddr>>15] != Mem7_MainRAM) && (ARM7Regions[dma->CurDstAddr>>15] != Mem7_MainRAM))) + memset(&ARM7.MRTrack, 0, sizeof(ARM7.MRTrack)); + else + ARM7.MRTrack.Progress = 0; } break; } @@ -1530,8 +1547,11 @@ void NDS::MainRAMHandleARM7() if (burststart <= 1) dma->Running = 1; else dma->Running = 2; - - memset(&ARM7.MRTrack, 0, sizeof(ARM7.MRTrack)); + + if ((dma->IterCount == 0) || ((ARM7Regions[dma->CurSrcAddr>>15] != Mem7_MainRAM) && (ARM7Regions[dma->CurDstAddr>>15] != Mem7_MainRAM))) + memset(&ARM7.MRTrack, 0, sizeof(ARM7.MRTrack)); + else + ARM7.MRTrack.Progress = 0; } break; } From 4a598294a5fa76d75bb978a4f8b7600a8a29b4a4 Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Tue, 17 Dec 2024 23:29:47 -0500 Subject: [PATCH 293/306] clean up a few errors --- src/CP15.cpp | 4 ++-- src/NDS.cpp | 7 +++++-- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/src/CP15.cpp b/src/CP15.cpp index bb7320cb..8903dab8 100644 --- a/src/CP15.cpp +++ b/src/CP15.cpp @@ -2967,7 +2967,7 @@ void ARMv5::DWrite16_3() else { NDS.ARM9Timestamp += MemTimings[addr >> 14][0]; - DataCycles = NDS.ARM9ClockShift; + DataCycles = 3<>14]; if (WBTimestamp < ((NDS.ARM9Timestamp + ((1<>14][2]; + NDS.ARM9Timestamp += MemTimings[addr>>14][2]; DataRegion = NDS.ARM9Regions[addr>>14]; if (WBTimestamp < ((NDS.ARM9Timestamp + ((1<= 0x1A); MainRAMTimestamp = A9ContentionTS + ((var & MR16) ? 8 : 9); // checkme: are these correct for 8bit? - if (var & MRWrite) A9ContentionTS += ((var & MR16) ? 5 : 6); // checkme: is this correct for 133mhz? + if (var & MRWrite) A9ContentionTS += ((var & MR16) ? 6 : 7); // checkme: is this correct for 133mhz? else { if (ARM9ClockShift == 1) A9ContentionTS += ((var & MR16) ? 
8 : 9); @@ -958,6 +958,7 @@ void NDS::MainRAMHandleARM9() } else { + ARM9.DataRegion = Mem9_MainRAM; u8 reg = ARM9.MRTrack.Progress; u32 addr = ARM9.FetchAddr[reg]; if (var & MRWrite) // write @@ -1051,6 +1052,8 @@ void NDS::MainRAMHandleARM9() (*prog)++; if (*prog >= 8) { + ARM9.DataRegion = Mem9_MainRAM; + ARM9.DataCycles = 3 << ARM9ClockShift; ARM9.RetVal = dcache[(ARM9.FetchAddr[16] & 0x1F) / 4]; memset(&ARM9.MRTrack, 0, sizeof(ARM9.MRTrack)); ConTSLock = false; From 4493634a5b34d754482cf8fff8177f6983923df1 Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Thu, 19 Dec 2024 15:03:15 -0500 Subject: [PATCH 294/306] improve dma accuracy slightly --- src/NDS.cpp | 96 ++++++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 80 insertions(+), 16 deletions(-) diff --git a/src/NDS.cpp b/src/NDS.cpp index c7e3b0ad..99b9878b 100644 --- a/src/NDS.cpp +++ b/src/NDS.cpp @@ -1092,8 +1092,16 @@ void NDS::MainRAMHandleARM9() } else { - if (burststart == 2 || dma->SrcAddrInc <= 0) A9ContentionTS += ARM9MemTimings[srcaddr>>14][6] + (ARM9MemTimings[srcaddr>>14][6] == 1); - else A9ContentionTS += ARM9MemTimings[srcaddr>>14][7]; + if (burststart == 2 || dma->SrcAddrInc <= 0) + { + A9ContentionTS += ARM9MemTimings[srcaddr>>14][6] + ((burststart == 2) && (ARM9MemTimings[srcaddr>>14][6] == 1)); + MainRAMTimestamp += ARM9MemTimings[srcaddr>>14][6] + ((burststart == 2) && (ARM9MemTimings[srcaddr>>14][6] == 1)); + } + else + { + A9ContentionTS += ARM9MemTimings[srcaddr>>14][7]; + MainRAMTimestamp += ARM9MemTimings[srcaddr>>14][7]; + } DMALastWasMainRAM = false; } @@ -1125,8 +1133,16 @@ void NDS::MainRAMHandleARM9() } else { - if (burststart == 2 || dma->DstAddrInc <= 0) A9ContentionTS += ARM9MemTimings[dstaddr>>14][6]; - else A9ContentionTS += ARM9MemTimings[dstaddr>>14][7] - (burststart <= 0); + if (burststart == 2 || dma->DstAddrInc <= 0) + { + A9ContentionTS += ARM9MemTimings[dstaddr>>14][6] - (burststart <= 0); + MainRAMTimestamp += 
ARM9MemTimings[dstaddr>>14][6] + (burststart == 1); + } + else + { + A9ContentionTS += ARM9MemTimings[dstaddr>>14][7] - (burststart <= 0); + MainRAMTimestamp += ARM9MemTimings[dstaddr>>14][7] + (burststart == 1); + } DMALastWasMainRAM = false; } @@ -1182,8 +1198,16 @@ void NDS::MainRAMHandleARM9() } else { - if (burststart == 2 || dma->SrcAddrInc <= 0) A9ContentionTS += ARM9MemTimings[srcaddr>>14][4] + (ARM9MemTimings[srcaddr>>14][4] == 1); - else A9ContentionTS += ARM9MemTimings[srcaddr>>14][5]; + if (burststart == 2 || dma->SrcAddrInc <= 0) + { + A9ContentionTS += ARM9MemTimings[srcaddr>>14][4] + ((burststart == 2) && (ARM9MemTimings[srcaddr>>14][4] == 1)); + MainRAMTimestamp += ARM9MemTimings[srcaddr>>14][4] + ((burststart == 2) && (ARM9MemTimings[srcaddr>>14][4] == 1)); + } + else + { + A9ContentionTS += ARM9MemTimings[srcaddr>>14][5]; + MainRAMTimestamp += ARM9MemTimings[srcaddr>>14][5]; + } DMALastWasMainRAM = false; } @@ -1215,8 +1239,16 @@ void NDS::MainRAMHandleARM9() } else { - if (burststart == 2 || dma->DstAddrInc <= 0) A9ContentionTS += ARM9MemTimings[dstaddr>>14][4]; - else A9ContentionTS += ARM9MemTimings[dstaddr>>14][5] - (burststart <= 0); + if (burststart == 2 || dma->DstAddrInc <= 0) + { + A9ContentionTS += ARM9MemTimings[dstaddr>>14][4] + (burststart == 1); + MainRAMTimestamp += ARM9MemTimings[dstaddr>>14][4]; + } + else + { + A9ContentionTS += ARM9MemTimings[dstaddr>>14][5] + (burststart == 1); + MainRAMTimestamp += ARM9MemTimings[dstaddr>>14][5]; + } DMALastWasMainRAM = false; } @@ -1422,8 +1454,16 @@ void NDS::MainRAMHandleARM7() } else { - if (burststart == 2 || dma->SrcAddrInc <= 0) ARM7Timestamp += ARM7MemTimings[srcaddr>>15][2] + (ARM7MemTimings[srcaddr>>15][2] == 1); - else ARM7Timestamp += ARM7MemTimings[srcaddr>>15][3]; + if (burststart == 2 || dma->SrcAddrInc <= 0) + { + ARM7Timestamp += ARM7MemTimings[srcaddr>>15][2] + ((burststart == 2) && (ARM7MemTimings[srcaddr>>15][2] == 1)); + MainRAMTimestamp += ARM7MemTimings[srcaddr>>15][2] + 
((burststart == 2) && (ARM7MemTimings[srcaddr>>15][2] == 1)); + } + else + { + ARM7Timestamp += ARM7MemTimings[srcaddr>>15][3]; + MainRAMTimestamp += ARM7MemTimings[srcaddr>>15][3]; + } DMALastWasMainRAM = false; } @@ -1452,8 +1492,16 @@ void NDS::MainRAMHandleARM7() } else { - if (burststart == 2 || dma->DstAddrInc <= 0) ARM7Timestamp += ARM7MemTimings[dstaddr>>15][2]; - else ARM7Timestamp += ARM7MemTimings[dstaddr>>15][3] - (burststart <= 0); + if (burststart == 2 || dma->DstAddrInc <= 0) + { + ARM7Timestamp += ARM7MemTimings[dstaddr>>15][2] - (burststart <= 0); + MainRAMTimestamp += ARM7MemTimings[dstaddr>>15][2] + (burststart == 1); + } + else + { + ARM7Timestamp += ARM7MemTimings[dstaddr>>15][3] - (burststart <= 0); + MainRAMTimestamp += ARM7MemTimings[dstaddr>>15][3] + (burststart == 1); + } DMALastWasMainRAM = false; } @@ -1506,8 +1554,16 @@ void NDS::MainRAMHandleARM7() } else { - if (burststart == 2 || dma->SrcAddrInc <= 0) ARM7Timestamp += ARM7MemTimings[srcaddr>>15][0] + (ARM7MemTimings[srcaddr>>15][0] == 1); - else ARM7Timestamp += ARM7MemTimings[srcaddr>>15][1]; + if (burststart == 2 || dma->SrcAddrInc <= 0) + { + ARM7Timestamp += ARM7MemTimings[srcaddr>>15][0] + ((burststart == 2) && (ARM7MemTimings[srcaddr>>15][0] == 1)); + MainRAMTimestamp += ARM7MemTimings[srcaddr>>15][0] + ((burststart == 2) && (ARM7MemTimings[srcaddr>>15][0] == 1)); + } + else + { + ARM7Timestamp += ARM7MemTimings[srcaddr>>15][1]; + MainRAMTimestamp += ARM7MemTimings[srcaddr>>15][1]; + } DMALastWasMainRAM = false; } @@ -1536,8 +1592,16 @@ void NDS::MainRAMHandleARM7() } else { - if (burststart == 2 || dma->DstAddrInc <= 0) ARM7Timestamp += ARM7MemTimings[dstaddr>>15][0]; - else ARM7Timestamp += ARM7MemTimings[dstaddr>>15][1] - (burststart <= 0); + if (burststart == 2 || dma->DstAddrInc <= 0) + { + ARM7Timestamp += ARM7MemTimings[dstaddr>>15][0] + (burststart == 1); + MainRAMTimestamp += ARM7MemTimings[dstaddr>>15][0]; + } + else + { + ARM7Timestamp += 
ARM7MemTimings[dstaddr>>15][1] + (burststart == 1); + MainRAMTimestamp += ARM7MemTimings[dstaddr>>15][1]; + } DMALastWasMainRAM = false; } From 068652354389c503e9ae8372f964118fbc6b6c49 Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Thu, 19 Dec 2024 16:36:13 -0500 Subject: [PATCH 295/306] card reads are "double buffered" means they can fetch a word in the background while a word is still waiting to be read. Thanks Gericom! --- src/NDS.cpp | 2 +- src/NDSCart.cpp | 16 +++++++++++++++- src/NDSCart.h | 2 ++ 3 files changed, 18 insertions(+), 2 deletions(-) diff --git a/src/NDS.cpp b/src/NDS.cpp index 99b9878b..c0225bb6 100644 --- a/src/NDS.cpp +++ b/src/NDS.cpp @@ -1922,7 +1922,7 @@ void NDS::ScheduleEvent(u32 id, bool periodic, s32 delay, u32 funcid, u32 param) else { if (CurCPU == 0) - evt.Timestamp = (ARM9Timestamp >> ARM9ClockShift) + delay; + evt.Timestamp = ((ARM9Timestamp + ((1<> ARM9ClockShift) + delay; else evt.Timestamp = ARM7Timestamp + delay; } diff --git a/src/NDSCart.cpp b/src/NDSCart.cpp index 1fa0fbfe..29518540 100644 --- a/src/NDSCart.cpp +++ b/src/NDSCart.cpp @@ -1799,6 +1799,7 @@ void NDSCartSlot::ResetCart() noexcept TransferDir = 0; memset(TransferCmd.data(), 0, sizeof(TransferCmd)); TransferCmd[0] = 0xFF; + LastRomTransferTime = 0; if (Cart) Cart->Reset(); } @@ -1922,6 +1923,9 @@ void NDSCartSlot::WriteROMCnt(u32 val) noexcept NDS.ScheduleEvent(Event_ROMTransfer, false, xfercycle*cmddelay, ROMTransfer_End, 0); else NDS.ScheduleEvent(Event_ROMTransfer, false, xfercycle*(cmddelay+4), ROMTransfer_PrepareData, 0); + + if (NDS.CurCPU) LastRomTransferTime = NDS.ARM7Timestamp + xfercycle*(cmddelay+4); + else LastRomTransferTime = ((std::max(NDS.ARM9Timestamp, NDS.DMA9Timestamp) + ((1<> NDS.ARM9ClockShift) + (xfercycle*(cmddelay+4)); } void NDSCartSlot::AdvanceROMTransfer() noexcept @@ -1938,7 +1942,17 @@ void NDSCartSlot::AdvanceROMTransfer() noexcept delay += ((ROMCnt >> 16) & 0x3F); } - 
NDS.ScheduleEvent(Event_ROMTransfer, false, xfercycle*delay, ROMTransfer_PrepareData, 0); + u64 curts; + if (NDS.CurCPU) curts = NDS.ARM7Timestamp; + else curts = (std::max(NDS.ARM9Timestamp, NDS.DMA9Timestamp) + ((1<> NDS.ARM9ClockShift; + + s64 nexttransfer = (xfercycle*delay) - (curts - LastRomTransferTime); + + if (nexttransfer < 1) nexttransfer = 1; // CHECKME: the value of 1 here was kinda just a guess? it seems right though. + + NDS.ScheduleEvent(Event_ROMTransfer, false, nexttransfer, ROMTransfer_PrepareData, 0); + + LastRomTransferTime = curts; } else ROMEndTransfer(0); diff --git a/src/NDSCart.h b/src/NDSCart.h index 3704f659..c84aec90 100644 --- a/src/NDSCart.h +++ b/src/NDSCart.h @@ -442,6 +442,8 @@ private: u64 Key2_X = 0; u64 Key2_Y = 0; + u64 LastRomTransferTime; + void Key1_Encrypt(u32* data) const noexcept; void Key1_Decrypt(u32* data) const noexcept; void Key1_ApplyKeycode(u32* keycode, u32 mod) noexcept; From 14c765d5ed34554157251d91b6338bfa4e57777e Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Thu, 19 Dec 2024 21:54:20 -0500 Subject: [PATCH 296/306] Revert "improve dma accuracy slightly" apparently it completely broke sm64ds for some reason --- src/NDS.cpp | 96 +++++++++-------------------------------------------- 1 file changed, 16 insertions(+), 80 deletions(-) diff --git a/src/NDS.cpp b/src/NDS.cpp index c0225bb6..5a68b865 100644 --- a/src/NDS.cpp +++ b/src/NDS.cpp @@ -1092,16 +1092,8 @@ void NDS::MainRAMHandleARM9() } else { - if (burststart == 2 || dma->SrcAddrInc <= 0) - { - A9ContentionTS += ARM9MemTimings[srcaddr>>14][6] + ((burststart == 2) && (ARM9MemTimings[srcaddr>>14][6] == 1)); - MainRAMTimestamp += ARM9MemTimings[srcaddr>>14][6] + ((burststart == 2) && (ARM9MemTimings[srcaddr>>14][6] == 1)); - } - else - { - A9ContentionTS += ARM9MemTimings[srcaddr>>14][7]; - MainRAMTimestamp += ARM9MemTimings[srcaddr>>14][7]; - } + if (burststart == 2 || dma->SrcAddrInc <= 0) A9ContentionTS += 
ARM9MemTimings[srcaddr>>14][6] + (ARM9MemTimings[srcaddr>>14][6] == 1); + else A9ContentionTS += ARM9MemTimings[srcaddr>>14][7]; DMALastWasMainRAM = false; } @@ -1133,16 +1125,8 @@ void NDS::MainRAMHandleARM9() } else { - if (burststart == 2 || dma->DstAddrInc <= 0) - { - A9ContentionTS += ARM9MemTimings[dstaddr>>14][6] - (burststart <= 0); - MainRAMTimestamp += ARM9MemTimings[dstaddr>>14][6] + (burststart == 1); - } - else - { - A9ContentionTS += ARM9MemTimings[dstaddr>>14][7] - (burststart <= 0); - MainRAMTimestamp += ARM9MemTimings[dstaddr>>14][7] + (burststart == 1); - } + if (burststart == 2 || dma->DstAddrInc <= 0) A9ContentionTS += ARM9MemTimings[dstaddr>>14][6]; + else A9ContentionTS += ARM9MemTimings[dstaddr>>14][7] - (burststart <= 0); DMALastWasMainRAM = false; } @@ -1198,16 +1182,8 @@ void NDS::MainRAMHandleARM9() } else { - if (burststart == 2 || dma->SrcAddrInc <= 0) - { - A9ContentionTS += ARM9MemTimings[srcaddr>>14][4] + ((burststart == 2) && (ARM9MemTimings[srcaddr>>14][4] == 1)); - MainRAMTimestamp += ARM9MemTimings[srcaddr>>14][4] + ((burststart == 2) && (ARM9MemTimings[srcaddr>>14][4] == 1)); - } - else - { - A9ContentionTS += ARM9MemTimings[srcaddr>>14][5]; - MainRAMTimestamp += ARM9MemTimings[srcaddr>>14][5]; - } + if (burststart == 2 || dma->SrcAddrInc <= 0) A9ContentionTS += ARM9MemTimings[srcaddr>>14][4] + (ARM9MemTimings[srcaddr>>14][4] == 1); + else A9ContentionTS += ARM9MemTimings[srcaddr>>14][5]; DMALastWasMainRAM = false; } @@ -1239,16 +1215,8 @@ void NDS::MainRAMHandleARM9() } else { - if (burststart == 2 || dma->DstAddrInc <= 0) - { - A9ContentionTS += ARM9MemTimings[dstaddr>>14][4] + (burststart == 1); - MainRAMTimestamp += ARM9MemTimings[dstaddr>>14][4]; - } - else - { - A9ContentionTS += ARM9MemTimings[dstaddr>>14][5] + (burststart == 1); - MainRAMTimestamp += ARM9MemTimings[dstaddr>>14][5]; - } + if (burststart == 2 || dma->DstAddrInc <= 0) A9ContentionTS += ARM9MemTimings[dstaddr>>14][4]; + else A9ContentionTS += 
ARM9MemTimings[dstaddr>>14][5] - (burststart <= 0); DMALastWasMainRAM = false; } @@ -1454,16 +1422,8 @@ void NDS::MainRAMHandleARM7() } else { - if (burststart == 2 || dma->SrcAddrInc <= 0) - { - ARM7Timestamp += ARM7MemTimings[srcaddr>>15][2] + ((burststart == 2) && (ARM7MemTimings[srcaddr>>15][2] == 1)); - MainRAMTimestamp += ARM7MemTimings[srcaddr>>15][2] + ((burststart == 2) && (ARM7MemTimings[srcaddr>>15][2] == 1)); - } - else - { - ARM7Timestamp += ARM7MemTimings[srcaddr>>15][3]; - MainRAMTimestamp += ARM7MemTimings[srcaddr>>15][3]; - } + if (burststart == 2 || dma->SrcAddrInc <= 0) ARM7Timestamp += ARM7MemTimings[srcaddr>>15][2] + (ARM7MemTimings[srcaddr>>15][2] == 1); + else ARM7Timestamp += ARM7MemTimings[srcaddr>>15][3]; DMALastWasMainRAM = false; } @@ -1492,16 +1452,8 @@ void NDS::MainRAMHandleARM7() } else { - if (burststart == 2 || dma->DstAddrInc <= 0) - { - ARM7Timestamp += ARM7MemTimings[dstaddr>>15][2] - (burststart <= 0); - MainRAMTimestamp += ARM7MemTimings[dstaddr>>15][2] + (burststart == 1); - } - else - { - ARM7Timestamp += ARM7MemTimings[dstaddr>>15][3] - (burststart <= 0); - MainRAMTimestamp += ARM7MemTimings[dstaddr>>15][3] + (burststart == 1); - } + if (burststart == 2 || dma->DstAddrInc <= 0) ARM7Timestamp += ARM7MemTimings[dstaddr>>15][2]; + else ARM7Timestamp += ARM7MemTimings[dstaddr>>15][3] - (burststart <= 0); DMALastWasMainRAM = false; } @@ -1554,16 +1506,8 @@ void NDS::MainRAMHandleARM7() } else { - if (burststart == 2 || dma->SrcAddrInc <= 0) - { - ARM7Timestamp += ARM7MemTimings[srcaddr>>15][0] + ((burststart == 2) && (ARM7MemTimings[srcaddr>>15][0] == 1)); - MainRAMTimestamp += ARM7MemTimings[srcaddr>>15][0] + ((burststart == 2) && (ARM7MemTimings[srcaddr>>15][0] == 1)); - } - else - { - ARM7Timestamp += ARM7MemTimings[srcaddr>>15][1]; - MainRAMTimestamp += ARM7MemTimings[srcaddr>>15][1]; - } + if (burststart == 2 || dma->SrcAddrInc <= 0) ARM7Timestamp += ARM7MemTimings[srcaddr>>15][0] + (ARM7MemTimings[srcaddr>>15][0] == 1); + 
else ARM7Timestamp += ARM7MemTimings[srcaddr>>15][1]; DMALastWasMainRAM = false; } @@ -1592,16 +1536,8 @@ void NDS::MainRAMHandleARM7() } else { - if (burststart == 2 || dma->DstAddrInc <= 0) - { - ARM7Timestamp += ARM7MemTimings[dstaddr>>15][0] + (burststart == 1); - MainRAMTimestamp += ARM7MemTimings[dstaddr>>15][0]; - } - else - { - ARM7Timestamp += ARM7MemTimings[dstaddr>>15][1] + (burststart == 1); - MainRAMTimestamp += ARM7MemTimings[dstaddr>>15][1]; - } + if (burststart == 2 || dma->DstAddrInc <= 0) ARM7Timestamp += ARM7MemTimings[dstaddr>>15][0]; + else ARM7Timestamp += ARM7MemTimings[dstaddr>>15][1] - (burststart <= 0); DMALastWasMainRAM = false; } From 6897e4a8be23c9dc15388cbd161da5d3d75129a4 Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Thu, 19 Dec 2024 21:55:50 -0500 Subject: [PATCH 297/306] rework card read timings --- src/NDS.cpp | 4 +++- src/NDSCart.cpp | 63 ++++++++++++++++++++++++++++++++++++++----------- src/NDSCart.h | 10 +++++--- 3 files changed, 59 insertions(+), 18 deletions(-) diff --git a/src/NDS.cpp b/src/NDS.cpp index 5a68b865..a5618d6f 100644 --- a/src/NDS.cpp +++ b/src/NDS.cpp @@ -1712,7 +1712,8 @@ u32 NDS::RunFrame() } //printf("MAIN LOOP: 9 %lli %08X %08llX %i 7 %lli %08X %08llX %i %i %08X\n", ARM9Timestamp>>ARM9ClockShift, ARM9.PC, ARM9.CurInstr, (u8)ARM9.MRTrack.Type, ARM7Timestamp, ARM7.R[15], ARM7.CurInstr, (u8)ARM7.MRTrack.Type, IME[1], IE[1]); - + + NDSCartSlot.ROMPrepareData(); RunTimers(0); GPU.GPU3D.Run(); @@ -1751,6 +1752,7 @@ u32 NDS::RunFrame() } RunTimers(1); + NDSCartSlot.ROMPrepareData(); if (!MainRAMHandle()) break; } diff --git a/src/NDSCart.cpp b/src/NDSCart.cpp index 29518540..3388933f 100644 --- a/src/NDSCart.cpp +++ b/src/NDSCart.cpp @@ -37,8 +37,7 @@ namespace NDSCart enum { - ROMTransfer_PrepareData = 0, - ROMTransfer_End + ROMTransfer_End = 0 }; // SRAM TODO: emulate write delays??? 
@@ -1443,7 +1442,6 @@ NDSCartSlot::NDSCartSlot(melonDS::NDS& nds, std::unique_ptr&& rom) n { NDS.RegisterEventFuncs(Event_ROMTransfer, this, { - MakeEventThunk(NDSCartSlot, ROMPrepareData), MakeEventThunk(NDSCartSlot, ROMEndTransfer) }); NDS.RegisterEventFuncs(Event_ROMSPITransfer, this, {MakeEventThunk(NDSCartSlot, SPITransferDone)}); @@ -1799,7 +1797,10 @@ void NDSCartSlot::ResetCart() noexcept TransferDir = 0; memset(TransferCmd.data(), 0, sizeof(TransferCmd)); TransferCmd[0] = 0xFF; - LastRomTransferTime = 0; + ROMTransferTime[0] = 0; + ROMTransferTime[1] = 0; + QueueIRQ = false; + ScheduledIRQ = false; if (Cart) Cart->Reset(); } @@ -1816,8 +1817,14 @@ void NDSCartSlot::ROMEndTransfer(u32 param) noexcept Cart->ROMCommandFinish(TransferCmd.data(), TransferData.data(), TransferLen); } -void NDSCartSlot::ROMPrepareData(u32 param) noexcept +void NDSCartSlot::ROMPrepareData() noexcept { + u64 curts; + if (NDS.CurCPU) curts = NDS.ARM7Timestamp; + else curts = (std::max(NDS.ARM9Timestamp, NDS.DMA9Timestamp) + ((1<> NDS.ARM9ClockShift; + + if (curts < ROMTransferTime[0]) return; + if (TransferDir == 0) { if (TransferPos >= TransferLen) @@ -1828,6 +1835,8 @@ void NDSCartSlot::ROMPrepareData(u32 param) noexcept TransferPos += 4; } + ROMTransferTime[0] = -1; + ROMCnt |= (1<<23); if (NDS.ExMemCnt[0] & (1<<11)) @@ -1836,6 +1845,12 @@ void NDSCartSlot::ROMPrepareData(u32 param) noexcept NDS.CheckDMAs(0, 0x05); } +u32 NDSCartSlot::GetROMCnt() noexcept +{ + ROMPrepareData(); + return ROMCnt; +} + void NDSCartSlot::WriteROMCnt(u32 val) noexcept { u32 xferstart = (val & ~ROMCnt) & (1<<31); @@ -1922,10 +1937,18 @@ void NDSCartSlot::WriteROMCnt(u32 val) noexcept if (datasize == 0) NDS.ScheduleEvent(Event_ROMTransfer, false, xfercycle*cmddelay, ROMTransfer_End, 0); else - NDS.ScheduleEvent(Event_ROMTransfer, false, xfercycle*(cmddelay+4), ROMTransfer_PrepareData, 0); + { + u64 curts; + if (NDS.CurCPU) curts = NDS.ARM7Timestamp; + else curts = ((std::max(NDS.ARM9Timestamp, 
NDS.DMA9Timestamp) + ((1<> NDS.ARM9ClockShift); - if (NDS.CurCPU) LastRomTransferTime = NDS.ARM7Timestamp + xfercycle*(cmddelay+4); - else LastRomTransferTime = ((std::max(NDS.ARM9Timestamp, NDS.DMA9Timestamp) + ((1<> NDS.ARM9ClockShift) + (xfercycle*(cmddelay+4)); + ROMTransferTime[0] = (xfercycle*(cmddelay+4)) + curts; + + if ((TransferPos + 4) < TransferLen) + ROMTransferTime[1] = (xfercycle*(cmddelay+8)) + curts; + else + ROMTransferTime[1] = -1; + } } void NDSCartSlot::AdvanceROMTransfer() noexcept @@ -1938,7 +1961,7 @@ void NDSCartSlot::AdvanceROMTransfer() noexcept u32 delay = 4; if (!(ROMCnt & (1<<30))) { - if (!(TransferPos & 0x1FF)) + if (!((TransferPos+4) & 0x1FF)) delay += ((ROMCnt >> 16) & 0x3F); } @@ -1946,13 +1969,13 @@ void NDSCartSlot::AdvanceROMTransfer() noexcept if (NDS.CurCPU) curts = NDS.ARM7Timestamp; else curts = (std::max(NDS.ARM9Timestamp, NDS.DMA9Timestamp) + ((1<> NDS.ARM9ClockShift; - s64 nexttransfer = (xfercycle*delay) - (curts - LastRomTransferTime); + ROMTransferTime[0] = ROMTransferTime[1]; - if (nexttransfer < 1) nexttransfer = 1; // CHECKME: the value of 1 here was kinda just a guess? it seems right though. 
+ if ((TransferPos + 4) < TransferLen) + ROMTransferTime[1] = (xfercycle*delay) + curts; + else + ROMTransferTime[1] = -1; - NDS.ScheduleEvent(Event_ROMTransfer, false, nexttransfer, ROMTransfer_PrepareData, 0); - - LastRomTransferTime = curts; } else ROMEndTransfer(0); @@ -1961,6 +1984,12 @@ void NDSCartSlot::AdvanceROMTransfer() noexcept u32 NDSCartSlot::ReadROMData() noexcept { if (ROMCnt & (1<<30)) return 0; + + u64 curts; + if (NDS.CurCPU) curts = NDS.ARM7Timestamp; + else curts = (std::max(NDS.ARM9Timestamp, NDS.DMA9Timestamp) + ((1<> NDS.ARM9ClockShift; + + ROMPrepareData(); if (ROMCnt & (1<<23)) { @@ -1974,6 +2003,12 @@ void NDSCartSlot::WriteROMData(u32 val) noexcept { if (!(ROMCnt & (1<<30))) return; + u64 curts; + if (NDS.CurCPU) curts = NDS.ARM7Timestamp; + else curts = (std::max(NDS.ARM9Timestamp, NDS.DMA9Timestamp) + ((1<> NDS.ARM9ClockShift; + + ROMPrepareData(); + ROMData = val; if (ROMCnt & (1<<23)) diff --git a/src/NDSCart.h b/src/NDSCart.h index c84aec90..7ee03252 100644 --- a/src/NDSCart.h +++ b/src/NDSCart.h @@ -410,13 +410,16 @@ public: void WriteROMCnt(u32 val) noexcept; [[nodiscard]] u8 ReadSPIData() const noexcept; void WriteSPIData(u8 val) noexcept; + void ROMPrepareData() noexcept; [[nodiscard]] u8 GetROMCommand(u8 index) const noexcept { return ROMCommand[index]; } void SetROMCommand(u8 index, u8 val) noexcept { ROMCommand[index] = val; } - [[nodiscard]] u32 GetROMCnt() const noexcept { return ROMCnt; } + [[nodiscard]] u32 GetROMCnt() noexcept; + [[nodiscard]] u16 GetSPICnt() const noexcept { return SPICnt; } void SetSPICnt(u16 val) noexcept { SPICnt = val; } + private: friend class CartCommon; melonDS::NDS& NDS; @@ -426,6 +429,8 @@ private: u8 SPIData = 0; u32 SPIDataPos = 0; bool SPIHold = false; + bool QueueIRQ; + bool ScheduledIRQ; u32 ROMData = 0; @@ -442,7 +447,7 @@ private: u64 Key2_X = 0; u64 Key2_Y = 0; - u64 LastRomTransferTime; + u64 ROMTransferTime[2]; void Key1_Encrypt(u32* data) const noexcept; void Key1_Decrypt(u32* data) 
const noexcept; @@ -451,7 +456,6 @@ private: void Key1_InitKeycode(bool dsi, u32 idcode, u32 level, u32 mod) noexcept; void Key2_Encrypt(const u8* data, u32 len) noexcept; void ROMEndTransfer(u32 param) noexcept; - void ROMPrepareData(u32 param) noexcept; void AdvanceROMTransfer() noexcept; void SPITransferDone(u32 param) noexcept; }; From dd857e89ef7548a523f7f7948c60385a1cdbc41a Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Fri, 20 Dec 2024 00:33:08 -0500 Subject: [PATCH 298/306] fix an oversight --- src/NDSCart.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/NDSCart.cpp b/src/NDSCart.cpp index 3388933f..f3780af5 100644 --- a/src/NDSCart.cpp +++ b/src/NDSCart.cpp @@ -1972,7 +1972,7 @@ void NDSCartSlot::AdvanceROMTransfer() noexcept ROMTransferTime[0] = ROMTransferTime[1]; if ((TransferPos + 4) < TransferLen) - ROMTransferTime[1] = (xfercycle*delay) + curts; + ROMTransferTime[1] = (xfercycle*delay) + std::max(curts, ROMTransferTime[0]); else ROMTransferTime[1] = -1; From 40527f889789be42fccf213c58b37533a4163bfc Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Fri, 20 Dec 2024 09:48:25 -0500 Subject: [PATCH 299/306] remove leftover junk, fix an error with resets --- src/NDSCart.cpp | 6 ++---- src/NDSCart.h | 2 -- 2 files changed, 2 insertions(+), 6 deletions(-) diff --git a/src/NDSCart.cpp b/src/NDSCart.cpp index f3780af5..b49c9e15 100644 --- a/src/NDSCart.cpp +++ b/src/NDSCart.cpp @@ -1797,10 +1797,8 @@ void NDSCartSlot::ResetCart() noexcept TransferDir = 0; memset(TransferCmd.data(), 0, sizeof(TransferCmd)); TransferCmd[0] = 0xFF; - ROMTransferTime[0] = 0; - ROMTransferTime[1] = 0; - QueueIRQ = false; - ScheduledIRQ = false; + ROMTransferTime[0] = -1; + ROMTransferTime[1] = -1; if (Cart) Cart->Reset(); } diff --git a/src/NDSCart.h b/src/NDSCart.h index 7ee03252..560b5d62 100644 --- a/src/NDSCart.h +++ b/src/NDSCart.h @@ -429,8 +429,6 @@ private: u8 SPIData = 
0; u32 SPIDataPos = 0; bool SPIHold = false; - bool QueueIRQ; - bool ScheduledIRQ; u32 ROMData = 0; From e0ac68c9f26a2e2188ddec584a0e1b0b314e3a08 Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Fri, 20 Dec 2024 10:00:47 -0500 Subject: [PATCH 300/306] Reapply "improve dma accuracy slightly" This reverts commit 14c765d5ed34554157251d91b6338bfa4e57777e. --- src/NDS.cpp | 96 ++++++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 80 insertions(+), 16 deletions(-) diff --git a/src/NDS.cpp b/src/NDS.cpp index a5618d6f..da89b750 100644 --- a/src/NDS.cpp +++ b/src/NDS.cpp @@ -1092,8 +1092,16 @@ void NDS::MainRAMHandleARM9() } else { - if (burststart == 2 || dma->SrcAddrInc <= 0) A9ContentionTS += ARM9MemTimings[srcaddr>>14][6] + (ARM9MemTimings[srcaddr>>14][6] == 1); - else A9ContentionTS += ARM9MemTimings[srcaddr>>14][7]; + if (burststart == 2 || dma->SrcAddrInc <= 0) + { + A9ContentionTS += ARM9MemTimings[srcaddr>>14][6] + ((burststart == 2) && (ARM9MemTimings[srcaddr>>14][6] == 1)); + MainRAMTimestamp += ARM9MemTimings[srcaddr>>14][6] + ((burststart == 2) && (ARM9MemTimings[srcaddr>>14][6] == 1)); + } + else + { + A9ContentionTS += ARM9MemTimings[srcaddr>>14][7]; + MainRAMTimestamp += ARM9MemTimings[srcaddr>>14][7]; + } DMALastWasMainRAM = false; } @@ -1125,8 +1133,16 @@ void NDS::MainRAMHandleARM9() } else { - if (burststart == 2 || dma->DstAddrInc <= 0) A9ContentionTS += ARM9MemTimings[dstaddr>>14][6]; - else A9ContentionTS += ARM9MemTimings[dstaddr>>14][7] - (burststart <= 0); + if (burststart == 2 || dma->DstAddrInc <= 0) + { + A9ContentionTS += ARM9MemTimings[dstaddr>>14][6] - (burststart <= 0); + MainRAMTimestamp += ARM9MemTimings[dstaddr>>14][6] + (burststart == 1); + } + else + { + A9ContentionTS += ARM9MemTimings[dstaddr>>14][7] - (burststart <= 0); + MainRAMTimestamp += ARM9MemTimings[dstaddr>>14][7] + (burststart == 1); + } DMALastWasMainRAM = false; } @@ -1182,8 +1198,16 @@ void NDS::MainRAMHandleARM9() } 
else { - if (burststart == 2 || dma->SrcAddrInc <= 0) A9ContentionTS += ARM9MemTimings[srcaddr>>14][4] + (ARM9MemTimings[srcaddr>>14][4] == 1); - else A9ContentionTS += ARM9MemTimings[srcaddr>>14][5]; + if (burststart == 2 || dma->SrcAddrInc <= 0) + { + A9ContentionTS += ARM9MemTimings[srcaddr>>14][4] + ((burststart == 2) && (ARM9MemTimings[srcaddr>>14][4] == 1)); + MainRAMTimestamp += ARM9MemTimings[srcaddr>>14][4] + ((burststart == 2) && (ARM9MemTimings[srcaddr>>14][4] == 1)); + } + else + { + A9ContentionTS += ARM9MemTimings[srcaddr>>14][5]; + MainRAMTimestamp += ARM9MemTimings[srcaddr>>14][5]; + } DMALastWasMainRAM = false; } @@ -1215,8 +1239,16 @@ void NDS::MainRAMHandleARM9() } else { - if (burststart == 2 || dma->DstAddrInc <= 0) A9ContentionTS += ARM9MemTimings[dstaddr>>14][4]; - else A9ContentionTS += ARM9MemTimings[dstaddr>>14][5] - (burststart <= 0); + if (burststart == 2 || dma->DstAddrInc <= 0) + { + A9ContentionTS += ARM9MemTimings[dstaddr>>14][4] + (burststart == 1); + MainRAMTimestamp += ARM9MemTimings[dstaddr>>14][4]; + } + else + { + A9ContentionTS += ARM9MemTimings[dstaddr>>14][5] + (burststart == 1); + MainRAMTimestamp += ARM9MemTimings[dstaddr>>14][5]; + } DMALastWasMainRAM = false; } @@ -1422,8 +1454,16 @@ void NDS::MainRAMHandleARM7() } else { - if (burststart == 2 || dma->SrcAddrInc <= 0) ARM7Timestamp += ARM7MemTimings[srcaddr>>15][2] + (ARM7MemTimings[srcaddr>>15][2] == 1); - else ARM7Timestamp += ARM7MemTimings[srcaddr>>15][3]; + if (burststart == 2 || dma->SrcAddrInc <= 0) + { + ARM7Timestamp += ARM7MemTimings[srcaddr>>15][2] + ((burststart == 2) && (ARM7MemTimings[srcaddr>>15][2] == 1)); + MainRAMTimestamp += ARM7MemTimings[srcaddr>>15][2] + ((burststart == 2) && (ARM7MemTimings[srcaddr>>15][2] == 1)); + } + else + { + ARM7Timestamp += ARM7MemTimings[srcaddr>>15][3]; + MainRAMTimestamp += ARM7MemTimings[srcaddr>>15][3]; + } DMALastWasMainRAM = false; } @@ -1452,8 +1492,16 @@ void NDS::MainRAMHandleARM7() } else { - if (burststart == 2 
|| dma->DstAddrInc <= 0) ARM7Timestamp += ARM7MemTimings[dstaddr>>15][2]; - else ARM7Timestamp += ARM7MemTimings[dstaddr>>15][3] - (burststart <= 0); + if (burststart == 2 || dma->DstAddrInc <= 0) + { + ARM7Timestamp += ARM7MemTimings[dstaddr>>15][2] - (burststart <= 0); + MainRAMTimestamp += ARM7MemTimings[dstaddr>>15][2] + (burststart == 1); + } + else + { + ARM7Timestamp += ARM7MemTimings[dstaddr>>15][3] - (burststart <= 0); + MainRAMTimestamp += ARM7MemTimings[dstaddr>>15][3] + (burststart == 1); + } DMALastWasMainRAM = false; } @@ -1506,8 +1554,16 @@ void NDS::MainRAMHandleARM7() } else { - if (burststart == 2 || dma->SrcAddrInc <= 0) ARM7Timestamp += ARM7MemTimings[srcaddr>>15][0] + (ARM7MemTimings[srcaddr>>15][0] == 1); - else ARM7Timestamp += ARM7MemTimings[srcaddr>>15][1]; + if (burststart == 2 || dma->SrcAddrInc <= 0) + { + ARM7Timestamp += ARM7MemTimings[srcaddr>>15][0] + ((burststart == 2) && (ARM7MemTimings[srcaddr>>15][0] == 1)); + MainRAMTimestamp += ARM7MemTimings[srcaddr>>15][0] + ((burststart == 2) && (ARM7MemTimings[srcaddr>>15][0] == 1)); + } + else + { + ARM7Timestamp += ARM7MemTimings[srcaddr>>15][1]; + MainRAMTimestamp += ARM7MemTimings[srcaddr>>15][1]; + } DMALastWasMainRAM = false; } @@ -1536,8 +1592,16 @@ void NDS::MainRAMHandleARM7() } else { - if (burststart == 2 || dma->DstAddrInc <= 0) ARM7Timestamp += ARM7MemTimings[dstaddr>>15][0]; - else ARM7Timestamp += ARM7MemTimings[dstaddr>>15][1] - (burststart <= 0); + if (burststart == 2 || dma->DstAddrInc <= 0) + { + ARM7Timestamp += ARM7MemTimings[dstaddr>>15][0] + (burststart == 1); + MainRAMTimestamp += ARM7MemTimings[dstaddr>>15][0]; + } + else + { + ARM7Timestamp += ARM7MemTimings[dstaddr>>15][1] + (burststart == 1); + MainRAMTimestamp += ARM7MemTimings[dstaddr>>15][1]; + } DMALastWasMainRAM = false; } From 98d0a6b37146693750c3ab828f2290638bfebec2 Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Fri, 20 Dec 2024 10:42:34 -0500 Subject: [PATCH 
301/306] fix gxfifo stalls being borked under certain circumstances --- src/NDS.cpp | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/src/NDS.cpp b/src/NDS.cpp index da89b750..432d0710 100644 --- a/src/NDS.cpp +++ b/src/NDS.cpp @@ -1645,7 +1645,7 @@ bool NDS::MainRAMHandle() { if (A9ContentionTS < ARM7Timestamp) { - if (ARM9.MRTrack.Type == MainRAMType::Null) return 0; + if (ARM9.MRTrack.Type == MainRAMType::Null || (CPUStop & CPUStop_GXStall)) return 0; MainRAMHandleARM9(); } else @@ -1661,7 +1661,7 @@ bool NDS::MainRAMHandle() { if (A9ContentionTS <= ARM7Timestamp) { - if (ARM9.MRTrack.Type == MainRAMType::Null) return 0; + if (ARM9.MRTrack.Type == MainRAMType::Null || (CPUStop & CPUStop_GXStall)) return 0; MainRAMHandleARM9(); } else @@ -1748,15 +1748,15 @@ u32 NDS::RunFrame() RunTimers(0); GPU.GPU3D.Run(); - if (ARM9.MRTrack.Type == MainRAMType::Null) + if (CPUStop & CPUStop_GXStall) { - if (CPUStop & CPUStop_GXStall) - { - // GXFIFO stall - s32 cycles = GPU.GPU3D.CyclesToRunFor(); - DMA9Timestamp = std::min(ARM9Target, std::max(ARM9Timestamp+(cycles< Date: Fri, 20 Dec 2024 20:10:23 -0500 Subject: [PATCH 302/306] theoretically improve dma responsiveness? in practice seems to make no difference... 
--- src/ARM.cpp | 16 +++++ src/ARM.h | 27 ++++---- src/CP15.cpp | 169 ++++++++++++++++++++++++++++++++---------------- src/NDS.cpp | 11 ++-- src/NDSCart.cpp | 24 +++---- src/NDSCart.h | 4 +- 6 files changed, 161 insertions(+), 90 deletions(-) diff --git a/src/ARM.cpp b/src/ARM.cpp index d4e49723..e6e7ecc4 100644 --- a/src/ARM.cpp +++ b/src/ARM.cpp @@ -1447,6 +1447,22 @@ void ARMv5::ForceInterlock_2() NDS.ARM9Timestamp = TimestampMemory + ILForceDelay; } +void ARMv5::QueueFunction(void (ARMv5::*QueueEntry)(void)) +{ + if ((NDS.ARM9Timestamp >= NDS.ARM9Target) || (MRTrack.Type != MainRAMType::Null)) + FuncQueue[FuncQueueFill++] = QueueEntry; + else + (this->*QueueEntry)(); +} + +void ARMv4::QueueFunction(void (ARMv4::*QueueEntry)(void)) +{ + if ((NDS.ARM7Timestamp >= NDS.ARM7Target) || (MRTrack.Type != MainRAMType::Null)) + FuncQueue[FuncQueueFill++] = QueueEntry; + else + (this->*QueueEntry)(); +} + void ARMv4::CodeRead16(u32 addr) { if ((addr >> 24) == 0x02) diff --git a/src/ARM.h b/src/ARM.h index 21c06813..a7092e4c 100644 --- a/src/ARM.h +++ b/src/ARM.h @@ -723,14 +723,8 @@ public: * @return Value of the cp15 register */ u32 CP15Read(const u32 id) const; - - inline void QueueFunction(void (ARMv5::*QueueEntry)(void)) - { - if (MRTrack.Type != MainRAMType::Null) - FuncQueue[FuncQueueFill++] = QueueEntry; - else - (this->*QueueEntry)(); - } + + void QueueFunction(void (ARMv5::*QueueEntry)(void)); // Queue Functions void StartExecARM(); @@ -744,27 +738,36 @@ public: void JumpTo_3C(); void JumpTo_4(); void CodeRead32_2(); + void CodeRead32_3(); void ICacheLookup_2(); void DAbortHandle(); void DCacheFin8(); void DRead8_2(); void DRead8_3(); + void DRead8_4(); void DCacheFin16(); void DRead16_2(); void DRead16_3(); + void DRead16_4(); void DCacheFin32(); void DRead32_2(); void DRead32_3(); + void DRead32_4(); void DRead32S_2(); void DRead32S_3(); + void DRead32S_4(); void DWrite8_2(); void DWrite8_3(); + void DWrite8_4(); void DWrite16_2(); void DWrite16_3(); + void 
DWrite16_4(); void DWrite32_2(); void DWrite32_3(); + void DWrite32_4(); void DWrite32S_2(); void DWrite32S_3(); + void DWrite32S_4(); void WBCheck_2(); void ICachePrefetch_2(); void DCacheLookup_2(); @@ -933,13 +936,7 @@ public: void AddCycles_CDI() override; void AddCycles_CD() override; - inline void QueueFunction(void (ARMv4::*QueueEntry)(void)) - { - if (MRTrack.Type != MainRAMType::Null) - FuncQueue[FuncQueueFill++] = QueueEntry; - else - (this->*QueueEntry)(); - } + void QueueFunction(void (ARMv4::*QueueEntry)(void)); void StartExecARM(); void StartExecTHUMB(); diff --git a/src/CP15.cpp b/src/CP15.cpp index 8903dab8..e0ca7e83 100644 --- a/src/CP15.cpp +++ b/src/CP15.cpp @@ -2213,7 +2213,13 @@ void ARMv5::CodeRead32(u32 addr) } #endif } - + + FetchAddr[16] = addr; + QueueFunction(&ARMv5::CodeRead32_2); +} + +void ARMv5::CodeRead32_2() +{ if (NDS.ARM9Timestamp < NDS.DMA9Timestamp) NDS.ARM9Timestamp = NDS.DMA9Timestamp; // bus reads can only overlap with dcache streaming by 6 cycles if (DCacheStreamPtr < 7) @@ -2222,16 +2228,15 @@ void ARMv5::CodeRead32(u32 addr) if (NDS.ARM9Timestamp < time) NDS.ARM9Timestamp = time; } - if (PU_Map[addr>>12] & 0x30) + if (PU_Map[FetchAddr[16]>>12] & 0x30) WriteBufferDrain(); else WriteBufferCheck<3>(); - FetchAddr[16] = addr; - QueueFunction(&ARMv5::CodeRead32_2); + QueueFunction(&ARMv5::CodeRead32_3); } -void ARMv5::CodeRead32_2() +void ARMv5::CodeRead32_3() { u32 addr = FetchAddr[16]; @@ -2346,6 +2351,15 @@ void ARMv5::DRead8_2() } #endif + QueueFunction(&ARMv5::DRead8_3); +} + +void ARMv5::DRead8_3() +{ + u8 reg = __builtin_ctz(LDRRegs); + u32 addr = FetchAddr[reg]; + u32 dummy; u32* val = (LDRFailedRegs & (1<(); - QueueFunction(&ARMv5::DRead8_3); + QueueFunction(&ARMv5::DRead8_4); } -void ARMv5::DRead8_3() +void ARMv5::DRead8_4() { u8 reg = __builtin_ctz(LDRRegs); u32 addr = FetchAddr[reg]; @@ -2461,6 +2475,15 @@ void ARMv5::DRead16_2() } #endif + QueueFunction(&ARMv5::DRead16_3); +} + +void ARMv5::DRead16_3() +{ + u8 reg 
= __builtin_ctz(LDRRegs); + u32 addr = FetchAddr[reg]; + u32 dummy; u32* val = (LDRFailedRegs & (1<(); - QueueFunction(&ARMv5::DRead16_3); + QueueFunction(&ARMv5::DRead16_4); } -void ARMv5::DRead16_3() +void ARMv5::DRead16_4() { u8 reg = __builtin_ctz(LDRRegs); u32 addr = FetchAddr[reg]; @@ -2576,7 +2599,16 @@ void ARMv5::DRead32_2() } } #endif - + + QueueFunction(&ARMv5::DRead32_3); +} + +void ARMv5::DRead32_3() +{ + u8 reg = __builtin_ctz(LDRRegs); + u32 addr = FetchAddr[reg]; + u32 dummy; u32* val = (LDRFailedRegs & (1<(); - QueueFunction(&ARMv5::DRead32_3); + QueueFunction(&ARMv5::DRead32_4); } -void ARMv5::DRead32_3() +void ARMv5::DRead32_4() { u8 reg = __builtin_ctz(LDRRegs); u32 addr = FetchAddr[reg]; @@ -2678,7 +2710,16 @@ void ARMv5::DRead32S_2() } } #endif - + + QueueFunction(&ARMv5::DRead32S_3); +} + +void ARMv5::DRead32S_3() +{ + u8 reg = __builtin_ctz(LDRRegs); + u32 addr = FetchAddr[reg]; + u32 dummy; u32* val = (LDRFailedRegs & (1<(); - QueueFunction(&ARMv5::DRead32S_3); + QueueFunction(&ARMv5::DRead32S_4); } -void ARMv5::DRead32S_3() +void ARMv5::DRead32S_4() { u8 reg = __builtin_ctz(LDRRegs); u32 addr = FetchAddr[reg]; @@ -2815,16 +2856,6 @@ void ARMv5::DWrite8_2() if (!(PU_Map[addr>>12] & (0x30))) { - if (NDS.ARM9Timestamp < NDS.DMA9Timestamp) NDS.ARM9Timestamp = NDS.DMA9Timestamp; - // bus reads can only overlap with icache streaming by 6 cycles - // checkme: do buffered writes trigger this? - if (ICacheStreamPtr < 7) - { - u64 time = ICacheStreamTimes[6] - 6; // checkme: minus 6? - if (NDS.ARM9Timestamp < time) NDS.ARM9Timestamp = time; - } - - WriteBufferCheck<2>(); QueueFunction(&ARMv5::DWrite8_3); } else @@ -2837,6 +2868,21 @@ void ARMv5::DWrite8_2() } void ARMv5::DWrite8_3() +{ + if (NDS.ARM9Timestamp < NDS.DMA9Timestamp) NDS.ARM9Timestamp = NDS.DMA9Timestamp; + // bus reads can only overlap with icache streaming by 6 cycles + // checkme: do buffered writes trigger this? 
+ if (ICacheStreamPtr < 7) + { + u64 time = ICacheStreamTimes[6] - 6; // checkme: minus 6? + if (NDS.ARM9Timestamp < time) NDS.ARM9Timestamp = time; + } + + WriteBufferCheck<2>(); + QueueFunction(&ARMv5::DWrite8_4); +} + +void ARMv5::DWrite8_4() { u8 reg = __builtin_ctz(STRRegs); u32 addr = FetchAddr[reg]; @@ -2929,16 +2975,6 @@ void ARMv5::DWrite16_2() if (!(PU_Map[addr>>12] & 0x30)) { - if (NDS.ARM9Timestamp < NDS.DMA9Timestamp) NDS.ARM9Timestamp = NDS.DMA9Timestamp; - // bus reads can only overlap with icache streaming by 6 cycles - // checkme: do buffered writes trigger this? - if (ICacheStreamPtr < 7) - { - u64 time = ICacheStreamTimes[6] - 6; // checkme: minus 6? - if (NDS.ARM9Timestamp < time) NDS.ARM9Timestamp = time; - } - - WriteBufferCheck<2>(); QueueFunction(&ARMv5::DWrite16_3); } else @@ -2951,6 +2987,21 @@ void ARMv5::DWrite16_2() } void ARMv5::DWrite16_3() +{ + if (NDS.ARM9Timestamp < NDS.DMA9Timestamp) NDS.ARM9Timestamp = NDS.DMA9Timestamp; + // bus reads can only overlap with icache streaming by 6 cycles + // checkme: do buffered writes trigger this? + if (ICacheStreamPtr < 7) + { + u64 time = ICacheStreamTimes[6] - 6; // checkme: minus 6? + if (NDS.ARM9Timestamp < time) NDS.ARM9Timestamp = time; + } + + WriteBufferCheck<2>(); + QueueFunction(&ARMv5::DWrite16_4); +} + +void ARMv5::DWrite16_4() { u8 reg = __builtin_ctz(STRRegs); u32 addr = FetchAddr[reg]; @@ -3048,16 +3099,6 @@ void ARMv5::DWrite32_2() if (!(PU_Map[addr>>12] & 0x30)) { - if (NDS.ARM9Timestamp < NDS.DMA9Timestamp) NDS.ARM9Timestamp = NDS.DMA9Timestamp; - // bus reads can only overlap with icache streaming by 6 cycles - // checkme: do buffered writes trigger this? - if (ICacheStreamPtr < 7) - { - u64 time = ICacheStreamTimes[6] - 6; // checkme: minus 6? 
- if (NDS.ARM9Timestamp < time) NDS.ARM9Timestamp = time; - } - - WriteBufferCheck<2>(); QueueFunction(&ARMv5::DWrite32_3); } else @@ -3071,6 +3112,21 @@ void ARMv5::DWrite32_2() } void ARMv5::DWrite32_3() +{ + if (NDS.ARM9Timestamp < NDS.DMA9Timestamp) NDS.ARM9Timestamp = NDS.DMA9Timestamp; + // bus reads can only overlap with icache streaming by 6 cycles + // checkme: do buffered writes trigger this? + if (ICacheStreamPtr < 7) + { + u64 time = ICacheStreamTimes[6] - 6; // checkme: minus 6? + if (NDS.ARM9Timestamp < time) NDS.ARM9Timestamp = time; + } + + WriteBufferCheck<2>(); + QueueFunction(&ARMv5::DWrite32_4); +} + +void ARMv5::DWrite32_4() { u8 reg = __builtin_ctz(STRRegs); u32 addr = FetchAddr[reg]; @@ -3162,15 +3218,6 @@ void ARMv5::DWrite32S_2() if (!(PU_Map[addr>>12] & 0x30)) // non-bufferable { - if (NDS.ARM9Timestamp < NDS.DMA9Timestamp) NDS.ARM9Timestamp = NDS.DMA9Timestamp; - // bus reads can only overlap with icache streaming by 6 cycles - // checkme: do buffered writes trigger this? - if (ICacheStreamPtr < 7) - { - u64 time = ICacheStreamTimes[6] - 6; // checkme: minus 6? - if (NDS.ARM9Timestamp < time) NDS.ARM9Timestamp = time; - } - WriteBufferCheck<2>(); QueueFunction(&ARMv5::DWrite32S_3); } else @@ -3181,6 +3228,20 @@ void ARMv5::DWrite32S_2() } void ARMv5::DWrite32S_3() +{ + if (NDS.ARM9Timestamp < NDS.DMA9Timestamp) NDS.ARM9Timestamp = NDS.DMA9Timestamp; + // bus reads can only overlap with icache streaming by 6 cycles + // checkme: do buffered writes trigger this? + if (ICacheStreamPtr < 7) + { + u64 time = ICacheStreamTimes[6] - 6; // checkme: minus 6? 
+ if (NDS.ARM9Timestamp < time) NDS.ARM9Timestamp = time; + } + WriteBufferCheck<2>(); + QueueFunction(&ARMv5::DWrite32S_4); +} + +void ARMv5::DWrite32S_4() { u8 reg = __builtin_ctz(STRRegs); u32 addr = FetchAddr[reg]; diff --git a/src/NDS.cpp b/src/NDS.cpp index 432d0710..91aee9d4 100644 --- a/src/NDS.cpp +++ b/src/NDS.cpp @@ -792,7 +792,7 @@ void NDS::SetARM9BIOS(const std::array& bios) noexcept u64 NDS::NextTarget() { - u64 minEvent = UINT64_MAX; + u64 minEvent = std::max(SysTimestamp+1, NDSCartSlot.ROMTransferTime[0]); u32 mask = SchedListMask; for (int i = 0; i < Event_MAX; i++) @@ -1740,7 +1740,6 @@ u32 NDS::RunFrame() u64 target = NextTarget(); ARM9Target = target << ARM9ClockShift; - //ARM7Target = target; while (std::max(std::max(ARM9Timestamp, DMA9Timestamp), A9ContentionTS << ARM9ClockShift) < ARM9Target) { @@ -1776,12 +1775,10 @@ u32 NDS::RunFrame() } //printf("MAIN LOOP: 9 %lli %08X %08llX %i 7 %lli %08X %08llX %i %i %08X\n", ARM9Timestamp>>ARM9ClockShift, ARM9.PC, ARM9.CurInstr, (u8)ARM9.MRTrack.Type, ARM7Timestamp, ARM7.R[15], ARM7.CurInstr, (u8)ARM7.MRTrack.Type, IME[1], IE[1]); - - NDSCartSlot.ROMPrepareData(); + RunTimers(0); GPU.GPU3D.Run(); - //if (MainRAMHandle()) break; MainRAMHandle(); target = std::max(std::max(ARM9Timestamp, DMA9Timestamp) >> ARM9ClockShift, A9ContentionTS); @@ -1816,12 +1813,12 @@ u32 NDS::RunFrame() } RunTimers(1); - NDSCartSlot.ROMPrepareData(); if (!MainRAMHandle()) break; } } - + + NDSCartSlot.ROMPrepareData(); RunSystem(target); if (CPUStop & CPUStop_Sleep) diff --git a/src/NDSCart.cpp b/src/NDSCart.cpp index b49c9e15..ef24cdb1 100644 --- a/src/NDSCart.cpp +++ b/src/NDSCart.cpp @@ -1818,8 +1818,8 @@ void NDSCartSlot::ROMEndTransfer(u32 param) noexcept void NDSCartSlot::ROMPrepareData() noexcept { u64 curts; - if (NDS.CurCPU) curts = NDS.ARM7Timestamp; - else curts = (std::max(NDS.ARM9Timestamp, NDS.DMA9Timestamp) + ((1<> NDS.ARM9ClockShift; + if (NDS.ExMemCnt[0] & (1<<11)) curts = NDS.ARM7Timestamp; + else curts = 
(std::max(NDS.ARM9Timestamp, NDS.DMA9Timestamp) + ((1<> NDS.ARM9ClockShift; if (curts < ROMTransferTime[0]) return; @@ -1937,8 +1937,8 @@ void NDSCartSlot::WriteROMCnt(u32 val) noexcept else { u64 curts; - if (NDS.CurCPU) curts = NDS.ARM7Timestamp; - else curts = ((std::max(NDS.ARM9Timestamp, NDS.DMA9Timestamp) + ((1<> NDS.ARM9ClockShift); + if (NDS.ExMemCnt[0] & (1<<11)) curts = NDS.ARM7Timestamp; + else curts = (std::max(NDS.ARM9Timestamp, NDS.DMA9Timestamp) + ((1<> NDS.ARM9ClockShift; ROMTransferTime[0] = (xfercycle*(cmddelay+4)) + curts; @@ -1962,10 +1962,10 @@ void NDSCartSlot::AdvanceROMTransfer() noexcept if (!((TransferPos+4) & 0x1FF)) delay += ((ROMCnt >> 16) & 0x3F); } - + u64 curts; - if (NDS.CurCPU) curts = NDS.ARM7Timestamp; - else curts = (std::max(NDS.ARM9Timestamp, NDS.DMA9Timestamp) + ((1<> NDS.ARM9ClockShift; + if (NDS.ExMemCnt[0] & (1<<11)) curts = NDS.ARM7Timestamp; + else curts = (std::max(NDS.ARM9Timestamp, NDS.DMA9Timestamp) + ((1<> NDS.ARM9ClockShift; ROMTransferTime[0] = ROMTransferTime[1]; @@ -1984,8 +1984,8 @@ u32 NDSCartSlot::ReadROMData() noexcept if (ROMCnt & (1<<30)) return 0; u64 curts; - if (NDS.CurCPU) curts = NDS.ARM7Timestamp; - else curts = (std::max(NDS.ARM9Timestamp, NDS.DMA9Timestamp) + ((1<> NDS.ARM9ClockShift; + if (NDS.ExMemCnt[0] & (1<<11)) curts = NDS.ARM7Timestamp; + else curts = (std::max(NDS.ARM9Timestamp, NDS.DMA9Timestamp) + ((1<> NDS.ARM9ClockShift; ROMPrepareData(); @@ -2000,10 +2000,10 @@ u32 NDSCartSlot::ReadROMData() noexcept void NDSCartSlot::WriteROMData(u32 val) noexcept { if (!(ROMCnt & (1<<30))) return; - + u64 curts; - if (NDS.CurCPU) curts = NDS.ARM7Timestamp; - else curts = (std::max(NDS.ARM9Timestamp, NDS.DMA9Timestamp) + ((1<> NDS.ARM9ClockShift; + if (NDS.ExMemCnt[0] & (1<<11)) curts = NDS.ARM7Timestamp; + else curts = (std::max(NDS.ARM9Timestamp, NDS.DMA9Timestamp) + ((1<> NDS.ARM9ClockShift; ROMPrepareData(); diff --git a/src/NDSCart.h b/src/NDSCart.h index 560b5d62..3db076e4 100644 --- 
a/src/NDSCart.h +++ b/src/NDSCart.h @@ -420,6 +420,8 @@ public: [[nodiscard]] u16 GetSPICnt() const noexcept { return SPICnt; } void SetSPICnt(u16 val) noexcept { SPICnt = val; } + u64 ROMTransferTime[2]; + private: friend class CartCommon; melonDS::NDS& NDS; @@ -445,8 +447,6 @@ private: u64 Key2_X = 0; u64 Key2_Y = 0; - u64 ROMTransferTime[2]; - void Key1_Encrypt(u32* data) const noexcept; void Key1_Decrypt(u32* data) const noexcept; void Key1_ApplyKeycode(u32* keycode, u32 mod) noexcept; From 65e2f6469527165351666c3bf3675294cc6dc67a Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Sat, 21 Dec 2024 12:29:37 -0500 Subject: [PATCH 303/306] "immediate mode" dma start should be delayed 1 cycle --- src/ARM.h | 11 ++ src/CP15.cpp | 299 ++++++++++++++++++++++++++++++++++++------------ src/DMA.cpp | 21 +++- src/DMA.h | 2 +- src/NDS.cpp | 16 ++- src/NDS.h | 10 +- src/NDSCart.cpp | 10 +- 7 files changed, 277 insertions(+), 92 deletions(-) diff --git a/src/ARM.h b/src/ARM.h index a7092e4c..49030d78 100644 --- a/src/ARM.h +++ b/src/ARM.h @@ -739,35 +739,46 @@ public: void JumpTo_4(); void CodeRead32_2(); void CodeRead32_3(); + void CodeRead32_4(); void ICacheLookup_2(); void DAbortHandle(); void DCacheFin8(); void DRead8_2(); void DRead8_3(); void DRead8_4(); + void DRead8_5(); void DCacheFin16(); void DRead16_2(); void DRead16_3(); void DRead16_4(); + void DRead16_5(); void DCacheFin32(); void DRead32_2(); void DRead32_3(); void DRead32_4(); + void DRead32_5(); void DRead32S_2(); void DRead32S_3(); void DRead32S_4(); + void DRead32S_5A(); + void DRead32S_5B(); void DWrite8_2(); void DWrite8_3(); void DWrite8_4(); + void DWrite8_5(); void DWrite16_2(); void DWrite16_3(); void DWrite16_4(); + void DWrite16_5(); void DWrite32_2(); void DWrite32_3(); void DWrite32_4(); + void DWrite32_5(); void DWrite32S_2(); void DWrite32S_3(); void DWrite32S_4(); + void DWrite32S_5A(); + void DWrite32S_5B(); void WBCheck_2(); void ICachePrefetch_2(); void 
DCacheLookup_2(); diff --git a/src/CP15.cpp b/src/CP15.cpp index e0ca7e83..bafa6a48 100644 --- a/src/CP15.cpp +++ b/src/CP15.cpp @@ -2242,32 +2242,42 @@ void ARMv5::CodeRead32_3() NDS.ARM9Timestamp = NDS.ARM9Timestamp + ((1<> 14][1]; - if ((addr >> 24) == 0x02) { FetchAddr[16] = addr; MRTrack.Type = MainRAMType::Fetch; MRTrack.Var = MRCodeFetch | MR32; + + QueueFunction(DelayedQueue); } else { if (((NDS.ARM9Timestamp <= WBReleaseTS) && (NDS.ARM9Regions[addr>>14] == WBLastRegion)) // check write buffer || (Store && (NDS.ARM9Regions[addr>>14] == DataRegion))) //check the actual store NDS.ARM9Timestamp += 1<> 14][1]; + + NDS.ARM9Timestamp += cycles; + + if (WBTimestamp < ((NDS.ARM9Timestamp - (3<> 14][0]; - DataCycles = 3<>14]; if ((NDS.ARM9Timestamp <= WBReleaseTS) && (DataRegion == WBLastRegion)) // check write buffer NDS.ARM9Timestamp += 1<> 14][0]; + DataCycles = 3<> 14][0]; - DataCycles = 3<>14]; if ((NDS.ARM9Timestamp <= WBReleaseTS) && (DataRegion == WBLastRegion)) // check write buffer NDS.ARM9Timestamp += 1<> 14][0]; + DataCycles = 3<> 14][1]; - DataCycles = 3<>14]; if ((NDS.ARM9Timestamp <= WBReleaseTS) && (DataRegion == WBLastRegion)) // check write buffer NDS.ARM9Timestamp += 1<> 14][1]; + DataCycles = 3<(); - + QueueFunction(&ARMv5::DRead32S_4); } @@ -2751,18 +2802,16 @@ void ARMv5::DRead32S_4() MRTrack.Type = MainRAMType::Fetch; MRTrack.Var = MR32 | MRSequential; MRTrack.Progress = reg; + + LDRRegs &= ~1<>14][2]; - DataCycles = MemTimings[addr>>14][2]; DataRegion = NDS.ARM9Regions[addr>>14]; if ((NDS.ARM9Timestamp <= WBReleaseTS) && (DataRegion == WBLastRegion)) // check write buffer NDS.ARM9Timestamp += 1<>14][1]; - DataCycles = 3<>14]; if ((NDS.ARM9Timestamp <= WBReleaseTS) && (DataRegion == WBLastRegion)) // check write buffer NDS.ARM9Timestamp += 1<>14][2]; + DataCycles = MemTimings[addr>>14][2]; + + if (WBTimestamp < ((NDS.ARM9Timestamp - (3<>14][1]; + DataCycles = 3<> 14][0]; - DataCycles = 3<>14]; - - if (WBTimestamp < ((NDS.ARM9Timestamp + ((1<> 
14][0]; + DataCycles = 3<>14]; + + if (WBTimestamp < ((NDS.ARM9Timestamp + ((1<> 14][0]; - DataCycles = 3<>14]; - - if (WBTimestamp < ((NDS.ARM9Timestamp + ((1<> 14][0]; + DataCycles = 3<>14]; + + if (WBTimestamp < ((NDS.ARM9Timestamp + ((1<> 14][1]; - DataCycles = 3<>14]; - - if (WBTimestamp < ((NDS.ARM9Timestamp + ((1<> 14][1]; + DataCycles = 3<>14]; + + if (WBTimestamp < ((NDS.ARM9Timestamp + ((1<>14][2]; - DataRegion = NDS.ARM9Regions[addr>>14]; - - if (WBTimestamp < ((NDS.ARM9Timestamp + ((1<>14][1]; - DataCycles = 3 << NDS.ARM9ClockShift; - DataRegion = NDS.ARM9Regions[addr>>14]; - - if (WBTimestamp < ((NDS.ARM9Timestamp + ((1<>14][2]; + DataRegion = NDS.ARM9Regions[addr>>14]; + + if (WBTimestamp < ((NDS.ARM9Timestamp + ((1<>14][1]; + DataCycles = 3 << NDS.ARM9ClockShift; + DataRegion = NDS.ARM9Regions[addr>>14]; + + if (WBTimestamp < ((NDS.ARM9Timestamp + ((1<> 28) & 0x3) | 0x10; if ((StartMode & 0x7) == 0) - Start(); + { + NDS.DMAsQueued[NDS.DMAQueuePtr++] = (CPU*4)+Num; + if (!(NDS.SchedListMask & (1<= NDS.ARM9Target) return; diff --git a/src/DMA.h b/src/DMA.h index 3e9ebd50..4bad6110 100644 --- a/src/DMA.h +++ b/src/DMA.h @@ -75,7 +75,7 @@ public: void ResetBurst() { - if (Running > 0) Running = (CPU ? 
2 : 3); + if (Running > 0) Running = 3; } u32 SrcAddr {}; diff --git a/src/NDS.cpp b/src/NDS.cpp index 91aee9d4..90008e04 100644 --- a/src/NDS.cpp +++ b/src/NDS.cpp @@ -124,6 +124,7 @@ NDS::NDS(NDSArgs&& args, int type, void* userdata) noexcept : { RegisterEventFuncs(Event_Div, this, {MakeEventThunk(NDS, DivDone)}); RegisterEventFuncs(Event_Sqrt, this, {MakeEventThunk(NDS, SqrtDone)}); + RegisterEventFuncs(Event_DMA, this, {MakeEventThunk(NDS, QueueDMAs)}); MainRAM = JIT.Memory.GetMainRAM(); SharedWRAM = JIT.Memory.GetSharedWRAM(); @@ -134,6 +135,7 @@ NDS::~NDS() noexcept { UnregisterEventFuncs(Event_Div); UnregisterEventFuncs(Event_Sqrt); + UnregisterEventFuncs(Event_DMA); // The destructor for each component is automatically called by the compiler } @@ -548,6 +550,9 @@ void NDS::Reset() KeyCnt[1] = 0; RCnt = 0; + memset(DMAsQueued, 0, sizeof(DMAsQueued)); + DMAQueuePtr = 0; + GPU.Reset(); NDSCartSlot.Reset(); GBACartSlot.Reset(); @@ -1818,8 +1823,8 @@ u32 NDS::RunFrame() } } - NDSCartSlot.ROMPrepareData(); RunSystem(target); + NDSCartSlot.ROMPrepareData(); if (CPUStop & CPUStop_Sleep) { @@ -2526,6 +2531,15 @@ void NDS::StopDMAs(u32 cpu, u32 mode) DMAs[cpu+3].StopIfNeeded(mode); } +void NDS::QueueDMAs(u32 param) +{ + DMAs[DMAsQueued[0]].Start(); + for(int i = 0; i < 7; i++) DMAsQueued[i] = DMAsQueued[i+1]; + DMAQueuePtr--; + + if (DMAQueuePtr != 0) ScheduleEvent(Event_DMA, false, 1, 0, 0); +} + void NDS::DivDone(u32 param) diff --git a/src/NDS.h b/src/NDS.h index e933d951..722f91b4 100644 --- a/src/NDS.h +++ b/src/NDS.h @@ -64,6 +64,7 @@ enum Event_SPITransfer, Event_Div, Event_Sqrt, + Event_DMA, // DSi Event_DSi_SDMMCTransfer, @@ -243,7 +244,8 @@ public: // TODO: Encapsulate the rest of these members int ConsoleType; int CurCPU; - + + u32 SchedListMask; SchedEvent SchedList[Event_MAX] {}; u8 ARM9MemTimings[0x40000][8]; u32 ARM9Regions[0x40000]; @@ -259,6 +261,7 @@ public: // TODO: Encapsulate the rest of these members u64 ARM7Timestamp, ARM7Target; u64 
MainRAMTimestamp, MainRAMBurstStart; u64 A9ContentionTS; bool ConTSLock; + u64 SysTimestamp; u32 ARM9ClockShift; u32 IME[2]; @@ -277,6 +280,8 @@ public: // TODO: Encapsulate the rest of these members alignas(u32) u8 ROMSeed1[2*8]; u32 DMAReadHold[2]; + u8 DMAsQueued[8]; + u8 DMAQueuePtr; bool MainRAMBork; // if a main ram read burst starts in the last 6 bytes of a 32 byte block, and then crosses the 32 byte boundary, the burst forcibly restarts bool MainRAMLastAccess; // 0 == ARM9 | 1 == ARM7 bool DMALastWasMainRAM; @@ -506,8 +511,6 @@ public: // TODO: Encapsulate the rest of these members private: void InitTimings(); - u32 SchedListMask; - u64 SysTimestamp; u8 WRAMCnt; u8 PostFlag9; u8 PostFlag7; @@ -542,6 +545,7 @@ private: void HandleTimerOverflow(u32 tid); u16 TimerGetCounter(u32 timer); void TimerStart(u32 id, u16 cnt); + void QueueDMAs(u32 param); void StartDiv(); void DivDone(u32 param); void SqrtDone(u32 param); diff --git a/src/NDSCart.cpp b/src/NDSCart.cpp index ef24cdb1..25458914 100644 --- a/src/NDSCart.cpp +++ b/src/NDSCart.cpp @@ -1982,10 +1982,6 @@ void NDSCartSlot::AdvanceROMTransfer() noexcept u32 NDSCartSlot::ReadROMData() noexcept { if (ROMCnt & (1<<30)) return 0; - - u64 curts; - if (NDS.ExMemCnt[0] & (1<<11)) curts = NDS.ARM7Timestamp; - else curts = (std::max(NDS.ARM9Timestamp, NDS.DMA9Timestamp) + ((1<> NDS.ARM9ClockShift; ROMPrepareData(); @@ -2000,11 +1996,7 @@ u32 NDSCartSlot::ReadROMData() noexcept void NDSCartSlot::WriteROMData(u32 val) noexcept { if (!(ROMCnt & (1<<30))) return; - - u64 curts; - if (NDS.ExMemCnt[0] & (1<<11)) curts = NDS.ARM7Timestamp; - else curts = (std::max(NDS.ARM9Timestamp, NDS.DMA9Timestamp) + ((1<> NDS.ARM9ClockShift; - + ROMPrepareData(); ROMData = val; From caa009ca033739b08af2794728c5e046ddd451b4 Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Sat, 21 Dec 2024 13:19:34 -0500 Subject: [PATCH 304/306] better logic for gamecard reads --- src/NDS.cpp | 18 ++++++++++++++++-- 
src/NDS.h | 1 + src/NDSCart.cpp | 46 ++++++++++++++++------------------------------ src/NDSCart.h | 5 ++--- 4 files changed, 35 insertions(+), 35 deletions(-) diff --git a/src/NDS.cpp b/src/NDS.cpp index 90008e04..04af0079 100644 --- a/src/NDS.cpp +++ b/src/NDS.cpp @@ -797,7 +797,7 @@ void NDS::SetARM9BIOS(const std::array& bios) noexcept u64 NDS::NextTarget() { - u64 minEvent = std::max(SysTimestamp+1, NDSCartSlot.ROMTransferTime[0]); + u64 minEvent = UINT64_MAX; u32 mask = SchedListMask; for (int i = 0; i < Event_MAX; i++) @@ -845,6 +845,21 @@ void NDS::RunSystem(u64 timestamp) } } +void NDS::RunEventManual(u32 id) +{ + if (SchedListMask & (1<> ARM9ClockShift); + SchedEvent& evt = SchedList[id]; + + if (evt.Timestamp <= curts) + { + evt.Funcs[evt.FuncID](evt.That, evt.Param); + SchedListMask &= ~(1<&& rom) n { NDS.RegisterEventFuncs(Event_ROMTransfer, this, { + MakeEventThunk(NDSCartSlot, ROMPrepareData), MakeEventThunk(NDSCartSlot, ROMEndTransfer) }); NDS.RegisterEventFuncs(Event_ROMSPITransfer, this, {MakeEventThunk(NDSCartSlot, SPITransferDone)}); @@ -1797,8 +1799,7 @@ void NDSCartSlot::ResetCart() noexcept TransferDir = 0; memset(TransferCmd.data(), 0, sizeof(TransferCmd)); TransferCmd[0] = 0xFF; - ROMTransferTime[0] = -1; - ROMTransferTime[1] = -1; + ROMTransferTime = -1; if (Cart) Cart->Reset(); } @@ -1815,14 +1816,8 @@ void NDSCartSlot::ROMEndTransfer(u32 param) noexcept Cart->ROMCommandFinish(TransferCmd.data(), TransferData.data(), TransferLen); } -void NDSCartSlot::ROMPrepareData() noexcept +void NDSCartSlot::ROMPrepareData(u32 param) noexcept { - u64 curts; - if (NDS.ExMemCnt[0] & (1<<11)) curts = NDS.ARM7Timestamp; - else curts = (std::max(NDS.ARM9Timestamp, NDS.DMA9Timestamp) + ((1<> NDS.ARM9ClockShift; - - if (curts < ROMTransferTime[0]) return; - if (TransferDir == 0) { if (TransferPos >= TransferLen) @@ -1833,8 +1828,6 @@ void NDSCartSlot::ROMPrepareData() noexcept TransferPos += 4; } - ROMTransferTime[0] = -1; - ROMCnt |= (1<<23); if 
(NDS.ExMemCnt[0] & (1<<11)) @@ -1845,7 +1838,7 @@ void NDSCartSlot::ROMPrepareData() noexcept u32 NDSCartSlot::GetROMCnt() noexcept { - ROMPrepareData(); + NDS.RunEventManual(Event_ROMTransfer); return ROMCnt; } @@ -1936,16 +1929,13 @@ void NDSCartSlot::WriteROMCnt(u32 val) noexcept NDS.ScheduleEvent(Event_ROMTransfer, false, xfercycle*cmddelay, ROMTransfer_End, 0); else { + NDS.ScheduleEvent(Event_ROMTransfer, false, xfercycle*(cmddelay+4), ROMTransfer_PrepareData, 0); + u64 curts; if (NDS.ExMemCnt[0] & (1<<11)) curts = NDS.ARM7Timestamp; else curts = (std::max(NDS.ARM9Timestamp, NDS.DMA9Timestamp) + ((1<> NDS.ARM9ClockShift; - ROMTransferTime[0] = (xfercycle*(cmddelay+4)) + curts; - - if ((TransferPos + 4) < TransferLen) - ROMTransferTime[1] = (xfercycle*(cmddelay+8)) + curts; - else - ROMTransferTime[1] = -1; + ROMTransferTime = (xfercycle*(cmddelay+8)) + curts; } } @@ -1966,14 +1956,10 @@ void NDSCartSlot::AdvanceROMTransfer() noexcept u64 curts; if (NDS.ExMemCnt[0] & (1<<11)) curts = NDS.ARM7Timestamp; else curts = (std::max(NDS.ARM9Timestamp, NDS.DMA9Timestamp) + ((1<> NDS.ARM9ClockShift; + + NDS.ScheduleEvent(Event_ROMTransfer, false, ROMTransferTime-curts, ROMTransfer_PrepareData, 0); - ROMTransferTime[0] = ROMTransferTime[1]; - - if ((TransferPos + 4) < TransferLen) - ROMTransferTime[1] = (xfercycle*delay) + std::max(curts, ROMTransferTime[0]); - else - ROMTransferTime[1] = -1; - + ROMTransferTime = (xfercycle*delay) + std::max(curts, ROMTransferTime); } else ROMEndTransfer(0); @@ -1982,8 +1968,8 @@ void NDSCartSlot::AdvanceROMTransfer() noexcept u32 NDSCartSlot::ReadROMData() noexcept { if (ROMCnt & (1<<30)) return 0; - - ROMPrepareData(); + + NDS.RunEventManual(Event_ROMTransfer); if (ROMCnt & (1<<23)) { @@ -1996,8 +1982,8 @@ u32 NDSCartSlot::ReadROMData() noexcept void NDSCartSlot::WriteROMData(u32 val) noexcept { if (!(ROMCnt & (1<<30))) return; - - ROMPrepareData(); + + NDS.RunEventManual(Event_ROMTransfer); ROMData = val; diff --git a/src/NDSCart.h 
b/src/NDSCart.h index 3db076e4..37bbea27 100644 --- a/src/NDSCart.h +++ b/src/NDSCart.h @@ -410,7 +410,6 @@ public: void WriteROMCnt(u32 val) noexcept; [[nodiscard]] u8 ReadSPIData() const noexcept; void WriteSPIData(u8 val) noexcept; - void ROMPrepareData() noexcept; [[nodiscard]] u8 GetROMCommand(u8 index) const noexcept { return ROMCommand[index]; } void SetROMCommand(u8 index, u8 val) noexcept { ROMCommand[index] = val; } @@ -420,8 +419,6 @@ public: [[nodiscard]] u16 GetSPICnt() const noexcept { return SPICnt; } void SetSPICnt(u16 val) noexcept { SPICnt = val; } - u64 ROMTransferTime[2]; - private: friend class CartCommon; melonDS::NDS& NDS; @@ -446,6 +443,7 @@ private: u64 Key2_X = 0; u64 Key2_Y = 0; + u64 ROMTransferTime; void Key1_Encrypt(u32* data) const noexcept; void Key1_Decrypt(u32* data) const noexcept; @@ -454,6 +452,7 @@ private: void Key1_InitKeycode(bool dsi, u32 idcode, u32 level, u32 mod) noexcept; void Key2_Encrypt(const u8* data, u32 len) noexcept; void ROMEndTransfer(u32 param) noexcept; + void ROMPrepareData(u32 param) noexcept; void AdvanceROMTransfer() noexcept; void SPITransferDone(u32 param) noexcept; }; From 4940f940c797ce3312cb86578499a0f9ec843adb Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Sat, 21 Dec 2024 17:28:32 -0500 Subject: [PATCH 305/306] improve accuracy and also performance --- src/NDS.cpp | 88 ++++++++++++++++++++++++++--------------------------- 1 file changed, 43 insertions(+), 45 deletions(-) diff --git a/src/NDS.cpp b/src/NDS.cpp index 04af0079..183903bf 100644 --- a/src/NDS.cpp +++ b/src/NDS.cpp @@ -1760,54 +1760,55 @@ u32 NDS::RunFrame() u64 target = NextTarget(); ARM9Target = target << ARM9ClockShift; + ARM7Target = target; - while (std::max(std::max(ARM9Timestamp, DMA9Timestamp), A9ContentionTS << ARM9ClockShift) < ARM9Target) + while (((std::max(std::max(ARM9Timestamp, DMA9Timestamp), A9ContentionTS << ARM9ClockShift) < ARM9Target) && (ARM9.MRTrack.Type == 
MainRAMType::Null)) + || (ARM7Timestamp < ARM7Target) && (ARM7.MRTrack.Type == MainRAMType::Null)) { - CurCPU = 0; - RunTimers(0); - GPU.GPU3D.Run(); + while (std::max(std::max(ARM9Timestamp, DMA9Timestamp), A9ContentionTS << ARM9ClockShift) < ARM9Target) + { + CurCPU = 0; + RunTimers(0); + GPU.GPU3D.Run(); - if (CPUStop & CPUStop_GXStall) - { - // GXFIFO stall - s32 cycles = GPU.GPU3D.CyclesToRunFor(); - DMA9Timestamp = std::min(ARM9Target, std::max(ARM9Timestamp, DMA9Timestamp)+(cycles<(*this); - dsi.RunNDMAs(0); + DMAs[0].Run(); + if (!(CPUStop & CPUStop_GXStall) && (ARM9.MRTrack.Type == MainRAMType::Null)) DMAs[1].Run(); + if (!(CPUStop & CPUStop_GXStall) && (ARM9.MRTrack.Type == MainRAMType::Null)) DMAs[2].Run(); + if (!(CPUStop & CPUStop_GXStall) && (ARM9.MRTrack.Type == MainRAMType::Null)) DMAs[3].Run(); + if (ConsoleType == 1) + { + auto& dsi = dynamic_cast(*this); + dsi.RunNDMAs(0); + } + } + else + { + //if (ARM9.abt) ARM9Timestamp = ARM9Target; + ARM9.Execute(); } } - else - { - //if (ARM9.abt) ARM9Timestamp = ARM9Target; - ARM9.Execute(); - } + + //printf("MAIN LOOP: 9 %lli %lli %08X %08llX %i 7 %lli %lli %08X %08llX %i\n", ARM9Timestamp, ARM9Target, ARM9.PC, ARM9.CurInstr, (u8)ARM9.MRTrack.Type, ARM7Timestamp, ARM7Target, ARM7.R[15], ARM7.CurInstr, (u8)ARM7.MRTrack.Type); + + RunTimers(0); + GPU.GPU3D.Run(); + + if (MainRAMHandle()) break; } - //printf("MAIN LOOP: 9 %lli %08X %08llX %i 7 %lli %08X %08llX %i %i %08X\n", ARM9Timestamp>>ARM9ClockShift, ARM9.PC, ARM9.CurInstr, (u8)ARM9.MRTrack.Type, ARM7Timestamp, ARM7.R[15], ARM7.CurInstr, (u8)ARM7.MRTrack.Type, IME[1], IE[1]); - - RunTimers(0); - GPU.GPU3D.Run(); - - MainRAMHandle(); - - target = std::max(std::max(ARM9Timestamp, DMA9Timestamp) >> ARM9ClockShift, A9ContentionTS); - if (target == ARM7Timestamp) target++; - - while (ARM7Timestamp < target) + while (ARM7Timestamp < ARM7Target) { - ARM7Target = target; - //printf("A7 LOOP: %lli %lli\n", ARM9Timestamp>>ARM9ClockShift, ARM7Timestamp); + 
//printf("A7 LOOP: 9 %lli %lli %08X %08llX %i 7 %lli %lli %08X %08llX %i\n", ARM9Timestamp, ARM9Target, ARM9.PC, ARM9.CurInstr, (u8)ARM9.MRTrack.Type, ARM7Timestamp, ARM7Target, ARM7.R[15], ARM7.CurInstr, (u8)ARM7.MRTrack.Type); CurCPU = 1; RunTimers(1); @@ -1838,7 +1839,7 @@ u32 NDS::RunFrame() } } - RunSystem(target); + RunSystem(ARM7Target); if (CPUStop & CPUStop_Sleep) { @@ -1860,7 +1861,6 @@ u32 NDS::RunFrame() SPU.TransferOutput(); break; } - //printf("MAIN LOOP: 9 %lli %08X %08llX %i 7 %lli %08X %08llX %i %i %08X\n", ARM9Timestamp>>ARM9ClockShift, ARM9.PC, ARM9.CurInstr, (u8)ARM9.MRTrack.Type, ARM7Timestamp, ARM7.R[15], ARM7.CurInstr, (u8)ARM7.MRTrack.Type, IME[1], IE[1]); // In the context of TASes, frame count is traditionally the primary measure of emulated time, // so it needs to be tracked even if NDS is powered off. @@ -1894,13 +1894,11 @@ u32 NDS::RunFrame() void NDS::Reschedule(u64 target) { - if (CurCPU == 0) + if (target < ARM7Target) { - if (target < (ARM9Target >> ARM9ClockShift)) - ARM9Target = (target << ARM9ClockShift); - } - else if (target < ARM7Target) ARM7Target = target; + ARM9Target = (target << ARM9ClockShift); + } } void NDS::RegisterEventFuncs(u32 id, void* that, const std::initializer_list& funcs) From 93dd0f47bde745f7dd00ca62952953c48f3f9b17 Mon Sep 17 00:00:00 2001 From: Jaklyy <102590697+Jaklyy@users.noreply.github.com> Date: Sat, 21 Dec 2024 18:06:03 -0500 Subject: [PATCH 306/306] fix an issue related to gx stalls --- src/NDS.cpp | 24 ++++++++++++++++++------ 1 file changed, 18 insertions(+), 6 deletions(-) diff --git a/src/NDS.cpp b/src/NDS.cpp index 183903bf..ea77610f 100644 --- a/src/NDS.cpp +++ b/src/NDS.cpp @@ -1665,13 +1665,19 @@ bool NDS::MainRAMHandle() { if (A9ContentionTS < ARM7Timestamp) { - if (ARM9.MRTrack.Type == MainRAMType::Null || (CPUStop & CPUStop_GXStall)) return 0; - MainRAMHandleARM9(); + if (ARM9.MRTrack.Type == MainRAMType::Null) return 0; + else if (CPUStop & CPUStop_GXStall) + { + // gx stalls can occur 
during this, and if not handled properly will cause issues + s32 cycles = GPU.GPU3D.CyclesToRunFor(); + A9ContentionTS = std::min(ARM9Target, A9ContentionTS+cycles); + } + else MainRAMHandleARM9(); } else { if (ARM7.MRTrack.Type == MainRAMType::Null) return 1; - MainRAMHandleARM7(); + else MainRAMHandleARM7(); } } } @@ -1681,13 +1687,19 @@ bool NDS::MainRAMHandle() { if (A9ContentionTS <= ARM7Timestamp) { - if (ARM9.MRTrack.Type == MainRAMType::Null || (CPUStop & CPUStop_GXStall)) return 0; - MainRAMHandleARM9(); + if (ARM9.MRTrack.Type == MainRAMType::Null) return 0; + else if (CPUStop & CPUStop_GXStall) + { + // gx stalls can occur during this, and if not handled properly will cause issues + s32 cycles = GPU.GPU3D.CyclesToRunFor(); + A9ContentionTS = std::min(ARM9Target, A9ContentionTS+cycles); + } + else MainRAMHandleARM9(); } else { if (ARM7.MRTrack.Type == MainRAMType::Null) return 1; - MainRAMHandleARM7(); + else MainRAMHandleARM7(); } } }