diff --git a/desmume/src/MMU_timing.h b/desmume/src/MMU_timing.h
index a9520c10f..9952d27e4 100644
--- a/desmume/src/MMU_timing.h
+++ b/desmume/src/MMU_timing.h
@@ -29,39 +29,37 @@
 #include "debug.h"
 #include "NDSSystem.h"
 
-////////////////////////////////////////////////////////////////
-// MEMORY TIMING ACCURACY CONFIGURATION
-//
-// the more of these are enabled,
-// the more accurate memory access timing _should_ become.
-// they should be listed roughly in order of most to least important.
-// it's reasonable to disable some of these as a speed hack.
-// obviously, these defines don't cover all the variables or features needed,
-// and in particular, DMA or code+data access bus contention is still missing.
+/*
+ * MEMORY TIMING ACCURACY CONFIGURATION
+ *
+ * the more of these are enabled,
+ * the more accurate memory access timing _should_ become.
+ * they should be listed roughly in order of most to least important.
+ * it's reasonable to disable some of these as a speed hack.
+ * obviously, these defines don't cover all the variables or features needed,
+ * and in particular, DMA or code+data access bus contention is still missing. */
 
-	//disable this to prevent the advanced timing logic from ever running at all
+/* disable this to prevent the advanced timing logic from ever running at all */
 #define ENABLE_ADVANCED_TIMING
 
 #ifdef ENABLE_ADVANCED_TIMING
-	// makes non-sequential accesses slower than sequential ones.
+/* makes non-sequential accesses slower than sequential ones. */
 #define ACCOUNT_FOR_NON_SEQUENTIAL_ACCESS
-	//(SOMETIMES THIS IS A BIG SPEED HIT!)
+/* (SOMETIMES THIS IS A BIG SPEED HIT!) */
 
-	// enables emulation of code fetch waits.
+/* enables emulation of code fetch waits. */
 #define ACCOUNT_FOR_CODE_FETCH_CYCLES
 
-	// makes access to DTCM (arm9 only) fast.
+/* makes access to DTCM (arm9 only) fast. */
 #define ACCOUNT_FOR_DATA_TCM_SPEED
 
-	// enables simulation of cache hits and cache misses.
+/* enables simulation of cache hits and cache misses. */
 #define ENABLE_CACHE_CONTROLLER_EMULATION
-#endif //ENABLE_ADVANCED_TIMING
+#endif /* ENABLE_ADVANCED_TIMING */
 
-//
-////////////////////////////////////////////////////////////////
-
-FORCEINLINE bool USE_TIMING() {
+FORCEINLINE bool USE_TIMING(void)
+{
 #ifdef ENABLE_ADVANCED_TIMING
 	return CommonSettings.advanced_timing;
 #else
@@ -69,16 +67,16 @@ FORCEINLINE bool USE_TIMING() {
 #endif
 }
 
-
 enum MMU_ACCESS_DIRECTION
 {
-	MMU_AD_READ, MMU_AD_WRITE
+	MMU_AD_READ = 0,
+	MMU_AD_WRITE
 };
 
-// note that we don't actually emulate the cache contents here,
-// only enough to guess what would be a cache hit or a cache miss.
-// this doesn't really get used unless ENABLE_CACHE_CONTROLLER_EMULATION is defined.
+/* note that we don't actually emulate the cache contents here,
+ * only enough to guess what would be a cache hit or a cache miss.
+ * this doesn't really get used unless ENABLE_CACHE_CONTROLLER_EMULATION is defined. */
 template<int SIZESHIFT, int ASSOCIATIVESHIFT, int BLOCKSIZESHIFT>
 class CacheController
 {
@@ -89,13 +87,13 @@ public:
 		u32 blockMasked = addr & BLOCKMASK;
 		if(blockMasked == m_cacheCache)
 			return true;
-		else
-			return this->CachedInternal<DIR>(addr, blockMasked);
+		return this->CachedInternal<DIR>(addr, blockMasked);
 	}
 
 	void Reset()
 	{
-		for(int blockIndex = 0; blockIndex < NUMBLOCKS; blockIndex++)
+		unsigned blockIndex;
+		for(blockIndex = 0; blockIndex < NUMBLOCKS; blockIndex++)
 			m_blocks[blockIndex].Reset();
 		m_cacheCache = ~0;
 	}
@@ -106,20 +104,24 @@ public:
 	void savestate(EMUFILE* os, int version)
 	{
+		unsigned i;
 		write32le(m_cacheCache, os);
-		for(int i = 0; i < NUMBLOCKS; i++)
+		for(i = 0; i < NUMBLOCKS; i++)
 		{
-			for(int j = 0; j < ASSOCIATIVITY; j++)
+			unsigned j;
+			for(j = 0; j < ASSOCIATIVITY; j++)
 				write32le(m_blocks[i].tag[j],os);
 			write32le(m_blocks[i].nextWay,os);
 		}
 	}
 
 	bool loadstate(EMUFILE* is, int version)
 	{
+		unsigned i;
 		read32le(&m_cacheCache, is);
-		for(int i = 0; i < NUMBLOCKS; i++)
+		for(i = 0; i < NUMBLOCKS; i++)
 		{
-			for(int j = 0; j < ASSOCIATIVITY; j++)
+			unsigned j;
+			for(j = 0; j < ASSOCIATIVITY; j++)
 				read32le(&m_blocks[i].tag[j],is);
 			read32le(&m_blocks[i].nextWay,is);
 		}
@@ -130,20 +132,24 @@ private:
 
 	template<MMU_ACCESS_DIRECTION DIR>
 	bool CachedInternal(u32 addr, u32 blockMasked)
 	{
+		unsigned way;
 		u32 blockIndex = blockMasked >> BLOCKSIZESHIFT;
 		CacheBlock& block = m_blocks[blockIndex];
 		addr &= TAGMASK;
-		for(int way = 0; way < ASSOCIATIVITY; way++)
-			if(addr == block.tag[way])
-			{
-				// found it, already allocated
-				m_cacheCache = blockMasked;
-				return true;
-			}
+		for(way = 0; way < ASSOCIATIVITY; way++)
+		{
+			if(addr != block.tag[way])
+				continue;
+
+			/* found it, already allocated */
+			m_cacheCache = blockMasked;
+			return true;
+		}
+
 		if(DIR == MMU_AD_READ)
 		{
-			// TODO: support other allocation orders?
+			/* TODO: support other allocation orders? */
 			block.tag[block.nextWay++] = addr;
 			block.nextWay %= ASSOCIATIVITY;
 			m_cacheCache = blockMasked;
@@ -170,13 +176,14 @@ private:
 
 		void Reset()
 		{
+			unsigned way;
 			nextWay = 0;
-			for(int way = 0; way < ASSOCIATIVITY; way++)
+			for(way = 0; way < ASSOCIATIVITY; way++)
 				tag[way] = 0;
 		}
 	};
 
-	u32 m_cacheCache; // optimization
+	u32 m_cacheCache; /* optimization */
 
 	CacheBlock m_blocks [NUMBLOCKS];
 };
@@ -200,9 +207,7 @@ public:
 #endif
 
 		if(AT == MMU_AT_CODE && !prohibit)
-		{
 			return 1;
-		}
 
 		u32 time = _MMU_accesstime<PROCNUM, AT, READSIZE, DIRECTION, TIMING>(address,
 #ifdef ACCOUNT_FOR_NON_SEQUENTIAL_ACCESS
@@ -248,12 +253,12 @@ private:
 
 struct MMU_struct_timing
 {
-	// technically part of the cp15, but I didn't want the dereferencing penalty.
-	// these template values correspond with the value of armcp15->cacheType.
-	CacheController<13,2,5> arm9codeCache; // 8192 bytes, 4-way associative, 32-byte blocks
-	CacheController<12,2,5> arm9dataCache; // 4096 bytes, 4-way associative, 32-byte blocks
+	/* technically part of the cp15, but I didn't want the dereferencing penalty.
+	 * these template values correspond with the value of armcp15->cacheType. */
+	CacheController<13,2,5> arm9codeCache; /* 8192 bytes, 4-way associative, 32-byte blocks */
+	CacheController<12,2,5> arm9dataCache; /* 4096 bytes, 4-way associative, 32-byte blocks */
 
-	// technically part of armcpu_t, but that struct isn't templated on PROCNUM
+	/* technically part of armcpu_t, but that struct isn't templated on PROCNUM */
 	FetchAccessUnit<0,MMU_AT_CODE> arm9codeFetch;
 	FetchAccessUnit<0,MMU_AT_DATA> arm9dataFetch;
 	FetchAccessUnit<1,MMU_AT_CODE> arm7codeFetch;
@@ -270,19 +275,17 @@ template<> FORCEINLINE FetchAccessUnit<1,MMU_AT_DATA>& MMU_struct_timing::armDataFetch
 
 extern MMU_struct_timing MMU_timing;
 
-
-
-// calculates the time a single memory access takes,
-// in units of cycles of the current processor.
-// this function replaces what used to be MMU_WAIT16 and MMU_WAIT32.
-// this may have side effects, so don't call it more than necessary.
+/* calculates the time a single memory access takes,
+ * in units of cycles of the current processor.
+ * this function replaces what used to be MMU_WAIT16 and MMU_WAIT32.
+ * this may have side effects, so don't call it more than necessary. */
 template<int PROCNUM, MMU_ACCESS_TYPE AT, int READSIZE, MMU_ACCESS_DIRECTION DIRECTION, bool TIMING>
 FORCEINLINE u32 _MMU_accesstime(u32 addr, bool sequential)
 {
-	static const int MC = 1; // cached or tcm memory speed
-	static const int M32 = (PROCNUM==ARMCPU_ARM9) ? 2 : 1; // access through 32-bit bus
-	static const int M16 = M32 * ((READSIZE>16) ? 2 : 1); // access through 16-bit bus
-	static const int MSLW = M16 * 8; // this needs tuning
+	static const int MC = 1; /* cached or tcm memory speed */
+	static const int M32 = (PROCNUM==ARMCPU_ARM9) ? 2 : 1; /* access through 32-bit bus */
+	static const int M16 = M32 * ((READSIZE>16) ? 2 : 1); /* access through 16-bit bus */
+	static const int MSLW = M16 * 8; /* this needs tuning */
 
 	if(PROCNUM==ARMCPU_ARM9 && AT == MMU_AT_CODE && addr < 0x02000000)
 		return MC; // ITCM
@@ -292,7 +295,7 @@ FORCEINLINE u32 _MMU_accesstime(u32 addr, bool sequential)
 		return MC; // DTCM
 #endif
 
-	// for now, assume the cache is always enabled for all of main memory
+	/* for now, assume the cache is always enabled for all of main memory */
 	if(AT != MMU_AT_DMA && TIMING && PROCNUM==ARMCPU_ARM9 && (addr & 0x0F000000) == 0x02000000)
 	{
 #ifdef ENABLE_CACHE_CONTROLLER_EMULATION
@@ -305,15 +308,15 @@ FORCEINLINE u32 _MMU_accesstime(u32 addr, bool sequential)
 			return MC;
 		u32 c;
 		if(sequential && AT==MMU_AT_DATA)
-			c = M16; // bonus for sequential data access
+			c = M16; /* bonus for sequential data access */
 		else if(DIRECTION == MMU_AD_READ)
 			c = M16 * 5;
 		else
-			c = M16 * 2; // should be 4, but write buffer isn't emulated yet.
+			c = M16 * 2; /* should be 4, but write buffer isn't emulated yet. */
 
 		if(DIRECTION == MMU_AD_READ)
 		{
-			// cache miss while reading means it has to fill a whole cache line
-			// by reading 32 bytes...
+			/* cache miss while reading means it has to fill a whole cache line
+			 * by reading 32 bytes... */
 			c += 8 * M32*2;
 		}
@@ -326,9 +329,9 @@ FORCEINLINE u32 _MMU_accesstime(u32 addr, bool sequential)
 		return c;
 
#elif defined(ACCOUNT_FOR_NON_SEQUENTIAL_ACCESS)
-		// this is the closest approximation I could find
-		// to the with-cache-controller timing
-		// that doesn't do any actual caching logic.
+		/* this is the closest approximation I could find
+		 * to the with-cache-controller timing
+		 * that doesn't do any actual caching logic. */
 		return sequential ? MC : M16;
#endif
 	}
@@ -347,29 +350,22 @@ FORCEINLINE u32 _MMU_accesstime(u32 addr, bool sequential)
 
 	if(TIMING && !sequential)
 	{
 		//if(c != MC || PROCNUM==ARMCPU_ARM7) // check not needed anymore because ITCM/DTCM return earlier
-		{
-			c += (PROCNUM==ARMCPU_ARM9) ? 3*2 : 1;
-		}
+		c += (PROCNUM==ARMCPU_ARM9) ? 3*2 : 1;
 	}
#endif
 
 	return c;
 }
 
-
-
-
-
-// calculates the cycle time of a single memory access in the MEM stage.
-// to be used to calculate the memCycles argument for MMU_aluMemCycles.
-// this may have side effects, so don't call it more than necessary.
+/* calculates the cycle time of a single memory access in the MEM stage.
+ * to be used to calculate the memCycles argument for MMU_aluMemCycles.
+ * this may have side effects, so don't call it more than necessary. */
 template<int PROCNUM, int READSIZE, MMU_ACCESS_DIRECTION DIRECTION, bool TIMING>
 FORCEINLINE u32 MMU_memAccessCycles(u32 addr)
 {
 	if(TIMING)
 		return MMU_timing.armDataFetch<PROCNUM>().template Fetch<READSIZE,DIRECTION,true>((addr)&(~((READSIZE>>3)-1)));
-	else
-		return MMU_timing.armDataFetch<PROCNUM>().template Fetch<READSIZE,DIRECTION,false>((addr)&(~((READSIZE>>3)-1)));
+	return MMU_timing.armDataFetch<PROCNUM>().template Fetch<READSIZE,DIRECTION,false>((addr)&(~((READSIZE>>3)-1)));
 }
 
 template<int PROCNUM, int READSIZE, MMU_ACCESS_DIRECTION DIRECTION>
@@ -377,48 +373,44 @@ FORCEINLINE u32 MMU_memAccessCycles(u32 addr)
 {
 	if(USE_TIMING())
 		return MMU_memAccessCycles<PROCNUM,READSIZE,DIRECTION,true>(addr);
-	else
-		return MMU_memAccessCycles<PROCNUM,READSIZE,DIRECTION,false>(addr);
+	return MMU_memAccessCycles<PROCNUM,READSIZE,DIRECTION,false>(addr);
 }
 
-// calculates the cycle time of a single code fetch in the FETCH stage
-// to be used to calculate the fetchCycles argument for MMU_fetchExecuteCycles.
-// this may have side effects, so don't call it more than necessary.
+/* calculates the cycle time of a single code fetch in the FETCH stage
+ * to be used to calculate the fetchCycles argument for MMU_fetchExecuteCycles.
+ * this may have side effects, so don't call it more than necessary. */
 template<int PROCNUM, int READSIZE>
 FORCEINLINE u32 MMU_codeFetchCycles(u32 addr)
 {
 	if(USE_TIMING())
 		return MMU_timing.armCodeFetch<PROCNUM>().template Fetch<READSIZE,MMU_AD_READ,true>((addr)&(~((READSIZE>>3)-1)));
-	else
-		return MMU_timing.armCodeFetch<PROCNUM>().template Fetch<READSIZE,MMU_AD_READ,false>((addr)&(~((READSIZE>>3)-1)));
+	return MMU_timing.armCodeFetch<PROCNUM>().template Fetch<READSIZE,MMU_AD_READ,false>((addr)&(~((READSIZE>>3)-1)));
 }
 
-// calculates the cycle contribution of ALU + MEM stages (= EXECUTE)
-// given ALU cycle time and the summation of multiple memory access cycle times.
-// this function might belong more in armcpu, but I don't think it matters.
+/* calculates the cycle contribution of ALU + MEM stages (= EXECUTE)
+ * given ALU cycle time and the summation of multiple memory access cycle times.
+ * this function might belong more in armcpu, but I don't think it matters. */
 template<int PROCNUM>
 FORCEINLINE u32 MMU_aluMemCycles(u32 aluCycles, u32 memCycles)
 {
-	if(PROCNUM==ARMCPU_ARM9)
-	{
-		// ALU and MEM are different stages of the 5-stage pipeline.
-		// we approximate the pipeline throughput using max,
-		// since simply adding the cycles of each instruction together
-		// fails to take into account the parallelism of the arm pipeline
-		// and would make the emulated system unnaturally slow.
-		return std::max(aluCycles, memCycles);
-	}
-	else
-	{
-		// ALU and MEM are part of the same stage of the 3-stage pipeline,
-		// thus they occur in sequence and we can simply add the counts together.
-		return aluCycles + memCycles;
-	}
+	if(PROCNUM==ARMCPU_ARM9)
+	{
+		/* ALU and MEM are different stages of the 5-stage pipeline.
+		 * we approximate the pipeline throughput using max,
+		 * since simply adding the cycles of each instruction together
+		 * fails to take into account the parallelism of the arm pipeline
+		 * and would make the emulated system unnaturally slow. */
+		return std::max(aluCycles, memCycles);
+	}
+
+	/* ALU and MEM are part of the same stage of the 3-stage pipeline,
+	 * thus they occur in sequence and we can simply add the counts together. */
+	return aluCycles + memCycles;
 }
 
-// calculates the cycle contribution of ALU + MEM stages (= EXECUTE)
-// given ALU cycle time and the description of a single memory access.
-// this may have side effects, so don't call it more than necessary.
+/* calculates the cycle contribution of ALU + MEM stages (= EXECUTE)
+ * given ALU cycle time and the description of a single memory access.
+ * this may have side effects, so don't call it more than necessary. */
 template<int PROCNUM, int READSIZE, MMU_ACCESS_DIRECTION DIRECTION>
 FORCEINLINE u32 MMU_aluMemAccessCycles(u32 aluCycles, u32 addr)
 {
@@ -429,30 +421,28 @@ FORCEINLINE u32 MMU_aluMemAccessCycles(u32 aluCycles, u32 addr)
 	return MMU_aluMemCycles<PROCNUM>(aluCycles, memCycles);
 }
 
-// calculates the cycle contribution of FETCH + EXECUTE stages
-// given executeCycles = the combined ALU+MEM cycles
-// and fetchCycles = the cycle time of the FETCH stage
-// this function might belong more in armcpu, but I don't think it matters.
+/* calculates the cycle contribution of FETCH + EXECUTE stages
+ * given executeCycles = the combined ALU+MEM cycles
+ * and fetchCycles = the cycle time of the FETCH stage
+ * this function might belong more in armcpu, but I don't think it matters. */
 template<int PROCNUM>
 FORCEINLINE u32 MMU_fetchExecuteCycles(u32 executeCycles, u32 fetchCycles)
 {
-	#ifdef ACCOUNT_FOR_CODE_FETCH_CYCLES
-	const bool allow = true;
-	#else
-	const bool allow = false;
-	#endif
+#ifdef ACCOUNT_FOR_CODE_FETCH_CYCLES
+	if(USE_TIMING())
+	{
+		/* execute and fetch are different stages of the pipeline for both arm7 and arm9.
+		 * again, we approximate the pipeline throughput using max. */
+		return std::max(executeCycles, fetchCycles);
+		/* TODO: add an option to support conflict between MEM and FETCH cycles
+		 * if they're both using the same data bus.
+		 * in the case of a conflict this should be:
+		 * return std::max(aluCycles, memCycles + fetchCycles);
+		 */
+	}
+#endif
 
-	if(USE_TIMING() && allow)
-	{
-		// execute and fetch are different stages of the pipeline for both arm7 and arm9.
-		// again, we approximate the pipeline throughput using max.
-		return std::max(executeCycles, fetchCycles);
-		// TODO: add an option to support conflict between MEM and FETCH cycles
-		// if they're both using the same data bus.
-		// in the case of a conflict this should be:
-		// return std::max(aluCycles, memCycles + fetchCycles);
-	}
-	return executeCycles;
+	return executeCycles;
 }
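
Reviewer note, not part of the patch: CacheController above tracks only tags -- just enough state to guess hit or miss -- and allocates a way on read misses in round-robin order. The following standalone sketch restates that logic with plain arrays so it can be compiled and poked at in isolation. All names here (TagCache, kWays, access, ...) are hypothetical, not from the desmume source; the geometry is the ARM9 data cache declared in the header (CacheController<12,2,5>: 4096 bytes, 4-way associative, 32-byte blocks).

    #include <cstdint>
    #include <cstdio>

    /* Tag-only model of a 4-way set-associative cache with round-robin
     * replacement. No data is stored; only enough to classify an access
     * as a probable hit or miss, mirroring CachedInternal. */
    struct TagCache
    {
        static const unsigned kWays      = 4;   /* 1 << 2 (ASSOCIATIVESHIFT) */
        static const unsigned kBlockSize = 32;  /* 1 << 5 (BLOCKSIZESHIFT)   */
        static const unsigned kSets      = 4096 / (kWays * kBlockSize); /* 32 */

        uint32_t tag[kSets][kWays];
        unsigned nextWay[kSets];

        TagCache()
        {
            /* like CacheController::Reset(): zero all tags and way counters */
            for(unsigned s = 0; s < kSets; s++)
            {
                nextWay[s] = 0;
                for(unsigned w = 0; w < kWays; w++)
                    tag[s][w] = 0;
            }
        }

        /* returns true on a guessed hit; a read miss allocates the next way
         * in round-robin order, like CachedInternal<MMU_AD_READ> */
        bool access(uint32_t addr, bool isRead)
        {
            uint32_t block = addr / kBlockSize;
            unsigned set   = block % kSets;
            uint32_t t     = block / kSets;  /* stands in for addr & TAGMASK */

            for(unsigned w = 0; w < kWays; w++)
                if(tag[set][w] == t)
                    return true;             /* tag match: treat as cache hit */

            if(isRead)
            {
                /* fill one way; writes never allocate in this model */
                tag[set][nextWay[set]] = t;
                nextWay[set] = (nextWay[set] + 1) % kWays;
            }
            return false;  /* miss: caller charges the slow line-fill timing */
        }
    };

    int main(void)
    {
        TagCache c;
        printf("%d\n", c.access(0x02000040, true)); /* 0: cold miss, allocates */
        printf("%d\n", c.access(0x02000044, true)); /* 1: same 32-byte block, hit */
        return 0;
    }

One quirk carried over from the original: Reset() zeroes the tags, so an address whose tag happens to be zero reads as initially cached. For NDS main-memory addresses (0x02xxxxxx) the tag is never zero, and a wrong guess only skews the timing estimate, never correctness of emulation.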