From 602da31591ec1f6461315470567b045e456e4e72 Mon Sep 17 00:00:00 2001
From: zeromus
Date: Tue, 16 Aug 2016 14:16:59 -0500
Subject: [PATCH] Revert "(MMU_timings.h) Style nits/cleanups"

This reverts commit 04c8b0915e85da85a8f08a7b2ad6814d95b6618f.
---
 desmume/src/MMU_timing.h | 244 ++++++++++++++++++++-------------------
 1 file changed, 127 insertions(+), 117 deletions(-)

diff --git a/desmume/src/MMU_timing.h b/desmume/src/MMU_timing.h
index 9952d27e4..a9520c10f 100644
--- a/desmume/src/MMU_timing.h
+++ b/desmume/src/MMU_timing.h
@@ -29,37 +29,39 @@
 #include "debug.h"
 #include "NDSSystem.h"
 
-/*
- * MEMORY TIMING ACCURACY CONFIGURATION
- *
- * the more of these are enabled,
- * the more accurate memory access timing _should_ become.
- * they should be listed roughly in order of most to least important.
- * it's reasonable to disable some of these as a speed hack.
- * obviously, these defines don't cover all the variables or features needed,
- * and in particular, DMA or code+data access bus contention is still missing. */
+////////////////////////////////////////////////////////////////
+// MEMORY TIMING ACCURACY CONFIGURATION
+//
+// the more of these are enabled,
+// the more accurate memory access timing _should_ become.
+// they should be listed roughly in order of most to least important.
+// it's reasonable to disable some of these as a speed hack.
+// obviously, these defines don't cover all the variables or features needed,
+// and in particular, DMA or code+data access bus contention is still missing.
 
-/* disable this to prevent the advanced timing logic from ever running at all */
+	//disable this to prevent the advanced timing logic from ever running at all
 #define ENABLE_ADVANCED_TIMING
 
 #ifdef ENABLE_ADVANCED_TIMING
-/* makes non-sequential accesses slower than sequential ones. */
+	// makes non-sequential accesses slower than sequential ones.
 #define ACCOUNT_FOR_NON_SEQUENTIAL_ACCESS
-/* (SOMETIMES THIS IS A BIG SPEED HIT!) */
+	//(SOMETIMES THIS IS A BIG SPEED HIT!)
 
-/* enables emulation of code fetch waits. */
+	// enables emulation of code fetch waits.
#define ACCOUNT_FOR_CODE_FETCH_CYCLES
 
-/* makes access to DTCM (arm9 only) fast. */
+	// makes access to DTCM (arm9 only) fast.
 #define ACCOUNT_FOR_DATA_TCM_SPEED
 
-/* enables simulation of cache hits and cache misses. */
+	// enables simulation of cache hits and cache misses.
 #define ENABLE_CACHE_CONTROLLER_EMULATION
 
-#endif /* ENABLE_ADVANCED_TIMING */
+#endif //ENABLE_ADVANCED_TIMING
 
-FORCEINLINE bool USE_TIMING(void)
-{
+//
+////////////////////////////////////////////////////////////////
+
+FORCEINLINE bool USE_TIMING() {
 #ifdef ENABLE_ADVANCED_TIMING
 	return CommonSettings.advanced_timing;
 #else
@@ -67,16 +69,16 @@ FORCEINLINE bool USE_TIMING(void)
 #endif
 }
 
+
 enum MMU_ACCESS_DIRECTION
 {
-	MMU_AD_READ = 0,
-	MMU_AD_WRITE
+	MMU_AD_READ, MMU_AD_WRITE
 };
 
-/* note that we don't actually emulate the cache contents here,
- * only enough to guess what would be a cache hit or a cache miss.
- * this doesn't really get used unless ENABLE_CACHE_CONTROLLER_EMULATION is defined. */
+// note that we don't actually emulate the cache contents here,
+// only enough to guess what would be a cache hit or a cache miss.
+// this doesn't really get used unless ENABLE_CACHE_CONTROLLER_EMULATION is defined.
 template<int SIZESHIFT, int ASSOCIATIVITYSHIFT, int BLOCKSIZESHIFT>
 class CacheController
 {
@@ -87,13 +89,13 @@ public:
 		u32 blockMasked = addr & BLOCKMASK;
 		if(blockMasked == m_cacheCache)
 			return true;
-		return this->CachedInternal<DIR>(addr, blockMasked);
+		else
+			return this->CachedInternal<DIR>(addr, blockMasked);
 	}
 
 	void Reset()
 	{
-		unsigned blockIndex;
-		for(blockIndex = 0; blockIndex < NUMBLOCKS; blockIndex++)
+		for(int blockIndex = 0; blockIndex < NUMBLOCKS; blockIndex++)
 			m_blocks[blockIndex].Reset();
 		m_cacheCache = ~0;
 	}
@@ -104,24 +106,20 @@ public:
 
 	void savestate(EMUFILE* os, int version)
 	{
-		unsigned i;
 		write32le(m_cacheCache, os);
-		for(i = 0; i < NUMBLOCKS; i++)
+		for(int i = 0; i < NUMBLOCKS; i++)
 		{
-			unsigned j;
-			for(j = 0; j < ASSOCIATIVITY; j++)
+			for(int j = 0; j < ASSOCIATIVITY; j++)
 				write32le(m_blocks[i].tag[j],os);
 			write32le(m_blocks[i].nextWay,os);
 		}
 	}
 
 	bool loadstate(EMUFILE* is, int version)
 	{
-		unsigned i;
 		read32le(&m_cacheCache, is);
-		for(i = 0; i < NUMBLOCKS; i++)
+		for(int i = 0; i < NUMBLOCKS; i++)
 		{
-			unsigned j;
-			for(j = 0; j < ASSOCIATIVITY; j++)
+			for(int j = 0; j < ASSOCIATIVITY; j++)
 				read32le(&m_blocks[i].tag[j],is);
 			read32le(&m_blocks[i].nextWay,is);
 		}
@@ -132,24 +130,20 @@ private:
 	template<MMU_ACCESS_DIRECTION DIR>
 	bool CachedInternal(u32 addr, u32 blockMasked)
 	{
-		unsigned way;
 		u32 blockIndex = blockMasked >> BLOCKSIZESHIFT;
 		CacheBlock& block = m_blocks[blockIndex];
 		addr &= TAGMASK;
-		for(way = 0; way < ASSOCIATIVITY; way++)
-		{
-			if(addr != block.tag[way])
-				continue;
-
-			/* found it, already allocated */
-			m_cacheCache = blockMasked;
-			return true;
-		}
-
+		for(int way = 0; way < ASSOCIATIVITY; way++)
+			if(addr == block.tag[way])
+			{
+				// found it, already allocated
+				m_cacheCache = blockMasked;
+				return true;
+			}
 		if(DIR == MMU_AD_READ)
 		{
-			/* TODO: support other allocation orders? */
+			// TODO: support other allocation orders?
 			block.tag[block.nextWay++] = addr;
 			block.nextWay %= ASSOCIATIVITY;
 			m_cacheCache = blockMasked;
@@ -176,14 +170,13 @@ private:
 
 		void Reset()
 		{
-			unsigned way;
 			nextWay = 0;
-			for(way = 0; way < ASSOCIATIVITY; way++)
+			for(int way = 0; way < ASSOCIATIVITY; way++)
 				tag[way] = 0;
 		}
 	};
 
-	u32 m_cacheCache; /* optimization */
+	u32 m_cacheCache; // optimization
 
 	CacheBlock m_blocks [NUMBLOCKS];
 };
@@ -207,7 +200,9 @@ public:
 #endif
 
 		if(AT == MMU_AT_CODE && !prohibit)
+		{
 			return 1;
+		}
 
 		u32 time = _MMU_accesstime<PROCNUM, AT, READSIZE, DIRECTION, TIMING>(address,
 #ifdef ACCOUNT_FOR_NON_SEQUENTIAL_ACCESS
@@ -253,12 +248,12 @@ private:
 
 struct MMU_struct_timing
 {
-	/* technically part of the cp15, but I didn't want the dereferencing penalty.
-	 * these template values correspond with the value of armcp15->cacheType. */
-	CacheController<13,2,5> arm9codeCache; /* 8192 bytes, 4-way associative, 32-byte blocks */
-	CacheController<12,2,5> arm9dataCache; /* 4096 bytes, 4-way associative, 32-byte blocks */
+	// technically part of the cp15, but I didn't want the dereferencing penalty.
+	// these template values correspond with the value of armcp15->cacheType.
+	CacheController<13,2,5> arm9codeCache; // 8192 bytes, 4-way associative, 32-byte blocks
+	CacheController<12,2,5> arm9dataCache; // 4096 bytes, 4-way associative, 32-byte blocks
 
-	/* technically part of armcpu_t, but that struct isn't templated on PROCNUM */
+	// technically part of armcpu_t, but that struct isn't templated on PROCNUM
 	FetchAccessUnit<0,MMU_AT_CODE> arm9codeFetch;
 	FetchAccessUnit<0,MMU_AT_DATA> arm9dataFetch;
 	FetchAccessUnit<1,MMU_AT_CODE> arm7codeFetch;
@@ -275,17 +270,19 @@ template<> FORCEINLINE FetchAccessUnit<1,MMU_AT_DATA>& MMU_struct_timing::armDat
 
 extern MMU_struct_timing MMU_timing;
 
-/* calculates the time a single memory access takes,
- * in units of cycles of the current processor.
- * this function replaces what used to be MMU_WAIT16 and MMU_WAIT32.
- * this may have side effects, so don't call it more than necessary. */
+
+
+// calculates the time a single memory access takes,
+// in units of cycles of the current processor.
+// this function replaces what used to be MMU_WAIT16 and MMU_WAIT32.
+// this may have side effects, so don't call it more than necessary.
 template<int PROCNUM, MMU_ACCESS_TYPE AT, int READSIZE, MMU_ACCESS_DIRECTION DIRECTION, bool TIMING>
 FORCEINLINE u32 _MMU_accesstime(u32 addr, bool sequential)
 {
-	static const int MC = 1; /* cached or tcm memory speed */
-	static const int M32 = (PROCNUM==ARMCPU_ARM9) ? 2 : 1; /* access through 32-bit bus */
-	static const int M16 = M32 * ((READSIZE>16) ? 2 : 1); /* access through 16-bit bus */
-	static const int MSLW = M16 * 8; /* this needs tuning */
+	static const int MC = 1; // cached or tcm memory speed
+	static const int M32 = (PROCNUM==ARMCPU_ARM9) ? 2 : 1; // access through 32-bit bus
+	static const int M16 = M32 * ((READSIZE>16) ? 2 : 1); // access through 16-bit bus
+	static const int MSLW = M16 * 8; // this needs tuning
 
 	if(PROCNUM==ARMCPU_ARM9 && AT == MMU_AT_CODE && addr < 0x02000000)
 		return MC; // ITCM
@@ -295,7 +292,7 @@ FORCEINLINE u32 _MMU_accesstime(u32 addr, bool sequential)
 		return MC; // DTCM
 #endif
 
-	/* for now, assume the cache is always enabled for all of main memory */
+	// for now, assume the cache is always enabled for all of main memory
 	if(AT != MMU_AT_DMA && TIMING && PROCNUM==ARMCPU_ARM9 && (addr & 0x0F000000) == 0x02000000)
 	{
 #ifdef ENABLE_CACHE_CONTROLLER_EMULATION
@@ -308,15 +305,15 @@ FORCEINLINE u32 _MMU_accesstime(u32 addr, bool sequential)
 			return MC;
 		u32 c;
 		if(sequential && AT==MMU_AT_DATA)
-			c = M16; /* bonus for sequential data access */
+			c = M16; // bonus for sequential data access
 		else if(DIRECTION == MMU_AD_READ)
 			c = M16 * 5;
 		else
-			c = M16 * 2; /* should be 4, but write buffer isn't emulated yet. */
+			c = M16 * 2; // should be 4, but write buffer isn't emulated yet.
 		if(DIRECTION == MMU_AD_READ)
 		{
-			/* cache miss while reading means it has to fill a whole cache line
-			 * by reading 32 bytes... */
+			// cache miss while reading means it has to fill a whole cache line
+			// by reading 32 bytes...
 			c += 8 * M32*2;
 		}
 
@@ -329,9 +326,9 @@ FORCEINLINE u32 _MMU_accesstime(u32 addr, bool sequential)
 		return c;
 
 #elif defined(ACCOUNT_FOR_NON_SEQUENTIAL_ACCESS)
-		/* this is the closest approximation I could find
-		 * to the with-cache-controller timing
-		 * that doesn't do any actual caching logic. */
+		// this is the closest approximation I could find
+		// to the with-cache-controller timing
+		// that doesn't do any actual caching logic.
 		return sequential ? MC : M16;
 #endif
 	}
 
@@ -350,22 +347,29 @@ FORCEINLINE u32 _MMU_accesstime(u32 addr, bool sequential)
 	if(TIMING && !sequential)
 	{
 		//if(c != MC || PROCNUM==ARMCPU_ARM7) // check not needed anymore because ITCM/DTCM return earlier
-		c += (PROCNUM==ARMCPU_ARM9) ? 3*2 : 1;
+		{
+			c += (PROCNUM==ARMCPU_ARM9) ? 3*2 : 1;
+		}
 	}
 #endif
 
 	return c;
 }
 
-/* calculates the cycle time of a single memory access in the MEM stage.
- * to be used to calculate the memCycles argument for MMU_aluMemCycles.
- * this may have side effects, so don't call it more than necessary. */
+
+
+
+
+// calculates the cycle time of a single memory access in the MEM stage.
+// to be used to calculate the memCycles argument for MMU_aluMemCycles.
+// this may have side effects, so don't call it more than necessary.
 template<int PROCNUM, int READSIZE, MMU_ACCESS_DIRECTION DIRECTION, bool TIMING>
 FORCEINLINE u32 MMU_memAccessCycles(u32 addr)
 {
 	if(TIMING)
 		return MMU_timing.armDataFetch<PROCNUM>().template Fetch<READSIZE, DIRECTION, true>((addr)&(~((READSIZE>>3)-1)));
-	return MMU_timing.armDataFetch<PROCNUM>().template Fetch<READSIZE, DIRECTION, false>((addr)&(~((READSIZE>>3)-1)));
+	else
+		return MMU_timing.armDataFetch<PROCNUM>().template Fetch<READSIZE, DIRECTION, false>((addr)&(~((READSIZE>>3)-1)));
 }
 
 template<int PROCNUM, int READSIZE, MMU_ACCESS_DIRECTION DIRECTION>
@@ -373,44 +377,48 @@ FORCEINLINE u32 MMU_memAccessCycles(u32 addr)
 {
 	if(USE_TIMING())
 		return MMU_memAccessCycles<PROCNUM,READSIZE,DIRECTION,true>(addr);
-	return MMU_memAccessCycles<PROCNUM,READSIZE,DIRECTION,false>(addr);
+	else
+		return MMU_memAccessCycles<PROCNUM,READSIZE,DIRECTION,false>(addr);
 }
 
-/* calculates the cycle time of a single code fetch in the FETCH stage
- * to be used to calculate the fetchCycles argument for MMU_fetchExecuteCycles.
- * this may have side effects, so don't call it more than necessary. */
+// calculates the cycle time of a single code fetch in the FETCH stage
+// to be used to calculate the fetchCycles argument for MMU_fetchExecuteCycles.
+// this may have side effects, so don't call it more than necessary.
 template<int PROCNUM, int READSIZE>
 FORCEINLINE u32 MMU_codeFetchCycles(u32 addr)
 {
 	if(USE_TIMING())
 		return MMU_timing.armCodeFetch<PROCNUM>().template Fetch<READSIZE, MMU_AD_READ, true>((addr)&(~((READSIZE>>3)-1)));
-	return MMU_timing.armCodeFetch<PROCNUM>().template Fetch<READSIZE, MMU_AD_READ, false>((addr)&(~((READSIZE>>3)-1)));
+	else
+		return MMU_timing.armCodeFetch<PROCNUM>().template Fetch<READSIZE, MMU_AD_READ, false>((addr)&(~((READSIZE>>3)-1)));
 }
 
-/* calculates the cycle contribution of ALU + MEM stages (= EXECUTE)
- * given ALU cycle time and the summation of multiple memory access cycle times.
- * this function might belong more in armcpu, but I don't think it matters. */
+// calculates the cycle contribution of ALU + MEM stages (= EXECUTE)
+// given ALU cycle time and the summation of multiple memory access cycle times.
+// this function might belong more in armcpu, but I don't think it matters.
 template<int PROCNUM>
 FORCEINLINE u32 MMU_aluMemCycles(u32 aluCycles, u32 memCycles)
 {
-	if(PROCNUM==ARMCPU_ARM9)
-	{
-		/* ALU and MEM are different stages of the 5-stage pipeline.
-		 * we approximate the pipeline throughput using max,
-		 * since simply adding the cycles of each instruction together
-		 * fails to take into account the parallelism of the arm pipeline
-		 * and would make the emulated system unnaturally slow. */
-		return std::max(aluCycles, memCycles);
-	}
-
-	/* ALU and MEM are part of the same stage of the 3-stage pipeline,
-	 * thus they occur in sequence and we can simply add the counts together. */
-	return aluCycles + memCycles;
+	if(PROCNUM==ARMCPU_ARM9)
+	{
+		// ALU and MEM are different stages of the 5-stage pipeline.
+		// we approximate the pipeline throughput using max,
+		// since simply adding the cycles of each instruction together
+		// fails to take into account the parallelism of the arm pipeline
+		// and would make the emulated system unnaturally slow.
+		return std::max(aluCycles, memCycles);
+	}
+	else
+	{
+		// ALU and MEM are part of the same stage of the 3-stage pipeline,
+		// thus they occur in sequence and we can simply add the counts together.
+		return aluCycles + memCycles;
+	}
 }
 
-/* calculates the cycle contribution of ALU + MEM stages (= EXECUTE)
- * given ALU cycle time and the description of a single memory access.
- * this may have side effects, so don't call it more than necessary. */
+// calculates the cycle contribution of ALU + MEM stages (= EXECUTE)
+// given ALU cycle time and the description of a single memory access.
+// this may have side effects, so don't call it more than necessary.
 template<int PROCNUM, int READSIZE, MMU_ACCESS_DIRECTION DIRECTION>
 FORCEINLINE u32 MMU_aluMemAccessCycles(u32 aluCycles, u32 addr)
 {
@@ -421,28 +429,30 @@ FORCEINLINE u32 MMU_aluMemAccessCycles(u32 aluCycles, u32 addr)
 	return MMU_aluMemCycles<PROCNUM>(aluCycles, memCycles);
 }
 
-/* calculates the cycle contribution of FETCH + EXECUTE stages
- * given executeCycles = the combined ALU+MEM cycles
- * and fetchCycles = the cycle time of the FETCH stage
- * this function might belong more in armcpu, but I don't think it matters. */
+// calculates the cycle contribution of FETCH + EXECUTE stages
+// given executeCycles = the combined ALU+MEM cycles
+// and fetchCycles = the cycle time of the FETCH stage
+// this function might belong more in armcpu, but I don't think it matters.
 template<int PROCNUM>
 FORCEINLINE u32 MMU_fetchExecuteCycles(u32 executeCycles, u32 fetchCycles)
 {
-#ifdef ACCOUNT_FOR_CODE_FETCH_CYCLES
-	if(USE_TIMING())
-	{
-		/* execute and fetch are different stages of the pipeline for both arm7 and arm9.
-		 * again, we approximate the pipeline throughput using max. */
-		return std::max(executeCycles, fetchCycles);
-		/* TODO: add an option to support conflict between MEM and FETCH cycles
-		 * if they're both using the same data bus.
-		 * in the case of a conflict this should be:
-		 * return std::max(aluCycles, memCycles + fetchCycles);
-		 */
-	}
-#endif
+	#ifdef ACCOUNT_FOR_CODE_FETCH_CYCLES
+	const bool allow = true;
+	#else
+	const bool allow = false;
+	#endif
 
-	return executeCycles;
+	if(USE_TIMING() && allow)
+	{
+		// execute and fetch are different stages of the pipeline for both arm7 and arm9.
+		// again, we approximate the pipeline throughput using max.
+		return std::max(executeCycles, fetchCycles);
+		// TODO: add an option to support conflict between MEM and FETCH cycles
+		// if they're both using the same data bus.
+		// in the case of a conflict this should be:
+		// return std::max(aluCycles, memCycles + fetchCycles);
+	}
+	return executeCycles;
 }
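
A few notes on the code this patch restores. The CacheController never stores cached data: it keeps only tags plus a per-set round-robin fill pointer, which is just enough bookkeeping to classify each access as a hit or a miss for timing purposes. Below is a minimal standalone sketch of that tag-only idea, assuming the CacheController<12,2,5> geometry (4096 bytes, 4-way associative, 32-byte blocks); the names TagOnlyCache and access are illustrative, not DeSmuME's, and the set/tag split is the conventional one rather than the exact TAGMASK/BLOCKMASK arithmetic in the header:

#include <cstdint>
#include <cstring>

// Tag-only set-associative cache model: predicts hit/miss without storing
// any cached data. Geometry mirrors CacheController<12,2,5> above:
// 4096-byte cache, 4-way associative, 32-byte blocks.
struct TagOnlyCache
{
	static const int SIZESHIFT  = 12;        // log2(total bytes)
	static const int ASSOCSHIFT = 2;         // log2(ways)
	static const int BLOCKSHIFT = 5;         // log2(bytes per block)
	static const int WAYS = 1 << ASSOCSHIFT;
	static const int SETS = 1 << (SIZESHIFT - ASSOCSHIFT - BLOCKSHIFT);

	uint32_t tag[SETS][WAYS];
	uint32_t nextWay[SETS];                  // round-robin fill pointer per set

	void reset() { std::memset(this, 0, sizeof(*this)); }

	// returns true on a predicted hit; a read miss allocates a way
	// round-robin, like the restored "TODO: support other allocation orders?"
	bool access(uint32_t addr, bool isRead)
	{
		uint32_t block = addr >> BLOCKSHIFT;     // block-aligned address
		uint32_t set = block & (SETS - 1);
		for (int w = 0; w < WAYS; w++)
			if (tag[set][w] == block)            // full block number used as tag
				return true;                     // found, already allocated
		if (isRead)
		{
			tag[set][nextWay[set]] = block;      // fill on read miss only
			nextWay[set] = (nextWay[set] + 1) % WAYS;
		}
		return false;
	}
};

Like the restored Reset(), clearing tags to zero means a block whose tag is genuinely zero looks already cached after a reset; presumably that inaccuracy is tolerable because the model only guesses timing, never supplies data.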
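The bus-cost constants in _MMU_accesstime compose simply. For an ARM9 32-bit access, M32 = 2 and M16 = 4, so the miss path visible in the hunks above yields M16*5 + 8*M32*2 = 52 cycles for a non-sequential read (20 for the access plus 32 for the 8-word line fill), M16 + 32 = 36 for a sequential data read, and M16*2 = 8 for a write. The sketch below restates only that visible arithmetic; the lines elided between hunks (MSLW, the non-sequential penalty, etc.) may adjust the final figure:

#include <cstdio>

// restates the visible miss-cost arithmetic of _MMU_accesstime's
// ARM9 main-memory branch (the path taken when the cache model misses).
unsigned missCost(bool sequentialData, bool isRead, int readsize)
{
	const unsigned M32 = 2;                             // ARM9: 32-bit bus access
	const unsigned M16 = M32 * ((readsize > 16) ? 2 : 1);
	unsigned c;
	if (sequentialData)
		c = M16;                                        // bonus for sequential data access
	else if (isRead)
		c = M16 * 5;
	else
		c = M16 * 2;                                    // write buffer not emulated
	if (isRead)
		c += 8 * M32 * 2;                               // fill a 32-byte line: 8 words
	return c;
}

int main()
{
	std::printf("%u\n", missCost(false, true, 32));   // 52: non-sequential 32-bit read
	std::printf("%u\n", missCost(true, true, 32));    // 36: sequential 32-bit data read
	std::printf("%u\n", missCost(false, false, 32));  // 8:  non-sequential 32-bit write
}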
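Finally, the max-versus-add split in MMU_aluMemCycles (and the max in MMU_fetchExecuteCycles) is the heart of the pipeline model: on the ARM9's 5-stage pipeline, ALU and MEM are separate stages that overlap across neighboring instructions, so the slower one hides the faster; on the ARM7's 3-stage pipeline they share a stage and serialize. A toy demonstration of that cost-merging rule (standalone sketch, not DeSmuME code):

#include <algorithm>
#include <cstdio>

// overlapping pipeline stages cost max(a, b); serialized work costs a + b.
unsigned aluMemCycles(bool arm9, unsigned aluCycles, unsigned memCycles)
{
	return arm9 ? std::max(aluCycles, memCycles)   // 5-stage: stages overlap
	            : aluCycles + memCycles;           // 3-stage: same stage, adds up
}

int main()
{
	// a load costing 1 ALU cycle and 3 MEM cycles:
	std::printf("ARM9: %u\n", aluMemCycles(true, 1, 3));   // 3, MEM hides ALU
	std::printf("ARM7: %u\n", aluMemCycles(false, 1, 3));  // 4, they serialize
}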