Revert "(MMU_timings.h) Style nits/cleanups"

This reverts commit 04c8b0915e85da85a8f08a7b2ad6814d95b6618f.
zeromus 2016-08-16 14:16:59 -05:00
parent b554b05b00
commit 602da31591
1 changed file with 127 additions and 117 deletions


@@ -29,37 +29,39 @@
 #include "debug.h"
 #include "NDSSystem.h"

-/*
- * MEMORY TIMING ACCURACY CONFIGURATION
- *
- * the more of these are enabled,
- * the more accurate memory access timing _should_ become.
- * they should be listed roughly in order of most to least important.
- * it's reasonable to disable some of these as a speed hack.
- * obviously, these defines don't cover all the variables or features needed,
- * and in particular, DMA or code+data access bus contention is still missing. */
+////////////////////////////////////////////////////////////////
+// MEMORY TIMING ACCURACY CONFIGURATION
+//
+// the more of these are enabled,
+// the more accurate memory access timing _should_ become.
+// they should be listed roughly in order of most to least important.
+// it's reasonable to disable some of these as a speed hack.
+// obviously, these defines don't cover all the variables or features needed,
+// and in particular, DMA or code+data access bus contention is still missing.

-/* disable this to prevent the advanced timing logic from ever running at all */
+//disable this to prevent the advanced timing logic from ever running at all
 #define ENABLE_ADVANCED_TIMING

 #ifdef ENABLE_ADVANCED_TIMING
-/* makes non-sequential accesses slower than sequential ones. */
+// makes non-sequential accesses slower than sequential ones.
 #define ACCOUNT_FOR_NON_SEQUENTIAL_ACCESS
-/* (SOMETIMES THIS IS A BIG SPEED HIT!) */
+//(SOMETIMES THIS IS A BIG SPEED HIT!)
-/* enables emulation of code fetch waits. */
+// enables emulation of code fetch waits.
 #define ACCOUNT_FOR_CODE_FETCH_CYCLES
-/* makes access to DTCM (arm9 only) fast. */
+// makes access to DTCM (arm9 only) fast.
 #define ACCOUNT_FOR_DATA_TCM_SPEED
-/* enables simulation of cache hits and cache misses. */
+// enables simulation of cache hits and cache misses.
 #define ENABLE_CACHE_CONTROLLER_EMULATION
-#endif /* ENABLE_ADVANCED_TIMING */
+#endif //ENABLE_ADVANCED_TIMING
+//
+////////////////////////////////////////////////////////////////

-FORCEINLINE bool USE_TIMING(void)
-{
+FORCEINLINE bool USE_TIMING() {
 #ifdef ENABLE_ADVANCED_TIMING
 	return CommonSettings.advanced_timing;
 #else
@@ -67,16 +69,16 @@ FORCEINLINE bool USE_TIMING(void)
 #endif
 }

 enum MMU_ACCESS_DIRECTION
 {
-	MMU_AD_READ = 0,
-	MMU_AD_WRITE
+	MMU_AD_READ, MMU_AD_WRITE
 };

-/* note that we don't actually emulate the cache contents here,
- * only enough to guess what would be a cache hit or a cache miss.
- * this doesn't really get used unless ENABLE_CACHE_CONTROLLER_EMULATION is defined. */
+// note that we don't actually emulate the cache contents here,
+// only enough to guess what would be a cache hit or a cache miss.
+// this doesn't really get used unless ENABLE_CACHE_CONTROLLER_EMULATION is defined.
 template<int SIZESHIFT, int ASSOCIATIVESHIFT, int BLOCKSIZESHIFT>
 class CacheController
 {
@@ -87,13 +89,13 @@ public:
 		u32 blockMasked = addr & BLOCKMASK;
 		if(blockMasked == m_cacheCache)
 			return true;
-		return this->CachedInternal<DIR>(addr, blockMasked);
+		else
+			return this->CachedInternal<DIR>(addr, blockMasked);
 	}

 	void Reset()
 	{
-		unsigned blockIndex;
-		for(blockIndex = 0; blockIndex < NUMBLOCKS; blockIndex++)
+		for(int blockIndex = 0; blockIndex < NUMBLOCKS; blockIndex++)
 			m_blocks[blockIndex].Reset();
 		m_cacheCache = ~0;
 	}
@@ -104,24 +106,20 @@ public:
 	void savestate(EMUFILE* os, int version)
 	{
-		unsigned i;
 		write32le(m_cacheCache, os);
-		for(i = 0; i < NUMBLOCKS; i++)
+		for(int i = 0; i < NUMBLOCKS; i++)
 		{
-			unsigned j;
-			for(j = 0; j < ASSOCIATIVITY; j++)
+			for(int j = 0; j < ASSOCIATIVITY; j++)
 				write32le(m_blocks[i].tag[j],os);
 			write32le(m_blocks[i].nextWay,os);
 		}
 	}

 	bool loadstate(EMUFILE* is, int version)
 	{
-		unsigned i;
 		read32le(&m_cacheCache, is);
-		for(i = 0; i < NUMBLOCKS; i++)
+		for(int i = 0; i < NUMBLOCKS; i++)
 		{
-			unsigned j;
-			for(j = 0; j < ASSOCIATIVITY; j++)
+			for(int j = 0; j < ASSOCIATIVITY; j++)
 				read32le(&m_blocks[i].tag[j],is);
 			read32le(&m_blocks[i].nextWay,is);
 		}
@@ -132,24 +130,20 @@ private:
 	template<MMU_ACCESS_DIRECTION DIR>
 	bool CachedInternal(u32 addr, u32 blockMasked)
 	{
-		unsigned way;
 		u32 blockIndex = blockMasked >> BLOCKSIZESHIFT;
 		CacheBlock& block = m_blocks[blockIndex];
 		addr &= TAGMASK;
-		for(way = 0; way < ASSOCIATIVITY; way++)
-		{
-			if(addr != block.tag[way])
-				continue;
-
-			/* found it, already allocated */
-			m_cacheCache = blockMasked;
-			return true;
-		}
+		for(int way = 0; way < ASSOCIATIVITY; way++)
+			if(addr == block.tag[way])
+			{
+				// found it, already allocated
+				m_cacheCache = blockMasked;
+				return true;
+			}
+
 		if(DIR == MMU_AD_READ)
 		{
-			/* TODO: support other allocation orders? */
+			// TODO: support other allocation orders?
 			block.tag[block.nextWay++] = addr;
 			block.nextWay %= ASSOCIATIVITY;
 			m_cacheCache = blockMasked;
@@ -176,14 +170,13 @@ private:
 		void Reset()
 		{
-			unsigned way;
 			nextWay = 0;
-			for(way = 0; way < ASSOCIATIVITY; way++)
+			for(int way = 0; way < ASSOCIATIVITY; way++)
 				tag[way] = 0;
 		}
 	};

-	u32 m_cacheCache; /* optimization */
+	u32 m_cacheCache; // optimization
 	CacheBlock m_blocks [NUMBLOCKS];
 };
@@ -207,7 +200,9 @@ public:
 #endif

 		if(AT == MMU_AT_CODE && !prohibit)
+		{
 			return 1;
+		}

 		u32 time = _MMU_accesstime<PROCNUM, AT, READSIZE, DIRECTION,TIMING>(address,
 #ifdef ACCOUNT_FOR_NON_SEQUENTIAL_ACCESS
@@ -253,12 +248,12 @@ private:
 struct MMU_struct_timing
 {
-	/* technically part of the cp15, but I didn't want the dereferencing penalty.
-	 * these template values correspond with the value of armcp15->cacheType. */
-	CacheController<13,2,5> arm9codeCache; /* 8192 bytes, 4-way associative, 32-byte blocks */
-	CacheController<12,2,5> arm9dataCache; /* 4096 bytes, 4-way associative, 32-byte blocks */
+	// technically part of the cp15, but I didn't want the dereferencing penalty.
+	// these template values correspond with the value of armcp15->cacheType.
+	CacheController<13,2,5> arm9codeCache; // 8192 bytes, 4-way associative, 32-byte blocks
+	CacheController<12,2,5> arm9dataCache; // 4096 bytes, 4-way associative, 32-byte blocks

-	/* technically part of armcpu_t, but that struct isn't templated on PROCNUM */
+	// technically part of armcpu_t, but that struct isn't templated on PROCNUM
 	FetchAccessUnit<0,MMU_AT_CODE> arm9codeFetch;
 	FetchAccessUnit<0,MMU_AT_DATA> arm9dataFetch;
 	FetchAccessUnit<1,MMU_AT_CODE> arm7codeFetch;
@@ -275,17 +270,19 @@ template<> FORCEINLINE FetchAccessUnit<1,MMU_AT_DATA>& MMU_struct_timing::armDat
 extern MMU_struct_timing MMU_timing;

-/* calculates the time a single memory access takes,
- * in units of cycles of the current processor.
- * this function replaces what used to be MMU_WAIT16 and MMU_WAIT32.
- * this may have side effects, so don't call it more than necessary. */
+// calculates the time a single memory access takes,
+// in units of cycles of the current processor.
+// this function replaces what used to be MMU_WAIT16 and MMU_WAIT32.
+// this may have side effects, so don't call it more than necessary.
 template<int PROCNUM, MMU_ACCESS_TYPE AT, int READSIZE, MMU_ACCESS_DIRECTION DIRECTION, bool TIMING>
 FORCEINLINE u32 _MMU_accesstime(u32 addr, bool sequential)
 {
-	static const int MC = 1; /* cached or tcm memory speed */
-	static const int M32 = (PROCNUM==ARMCPU_ARM9) ? 2 : 1; /* access through 32-bit bus */
-	static const int M16 = M32 * ((READSIZE>16) ? 2 : 1); /* access through 16-bit bus */
-	static const int MSLW = M16 * 8; /* this needs tuning */
+	static const int MC = 1; // cached or tcm memory speed
+	static const int M32 = (PROCNUM==ARMCPU_ARM9) ? 2 : 1; // access through 32-bit bus
+	static const int M16 = M32 * ((READSIZE>16) ? 2 : 1); // access through 16-bit bus
+	static const int MSLW = M16 * 8; // this needs tuning

 	if(PROCNUM==ARMCPU_ARM9 && AT == MMU_AT_CODE && addr < 0x02000000)
 		return MC; // ITCM
@@ -295,7 +292,7 @@ FORCEINLINE u32 _MMU_accesstime(u32 addr, bool sequential)
 		return MC; // DTCM
 #endif

-	/* for now, assume the cache is always enabled for all of main memory */
+	// for now, assume the cache is always enabled for all of main memory
 	if(AT != MMU_AT_DMA && TIMING && PROCNUM==ARMCPU_ARM9 && (addr & 0x0F000000) == 0x02000000)
 	{
 #ifdef ENABLE_CACHE_CONTROLLER_EMULATION
@@ -308,15 +305,15 @@ FORCEINLINE u32 _MMU_accesstime(u32 addr, bool sequential)
 			return MC;

 		u32 c;
 		if(sequential && AT==MMU_AT_DATA)
-			c = M16; /* bonus for sequential data access */
+			c = M16; // bonus for sequential data access
 		else if(DIRECTION == MMU_AD_READ)
 			c = M16 * 5;
 		else
-			c = M16 * 2; /* should be 4, but write buffer isn't emulated yet. */
+			c = M16 * 2; // should be 4, but write buffer isn't emulated yet.

 		if(DIRECTION == MMU_AD_READ)
 		{
-			/* cache miss while reading means it has to fill a whole cache line
-			 * by reading 32 bytes... */
+			// cache miss while reading means it has to fill a whole cache line
+			// by reading 32 bytes...
 			c += 8 * M32*2;
 		}
@@ -329,9 +326,9 @@ FORCEINLINE u32 _MMU_accesstime(u32 addr, bool sequential)
 		return c;
 #elif defined(ACCOUNT_FOR_NON_SEQUENTIAL_ACCESS)
-		/* this is the closest approximation I could find
-		 * to the with-cache-controller timing
-		 * that doesn't do any actual caching logic. */
+		// this is the closest approximation I could find
+		// to the with-cache-controller timing
+		// that doesn't do any actual caching logic.
 		return sequential ? MC : M16;
 #endif
 	}
@@ -350,22 +347,29 @@ FORCEINLINE u32 _MMU_accesstime(u32 addr, bool sequential)
 	if(TIMING && !sequential)
 	{
 		//if(c != MC || PROCNUM==ARMCPU_ARM7) // check not needed anymore because ITCM/DTCM return earlier
-		c += (PROCNUM==ARMCPU_ARM9) ? 3*2 : 1;
+		{
+			c += (PROCNUM==ARMCPU_ARM9) ? 3*2 : 1;
+		}
 	}
 #endif

 	return c;
 }

-/* calculates the cycle time of a single memory access in the MEM stage.
- * to be used to calculate the memCycles argument for MMU_aluMemCycles.
- * this may have side effects, so don't call it more than necessary. */
+// calculates the cycle time of a single memory access in the MEM stage.
+// to be used to calculate the memCycles argument for MMU_aluMemCycles.
+// this may have side effects, so don't call it more than necessary.
 template<int PROCNUM, int READSIZE, MMU_ACCESS_DIRECTION DIRECTION, bool TIMING>
 FORCEINLINE u32 MMU_memAccessCycles(u32 addr)
 {
 	if(TIMING)
 		return MMU_timing.armDataFetch<PROCNUM>().template Fetch<READSIZE,DIRECTION,true>((addr)&(~((READSIZE>>3)-1)));
-	return MMU_timing.armDataFetch<PROCNUM>().template Fetch<READSIZE,DIRECTION,false>((addr)&(~((READSIZE>>3)-1)));
+	else
+		return MMU_timing.armDataFetch<PROCNUM>().template Fetch<READSIZE,DIRECTION,false>((addr)&(~((READSIZE>>3)-1)));
 }

 template<int PROCNUM, int READSIZE, MMU_ACCESS_DIRECTION DIRECTION>
@@ -373,44 +377,48 @@ FORCEINLINE u32 MMU_memAccessCycles(u32 addr)
 {
 	if(USE_TIMING())
 		return MMU_memAccessCycles<PROCNUM,READSIZE,DIRECTION,true>(addr);
-	return MMU_memAccessCycles<PROCNUM,READSIZE,DIRECTION,false>(addr);
+	else
+		return MMU_memAccessCycles<PROCNUM,READSIZE,DIRECTION,false>(addr);
 }

-/* calculates the cycle time of a single code fetch in the FETCH stage
- * to be used to calculate the fetchCycles argument for MMU_fetchExecuteCycles.
- * this may have side effects, so don't call it more than necessary. */
+// calculates the cycle time of a single code fetch in the FETCH stage
+// to be used to calculate the fetchCycles argument for MMU_fetchExecuteCycles.
+// this may have side effects, so don't call it more than necessary.
 template<int PROCNUM, int READSIZE>
 FORCEINLINE u32 MMU_codeFetchCycles(u32 addr)
 {
 	if(USE_TIMING())
 		return MMU_timing.armCodeFetch<PROCNUM>().template Fetch<READSIZE,MMU_AD_READ,true>((addr)&(~((READSIZE>>3)-1)));
-	return MMU_timing.armCodeFetch<PROCNUM>().template Fetch<READSIZE,MMU_AD_READ,false>((addr)&(~((READSIZE>>3)-1)));
+	else
+		return MMU_timing.armCodeFetch<PROCNUM>().template Fetch<READSIZE,MMU_AD_READ,false>((addr)&(~((READSIZE>>3)-1)));
 }

-/* calculates the cycle contribution of ALU + MEM stages (= EXECUTE)
- * given ALU cycle time and the summation of multiple memory access cycle times.
- * this function might belong more in armcpu, but I don't think it matters. */
+// calculates the cycle contribution of ALU + MEM stages (= EXECUTE)
+// given ALU cycle time and the summation of multiple memory access cycle times.
+// this function might belong more in armcpu, but I don't think it matters.
 template<int PROCNUM>
 FORCEINLINE u32 MMU_aluMemCycles(u32 aluCycles, u32 memCycles)
 {
 	if(PROCNUM==ARMCPU_ARM9)
 	{
-		/* ALU and MEM are different stages of the 5-stage pipeline.
-		 * we approximate the pipeline throughput using max,
-		 * since simply adding the cycles of each instruction together
-		 * fails to take into account the parallelism of the arm pipeline
-		 * and would make the emulated system unnaturally slow. */
+		// ALU and MEM are different stages of the 5-stage pipeline.
+		// we approximate the pipeline throughput using max,
+		// since simply adding the cycles of each instruction together
+		// fails to take into account the parallelism of the arm pipeline
+		// and would make the emulated system unnaturally slow.
 		return std::max(aluCycles, memCycles);
 	}
-
-	/* ALU and MEM are part of the same stage of the 3-stage pipeline,
-	 * thus they occur in sequence and we can simply add the counts together. */
-	return aluCycles + memCycles;
+	else
+	{
+		// ALU and MEM are part of the same stage of the 3-stage pipeline,
+		// thus they occur in sequence and we can simply add the counts together.
+		return aluCycles + memCycles;
+	}
 }

-/* calculates the cycle contribution of ALU + MEM stages (= EXECUTE)
- * given ALU cycle time and the description of a single memory access.
- * this may have side effects, so don't call it more than necessary. */
+// calculates the cycle contribution of ALU + MEM stages (= EXECUTE)
+// given ALU cycle time and the description of a single memory access.
+// this may have side effects, so don't call it more than necessary.
 template<int PROCNUM, int READSIZE, MMU_ACCESS_DIRECTION DIRECTION>
 FORCEINLINE u32 MMU_aluMemAccessCycles(u32 aluCycles, u32 addr)
 {
@@ -421,28 +429,30 @@ FORCEINLINE u32 MMU_aluMemAccessCycles(u32 aluCycles, u32 addr)
 	return MMU_aluMemCycles<PROCNUM>(aluCycles, memCycles);
 }

-/* calculates the cycle contribution of FETCH + EXECUTE stages
- * given executeCycles = the combined ALU+MEM cycles
- * and fetchCycles = the cycle time of the FETCH stage
- * this function might belong more in armcpu, but I don't think it matters. */
+// calculates the cycle contribution of FETCH + EXECUTE stages
+// given executeCycles = the combined ALU+MEM cycles
+// and fetchCycles = the cycle time of the FETCH stage
+// this function might belong more in armcpu, but I don't think it matters.
 template<int PROCNUM>
 FORCEINLINE u32 MMU_fetchExecuteCycles(u32 executeCycles, u32 fetchCycles)
 {
 #ifdef ACCOUNT_FOR_CODE_FETCH_CYCLES
-	if(USE_TIMING())
-	{
-		/* execute and fetch are different stages of the pipeline for both arm7 and arm9.
-		 * again, we approximate the pipeline throughput using max. */
-		return std::max(executeCycles, fetchCycles);
-
-		/* TODO: add an option to support conflict between MEM and FETCH cycles
-		 * if they're both using the same data bus.
-		 * in the case of a conflict this should be:
-		 * return std::max(aluCycles, memCycles + fetchCycles);
-		 */
-	}
-#endif
-	return executeCycles;
+	const bool allow = true;
+#else
+	const bool allow = false;
+#endif
+	if(USE_TIMING() && allow)
+	{
+		// execute and fetch are different stages of the pipeline for both arm7 and arm9.
+		// again, we approximate the pipeline throughput using max.
+		return std::max(executeCycles, fetchCycles);
+
+		// TODO: add an option to support conflict between MEM and FETCH cycles
+		// if they're both using the same data bus.
+		// in the case of a conflict this should be:
+		// return std::max(aluCycles, memCycles + fetchCycles);
+	}
+	return executeCycles;
 }