(MMU_timings.h) Style nits/cleanups
This commit is contained in:
parent
a5ab52c3fb
commit
09879832b3
|
@ -29,39 +29,37 @@
|
|||
#include "debug.h"
|
||||
#include "NDSSystem.h"
|
||||
|
||||
////////////////////////////////////////////////////////////////
|
||||
// MEMORY TIMING ACCURACY CONFIGURATION
|
||||
//
|
||||
// the more of these are enabled,
|
||||
// the more accurate memory access timing _should_ become.
|
||||
// they should be listed roughly in order of most to least important.
|
||||
// it's reasonable to disable some of these as a speed hack.
|
||||
// obviously, these defines don't cover all the variables or features needed,
|
||||
// and in particular, DMA or code+data access bus contention is still missing.
|
||||
/*
|
||||
* MEMORY TIMING ACCURACY CONFIGURATION
|
||||
*
|
||||
* the more of these are enabled,
|
||||
* the more accurate memory access timing _should_ become.
|
||||
* they should be listed roughly in order of most to least important.
|
||||
* it's reasonable to disable some of these as a speed hack.
|
||||
* obviously, these defines don't cover all the variables or features needed,
|
||||
* and in particular, DMA or code+data access bus contention is still missing. */
|
||||
|
||||
//disable this to prevent the advanced timing logic from ever running at all
|
||||
/* disable this to prevent the advanced timing logic from ever running at all */
|
||||
#define ENABLE_ADVANCED_TIMING
|
||||
|
||||
#ifdef ENABLE_ADVANCED_TIMING
|
||||
// makes non-sequential accesses slower than sequential ones.
|
||||
/* makes non-sequential accesses slower than sequential ones. */
|
||||
#define ACCOUNT_FOR_NON_SEQUENTIAL_ACCESS
|
||||
//(SOMETIMES THIS IS A BIG SPEED HIT!)
|
||||
/* (SOMETIMES THIS IS A BIG SPEED HIT!) */
|
||||
|
||||
// enables emulation of code fetch waits.
|
||||
/* enables emulation of code fetch waits. */
|
||||
#define ACCOUNT_FOR_CODE_FETCH_CYCLES
|
||||
|
||||
// makes access to DTCM (arm9 only) fast.
|
||||
/* makes access to DTCM (arm9 only) fast. */
|
||||
#define ACCOUNT_FOR_DATA_TCM_SPEED
|
||||
|
||||
// enables simulation of cache hits and cache misses.
|
||||
/* enables simulation of cache hits and cache misses. */
|
||||
#define ENABLE_CACHE_CONTROLLER_EMULATION
|
||||
|
||||
#endif //ENABLE_ADVANCED_TIMING
|
||||
#endif /* ENABLE_ADVANCED_TIMING */
|
||||
|
||||
//
|
||||
////////////////////////////////////////////////////////////////
|
||||
|
||||
FORCEINLINE bool USE_TIMING() {
|
||||
FORCEINLINE bool USE_TIMING(void)
|
||||
{
|
||||
#ifdef ENABLE_ADVANCED_TIMING
|
||||
return CommonSettings.advanced_timing;
|
||||
#else
|
||||
|
@ -69,16 +67,16 @@ FORCEINLINE bool USE_TIMING() {
|
|||
#endif
|
||||
}
|
||||
|
||||
|
||||
enum MMU_ACCESS_DIRECTION
|
||||
{
|
||||
MMU_AD_READ, MMU_AD_WRITE
|
||||
MMU_AD_READ = 0,
|
||||
MMU_AD_WRITE
|
||||
};
|
||||
|
||||
|
||||
// note that we don't actually emulate the cache contents here,
|
||||
// only enough to guess what would be a cache hit or a cache miss.
|
||||
// this doesn't really get used unless ENABLE_CACHE_CONTROLLER_EMULATION is defined.
|
||||
/* note that we don't actually emulate the cache contents here,
|
||||
* only enough to guess what would be a cache hit or a cache miss.
|
||||
* this doesn't really get used unless ENABLE_CACHE_CONTROLLER_EMULATION is defined. */
|
||||
template<int SIZESHIFT, int ASSOCIATIVESHIFT, int BLOCKSIZESHIFT>
|
||||
class CacheController
|
||||
{
|
||||
|
@ -89,13 +87,13 @@ public:
|
|||
u32 blockMasked = addr & BLOCKMASK;
|
||||
if(blockMasked == m_cacheCache)
|
||||
return true;
|
||||
else
|
||||
return this->CachedInternal<DIR>(addr, blockMasked);
|
||||
return this->CachedInternal<DIR>(addr, blockMasked);
|
||||
}
|
||||
|
||||
void Reset()
|
||||
{
|
||||
for(int blockIndex = 0; blockIndex < NUMBLOCKS; blockIndex++)
|
||||
unsigned blockIndex;
|
||||
for(blockIndex = 0; blockIndex < NUMBLOCKS; blockIndex++)
|
||||
m_blocks[blockIndex].Reset();
|
||||
m_cacheCache = ~0;
|
||||
}
|
||||
|
@ -106,20 +104,24 @@ public:
|
|||
|
||||
void savestate(EMUFILE* os, int version)
|
||||
{
|
||||
unsigned i;
|
||||
write32le(m_cacheCache, os);
|
||||
for(int i = 0; i < NUMBLOCKS; i++)
|
||||
for(i = 0; i < NUMBLOCKS; i++)
|
||||
{
|
||||
for(int j = 0; j < ASSOCIATIVITY; j++)
|
||||
unsigned j;
|
||||
for(j = 0; j < ASSOCIATIVITY; j++)
|
||||
write32le(m_blocks[i].tag[j],os);
|
||||
write32le(m_blocks[i].nextWay,os);
|
||||
}
|
||||
}
|
||||
bool loadstate(EMUFILE* is, int version)
|
||||
{
|
||||
unsigned i;
|
||||
read32le(&m_cacheCache, is);
|
||||
for(int i = 0; i < NUMBLOCKS; i++)
|
||||
for(i = 0; i < NUMBLOCKS; i++)
|
||||
{
|
||||
for(int j = 0; j < ASSOCIATIVITY; j++)
|
||||
unsigned j;
|
||||
for(j = 0; j < ASSOCIATIVITY; j++)
|
||||
read32le(&m_blocks[i].tag[j],is);
|
||||
read32le(&m_blocks[i].nextWay,is);
|
||||
}
|
||||
|
@ -130,20 +132,24 @@ private:
|
|||
template<MMU_ACCESS_DIRECTION DIR>
|
||||
bool CachedInternal(u32 addr, u32 blockMasked)
|
||||
{
|
||||
unsigned way;
|
||||
u32 blockIndex = blockMasked >> BLOCKSIZESHIFT;
|
||||
CacheBlock& block = m_blocks[blockIndex];
|
||||
addr &= TAGMASK;
|
||||
|
||||
for(int way = 0; way < ASSOCIATIVITY; way++)
|
||||
if(addr == block.tag[way])
|
||||
{
|
||||
// found it, already allocated
|
||||
m_cacheCache = blockMasked;
|
||||
return true;
|
||||
}
|
||||
for(way = 0; way < ASSOCIATIVITY; way++)
|
||||
{
|
||||
if(addr != block.tag[way])
|
||||
continue;
|
||||
|
||||
/* found it, already allocated */
|
||||
m_cacheCache = blockMasked;
|
||||
return true;
|
||||
}
|
||||
|
||||
if(DIR == MMU_AD_READ)
|
||||
{
|
||||
// TODO: support other allocation orders?
|
||||
/* TODO: support other allocation orders? */
|
||||
block.tag[block.nextWay++] = addr;
|
||||
block.nextWay %= ASSOCIATIVITY;
|
||||
m_cacheCache = blockMasked;
|
||||
|
@ -170,13 +176,14 @@ private:
|
|||
|
||||
void Reset()
|
||||
{
|
||||
unsigned way;
|
||||
nextWay = 0;
|
||||
for(int way = 0; way < ASSOCIATIVITY; way++)
|
||||
for(way = 0; way < ASSOCIATIVITY; way++)
|
||||
tag[way] = 0;
|
||||
}
|
||||
};
|
||||
|
||||
u32 m_cacheCache; // optimization
|
||||
u32 m_cacheCache; /* optimization */
|
||||
|
||||
CacheBlock m_blocks [NUMBLOCKS];
|
||||
};
|
||||
|
@ -200,9 +207,7 @@ public:
|
|||
#endif
|
||||
|
||||
if(AT == MMU_AT_CODE && !prohibit)
|
||||
{
|
||||
return 1;
|
||||
}
|
||||
|
||||
u32 time = _MMU_accesstime<PROCNUM, AT, READSIZE, DIRECTION,TIMING>(address,
|
||||
#ifdef ACCOUNT_FOR_NON_SEQUENTIAL_ACCESS
|
||||
|
@ -248,12 +253,12 @@ private:
|
|||
|
||||
struct MMU_struct_timing
|
||||
{
|
||||
// technically part of the cp15, but I didn't want the dereferencing penalty.
|
||||
// these template values correspond with the value of armcp15->cacheType.
|
||||
CacheController<13,2,5> arm9codeCache; // 8192 bytes, 4-way associative, 32-byte blocks
|
||||
CacheController<12,2,5> arm9dataCache; // 4096 bytes, 4-way associative, 32-byte blocks
|
||||
/* technically part of the cp15, but I didn't want the dereferencing penalty.
|
||||
* these template values correspond with the value of armcp15->cacheType. */
|
||||
CacheController<13,2,5> arm9codeCache; /* 8192 bytes, 4-way associative, 32-byte blocks */
|
||||
CacheController<12,2,5> arm9dataCache; /* 4096 bytes, 4-way associative, 32-byte blocks */
|
||||
|
||||
// technically part of armcpu_t, but that struct isn't templated on PROCNUM
|
||||
/* technically part of armcpu_t, but that struct isn't templated on PROCNUM */
|
||||
FetchAccessUnit<0,MMU_AT_CODE> arm9codeFetch;
|
||||
FetchAccessUnit<0,MMU_AT_DATA> arm9dataFetch;
|
||||
FetchAccessUnit<1,MMU_AT_CODE> arm7codeFetch;
|
||||
|
@ -270,19 +275,17 @@ template<> FORCEINLINE FetchAccessUnit<1,MMU_AT_DATA>& MMU_struct_timing::armDat
|
|||
|
||||
extern MMU_struct_timing MMU_timing;
|
||||
|
||||
|
||||
|
||||
// calculates the time a single memory access takes,
|
||||
// in units of cycles of the current processor.
|
||||
// this function replaces what used to be MMU_WAIT16 and MMU_WAIT32.
|
||||
// this may have side effects, so don't call it more than necessary.
|
||||
/* calculates the time a single memory access takes,
|
||||
* in units of cycles of the current processor.
|
||||
* this function replaces what used to be MMU_WAIT16 and MMU_WAIT32.
|
||||
* this may have side effects, so don't call it more than necessary. */
|
||||
template<int PROCNUM, MMU_ACCESS_TYPE AT, int READSIZE, MMU_ACCESS_DIRECTION DIRECTION, bool TIMING>
|
||||
FORCEINLINE u32 _MMU_accesstime(u32 addr, bool sequential)
|
||||
{
|
||||
static const int MC = 1; // cached or tcm memory speed
|
||||
static const int M32 = (PROCNUM==ARMCPU_ARM9) ? 2 : 1; // access through 32-bit bus
|
||||
static const int M16 = M32 * ((READSIZE>16) ? 2 : 1); // access through 16-bit bus
|
||||
static const int MSLW = M16 * 8; // this needs tuning
|
||||
static const int MC = 1; /* cached or tcm memory speed */
|
||||
static const int M32 = (PROCNUM==ARMCPU_ARM9) ? 2 : 1; /* access through 32-bit bus */
|
||||
static const int M16 = M32 * ((READSIZE>16) ? 2 : 1); /* access through 16-bit bus */
|
||||
static const int MSLW = M16 * 8; /* this needs tuning */
|
||||
|
||||
if(PROCNUM==ARMCPU_ARM9 && AT == MMU_AT_CODE && addr < 0x02000000)
|
||||
return MC; // ITCM
|
||||
|
@ -292,7 +295,7 @@ FORCEINLINE u32 _MMU_accesstime(u32 addr, bool sequential)
|
|||
return MC; // DTCM
|
||||
#endif
|
||||
|
||||
// for now, assume the cache is always enabled for all of main memory
|
||||
/* for now, assume the cache is always enabled for all of main memory */
|
||||
if(AT != MMU_AT_DMA && TIMING && PROCNUM==ARMCPU_ARM9 && (addr & 0x0F000000) == 0x02000000)
|
||||
{
|
||||
#ifdef ENABLE_CACHE_CONTROLLER_EMULATION
|
||||
|
@ -305,15 +308,15 @@ FORCEINLINE u32 _MMU_accesstime(u32 addr, bool sequential)
|
|||
return MC;
|
||||
u32 c;
|
||||
if(sequential && AT==MMU_AT_DATA)
|
||||
c = M16; // bonus for sequential data access
|
||||
c = M16; /* bonus for sequential data access */
|
||||
else if(DIRECTION == MMU_AD_READ)
|
||||
c = M16 * 5;
|
||||
else
|
||||
c = M16 * 2; // should be 4, but write buffer isn't emulated yet.
|
||||
c = M16 * 2; /* should be 4, but write buffer isn't emulated yet. */
|
||||
if(DIRECTION == MMU_AD_READ)
|
||||
{
|
||||
// cache miss while reading means it has to fill a whole cache line
|
||||
// by reading 32 bytes...
|
||||
/* cache miss while reading means it has to fill a whole cache line
|
||||
* by reading 32 bytes... */
|
||||
c += 8 * M32*2;
|
||||
}
|
||||
|
||||
|
@ -326,9 +329,9 @@ FORCEINLINE u32 _MMU_accesstime(u32 addr, bool sequential)
|
|||
|
||||
return c;
|
||||
#elif defined(ACCOUNT_FOR_NON_SEQUENTIAL_ACCESS)
|
||||
// this is the closest approximation I could find
|
||||
// to the with-cache-controller timing
|
||||
// that doesn't do any actual caching logic.
|
||||
/* this is the closest approximation I could find
|
||||
* to the with-cache-controller timing
|
||||
* that doesn't do any actual caching logic. */
|
||||
return sequential ? MC : M16;
|
||||
#endif
|
||||
}
|
||||
|
@ -347,29 +350,22 @@ FORCEINLINE u32 _MMU_accesstime(u32 addr, bool sequential)
|
|||
if(TIMING && !sequential)
|
||||
{
|
||||
//if(c != MC || PROCNUM==ARMCPU_ARM7) // check not needed anymore because ITCM/DTCM return earlier
|
||||
{
|
||||
c += (PROCNUM==ARMCPU_ARM9) ? 3*2 : 1;
|
||||
}
|
||||
c += (PROCNUM==ARMCPU_ARM9) ? 3*2 : 1;
|
||||
}
|
||||
#endif
|
||||
|
||||
return c;
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
// calculates the cycle time of a single memory access in the MEM stage.
|
||||
// to be used to calculate the memCycles argument for MMU_aluMemCycles.
|
||||
// this may have side effects, so don't call it more than necessary.
|
||||
/* calculates the cycle time of a single memory access in the MEM stage.
|
||||
* to be used to calculate the memCycles argument for MMU_aluMemCycles.
|
||||
* this may have side effects, so don't call it more than necessary. */
|
||||
template<int PROCNUM, int READSIZE, MMU_ACCESS_DIRECTION DIRECTION, bool TIMING>
|
||||
FORCEINLINE u32 MMU_memAccessCycles(u32 addr)
|
||||
{
|
||||
if(TIMING)
|
||||
return MMU_timing.armDataFetch<PROCNUM>().template Fetch<READSIZE,DIRECTION,true>((addr)&(~((READSIZE>>3)-1)));
|
||||
else
|
||||
return MMU_timing.armDataFetch<PROCNUM>().template Fetch<READSIZE,DIRECTION,false>((addr)&(~((READSIZE>>3)-1)));
|
||||
return MMU_timing.armDataFetch<PROCNUM>().template Fetch<READSIZE,DIRECTION,false>((addr)&(~((READSIZE>>3)-1)));
|
||||
}
|
||||
|
||||
template<int PROCNUM, int READSIZE, MMU_ACCESS_DIRECTION DIRECTION>
|
||||
|
@ -377,48 +373,44 @@ FORCEINLINE u32 MMU_memAccessCycles(u32 addr)
|
|||
{
|
||||
if(USE_TIMING())
|
||||
return MMU_memAccessCycles<PROCNUM,READSIZE,DIRECTION,true>(addr);
|
||||
else
|
||||
return MMU_memAccessCycles<PROCNUM,READSIZE,DIRECTION,false>(addr);
|
||||
return MMU_memAccessCycles<PROCNUM,READSIZE,DIRECTION,false>(addr);
|
||||
}
|
||||
|
||||
// calculates the cycle time of a single code fetch in the FETCH stage
|
||||
// to be used to calculate the fetchCycles argument for MMU_fetchExecuteCycles.
|
||||
// this may have side effects, so don't call it more than necessary.
|
||||
/* calculates the cycle time of a single code fetch in the FETCH stage
|
||||
* to be used to calculate the fetchCycles argument for MMU_fetchExecuteCycles.
|
||||
* this may have side effects, so don't call it more than necessary. */
|
||||
template<int PROCNUM, int READSIZE>
|
||||
FORCEINLINE u32 MMU_codeFetchCycles(u32 addr)
|
||||
{
|
||||
if(USE_TIMING())
|
||||
return MMU_timing.armCodeFetch<PROCNUM>().template Fetch<READSIZE,MMU_AD_READ,true>((addr)&(~((READSIZE>>3)-1)));
|
||||
else
|
||||
return MMU_timing.armCodeFetch<PROCNUM>().template Fetch<READSIZE,MMU_AD_READ,false>((addr)&(~((READSIZE>>3)-1)));
|
||||
return MMU_timing.armCodeFetch<PROCNUM>().template Fetch<READSIZE,MMU_AD_READ,false>((addr)&(~((READSIZE>>3)-1)));
|
||||
}
|
||||
|
||||
// calculates the cycle contribution of ALU + MEM stages (= EXECUTE)
|
||||
// given ALU cycle time and the summation of multiple memory access cycle times.
|
||||
// this function might belong more in armcpu, but I don't think it matters.
|
||||
/* calculates the cycle contribution of ALU + MEM stages (= EXECUTE)
|
||||
* given ALU cycle time and the summation of multiple memory access cycle times.
|
||||
* this function might belong more in armcpu, but I don't think it matters. */
|
||||
template<int PROCNUM>
|
||||
FORCEINLINE u32 MMU_aluMemCycles(u32 aluCycles, u32 memCycles)
|
||||
{
|
||||
if(PROCNUM==ARMCPU_ARM9)
|
||||
{
|
||||
// ALU and MEM are different stages of the 5-stage pipeline.
|
||||
// we approximate the pipeline throughput using max,
|
||||
// since simply adding the cycles of each instruction together
|
||||
// fails to take into account the parallelism of the arm pipeline
|
||||
// and would make the emulated system unnaturally slow.
|
||||
return std::max(aluCycles, memCycles);
|
||||
}
|
||||
else
|
||||
{
|
||||
// ALU and MEM are part of the same stage of the 3-stage pipeline,
|
||||
// thus they occur in sequence and we can simply add the counts together.
|
||||
return aluCycles + memCycles;
|
||||
}
|
||||
if(PROCNUM==ARMCPU_ARM9)
|
||||
{
|
||||
/* ALU and MEM are different stages of the 5-stage pipeline.
|
||||
* we approximate the pipeline throughput using max,
|
||||
* since simply adding the cycles of each instruction together
|
||||
* fails to take into account the parallelism of the arm pipeline
|
||||
* and would make the emulated system unnaturally slow. */
|
||||
return std::max(aluCycles, memCycles);
|
||||
}
|
||||
|
||||
/* ALU and MEM are part of the same stage of the 3-stage pipeline,
|
||||
* thus they occur in sequence and we can simply add the counts together. */
|
||||
return aluCycles + memCycles;
|
||||
}
|
||||
|
||||
// calculates the cycle contribution of ALU + MEM stages (= EXECUTE)
|
||||
// given ALU cycle time and the description of a single memory access.
|
||||
// this may have side effects, so don't call it more than necessary.
|
||||
/* calculates the cycle contribution of ALU + MEM stages (= EXECUTE)
|
||||
* given ALU cycle time and the description of a single memory access.
|
||||
* this may have side effects, so don't call it more than necessary. */
|
||||
template<int PROCNUM, int READSIZE, MMU_ACCESS_DIRECTION DIRECTION>
|
||||
FORCEINLINE u32 MMU_aluMemAccessCycles(u32 aluCycles, u32 addr)
|
||||
{
|
||||
|
@ -429,30 +421,28 @@ FORCEINLINE u32 MMU_aluMemAccessCycles(u32 aluCycles, u32 addr)
|
|||
return MMU_aluMemCycles<PROCNUM>(aluCycles, memCycles);
|
||||
}
|
||||
|
||||
// calculates the cycle contribution of FETCH + EXECUTE stages
|
||||
// given executeCycles = the combined ALU+MEM cycles
|
||||
// and fetchCycles = the cycle time of the FETCH stage
|
||||
// this function might belong more in armcpu, but I don't think it matters.
|
||||
/* calculates the cycle contribution of FETCH + EXECUTE stages
|
||||
* given executeCycles = the combined ALU+MEM cycles
|
||||
* and fetchCycles = the cycle time of the FETCH stage
|
||||
* this function might belong more in armcpu, but I don't think it matters. */
|
||||
template<int PROCNUM>
|
||||
FORCEINLINE u32 MMU_fetchExecuteCycles(u32 executeCycles, u32 fetchCycles)
|
||||
{
|
||||
#ifdef ACCOUNT_FOR_CODE_FETCH_CYCLES
|
||||
const bool allow = true;
|
||||
#else
|
||||
const bool allow = false;
|
||||
#endif
|
||||
#ifdef ACCOUNT_FOR_CODE_FETCH_CYCLES
|
||||
if(USE_TIMING())
|
||||
{
|
||||
/* execute and fetch are different stages of the pipeline for both arm7 and arm9.
|
||||
* again, we approximate the pipeline throughput using max. */
|
||||
return std::max(executeCycles, fetchCycles);
|
||||
/* TODO: add an option to support conflict between MEM and FETCH cycles
|
||||
* if they're both using the same data bus.
|
||||
* in the case of a conflict this should be:
|
||||
* return std::max(aluCycles, memCycles + fetchCycles);
|
||||
*/
|
||||
}
|
||||
#endif
|
||||
|
||||
if(USE_TIMING() && allow)
|
||||
{
|
||||
// execute and fetch are different stages of the pipeline for both arm7 and arm9.
|
||||
// again, we approximate the pipeline throughput using max.
|
||||
return std::max(executeCycles, fetchCycles);
|
||||
// TODO: add an option to support conflict between MEM and FETCH cycles
|
||||
// if they're both using the same data bus.
|
||||
// in the case of a conflict this should be:
|
||||
// return std::max(aluCycles, memCycles + fetchCycles);
|
||||
}
|
||||
return executeCycles;
|
||||
return executeCycles;
|
||||
}
|
||||
|
||||
|
||||
|
|
Loading…
Reference in New Issue