Revert "(MMU_timings.h) Style nits/cleanups"

This reverts commit 04c8b0915e85da85a8f08a7b2ad6814d95b6618f.
This commit is contained in:
zeromus 2016-08-16 14:16:59 -05:00
parent b554b05b00
commit 602da31591
1 changed files with 127 additions and 117 deletions

View File

@ -29,37 +29,39 @@
#include "debug.h"
#include "NDSSystem.h"
/*
* MEMORY TIMING ACCURACY CONFIGURATION
*
* the more of these are enabled,
* the more accurate memory access timing _should_ become.
* they should be listed roughly in order of most to least important.
* it's reasonable to disable some of these as a speed hack.
* obviously, these defines don't cover all the variables or features needed,
* and in particular, DMA or code+data access bus contention is still missing. */
////////////////////////////////////////////////////////////////
// MEMORY TIMING ACCURACY CONFIGURATION
//
// the more of these are enabled,
// the more accurate memory access timing _should_ become.
// they should be listed roughly in order of most to least important.
// it's reasonable to disable some of these as a speed hack.
// obviously, these defines don't cover all the variables or features needed,
// and in particular, DMA or code+data access bus contention is still missing.
/* disable this to prevent the advanced timing logic from ever running at all */
//disable this to prevent the advanced timing logic from ever running at all
#define ENABLE_ADVANCED_TIMING
#ifdef ENABLE_ADVANCED_TIMING
/* makes non-sequential accesses slower than sequential ones. */
// makes non-sequential accesses slower than sequential ones.
#define ACCOUNT_FOR_NON_SEQUENTIAL_ACCESS
/* (SOMETIMES THIS IS A BIG SPEED HIT!) */
//(SOMETIMES THIS IS A BIG SPEED HIT!)
/* enables emulation of code fetch waits. */
// enables emulation of code fetch waits.
#define ACCOUNT_FOR_CODE_FETCH_CYCLES
/* makes access to DTCM (arm9 only) fast. */
// makes access to DTCM (arm9 only) fast.
#define ACCOUNT_FOR_DATA_TCM_SPEED
/* enables simulation of cache hits and cache misses. */
// enables simulation of cache hits and cache misses.
#define ENABLE_CACHE_CONTROLLER_EMULATION
#endif /* ENABLE_ADVANCED_TIMING */
#endif //ENABLE_ADVANCED_TIMING
FORCEINLINE bool USE_TIMING(void)
{
//
////////////////////////////////////////////////////////////////
FORCEINLINE bool USE_TIMING() {
#ifdef ENABLE_ADVANCED_TIMING
return CommonSettings.advanced_timing;
#else
@ -67,16 +69,16 @@ FORCEINLINE bool USE_TIMING(void)
#endif
}
enum MMU_ACCESS_DIRECTION
{
MMU_AD_READ = 0,
MMU_AD_WRITE
MMU_AD_READ, MMU_AD_WRITE
};
/* note that we don't actually emulate the cache contents here,
* only enough to guess what would be a cache hit or a cache miss.
* this doesn't really get used unless ENABLE_CACHE_CONTROLLER_EMULATION is defined. */
// note that we don't actually emulate the cache contents here,
// only enough to guess what would be a cache hit or a cache miss.
// this doesn't really get used unless ENABLE_CACHE_CONTROLLER_EMULATION is defined.
template<int SIZESHIFT, int ASSOCIATIVESHIFT, int BLOCKSIZESHIFT>
class CacheController
{
@ -87,13 +89,13 @@ public:
u32 blockMasked = addr & BLOCKMASK;
if(blockMasked == m_cacheCache)
return true;
else
return this->CachedInternal<DIR>(addr, blockMasked);
}
void Reset()
{
unsigned blockIndex;
for(blockIndex = 0; blockIndex < NUMBLOCKS; blockIndex++)
for(int blockIndex = 0; blockIndex < NUMBLOCKS; blockIndex++)
m_blocks[blockIndex].Reset();
m_cacheCache = ~0;
}
@ -104,24 +106,20 @@ public:
void savestate(EMUFILE* os, int version)
{
unsigned i;
write32le(m_cacheCache, os);
for(i = 0; i < NUMBLOCKS; i++)
for(int i = 0; i < NUMBLOCKS; i++)
{
unsigned j;
for(j = 0; j < ASSOCIATIVITY; j++)
for(int j = 0; j < ASSOCIATIVITY; j++)
write32le(m_blocks[i].tag[j],os);
write32le(m_blocks[i].nextWay,os);
}
}
bool loadstate(EMUFILE* is, int version)
{
unsigned i;
read32le(&m_cacheCache, is);
for(i = 0; i < NUMBLOCKS; i++)
for(int i = 0; i < NUMBLOCKS; i++)
{
unsigned j;
for(j = 0; j < ASSOCIATIVITY; j++)
for(int j = 0; j < ASSOCIATIVITY; j++)
read32le(&m_blocks[i].tag[j],is);
read32le(&m_blocks[i].nextWay,is);
}
@ -132,24 +130,20 @@ private:
template<MMU_ACCESS_DIRECTION DIR>
bool CachedInternal(u32 addr, u32 blockMasked)
{
unsigned way;
u32 blockIndex = blockMasked >> BLOCKSIZESHIFT;
CacheBlock& block = m_blocks[blockIndex];
addr &= TAGMASK;
for(way = 0; way < ASSOCIATIVITY; way++)
for(int way = 0; way < ASSOCIATIVITY; way++)
if(addr == block.tag[way])
{
if(addr != block.tag[way])
continue;
/* found it, already allocated */
// found it, already allocated
m_cacheCache = blockMasked;
return true;
}
if(DIR == MMU_AD_READ)
{
/* TODO: support other allocation orders? */
// TODO: support other allocation orders?
block.tag[block.nextWay++] = addr;
block.nextWay %= ASSOCIATIVITY;
m_cacheCache = blockMasked;
@ -176,14 +170,13 @@ private:
void Reset()
{
unsigned way;
nextWay = 0;
for(way = 0; way < ASSOCIATIVITY; way++)
for(int way = 0; way < ASSOCIATIVITY; way++)
tag[way] = 0;
}
};
u32 m_cacheCache; /* optimization */
u32 m_cacheCache; // optimization
CacheBlock m_blocks [NUMBLOCKS];
};
@ -207,7 +200,9 @@ public:
#endif
if(AT == MMU_AT_CODE && !prohibit)
{
return 1;
}
u32 time = _MMU_accesstime<PROCNUM, AT, READSIZE, DIRECTION,TIMING>(address,
#ifdef ACCOUNT_FOR_NON_SEQUENTIAL_ACCESS
@ -253,12 +248,12 @@ private:
struct MMU_struct_timing
{
/* technically part of the cp15, but I didn't want the dereferencing penalty.
* these template values correspond with the value of armcp15->cacheType. */
CacheController<13,2,5> arm9codeCache; /* 8192 bytes, 4-way associative, 32-byte blocks */
CacheController<12,2,5> arm9dataCache; /* 4096 bytes, 4-way associative, 32-byte blocks */
// technically part of the cp15, but I didn't want the dereferencing penalty.
// these template values correspond with the value of armcp15->cacheType.
CacheController<13,2,5> arm9codeCache; // 8192 bytes, 4-way associative, 32-byte blocks
CacheController<12,2,5> arm9dataCache; // 4096 bytes, 4-way associative, 32-byte blocks
/* technically part of armcpu_t, but that struct isn't templated on PROCNUM */
// technically part of armcpu_t, but that struct isn't templated on PROCNUM
FetchAccessUnit<0,MMU_AT_CODE> arm9codeFetch;
FetchAccessUnit<0,MMU_AT_DATA> arm9dataFetch;
FetchAccessUnit<1,MMU_AT_CODE> arm7codeFetch;
@ -275,17 +270,19 @@ template<> FORCEINLINE FetchAccessUnit<1,MMU_AT_DATA>& MMU_struct_timing::armDat
extern MMU_struct_timing MMU_timing;
/* calculates the time a single memory access takes,
* in units of cycles of the current processor.
* this function replaces what used to be MMU_WAIT16 and MMU_WAIT32.
* this may have side effects, so don't call it more than necessary. */
// calculates the time a single memory access takes,
// in units of cycles of the current processor.
// this function replaces what used to be MMU_WAIT16 and MMU_WAIT32.
// this may have side effects, so don't call it more than necessary.
template<int PROCNUM, MMU_ACCESS_TYPE AT, int READSIZE, MMU_ACCESS_DIRECTION DIRECTION, bool TIMING>
FORCEINLINE u32 _MMU_accesstime(u32 addr, bool sequential)
{
static const int MC = 1; /* cached or tcm memory speed */
static const int M32 = (PROCNUM==ARMCPU_ARM9) ? 2 : 1; /* access through 32-bit bus */
static const int M16 = M32 * ((READSIZE>16) ? 2 : 1); /* access through 16-bit bus */
static const int MSLW = M16 * 8; /* this needs tuning */
static const int MC = 1; // cached or tcm memory speed
static const int M32 = (PROCNUM==ARMCPU_ARM9) ? 2 : 1; // access through 32-bit bus
static const int M16 = M32 * ((READSIZE>16) ? 2 : 1); // access through 16-bit bus
static const int MSLW = M16 * 8; // this needs tuning
if(PROCNUM==ARMCPU_ARM9 && AT == MMU_AT_CODE && addr < 0x02000000)
return MC; // ITCM
@ -295,7 +292,7 @@ FORCEINLINE u32 _MMU_accesstime(u32 addr, bool sequential)
return MC; // DTCM
#endif
/* for now, assume the cache is always enabled for all of main memory */
// for now, assume the cache is always enabled for all of main memory
if(AT != MMU_AT_DMA && TIMING && PROCNUM==ARMCPU_ARM9 && (addr & 0x0F000000) == 0x02000000)
{
#ifdef ENABLE_CACHE_CONTROLLER_EMULATION
@ -308,15 +305,15 @@ FORCEINLINE u32 _MMU_accesstime(u32 addr, bool sequential)
return MC;
u32 c;
if(sequential && AT==MMU_AT_DATA)
c = M16; /* bonus for sequential data access */
c = M16; // bonus for sequential data access
else if(DIRECTION == MMU_AD_READ)
c = M16 * 5;
else
c = M16 * 2; /* should be 4, but write buffer isn't emulated yet. */
c = M16 * 2; // should be 4, but write buffer isn't emulated yet.
if(DIRECTION == MMU_AD_READ)
{
/* cache miss while reading means it has to fill a whole cache line
* by reading 32 bytes... */
// cache miss while reading means it has to fill a whole cache line
// by reading 32 bytes...
c += 8 * M32*2;
}
@ -329,9 +326,9 @@ FORCEINLINE u32 _MMU_accesstime(u32 addr, bool sequential)
return c;
#elif defined(ACCOUNT_FOR_NON_SEQUENTIAL_ACCESS)
/* this is the closest approximation I could find
* to the with-cache-controller timing
* that doesn't do any actual caching logic. */
// this is the closest approximation I could find
// to the with-cache-controller timing
// that doesn't do any actual caching logic.
return sequential ? MC : M16;
#endif
}
@ -350,21 +347,28 @@ FORCEINLINE u32 _MMU_accesstime(u32 addr, bool sequential)
if(TIMING && !sequential)
{
//if(c != MC || PROCNUM==ARMCPU_ARM7) // check not needed anymore because ITCM/DTCM return earlier
{
c += (PROCNUM==ARMCPU_ARM9) ? 3*2 : 1;
}
}
#endif
return c;
}
/* calculates the cycle time of a single memory access in the MEM stage.
* to be used to calculate the memCycles argument for MMU_aluMemCycles.
* this may have side effects, so don't call it more than necessary. */
// calculates the cycle time of a single memory access in the MEM stage.
// to be used to calculate the memCycles argument for MMU_aluMemCycles.
// this may have side effects, so don't call it more than necessary.
template<int PROCNUM, int READSIZE, MMU_ACCESS_DIRECTION DIRECTION, bool TIMING>
FORCEINLINE u32 MMU_memAccessCycles(u32 addr)
{
if(TIMING)
return MMU_timing.armDataFetch<PROCNUM>().template Fetch<READSIZE,DIRECTION,true>((addr)&(~((READSIZE>>3)-1)));
else
return MMU_timing.armDataFetch<PROCNUM>().template Fetch<READSIZE,DIRECTION,false>((addr)&(~((READSIZE>>3)-1)));
}
@ -373,44 +377,48 @@ FORCEINLINE u32 MMU_memAccessCycles(u32 addr)
{
if(USE_TIMING())
return MMU_memAccessCycles<PROCNUM,READSIZE,DIRECTION,true>(addr);
else
return MMU_memAccessCycles<PROCNUM,READSIZE,DIRECTION,false>(addr);
}
/* calculates the cycle time of a single code fetch in the FETCH stage
* to be used to calculate the fetchCycles argument for MMU_fetchExecuteCycles.
* this may have side effects, so don't call it more than necessary. */
// calculates the cycle time of a single code fetch in the FETCH stage
// to be used to calculate the fetchCycles argument for MMU_fetchExecuteCycles.
// this may have side effects, so don't call it more than necessary.
template<int PROCNUM, int READSIZE>
FORCEINLINE u32 MMU_codeFetchCycles(u32 addr)
{
if(USE_TIMING())
return MMU_timing.armCodeFetch<PROCNUM>().template Fetch<READSIZE,MMU_AD_READ,true>((addr)&(~((READSIZE>>3)-1)));
else
return MMU_timing.armCodeFetch<PROCNUM>().template Fetch<READSIZE,MMU_AD_READ,false>((addr)&(~((READSIZE>>3)-1)));
}
/* calculates the cycle contribution of ALU + MEM stages (= EXECUTE)
* given ALU cycle time and the summation of multiple memory access cycle times.
* this function might belong more in armcpu, but I don't think it matters. */
// calculates the cycle contribution of ALU + MEM stages (= EXECUTE)
// given ALU cycle time and the summation of multiple memory access cycle times.
// this function might belong more in armcpu, but I don't think it matters.
template<int PROCNUM>
FORCEINLINE u32 MMU_aluMemCycles(u32 aluCycles, u32 memCycles)
{
if(PROCNUM==ARMCPU_ARM9)
{
/* ALU and MEM are different stages of the 5-stage pipeline.
* we approximate the pipeline throughput using max,
* since simply adding the cycles of each instruction together
* fails to take into account the parallelism of the arm pipeline
* and would make the emulated system unnaturally slow. */
// ALU and MEM are different stages of the 5-stage pipeline.
// we approximate the pipeline throughput using max,
// since simply adding the cycles of each instruction together
// fails to take into account the parallelism of the arm pipeline
// and would make the emulated system unnaturally slow.
return std::max(aluCycles, memCycles);
}
/* ALU and MEM are part of the same stage of the 3-stage pipeline,
* thus they occur in sequence and we can simply add the counts together. */
else
{
// ALU and MEM are part of the same stage of the 3-stage pipeline,
// thus they occur in sequence and we can simply add the counts together.
return aluCycles + memCycles;
}
}
/* calculates the cycle contribution of ALU + MEM stages (= EXECUTE)
* given ALU cycle time and the description of a single memory access.
* this may have side effects, so don't call it more than necessary. */
// calculates the cycle contribution of ALU + MEM stages (= EXECUTE)
// given ALU cycle time and the description of a single memory access.
// this may have side effects, so don't call it more than necessary.
template<int PROCNUM, int READSIZE, MMU_ACCESS_DIRECTION DIRECTION>
FORCEINLINE u32 MMU_aluMemAccessCycles(u32 aluCycles, u32 addr)
{
@ -421,27 +429,29 @@ FORCEINLINE u32 MMU_aluMemAccessCycles(u32 aluCycles, u32 addr)
return MMU_aluMemCycles<PROCNUM>(aluCycles, memCycles);
}
/* calculates the cycle contribution of FETCH + EXECUTE stages
* given executeCycles = the combined ALU+MEM cycles
* and fetchCycles = the cycle time of the FETCH stage
* this function might belong more in armcpu, but I don't think it matters. */
// calculates the cycle contribution of FETCH + EXECUTE stages
// given executeCycles = the combined ALU+MEM cycles
// and fetchCycles = the cycle time of the FETCH stage
// this function might belong more in armcpu, but I don't think it matters.
template<int PROCNUM>
FORCEINLINE u32 MMU_fetchExecuteCycles(u32 executeCycles, u32 fetchCycles)
{
#ifdef ACCOUNT_FOR_CODE_FETCH_CYCLES
if(USE_TIMING())
{
/* execute and fetch are different stages of the pipeline for both arm7 and arm9.
* again, we approximate the pipeline throughput using max. */
return std::max(executeCycles, fetchCycles);
/* TODO: add an option to support conflict between MEM and FETCH cycles
* if they're both using the same data bus.
* in the case of a conflict this should be:
* return std::max(aluCycles, memCycles + fetchCycles);
*/
}
#endif
#ifdef ACCOUNT_FOR_CODE_FETCH_CYCLES
const bool allow = true;
#else
const bool allow = false;
#endif
if(USE_TIMING() && allow)
{
// execute and fetch are different stages of the pipeline for both arm7 and arm9.
// again, we approximate the pipeline throughput using max.
return std::max(executeCycles, fetchCycles);
// TODO: add an option to support conflict between MEM and FETCH cycles
// if they're both using the same data bus.
// in the case of a conflict this should be:
// return std::max(aluCycles, memCycles + fetchCycles);
}
return executeCycles;
}