a grab bag of optimizations good for about 4fps on my system
This commit is contained in:
parent
dbe2226498
commit
ecf5b68df6
|
@ -573,12 +573,14 @@ static void GPU_InitFadeColors()
|
|||
}
|
||||
}
|
||||
|
||||
static CACHE_ALIGN GPU GPU_main, GPU_sub;
|
||||
|
||||
GPU * GPU_Init(u8 l)
|
||||
{
|
||||
GPU * g;
|
||||
|
||||
if ((g = (GPU *) malloc(sizeof(GPU))) == NULL)
|
||||
return NULL;
|
||||
if(l==0) g = &GPU_main;
|
||||
else g = &GPU_sub;
|
||||
|
||||
GPU_Reset(g, l);
|
||||
GPU_InitFadeColors();
|
||||
|
@ -591,8 +593,6 @@ GPU * GPU_Init(u8 l)
|
|||
g->setFinalColor3d_funcNum = 0;
|
||||
g->setFinalColorSpr = _master_setFinalOBJColor<None,false>;
|
||||
|
||||
|
||||
|
||||
return g;
|
||||
}
|
||||
|
||||
|
@ -1133,9 +1133,8 @@ template<bool BACKDROP> FORCEINLINE void GPU::setFinalColorBG(u16 color, const u
|
|||
//This is probably the best place to enforce it, since almost every single color that comes in here
|
||||
//will be pulled from a palette that needs the top bit stripped off anyway.
|
||||
//assert((color&0x8000)==0);
|
||||
color &= 0x7FFF;
|
||||
if(!BACKDROP) color &= 0x7FFF; //but for the backdrop we can easily guarantee earlier that theres no bit here
|
||||
|
||||
//if someone disagrees with these, they could be reimplemented as a function pointer easily
|
||||
bool draw=true;
|
||||
switch(setFinalColorBck_funcNum)
|
||||
{
|
||||
|
@ -1149,10 +1148,10 @@ template<bool BACKDROP> FORCEINLINE void GPU::setFinalColorBG(u16 color, const u
|
|||
case 0x7: draw=setFinalBGColorSpecialDecreaseWnd<BACKDROP>(color,x); break;
|
||||
};
|
||||
|
||||
if(draw)
|
||||
if(BACKDROP || draw) //backdrop must always be drawn
|
||||
{
|
||||
T2WriteWord(currDst, x<<1, color | 0x8000);
|
||||
bgPixels[x] = currBgNum;
|
||||
if(!BACKDROP) bgPixels[x] = currBgNum; //lets do this in the backdrop drawing loop, should be faster
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -2359,7 +2358,6 @@ void GPU_set_DISPCAPCNT(u32 val)
|
|||
}
|
||||
// #define BRIGHT_TABLES
|
||||
|
||||
|
||||
static void GPU_ligne_layer(NDS_Screen * screen, u16 l)
|
||||
{
|
||||
GPU * gpu = screen->gpu;
|
||||
|
@ -2381,6 +2379,7 @@ static void GPU_ligne_layer(NDS_Screen * screen, u16 l)
|
|||
for(int x=0;x<256;x++) {
|
||||
gpu->__setFinalColorBck<false,true>(backdrop_color,x,1);
|
||||
}
|
||||
memset(gpu->bgPixels,5,256);
|
||||
|
||||
//this check isn't really helpful. it just slows us down in the cases where we need the most speed
|
||||
//if (!gpu->LayersEnable[0] && !gpu->LayersEnable[1] && !gpu->LayersEnable[2] && !gpu->LayersEnable[3] && !gpu->LayersEnable[4]) return;
|
||||
|
|
|
@ -726,7 +726,7 @@ struct GPU
|
|||
u8 MasterBrightMode;
|
||||
u32 MasterBrightFactor;
|
||||
|
||||
u8 bgPixels[1024]; //yes indeed, this is oversized. map debug tools try to write to it
|
||||
CACHE_ALIGN u8 bgPixels[1024]; //yes indeed, this is oversized. map debug tools try to write to it
|
||||
|
||||
u32 currLine;
|
||||
u8 currBgNum;
|
||||
|
|
|
@ -495,22 +495,22 @@ FORCEINLINE void _MMU_write32(const int PROCNUM, const MMU_ACCESS_TYPE AT, const
|
|||
#endif
|
||||
|
||||
template<int PROCNUM, MMU_ACCESS_TYPE AT>
|
||||
u8 _MMU_read08(u32 addr) { return _MMU_read08(PROCNUM, AT, addr); }
|
||||
FORCEINLINE u8 _MMU_read08(u32 addr) { return _MMU_read08(PROCNUM, AT, addr); }
|
||||
|
||||
template<int PROCNUM, MMU_ACCESS_TYPE AT>
|
||||
u16 _MMU_read16(u32 addr) { return _MMU_read16(PROCNUM, AT, addr); }
|
||||
FORCEINLINE u16 _MMU_read16(u32 addr) { return _MMU_read16(PROCNUM, AT, addr); }
|
||||
|
||||
template<int PROCNUM, MMU_ACCESS_TYPE AT>
|
||||
u32 _MMU_read32(u32 addr) { return _MMU_read32(PROCNUM, AT, addr); }
|
||||
FORCEINLINE u32 _MMU_read32(u32 addr) { return _MMU_read32(PROCNUM, AT, addr); }
|
||||
|
||||
template<int PROCNUM, MMU_ACCESS_TYPE AT>
|
||||
void _MMU_write08(u32 addr, u8 val) { _MMU_write08(PROCNUM, AT, addr, val); }
|
||||
FORCEINLINE void _MMU_write08(u32 addr, u8 val) { _MMU_write08(PROCNUM, AT, addr, val); }
|
||||
|
||||
template<int PROCNUM, MMU_ACCESS_TYPE AT>
|
||||
void _MMU_write16(u32 addr, u16 val) { _MMU_write16(PROCNUM, AT, addr, val); }
|
||||
FORCEINLINE void _MMU_write16(u32 addr, u16 val) { _MMU_write16(PROCNUM, AT, addr, val); }
|
||||
|
||||
template<int PROCNUM, MMU_ACCESS_TYPE AT>
|
||||
void _MMU_write32(u32 addr, u32 val) { _MMU_write32(PROCNUM, AT, addr, val); }
|
||||
FORCEINLINE void _MMU_write32(u32 addr, u32 val) { _MMU_write32(PROCNUM, AT, addr, val); }
|
||||
|
||||
void FASTCALL MMU_DumpMemBlock(u8 proc, u32 address, u32 size, u8 *buffer);
|
||||
|
||||
|
|
|
@ -33,7 +33,7 @@
|
|||
|
||||
template<u32> static u32 armcpu_prefetch();
|
||||
|
||||
inline u32 armcpu_prefetch(armcpu_t *armcpu) {
|
||||
FORCEINLINE u32 armcpu_prefetch(armcpu_t *armcpu) {
|
||||
if(armcpu->proc_ID==0) return armcpu_prefetch<0>();
|
||||
else return armcpu_prefetch<1>();
|
||||
}
|
||||
|
@ -363,8 +363,7 @@ u32 armcpu_switchMode(armcpu_t *armcpu, u8 mode)
|
|||
}
|
||||
|
||||
template<u32 PROCNUM>
|
||||
static u32
|
||||
armcpu_prefetch()
|
||||
FORCEINLINE static u32 armcpu_prefetch()
|
||||
{
|
||||
armcpu_t* const armcpu = &ARMPROC;
|
||||
#ifdef GDB_STUB
|
||||
|
@ -521,7 +520,10 @@ u32 armcpu_exec()
|
|||
|
||||
if(ARMPROC.CPSR.bits.T == 0)
|
||||
{
|
||||
if((TEST_COND(CONDITION(ARMPROC.instruction), CODE(ARMPROC.instruction), ARMPROC.CPSR)))
|
||||
if(
|
||||
CONDITION(ARMPROC.instruction) == 0x0E //fast path for unconditional instructions
|
||||
|| (TEST_COND(CONDITION(ARMPROC.instruction), CODE(ARMPROC.instruction), ARMPROC.CPSR)) //handles any condition
|
||||
)
|
||||
{
|
||||
if(PROCNUM==0) {
|
||||
#ifdef WANTASMLISTING
|
||||
|
|
Loading…
Reference in New Issue