diff --git a/desmume/src/GPU.cpp b/desmume/src/GPU.cpp index 52ca3fb6d..34481b7be 100644 --- a/desmume/src/GPU.cpp +++ b/desmume/src/GPU.cpp @@ -573,12 +573,14 @@ static void GPU_InitFadeColors() } } +static CACHE_ALIGN GPU GPU_main, GPU_sub; + GPU * GPU_Init(u8 l) { GPU * g; - if ((g = (GPU *) malloc(sizeof(GPU))) == NULL) - return NULL; + if(l==0) g = &GPU_main; + else g = &GPU_sub; GPU_Reset(g, l); GPU_InitFadeColors(); @@ -591,8 +593,6 @@ GPU * GPU_Init(u8 l) g->setFinalColor3d_funcNum = 0; g->setFinalColorSpr = _master_setFinalOBJColor; - - return g; } @@ -1133,9 +1133,8 @@ template FORCEINLINE void GPU::setFinalColorBG(u16 color, const u //This is probably the best place to enforce it, since almost every single color that comes in here //will be pulled from a palette that needs the top bit stripped off anyway. //assert((color&0x8000)==0); - color &= 0x7FFF; + if(!BACKDROP) color &= 0x7FFF; //but for the backdrop we can easily guarantee earlier that theres no bit here - //if someone disagrees with these, they could be reimplemented as a function pointer easily bool draw=true; switch(setFinalColorBck_funcNum) { @@ -1149,10 +1148,10 @@ template FORCEINLINE void GPU::setFinalColorBG(u16 color, const u case 0x7: draw=setFinalBGColorSpecialDecreaseWnd(color,x); break; }; - if(draw) + if(BACKDROP || draw) //backdrop must always be drawn { T2WriteWord(currDst, x<<1, color | 0x8000); - bgPixels[x] = currBgNum; + if(!BACKDROP) bgPixels[x] = currBgNum; //lets do this in the backdrop drawing loop, should be faster } } @@ -2359,7 +2358,6 @@ void GPU_set_DISPCAPCNT(u32 val) } // #define BRIGHT_TABLES - static void GPU_ligne_layer(NDS_Screen * screen, u16 l) { GPU * gpu = screen->gpu; @@ -2381,6 +2379,7 @@ static void GPU_ligne_layer(NDS_Screen * screen, u16 l) for(int x=0;x<256;x++) { gpu->__setFinalColorBck(backdrop_color,x,1); } + memset(gpu->bgPixels,5,256); //this check isnt really helpful. it just slows us down in the cases where we need the most speed //if (!gpu->LayersEnable[0] && !gpu->LayersEnable[1] && !gpu->LayersEnable[2] && !gpu->LayersEnable[3] && !gpu->LayersEnable[4]) return; @@ -2395,7 +2394,7 @@ static void GPU_ligne_layer(NDS_Screen * screen, u16 l) for (int i=0; iitemsForPriority[i].nbPixelsX = 0; } - + // for all the pixels in the line if (gpu->LayersEnable[4]) { diff --git a/desmume/src/GPU.h b/desmume/src/GPU.h index c2bd7830b..f21a0ea88 100644 --- a/desmume/src/GPU.h +++ b/desmume/src/GPU.h @@ -726,7 +726,7 @@ struct GPU u8 MasterBrightMode; u32 MasterBrightFactor; - u8 bgPixels[1024]; //yes indeed, this is oversized. map debug tools try to write to it + CACHE_ALIGN u8 bgPixels[1024]; //yes indeed, this is oversized. map debug tools try to write to it u32 currLine; u8 currBgNum; diff --git a/desmume/src/MMU.h b/desmume/src/MMU.h index 8a3e331fd..1a17373dd 100644 --- a/desmume/src/MMU.h +++ b/desmume/src/MMU.h @@ -495,22 +495,22 @@ FORCEINLINE void _MMU_write32(const int PROCNUM, const MMU_ACCESS_TYPE AT, const #endif template -u8 _MMU_read08(u32 addr) { return _MMU_read08(PROCNUM, AT, addr); } +FORCEINLINE u8 _MMU_read08(u32 addr) { return _MMU_read08(PROCNUM, AT, addr); } template -u16 _MMU_read16(u32 addr) { return _MMU_read16(PROCNUM, AT, addr); } +FORCEINLINE u16 _MMU_read16(u32 addr) { return _MMU_read16(PROCNUM, AT, addr); } template -u32 _MMU_read32(u32 addr) { return _MMU_read32(PROCNUM, AT, addr); } +FORCEINLINE u32 _MMU_read32(u32 addr) { return _MMU_read32(PROCNUM, AT, addr); } template -void _MMU_write08(u32 addr, u8 val) { _MMU_write08(PROCNUM, AT, addr, val); } +FORCEINLINE void _MMU_write08(u32 addr, u8 val) { _MMU_write08(PROCNUM, AT, addr, val); } template -void _MMU_write16(u32 addr, u16 val) { _MMU_write16(PROCNUM, AT, addr, val); } +FORCEINLINE void _MMU_write16(u32 addr, u16 val) { _MMU_write16(PROCNUM, AT, addr, val); } template -void _MMU_write32(u32 addr, u32 val) { _MMU_write32(PROCNUM, AT, addr, val); } +FORCEINLINE void _MMU_write32(u32 addr, u32 val) { _MMU_write32(PROCNUM, AT, addr, val); } void FASTCALL MMU_DumpMemBlock(u8 proc, u32 address, u32 size, u8 *buffer); diff --git a/desmume/src/armcpu.cpp b/desmume/src/armcpu.cpp index f8cf8bb46..c2aeba1bb 100644 --- a/desmume/src/armcpu.cpp +++ b/desmume/src/armcpu.cpp @@ -33,7 +33,7 @@ template static u32 armcpu_prefetch(); -inline u32 armcpu_prefetch(armcpu_t *armcpu) { +FORCEINLINE u32 armcpu_prefetch(armcpu_t *armcpu) { if(armcpu->proc_ID==0) return armcpu_prefetch<0>(); else return armcpu_prefetch<1>(); } @@ -363,8 +363,7 @@ u32 armcpu_switchMode(armcpu_t *armcpu, u8 mode) } template -static u32 -armcpu_prefetch() +FORCEINLINE static u32 armcpu_prefetch() { armcpu_t* const armcpu = &ARMPROC; #ifdef GDB_STUB @@ -521,7 +520,10 @@ u32 armcpu_exec() if(ARMPROC.CPSR.bits.T == 0) { - if((TEST_COND(CONDITION(ARMPROC.instruction), CODE(ARMPROC.instruction), ARMPROC.CPSR))) + if( + CONDITION(ARMPROC.instruction) == 0x0E //fast path for unconditional instructions + || (TEST_COND(CONDITION(ARMPROC.instruction), CODE(ARMPROC.instruction), ARMPROC.CPSR)) //handles any condition + ) { if(PROCNUM==0) { #ifdef WANTASMLISTING