A grab bag of optimizations, worth roughly 4 fps of extra speed on my system

2009-07-17 08:33:35 +00:00 · 2009-07-17 08:33:35 +00:00 · ecf5b68df6
parent dbe2226498
commit ecf5b68df6
4 changed files with 22 additions and 21 deletions
--- a/desmume/src/GPU.cpp
+++ b/desmume/src/GPU.cpp
@@ -573,12 +573,14 @@ static void GPU_InitFadeColors()
 				}
 }

+static CACHE_ALIGN GPU GPU_main, GPU_sub;
+
 GPU * GPU_Init(u8 l)
 {
 	GPU * g;

-	if ((g = (GPU *) malloc(sizeof(GPU))) == NULL)
-		return NULL;
+	if(l==0) g = &GPU_main;
+	else g = &GPU_sub;

 	GPU_Reset(g, l);
 	GPU_InitFadeColors();
@@ -591,8 +593,6 @@ GPU * GPU_Init(u8 l)
 	g->setFinalColor3d_funcNum = 0;
 	g->setFinalColorSpr = _master_setFinalOBJColor<None,false>;

-	
-
 	return g;
 }

@@ -1133,9 +1133,8 @@ template<bool BACKDROP> FORCEINLINE void GPU::setFinalColorBG(u16 color, const u
 	//This is probably the best place to enforce it, since almost every single color that comes in here
 	//will be pulled from a palette that needs the top bit stripped off anyway.
 	//assert((color&0x8000)==0);
-	color &= 0x7FFF;
+	if(!BACKDROP) color &= 0x7FFF; //but for the backdrop we can easily guarantee earlier that theres no bit here

-	//if someone disagrees with these, they could be reimplemented as a function pointer easily
 	bool draw=true;
 	switch(setFinalColorBck_funcNum)
 	{
@@ -1149,10 +1148,10 @@ template<bool BACKDROP> FORCEINLINE void GPU::setFinalColorBG(u16 color, const u
 	case 0x7: draw=setFinalBGColorSpecialDecreaseWnd<BACKDROP>(color,x); break;
 	};

-	if(draw) 
+	if(BACKDROP || draw) //backdrop must always be drawn
 	{
 		T2WriteWord(currDst, x<<1, color | 0x8000);
-		bgPixels[x] = currBgNum;
+		if(!BACKDROP) bgPixels[x] = currBgNum; //lets do this in the backdrop drawing loop, should be faster
 	}
 }

@@ -2359,7 +2358,6 @@ void GPU_set_DISPCAPCNT(u32 val)
 }
 // #define BRIGHT_TABLES

-
 static void GPU_ligne_layer(NDS_Screen * screen, u16 l)
 {
 	GPU * gpu = screen->gpu;
@@ -2381,6 +2379,7 @@ static void GPU_ligne_layer(NDS_Screen * screen, u16 l)
 	for(int x=0;x<256;x++) {
 		gpu->__setFinalColorBck<false,true>(backdrop_color,x,1);
 	}
+	memset(gpu->bgPixels,5,256);

 	//this check isnt really helpful. it just slows us down in the cases where we need the most speed
 	//if (!gpu->LayersEnable[0] && !gpu->LayersEnable[1] && !gpu->LayersEnable[2] && !gpu->LayersEnable[3] && !gpu->LayersEnable[4]) return;
@@ -2395,7 +2394,7 @@ static void GPU_ligne_layer(NDS_Screen * screen, u16 l)
 	for (int i=0; i<NB_PRIORITIES; i++) {
 		gpu->itemsForPriority[i].nbPixelsX = 0;
 	}
-	
+
 	// for all the pixels in the line
 	if (gpu->LayersEnable[4]) 
 	{
--- a/desmume/src/GPU.h
+++ b/desmume/src/GPU.h
@@ -726,7 +726,7 @@ struct GPU
 	u8	MasterBrightMode;
 	u32 MasterBrightFactor;

-	u8 bgPixels[1024]; //yes indeed, this is oversized. map debug tools try to write to it
+	CACHE_ALIGN u8 bgPixels[1024]; //yes indeed, this is oversized. map debug tools try to write to it

 	u32 currLine;
 	u8 currBgNum;
--- a/desmume/src/MMU.h
+++ b/desmume/src/MMU.h
@@ -495,22 +495,22 @@ FORCEINLINE void _MMU_write32(const int PROCNUM, const MMU_ACCESS_TYPE AT, const
 #endif

 template<int PROCNUM, MMU_ACCESS_TYPE AT>
-u8 _MMU_read08(u32 addr) { return _MMU_read08(PROCNUM, AT, addr); }
+FORCEINLINE u8 _MMU_read08(u32 addr) { return _MMU_read08(PROCNUM, AT, addr); }

 template<int PROCNUM, MMU_ACCESS_TYPE AT>
-u16 _MMU_read16(u32 addr) { return _MMU_read16(PROCNUM, AT, addr); }
+FORCEINLINE u16 _MMU_read16(u32 addr) { return _MMU_read16(PROCNUM, AT, addr); }

 template<int PROCNUM, MMU_ACCESS_TYPE AT>
-u32 _MMU_read32(u32 addr) { return _MMU_read32(PROCNUM, AT, addr); }
+FORCEINLINE u32 _MMU_read32(u32 addr) { return _MMU_read32(PROCNUM, AT, addr); }

 template<int PROCNUM, MMU_ACCESS_TYPE AT>
-void _MMU_write08(u32 addr, u8 val) { _MMU_write08(PROCNUM, AT, addr, val); }
+FORCEINLINE void _MMU_write08(u32 addr, u8 val) { _MMU_write08(PROCNUM, AT, addr, val); }

 template<int PROCNUM, MMU_ACCESS_TYPE AT>
-void _MMU_write16(u32 addr, u16 val) { _MMU_write16(PROCNUM, AT, addr, val); }
+FORCEINLINE void _MMU_write16(u32 addr, u16 val) { _MMU_write16(PROCNUM, AT, addr, val); }

 template<int PROCNUM, MMU_ACCESS_TYPE AT>
-void _MMU_write32(u32 addr, u32 val) { _MMU_write32(PROCNUM, AT, addr, val); }
+FORCEINLINE void _MMU_write32(u32 addr, u32 val) { _MMU_write32(PROCNUM, AT, addr, val); }

 void FASTCALL MMU_DumpMemBlock(u8 proc, u32 address, u32 size, u8 *buffer);

--- a/desmume/src/armcpu.cpp
+++ b/desmume/src/armcpu.cpp
@@ -33,7 +33,7 @@

 template<u32> static u32 armcpu_prefetch();

-inline u32 armcpu_prefetch(armcpu_t *armcpu) { 
+FORCEINLINE u32 armcpu_prefetch(armcpu_t *armcpu) { 
 	if(armcpu->proc_ID==0) return armcpu_prefetch<0>();
 	else return armcpu_prefetch<1>();
 }
@@ -363,8 +363,7 @@ u32 armcpu_switchMode(armcpu_t *armcpu, u8 mode)
 }

 template<u32 PROCNUM>
-static u32
-armcpu_prefetch()
+FORCEINLINE static u32 armcpu_prefetch()
 {
 	armcpu_t* const armcpu = &ARMPROC;
 #ifdef GDB_STUB
@@ -521,7 +520,10 @@ u32 armcpu_exec()

 	if(ARMPROC.CPSR.bits.T == 0)
 	{
-        if((TEST_COND(CONDITION(ARMPROC.instruction), CODE(ARMPROC.instruction), ARMPROC.CPSR)))
+        if(
+			CONDITION(ARMPROC.instruction) == 0x0E  //fast path for unconditional instructions
+			|| (TEST_COND(CONDITION(ARMPROC.instruction), CODE(ARMPROC.instruction), ARMPROC.CPSR)) //handles any condition
+			)
 		{
 			if(PROCNUM==0) {
 #ifdef WANTASMLISTING