a number of little graphical speedups, and more intelligent arm7 memory access pattern optimization. good for a few fps.

2009-05-13 07:44:06 +00:00 · 2009-05-13 07:44:06 +00:00 · 29297644ac
parent 3844e866e0
commit 29297644ac
4 changed files with 77 additions and 49 deletions
--- a/desmume/src/GPU.cpp
+++ b/desmume/src/GPU.cpp
@ -674,12 +674,6 @@ static void GPU_resortBGs(GPU *gpu)
 	struct _DISPCNT * cnt = &gpu->dispx_st->dispx_DISPCNT.bits;
 	itemsForPriority_t * item;

-	//zero 29-dec-2008 - this really doesnt make sense to me.
-	//i changed the sprwin to be line by line,
-	//and resetting it here is pointless since line rendering is instantaneous
-	//and completely produces and consumes sprwin after which the contents of this buffer are useless
-	//memset(gpu->sprWin,0, 256*192);
-
 	// we don't need to check for windows here...
 // if we tick boxes, invisible layers become invisible & vice versa
 #define OP ^ !
@ -1387,7 +1381,7 @@ FORCEINLINE void GPU::setFinalColor3d(int dstX, int srcX)

 //this was forced inline because most of the time it just falls through to setFinalColorBck() and the function call
 //overhead was ridiculous and terrible
-template<bool MOSAIC> FORCEINLINE void GPU::__setFinalColorBck(u16 color, u8 x, bool opaque)
+template<bool MOSAIC> FORCEINLINE void GPU::__setFinalColorBck(u16 color, const u8 x, const bool opaque)
 {
 	//I commented out this line to make a point.
 	//indeed, since x is a u8 we cannot pass in anything >=256
@ -1519,8 +1513,8 @@ template<bool MOSAIC> INLINE void renderline_textBG(GPU * gpu, u16 XBG, u16 YBG,
 	u16 color;
 	u16 xoff;
 	u16 yoff;
-	u16 x      = 0;
-	u16 xfin;
+	u32 x      = 0;
+	u32 xfin;

 	s8 line_dir = 1;
 	u32 mapinfo;
@ -2610,32 +2604,15 @@ static void GPU_ligne_layer(NDS_Screen * screen, u16 l)

 	u16 backdrop_color = T1ReadWord(ARM9Mem.ARM9_VMEM, gpu->core * 0x400) & 0x7FFF;

-	///* Apply fading to backdrop */
-	if((gpu->BLDCNT & 0x20) && (gpu->BLDY_EVY > 0))
-	{
-		switch(gpu->BLDCNT & 0xC0)
-		{
-		case 0x80:	/* Fade in */
-			backdrop_color = fadeInColors[gpu->BLDY_EVY][backdrop_color];
-			break;
-		case 0xC0:	/* Fade out */
-			backdrop_color = fadeOutColors[gpu->BLDY_EVY][backdrop_color];
-			break;
-		default: break;
-		}
-	}
-
-	//we need to write backdrop colors in the same way as we do BG pixels in order to
-	//do correct window processing
-	//memset(gpu->bgPixels,6,256); //dont know whether we need this...
+	//we need to write backdrop colors in the same way as we do BG pixels in order to do correct window processing
+	//this is currently eating up 2fps or so. it is a reasonable candidate for optimization. 
 	gpu->currBgNum = 5;
 	for(int x=0;x<256;x++) {
 		gpu->__setFinalColorBck<false>(backdrop_color,x,1);
 	}

-	if (!gpu->LayersEnable[0] && !gpu->LayersEnable[1] && 
-			!gpu->LayersEnable[2] && !gpu->LayersEnable[3] && 
-				!gpu->LayersEnable[4]) return;
+	//this check isnt really helpful. it just slows us down in the cases where we need the most speed
+	//if (!gpu->LayersEnable[0] && !gpu->LayersEnable[1] && !gpu->LayersEnable[2] && !gpu->LayersEnable[3] && !gpu->LayersEnable[4]) return;

 	// init background color & priorities
 	memset(sprAlpha, 0, 256);
--- a/desmume/src/GPU.h
+++ b/desmume/src/GPU.h
@ -787,7 +787,7 @@ struct GPU
 	FORCEINLINE void setFinal3DColorSpecialDecreaseWnd(int dstX, int srcX);


-	template<bool MOSAIC> void __setFinalColorBck(u16 color, u8 x, bool opaque);
+	template<bool MOSAIC> void __setFinalColorBck(u16 color, const u8 x, const bool opaque);
 	void setAffineStart(int layer, int xy, u32 val);
 	void setAffineStartWord(int layer, int xy, u16 val, int word);
 	u32 getAffineStart(int layer, int xy);
--- a/desmume/src/MMU.h
+++ b/desmume/src/MMU.h
@ -310,17 +310,31 @@ FORCEINLINE u32 _MMU_read32(const int PROCNUM, const MMU_ACCESS_TYPE AT, const u

 		goto dunno;
 	}
-	
-	//for other cases, we have to check from dtcm first because it is patched on top of the main memory range
+
+	//special handling for execution from arm7. try reading from main memory first
+	if(PROCNUM==ARMCPU_ARM7)
+	{
+		if ( (addr & 0x0F000000) == 0x02000000)
+			return T1ReadLong_guaranteedAligned( ARM9Mem.MAIN_MEM, addr & _MMU_MAIN_MEM_MASK);
+		else if((addr & 0xFF800000) == 0x03800000)
+			return T1ReadLong_guaranteedAligned(MMU.ARM7_ERAM, addr&0xFFFF);
+		else if((addr & 0xFF800000) == 0x03000000)
+			return T1ReadLong_guaranteedAligned(MMU.SWIRAM, addr&0x7FFF);
+	}
+
+
+	//for other arm9 cases, we have to check from dtcm first because it is patched on top of the main memory range
 	if(PROCNUM==ARMCPU_ARM9)
+	{
 		if((addr&(~0x3FFF)) == MMU.DTCMRegion)
 		{
 			//Returns data from DTCM (ARM9 only)
 			return T1ReadLong(ARM9Mem.ARM9_DTCM, addr & 0x3FFF);
 		}
-
-	if ( (addr & 0x0F000000) == 0x02000000)
-		return T1ReadLong( ARM9Mem.MAIN_MEM, addr & _MMU_MAIN_MEM_MASK);
+	
+		if ( (addr & 0x0F000000) == 0x02000000)
+			return T1ReadLong( ARM9Mem.MAIN_MEM, addr & _MMU_MAIN_MEM_MASK);
+	}

 dunno:
 	if(PROCNUM==ARMCPU_ARM9) return _MMU_ARM9_read32(addr);
--- a/desmume/src/rasterize.cpp
+++ b/desmume/src/rasterize.cpp
@ -52,6 +52,8 @@

 //#undef FORCEINLINE
 //#define FORCEINLINE
+//#undef INLINE
+//#define INLINE

 using std::min;
 using std::max;
@ -252,6 +254,37 @@ FORCEINLINE int iround(float f) {
 	return (int)f; //lol
 }

+//this function is an unreliable, inaccurate floor.
+//it should only be used for positive numbers
+//this isnt as fast as it could be if we used a visual c++ intrinsic, but those appear not to be universally available
+FORCEINLINE u32 u32floor(float f)
+{
+#ifndef NOSSE2
+	__asm cvttss2si eax, f;
+#else
+	return (u32)f;
+#endif
+}
+
+//same as above but works for negative values too.
+//be sure that the results are the same thing as floorf!
+FORCEINLINE s32 s32floor(float f)
+{
+#ifndef NOSSE2
+	static const float c = -0.5f;
+	__asm
+	{
+		movss xmm0, f;
+		addss xmm0, xmm0;
+		addss xmm0, c;
+		cvtss2si eax, xmm0
+		sar eax, 1
+	}
+#else
+	return (s32)floorf(f);
+#endif
+}
+
 static struct Sampler
 {
 	int width, height;
@ -320,8 +353,8 @@ static struct Sampler
 	{
 		//finally, we can use floor here. but, it is slower than we want.
 		//the best solution is probably to wait until the pipeline is full of fixed point
-		int iu = (int)floorf(u);
-		int iv = (int)floorf(v);
+		s32 iu = s32floor(u);
+		s32 iv = s32floor(v);
 		dowrap(iu,iv);

 		FragmentColor color;
@ -460,12 +493,15 @@ static FORCEINLINE void pixel(int adr,float r, float g, float b, float invu, flo
 	if(gfx3d.wbuffer) {
 		//not sure about this
 		//this value was chosen to make the skybox, castle window decals, and water level render correctly in SM64
-		depth = (u32)(4096*w);
+		depth = u32floor(4096*w);
 	}
 	else
 	{
+		float test = -1.2f;
+		u32 test2 = u32floor(test);
 		//depth = fastFloor(z*0x7FFF)>>8;
-		depth = (u32)(z*0x7FFF);
+		//depth = (u32)(z*0x7FFF);
+		depth = u32floor(z*0x7FFF);
 		//depth = z*0xFFFFFF;
 	}
 	if(polyAttr.decalMode)
@ -495,9 +531,9 @@ static FORCEINLINE void pixel(int adr,float r, float g, float b, float invu, flo
 	//this is a HACK: 
 	//we are being very sloppy with our interpolation precision right now
 	//and rather than fix it, i just want to clamp it
-	shader.materialColor.r = max(0,min(31,(int)r));
-	shader.materialColor.g = max(0,min(31,(int)g));
-	shader.materialColor.b = max(0,min(31,(int)b));
+	shader.materialColor.r = max(0U,min(31U,u32floor(r)));
+	shader.materialColor.g = max(0U,min(31U,u32floor(g)));
+	shader.materialColor.b = max(0U,min(31U,u32floor(b)));

 	shader.materialColor.a = polyAttr.alpha;

@ -600,12 +636,13 @@ static FORCEINLINE void pixel(int adr,float r, float g, float b, float invu, flo
 	;
 }

+
 typedef int fixed28_4;

 static bool failure;

 // handle floor divides and mods correctly 
-inline void FloorDivMod(long Numerator, long Denominator, long &Floor, long &Mod)
+INLINE void FloorDivMod(long Numerator, long Denominator, long &Floor, long &Mod)
 {
 	//These must be caused by invalid or degenerate shapes.. not sure yet.
 	//check it out in the mario face intro of SM64
@ -636,10 +673,10 @@ inline void FloorDivMod(long Numerator, long Denominator, long &Floor, long &Mod
 	}
 }

-inline fixed28_4 FloatToFixed28_4( float Value ) {
+INLINE fixed28_4 FloatToFixed28_4( float Value ) {
 	return (fixed28_4)(Value * 16);
 }
-inline float Fixed28_4ToFloat( fixed28_4 Value ) {
+INLINE float Fixed28_4ToFloat( fixed28_4 Value ) {
 	return Value / 16.0;
 }
 //inline fixed16_16 FloatToFixed16_16( float Value ) {
@ -648,11 +685,11 @@ inline float Fixed28_4ToFloat( fixed28_4 Value ) {
 //inline float Fixed16_16ToFloat( fixed16_16 Value ) {
 //	return Value / 65536.0;
 //}
-inline fixed28_4 Fixed28_4Mul( fixed28_4 A, fixed28_4 B ) {
+INLINE fixed28_4 Fixed28_4Mul( fixed28_4 A, fixed28_4 B ) {
 	// could make this asm to prevent overflow
 	return (A * B) / 16;	// 28.4 * 28.4 = 24.8 / 16 = 28.4
 }
-inline int Ceil28_4( fixed28_4 Value ) {
+INLINE int Ceil28_4( fixed28_4 Value ) {
 	int ReturnValue;
 	int Numerator = Value - 1 + 16;
 	if(Numerator >= 0) {
@ -813,7 +850,7 @@ static void runscanlines(edge_fx_fl *left, edge_fx_fl *right)

 //rotates verts counterclockwise
 template<int type>
-inline static void rot_verts() {
+INLINE static void rot_verts() {
 	#define ROTSWAP(X) if(type>X) swap(verts[X-1],verts[X]);
 	ROTSWAP(1); ROTSWAP(2); ROTSWAP(3); ROTSWAP(4);
 	ROTSWAP(5); ROTSWAP(6); ROTSWAP(7);