- oglrender: add some lookup tables, optimize getline.

- add load average calculation and display it in windows (for correlating slow cpu to slow emulation) - fix bug(?) in texture transformation mode 1. I'm not 100% positive about it.
2008-09-13 06:11:12 +00:00 · 2008-09-13 06:11:12 +00:00 · e4f9a0f5dc
parent ecc0a65749
commit e4f9a0f5dc
5 changed files with 116 additions and 81 deletions
--- a/desmume/ChangeLog
+++ b/desmume/ChangeLog
@ -51,6 +51,8 @@
 - Add opengl state caching. This is of dubious performance assistance, but it is easy to take out so I am leaving it for now. [zeromus]
 - Add MMU->GPU signal for when vram mappings change, which allows it to assume textures are unchanged unless vram has changed [zeromus]
 - Added a bunch of crazy templates to the cpu and mmu which speed up the emu a little by optimizing variable accesses [zeromus]
 - Add an arm9 cpu load average calculator [zeromus]
 ? Fix a bug in texture transformation mode 1 [zeromus]
 0.7.3 -> 0.8
--- a/desmume/src/NDSSystem.cpp
+++ b/desmume/src/NDSSystem.cpp
@ -149,6 +149,8 @@ int NDS_Init( void) {
     nds.ARM9Cycle = 0;
     nds.ARM7Cycle = 0;
     nds.cycles = 0;
 	 nds.idleFrameCounter = 0;
 	 memset(nds.runCycleCollector,0,sizeof(nds.runCycleCollector));
     MMU_Init();
     nds.nextHBlank = 3168;
     nds.VCount = 0;
@ -887,9 +889,10 @@ NDS_exec(s32 nb, BOOL force)
 #endif
 				for (i = 0; i < 4 && (!force) && (execute); i++)
 				{
-					if(NDS_ARM9.waitIRQ)
+					if(NDS_ARM9.waitIRQ) {
 						nds.ARM9Cycle += 100;
-					else
+						nds.idleCycles += 100;
 					} else
 						//nds.ARM9Cycle += NDS_ARM9.exec();
 						//nds.ARM9Cycle += armcpu_exec(&NDS_ARM9);
 						nds.ARM9Cycle += armcpu_exec<0>();
@ -1030,6 +1033,11 @@ NDS_exec(s32 nb, BOOL force)
                  T1WriteWord(MMU.ARM7_REG, 4, T1ReadWord(MMU.ARM7_REG, 4) | 1);
                  NDS_ARM9VBlankInt();
                  NDS_ARM7VBlankInt();
 				  nds.runCycleCollector[nds.idleFrameCounter] = 1120380-nds.idleCycles;
 				  nds.idleFrameCounter++;
 				  nds.idleFrameCounter &= 15;
 				  nds.idleCycles = 0;
                  if(MMU.DMAStartTime[0][0] == 1)
                    MMU_doDMA(0, 0);
--- a/desmume/src/NDSSystem.h
+++ b/desmume/src/NDSSystem.h
@ -118,6 +118,12 @@ typedef struct
 	u16 touchX;
 	u16 touchY;
 	s32 idleCycles;
 	s32 runCycleCollector[16];
 	s32 idleFrameCounter;
 } NDSSystem;
 /** \brief A touchscreen calibration point.
--- a/desmume/src/windows/OGLRender.cpp
+++ b/desmume/src/windows/OGLRender.cpp
@ -27,6 +27,7 @@
 #include <math.h>
 #include <stdlib.h>
 #include <string.h>
 #include <assert.h>
 #ifndef DESMUME_COCOA
 	#define WIN32_LEAN_AND_MEAN
@ -58,12 +59,6 @@
 static __declspec(align(16)) unsigned char  GPU_screen3D		[256*256*4]={0};
 static __declspec(align(16)) unsigned char  GPU_screenStencil[256*256]={0};
 // Acceleration tables
 static float*		float16table = NULL;
 static float*		float10Table = NULL;
 static float*		float10RelTable = NULL;
 static float*		normalTable = NULL;
 // Matrix stack handling
 static __declspec(align(16)) MatrixStack	mtxStack[4];
 static __declspec(align(16)) float		mtxCurrent [4][16];
@ -113,13 +108,51 @@ static const u8 material_5bit_to_8bit[] = {
 	0xC6, 0xCE, 0xD6, 0xDE, 0xE7, 0xEF, 0xF7, 0xFF
 };
 static const u8 material_3bit_to_8bit[] = {
 	0x00, 0x24, 0x49, 0x6D, 0x92, 0xB6, 0xDB, 0xFF
 };
 // Acceleration tables
 static float float16table[65536];
 static float float10Table[1024];
 static float float10RelTable[1024];
 static float normalTable[1024];
 static u32 color_15bit_to_24bit[32768];
 //produce a 32bpp color from a DS RGB16
 #define RGB16TO32(col,alpha) (((alpha)<<24) | ((((col) & 0x7C00)>>7)<<16) | ((((col) & 0x3E0)>>2)<<8) | (((col) & 0x1F)<<3))
-//make a table out of this:
+
-#define RGB15TO32(col,alpha8) ( ((alpha8)<<24) | (material_5bit_to_8bit[((col)>>10)&0x1F]<<16) | (material_5bit_to_8bit[((col)>>5)&0x1F]<<8) | material_5bit_to_8bit[(col)&0x1F] )
+
 //produce a 32bpp color from a ds RGB15 plus an 8bit alpha, using a table
 #define RGB15TO32(col,alpha8) ( ((alpha8)<<24) | color_15bit_to_24bit[col] )
 //produce a 32bpp color from a ds RGB15 plus an 8bit alpha, not using a table (but using other tables)
 #define RGB15TO32_DIRECT(col,alpha8) ( ((alpha8)<<24) | (material_5bit_to_8bit[((col)>>10)&0x1F]<<16) | (material_5bit_to_8bit[((col)>>5)&0x1F]<<8) | material_5bit_to_8bit[(col)&0x1F] )
 void makeTables() {
 	//produce the color bits of a 32bpp color from a DS RGB15 using bit logic (internal use only)
 	#define RGB15TO24_BITLOGIC(col) ( (material_5bit_to_8bit[((col)>>10)&0x1F]<<16) | (material_5bit_to_8bit[((col)>>5)&0x1F]<<8) | material_5bit_to_8bit[(col)&0x1F] )
 	for(int i=0;i<32768;i++)
 		color_15bit_to_24bit[i] = RGB15TO24_BITLOGIC((u16)i);
 	for (int i = 0; i < 65536; i++)
 		float16table[i] = fix2float((signed short)i);
 	for (int i = 0; i < 1024; i++)
 		float10Table[i] = ((signed short)(i<<6)) / (float)(1<<12);
 	for (int i = 0; i < 1024; i++)
 		float10RelTable[i] = ((signed short)(i<<6)) / (float)(1<<18);
 	for (int i = 0; i < 1024; i++)
 		normalTable[i] = ((signed short)(i<<6)) / (float)(1<<15);
 }
 static unsigned short matrixMode[2] = {GL_PROJECTION, GL_MODELVIEW};
 static short mode = 0;
@ -442,29 +475,7 @@ char NDS_glInit(void)
 	twiddleLists();
 	// Precalculate some tables, to avoid pushing data to the FPU and back for conversion
-	float16table = (float*) malloc (sizeof(float)*65536);
+	makeTables();
 	for (i = 0; i < 65536; i++)
 	{
 		float16table[i] = fix2float((signed short)i);
 	}
 	float10RelTable = (float*) malloc (sizeof(float)*1024);
 	for (i = 0; i < 1024; i++)
 	{
 		float10RelTable[i] = ((signed short)(i<<6)) / (float)(1<<18);
 	}
 	float10Table = (float*) malloc (sizeof(float)*1024);
 	for (i = 0; i < 1024; i++)
 	{
 		float10Table[i] = ((signed short)(i<<6)) / (float)(1<<12);
 	}
 	normalTable = (float*) malloc (sizeof(float)*1024);
 	for (i = 0; i < 1024; i++)
 	{
 		normalTable[i] = ((signed short)(i<<6)) / (float)(1<<15);
 	}
 	MatrixStackSetMaxSize(&mtxStack[0], 1);		// Projection stack
 	MatrixStackSetMaxSize(&mtxStack[1], 31);	// Coordinate stack
@ -881,7 +892,7 @@ static void DebugDumpTexture(int which)
 //================================================================================
 static int lastTexture = -1;
-__forceinline void setTexture(unsigned int format, unsigned int texpal)
+void setTexture(unsigned int format, unsigned int texpal)
 {
 	int palSize[7]={32,4,16,256,0,8,32768};
 	int i=0;
@ -978,10 +989,6 @@ __forceinline void setTexture(unsigned int format, unsigned int texpal)
 	glLoadIdentity ();
 	glScaled (texcache[i].invSizeX, texcache[i].invSizeY, 1.0f);
 	if(i==62 || textureMode==1) {
 		int zzz=9;
 	}
 	//printlog("Texture %03i - format=%08X; pal=%04X (mode %X, width %04i, height %04i)\n",i, texcache[i].frm, texcache[i].pal, texcache[i].mode, sizeX, sizeY);
 	//============================================================================ Texture render
@ -1213,7 +1220,7 @@ __forceinline void setTexture(unsigned int format, unsigned int texpal)
 				{
 					unsigned short c = map[x];
 					int alpha = ((c&0x8000)?255:0);
-					*dst = RGB15TO32(c,alpha);
+					*dst = RGB15TO32(c&0x7FFF,alpha);
 					dst++;
 					txt_slot_current_size-=2;;
@ -1643,10 +1650,16 @@ __forceinline void NDS_glTexCoord(unsigned long val)
 	if (texCoordinateTransform == 1)
 	{
 		//last_s =_s*mtxCurrent[3][0] + _t*mtxCurrent[3][4] +
 		//		0.0625f*mtxCurrent[3][8] + 0.0625f*mtxCurrent[3][12];
 		//last_t =_s*mtxCurrent[3][1] + _t*mtxCurrent[3][5] +
 		//		0.0625f*mtxCurrent[3][9] + 0.0625f*mtxCurrent[3][13];
 		//zero 9/11/08 - I dunno... I think it needs to be like this to make things look right
 		last_s =_s*mtxCurrent[3][0] + _t*mtxCurrent[3][4] +
-				0.0625f*mtxCurrent[3][8] + 0.0625f*mtxCurrent[3][12];
+				mtxCurrent[3][8] + mtxCurrent[3][12];
 		last_t =_s*mtxCurrent[3][1] + _t*mtxCurrent[3][5] +
-				0.0625f*mtxCurrent[3][9] + 0.0625f*mtxCurrent[3][13];
+				mtxCurrent[3][9] + mtxCurrent[3][13];
 	}
 	else
 	{
@ -1972,15 +1985,17 @@ void GL_ReadFramebuffer()
 //NHerve mod3 - Fixed blending with 2D backgrounds (New Super Mario Bros looks better)
 //zeromus post-mod3: fix even better
-__forceinline void NDS_glGetLine (int line, unsigned short * dst)
+static void NDS_glGetLine (int line, u16* dst)
 {
 	assert(line<192 && line>=0);
 	if(needRefreshFramebuffer) {
 		needRefreshFramebuffer = false;
 		GL_ReadFramebuffer();
 	}
-	int		i, t;
+	
-	u8	*screen3D		= (u8 *)&GPU_screen3D	[(191-(line%192))*1024];
+	u8 *screen3D = (u8*)GPU_screen3D+((191-line)<<10);
-	u8  *screenStencil = (u8*)&GPU_screenStencil[(191-(line%192))*256];
+	u8 *screenStencil = (u8*)GPU_screenStencil+((191-line)<<8);
 	//the renderer clears the stencil to 0
 	//then it sets it to 1 whenever it renders a pixel that passes the alpha test
@ -1992,33 +2007,27 @@ __forceinline void NDS_glGetLine (int line, unsigned short * dst)
 	//this alpha compositing blending logic isnt thought through at all
 	//someone needs to think about what bitdepth it should take place at and how to do it efficiently
-	u32		a,r,g,b,stencil,oldcolor,oldr,oldg,oldb;
+	for(int i = 0; i < 256; i++)
 	for(i = 0, t=0; i < 256; i++)
 	{
-		stencil = screenStencil[i];
+		u32 stencil = screenStencil[i];
 		//you would use this if you wanted to use the stencil buffer to make decisions here
 		if(!stencil) continue;
-		t=i*4;
+		int t=i<<2;
-		r = screen3D[t+0];
+		u32 r = screen3D[t+0];
-		g = screen3D[t+1];
+		u32 g = screen3D[t+1];
-		b = screen3D[t+2];
+		u32 b = screen3D[t+2];
-		a = screen3D[t+3];
+		u32 a = screen3D[t+3];
-		if(a != 0xFF && a != 0) {
+		u32 oldcolor = RGB15TO32(dst[i],0);
-			int zzz=9;
+		u32 oldr = oldcolor&0xFF;
-		}
+		u32 oldg = (oldcolor>>8)&0xFF;
 		u32 oldb = (oldcolor>>16)&0xFF;
-		oldcolor = RGB15TO32(dst[i],0);
+		r = (r*a + oldr*(255-a)) >> 8;
-		oldr = oldcolor&0xFF;
+		g = (g*a + oldg*(255-a)) >> 8;
-		oldg = (oldcolor>>8)&0xFF;
+		b = (b*a + oldb*(255-a)) >> 8;
 		oldb = (oldcolor>>16)&0xFF;
 		r = (r*a + oldr*(255-a)) / 255;
 		g = (g*a + oldg*(255-a)) / 255;
 		b = (b*a + oldb*(255-a)) / 255;
 		r=min(255,r);
 		g=min(255,g);
--- a/desmume/src/windows/main.cpp
+++ b/desmume/src/windows/main.cpp
@ -541,15 +541,25 @@ DWORD WINAPI run( LPVOID lpParameter)
                  fpsframecount++;
                  QueryPerformanceCounter((LARGE_INTEGER *)&curticks);
-                  if(curticks >= fpsticks + freq) // TODO: print fps on screen in DDraw
+				  bool oneSecond = curticks >= fpsticks + freq;
                  if(oneSecond) // TODO: print fps on screen in DDraw
                  {
                     fps = fpsframecount;
 					 sprintf(txt,"(%d) DeSmuME v%s", fps, VERSION);
                     SetWindowText(hwnd, txt);
                     fpsframecount = 0;
                     QueryPerformanceCounter((LARGE_INTEGER *)&fpsticks);
                  }
 				  if(nds.idleFrameCounter==0 || oneSecond) 
 				  {
 					  //calculate a 16 frame arm9 load average
 					 int load = 0;
 					 for(int i=0;i<16;i++)
 						 load = load/8 + nds.runCycleCollector[(i+nds.idleFrameCounter)&15]*7/8;
 					 load = min(100,max(0,(int)(load*100/1120380)));
 					 sprintf(txt,"(%02d|%02d%%) DeSmuME v%s", fps, load, VERSION);
 					 SetWindowText(hwnd, txt);
 				  }
                  framesskipped = 0;
                  if (framestoskip > 0)