From e4f9a0f5dc51feda0744e7e4817dfa41985c6306 Mon Sep 17 00:00:00 2001 From: zeromus Date: Sat, 13 Sep 2008 06:11:12 +0000 Subject: [PATCH] - oglrender: add some lookup tables, optimize getline. - add load average calculation and display it in windows (for correlating slow cpu to slow emulation) - fix bug(?) in texture transformation mode 1. I'm not 100% positive about it. --- desmume/ChangeLog | 2 + desmume/src/NDSSystem.cpp | 12 ++- desmume/src/NDSSystem.h | 32 ++++--- desmume/src/windows/OGLRender.cpp | 135 ++++++++++++++++-------------- desmume/src/windows/main.cpp | 16 +++- 5 files changed, 116 insertions(+), 81 deletions(-) diff --git a/desmume/ChangeLog b/desmume/ChangeLog index 3de165149..dd6c5f2d9 100644 --- a/desmume/ChangeLog +++ b/desmume/ChangeLog @@ -51,6 +51,8 @@ - Add opengl state caching. This is of dubious performance assistance, but it is easy to take out so I am leaving it for now. [zeromus] - Add MMU->GPU signal for when vram mappings change, which allows it to assume textures are unchanged unless vram has changed [zeromus] - Added a bunch of crazy templates to the cpu and mmu which speed up a the emu little by optimizing variable accesses [zeromus] + - Add an arm9 cpu load average calculator [zeromus] + ?
Fix a bug in texture transformation mode 1 [zeromus] 0.7.3 -> 0.8 diff --git a/desmume/src/NDSSystem.cpp b/desmume/src/NDSSystem.cpp index 887be2794..b8a7f7791 100644 --- a/desmume/src/NDSSystem.cpp +++ b/desmume/src/NDSSystem.cpp @@ -149,6 +149,8 @@ int NDS_Init( void) { nds.ARM9Cycle = 0; nds.ARM7Cycle = 0; nds.cycles = 0; + nds.idleFrameCounter = 0; + memset(nds.runCycleCollector,0,sizeof(nds.runCycleCollector)); MMU_Init(); nds.nextHBlank = 3168; nds.VCount = 0; @@ -887,9 +889,10 @@ NDS_exec(s32 nb, BOOL force) #endif for (i = 0; i < 4 && (!force) && (execute); i++) { - if(NDS_ARM9.waitIRQ) + if(NDS_ARM9.waitIRQ) { nds.ARM9Cycle += 100; - else + nds.idleCycles += 100; + } else //nds.ARM9Cycle += NDS_ARM9.exec(); //nds.ARM9Cycle += armcpu_exec(&NDS_ARM9); nds.ARM9Cycle += armcpu_exec<0>(); @@ -1030,6 +1033,11 @@ NDS_exec(s32 nb, BOOL force) T1WriteWord(MMU.ARM7_REG, 4, T1ReadWord(MMU.ARM7_REG, 4) | 1); NDS_ARM9VBlankInt(); NDS_ARM7VBlankInt(); + nds.runCycleCollector[nds.idleFrameCounter] = 1120380-nds.idleCycles; + nds.idleFrameCounter++; + nds.idleFrameCounter &= 15; + nds.idleCycles = 0; + if(MMU.DMAStartTime[0][0] == 1) MMU_doDMA(0, 0); diff --git a/desmume/src/NDSSystem.h b/desmume/src/NDSSystem.h index 3d5ff8fef..a0b3690e2 100644 --- a/desmume/src/NDSSystem.h +++ b/desmume/src/NDSSystem.h @@ -105,19 +105,25 @@ extern void debug(); typedef struct { - s32 ARM9Cycle; - s32 ARM7Cycle; - s32 cycles; - s32 timerCycle[2][4]; - BOOL timerOver[2][4]; - s32 nextHBlank; - u32 VCount; - u32 old; - s32 diff; - BOOL lignerendu; - - u16 touchX; - u16 touchY; + s32 ARM9Cycle; + s32 ARM7Cycle; + s32 cycles; + s32 timerCycle[2][4]; + BOOL timerOver[2][4]; + s32 nextHBlank; + u32 VCount; + u32 old; + s32 diff; + BOOL lignerendu; + + u16 touchX; + u16 touchY; + + s32 idleCycles; + s32 runCycleCollector[16]; + s32 idleFrameCounter; + + } NDSSystem; /** /brief A touchscreen calibration point. 
diff --git a/desmume/src/windows/OGLRender.cpp b/desmume/src/windows/OGLRender.cpp index 8caa5b8f0..7afc092f1 100644 --- a/desmume/src/windows/OGLRender.cpp +++ b/desmume/src/windows/OGLRender.cpp @@ -27,6 +27,7 @@ #include #include #include +#include #ifndef DESMUME_COCOA #define WIN32_LEAN_AND_MEAN @@ -58,12 +59,6 @@ static __declspec(align(16)) unsigned char GPU_screen3D [256*256*4]={0}; static __declspec(align(16)) unsigned char GPU_screenStencil[256*256]={0}; -// Acceleration tables -static float* float16table = NULL; -static float* float10Table = NULL; -static float* float10RelTable = NULL; -static float* normalTable = NULL; - // Matrix stack handling static __declspec(align(16)) MatrixStack mtxStack[4]; static __declspec(align(16)) float mtxCurrent [4][16]; @@ -113,13 +108,51 @@ static const u8 material_5bit_to_8bit[] = { 0xC6, 0xCE, 0xD6, 0xDE, 0xE7, 0xEF, 0xF7, 0xFF }; + static const u8 material_3bit_to_8bit[] = { 0x00, 0x24, 0x49, 0x6D, 0x92, 0xB6, 0xDB, 0xFF }; +// Acceleration tables +static float float16table[65536]; +static float float10Table[1024]; +static float float10RelTable[1024]; +static float normalTable[1024]; +static u32 color_15bit_to_24bit[32768]; + + +//produce a 32bpp color from a DS RGB16 #define RGB16TO32(col,alpha) (((alpha)<<24) | ((((col) & 0x7C00)>>7)<<16) | ((((col) & 0x3E0)>>2)<<8) | (((col) & 0x1F)<<3)) -//make a table out of this: -#define RGB15TO32(col,alpha8) ( ((alpha8)<<24) | (material_5bit_to_8bit[((col)>>10)&0x1F]<<16) | (material_5bit_to_8bit[((col)>>5)&0x1F]<<8) | material_5bit_to_8bit[(col)&0x1F] ) + + +//produce a 32bpp color from a ds RGB15 plus an 8bit alpha, using a table (col is masked to 15 bits: callers such as NDS_glGetLine pass raw u16 pixels whose bit 15 may be set, which would otherwise index past the 32768-entry table) +#define RGB15TO32(col,alpha8) ( ((alpha8)<<24) | color_15bit_to_24bit[(col)&0x7FFF] ) + +//produce a 32bpp color from a ds RGB15 plus an 8bit alpha, not using a table (but using other tables) +#define RGB15TO32_DIRECT(col,alpha8) ( ((alpha8)<<24) | (material_5bit_to_8bit[((col)>>10)&0x1F]<<16) | (material_5bit_to_8bit[((col)>>5)&0x1F]<<8) |
material_5bit_to_8bit[(col)&0x1F] ) + + +void makeTables() { + + //produce the color bits of a 32bpp color from a DS RGB15 using bit logic (internal use only) + #define RGB15TO24_BITLOGIC(col) ( (material_5bit_to_8bit[((col)>>10)&0x1F]<<16) | (material_5bit_to_8bit[((col)>>5)&0x1F]<<8) | material_5bit_to_8bit[(col)&0x1F] ) + + for(int i=0;i<32768;i++) + color_15bit_to_24bit[i] = RGB15TO24_BITLOGIC((u16)i); + + for (int i = 0; i < 65536; i++) + float16table[i] = fix2float((signed short)i); + + for (int i = 0; i < 1024; i++) + float10Table[i] = ((signed short)(i<<6)) / (float)(1<<12); + + for (int i = 0; i < 1024; i++) + float10RelTable[i] = ((signed short)(i<<6)) / (float)(1<<18); + + for (int i = 0; i < 1024; i++) + normalTable[i] = ((signed short)(i<<6)) / (float)(1<<15); +} + static unsigned short matrixMode[2] = {GL_PROJECTION, GL_MODELVIEW}; static short mode = 0; @@ -442,29 +475,7 @@ char NDS_glInit(void) twiddleLists(); // Precalculate some tables, to avoid pushing data to the FPU and back for conversion - float16table = (float*) malloc (sizeof(float)*65536); - for (i = 0; i < 65536; i++) - { - float16table[i] = fix2float((signed short)i); - } - - float10RelTable = (float*) malloc (sizeof(float)*1024); - for (i = 0; i < 1024; i++) - { - float10RelTable[i] = ((signed short)(i<<6)) / (float)(1<<18); - } - - float10Table = (float*) malloc (sizeof(float)*1024); - for (i = 0; i < 1024; i++) - { - float10Table[i] = ((signed short)(i<<6)) / (float)(1<<12); - } - - normalTable = (float*) malloc (sizeof(float)*1024); - for (i = 0; i < 1024; i++) - { - normalTable[i] = ((signed short)(i<<6)) / (float)(1<<15); - } + makeTables(); MatrixStackSetMaxSize(&mtxStack[0], 1); // Projection stack MatrixStackSetMaxSize(&mtxStack[1], 31); // Coordinate stack @@ -881,7 +892,7 @@ static void DebugDumpTexture(int which) //================================================================================ static int lastTexture = -1; -__forceinline void setTexture(unsigned int format, 
unsigned int texpal) +void setTexture(unsigned int format, unsigned int texpal) { int palSize[7]={32,4,16,256,0,8,32768}; int i=0; @@ -978,10 +989,6 @@ __forceinline void setTexture(unsigned int format, unsigned int texpal) glLoadIdentity (); glScaled (texcache[i].invSizeX, texcache[i].invSizeY, 1.0f); - if(i==62 || textureMode==1) { - int zzz=9; - } - //printlog("Texture %03i - format=%08X; pal=%04X (mode %X, width %04i, height %04i)\n",i, texcache[i].frm, texcache[i].pal, texcache[i].mode, sizeX, sizeY); //============================================================================ Texture render @@ -1213,7 +1220,7 @@ __forceinline void setTexture(unsigned int format, unsigned int texpal) { unsigned short c = map[x]; int alpha = ((c&0x8000)?255:0); - *dst = RGB15TO32(c,alpha); + *dst = RGB15TO32(c&0x7FFF,alpha); dst++; txt_slot_current_size-=2;; @@ -1643,10 +1650,16 @@ __forceinline void NDS_glTexCoord(unsigned long val) if (texCoordinateTransform == 1) { + //last_s =_s*mtxCurrent[3][0] + _t*mtxCurrent[3][4] + + // 0.0625f*mtxCurrent[3][8] + 0.0625f*mtxCurrent[3][12]; + //last_t =_s*mtxCurrent[3][1] + _t*mtxCurrent[3][5] + + // 0.0625f*mtxCurrent[3][9] + 0.0625f*mtxCurrent[3][13]; + + //zero 9/11/08 - I dunno... 
I think it needs to be like this to make things look right last_s =_s*mtxCurrent[3][0] + _t*mtxCurrent[3][4] + - 0.0625f*mtxCurrent[3][8] + 0.0625f*mtxCurrent[3][12]; + mtxCurrent[3][8] + mtxCurrent[3][12]; last_t =_s*mtxCurrent[3][1] + _t*mtxCurrent[3][5] + - 0.0625f*mtxCurrent[3][9] + 0.0625f*mtxCurrent[3][13]; + mtxCurrent[3][9] + mtxCurrent[3][13]; } else { @@ -1972,15 +1985,17 @@ void GL_ReadFramebuffer() //NHerve mod3 - Fixed blending with 2D backgrounds (New Super Mario Bros looks better) //zeromus post-mod3: fix even better -__forceinline void NDS_glGetLine (int line, unsigned short * dst) +static void NDS_glGetLine (int line, u16* dst) { + assert(line<192 && line>=0); + if(needRefreshFramebuffer) { needRefreshFramebuffer = false; GL_ReadFramebuffer(); } - int i, t; - u8 *screen3D = (u8 *)&GPU_screen3D [(191-(line%192))*1024]; - u8 *screenStencil = (u8*)&GPU_screenStencil[(191-(line%192))*256]; + + u8 *screen3D = (u8*)GPU_screen3D+((191-line)<<10); + u8 *screenStencil = (u8*)GPU_screenStencil+((191-line)<<8); //the renderer clears the stencil to 0 //then it sets it to 1 whenever it renders a pixel that passes the alpha test @@ -1992,33 +2007,27 @@ __forceinline void NDS_glGetLine (int line, unsigned short * dst) //this alpha compositing blending logic isnt thought through at all //someone needs to think about what bitdepth it should take place at and how to do it efficiently - u32 a,r,g,b,stencil,oldcolor,oldr,oldg,oldb; - - for(i = 0, t=0; i < 256; i++) + for(int i = 0; i < 256; i++) { - stencil = screenStencil[i]; + u32 stencil = screenStencil[i]; //you would use this if you wanted to use the stencil buffer to make decisions here if(!stencil) continue; - t=i*4; - r = screen3D[t+0]; - g = screen3D[t+1]; - b = screen3D[t+2]; - a = screen3D[t+3]; + int t=i<<2; + u32 r = screen3D[t+0]; + u32 g = screen3D[t+1]; + u32 b = screen3D[t+2]; + u32 a = screen3D[t+3]; - if(a != 0xFF && a != 0) { - int zzz=9; - } + u32 oldcolor = RGB15TO32(dst[i],0); + u32 oldr = 
oldcolor&0xFF; + u32 oldg = (oldcolor>>8)&0xFF; + u32 oldb = (oldcolor>>16)&0xFF; - oldcolor = RGB15TO32(dst[i],0); - oldr = oldcolor&0xFF; - oldg = (oldcolor>>8)&0xFF; - oldb = (oldcolor>>16)&0xFF; - - r = (r*a + oldr*(255-a)) / 255; - g = (g*a + oldg*(255-a)) / 255; - b = (b*a + oldb*(255-a)) / 255; + r = (r*a + oldr*(255-a)) >> 8; + g = (g*a + oldg*(255-a)) >> 8; + b = (b*a + oldb*(255-a)) >> 8; r=min(255,r); g=min(255,g); diff --git a/desmume/src/windows/main.cpp b/desmume/src/windows/main.cpp index 3c6772ed9..1be9b9d44 100644 --- a/desmume/src/windows/main.cpp +++ b/desmume/src/windows/main.cpp @@ -541,15 +541,25 @@ DWORD WINAPI run( LPVOID lpParameter) fpsframecount++; QueryPerformanceCounter((LARGE_INTEGER *)&curticks); - if(curticks >= fpsticks + freq) // TODO: print fps on screen in DDraw + bool oneSecond = curticks >= fpsticks + freq; + if(oneSecond) // TODO: print fps on screen in DDraw { fps = fpsframecount; - sprintf(txt,"(%d) DeSmuME v%s", fps, VERSION); - SetWindowText(hwnd, txt); fpsframecount = 0; QueryPerformanceCounter((LARGE_INTEGER *)&fpsticks); } + if(nds.idleFrameCounter==0 || oneSecond) + { + //calculate a 16 frame arm9 load average + int load = 0; + for(int i=0;i<16;i++) + load = load/8 + nds.runCycleCollector[(i+nds.idleFrameCounter)&15]*7/8; + load = min(100,max(0,(int)(load*100/1120380))); + sprintf(txt,"(%02d|%02d%%) DeSmuME v%s", fps, load, VERSION); + SetWindowText(hwnd, txt); + } + framesskipped = 0; if (framestoskip > 0)