- oglrender: add some lookup tables, optimize getline.

- add load average calculation and display it in windows (for correlating slow cpu to slow emulation)
- fix bug(?) in texture transformation mode 1. I'm not 100% positive about it.
This commit is contained in:
zeromus 2008-09-13 06:11:12 +00:00
parent ecc0a65749
commit e4f9a0f5dc
5 changed files with 116 additions and 81 deletions

View File

@ -51,6 +51,8 @@
- Add opengl state caching. This is of dubious performance assistance, but it is easy to take out so I am leaving it for now. [zeromus] - Add opengl state caching. This is of dubious performance assistance, but it is easy to take out so I am leaving it for now. [zeromus]
- Add MMU->GPU signal for when vram mappings change, which allows it to assume textures are unchanged unless vram has changed [zeromus] - Add MMU->GPU signal for when vram mappings change, which allows it to assume textures are unchanged unless vram has changed [zeromus]
- Added a bunch of crazy templates to the cpu and mmu which speed up the emu a little by optimizing variable accesses [zeromus] - Added a bunch of crazy templates to the cpu and mmu which speed up the emu a little by optimizing variable accesses [zeromus]
- Add an arm9 cpu load average calculator [zeromus]
? Fix a bug in texture transformation mode 1 [zeromus]
0.7.3 -> 0.8 0.7.3 -> 0.8

View File

@ -149,6 +149,8 @@ int NDS_Init( void) {
nds.ARM9Cycle = 0; nds.ARM9Cycle = 0;
nds.ARM7Cycle = 0; nds.ARM7Cycle = 0;
nds.cycles = 0; nds.cycles = 0;
nds.idleFrameCounter = 0;
memset(nds.runCycleCollector,0,sizeof(nds.runCycleCollector));
MMU_Init(); MMU_Init();
nds.nextHBlank = 3168; nds.nextHBlank = 3168;
nds.VCount = 0; nds.VCount = 0;
@ -887,9 +889,10 @@ NDS_exec(s32 nb, BOOL force)
#endif #endif
for (i = 0; i < 4 && (!force) && (execute); i++) for (i = 0; i < 4 && (!force) && (execute); i++)
{ {
if(NDS_ARM9.waitIRQ) if(NDS_ARM9.waitIRQ) {
nds.ARM9Cycle += 100; nds.ARM9Cycle += 100;
else nds.idleCycles += 100;
} else
//nds.ARM9Cycle += NDS_ARM9.exec(); //nds.ARM9Cycle += NDS_ARM9.exec();
//nds.ARM9Cycle += armcpu_exec(&NDS_ARM9); //nds.ARM9Cycle += armcpu_exec(&NDS_ARM9);
nds.ARM9Cycle += armcpu_exec<0>(); nds.ARM9Cycle += armcpu_exec<0>();
@ -1030,6 +1033,11 @@ NDS_exec(s32 nb, BOOL force)
T1WriteWord(MMU.ARM7_REG, 4, T1ReadWord(MMU.ARM7_REG, 4) | 1); T1WriteWord(MMU.ARM7_REG, 4, T1ReadWord(MMU.ARM7_REG, 4) | 1);
NDS_ARM9VBlankInt(); NDS_ARM9VBlankInt();
NDS_ARM7VBlankInt(); NDS_ARM7VBlankInt();
nds.runCycleCollector[nds.idleFrameCounter] = 1120380-nds.idleCycles;
nds.idleFrameCounter++;
nds.idleFrameCounter &= 15;
nds.idleCycles = 0;
if(MMU.DMAStartTime[0][0] == 1) if(MMU.DMAStartTime[0][0] == 1)
MMU_doDMA(0, 0); MMU_doDMA(0, 0);

View File

@ -118,6 +118,12 @@ typedef struct
u16 touchX; u16 touchX;
u16 touchY; u16 touchY;
s32 idleCycles;
s32 runCycleCollector[16];
s32 idleFrameCounter;
} NDSSystem; } NDSSystem;
/** \brief A touchscreen calibration point. /** \brief A touchscreen calibration point.

View File

@ -27,6 +27,7 @@
#include <math.h> #include <math.h>
#include <stdlib.h> #include <stdlib.h>
#include <string.h> #include <string.h>
#include <assert.h>
#ifndef DESMUME_COCOA #ifndef DESMUME_COCOA
#define WIN32_LEAN_AND_MEAN #define WIN32_LEAN_AND_MEAN
@ -58,12 +59,6 @@
static __declspec(align(16)) unsigned char GPU_screen3D [256*256*4]={0}; static __declspec(align(16)) unsigned char GPU_screen3D [256*256*4]={0};
static __declspec(align(16)) unsigned char GPU_screenStencil[256*256]={0}; static __declspec(align(16)) unsigned char GPU_screenStencil[256*256]={0};
// Acceleration tables
static float* float16table = NULL;
static float* float10Table = NULL;
static float* float10RelTable = NULL;
static float* normalTable = NULL;
// Matrix stack handling // Matrix stack handling
static __declspec(align(16)) MatrixStack mtxStack[4]; static __declspec(align(16)) MatrixStack mtxStack[4];
static __declspec(align(16)) float mtxCurrent [4][16]; static __declspec(align(16)) float mtxCurrent [4][16];
@ -113,13 +108,51 @@ static const u8 material_5bit_to_8bit[] = {
0xC6, 0xCE, 0xD6, 0xDE, 0xE7, 0xEF, 0xF7, 0xFF 0xC6, 0xCE, 0xD6, 0xDE, 0xE7, 0xEF, 0xF7, 0xFF
}; };
static const u8 material_3bit_to_8bit[] = { static const u8 material_3bit_to_8bit[] = {
0x00, 0x24, 0x49, 0x6D, 0x92, 0xB6, 0xDB, 0xFF 0x00, 0x24, 0x49, 0x6D, 0x92, 0xB6, 0xDB, 0xFF
}; };
// Acceleration tables
static float float16table[65536];
static float float10Table[1024];
static float float10RelTable[1024];
static float normalTable[1024];
static u32 color_15bit_to_24bit[32768];
//produce a 32bpp color from a DS RGB16
#define RGB16TO32(col,alpha) (((alpha)<<24) | ((((col) & 0x7C00)>>7)<<16) | ((((col) & 0x3E0)>>2)<<8) | (((col) & 0x1F)<<3)) #define RGB16TO32(col,alpha) (((alpha)<<24) | ((((col) & 0x7C00)>>7)<<16) | ((((col) & 0x3E0)>>2)<<8) | (((col) & 0x1F)<<3))
//make a table out of this:
#define RGB15TO32(col,alpha8) ( ((alpha8)<<24) | (material_5bit_to_8bit[((col)>>10)&0x1F]<<16) | (material_5bit_to_8bit[((col)>>5)&0x1F]<<8) | material_5bit_to_8bit[(col)&0x1F] )
//produce a 32bpp color from a ds RGB15 plus an 8bit alpha, using a table
#define RGB15TO32(col,alpha8) ( ((alpha8)<<24) | color_15bit_to_24bit[col] )
//produce a 32bpp color from a ds RGB15 plus an 8bit alpha, not using a table (but using other tables)
#define RGB15TO32_DIRECT(col,alpha8) ( ((alpha8)<<24) | (material_5bit_to_8bit[((col)>>10)&0x1F]<<16) | (material_5bit_to_8bit[((col)>>5)&0x1F]<<8) | material_5bit_to_8bit[(col)&0x1F] )
void makeTables() {
//produce the color bits of a 32bpp color from a DS RGB15 using bit logic (internal use only)
#define RGB15TO24_BITLOGIC(col) ( (material_5bit_to_8bit[((col)>>10)&0x1F]<<16) | (material_5bit_to_8bit[((col)>>5)&0x1F]<<8) | material_5bit_to_8bit[(col)&0x1F] )
for(int i=0;i<32768;i++)
color_15bit_to_24bit[i] = RGB15TO24_BITLOGIC((u16)i);
for (int i = 0; i < 65536; i++)
float16table[i] = fix2float((signed short)i);
for (int i = 0; i < 1024; i++)
float10Table[i] = ((signed short)(i<<6)) / (float)(1<<12);
for (int i = 0; i < 1024; i++)
float10RelTable[i] = ((signed short)(i<<6)) / (float)(1<<18);
for (int i = 0; i < 1024; i++)
normalTable[i] = ((signed short)(i<<6)) / (float)(1<<15);
}
static unsigned short matrixMode[2] = {GL_PROJECTION, GL_MODELVIEW}; static unsigned short matrixMode[2] = {GL_PROJECTION, GL_MODELVIEW};
static short mode = 0; static short mode = 0;
@ -442,29 +475,7 @@ char NDS_glInit(void)
twiddleLists(); twiddleLists();
// Precalculate some tables, to avoid pushing data to the FPU and back for conversion // Precalculate some tables, to avoid pushing data to the FPU and back for conversion
float16table = (float*) malloc (sizeof(float)*65536); makeTables();
for (i = 0; i < 65536; i++)
{
float16table[i] = fix2float((signed short)i);
}
float10RelTable = (float*) malloc (sizeof(float)*1024);
for (i = 0; i < 1024; i++)
{
float10RelTable[i] = ((signed short)(i<<6)) / (float)(1<<18);
}
float10Table = (float*) malloc (sizeof(float)*1024);
for (i = 0; i < 1024; i++)
{
float10Table[i] = ((signed short)(i<<6)) / (float)(1<<12);
}
normalTable = (float*) malloc (sizeof(float)*1024);
for (i = 0; i < 1024; i++)
{
normalTable[i] = ((signed short)(i<<6)) / (float)(1<<15);
}
MatrixStackSetMaxSize(&mtxStack[0], 1); // Projection stack MatrixStackSetMaxSize(&mtxStack[0], 1); // Projection stack
MatrixStackSetMaxSize(&mtxStack[1], 31); // Coordinate stack MatrixStackSetMaxSize(&mtxStack[1], 31); // Coordinate stack
@ -881,7 +892,7 @@ static void DebugDumpTexture(int which)
//================================================================================ //================================================================================
static int lastTexture = -1; static int lastTexture = -1;
__forceinline void setTexture(unsigned int format, unsigned int texpal) void setTexture(unsigned int format, unsigned int texpal)
{ {
int palSize[7]={32,4,16,256,0,8,32768}; int palSize[7]={32,4,16,256,0,8,32768};
int i=0; int i=0;
@ -978,10 +989,6 @@ __forceinline void setTexture(unsigned int format, unsigned int texpal)
glLoadIdentity (); glLoadIdentity ();
glScaled (texcache[i].invSizeX, texcache[i].invSizeY, 1.0f); glScaled (texcache[i].invSizeX, texcache[i].invSizeY, 1.0f);
if(i==62 || textureMode==1) {
int zzz=9;
}
//printlog("Texture %03i - format=%08X; pal=%04X (mode %X, width %04i, height %04i)\n",i, texcache[i].frm, texcache[i].pal, texcache[i].mode, sizeX, sizeY); //printlog("Texture %03i - format=%08X; pal=%04X (mode %X, width %04i, height %04i)\n",i, texcache[i].frm, texcache[i].pal, texcache[i].mode, sizeX, sizeY);
//============================================================================ Texture render //============================================================================ Texture render
@ -1213,7 +1220,7 @@ __forceinline void setTexture(unsigned int format, unsigned int texpal)
{ {
unsigned short c = map[x]; unsigned short c = map[x];
int alpha = ((c&0x8000)?255:0); int alpha = ((c&0x8000)?255:0);
*dst = RGB15TO32(c,alpha); *dst = RGB15TO32(c&0x7FFF,alpha);
dst++; dst++;
txt_slot_current_size-=2;; txt_slot_current_size-=2;;
@ -1643,10 +1650,16 @@ __forceinline void NDS_glTexCoord(unsigned long val)
if (texCoordinateTransform == 1) if (texCoordinateTransform == 1)
{ {
//last_s =_s*mtxCurrent[3][0] + _t*mtxCurrent[3][4] +
// 0.0625f*mtxCurrent[3][8] + 0.0625f*mtxCurrent[3][12];
//last_t =_s*mtxCurrent[3][1] + _t*mtxCurrent[3][5] +
// 0.0625f*mtxCurrent[3][9] + 0.0625f*mtxCurrent[3][13];
//zero 9/11/08 - I dunno... I think it needs to be like this to make things look right
last_s =_s*mtxCurrent[3][0] + _t*mtxCurrent[3][4] + last_s =_s*mtxCurrent[3][0] + _t*mtxCurrent[3][4] +
0.0625f*mtxCurrent[3][8] + 0.0625f*mtxCurrent[3][12]; mtxCurrent[3][8] + mtxCurrent[3][12];
last_t =_s*mtxCurrent[3][1] + _t*mtxCurrent[3][5] + last_t =_s*mtxCurrent[3][1] + _t*mtxCurrent[3][5] +
0.0625f*mtxCurrent[3][9] + 0.0625f*mtxCurrent[3][13]; mtxCurrent[3][9] + mtxCurrent[3][13];
} }
else else
{ {
@ -1972,15 +1985,17 @@ void GL_ReadFramebuffer()
//NHerve mod3 - Fixed blending with 2D backgrounds (New Super Mario Bros looks better) //NHerve mod3 - Fixed blending with 2D backgrounds (New Super Mario Bros looks better)
//zeromus post-mod3: fix even better //zeromus post-mod3: fix even better
__forceinline void NDS_glGetLine (int line, unsigned short * dst) static void NDS_glGetLine (int line, u16* dst)
{ {
assert(line<192 && line>=0);
if(needRefreshFramebuffer) { if(needRefreshFramebuffer) {
needRefreshFramebuffer = false; needRefreshFramebuffer = false;
GL_ReadFramebuffer(); GL_ReadFramebuffer();
} }
int i, t;
u8 *screen3D = (u8 *)&GPU_screen3D [(191-(line%192))*1024]; u8 *screen3D = (u8*)GPU_screen3D+((191-line)<<10);
u8 *screenStencil = (u8*)&GPU_screenStencil[(191-(line%192))*256]; u8 *screenStencil = (u8*)GPU_screenStencil+((191-line)<<8);
//the renderer clears the stencil to 0 //the renderer clears the stencil to 0
//then it sets it to 1 whenever it renders a pixel that passes the alpha test //then it sets it to 1 whenever it renders a pixel that passes the alpha test
@ -1992,33 +2007,27 @@ __forceinline void NDS_glGetLine (int line, unsigned short * dst)
//this alpha compositing blending logic isnt thought through at all //this alpha compositing blending logic isnt thought through at all
//someone needs to think about what bitdepth it should take place at and how to do it efficiently //someone needs to think about what bitdepth it should take place at and how to do it efficiently
u32 a,r,g,b,stencil,oldcolor,oldr,oldg,oldb; for(int i = 0; i < 256; i++)
for(i = 0, t=0; i < 256; i++)
{ {
stencil = screenStencil[i]; u32 stencil = screenStencil[i];
//you would use this if you wanted to use the stencil buffer to make decisions here //you would use this if you wanted to use the stencil buffer to make decisions here
if(!stencil) continue; if(!stencil) continue;
t=i*4; int t=i<<2;
r = screen3D[t+0]; u32 r = screen3D[t+0];
g = screen3D[t+1]; u32 g = screen3D[t+1];
b = screen3D[t+2]; u32 b = screen3D[t+2];
a = screen3D[t+3]; u32 a = screen3D[t+3];
if(a != 0xFF && a != 0) { u32 oldcolor = RGB15TO32(dst[i],0);
int zzz=9; u32 oldr = oldcolor&0xFF;
} u32 oldg = (oldcolor>>8)&0xFF;
u32 oldb = (oldcolor>>16)&0xFF;
oldcolor = RGB15TO32(dst[i],0); r = (r*a + oldr*(255-a)) >> 8;
oldr = oldcolor&0xFF; g = (g*a + oldg*(255-a)) >> 8;
oldg = (oldcolor>>8)&0xFF; b = (b*a + oldb*(255-a)) >> 8;
oldb = (oldcolor>>16)&0xFF;
r = (r*a + oldr*(255-a)) / 255;
g = (g*a + oldg*(255-a)) / 255;
b = (b*a + oldb*(255-a)) / 255;
r=min(255,r); r=min(255,r);
g=min(255,g); g=min(255,g);

View File

@ -541,15 +541,25 @@ DWORD WINAPI run( LPVOID lpParameter)
fpsframecount++; fpsframecount++;
QueryPerformanceCounter((LARGE_INTEGER *)&curticks); QueryPerformanceCounter((LARGE_INTEGER *)&curticks);
if(curticks >= fpsticks + freq) // TODO: print fps on screen in DDraw bool oneSecond = curticks >= fpsticks + freq;
if(oneSecond) // TODO: print fps on screen in DDraw
{ {
fps = fpsframecount; fps = fpsframecount;
sprintf(txt,"(%d) DeSmuME v%s", fps, VERSION);
SetWindowText(hwnd, txt);
fpsframecount = 0; fpsframecount = 0;
QueryPerformanceCounter((LARGE_INTEGER *)&fpsticks); QueryPerformanceCounter((LARGE_INTEGER *)&fpsticks);
} }
if(nds.idleFrameCounter==0 || oneSecond)
{
//calculate a 16 frame arm9 load average
int load = 0;
for(int i=0;i<16;i++)
load = load/8 + nds.runCycleCollector[(i+nds.idleFrameCounter)&15]*7/8;
load = min(100,max(0,(int)(load*100/1120380)));
sprintf(txt,"(%02d|%02d%%) DeSmuME v%s", fps, load, VERSION);
SetWindowText(hwnd, txt);
}
framesskipped = 0; framesskipped = 0;
if (framestoskip > 0) if (framestoskip > 0)