From 48325223152bc8eebf45ece16527f45ada7e6b3d Mon Sep 17 00:00:00 2001 From: zeromus Date: Tue, 5 May 2009 07:30:27 +0000 Subject: [PATCH] a handful of tiny optimizations and important bugfixes in gfx3d savestates. also add a MSC_FORCEINLINE macro for people to use instead when they don't like my FORCEINLINES and are thinking about removing them. at least then we know which ones didn't work on gcc. --- desmume/src/MMU.cpp | 2 +- desmume/src/MMU.h | 10 ++--- desmume/src/NDSSystem.cpp | 4 +- desmume/src/armcpu.cpp | 2 +- desmume/src/gfx3d.cpp | 67 ++++++++++++++++++++++++--------- desmume/src/gfx3d.h | 2 +- desmume/src/mem.h | 49 +++++++++++++++++------- desmume/src/types.h | 16 ++++++++ desmume/src/windows/disView.cpp | 9 ++++- desmume/src/windows/disView.h | 2 +- 10 files changed, 120 insertions(+), 43 deletions(-) diff --git a/desmume/src/MMU.cpp b/desmume/src/MMU.cpp index 044733d50..07f8426d2 100644 --- a/desmume/src/MMU.cpp +++ b/desmume/src/MMU.cpp @@ -529,7 +529,7 @@ static inline void MMU_VRAMmapRefreshBank(const int bank) u8 en = VRAMBankCnt & 0x80; if(!en) return; - int mst,ofs; + int mst,ofs=0; switch(bank) { case VRAM_BANK_A: case VRAM_BANK_B: diff --git a/desmume/src/MMU.h b/desmume/src/MMU.h index d4e987951..a650dd035 100644 --- a/desmume/src/MMU.h +++ b/desmume/src/MMU.h @@ -274,10 +274,10 @@ FORCEINLINE u16 _MMU_read16(const int PROCNUM, const MMU_ACCESS_TYPE AT, const u if(PROCNUM==ARMCPU_ARM9 && AT == MMU_AT_CODE) { if ((addr & 0x0F000000) == 0x02000000) - return T1ReadWord( ARM9Mem.MAIN_MEM, addr & _MMU_MAIN_MEM_MASK); + return T1ReadWord_guaranteedAligned( ARM9Mem.MAIN_MEM, addr & _MMU_MAIN_MEM_MASK); if(addr<0x02000000) - return T1ReadWord(ARM9Mem.ARM9_ITCM, addr&0x7FFF); + return T1ReadWord_guaranteedAligned(ARM9Mem.ARM9_ITCM, addr&0x7FFF); goto dunno; } @@ -297,16 +297,16 @@ dunno: else return _MMU_ARM7_read16(addr); } -FORCEINLINE u32 _MMU_read32(int PROCNUM, const MMU_ACCESS_TYPE AT, const u32 addr) { +FORCEINLINE u32 _MMU_read32(const int 
PROCNUM, const MMU_ACCESS_TYPE AT, const u32 addr) { //special handling for execution from arm9, since we spend so much time in there if(PROCNUM==ARMCPU_ARM9 && AT == MMU_AT_CODE) { if ( (addr & 0x0F000000) == 0x02000000) - return T1ReadLong( ARM9Mem.MAIN_MEM, addr & _MMU_MAIN_MEM_MASK); + return T1ReadLong_guaranteedAligned( ARM9Mem.MAIN_MEM, addr & _MMU_MAIN_MEM_MASK); if(addr<0x02000000) - return T1ReadLong(ARM9Mem.ARM9_ITCM, addr&0x7FFF); + return T1ReadLong_guaranteedAligned(ARM9Mem.ARM9_ITCM, addr&0x7FFF); goto dunno; } diff --git a/desmume/src/NDSSystem.cpp b/desmume/src/NDSSystem.cpp index 564b140e6..32e7c0449 100644 --- a/desmume/src/NDSSystem.cpp +++ b/desmume/src/NDSSystem.cpp @@ -1643,7 +1643,7 @@ u32 NDS_exec(s32 nb) } #ifdef _WIN32 #ifdef DEVELOPER - DisassemblerTools_Refresh(ARMCPU_ARM9); + DisassemblerTools_Refresh(); #endif #endif } @@ -1691,7 +1691,7 @@ u32 NDS_exec(s32 nb) } #ifdef _WIN32 #ifdef DEVELOPER - DisassemblerTools_Refresh(ARMCPU_ARM7); + DisassemblerTools_Refresh(); #endif #endif } diff --git a/desmume/src/armcpu.cpp b/desmume/src/armcpu.cpp index 31db6a191..729ccc54b 100644 --- a/desmume/src/armcpu.cpp +++ b/desmume/src/armcpu.cpp @@ -365,7 +365,7 @@ template static u32 armcpu_prefetch() { - armcpu_t* armcpu = &ARMPROC; + armcpu_t* const armcpu = &ARMPROC; #ifdef GDB_STUB u32 temp_instruction; #endif diff --git a/desmume/src/gfx3d.cpp b/desmume/src/gfx3d.cpp index c286909f4..606a69eba 100644 --- a/desmume/src/gfx3d.cpp +++ b/desmume/src/gfx3d.cpp @@ -124,7 +124,9 @@ static float normalTable[1024]; #define fix10_2float(v) (((float)((s32)(v))) / (float)(1<<9)) CACHE_ALIGN u16 gfx3d_convertedScreen[256*192]; -CACHE_ALIGN u8 gfx3d_convertedAlpha[256*192]; + +//this extra *2 is a HACK to salvage some savestates. remove me when the savestate format changes. 
+CACHE_ALIGN u8 gfx3d_convertedAlpha[256*192*2]; // Matrix stack handling static CACHE_ALIGN MatrixStack mtxStack[4] = { @@ -1376,7 +1378,7 @@ void gfx3d_glFlush(u32 v) gfx3d.wbuffer = BIT1(v); } -static int _CDECL_ gfx3d_ysort_compare(const void * elem1, const void * elem2) +static int _CDECL_ gfx3d_ysort_compare_old_qsort(const void * elem1, const void * elem2) { int num1 = *(int*)elem1; int num2 = *(int*)elem2; @@ -1396,6 +1398,23 @@ static int _CDECL_ gfx3d_ysort_compare(const void * elem1, const void * elem2) return 0; } +static bool gfx3d_ysort_compare(int num1, int num2) +{ + const POLY &poly1 = polylist->list[num1]; + const POLY &poly2 = polylist->list[num2]; + + if(poly1.maxy > poly2.maxy) + return true; + else if(poly1.maxy < poly2.maxy) + return false; + else if(poly1.miny < poly2.miny) + return true; + else if(poly1.miny > poly2.miny) + return false; + else + return false; //equal should always return false "strict weak ordering" +} + void gfx3d_VBlankSignal() { @@ -1448,16 +1467,21 @@ void gfx3d_VBlankSignal() gfx3d.indexlist[ctr++] = i; } + //========NOT SURE YET WHETHER I NEED A STABLE SORT======== + //now we have to sort the opaque polys by y-value. //should this be done after clipping?? + //does this need to be a stable sort??? //test case: harvest moon island of happiness character cretor UI - qsort(gfx3d.indexlist, opaqueCount, 4, gfx3d_ysort_compare); + //std::stable_sort(gfx3d.indexlist, gfx3d.indexlist + opaqueCount, gfx3d_ysort_compare); + qsort(gfx3d.indexlist, opaqueCount, 4, gfx3d_ysort_compare_old_qsort); if(!gfx3d.sortmode) { //if we are autosorting translucent polys, we need to do this also //TODO - this is unverified behavior. 
need a test case - qsort(gfx3d.indexlist + opaqueCount, polycount - opaqueCount, 4, gfx3d_ysort_compare); + //std::stable_sort(gfx3d.indexlist + opaqueCount, gfx3d.indexlist + polycount, gfx3d_ysort_compare); + qsort(gfx3d.indexlist + opaqueCount, polycount - opaqueCount, 4, gfx3d_ysort_compare_old_qsort); } //switch to the new lists @@ -2256,14 +2280,6 @@ SFORMAT SF_GFX3D[]={ { "GMOD", 4, 1, &mode}, { "GMTM", 4,16, mtxTemporal}, { "GMCU", 4,64, mtxCurrent}, - { "GM0P", 4, 1, &mtxStack[0].position}, - { "GM0M", 4,16, mtxStack[0].matrix}, - { "GM1P", 4, 1, &mtxStack[1].position}, - { "GM1M", 4,496,mtxStack[1].matrix}, - { "GM2P", 4, 1, &mtxStack[2].position}, - { "GM2M", 4,496,mtxStack[2].matrix}, - { "GM3P", 4, 1, &mtxStack[3].position}, - { "GM3M", 4,16, mtxStack[3].matrix}, { "ML4I", 1, 1, &ML4x4ind}, { "ML3I", 1, 1, &ML4x3ind}, { "MM4I", 1, 1, &MM4x4ind}, @@ -2289,8 +2305,8 @@ SFORMAT SF_GFX3D[]={ { "GLPT", 4, 1, &PTind}, { "GLPC", 4, 4, PTcoords}, { "GLF9", 4, 1, &gxFIFO.tail}, - { "GLF9", 1, 261, &gxFIFO.cmd}, - { "GLF9", 4, 261, &gxFIFO.param}, + { "GLF9", 1, 261, &gxFIFO.cmd[0]}, + { "GLF9", 4, 261, &gxFIFO.param[0]}, { "GCOL", 1, 4, colorRGB}, { "GLCO", 4, 4, lightColor}, { "GLDI", 4, 4, lightDirection}, @@ -2323,7 +2339,7 @@ SFORMAT SF_GFX3D[]={ { "GTVM", 4, 4, tempVertInfo.map}, { "GTVF", 4, 1, &tempVertInfo.first}, { "G3CS", 2, 256*192, gfx3d_convertedScreen}, - { "G3CA", 2, 256*192, gfx3d_convertedAlpha}, + { "G3CA", 2, 256*192, gfx3d_convertedAlpha}, { 0 } }; @@ -2331,7 +2347,7 @@ SFORMAT SF_GFX3D[]={ void gfx3d_savestate(std::ostream* os) { //version - write32le(1,os); + write32le(2,os); //dump the render lists OSWRITE(vertlist->count); @@ -2340,6 +2356,13 @@ void gfx3d_savestate(std::ostream* os) OSWRITE(polylist->count); for(int i=0;icount;i++) polylist->list[i].save(os); + + for(int i=0;i<4;i++) + { + OSWRITE(mtxStack[i].position); + for(int j=0;j=1) { OSREAD(vertlist->count); for(int i=0;icount;i++) @@ -2372,6 +2395,16 @@ bool 
gfx3d_loadstate(std::istream* is, int size) polylist->list[i].load(is); } + if(version>=2) + { + for(int i=0;i<4;i++) + { + OSREAD(mtxStack[i].position); + for(int j=0;jcount=0; diff --git a/desmume/src/gfx3d.h b/desmume/src/gfx3d.h index b612c7980..1f305e711 100644 --- a/desmume/src/gfx3d.h +++ b/desmume/src/gfx3d.h @@ -232,7 +232,7 @@ extern CACHE_ALIGN const u8 alpha_5bit_to_4bit[32]; //these contain the 3d framebuffer converted into the most useful format //they are stored here instead of in the renderers in order to consolidate the buffers extern CACHE_ALIGN u16 gfx3d_convertedScreen[256*192]; -extern CACHE_ALIGN u8 gfx3d_convertedAlpha[256*192]; +extern CACHE_ALIGN u8 gfx3d_convertedAlpha[256*192*2]; //see cpp for explanation of illogical *2 //GE commands: void gfx3d_glViewPort(u32 v); diff --git a/desmume/src/mem.h b/desmume/src/mem.h index 10a01690e..261f7c138 100644 --- a/desmume/src/mem.h +++ b/desmume/src/mem.h @@ -22,16 +22,27 @@ #define MEM_H #include +#include #include "types.h" /* Type 1 Memory, faster for byte (8 bits) accesses */ -static INLINE u8 T1ReadByte(u8 * mem, u32 addr) +static INLINE u8 T1ReadByte(u8* const mem, const u32 addr) { return mem[addr]; } -static INLINE u16 T1ReadWord(void * mem, u32 addr) +static INLINE u16 T1ReadWord_guaranteedAligned(void* const mem, const u32 addr) +{ + assert((addr&1)==0); +#ifdef WORDS_BIGENDIAN + return (((u8*)mem)[addr + 1] << 8) | ((u8*)mem)[addr]; +#else + return *(u16*)((u8*)mem + addr); +#endif +} + +static INLINE u16 T1ReadWord(void* const mem, const u32 addr) { #ifdef WORDS_BIGENDIAN return (((u8*)mem)[addr + 1] << 8) | ((u8*)mem)[addr]; @@ -40,7 +51,19 @@ static INLINE u16 T1ReadWord(void * mem, u32 addr) #endif } -static INLINE u32 T1ReadLong(u8 * mem, u32 addr) +static INLINE u32 T1ReadLong_guaranteedAligned(u8* const mem, const u32 addr) +{ + assert((addr&3)==0); +#ifdef WORDS_BIGENDIAN + return (mem[addr + 3] << 24 | mem[addr + 2] << 16 | + mem[addr + 1] << 8 | mem[addr]); +#else + return 
*(u32*)(mem + addr); +#endif +} + + +static INLINE u32 T1ReadLong(u8* const mem, const u32 addr) { #ifdef WORDS_BIGENDIAN return (mem[addr + 3] << 24 | mem[addr + 2] << 16 | @@ -50,7 +73,7 @@ static INLINE u32 T1ReadLong(u8 * mem, u32 addr) #endif } -static INLINE u64 T1ReadQuad(u8 * mem, u32 addr) +static INLINE u64 T1ReadQuad(u8* const mem, const u32 addr) { #ifdef WORDS_BIGENDIAN return (u64(mem[addr + 7]) << 56 | u64(mem[addr + 6]) << 48 | @@ -62,12 +85,12 @@ static INLINE u64 T1ReadQuad(u8 * mem, u32 addr) #endif } -static INLINE void T1WriteByte(u8 * mem, u32 addr, u8 val) +static INLINE void T1WriteByte(u8* const mem, const u32 addr, const u8 val) { mem[addr] = val; } -static INLINE void T1WriteWord(u8 * mem, u32 addr, u16 val) +static INLINE void T1WriteWord(u8* const mem, const u32 addr, const u16 val) { #ifdef WORDS_BIGENDIAN mem[addr + 1] = val >> 8; @@ -77,7 +100,7 @@ static INLINE void T1WriteWord(u8 * mem, u32 addr, u16 val) #endif } -static INLINE void T1WriteLong(u8 * mem, u32 addr, u32 val) +static INLINE void T1WriteLong(u8* const mem, const u32 addr, const u32 val) { #ifdef WORDS_BIGENDIAN mem[addr + 3] = val >> 24; @@ -91,7 +114,7 @@ static INLINE void T1WriteLong(u8 * mem, u32 addr, u32 val) /* Type 2 Memory, faster for word (16 bits) accesses */ -static INLINE u8 T2ReadByte(u8 * mem, u32 addr) +static INLINE u8 T2ReadByte(u8* const mem, const u32 addr) { #ifdef WORDS_BIGENDIAN return mem[addr ^ 1]; @@ -100,12 +123,12 @@ static INLINE u8 T2ReadByte(u8 * mem, u32 addr) #endif } -static INLINE u16 T2ReadWord(u8 * mem, u32 addr) +static INLINE u16 T2ReadWord(u8* const mem, const u32 addr) { return *((u16 *) (mem + addr)); } -static INLINE u32 T2ReadLong(u8 * mem, u32 addr) +static INLINE u32 T2ReadLong(u8* const mem, const u32 addr) { #ifdef WORDS_BIGENDIAN return *((u16 *) (mem + addr + 2)) << 16 | *((u16 *) (mem + addr)); @@ -114,7 +137,7 @@ static INLINE u32 T2ReadLong(u8 * mem, u32 addr) #endif } -static INLINE void T2WriteByte(u8 * mem, u32 
addr, u8 val) +static INLINE void T2WriteByte(u8* const mem, const u32 addr, const u8 val) { #ifdef WORDS_BIGENDIAN mem[addr ^ 1] = val; @@ -123,12 +146,12 @@ static INLINE void T2WriteByte(u8 * mem, u32 addr, u8 val) #endif } -static INLINE void T2WriteWord(u8 * mem, u32 addr, u16 val) +static INLINE void T2WriteWord(u8* const mem, const u32 addr, const u16 val) { *((u16 *) (mem + addr)) = val; } -static INLINE void T2WriteLong(u8 * mem, u32 addr, u32 val) +static INLINE void T2WriteLong(u8* const mem, const u32 addr, const u32 val) { #ifdef WORDS_BIGENDIAN *((u16 *) (mem + addr + 2)) = val >> 16; diff --git a/desmume/src/types.h b/desmume/src/types.h index abaf1152e..2d283371e 100644 --- a/desmume/src/types.h +++ b/desmume/src/types.h @@ -131,11 +131,27 @@ #ifndef FORCEINLINE #if defined(_MSC_VER) || defined(__INTEL_COMPILER) #define FORCEINLINE __forceinline +#define MSC_FORCEINLINE __forceinline #else #define FORCEINLINE INLINE +#define MSC_FORCEINLINE #endif #endif +#ifndef _PREFETCH +#if (defined(_MSC_VER) || defined(__INTEL_COMPILER)) && !defined(NOSSE2) +#include +#include +#define _PREFETCH(X) _mm_prefetch((char*)(X),_MM_HINT_T0); +#define _PREFETCHNTA(X) _mm_prefetch((char*)(X),_MM_HINT_NTA); +#else +#define _PREFETCH(X) {} +#define _PREFETCHNTA(X) {} +#endif +#endif + + + #if defined(__LP64__) typedef unsigned char u8; typedef unsigned short u16; diff --git a/desmume/src/windows/disView.cpp b/desmume/src/windows/disView.cpp index f6998591a..d8e3df0e6 100644 --- a/desmume/src/windows/disView.cpp +++ b/desmume/src/windows/disView.cpp @@ -710,7 +710,8 @@ BOOL CALLBACK ViewDisasm_ARM9Proc (HWND hwnd, UINT message, WPARAM wParam, LPARA return FALSE; } -void DisassemblerTools_Refresh(u8 proc) +template +FORCEINLINE void DisassemblerTools_Refresh() { if (DisViewWnd[proc] == NULL) return; if (proc == 0) @@ -725,4 +726,8 @@ void DisassemblerTools_Refresh(u8 proc) InvalidateRect(DisViewWnd[proc], NULL, FALSE); DisView7->autogo=false; } -} \ No newline at end of 
file +} + +//these templates needed to be instantiated manually +template void DisassemblerTools_Refresh<0>(); +template void DisassemblerTools_Refresh<1>(); \ No newline at end of file diff --git a/desmume/src/windows/disView.h b/desmume/src/windows/disView.h index 96e447b4a..ad2edc913 100644 --- a/desmume/src/windows/disView.h +++ b/desmume/src/windows/disView.h @@ -30,6 +30,6 @@ extern LRESULT CALLBACK ViewDisasm_ARM7BoxProc(HWND hwnd, UINT msg, WPARAM wPara extern BOOL CALLBACK ViewDisasm_ARM9Proc (HWND hwnd, UINT message, WPARAM wParam, LPARAM lParam); extern LRESULT CALLBACK ViewDisasm_ARM9BoxProc(HWND hwnd, UINT msg, WPARAM wParam, LPARAM lParam); -extern void DisassemblerTools_Refresh(u8 proc); +template void FORCEINLINE DisassemblerTools_Refresh(); #endif