A handful of tiny optimizations and important bugfixes in gfx3d savestates. Also add an MSC_FORCEINLINE macro for people to use instead when they dislike my FORCEINLINEs and are thinking about removing them; at least then we know which ones didn't work on gcc.

This commit is contained in:
zeromus 2009-05-05 07:30:27 +00:00
parent 712b23d512
commit 4832522315
10 changed files with 120 additions and 43 deletions

View File

@ -529,7 +529,7 @@ static inline void MMU_VRAMmapRefreshBank(const int bank)
u8 en = VRAMBankCnt & 0x80;
if(!en) return;
int mst,ofs;
int mst,ofs=0;
switch(bank) {
case VRAM_BANK_A:
case VRAM_BANK_B:

View File

@ -274,10 +274,10 @@ FORCEINLINE u16 _MMU_read16(const int PROCNUM, const MMU_ACCESS_TYPE AT, const u
if(PROCNUM==ARMCPU_ARM9 && AT == MMU_AT_CODE)
{
if ((addr & 0x0F000000) == 0x02000000)
return T1ReadWord( ARM9Mem.MAIN_MEM, addr & _MMU_MAIN_MEM_MASK);
return T1ReadWord_guaranteedAligned( ARM9Mem.MAIN_MEM, addr & _MMU_MAIN_MEM_MASK);
if(addr<0x02000000)
return T1ReadWord(ARM9Mem.ARM9_ITCM, addr&0x7FFF);
return T1ReadWord_guaranteedAligned(ARM9Mem.ARM9_ITCM, addr&0x7FFF);
goto dunno;
}
@ -297,16 +297,16 @@ dunno:
else return _MMU_ARM7_read16(addr);
}
FORCEINLINE u32 _MMU_read32(int PROCNUM, const MMU_ACCESS_TYPE AT, const u32 addr) {
FORCEINLINE u32 _MMU_read32(const int PROCNUM, const MMU_ACCESS_TYPE AT, const u32 addr) {
//special handling for execution from arm9, since we spend so much time in there
if(PROCNUM==ARMCPU_ARM9 && AT == MMU_AT_CODE)
{
if ( (addr & 0x0F000000) == 0x02000000)
return T1ReadLong( ARM9Mem.MAIN_MEM, addr & _MMU_MAIN_MEM_MASK);
return T1ReadLong_guaranteedAligned( ARM9Mem.MAIN_MEM, addr & _MMU_MAIN_MEM_MASK);
if(addr<0x02000000)
return T1ReadLong(ARM9Mem.ARM9_ITCM, addr&0x7FFF);
return T1ReadLong_guaranteedAligned(ARM9Mem.ARM9_ITCM, addr&0x7FFF);
goto dunno;
}

View File

@ -1643,7 +1643,7 @@ u32 NDS_exec(s32 nb)
}
#ifdef _WIN32
#ifdef DEVELOPER
DisassemblerTools_Refresh(ARMCPU_ARM9);
DisassemblerTools_Refresh<ARMCPU_ARM9>();
#endif
#endif
}
@ -1691,7 +1691,7 @@ u32 NDS_exec(s32 nb)
}
#ifdef _WIN32
#ifdef DEVELOPER
DisassemblerTools_Refresh(ARMCPU_ARM7);
DisassemblerTools_Refresh<ARMCPU_ARM7>();
#endif
#endif
}

View File

@ -365,7 +365,7 @@ template<u32 PROCNUM>
static u32
armcpu_prefetch()
{
armcpu_t* armcpu = &ARMPROC;
armcpu_t* const armcpu = &ARMPROC;
#ifdef GDB_STUB
u32 temp_instruction;
#endif

View File

@ -124,7 +124,9 @@ static float normalTable[1024];
#define fix10_2float(v) (((float)((s32)(v))) / (float)(1<<9))
CACHE_ALIGN u16 gfx3d_convertedScreen[256*192];
CACHE_ALIGN u8 gfx3d_convertedAlpha[256*192];
//this extra *2 is a HACK to salvage some savestates. remove me when the savestate format changes.
CACHE_ALIGN u8 gfx3d_convertedAlpha[256*192*2];
// Matrix stack handling
static CACHE_ALIGN MatrixStack mtxStack[4] = {
@ -1376,7 +1378,7 @@ void gfx3d_glFlush(u32 v)
gfx3d.wbuffer = BIT1(v);
}
static int _CDECL_ gfx3d_ysort_compare(const void * elem1, const void * elem2)
static int _CDECL_ gfx3d_ysort_compare_old_qsort(const void * elem1, const void * elem2)
{
int num1 = *(int*)elem1;
int num2 = *(int*)elem2;
@ -1396,6 +1398,23 @@ static int _CDECL_ gfx3d_ysort_compare(const void * elem1, const void * elem2)
return 0;
}
static bool gfx3d_ysort_compare(int num1, int num2)
{
const POLY &poly1 = polylist->list[num1];
const POLY &poly2 = polylist->list[num2];
if(poly1.maxy > poly2.maxy)
return true;
else if(poly1.maxy < poly2.maxy)
return false;
else if(poly1.miny < poly2.miny)
return true;
else if(poly1.miny > poly2.miny)
return false;
else
return false; //equal should always return false "strict weak ordering"
}
void gfx3d_VBlankSignal()
{
@ -1448,16 +1467,21 @@ void gfx3d_VBlankSignal()
gfx3d.indexlist[ctr++] = i;
}
//========NOT SURE YET WHETHER I NEED A STABLE SORT========
//now we have to sort the opaque polys by y-value.
//should this be done after clipping??
//does this need to be a stable sort???
//test case: harvest moon island of happiness character creator UI
qsort(gfx3d.indexlist, opaqueCount, 4, gfx3d_ysort_compare);
//std::stable_sort(gfx3d.indexlist, gfx3d.indexlist + opaqueCount, gfx3d_ysort_compare);
qsort(gfx3d.indexlist, opaqueCount, 4, gfx3d_ysort_compare_old_qsort);
if(!gfx3d.sortmode)
{
//if we are autosorting translucent polys, we need to do this also
//TODO - this is unverified behavior. need a test case
qsort(gfx3d.indexlist + opaqueCount, polycount - opaqueCount, 4, gfx3d_ysort_compare);
//std::stable_sort(gfx3d.indexlist + opaqueCount, gfx3d.indexlist + polycount - opaqueCount, gfx3d_ysort_compare);
qsort(gfx3d.indexlist + opaqueCount, polycount - opaqueCount, 4, gfx3d_ysort_compare_old_qsort);
}
//switch to the new lists
@ -2256,14 +2280,6 @@ SFORMAT SF_GFX3D[]={
{ "GMOD", 4, 1, &mode},
{ "GMTM", 4,16, mtxTemporal},
{ "GMCU", 4,64, mtxCurrent},
{ "GM0P", 4, 1, &mtxStack[0].position},
{ "GM0M", 4,16, mtxStack[0].matrix},
{ "GM1P", 4, 1, &mtxStack[1].position},
{ "GM1M", 4,496,mtxStack[1].matrix},
{ "GM2P", 4, 1, &mtxStack[2].position},
{ "GM2M", 4,496,mtxStack[2].matrix},
{ "GM3P", 4, 1, &mtxStack[3].position},
{ "GM3M", 4,16, mtxStack[3].matrix},
{ "ML4I", 1, 1, &ML4x4ind},
{ "ML3I", 1, 1, &ML4x3ind},
{ "MM4I", 1, 1, &MM4x4ind},
@ -2289,8 +2305,8 @@ SFORMAT SF_GFX3D[]={
{ "GLPT", 4, 1, &PTind},
{ "GLPC", 4, 4, PTcoords},
{ "GLF9", 4, 1, &gxFIFO.tail},
{ "GLF9", 1, 261, &gxFIFO.cmd},
{ "GLF9", 4, 261, &gxFIFO.param},
{ "GLF9", 1, 261, &gxFIFO.cmd[0]},
{ "GLF9", 4, 261, &gxFIFO.param[0]},
{ "GCOL", 1, 4, colorRGB},
{ "GLCO", 4, 4, lightColor},
{ "GLDI", 4, 4, lightDirection},
@ -2331,7 +2347,7 @@ SFORMAT SF_GFX3D[]={
void gfx3d_savestate(std::ostream* os)
{
//version
write32le(1,os);
write32le(2,os);
//dump the render lists
OSWRITE(vertlist->count);
@ -2340,6 +2356,13 @@ void gfx3d_savestate(std::ostream* os)
OSWRITE(polylist->count);
for(int i=0;i<polylist->count;i++)
polylist->list[i].save(os);
for(int i=0;i<4;i++)
{
OSWRITE(mtxStack[i].position);
for(int j=0;j<mtxStack[i].size*16+16;j++)
OSWRITE(mtxStack[i].matrix[j]);
}
}
bool gfx3d_loadstate(std::istream* is, int size)
@ -2362,7 +2385,7 @@ bool gfx3d_loadstate(std::istream* is, int size)
polylist = &polylists[listTwiddle];
vertlist = &vertlists[listTwiddle];
if(version==1)
if(version>=1)
{
OSREAD(vertlist->count);
for(int i=0;i<vertlist->count;i++)
@ -2372,6 +2395,16 @@ bool gfx3d_loadstate(std::istream* is, int size)
polylist->list[i].load(is);
}
if(version>=2)
{
for(int i=0;i<4;i++)
{
OSREAD(mtxStack[i].position);
for(int j=0;j<mtxStack[i].size*16+16;j++)
OSREAD(mtxStack[i].matrix[j]);
}
}
gfx3d.polylist = &polylists[listTwiddle^1];
gfx3d.vertlist = &vertlists[listTwiddle^1];
gfx3d.polylist->count=0;

View File

@ -232,7 +232,7 @@ extern CACHE_ALIGN const u8 alpha_5bit_to_4bit[32];
//these contain the 3d framebuffer converted into the most useful format
//they are stored here instead of in the renderers in order to consolidate the buffers
extern CACHE_ALIGN u16 gfx3d_convertedScreen[256*192];
extern CACHE_ALIGN u8 gfx3d_convertedAlpha[256*192];
extern CACHE_ALIGN u8 gfx3d_convertedAlpha[256*192*2]; //see cpp for explanation of illogical *2
//GE commands:
void gfx3d_glViewPort(u32 v);

View File

@ -22,16 +22,27 @@
#define MEM_H
#include <stdlib.h>
#include <assert.h>
#include "types.h"
/* Type 1 Memory, faster for byte (8 bits) accesses */
static INLINE u8 T1ReadByte(u8 * mem, u32 addr)
static INLINE u8 T1ReadByte(u8* const mem, const u32 addr)
{
return mem[addr];
}
static INLINE u16 T1ReadWord(void * mem, u32 addr)
/* Reads a 16-bit value stored low byte first at byte offset addr inside mem.
   The caller promises that addr is even; this is only checked by the assert.
   With WORDS_BIGENDIAN defined the value is assembled from its two bytes,
   otherwise it is fetched with a single 16-bit load. */
static INLINE u16 T1ReadWord_guaranteedAligned(void* const mem, const u32 addr)
{
	u8* const bytes = (u8*)mem;

	assert((addr&1)==0);
#ifdef WORDS_BIGENDIAN
	return (bytes[addr + 1] << 8) | bytes[addr];
#else
	return *(u16*)(bytes + addr);
#endif
}
static INLINE u16 T1ReadWord(void* const mem, const u32 addr)
{
#ifdef WORDS_BIGENDIAN
return (((u8*)mem)[addr + 1] << 8) | ((u8*)mem)[addr];
@ -40,7 +51,19 @@ static INLINE u16 T1ReadWord(void * mem, u32 addr)
#endif
}
static INLINE u32 T1ReadLong(u8 * mem, u32 addr)
/* Reads a 32-bit value stored low byte first at byte offset addr inside mem.
   The caller promises that addr is a multiple of 4; this is only checked by
   the assert.
   With WORDS_BIGENDIAN defined the value is assembled byte by byte, otherwise
   it is fetched with a single 32-bit load. */
static INLINE u32 T1ReadLong_guaranteedAligned(u8* const mem, const u32 addr)
{
	assert((addr&3)==0);
#ifdef WORDS_BIGENDIAN
	/* widen each byte to u32 before shifting: a u8 promotes to signed int, and
	   shifting a value >= 0x80 left by 24 would overflow that int */
	return ((u32)mem[addr + 3] << 24) | ((u32)mem[addr + 2] << 16) |
	       ((u32)mem[addr + 1] << 8)  |  (u32)mem[addr];
#else
	return *(u32*)(mem + addr);
#endif
}
static INLINE u32 T1ReadLong(u8* const mem, const u32 addr)
{
#ifdef WORDS_BIGENDIAN
return (mem[addr + 3] << 24 | mem[addr + 2] << 16 |
@ -50,7 +73,7 @@ static INLINE u32 T1ReadLong(u8 * mem, u32 addr)
#endif
}
static INLINE u64 T1ReadQuad(u8 * mem, u32 addr)
static INLINE u64 T1ReadQuad(u8* const mem, const u32 addr)
{
#ifdef WORDS_BIGENDIAN
return (u64(mem[addr + 7]) << 56 | u64(mem[addr + 6]) << 48 |
@ -62,12 +85,12 @@ static INLINE u64 T1ReadQuad(u8 * mem, u32 addr)
#endif
}
static INLINE void T1WriteByte(u8 * mem, u32 addr, u8 val)
static INLINE void T1WriteByte(u8* const mem, const u32 addr, const u8 val)
{
mem[addr] = val;
}
static INLINE void T1WriteWord(u8 * mem, u32 addr, u16 val)
static INLINE void T1WriteWord(u8* const mem, const u32 addr, const u16 val)
{
#ifdef WORDS_BIGENDIAN
mem[addr + 1] = val >> 8;
@ -77,7 +100,7 @@ static INLINE void T1WriteWord(u8 * mem, u32 addr, u16 val)
#endif
}
static INLINE void T1WriteLong(u8 * mem, u32 addr, u32 val)
static INLINE void T1WriteLong(u8* const mem, const u32 addr, const u32 val)
{
#ifdef WORDS_BIGENDIAN
mem[addr + 3] = val >> 24;
@ -91,7 +114,7 @@ static INLINE void T1WriteLong(u8 * mem, u32 addr, u32 val)
/* Type 2 Memory, faster for word (16 bits) accesses */
static INLINE u8 T2ReadByte(u8 * mem, u32 addr)
static INLINE u8 T2ReadByte(u8* const mem, const u32 addr)
{
#ifdef WORDS_BIGENDIAN
return mem[addr ^ 1];
@ -100,12 +123,12 @@ static INLINE u8 T2ReadByte(u8 * mem, u32 addr)
#endif
}
static INLINE u16 T2ReadWord(u8 * mem, u32 addr)
static INLINE u16 T2ReadWord(u8* const mem, const u32 addr)
{
return *((u16 *) (mem + addr));
}
static INLINE u32 T2ReadLong(u8 * mem, u32 addr)
static INLINE u32 T2ReadLong(u8* const mem, const u32 addr)
{
#ifdef WORDS_BIGENDIAN
return *((u16 *) (mem + addr + 2)) << 16 | *((u16 *) (mem + addr));
@ -114,7 +137,7 @@ static INLINE u32 T2ReadLong(u8 * mem, u32 addr)
#endif
}
static INLINE void T2WriteByte(u8 * mem, u32 addr, u8 val)
static INLINE void T2WriteByte(u8* const mem, const u32 addr, const u8 val)
{
#ifdef WORDS_BIGENDIAN
mem[addr ^ 1] = val;
@ -123,12 +146,12 @@ static INLINE void T2WriteByte(u8 * mem, u32 addr, u8 val)
#endif
}
static INLINE void T2WriteWord(u8 * mem, u32 addr, u16 val)
static INLINE void T2WriteWord(u8* const mem, const u32 addr, const u16 val)
{
*((u16 *) (mem + addr)) = val;
}
static INLINE void T2WriteLong(u8 * mem, u32 addr, u32 val)
static INLINE void T2WriteLong(u8* const mem, const u32 addr, const u32 val)
{
#ifdef WORDS_BIGENDIAN
*((u16 *) (mem + addr + 2)) = val >> 16;

View File

@ -131,11 +131,27 @@
//FORCEINLINE: __forceinline on MSVC / Intel compilers, plain INLINE elsewhere.
//left untouched if the build has already defined it.
#ifndef FORCEINLINE
#if defined(_MSC_VER) || defined(__INTEL_COMPILER)
#define FORCEINLINE __forceinline
#else
#define FORCEINLINE INLINE
#endif
#endif

//MSC_FORCEINLINE: __forceinline on MSVC / Intel compilers, nothing elsewhere.
//guarded separately from FORCEINLINE so that it is still defined when
//FORCEINLINE was supplied from outside this header.
#ifndef MSC_FORCEINLINE
#if defined(_MSC_VER) || defined(__INTEL_COMPILER)
#define MSC_FORCEINLINE __forceinline
#else
#define MSC_FORCEINLINE
#endif
#endif
//_PREFETCH(X) / _PREFETCHNTA(X): prefetch hints for the address X.
//both macros are only defined here if _PREFETCH has not been defined already.
#ifndef _PREFETCH
#if (defined(_MSC_VER) || defined(__INTEL_COMPILER)) && !defined(NOSSE2)
#include <xmmintrin.h>
#include <intrin.h>
//MSVC / Intel builds without NOSSE2: map onto _mm_prefetch with the T0 and NTA hints.
//note that these expansions already end in a semicolon.
#define _PREFETCH(X) _mm_prefetch((char*)(X),_MM_HINT_T0);
#define _PREFETCHNTA(X) _mm_prefetch((char*)(X),_MM_HINT_NTA);
#else
//every other configuration: expand to an empty block, so X is not evaluated
#define _PREFETCH(X) {}
#define _PREFETCHNTA(X) {}
#endif
#endif
#if defined(__LP64__)
typedef unsigned char u8;
typedef unsigned short u16;

View File

@ -710,7 +710,8 @@ BOOL CALLBACK ViewDisasm_ARM9Proc (HWND hwnd, UINT message, WPARAM wParam, LPARA
return FALSE;
}
void DisassemblerTools_Refresh(u8 proc)
template<int proc>
FORCEINLINE void DisassemblerTools_Refresh()
{
if (DisViewWnd[proc] == NULL) return;
if (proc == 0)
@ -726,3 +727,7 @@ void DisassemblerTools_Refresh(u8 proc)
DisView7->autogo=false;
}
}
//these templates needed to be instantiated manually
template void DisassemblerTools_Refresh<0>();
template void DisassemblerTools_Refresh<1>();

View File

@ -30,6 +30,6 @@ extern LRESULT CALLBACK ViewDisasm_ARM7BoxProc(HWND hwnd, UINT msg, WPARAM wPara
extern BOOL CALLBACK ViewDisasm_ARM9Proc (HWND hwnd, UINT message, WPARAM wParam, LPARAM lParam);
extern LRESULT CALLBACK ViewDisasm_ARM9BoxProc(HWND hwnd, UINT msg, WPARAM wParam, LPARAM lParam);
extern void DisassemblerTools_Refresh(u8 proc);
template<int proc> void FORCEINLINE DisassemblerTools_Refresh();
#endif