- add SSE2 fix2float (some speedup);
new gxFIFO:
- fix timing for SwapBuffers (Homebrew apps work correctly now);
This commit is contained in:
mtabachenko 2009-07-04 08:20:56 +00:00
parent a30f22976a
commit 40ac90fa14
7 changed files with 178 additions and 34 deletions

View File

@ -159,8 +159,10 @@ void GFX_FIFOsend(u8 cmd, u32 param)
gxFIFO.cmd[gxFIFO.tail] = cmd;
gxFIFO.param[gxFIFO.tail] = param;
gxFIFO.tail++;
if (gxFIFO.tail > 256)
gxFIFO.tail = 256;
#ifdef USE_GEOMETRY_FIFO_EMULATION
gxstat |= 0x08000000; // set busy flag
#endif
gxstat |= (gxFIFO.tail << 16);
@ -189,6 +191,7 @@ BOOL GFX_FIFOrecv(u8 *cmd, u32 *param)
T1WriteLong(MMU.MMU_MEM[ARMCPU_ARM9][0x40], 0x600, gxstat);
return FALSE;
}
*cmd = gxFIFO.cmd[0];
*param = gxFIFO.param[0];
gxFIFO.tail--;
@ -198,10 +201,11 @@ BOOL GFX_FIFOrecv(u8 *cmd, u32 *param)
gxFIFO.param[i] = gxFIFO.param[i+1];
}
gxstat |= (gxFIFO.tail << 16);
#ifdef USE_GEOMETRY_FIFO_EMULATION
gxstat |= 0x08000000; // set busy flag
#endif
if (gxFIFO.tail == 0)
gxstat |= 0x04000000;
gxstat |= (gxFIFO.tail << 16);
if (gxFIFO.tail < 128)
gxstat |= 0x02000000;

View File

@ -2743,9 +2743,9 @@ u16 FASTCALL _MMU_ARM9_read16(u32 adr)
{
// ============================================= 3D
case 0x04000604:
return (gfx3d_GetNumPolys()&2047);
return (gfx3d_GetNumPolys());
case 0x04000606:
return (gfx3d_GetNumVertex()&8191);
return (gfx3d_GetNumVertex());
case 0x04000630:
case 0x04000632:
case 0x04000634:
@ -2847,7 +2847,7 @@ u32 FASTCALL _MMU_ARM9_read32(u32 adr)
case 0x4000604:
{
return (gfx3d_GetNumPolys()&2047) & ((gfx3d_GetNumVertex()&8191) << 16);
return (gfx3d_GetNumPolys()) & ((gfx3d_GetNumVertex()) << 16);
//LOG ("read32 - RAM_COUNT -> 0x%X", ((u32 *)(MMU.MMU_MEM[ARMCPU_ARM9][(adr>>20)&0xFF]))[(adr&MMU.MMU_MASK[ARMCPU_ARM9][(adr>>20)&0xFF])>>2]);
}

View File

@ -1891,7 +1891,6 @@ void NDS_exec(s32 nb)
T1WriteWord(MMU.ARM7_REG, 4, T1ReadWord(MMU.ARM7_REG, 4) | 1);
NDS_ARM9VBlankInt();
NDS_ARM7VBlankInt();
cheatsProcess();
nds.runCycleCollector[nds.idleFrameCounter] = 1120380-nds.idleCycles;
nds.idleFrameCounter++;
@ -2495,6 +2494,7 @@ void NDS_exec(s32 nb)
}
currFrameCounter++;
cheatsProcess();
}
static std::string MakeInputDisplayString(u16 pad, const std::string* Buttons, int count) {

View File

@ -146,6 +146,11 @@ static float float10Table[1024];
static float float10RelTable[1024];
static float normalTable[1024];
#ifndef NOSSE2
float ALIGN(16) _fix2float_divizor_mask[4] = { 4096.f, 4096.f, 4096.f, 4096.f };
float ALIGN(16) _fix10_2float_divizor_mask[4] = { 512.f, 512.f, 512.f, 512.f };
#endif
#define fix2float(v) (((float)((s32)(v))) / (float)(1<<12))
#define fix10_2float(v) (((float)((s32)(v))) / (float)(1<<9))
@ -429,7 +434,11 @@ void gfx3d_glLoadIdentity()
BOOL gfx3d_glLoadMatrix4x4(s32 v)
{
#ifdef NOSSE2
mtxCurrent[mode][ML4x4ind] = fix2float(v);
#else
mtxCurrent[mode][ML4x4ind] = v;
#endif
++ML4x4ind;
if(ML4x4ind<16) return FALSE;
@ -437,6 +446,10 @@ BOOL gfx3d_glLoadMatrix4x4(s32 v)
GFX_DELAY(19);
#ifndef NOSSE2
_sse2_fix2float_16(mtxCurrent[mode], _fix2float_divizor_mask);
#endif
if (mode == 2)
MatrixCopy (mtxCurrent[1], mtxCurrent[2]);
return TRUE;
@ -444,16 +457,24 @@ BOOL gfx3d_glLoadMatrix4x4(s32 v)
BOOL gfx3d_glLoadMatrix4x3(s32 v)
{
#ifdef NOSSE2
mtxCurrent[mode][ML4x3ind] = fix2float(v);
#else
mtxCurrent[mode][ML4x3ind] = v;
#endif
ML4x3ind++;
if((ML4x3ind & 0x03) == 3) ML4x3ind++;
if(ML4x3ind<16) return FALSE;
ML4x3ind = 0;
#ifndef NOSSE2
_sse2_fix2float_16(mtxCurrent[mode], _fix2float_divizor_mask);
#endif
//fill in the unusued matrix values
mtxCurrent[mode][3] = mtxCurrent[mode][7] = mtxCurrent[mode][11] = 0;
mtxCurrent[mode][15] = 1;
mtxCurrent[mode][3] = mtxCurrent[mode][7] = mtxCurrent[mode][11] = 0.f;
mtxCurrent[mode][15] = 1.f;
GFX_DELAY(30);
@ -604,7 +625,12 @@ BOOL gfx3d_glScale(s32 v)
BOOL gfx3d_glMultMatrix3x3(s32 v)
{
#ifdef NOSSE2
mtxTemporal[MM3x3ind] = fix2float(v);
#else
mtxTemporal[MM3x3ind] = v;
#endif
MM3x3ind++;
if((MM3x3ind & 0x03) == 3) MM3x3ind++;
@ -613,6 +639,10 @@ BOOL gfx3d_glMultMatrix3x3(s32 v)
GFX_DELAY(28);
#ifndef NOSSE2
_sse2_fix2float_12(mtxTemporal, _fix2float_divizor_mask);
#endif
//fill in the unusued matrix values
mtxTemporal[3] = mtxTemporal[7] = mtxTemporal[11] = 0;
mtxTemporal[15] = 1;
@ -633,7 +663,11 @@ BOOL gfx3d_glMultMatrix3x3(s32 v)
BOOL gfx3d_glMultMatrix4x3(s32 v)
{
#ifdef NOSSE2
mtxTemporal[MM4x3ind] = fix2float(v);
#else
mtxTemporal[MM4x3ind] = v;
#endif
MM4x3ind++;
if((MM4x3ind & 0x03) == 3) MM4x3ind++;
@ -642,9 +676,13 @@ BOOL gfx3d_glMultMatrix4x3(s32 v)
GFX_DELAY(31);
#ifndef NOSSE2
_sse2_fix2float_16(mtxTemporal, _fix2float_divizor_mask);
#endif
//fill in the unusued matrix values
mtxTemporal[3] = mtxTemporal[7] = mtxTemporal[11] = 0;
mtxTemporal[15] = 1;
mtxTemporal[3] = mtxTemporal[7] = mtxTemporal[11] = 0.f;
mtxTemporal[15] = 1.f;
MatrixMultiply (mtxCurrent[mode], mtxTemporal);
@ -661,7 +699,11 @@ BOOL gfx3d_glMultMatrix4x3(s32 v)
BOOL gfx3d_glMultMatrix4x4(s32 v)
{
#ifdef NOSSE2
mtxTemporal[MM4x4ind] = fix2float(v);
#else
mtxTemporal[MM4x4ind] = v;
#endif
MM4x4ind++;
if(MM4x4ind<16) return FALSE;
@ -669,6 +711,10 @@ BOOL gfx3d_glMultMatrix4x4(s32 v)
GFX_DELAY(35);
#ifndef NOSSE2
_sse2_fix2float_16(mtxTemporal, _fix2float_divizor_mask);
#endif
MatrixMultiply (mtxCurrent[mode], mtxTemporal);
if (mode == 2)
@ -747,11 +793,15 @@ static void SetVertex()
if(polylist->count >= POLYLIST_SIZE)
return;
#ifdef NOSSE2
//apply modelview matrix
MatrixMultVec4x4 (mtxCurrent[1], coordTransformed);
//apply projection matrix
MatrixMultVec4x4 (mtxCurrent[0], coordTransformed);
#else
_sse2_MatrixMultVec4x4_M2(mtxCurrent[0], coordTransformed);
#endif
//TODO - culling should be done here.
//TODO - viewport transform?
@ -924,13 +974,13 @@ void gfx3d_glSwapScreen(unsigned int screen)
int gfx3d_GetNumPolys()
{
//so is this in the currently-displayed or currently-built list?
return 0;
return (polylists[listTwiddle].count);
}
int gfx3d_GetNumVertex()
{
//so is this in the currently-displayed or currently-built list?
return 0;
return (vertlists[listTwiddle].count);
}
@ -1354,12 +1404,12 @@ unsigned short gfx3d_glGetVecRes(unsigned int index)
#ifdef USE_GEOMETRY_FIFO_EMULATION
//#define _3D_LOG
//#define _3D_LOG_EXEC
void gfx3d_execute(u8 cmd, u32 param)
{
#ifdef _3D_LOG
#ifdef _3D_LOG_EXEC
u32 gxstat2 = T1ReadLong(MMU.MMU_MEM[ARMCPU_ARM9][0x40], 0x600);
INFO("- execute GX FIFO cmd 0x%02X, gxstat 0x%08X (%03i): time %i/%i\n", cmd, gxstat2, gxFIFO.tail, nds.cycles, MMU.gfx3dCycles);
INFO("*** gxFIFO: exec 0x%02X, tail %03i, gxstat 0x%08X\n", cmd, gxFIFO.tail, gxstat2);
#endif
switch (cmd)
{
@ -1490,6 +1540,18 @@ void gfx3d_execute3D()
if (GFX_FIFOrecv(&cmd, &param))
{
gfx3d_execute(cmd, param);
#if 0
for ( ;;)
{
if ( (cmd == 0x11) || (cmd==0x15) || (cmd==41) )
{
if (!GFX_FIFOrecv(&cmd, &param)) return;
gfx3d_execute(cmd, param);
continue;
}
break;
}
#endif
#if 0
if (bWaitForPolys)
{
@ -1633,6 +1695,11 @@ void gfx3d_VBlankSignal()
{
#ifdef USE_GEOMETRY_FIFO_EMULATION
isVBlank = true;
if (isSwapBuffers)
{
isSwapBuffers = false;
GFX_DELAY(392);
}
#else
//the 3d buffers are swapped when a vblank begins.
//so, if we have a redraw pending, now is a safe time to do it
@ -1668,12 +1735,6 @@ void gfx3d_VBlankEndSignal(bool skipFrame)
gpu3D->NDS_3D_Render();
}
}
if (isSwapBuffers)
{
isSwapBuffers = false;
GFX_DELAY(392);
}
#else
//if we are skipping 3d frames then the 3d rendering will get held up here.
//but, as soon as we quit skipping frames, the held-up 3d frame will render
@ -1695,6 +1756,8 @@ void gfx3d_VBlankEndSignal(bool skipFrame)
}
#ifdef USE_GEOMETRY_FIFO_EMULATION
//#define _3D_LOG
static void NOPARAMS()
{
for (;;)
@ -1834,6 +1897,7 @@ void gfx3d_sendCommandToFIFO(u32 val)
clCmd >>= 8;
return;
}
NOPARAMS();
}
void gfx3d_sendCommand(u32 cmd, u32 param)

View File

@ -127,16 +127,11 @@ void MatrixTranspose(float *matrix)
#undef swap
}
void MATRIXFASTCALL MatrixIdentity (float *matrix) //============== TODO
void MATRIXFASTCALL MatrixIdentity (float *matrix)
{
//memset (matrix, 0, sizeof(float)*16);
//this is fastest for SSE2 i think.
//study code generation and split into sse2 specific module later
for(int i=0;i<16;i++)
matrix[i] = 0.0f;
//matrix[1] = matrix[2] = matrix[3] = matrix[4] = 0.0f;
//matrix[6] = matrix[7] = matrix[8] = matrix[9] = 0.0f;
//matrix[11] = matrix[12] = matrix[13] = matrix[14] = 0.0f;
matrix[1] = matrix[2] = matrix[3] = matrix[4] = 0.0f;
matrix[6] = matrix[7] = matrix[8] = matrix[9] = 0.0f;
matrix[11] = matrix[12] = matrix[13] = matrix[14] = 0.0f;
matrix[0] = matrix[5] = matrix[10] = matrix[15] = 1.f;
}
@ -155,7 +150,23 @@ void MATRIXFASTCALL MatrixSet (float *matrix, int x, int y, float value) // TODO
void MATRIXFASTCALL MatrixCopy (float* matrixDST, const float* matrixSRC)
{
memcpy ((void*)matrixDST, matrixSRC, sizeof(float)*16);
matrixDST[0] = matrixSRC[0];
matrixDST[1] = matrixSRC[1];
matrixDST[2] = matrixSRC[2];
matrixDST[3] = matrixSRC[3];
matrixDST[4] = matrixSRC[4];
matrixDST[5] = matrixSRC[5];
matrixDST[6] = matrixSRC[6];
matrixDST[7] = matrixSRC[7];
matrixDST[8] = matrixSRC[8];
matrixDST[9] = matrixSRC[9];
matrixDST[10] = matrixSRC[10];
matrixDST[11] = matrixSRC[11];
matrixDST[12] = matrixSRC[12];
matrixDST[13] = matrixSRC[13];
matrixDST[14] = matrixSRC[14];
matrixDST[15] = matrixSRC[15];
}
int MATRIXFASTCALL MatrixCompare (const float* matrixDST, const float* matrixSRC)

View File

@ -53,6 +53,9 @@ void MatrixInit (float *matrix);
#define MatrixMultiply _sse2_MatrixMultiply
#define MatrixTranslate _sse2_MatrixTranslate
#define MatrixScale _sse2_MatrixScale
void MATRIXFASTCALL _sse2_fix2float_16 (float* matrix, float* divizor_mask);
void MATRIXFASTCALL _sse2_fix2float_12 (float* matrix, float* divizor_mask);
void MATRIXFASTCALL _sse2_MatrixMultVec4x4_M2 (const float * matrix, float * vecPtr); // mode 2
#else
#define SSE2_FUNC(X) X
#endif

View File

@ -41,6 +41,35 @@
ret 0
@_sse2_MatrixMultVec4x4@8 ENDP
; _sse2_MatrixMultVec4x4_M2: transforms the 4-float vector at [edx] in place by
; two 4x4 float matrices stored back to back starting at [ecx]: first the
; matrix occupying [ecx+64 .. ecx+127], then the matrix occupying [ecx .. ecx+63].
; Each stage computes v0*row0 + v1*row1 + v2*row2 + v3*row3, where rowK is the
; 16-byte group at offset 16*K inside the matrix being applied.
; NOTE(review): ecx/edx are presumably the first/second pointer arguments of a
; fastcall prototype (the "@8" suffix suggests 8 bytes of arguments) - confirm
; against the C declaration.
; Both pointers must be 16-byte aligned: movaps and the mulps memory operands
; require aligned addresses.
; Overwrites xmm4-xmm7 without saving them.
@_sse2_MatrixMultVec4x4_M2@8 PROC PUBLIC
; load the input vector (v0, v1, v2, v3)
movaps xmm4, XMMWORD PTR [edx]
; broadcast v1, v2, v3 into all four lanes of xmm5, xmm6, xmm7
pshufd xmm5, xmm4, 01010101b
pshufd xmm6, xmm4, 10101010b
pshufd xmm7, xmm4, 11111111b
; broadcast v0 last, because it overwrites the source register xmm4
shufps xmm4, xmm4, 00000000b
; stage 1: scale the four 16-byte groups of the matrix at ecx+64
mulps xmm4, XMMWORD PTR [ecx+64]
mulps xmm5, XMMWORD PTR [ecx+80]
mulps xmm6, XMMWORD PTR [ecx+96]
mulps xmm7, XMMWORD PTR [ecx+112]
; sum the four partial products; xmm4 now holds the stage-1 result
addps xmm4, xmm5
addps xmm4, xmm6
addps xmm4, xmm7
; stage 2: same broadcast pattern, applied to the stage-1 result
pshufd xmm5, xmm4, 01010101b
pshufd xmm6, xmm4, 10101010b
pshufd xmm7, xmm4, 11111111b
shufps xmm4, xmm4, 00000000b
; scale the four 16-byte groups of the matrix at ecx
mulps xmm4, XMMWORD PTR [ecx]
mulps xmm5, XMMWORD PTR [ecx+16]
mulps xmm6, XMMWORD PTR [ecx+32]
mulps xmm7, XMMWORD PTR [ecx+48]
addps xmm4, xmm5
addps xmm4, xmm6
addps xmm4, xmm7
; write the final vector back over the input
movaps XMMWORD PTR [edx], xmm4
ret 0
@_sse2_MatrixMultVec4x4_M2@8 ENDP
@_sse2_MatrixMultVec3x3@8 PROC PUBLIC
movaps xmm4, XMMWORD PTR [edx]
pshufd xmm5, xmm4, 01010101b
@ -148,5 +177,38 @@
ret 0
@_sse2_MatrixScale@8 ENDP
; _sse2_fix2float_12: divides the first 12 floats at [ecx] (three 16-byte
; groups) in place, lane by lane, by the 4 floats at [edx].
; Floats at [ecx+48] and beyond are not read or written.
; NOTE(review): ecx/edx are presumably the `matrix` and `divizor_mask`
; arguments of a fastcall prototype ("@8" suffix) - confirm against the C
; declaration.
; Both pointers must be 16-byte aligned (movaps is used for every access).
; Overwrites xmm0-xmm2 and xmm4.
@_sse2_fix2float_12@8 PROC PUBLIC
; load the three 4-float groups to convert
movaps xmm0, XMMWORD PTR[ecx]
movaps xmm1, XMMWORD PTR[ecx+16]
movaps xmm2, XMMWORD PTR[ecx+32]
; load the divisor vector once; it is reused for every group
movaps xmm4, XMMWORD PTR [edx]
;prefetchnta [ecx+64]
divps xmm0, xmm4
divps xmm1, xmm4
divps xmm2, xmm4
; store the quotients back over the inputs
movaps XMMWORD PTR[ecx], xmm0
movaps XMMWORD PTR[ecx+16],xmm1
movaps XMMWORD PTR[ecx+32],xmm2
ret 0
@_sse2_fix2float_12@8 ENDP
; _sse2_fix2float_16: divides all 16 floats at [ecx] (four 16-byte groups) in
; place, lane by lane, by the 4 floats at [edx].
; NOTE(review): ecx/edx are presumably the `matrix` and `divizor_mask`
; arguments of a fastcall prototype ("@8" suffix) - confirm against the C
; declaration.
; Both pointers must be 16-byte aligned (movaps is used for every access).
; Overwrites xmm0-xmm4.
@_sse2_fix2float_16@8 PROC PUBLIC
; load the four 4-float groups to convert
movaps xmm0, XMMWORD PTR[ecx]
movaps xmm1, XMMWORD PTR[ecx+16]
movaps xmm2, XMMWORD PTR[ecx+32]
movaps xmm3, XMMWORD PTR[ecx+48]
; load the divisor vector once; it is reused for every group
movaps xmm4, XMMWORD PTR [edx]
;prefetchnta [ecx+64]
divps xmm0, xmm4
divps xmm1, xmm4
divps xmm2, xmm4
divps xmm3, xmm4
; store the quotients back over the inputs
movaps XMMWORD PTR[ecx], xmm0
movaps XMMWORD PTR[ecx+16],xmm1
movaps XMMWORD PTR[ecx+32],xmm2
movaps XMMWORD PTR[ecx+48],xmm3
ret 0
@_sse2_fix2float_16@8 ENDP
end