gfx3d:
- add SSE2 fix2float (some speedup); new gxFIFO: - fix timing for SwapBuffers (Homebrew apps work correctly now);
This commit is contained in:
parent
a30f22976a
commit
40ac90fa14
|
@ -159,8 +159,10 @@ void GFX_FIFOsend(u8 cmd, u32 param)
|
|||
gxFIFO.cmd[gxFIFO.tail] = cmd;
|
||||
gxFIFO.param[gxFIFO.tail] = param;
|
||||
gxFIFO.tail++;
|
||||
if (gxFIFO.tail > 256)
|
||||
gxFIFO.tail = 256;
|
||||
|
||||
#ifdef USE_GEOMETRY_FIFO_EMULATION
|
||||
gxstat |= 0x08000000; // set busy flag
|
||||
#endif
|
||||
|
||||
gxstat |= (gxFIFO.tail << 16);
|
||||
|
||||
|
@ -189,6 +191,7 @@ BOOL GFX_FIFOrecv(u8 *cmd, u32 *param)
|
|||
T1WriteLong(MMU.MMU_MEM[ARMCPU_ARM9][0x40], 0x600, gxstat);
|
||||
return FALSE;
|
||||
}
|
||||
|
||||
*cmd = gxFIFO.cmd[0];
|
||||
*param = gxFIFO.param[0];
|
||||
gxFIFO.tail--;
|
||||
|
@ -198,10 +201,11 @@ BOOL GFX_FIFOrecv(u8 *cmd, u32 *param)
|
|||
gxFIFO.param[i] = gxFIFO.param[i+1];
|
||||
}
|
||||
|
||||
gxstat |= (gxFIFO.tail << 16);
|
||||
#ifdef USE_GEOMETRY_FIFO_EMULATION
|
||||
gxstat |= 0x08000000; // set busy flag
|
||||
#endif
|
||||
|
||||
if (gxFIFO.tail == 0)
|
||||
gxstat |= 0x04000000;
|
||||
gxstat |= (gxFIFO.tail << 16);
|
||||
|
||||
if (gxFIFO.tail < 128)
|
||||
gxstat |= 0x02000000;
|
||||
|
|
|
@ -2743,9 +2743,9 @@ u16 FASTCALL _MMU_ARM9_read16(u32 adr)
|
|||
{
|
||||
// ============================================= 3D
|
||||
case 0x04000604:
|
||||
return (gfx3d_GetNumPolys()&2047);
|
||||
return (gfx3d_GetNumPolys());
|
||||
case 0x04000606:
|
||||
return (gfx3d_GetNumVertex()&8191);
|
||||
return (gfx3d_GetNumVertex());
|
||||
case 0x04000630:
|
||||
case 0x04000632:
|
||||
case 0x04000634:
|
||||
|
@ -2847,7 +2847,7 @@ u32 FASTCALL _MMU_ARM9_read32(u32 adr)
|
|||
|
||||
case 0x4000604:
|
||||
{
|
||||
return (gfx3d_GetNumPolys()&2047) & ((gfx3d_GetNumVertex()&8191) << 16);
|
||||
return (gfx3d_GetNumPolys()) & ((gfx3d_GetNumVertex()) << 16);
|
||||
//LOG ("read32 - RAM_COUNT -> 0x%X", ((u32 *)(MMU.MMU_MEM[ARMCPU_ARM9][(adr>>20)&0xFF]))[(adr&MMU.MMU_MASK[ARMCPU_ARM9][(adr>>20)&0xFF])>>2]);
|
||||
}
|
||||
|
||||
|
|
|
@ -1891,7 +1891,6 @@ void NDS_exec(s32 nb)
|
|||
T1WriteWord(MMU.ARM7_REG, 4, T1ReadWord(MMU.ARM7_REG, 4) | 1);
|
||||
NDS_ARM9VBlankInt();
|
||||
NDS_ARM7VBlankInt();
|
||||
cheatsProcess();
|
||||
|
||||
nds.runCycleCollector[nds.idleFrameCounter] = 1120380-nds.idleCycles;
|
||||
nds.idleFrameCounter++;
|
||||
|
@ -2495,6 +2494,7 @@ void NDS_exec(s32 nb)
|
|||
}
|
||||
|
||||
currFrameCounter++;
|
||||
cheatsProcess();
|
||||
}
|
||||
|
||||
static std::string MakeInputDisplayString(u16 pad, const std::string* Buttons, int count) {
|
||||
|
|
|
@ -146,6 +146,11 @@ static float float10Table[1024];
|
|||
static float float10RelTable[1024];
|
||||
static float normalTable[1024];
|
||||
|
||||
#ifndef NOSSE2
|
||||
float ALIGN(16) _fix2float_divizor_mask[4] = { 4096.f, 4096.f, 4096.f, 4096.f };
|
||||
float ALIGN(16) _fix10_2float_divizor_mask[4] = { 512.f, 512.f, 512.f, 512.f };
|
||||
#endif
|
||||
|
||||
#define fix2float(v) (((float)((s32)(v))) / (float)(1<<12))
|
||||
#define fix10_2float(v) (((float)((s32)(v))) / (float)(1<<9))
|
||||
|
||||
|
@ -429,7 +434,11 @@ void gfx3d_glLoadIdentity()
|
|||
|
||||
BOOL gfx3d_glLoadMatrix4x4(s32 v)
|
||||
{
|
||||
#ifdef NOSSE2
|
||||
mtxCurrent[mode][ML4x4ind] = fix2float(v);
|
||||
#else
|
||||
mtxCurrent[mode][ML4x4ind] = v;
|
||||
#endif
|
||||
|
||||
++ML4x4ind;
|
||||
if(ML4x4ind<16) return FALSE;
|
||||
|
@ -437,6 +446,10 @@ BOOL gfx3d_glLoadMatrix4x4(s32 v)
|
|||
|
||||
GFX_DELAY(19);
|
||||
|
||||
#ifndef NOSSE2
|
||||
_sse2_fix2float_16(mtxCurrent[mode], _fix2float_divizor_mask);
|
||||
#endif
|
||||
|
||||
if (mode == 2)
|
||||
MatrixCopy (mtxCurrent[1], mtxCurrent[2]);
|
||||
return TRUE;
|
||||
|
@ -444,16 +457,24 @@ BOOL gfx3d_glLoadMatrix4x4(s32 v)
|
|||
|
||||
BOOL gfx3d_glLoadMatrix4x3(s32 v)
|
||||
{
|
||||
#ifdef NOSSE2
|
||||
mtxCurrent[mode][ML4x3ind] = fix2float(v);
|
||||
#else
|
||||
mtxCurrent[mode][ML4x3ind] = v;
|
||||
#endif
|
||||
|
||||
ML4x3ind++;
|
||||
if((ML4x3ind & 0x03) == 3) ML4x3ind++;
|
||||
if(ML4x3ind<16) return FALSE;
|
||||
ML4x3ind = 0;
|
||||
|
||||
#ifndef NOSSE2
|
||||
_sse2_fix2float_16(mtxCurrent[mode], _fix2float_divizor_mask);
|
||||
#endif
|
||||
|
||||
//fill in the unusued matrix values
|
||||
mtxCurrent[mode][3] = mtxCurrent[mode][7] = mtxCurrent[mode][11] = 0;
|
||||
mtxCurrent[mode][15] = 1;
|
||||
mtxCurrent[mode][3] = mtxCurrent[mode][7] = mtxCurrent[mode][11] = 0.f;
|
||||
mtxCurrent[mode][15] = 1.f;
|
||||
|
||||
GFX_DELAY(30);
|
||||
|
||||
|
@ -604,7 +625,12 @@ BOOL gfx3d_glScale(s32 v)
|
|||
|
||||
BOOL gfx3d_glMultMatrix3x3(s32 v)
|
||||
{
|
||||
#ifdef NOSSE2
|
||||
mtxTemporal[MM3x3ind] = fix2float(v);
|
||||
#else
|
||||
mtxTemporal[MM3x3ind] = v;
|
||||
#endif
|
||||
|
||||
|
||||
MM3x3ind++;
|
||||
if((MM3x3ind & 0x03) == 3) MM3x3ind++;
|
||||
|
@ -613,6 +639,10 @@ BOOL gfx3d_glMultMatrix3x3(s32 v)
|
|||
|
||||
GFX_DELAY(28);
|
||||
|
||||
#ifndef NOSSE2
|
||||
_sse2_fix2float_12(mtxTemporal, _fix2float_divizor_mask);
|
||||
#endif
|
||||
|
||||
//fill in the unusued matrix values
|
||||
mtxTemporal[3] = mtxTemporal[7] = mtxTemporal[11] = 0;
|
||||
mtxTemporal[15] = 1;
|
||||
|
@ -633,7 +663,11 @@ BOOL gfx3d_glMultMatrix3x3(s32 v)
|
|||
|
||||
BOOL gfx3d_glMultMatrix4x3(s32 v)
|
||||
{
|
||||
#ifdef NOSSE2
|
||||
mtxTemporal[MM4x3ind] = fix2float(v);
|
||||
#else
|
||||
mtxTemporal[MM4x3ind] = v;
|
||||
#endif
|
||||
|
||||
MM4x3ind++;
|
||||
if((MM4x3ind & 0x03) == 3) MM4x3ind++;
|
||||
|
@ -642,9 +676,13 @@ BOOL gfx3d_glMultMatrix4x3(s32 v)
|
|||
|
||||
GFX_DELAY(31);
|
||||
|
||||
#ifndef NOSSE2
|
||||
_sse2_fix2float_16(mtxTemporal, _fix2float_divizor_mask);
|
||||
#endif
|
||||
|
||||
//fill in the unusued matrix values
|
||||
mtxTemporal[3] = mtxTemporal[7] = mtxTemporal[11] = 0;
|
||||
mtxTemporal[15] = 1;
|
||||
mtxTemporal[3] = mtxTemporal[7] = mtxTemporal[11] = 0.f;
|
||||
mtxTemporal[15] = 1.f;
|
||||
|
||||
MatrixMultiply (mtxCurrent[mode], mtxTemporal);
|
||||
|
||||
|
@ -661,7 +699,11 @@ BOOL gfx3d_glMultMatrix4x3(s32 v)
|
|||
|
||||
BOOL gfx3d_glMultMatrix4x4(s32 v)
|
||||
{
|
||||
#ifdef NOSSE2
|
||||
mtxTemporal[MM4x4ind] = fix2float(v);
|
||||
#else
|
||||
mtxTemporal[MM4x4ind] = v;
|
||||
#endif
|
||||
|
||||
MM4x4ind++;
|
||||
if(MM4x4ind<16) return FALSE;
|
||||
|
@ -669,6 +711,10 @@ BOOL gfx3d_glMultMatrix4x4(s32 v)
|
|||
|
||||
GFX_DELAY(35);
|
||||
|
||||
#ifndef NOSSE2
|
||||
_sse2_fix2float_16(mtxTemporal, _fix2float_divizor_mask);
|
||||
#endif
|
||||
|
||||
MatrixMultiply (mtxCurrent[mode], mtxTemporal);
|
||||
|
||||
if (mode == 2)
|
||||
|
@ -747,11 +793,15 @@ static void SetVertex()
|
|||
if(polylist->count >= POLYLIST_SIZE)
|
||||
return;
|
||||
|
||||
#ifdef NOSSE2
|
||||
//apply modelview matrix
|
||||
MatrixMultVec4x4 (mtxCurrent[1], coordTransformed);
|
||||
|
||||
//apply projection matrix
|
||||
MatrixMultVec4x4 (mtxCurrent[0], coordTransformed);
|
||||
#else
|
||||
_sse2_MatrixMultVec4x4_M2(mtxCurrent[0], coordTransformed);
|
||||
#endif
|
||||
|
||||
//TODO - culling should be done here.
|
||||
//TODO - viewport transform?
|
||||
|
@ -924,13 +974,13 @@ void gfx3d_glSwapScreen(unsigned int screen)
|
|||
int gfx3d_GetNumPolys()
|
||||
{
|
||||
//so is this in the currently-displayed or currently-built list?
|
||||
return 0;
|
||||
return (polylists[listTwiddle].count);
|
||||
}
|
||||
|
||||
int gfx3d_GetNumVertex()
|
||||
{
|
||||
//so is this in the currently-displayed or currently-built list?
|
||||
return 0;
|
||||
return (vertlists[listTwiddle].count);
|
||||
}
|
||||
|
||||
|
||||
|
@ -1354,12 +1404,12 @@ unsigned short gfx3d_glGetVecRes(unsigned int index)
|
|||
|
||||
#ifdef USE_GEOMETRY_FIFO_EMULATION
|
||||
|
||||
//#define _3D_LOG
|
||||
//#define _3D_LOG_EXEC
|
||||
void gfx3d_execute(u8 cmd, u32 param)
|
||||
{
|
||||
#ifdef _3D_LOG
|
||||
#ifdef _3D_LOG_EXEC
|
||||
u32 gxstat2 = T1ReadLong(MMU.MMU_MEM[ARMCPU_ARM9][0x40], 0x600);
|
||||
INFO("- execute GX FIFO cmd 0x%02X, gxstat 0x%08X (%03i): time %i/%i\n", cmd, gxstat2, gxFIFO.tail, nds.cycles, MMU.gfx3dCycles);
|
||||
INFO("*** gxFIFO: exec 0x%02X, tail %03i, gxstat 0x%08X\n", cmd, gxFIFO.tail, gxstat2);
|
||||
#endif
|
||||
switch (cmd)
|
||||
{
|
||||
|
@ -1490,6 +1540,18 @@ void gfx3d_execute3D()
|
|||
if (GFX_FIFOrecv(&cmd, ¶m))
|
||||
{
|
||||
gfx3d_execute(cmd, param);
|
||||
#if 0
|
||||
for ( ;;)
|
||||
{
|
||||
if ( (cmd == 0x11) || (cmd==0x15) || (cmd==41) )
|
||||
{
|
||||
if (!GFX_FIFOrecv(&cmd, ¶m)) return;
|
||||
gfx3d_execute(cmd, param);
|
||||
continue;
|
||||
}
|
||||
break;
|
||||
}
|
||||
#endif
|
||||
#if 0
|
||||
if (bWaitForPolys)
|
||||
{
|
||||
|
@ -1633,6 +1695,11 @@ void gfx3d_VBlankSignal()
|
|||
{
|
||||
#ifdef USE_GEOMETRY_FIFO_EMULATION
|
||||
isVBlank = true;
|
||||
if (isSwapBuffers)
|
||||
{
|
||||
isSwapBuffers = false;
|
||||
GFX_DELAY(392);
|
||||
}
|
||||
#else
|
||||
//the 3d buffers are swapped when a vblank begins.
|
||||
//so, if we have a redraw pending, now is a safe time to do it
|
||||
|
@ -1668,12 +1735,6 @@ void gfx3d_VBlankEndSignal(bool skipFrame)
|
|||
gpu3D->NDS_3D_Render();
|
||||
}
|
||||
}
|
||||
|
||||
if (isSwapBuffers)
|
||||
{
|
||||
isSwapBuffers = false;
|
||||
GFX_DELAY(392);
|
||||
}
|
||||
#else
|
||||
//if we are skipping 3d frames then the 3d rendering will get held up here.
|
||||
//but, as soon as we quit skipping frames, the held-up 3d frame will render
|
||||
|
@ -1695,6 +1756,8 @@ void gfx3d_VBlankEndSignal(bool skipFrame)
|
|||
}
|
||||
|
||||
#ifdef USE_GEOMETRY_FIFO_EMULATION
|
||||
//#define _3D_LOG
|
||||
|
||||
static void NOPARAMS()
|
||||
{
|
||||
for (;;)
|
||||
|
@ -1834,6 +1897,7 @@ void gfx3d_sendCommandToFIFO(u32 val)
|
|||
clCmd >>= 8;
|
||||
return;
|
||||
}
|
||||
NOPARAMS();
|
||||
}
|
||||
|
||||
void gfx3d_sendCommand(u32 cmd, u32 param)
|
||||
|
|
|
@ -127,16 +127,11 @@ void MatrixTranspose(float *matrix)
|
|||
#undef swap
|
||||
}
|
||||
|
||||
void MATRIXFASTCALL MatrixIdentity (float *matrix) //============== TODO
|
||||
void MATRIXFASTCALL MatrixIdentity (float *matrix)
|
||||
{
|
||||
//memset (matrix, 0, sizeof(float)*16);
|
||||
//this is fastest for SSE2 i think.
|
||||
//study code generation and split into sse2 specific module later
|
||||
for(int i=0;i<16;i++)
|
||||
matrix[i] = 0.0f;
|
||||
//matrix[1] = matrix[2] = matrix[3] = matrix[4] = 0.0f;
|
||||
//matrix[6] = matrix[7] = matrix[8] = matrix[9] = 0.0f;
|
||||
//matrix[11] = matrix[12] = matrix[13] = matrix[14] = 0.0f;
|
||||
matrix[1] = matrix[2] = matrix[3] = matrix[4] = 0.0f;
|
||||
matrix[6] = matrix[7] = matrix[8] = matrix[9] = 0.0f;
|
||||
matrix[11] = matrix[12] = matrix[13] = matrix[14] = 0.0f;
|
||||
matrix[0] = matrix[5] = matrix[10] = matrix[15] = 1.f;
|
||||
}
|
||||
|
||||
|
@ -155,7 +150,23 @@ void MATRIXFASTCALL MatrixSet (float *matrix, int x, int y, float value) // TODO
|
|||
|
||||
void MATRIXFASTCALL MatrixCopy (float* matrixDST, const float* matrixSRC)
|
||||
{
|
||||
memcpy ((void*)matrixDST, matrixSRC, sizeof(float)*16);
|
||||
matrixDST[0] = matrixSRC[0];
|
||||
matrixDST[1] = matrixSRC[1];
|
||||
matrixDST[2] = matrixSRC[2];
|
||||
matrixDST[3] = matrixSRC[3];
|
||||
matrixDST[4] = matrixSRC[4];
|
||||
matrixDST[5] = matrixSRC[5];
|
||||
matrixDST[6] = matrixSRC[6];
|
||||
matrixDST[7] = matrixSRC[7];
|
||||
matrixDST[8] = matrixSRC[8];
|
||||
matrixDST[9] = matrixSRC[9];
|
||||
matrixDST[10] = matrixSRC[10];
|
||||
matrixDST[11] = matrixSRC[11];
|
||||
matrixDST[12] = matrixSRC[12];
|
||||
matrixDST[13] = matrixSRC[13];
|
||||
matrixDST[14] = matrixSRC[14];
|
||||
matrixDST[15] = matrixSRC[15];
|
||||
|
||||
}
|
||||
|
||||
int MATRIXFASTCALL MatrixCompare (const float* matrixDST, const float* matrixSRC)
|
||||
|
|
|
@ -53,6 +53,9 @@ void MatrixInit (float *matrix);
|
|||
#define MatrixMultiply _sse2_MatrixMultiply
|
||||
#define MatrixTranslate _sse2_MatrixTranslate
|
||||
#define MatrixScale _sse2_MatrixScale
|
||||
void MATRIXFASTCALL _sse2_fix2float_16 (float* matrix, float* divizor_mask);
|
||||
void MATRIXFASTCALL _sse2_fix2float_12 (float* matrix, float* divizor_mask);
|
||||
void MATRIXFASTCALL _sse2_MatrixMultVec4x4_M2 (const float * matrix, float * vecPtr); // mode 2
|
||||
#else
|
||||
#define SSE2_FUNC(X) X
|
||||
#endif
|
||||
|
|
|
@ -41,6 +41,35 @@
|
|||
ret 0
|
||||
@_sse2_MatrixMultVec4x4@8 ENDP
|
||||
|
||||
; _sse2_MatrixMultVec4x4_M2 (matrix in ecx, vector in edx)
; Transforms the 4-float vector at [edx] in place by two 4x4 float matrices:
; first by the 16 floats at [ecx+64..ecx+127], then by the 16 floats at
; [ecx..ecx+63]. The result overwrites the vector at [edx].
; The @name@8 decoration and the use of ecx/edx match the x86 __fastcall
; convention (first argument in ecx, second in edx) - presumably what
; MATRIXFASTCALL expands to; confirm against its definition.
; NOTE(review): the C++ caller passes mtxCurrent[0], so the matrix at +64 is
; presumably mtxCurrent[1] stored right after it - confirm the array layout.
; All memory operands use movaps/mulps, which require 16-byte alignment.
@_sse2_MatrixMultVec4x4_M2@8 PROC PUBLIC
|
||||
; xmm4 = input vector (v0, v1, v2, v3)
movaps xmm4, XMMWORD PTR [edx]
|
||||
; broadcast each component across a full register:
; xmm5 = v1 in every lane
pshufd xmm5, xmm4, 01010101b
|
||||
; xmm6 = v2 in every lane
pshufd xmm6, xmm4, 10101010b
|
||||
; xmm7 = v3 in every lane
pshufd xmm7, xmm4, 11111111b
|
||||
; xmm4 = v0 in every lane (done last because it overwrites the source)
shufps xmm4, xmm4, 00000000b
|
||||
; first pass: scale the four 4-float groups of the matrix at [ecx+64]
; by v0..v3 respectively
mulps xmm4, XMMWORD PTR [ecx+64]
|
||||
mulps xmm5, XMMWORD PTR [ecx+80]
|
||||
mulps xmm6, XMMWORD PTR [ecx+96]
|
||||
mulps xmm7, XMMWORD PTR [ecx+112]
|
||||
; sum the four scaled groups -> xmm4 = vector after the first matrix
addps xmm4, xmm5
|
||||
addps xmm4, xmm6
|
||||
addps xmm4, xmm7
|
||||
; second pass: broadcast the components of the intermediate result
pshufd xmm5, xmm4, 01010101b
|
||||
pshufd xmm6, xmm4, 10101010b
|
||||
pshufd xmm7, xmm4, 11111111b
|
||||
shufps xmm4, xmm4, 00000000b
|
||||
; scale the four 4-float groups of the matrix at [ecx]
mulps xmm4, XMMWORD PTR [ecx]
|
||||
mulps xmm5, XMMWORD PTR [ecx+16]
|
||||
mulps xmm6, XMMWORD PTR [ecx+32]
|
||||
mulps xmm7, XMMWORD PTR [ecx+48]
|
||||
; sum -> xmm4 = vector after both matrices
addps xmm4, xmm5
|
||||
addps xmm4, xmm6
|
||||
addps xmm4, xmm7
|
||||
; write the transformed vector back over the input
movaps XMMWORD PTR [edx], xmm4
|
||||
ret 0
|
||||
@_sse2_MatrixMultVec4x4_M2@8 ENDP
|
||||
|
||||
|
||||
@_sse2_MatrixMultVec3x3@8 PROC PUBLIC
|
||||
movaps xmm4, XMMWORD PTR [edx]
|
||||
pshufd xmm5, xmm4, 01010101b
|
||||
|
@ -148,5 +177,38 @@
|
|||
ret 0
|
||||
@_sse2_MatrixScale@8 ENDP
|
||||
|
||||
; _sse2_fix2float_12 (matrix in ecx, divizor_mask in edx)
; Divides the first 12 floats at [ecx] (three 16-byte groups) in place by the
; four floats at [edx], lane by lane. Floats 12..15 of the matrix are not
; touched.
; Despite the name, no integer-to-float conversion happens here: divps works
; on single-precision values, so the caller must already have stored the
; fixed-point numbers as floats before calling.
; ecx/edx argument registers match x86 __fastcall (presumably MATRIXFASTCALL).
; Both pointers must be 16-byte aligned because movaps is used.
@_sse2_fix2float_12@8 PROC PUBLIC
|
||||
; load the three 4-float groups to be scaled
movaps xmm0, XMMWORD PTR[ecx]
|
||||
movaps xmm1, XMMWORD PTR[ecx+16]
|
||||
movaps xmm2, XMMWORD PTR[ecx+32]
|
||||
; xmm4 = the four divisors
movaps xmm4, XMMWORD PTR [edx]
|
||||
;prefetchnta [ecx+64]
|
||||
; divide every element by the divisor in the same lane
divps xmm0, xmm4
|
||||
divps xmm1, xmm4
|
||||
divps xmm2, xmm4
|
||||
; store the scaled values back over the input
movaps XMMWORD PTR[ecx], xmm0
|
||||
movaps XMMWORD PTR[ecx+16],xmm1
|
||||
movaps XMMWORD PTR[ecx+32],xmm2
|
||||
ret 0
|
||||
@_sse2_fix2float_12@8 ENDP
|
||||
|
||||
; _sse2_fix2float_16 (matrix in ecx, divizor_mask in edx)
; Divides all 16 floats at [ecx] (four 16-byte groups) in place by the four
; floats at [edx], lane by lane.
; Despite the name, no integer-to-float conversion happens here: divps works
; on single-precision values, so the caller must already have stored the
; fixed-point numbers as floats before calling.
; ecx/edx argument registers match x86 __fastcall (presumably MATRIXFASTCALL).
; Both pointers must be 16-byte aligned because movaps is used.
@_sse2_fix2float_16@8 PROC PUBLIC
|
||||
; load the four 4-float groups to be scaled
movaps xmm0, XMMWORD PTR[ecx]
|
||||
movaps xmm1, XMMWORD PTR[ecx+16]
|
||||
movaps xmm2, XMMWORD PTR[ecx+32]
|
||||
movaps xmm3, XMMWORD PTR[ecx+48]
|
||||
; xmm4 = the four divisors
movaps xmm4, XMMWORD PTR [edx]
|
||||
;prefetchnta [ecx+64]
|
||||
; divide every element by the divisor in the same lane
divps xmm0, xmm4
|
||||
divps xmm1, xmm4
|
||||
divps xmm2, xmm4
|
||||
divps xmm3, xmm4
|
||||
; store the scaled values back over the input
movaps XMMWORD PTR[ecx], xmm0
|
||||
movaps XMMWORD PTR[ecx+16],xmm1
|
||||
movaps XMMWORD PTR[ecx+32],xmm2
|
||||
movaps XMMWORD PTR[ecx+48],xmm3
|
||||
ret 0
|
||||
@_sse2_fix2float_16@8 ENDP
|
||||
|
||||
end
|
||||
|
||||
|
|
Loading…
Reference in New Issue