diff --git a/desmume/src/FIFO.cpp b/desmume/src/FIFO.cpp index 4eb74c1ad..c6ece7aad 100644 --- a/desmume/src/FIFO.cpp +++ b/desmume/src/FIFO.cpp @@ -159,8 +159,10 @@ void GFX_FIFOsend(u8 cmd, u32 param) gxFIFO.cmd[gxFIFO.tail] = cmd; gxFIFO.param[gxFIFO.tail] = param; gxFIFO.tail++; - if (gxFIFO.tail > 256) - gxFIFO.tail = 256; + +#ifdef USE_GEOMETRY_FIFO_EMULATION + gxstat |= 0x08000000; // set busy flag +#endif gxstat |= (gxFIFO.tail << 16); @@ -189,6 +191,7 @@ BOOL GFX_FIFOrecv(u8 *cmd, u32 *param) T1WriteLong(MMU.MMU_MEM[ARMCPU_ARM9][0x40], 0x600, gxstat); return FALSE; } + *cmd = gxFIFO.cmd[0]; *param = gxFIFO.param[0]; gxFIFO.tail--; @@ -198,10 +201,11 @@ BOOL GFX_FIFOrecv(u8 *cmd, u32 *param) gxFIFO.param[i] = gxFIFO.param[i+1]; } - gxstat |= (gxFIFO.tail << 16); +#ifdef USE_GEOMETRY_FIFO_EMULATION + gxstat |= 0x08000000; // set busy flag +#endif - if (gxFIFO.tail == 0) - gxstat |= 0x04000000; + gxstat |= (gxFIFO.tail << 16); if (gxFIFO.tail < 128) gxstat |= 0x02000000; diff --git a/desmume/src/MMU.cpp b/desmume/src/MMU.cpp index 10563d238..162312314 100644 --- a/desmume/src/MMU.cpp +++ b/desmume/src/MMU.cpp @@ -2743,9 +2743,9 @@ u16 FASTCALL _MMU_ARM9_read16(u32 adr) { // ============================================= 3D case 0x04000604: - return (gfx3d_GetNumPolys()&2047); + return (gfx3d_GetNumPolys()); case 0x04000606: - return (gfx3d_GetNumVertex()&8191); + return (gfx3d_GetNumVertex()); case 0x04000630: case 0x04000632: case 0x04000634: @@ -2847,7 +2847,7 @@ u32 FASTCALL _MMU_ARM9_read32(u32 adr) case 0x4000604: { - return (gfx3d_GetNumPolys()&2047) & ((gfx3d_GetNumVertex()&8191) << 16); + return (gfx3d_GetNumPolys()) & ((gfx3d_GetNumVertex()) << 16); //LOG ("read32 - RAM_COUNT -> 0x%X", ((u32 *)(MMU.MMU_MEM[ARMCPU_ARM9][(adr>>20)&0xFF]))[(adr&MMU.MMU_MASK[ARMCPU_ARM9][(adr>>20)&0xFF])>>2]); } diff --git a/desmume/src/NDSSystem.cpp b/desmume/src/NDSSystem.cpp index 7fd265cfc..2ac9c0fd7 100644 --- a/desmume/src/NDSSystem.cpp +++ 
b/desmume/src/NDSSystem.cpp @@ -1891,7 +1891,6 @@ void NDS_exec(s32 nb) T1WriteWord(MMU.ARM7_REG, 4, T1ReadWord(MMU.ARM7_REG, 4) | 1); NDS_ARM9VBlankInt(); NDS_ARM7VBlankInt(); - cheatsProcess(); nds.runCycleCollector[nds.idleFrameCounter] = 1120380-nds.idleCycles; nds.idleFrameCounter++; @@ -2495,6 +2494,7 @@ void NDS_exec(s32 nb) } currFrameCounter++; + cheatsProcess(); } static std::string MakeInputDisplayString(u16 pad, const std::string* Buttons, int count) { diff --git a/desmume/src/gfx3d.cpp b/desmume/src/gfx3d.cpp index a2adc7753..8540e9fbe 100644 --- a/desmume/src/gfx3d.cpp +++ b/desmume/src/gfx3d.cpp @@ -146,6 +146,11 @@ static float float10Table[1024]; static float float10RelTable[1024]; static float normalTable[1024]; +#ifndef NOSSE2 +float ALIGN(16) _fix2float_divizor_mask[4] = { 4096.f, 4096.f, 4096.f, 4096.f }; +float ALIGN(16) _fix10_2float_divizor_mask[4] = { 512.f, 512.f, 512.f, 512.f }; +#endif + #define fix2float(v) (((float)((s32)(v))) / (float)(1<<12)) #define fix10_2float(v) (((float)((s32)(v))) / (float)(1<<9)) @@ -429,7 +434,11 @@ void gfx3d_glLoadIdentity() BOOL gfx3d_glLoadMatrix4x4(s32 v) { +#ifdef NOSSE2 mtxCurrent[mode][ML4x4ind] = fix2float(v); +#else + mtxCurrent[mode][ML4x4ind] = v; +#endif ++ML4x4ind; if(ML4x4ind<16) return FALSE; @@ -437,6 +446,10 @@ BOOL gfx3d_glLoadMatrix4x4(s32 v) GFX_DELAY(19); +#ifndef NOSSE2 + _sse2_fix2float_16(mtxCurrent[mode], _fix2float_divizor_mask); +#endif + if (mode == 2) MatrixCopy (mtxCurrent[1], mtxCurrent[2]); return TRUE; @@ -444,16 +457,24 @@ BOOL gfx3d_glLoadMatrix4x4(s32 v) BOOL gfx3d_glLoadMatrix4x3(s32 v) { +#ifdef NOSSE2 mtxCurrent[mode][ML4x3ind] = fix2float(v); +#else + mtxCurrent[mode][ML4x3ind] = v; +#endif ML4x3ind++; if((ML4x3ind & 0x03) == 3) ML4x3ind++; if(ML4x3ind<16) return FALSE; ML4x3ind = 0; +#ifndef NOSSE2 + _sse2_fix2float_16(mtxCurrent[mode], _fix2float_divizor_mask); +#endif + //fill in the unusued matrix values - mtxCurrent[mode][3] = mtxCurrent[mode][7] = 
mtxCurrent[mode][11] = 0; - mtxCurrent[mode][15] = 1; + mtxCurrent[mode][3] = mtxCurrent[mode][7] = mtxCurrent[mode][11] = 0.f; + mtxCurrent[mode][15] = 1.f; GFX_DELAY(30); @@ -604,7 +625,12 @@ BOOL gfx3d_glScale(s32 v) BOOL gfx3d_glMultMatrix3x3(s32 v) { +#ifdef NOSSE2 mtxTemporal[MM3x3ind] = fix2float(v); +#else + mtxTemporal[MM3x3ind] = v; +#endif + MM3x3ind++; if((MM3x3ind & 0x03) == 3) MM3x3ind++; @@ -613,6 +639,10 @@ BOOL gfx3d_glMultMatrix3x3(s32 v) GFX_DELAY(28); +#ifndef NOSSE2 + _sse2_fix2float_12(mtxTemporal, _fix2float_divizor_mask); +#endif + //fill in the unusued matrix values mtxTemporal[3] = mtxTemporal[7] = mtxTemporal[11] = 0; mtxTemporal[15] = 1; @@ -633,7 +663,11 @@ BOOL gfx3d_glMultMatrix3x3(s32 v) BOOL gfx3d_glMultMatrix4x3(s32 v) { +#ifdef NOSSE2 mtxTemporal[MM4x3ind] = fix2float(v); +#else + mtxTemporal[MM4x3ind] = v; +#endif MM4x3ind++; if((MM4x3ind & 0x03) == 3) MM4x3ind++; @@ -642,9 +676,13 @@ BOOL gfx3d_glMultMatrix4x3(s32 v) GFX_DELAY(31); +#ifndef NOSSE2 + _sse2_fix2float_16(mtxTemporal, _fix2float_divizor_mask); +#endif + //fill in the unusued matrix values - mtxTemporal[3] = mtxTemporal[7] = mtxTemporal[11] = 0; - mtxTemporal[15] = 1; + mtxTemporal[3] = mtxTemporal[7] = mtxTemporal[11] = 0.f; + mtxTemporal[15] = 1.f; MatrixMultiply (mtxCurrent[mode], mtxTemporal); @@ -661,7 +699,11 @@ BOOL gfx3d_glMultMatrix4x3(s32 v) BOOL gfx3d_glMultMatrix4x4(s32 v) { +#ifdef NOSSE2 mtxTemporal[MM4x4ind] = fix2float(v); +#else + mtxTemporal[MM4x4ind] = v; +#endif MM4x4ind++; if(MM4x4ind<16) return FALSE; @@ -669,6 +711,10 @@ BOOL gfx3d_glMultMatrix4x4(s32 v) GFX_DELAY(35); +#ifndef NOSSE2 + _sse2_fix2float_16(mtxTemporal, _fix2float_divizor_mask); +#endif + MatrixMultiply (mtxCurrent[mode], mtxTemporal); if (mode == 2) @@ -747,11 +793,15 @@ static void SetVertex() if(polylist->count >= POLYLIST_SIZE) return; +#ifdef NOSSE2 //apply modelview matrix MatrixMultVec4x4 (mtxCurrent[1], coordTransformed); //apply projection matrix MatrixMultVec4x4 
(mtxCurrent[0], coordTransformed); +#else + _sse2_MatrixMultVec4x4_M2(mtxCurrent[0], coordTransformed); +#endif //TODO - culling should be done here. //TODO - viewport transform? @@ -924,13 +974,13 @@ void gfx3d_glSwapScreen(unsigned int screen) int gfx3d_GetNumPolys() { //so is this in the currently-displayed or currently-built list? - return 0; + return (polylists[listTwiddle].count); } int gfx3d_GetNumVertex() { //so is this in the currently-displayed or currently-built list? - return 0; + return (vertlists[listTwiddle].count); } @@ -1354,12 +1404,12 @@ unsigned short gfx3d_glGetVecRes(unsigned int index) #ifdef USE_GEOMETRY_FIFO_EMULATION -//#define _3D_LOG +//#define _3D_LOG_EXEC void gfx3d_execute(u8 cmd, u32 param) { -#ifdef _3D_LOG +#ifdef _3D_LOG_EXEC u32 gxstat2 = T1ReadLong(MMU.MMU_MEM[ARMCPU_ARM9][0x40], 0x600); - INFO("- execute GX FIFO cmd 0x%02X, gxstat 0x%08X (%03i): time %i/%i\n", cmd, gxstat2, gxFIFO.tail, nds.cycles, MMU.gfx3dCycles); + INFO("*** gxFIFO: exec 0x%02X, tail %03i, gxstat 0x%08X\n", cmd, gxFIFO.tail, gxstat2); #endif switch (cmd) { @@ -1490,6 +1540,18 @@ void gfx3d_execute3D() if (GFX_FIFOrecv(&cmd, &param)) { gfx3d_execute(cmd, param); +#if 0 + for ( ;;) + { + if ( (cmd == 0x11) || (cmd==0x15) || (cmd==41) ) + { + if (!GFX_FIFOrecv(&cmd, &param)) return; + gfx3d_execute(cmd, param); + continue; + } + break; + } +#endif #if 0 if (bWaitForPolys) { @@ -1633,6 +1695,11 @@ void gfx3d_VBlankSignal() { #ifdef USE_GEOMETRY_FIFO_EMULATION isVBlank = true; + if (isSwapBuffers) + { + isSwapBuffers = false; + GFX_DELAY(392); + } #else //the 3d buffers are swapped when a vblank begins. //so, if we have a redraw pending, now is a safe time to do it @@ -1668,12 +1735,6 @@ void gfx3d_VBlankEndSignal(bool skipFrame) gpu3D->NDS_3D_Render(); } } - - if (isSwapBuffers) - { - isSwapBuffers = false; - GFX_DELAY(392); - } #else //if we are skipping 3d frames then the 3d rendering will get held up here. 
//but, as soon as we quit skipping frames, the held-up 3d frame will render @@ -1695,6 +1756,8 @@ void gfx3d_VBlankEndSignal(bool skipFrame) } #ifdef USE_GEOMETRY_FIFO_EMULATION +//#define _3D_LOG + static void NOPARAMS() { for (;;) @@ -1834,6 +1897,7 @@ void gfx3d_sendCommandToFIFO(u32 val) clCmd >>= 8; return; } + NOPARAMS(); } void gfx3d_sendCommand(u32 cmd, u32 param) diff --git a/desmume/src/matrix.cpp b/desmume/src/matrix.cpp index b298cb4fe..f611473c6 100644 --- a/desmume/src/matrix.cpp +++ b/desmume/src/matrix.cpp @@ -127,16 +127,11 @@ void MatrixTranspose(float *matrix) #undef swap } -void MATRIXFASTCALL MatrixIdentity (float *matrix) //============== TODO +void MATRIXFASTCALL MatrixIdentity (float *matrix) { - //memset (matrix, 0, sizeof(float)*16); - //this is fastest for SSE2 i think. - //study code generation and split into sse2 specific module later - for(int i=0;i<16;i++) - matrix[i] = 0.0f; - //matrix[1] = matrix[2] = matrix[3] = matrix[4] = 0.0f; - //matrix[6] = matrix[7] = matrix[8] = matrix[9] = 0.0f; - //matrix[11] = matrix[12] = matrix[13] = matrix[14] = 0.0f; + matrix[1] = matrix[2] = matrix[3] = matrix[4] = 0.0f; + matrix[6] = matrix[7] = matrix[8] = matrix[9] = 0.0f; + matrix[11] = matrix[12] = matrix[13] = matrix[14] = 0.0f; matrix[0] = matrix[5] = matrix[10] = matrix[15] = 1.f; } @@ -155,7 +150,23 @@ void MATRIXFASTCALL MatrixSet (float *matrix, int x, int y, float value) // TODO void MATRIXFASTCALL MatrixCopy (float* matrixDST, const float* matrixSRC) { - memcpy ((void*)matrixDST, matrixSRC, sizeof(float)*16); + matrixDST[0] = matrixSRC[0]; + matrixDST[1] = matrixSRC[1]; + matrixDST[2] = matrixSRC[2]; + matrixDST[3] = matrixSRC[3]; + matrixDST[4] = matrixSRC[4]; + matrixDST[5] = matrixSRC[5]; + matrixDST[6] = matrixSRC[6]; + matrixDST[7] = matrixSRC[7]; + matrixDST[8] = matrixSRC[8]; + matrixDST[9] = matrixSRC[9]; + matrixDST[10] = matrixSRC[10]; + matrixDST[11] = matrixSRC[11]; + matrixDST[12] = matrixSRC[12]; + matrixDST[13] = 
matrixSRC[13]; + matrixDST[14] = matrixSRC[14]; + matrixDST[15] = matrixSRC[15]; + } int MATRIXFASTCALL MatrixCompare (const float* matrixDST, const float* matrixSRC) diff --git a/desmume/src/matrix.h b/desmume/src/matrix.h index a0687691f..f87ed8ec2 100644 --- a/desmume/src/matrix.h +++ b/desmume/src/matrix.h @@ -53,6 +53,9 @@ void MatrixInit (float *matrix); #define MatrixMultiply _sse2_MatrixMultiply #define MatrixTranslate _sse2_MatrixTranslate #define MatrixScale _sse2_MatrixScale +void MATRIXFASTCALL _sse2_fix2float_16 (float* matrix, float* divizor_mask); +void MATRIXFASTCALL _sse2_fix2float_12 (float* matrix, float* divizor_mask); +void MATRIXFASTCALL _sse2_MatrixMultVec4x4_M2 (const float * matrix, float * vecPtr); // mode 2 #else #define SSE2_FUNC(X) X #endif diff --git a/desmume/src/matrix_sse2-x86.asm b/desmume/src/matrix_sse2-x86.asm index 550bbe515..4f2ddf30e 100644 --- a/desmume/src/matrix_sse2-x86.asm +++ b/desmume/src/matrix_sse2-x86.asm @@ -41,6 +41,35 @@ ret 0 @_sse2_MatrixMultVec4x4@8 ENDP +@_sse2_MatrixMultVec4x4_M2@8 PROC PUBLIC + movaps xmm4, XMMWORD PTR [edx] + pshufd xmm5, xmm4, 01010101b + pshufd xmm6, xmm4, 10101010b + pshufd xmm7, xmm4, 11111111b + shufps xmm4, xmm4, 00000000b + mulps xmm4, XMMWORD PTR [ecx+64] + mulps xmm5, XMMWORD PTR [ecx+80] + mulps xmm6, XMMWORD PTR [ecx+96] + mulps xmm7, XMMWORD PTR [ecx+112] + addps xmm4, xmm5 + addps xmm4, xmm6 + addps xmm4, xmm7 + pshufd xmm5, xmm4, 01010101b + pshufd xmm6, xmm4, 10101010b + pshufd xmm7, xmm4, 11111111b + shufps xmm4, xmm4, 00000000b + mulps xmm4, XMMWORD PTR [ecx] + mulps xmm5, XMMWORD PTR [ecx+16] + mulps xmm6, XMMWORD PTR [ecx+32] + mulps xmm7, XMMWORD PTR [ecx+48] + addps xmm4, xmm5 + addps xmm4, xmm6 + addps xmm4, xmm7 + movaps XMMWORD PTR [edx], xmm4 + ret 0 +@_sse2_MatrixMultVec4x4_M2@8 ENDP + + @_sse2_MatrixMultVec3x3@8 PROC PUBLIC movaps xmm4, XMMWORD PTR [edx] pshufd xmm5, xmm4, 01010101b @@ -148,5 +177,38 @@ ret 0 @_sse2_MatrixScale@8 ENDP +@_sse2_fix2float_12@8 PROC 
PUBLIC + movaps xmm0, XMMWORD PTR[ecx] + movaps xmm1, XMMWORD PTR[ecx+16] + movaps xmm2, XMMWORD PTR[ecx+32] + movaps xmm4, XMMWORD PTR [edx] + ;prefetchnta [ecx+64] + divps xmm0, xmm4 + divps xmm1, xmm4 + divps xmm2, xmm4 + movaps XMMWORD PTR[ecx], xmm0 + movaps XMMWORD PTR[ecx+16],xmm1 + movaps XMMWORD PTR[ecx+32],xmm2 + ret 0 +@_sse2_fix2float_12@8 ENDP + +@_sse2_fix2float_16@8 PROC PUBLIC + movaps xmm0, XMMWORD PTR[ecx] + movaps xmm1, XMMWORD PTR[ecx+16] + movaps xmm2, XMMWORD PTR[ecx+32] + movaps xmm3, XMMWORD PTR[ecx+48] + movaps xmm4, XMMWORD PTR [edx] + ;prefetchnta [ecx+64] + divps xmm0, xmm4 + divps xmm1, xmm4 + divps xmm2, xmm4 + divps xmm3, xmm4 + movaps XMMWORD PTR[ecx], xmm0 + movaps XMMWORD PTR[ecx+16],xmm1 + movaps XMMWORD PTR[ecx+32],xmm2 + movaps XMMWORD PTR[ecx+48],xmm3 + ret 0 +@_sse2_fix2float_16@8 ENDP + end