diff --git a/desmume/src/FIFO.cpp b/desmume/src/FIFO.cpp
index 4eb74c1ad..c6ece7aad 100644
--- a/desmume/src/FIFO.cpp
+++ b/desmume/src/FIFO.cpp
@@ -159,8 +159,10 @@ void GFX_FIFOsend(u8 cmd, u32 param)
 	gxFIFO.cmd[gxFIFO.tail] = cmd;
 	gxFIFO.param[gxFIFO.tail] = param;
 	gxFIFO.tail++;
-	if (gxFIFO.tail > 256)
-		gxFIFO.tail = 256;
+
+#ifdef USE_GEOMETRY_FIFO_EMULATION
+	gxstat |= 0x08000000;		// set busy flag
+#endif
 
 	gxstat |= (gxFIFO.tail << 16);
 
@@ -189,6 +191,7 @@ BOOL GFX_FIFOrecv(u8 *cmd, u32 *param)
 		T1WriteLong(MMU.MMU_MEM[ARMCPU_ARM9][0x40], 0x600, gxstat);
 		return FALSE;
 	}
+
 	*cmd = gxFIFO.cmd[0];
 	*param = gxFIFO.param[0];
 	gxFIFO.tail--;
@@ -198,10 +201,11 @@ BOOL GFX_FIFOrecv(u8 *cmd, u32 *param)
 		gxFIFO.param[i] = gxFIFO.param[i+1];
 	}
 
-	gxstat |= (gxFIFO.tail << 16);
+#ifdef USE_GEOMETRY_FIFO_EMULATION
+	gxstat |= 0x08000000;		// set busy flag
+#endif
 
-	if (gxFIFO.tail == 0)
-		gxstat |= 0x04000000;
+	gxstat |= (gxFIFO.tail << 16);
 
 	if (gxFIFO.tail < 128)
 		gxstat |= 0x02000000;
diff --git a/desmume/src/MMU.cpp b/desmume/src/MMU.cpp
index 10563d238..162312314 100644
--- a/desmume/src/MMU.cpp
+++ b/desmume/src/MMU.cpp
@@ -2743,9 +2743,9 @@ u16 FASTCALL _MMU_ARM9_read16(u32 adr)
 		{
 			// ============================================= 3D
 			case 0x04000604:
-				return (gfx3d_GetNumPolys()&2047);
+				return (gfx3d_GetNumPolys());
 			case 0x04000606:
-				return (gfx3d_GetNumVertex()&8191);
+				return (gfx3d_GetNumVertex());
 			case 0x04000630:
 			case 0x04000632:
 			case 0x04000634:
@@ -2847,7 +2847,7 @@ u32 FASTCALL _MMU_ARM9_read32(u32 adr)
 
 			case 0x4000604:
 			{
-				return (gfx3d_GetNumPolys()&2047) & ((gfx3d_GetNumVertex()&8191) << 16);
+				return (gfx3d_GetNumPolys()) | ((gfx3d_GetNumVertex()) << 16);	// polygon count in the low halfword, vertex count in the high halfword; must be OR-ed, AND-ing the two disjoint bit ranges yields 0
 				//LOG ("read32 - RAM_COUNT -> 0x%X", ((u32 *)(MMU.MMU_MEM[ARMCPU_ARM9][(adr>>20)&0xFF]))[(adr&MMU.MMU_MASK[ARMCPU_ARM9][(adr>>20)&0xFF])>>2]);
 			}
 
diff --git a/desmume/src/NDSSystem.cpp b/desmume/src/NDSSystem.cpp
index 7fd265cfc..2ac9c0fd7 100644
--- a/desmume/src/NDSSystem.cpp
+++ b/desmume/src/NDSSystem.cpp
@@ -1891,7 +1891,6 @@ void NDS_exec(s32 nb)
 					T1WriteWord(MMU.ARM7_REG, 4, T1ReadWord(MMU.ARM7_REG, 4) | 1);
 					NDS_ARM9VBlankInt();
 					NDS_ARM7VBlankInt();
-					cheatsProcess();
 
 					nds.runCycleCollector[nds.idleFrameCounter] = 1120380-nds.idleCycles;
 					nds.idleFrameCounter++;
@@ -2495,6 +2494,7 @@ void NDS_exec(s32 nb)
 	}
 
 	currFrameCounter++;
+	cheatsProcess();
 }
 
 static std::string MakeInputDisplayString(u16 pad, const std::string* Buttons, int count) {
diff --git a/desmume/src/gfx3d.cpp b/desmume/src/gfx3d.cpp
index a2adc7753..8540e9fbe 100644
--- a/desmume/src/gfx3d.cpp
+++ b/desmume/src/gfx3d.cpp
@@ -146,6 +146,11 @@ static float float10Table[1024];
 static float float10RelTable[1024];
 static float normalTable[1024];
 
+#ifndef NOSSE2
+float ALIGN(16) _fix2float_divizor_mask[4] = { 4096.f, 4096.f, 4096.f, 4096.f };
+float ALIGN(16) _fix10_2float_divizor_mask[4] = { 512.f, 512.f, 512.f, 512.f };
+#endif
+
 #define fix2float(v)    (((float)((s32)(v))) / (float)(1<<12))
 #define fix10_2float(v) (((float)((s32)(v))) / (float)(1<<9))
 
@@ -429,7 +434,11 @@ void gfx3d_glLoadIdentity()
 
 BOOL gfx3d_glLoadMatrix4x4(s32 v)
 {
+#ifdef NOSSE2
 	mtxCurrent[mode][ML4x4ind] = fix2float(v);
+#else
+	mtxCurrent[mode][ML4x4ind] = v;
+#endif
 
 	++ML4x4ind;
 	if(ML4x4ind<16) return FALSE;
@@ -437,6 +446,10 @@ BOOL gfx3d_glLoadMatrix4x4(s32 v)
 
 	GFX_DELAY(19);
 
+#ifndef NOSSE2
+	_sse2_fix2float_16(mtxCurrent[mode], _fix2float_divizor_mask);
+#endif
+
 	if (mode == 2)
 		MatrixCopy (mtxCurrent[1], mtxCurrent[2]);
 	return TRUE;
@@ -444,16 +457,24 @@ BOOL gfx3d_glLoadMatrix4x4(s32 v)
 
 BOOL gfx3d_glLoadMatrix4x3(s32 v)
 {
+#ifdef NOSSE2
 	mtxCurrent[mode][ML4x3ind] = fix2float(v);
+#else
+	mtxCurrent[mode][ML4x3ind] = v;
+#endif
 
 	ML4x3ind++;
 	if((ML4x3ind & 0x03) == 3) ML4x3ind++;
 	if(ML4x3ind<16) return FALSE;
 	ML4x3ind = 0;
 
+#ifndef NOSSE2
+	_sse2_fix2float_16(mtxCurrent[mode], _fix2float_divizor_mask);
+#endif
+
 	//fill in the unusued matrix values
-	mtxCurrent[mode][3] = mtxCurrent[mode][7] = mtxCurrent[mode][11] = 0;
-	mtxCurrent[mode][15] = 1;
+	mtxCurrent[mode][3] = mtxCurrent[mode][7] = mtxCurrent[mode][11] = 0.f;
+	mtxCurrent[mode][15] = 1.f;
 
 	GFX_DELAY(30);
 
@@ -604,7 +625,12 @@ BOOL gfx3d_glScale(s32 v)
 
 BOOL gfx3d_glMultMatrix3x3(s32 v)
 {
+#ifdef NOSSE2
 	mtxTemporal[MM3x3ind] = fix2float(v);
+#else
+	mtxTemporal[MM3x3ind] = v;
+#endif
+
 
 	MM3x3ind++;
 	if((MM3x3ind & 0x03) == 3) MM3x3ind++;
@@ -613,6 +639,10 @@ BOOL gfx3d_glMultMatrix3x3(s32 v)
 
 	GFX_DELAY(28);
 
+#ifndef NOSSE2
+	_sse2_fix2float_12(mtxTemporal, _fix2float_divizor_mask);
+#endif
+
 	//fill in the unusued matrix values
 	mtxTemporal[3] = mtxTemporal[7] = mtxTemporal[11] = 0;
 	mtxTemporal[15] = 1;
@@ -633,7 +663,11 @@ BOOL gfx3d_glMultMatrix3x3(s32 v)
 
 BOOL gfx3d_glMultMatrix4x3(s32 v)
 {
+#ifdef NOSSE2
 	mtxTemporal[MM4x3ind] = fix2float(v);
+#else
+	mtxTemporal[MM4x3ind] = v;
+#endif
 
 	MM4x3ind++;
 	if((MM4x3ind & 0x03) == 3) MM4x3ind++;
@@ -642,9 +676,13 @@ BOOL gfx3d_glMultMatrix4x3(s32 v)
 
 	GFX_DELAY(31);
 
+#ifndef NOSSE2
+	_sse2_fix2float_16(mtxTemporal, _fix2float_divizor_mask);
+#endif
+
 	//fill in the unusued matrix values
-	mtxTemporal[3] = mtxTemporal[7] = mtxTemporal[11] = 0;
-	mtxTemporal[15] = 1;
+	mtxTemporal[3] = mtxTemporal[7] = mtxTemporal[11] = 0.f;
+	mtxTemporal[15] = 1.f;
 
 	MatrixMultiply (mtxCurrent[mode], mtxTemporal);
 
@@ -661,7 +699,11 @@ BOOL gfx3d_glMultMatrix4x3(s32 v)
 
 BOOL gfx3d_glMultMatrix4x4(s32 v)
 {
+#ifdef NOSSE2
 	mtxTemporal[MM4x4ind] = fix2float(v);
+#else
+	mtxTemporal[MM4x4ind] = v;
+#endif
 
 	MM4x4ind++;
 	if(MM4x4ind<16) return FALSE;
@@ -669,6 +711,10 @@ BOOL gfx3d_glMultMatrix4x4(s32 v)
 
 	GFX_DELAY(35);
 
+#ifndef NOSSE2
+	_sse2_fix2float_16(mtxTemporal, _fix2float_divizor_mask);
+#endif
+
 	MatrixMultiply (mtxCurrent[mode], mtxTemporal);
 
 	if (mode == 2)
@@ -747,11 +793,15 @@ static void SetVertex()
 	if(polylist->count >= POLYLIST_SIZE) 
 			return;
 	
+#ifdef NOSSE2
 	//apply modelview matrix
 	MatrixMultVec4x4 (mtxCurrent[1], coordTransformed);
 
 	//apply projection matrix
 	MatrixMultVec4x4 (mtxCurrent[0], coordTransformed);
+#else
+	_sse2_MatrixMultVec4x4_M2(mtxCurrent[0], coordTransformed);
+#endif
 
 	//TODO - culling should be done here.
 	//TODO - viewport transform?
@@ -924,13 +974,13 @@ void gfx3d_glSwapScreen(unsigned int screen)
 int gfx3d_GetNumPolys()
 {
 	//so is this in the currently-displayed or currently-built list?
-	return 0;
+	return (polylists[listTwiddle].count);	// report the count of the polygon list currently selected by listTwiddle (previously always 0)
 }
 
 int gfx3d_GetNumVertex()
 {
 	//so is this in the currently-displayed or currently-built list?
-	return 0;
+	return (vertlists[listTwiddle].count);	// report the count of the vertex list currently selected by listTwiddle (previously always 0)
 }
 
 
@@ -1354,12 +1404,12 @@ unsigned short gfx3d_glGetVecRes(unsigned int index)
 
 #ifdef USE_GEOMETRY_FIFO_EMULATION
 
-//#define _3D_LOG
+//#define _3D_LOG_EXEC
 void gfx3d_execute(u8 cmd, u32 param)
 {
-#ifdef _3D_LOG
+#ifdef _3D_LOG_EXEC
 	u32 gxstat2 = T1ReadLong(MMU.MMU_MEM[ARMCPU_ARM9][0x40], 0x600);
-	INFO("- execute GX FIFO cmd 0x%02X, gxstat 0x%08X (%03i): time %i/%i\n", cmd, gxstat2, gxFIFO.tail, nds.cycles, MMU.gfx3dCycles);
+	INFO("*** gxFIFO: exec 0x%02X, tail %03i, gxstat 0x%08X\n", cmd, gxFIFO.tail, gxstat2);
 #endif
 	switch (cmd)
 	{
@@ -1490,6 +1540,18 @@ void gfx3d_execute3D()
 	if (GFX_FIFOrecv(&cmd, &param))
 	{
 		gfx3d_execute(cmd, param);
+#if 0
+		for ( ;;)
+		{
+			if ( (cmd == 0x11) || (cmd==0x15) || (cmd==41) )
+			{
+				if (!GFX_FIFOrecv(&cmd, &param)) return;
+				gfx3d_execute(cmd, param);
+				continue;
+			}
+			break;
+		}
+#endif
 #if 0
 		if (bWaitForPolys)
 		{
@@ -1633,6 +1695,11 @@ void gfx3d_VBlankSignal()
 {
 #ifdef USE_GEOMETRY_FIFO_EMULATION
 	isVBlank = true;
+		if (isSwapBuffers)
+	{
+		isSwapBuffers = false;
+		GFX_DELAY(392);
+	}
 #else
 	//the 3d buffers are swapped when a vblank begins.
 	//so, if we have a redraw pending, now is a safe time to do it
@@ -1668,12 +1735,6 @@ void gfx3d_VBlankEndSignal(bool skipFrame)
 				gpu3D->NDS_3D_Render();
 		}
 	}
-
-	if (isSwapBuffers)
-	{
-		isSwapBuffers = false;
-		GFX_DELAY(392);
-	}
 #else
 	//if we are skipping 3d frames then the 3d rendering will get held up here.
 	//but, as soon as we quit skipping frames, the held-up 3d frame will render
@@ -1695,6 +1756,8 @@ void gfx3d_VBlankEndSignal(bool skipFrame)
 }
 
 #ifdef USE_GEOMETRY_FIFO_EMULATION
+//#define _3D_LOG
+
 static void NOPARAMS()
 {
 	for (;;)
@@ -1834,6 +1897,7 @@ void gfx3d_sendCommandToFIFO(u32 val)
 			clCmd >>= 8;
 			return;
 	}
+	NOPARAMS();
 }
 
 void gfx3d_sendCommand(u32 cmd, u32 param)
diff --git a/desmume/src/matrix.cpp b/desmume/src/matrix.cpp
index b298cb4fe..f611473c6 100644
--- a/desmume/src/matrix.cpp
+++ b/desmume/src/matrix.cpp
@@ -127,16 +127,11 @@ void MatrixTranspose(float *matrix)
 #undef swap
 }
 
-void MATRIXFASTCALL MatrixIdentity	(float *matrix) //============== TODO
+void MATRIXFASTCALL MatrixIdentity	(float *matrix)	// overwrite the 16 floats at matrix with the 4x4 identity
 {
-	//memset (matrix, 0, sizeof(float)*16);
-	//this is fastest for SSE2 i think.
-	//study code generation and split into sse2 specific module later
-	for(int i=0;i<16;i++)
-		matrix[i] = 0.0f;
-	//matrix[1] = matrix[2] = matrix[3] = matrix[4] = 0.0f;
-	//matrix[6] = matrix[7] = matrix[8] = matrix[9] = 0.0f;
-	//matrix[11] = matrix[12] = matrix[13] = matrix[14] = 0.0f;
+	matrix[1] = matrix[2] = matrix[3] = matrix[4] = 0.0f;	// zero only the twelve off-diagonal slots (indices 0, 5, 10, 15 are set below)
+	matrix[6] = matrix[7] = matrix[8] = matrix[9] = 0.0f;	// off-diagonal slots between indices 5 and 10
+	matrix[11] = matrix[12] = matrix[13] = matrix[14] = 0.0f;	// off-diagonal slots between indices 10 and 15
 	matrix[0] = matrix[5] = matrix[10] = matrix[15] = 1.f;
 }
 
@@ -155,7 +150,23 @@ void MATRIXFASTCALL MatrixSet (float *matrix, int x, int y, float value)	// TODO
 
 void MATRIXFASTCALL MatrixCopy (float* matrixDST, const float* matrixSRC)
 {
-	memcpy ((void*)matrixDST, matrixSRC, sizeof(float)*16);
+	matrixDST[0] = matrixSRC[0];	// unrolled element-by-element copy of all 16 floats, replacing the former memcpy of sizeof(float)*16 bytes
+	matrixDST[1] = matrixSRC[1];
+	matrixDST[2] = matrixSRC[2];
+	matrixDST[3] = matrixSRC[3];
+	matrixDST[4] = matrixSRC[4];
+	matrixDST[5] = matrixSRC[5];
+	matrixDST[6] = matrixSRC[6];
+	matrixDST[7] = matrixSRC[7];
+	matrixDST[8] = matrixSRC[8];
+	matrixDST[9] = matrixSRC[9];
+	matrixDST[10] = matrixSRC[10];
+	matrixDST[11] = matrixSRC[11];
+	matrixDST[12] = matrixSRC[12];
+	matrixDST[13] = matrixSRC[13];
+	matrixDST[14] = matrixSRC[14];
+	matrixDST[15] = matrixSRC[15];	// last element; indices 0..15 are all copied
+
 }
 
 int MATRIXFASTCALL MatrixCompare (const float* matrixDST, const float* matrixSRC)
diff --git a/desmume/src/matrix.h b/desmume/src/matrix.h
index a0687691f..f87ed8ec2 100644
--- a/desmume/src/matrix.h
+++ b/desmume/src/matrix.h
@@ -53,6 +53,9 @@ void	MatrixInit				(float *matrix);
 #define MatrixMultiply _sse2_MatrixMultiply
 #define MatrixTranslate _sse2_MatrixTranslate
 #define MatrixScale _sse2_MatrixScale
+void	MATRIXFASTCALL _sse2_fix2float_16		(float* matrix, float* divizor_mask);
+void	MATRIXFASTCALL _sse2_fix2float_12		(float* matrix, float* divizor_mask);
+void	MATRIXFASTCALL _sse2_MatrixMultVec4x4_M2 (const float * matrix, float * vecPtr); // mode 2
 #else
 #define SSE2_FUNC(X) X
 #endif
diff --git a/desmume/src/matrix_sse2-x86.asm b/desmume/src/matrix_sse2-x86.asm
index 550bbe515..4f2ddf30e 100644
--- a/desmume/src/matrix_sse2-x86.asm
+++ b/desmume/src/matrix_sse2-x86.asm
@@ -41,6 +41,35 @@
 		ret		0
 @_sse2_MatrixMultVec4x4@8 ENDP
 
+@_sse2_MatrixMultVec4x4_M2@8 PROC PUBLIC	; transform the 4-float vector at [edx] in place: first by the 16 floats at ecx+64, then by the 16 floats at ecx
+		movaps	xmm4, XMMWORD PTR [edx]	; load the input vector (movaps requires a 16-byte-aligned address)
+		pshufd	xmm5, xmm4, 01010101b	; xmm5 = element 1 replicated into all four lanes
+		pshufd	xmm6, xmm4, 10101010b	; xmm6 = element 2 replicated into all four lanes
+		pshufd	xmm7, xmm4, 11111111b	; xmm7 = element 3 replicated into all four lanes
+		shufps	xmm4, xmm4, 00000000b	; xmm4 = element 0 replicated into all four lanes
+		mulps	xmm4, XMMWORD PTR [ecx+64]	; pass 1: scale the four 4-float groups of the matrix stored 64 bytes after ecx
+		mulps	xmm5, XMMWORD PTR [ecx+80]
+		mulps	xmm6, XMMWORD PTR [ecx+96]
+		mulps	xmm7, XMMWORD PTR [ecx+112]
+		addps	xmm4, xmm5	; accumulate the four scaled groups into the intermediate vector
+		addps	xmm4, xmm6
+		addps	xmm4, xmm7
+		pshufd	xmm5, xmm4, 01010101b	; pass 2: replicate each element of the intermediate vector the same way
+		pshufd	xmm6, xmm4, 10101010b
+		pshufd	xmm7, xmm4, 11111111b
+		shufps	xmm4, xmm4, 00000000b
+		mulps	xmm4, XMMWORD PTR [ecx]	; scale the four 4-float groups of the matrix stored at ecx itself
+		mulps	xmm5, XMMWORD PTR [ecx+16]
+		mulps	xmm6, XMMWORD PTR [ecx+32]
+		mulps	xmm7, XMMWORD PTR [ecx+48]
+		addps	xmm4, xmm5	; accumulate into the final vector
+		addps	xmm4, xmm6
+		addps	xmm4, xmm7
+		movaps	XMMWORD PTR [edx], xmm4	; store the result over the input vector
+		ret		0
+@_sse2_MatrixMultVec4x4_M2@8 ENDP
+
+
 @_sse2_MatrixMultVec3x3@8 PROC PUBLIC
 		movaps	xmm4, XMMWORD PTR [edx]
 		pshufd	xmm5, xmm4, 01010101b
@@ -148,5 +177,38 @@
 		ret		0
 @_sse2_MatrixScale@8 ENDP
 
+@_sse2_fix2float_12@8 PROC PUBLIC	; divide the first 12 floats at [ecx] in place by the 4-float divisor vector at [edx]
+		movaps	xmm0, XMMWORD PTR[ecx]	; load floats 0..3 (movaps requires 16-byte-aligned addresses)
+		movaps	xmm1, XMMWORD PTR[ecx+16]	; load floats 4..7
+		movaps	xmm2, XMMWORD PTR[ecx+32]	; load floats 8..11
+		movaps	xmm4, XMMWORD PTR [edx]	; load the four divisors
+		;prefetchnta [ecx+64]
+		divps	xmm0, xmm4	; lane-wise division of each group by the divisors
+		divps	xmm1, xmm4
+		divps	xmm2, xmm4
+		movaps	XMMWORD PTR[ecx],	xmm0	; write the quotients back over the source; floats 12..15 are left untouched
+		movaps	XMMWORD PTR[ecx+16],xmm1
+		movaps	XMMWORD PTR[ecx+32],xmm2
+		ret 0
+@_sse2_fix2float_12@8 ENDP
+
+@_sse2_fix2float_16@8 PROC PUBLIC	; divide all 16 floats at [ecx] in place by the 4-float divisor vector at [edx]
+		movaps	xmm0, XMMWORD PTR[ecx]	; load floats 0..3 (movaps requires 16-byte-aligned addresses)
+		movaps	xmm1, XMMWORD PTR[ecx+16]	; load floats 4..7
+		movaps	xmm2, XMMWORD PTR[ecx+32]	; load floats 8..11
+		movaps	xmm3, XMMWORD PTR[ecx+48]	; load floats 12..15
+		movaps	xmm4, XMMWORD PTR [edx]	; load the four divisors
+		;prefetchnta [ecx+64]
+		divps	xmm0, xmm4	; lane-wise division of each group by the divisors
+		divps	xmm1, xmm4
+		divps	xmm2, xmm4
+		divps	xmm3, xmm4
+		movaps	XMMWORD PTR[ecx],	xmm0	; write the quotients back over the source
+		movaps	XMMWORD PTR[ecx+16],xmm1
+		movaps	XMMWORD PTR[ecx+32],xmm2
+		movaps	XMMWORD PTR[ecx+48],xmm3
+		ret 0
+@_sse2_fix2float_16@8 ENDP
+
 end