From 56d3982dc51a9eb1c7ecafe4c8eb2c3d7afff2d0 Mon Sep 17 00:00:00 2001
From: "Jake.Stine" <Jake.Stine@96395faa-99c1-11dd-bbfe-3dabce05a288>
Date: Tue, 13 Jul 2010 04:36:39 +0000
Subject: [PATCH] * Minor optimization to GIFpath: use the precached numregs
 value in place of a convoluted test against NREG. * Disabled the SSE store
 version of memzero (no performance benefit, and it was messy anyway)

git-svn-id: http://pcsx2.googlecode.com/svn/trunk@3473 96395faa-99c1-11dd-bbfe-3dabce05a288
---
 common/include/Utilities/win_memzero.h | 66 ++++++++++----------------
 pcsx2/ps2/GIFpath.cpp                  |  2 +-
 2 files changed, 25 insertions(+), 43 deletions(-)
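
For reference: the GIFpath hunk below relies on the old wrap test,
((++curreg & 0xf) == tag.NREG), and the new test, (++curreg >= numregs),
being equivalent.  That holds provided numregs is cached from the tag with
NREG == 0 standing for 16 registers; that caching rule is stated here as an
assumption, and the standalone C++ sketch below is illustrative only (none of
its names come from GIFpath.cpp).  It simply brute-forces the equivalence:

    // Illustrative sketch, not GIFpath.cpp code.  Assumes numregs is cached
    // per tag as (NREG ? NREG : 16), i.e. NREG == 0 encodes 16 registers.
    #include <cassert>
    #include <cstdint>

    int main()
    {
        for (uint32_t NREG = 0; NREG < 16; ++NREG)
        {
            const uint32_t numregs = (NREG != 0) ? NREG : 16;

            for (uint32_t curreg = 0; curreg < numregs; ++curreg)
            {
                // Old test: wrap when the incremented register index,
                // masked to 4 bits, equals the raw NREG field.
                const bool oldWrap = ((curreg + 1) & 0xf) == NREG;
                // New test: wrap when the incremented index reaches the
                // precached register count.
                const bool newWrap = (curreg + 1) >= numregs;
                assert(oldWrap == newWrap);
            }
        }
        return 0;
    }

Besides being simpler to read, the cached count avoids re-deriving the wrap
point on every register step, which is the "minor optimization" the subject
line refers to.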

diff --git a/common/include/Utilities/win_memzero.h b/common/include/Utilities/win_memzero.h
index 5207002cdc..639fba6b71 100644
--- a/common/include/Utilities/win_memzero.h
+++ b/common/include/Utilities/win_memzero.h
@@ -73,6 +73,10 @@ static __forceinline void memzero_ptr( void *dest )
 		return;
 	}
 
+#if 0
+	// SSE-based memory clear.  Currently disabled to avoid an unnecessary dependence on
+	// SSE cpu instruction sets.  (memzero typically isn't used in any performance-critical
+	// situations anyway)
 	enum
 	{
 		remainder = MZFbytes & 127,
@@ -86,8 +90,6 @@ static __forceinline void memzero_ptr( void *dest )
 
 	if( (MZFbytes & 0xf) == 0 )
 	{
-		u64 _xmm_backup[2];
-
 		if( ((uptr)dest & 0xf) != 0 )
 		{
 			// UNALIGNED COPY MODE.
@@ -97,24 +99,21 @@ static __forceinline void memzero_ptr( void *dest )
 			{
 				__asm
 				{
-					movups _xmm_backup,xmm0;
 					mov ecx,dest
 					pxor xmm0,xmm0
 					mov eax,bytes128
 
-					align 16
-
 				_loop_6:
-					movups [ecx],xmm0;
-					movups [ecx+0x10],xmm0;
-					movups [ecx+0x20],xmm0;
-					movups [ecx+0x30],xmm0;
-					movups [ecx+0x40],xmm0;
-					movups [ecx+0x50],xmm0;
-					movups [ecx+0x60],xmm0;
-					movups [ecx+0x70],xmm0;
+					movups [ecx],xmm0
+					movups [ecx+0x10],xmm0
+					movups [ecx+0x20],xmm0
+					movups [ecx+0x30],xmm0
+					movups [ecx+0x40],xmm0
+					movups [ecx+0x50],xmm0
+					movups [ecx+0x60],xmm0
+					movups [ecx+0x70],xmm0
 					sub ecx,-128
-					dec eax;
+					sub eax,1
 					jnz _loop_6;
 				}
 				if( remainder != 0 )
@@ -130,10 +129,6 @@ static __forceinline void memzero_ptr( void *dest )
 						jnz _loop_5;
 					}
 				}
-				__asm
-				{
-					movups xmm0,[_xmm_backup];
-				}
 				return;
 			}
 		}
@@ -145,24 +140,21 @@ static __forceinline void memzero_ptr( void *dest )
 
 			__asm
 			{
-				movups _xmm_backup,xmm0;
 				mov ecx,dest
 				pxor xmm0,xmm0
 				mov eax,bytes128
 
-				align 16
-
 			_loop_8:
-				movaps [ecx],xmm0;
-				movaps [ecx+0x10],xmm0;
-				movaps [ecx+0x20],xmm0;
-				movaps [ecx+0x30],xmm0;
-				movaps [ecx+0x40],xmm0;
-				movaps [ecx+0x50],xmm0;
-				movaps [ecx+0x60],xmm0;
-				movaps [ecx+0x70],xmm0;
+				movaps [ecx],xmm0
+				movaps [ecx+0x10],xmm0
+				movaps [ecx+0x20],xmm0
+				movaps [ecx+0x30],xmm0
+				movaps [ecx+0x40],xmm0
+				movaps [ecx+0x50],xmm0
+				movaps [ecx+0x60],xmm0
+				movaps [ecx+0x70],xmm0
 				sub ecx,-128
-				dec eax;
+				sub eax,1
 				jnz _loop_8;
 			}
 			if( remainder != 0 )
@@ -173,18 +165,15 @@ static __forceinline void memzero_ptr( void *dest )
 					mov eax, remainder
 
 				_loop_10:
-					movaps [ecx+eax],xmm0;
+					movaps [ecx+eax],xmm0
 					sub eax,16;
 					jnz _loop_10;
 				}
 			}
-			__asm
-			{
-				movups xmm0,[_xmm_backup];
-			}
 			return;
 		}
 	}
+#endif
 
 	// This function only works on 32-bit alignments.
 	pxAssume( (MZFbytes & 0x3) == 0 );
@@ -271,8 +260,6 @@ static __forceinline void memset_8( void *dest )
 		return;
 	}
 
-	//u64 _xmm_backup[2];
-
 	/*static const size_t remainder = MZFbytes & 127;
 	static const size_t bytes128 = MZFbytes / 128;
 	if( bytes128 > 32 )
@@ -283,7 +270,6 @@ static __forceinline void memset_8( void *dest )
 
 		__asm
 		{
-			movups _xmm_backup,xmm0;
 			mov eax,bytes128
 			mov ecx,dest
 			movss xmm0,data
@@ -316,10 +302,6 @@ static __forceinline void memset_8( void *dest )
 				jnz _loop_10;
 			}
 		}
-		__asm
-		{
-			movups xmm0,[_xmm_backup];
-		}
 	}*/
 
 	// This function only works on 32-bit alignments of data copied.
diff --git a/pcsx2/ps2/GIFpath.cpp b/pcsx2/ps2/GIFpath.cpp
index 02b6551e4f..b311361c84 100644
--- a/pcsx2/ps2/GIFpath.cpp
+++ b/pcsx2/ps2/GIFpath.cpp
@@ -252,7 +252,7 @@ __forceinline void GIFPath::Reset()
 
 __forceinline bool GIFPath::StepReg()
 {
-	if ((++curreg & 0xf) == tag.NREG) {
+	if (++curreg >= numregs) {
 		curreg = 0;
 		if (--nloop == 0) {
 			return false;