diff --git a/pcsx2/IopCounters.cpp b/pcsx2/IopCounters.cpp
index 1152854a19..626f857503 100644
--- a/pcsx2/IopCounters.cpp
+++ b/pcsx2/IopCounters.cpp
@@ -357,13 +357,15 @@ static void psxCheckEndGate32(int i)
 	_psxCheckEndGate( i );
 }
 
-
+#include <windows.h>
 void psxVBlankStart()
 {
 	cdvdVsync();
 	psxHu32(0x1070) |= 1;
 	if(psxvblankgate & (1 << 1)) psxCheckStartGate16(1);
 	if(psxvblankgate & (1 << 3)) psxCheckStartGate32(3);
+	if (GetAsyncKeyState('P'))
+		Cpu->Reset();
 }
 
 void psxVBlankEnd()
diff --git a/pcsx2/vtlb.cpp b/pcsx2/vtlb.cpp
index f34df81bf4..810cbc5d41 100644
--- a/pcsx2/vtlb.cpp
+++ b/pcsx2/vtlb.cpp
@@ -61,6 +61,12 @@ vtlbHandler UnmappedVirtHandler1;
 vtlbHandler UnmappedPhyHandler0;
 vtlbHandler UnmappedPhyHandler1;
 
+#define VTLB_ALLOC_SIZE (0x2900000)	//this is a bit more than required
+
+u8* vtlb_alloc_base;		//base of the memory array
+u8* vtlb_alloc_current;		//current base
+u8 vtlb_alloc_bits[VTLB_ALLOC_SIZE/16/8];	//328 kb
+
 /*
 	__asm
 	{
@@ -91,6 +97,13 @@ callfunction:
 
 // Interpreter Implementations of VTLB Memory Operations.
 // See recVTLB.cpp for the dynarec versions.
 
+void memwritebits(u8* ptr)
+{
+	u32 offs=ptr-vtlb_alloc_base;
+	offs/=16;
+	vtlb_alloc_bits[offs/8]|=1<<(offs%8);
+}
+
 // Interpreted VTLB lookup for 8, 16, and 32 bit accesses
 template< u32 DataSize, typename DataType >
 __forceinline DataType __fastcall MemOp_r0(u32 addr)
@@ -116,7 +129,6 @@ __forceinline DataType __fastcall MemOp_r0(u32 addr)
 		jNO_DEFAULT;
 		}
 	}
-
 	// Interpreterd VTLB lookup for 64 and 128 bit accesses.
 template< u32 DataSize, typename DataType >
 __forceinline void __fastcall MemOp_r1(u32 addr, DataType* data)
@@ -155,6 +167,7 @@ __forceinline void __fastcall MemOp_w0(u32 addr, DataType data)
 	s32 ppf=addr+vmv;
 	if (!(ppf<0))
 	{
+		//memwritebits((u8*)ppf);
 		*reinterpret_cast<DataType*>(ppf)=data;
 	}
 	else
@@ -182,6 +195,7 @@ __forceinline void __fastcall MemOp_w1(u32 addr,const DataType* data)
 	s32 ppf=addr+vmv;
 	if (!(ppf<0))
 	{
+		//memwritebits((u8*)ppf);
 		*reinterpret_cast<DataType*>(ppf)=*data;
 		if (DataSize==128)
 			*reinterpret_cast<DataType*>(ppf+8)=data[1];
@@ -552,6 +566,13 @@ void vtlb_Term()
 	//nothing to do for now
 }
 
+
+void vtlb_alloc_mem()
+{
+	u32 size=VTLB_ALLOC_SIZE;
+	vtlb_alloc_base=SysMmapEx( 0, size, 0x80000000, "Vtlb");
+	vtlb_alloc_current=vtlb_alloc_base;
+}
 // This function allocates memory block with are compatible with the Vtlb's requirements
 // for memory locations. The Vtlb requires the topmost bit (Sign bit) of the memory
 // pointer to be cleared.  Some operating systems and/or implementations of malloc do that,
@@ -559,6 +580,17 @@ void vtlb_Term()
 // platform.
 u8* vtlb_malloc( uint size, uint align, uptr tryBaseAddress )
 {
+	if (!vtlb_alloc_base)
+		vtlb_alloc_mem();
+
+	u32 realign=((uptr)vtlb_alloc_current&(align-1));
+	if (realign)
+		vtlb_alloc_current+=align-realign;
+
+	u8* rv=vtlb_alloc_current;
+	vtlb_alloc_current+=size;
+	return rv;
+
 #ifdef __LINUX__
 	return SysMmapEx( tryBaseAddress, size, 0x80000000, "Vtlb" );
 #else
@@ -569,6 +601,7 @@ u8* vtlb_malloc( uint size, uint align, uptr tryBaseAddress )
 
 void vtlb_free( void* pmem, uint size )
 {
+	return;//whatever
 	if( pmem == NULL ) return;
 
 #ifdef __LINUX__
diff --git a/pcsx2/windows/WinMain.cpp b/pcsx2/windows/WinMain.cpp
index de6b3a953b..c74e88157f 100644
--- a/pcsx2/windows/WinMain.cpp
+++ b/pcsx2/windows/WinMain.cpp
@@ -202,10 +202,8 @@ void WinRun()
 
 	_doPluginOverride( "DEV9", g_Startup.dev9dll, Config.DEV9 );
 
-#ifndef _DEBUG
 	if( Config.Profiler )
 		ProfilerInit();
-#endif
 
 	InitCPUTicks();
 
@@ -800,7 +798,6 @@ LRESULT WINAPI MainWndProc(HWND hWnd, UINT msg, WPARAM wParam, LPARAM lParam)
 				SaveConfig();
 				break;
 
-#ifndef _DEBUG
 			case ID_PROFILER:
 				Config.Profiler = !Config.Profiler;
 				if( Config.Profiler )
@@ -815,7 +812,6 @@ LRESULT WINAPI MainWndProc(HWND hWnd, UINT msg, WPARAM wParam, LPARAM lParam)
 				}
 				SaveConfig();
 				break;
-#endif
 
 			default:
 				if (LOWORD(wParam) >= ID_LANGS && LOWORD(wParam) <= (ID_LANGS + langsMax))
@@ -989,9 +985,7 @@ void CreateMainMenu() {
 	ADDMENUITEM(0,_("Print cdvd &Info"), ID_CDVDPRINT);
 	ADDMENUITEM(0,_("Close GS Window on Esc"), ID_CLOSEGS);
 	ADDSEPARATOR(0);
-#ifndef _DEBUG
 	ADDMENUITEM(0,_("Enable &Profiler"), ID_PROFILER);
-#endif
 	ADDMENUITEM(0,_("Enable &Patches"), ID_PATCHES);
 	ADDMENUITEM(0,_("Enable &Console"), ID_CONSOLE);
 	ADDSEPARATOR(0);
diff --git a/pcsx2/windows/WinSysExec.cpp b/pcsx2/windows/WinSysExec.cpp
index ec62fe33d0..3c7ce85aff 100644
--- a/pcsx2/windows/WinSysExec.cpp
+++ b/pcsx2/windows/WinSysExec.cpp
@@ -49,14 +49,53 @@ int SysPageFaultExceptionFilter( EXCEPTION_POINTERS* eps )
 	}
 
 	// get bad virtual address
-	u32 offset =
-		(u8*)ExceptionRecord.ExceptionInformation[1]-psM;
+	uptr addr=ExceptionRecord.ExceptionInformation[1];
 
-	if (offset>=Ps2MemSize::Base)
-		return EXCEPTION_CONTINUE_SEARCH;
+	//this is a *hackfix* for a bug on x64 windows kernels.They do not give correct address
+	//if the error is a missaligned access (they return 0)
+	if (addr==0)
+	{
+		if (eps->ContextRecord->Ecx & 0x80000000)
+			addr=eps->ContextRecord->Ecx;
+	}
+
+	u32 offset = addr-(uptr)psM;
+
+	if (addr&0x80000000)
+	{
+		uptr _vtlb_HandleRewrite(uptr code);
+		u8* pcode=(u8*)ExceptionRecord.ExceptionAddress;
 
-	mmap_ClearCpuBlock( offset );
+		u32 patch_point=1;
+		//01 C1
+		while(pcode[-patch_point]!=0x81 || pcode[-patch_point-1]!=0xC1 || pcode[-patch_point-2]!=0x01)
+		{
+			patch_point++;
+		}
+		assert(pcode[-patch_point]==0x81);
+		pcode[-patch_point]=0xF;//js32, 0x81 is add32
+		pcode[-patch_point+1]=0x88;
 
-	return EXCEPTION_CONTINUE_EXECUTION;
+		//resume execution from correct point
+
+		eps->ContextRecord->Eax-=*(u32*)&pcode[-patch_point+2];
+
+		uptr codeloc=_vtlb_HandleRewrite(*(u32*)&pcode[-patch_point+2]);
+
+		eps->ContextRecord->Eip=codeloc;
+		*(u32*)&pcode[-patch_point+2]=codeloc-(u32)&pcode[-patch_point+6];
+
+		SysPrintf("memop patch for full mapping @ %08X : pp %d\n",pcode,patch_point);
+		return EXCEPTION_CONTINUE_EXECUTION;
+	}
+	else
+	{
+		if (offset>=Ps2MemSize::Base)
+			return EXCEPTION_CONTINUE_SEARCH;
+
+		mmap_ClearCpuBlock( offset );
+
+		return EXCEPTION_CONTINUE_EXECUTION;
+	}
 }
diff --git a/pcsx2/x86/ix86-32/recVTLB.cpp b/pcsx2/x86/ix86-32/recVTLB.cpp
index cbc567c68d..2554ab041f 100644
--- a/pcsx2/x86/ix86-32/recVTLB.cpp
+++ b/pcsx2/x86/ix86-32/recVTLB.cpp
@@ -24,23 +24,178 @@
 #include "iCore.h"
 #include "iR5900.h"
 
-using namespace vtlb_private;
+u8* execohax_pos=0;
+u8* execohax_start=0;
+u32 execohx_sz;
 
-// NOTICE: This function *destroys* EAX!!
-// Moves 128 bits of memory from the source register ptr to the dest register ptr.
-// (used as an equivalent to movaps, when a free XMM register is unavailable for some reason)
-void MOV128_MtoM( x86IntRegType destRm, x86IntRegType srcRm )
+u8* code_pos=0;
+u8* code_start=0;
+u32 code_sz;
+
+using namespace vtlb_private;
+#include <windows.h>
+
+void execuCode(bool set)
 {
-	MOV32RmtoR(EAX,srcRm);
-	MOV32RtoRm(destRm,EAX);
-	MOV32RmtoROffset(EAX,srcRm,4);
-	MOV32RtoRmOffset(destRm,EAX,4);
-	MOV32RmtoROffset(EAX,srcRm,8);
-	MOV32RtoRmOffset(destRm,EAX,8);
-	MOV32RmtoROffset(EAX,srcRm,12);
-	MOV32RtoRmOffset(destRm,EAX,12);
+	u32 used=code_pos-code_start;
+	u32 free=2*1024*1024-used;
+
+	if (code_pos == 0 || free<128)
+	{
+		SysPrintf("Leaking 2 megabytes of ram\n");
+		code_start=code_pos=(u8*)VirtualAlloc(0,2*1024*1024,MEM_COMMIT,PAGE_EXECUTE_READWRITE);
+		code_sz+=2*1024*1024;
+		// [NOTE(review): a span was destroyed by extraction between this
+		//  point and the first write8 of execohaxme below (the tail of
+		//  execuCode, the saved/mod state and the execohaxme signature).
+		//  The lines down to "u32* execohaxme" are a minimal reconstruction
+		//  consistent with the callers — verify against the original commit.]
+		int i=0;
+		while(i<2*1024*1024)
+			code_start[i++]=0xCC;
+	}
+}
+
+static u8* saved=0;
+static u8* mod=0;
+
+u32* execohaxme(bool set)
+{
+	if (set)
+	{
+		write8<_EmitterId_>( 0x81 );
+		ModRM<_EmitterId_>( 3, 0, EAX );
+		write32<_EmitterId_>( (uptr)execohax_pos );
+
+		saved=x86SetPtr(execohax_pos);
+		mod=execohax_pos;
+		write8<_EmitterId_>(0);	//size, in bytes
+		write32<_EmitterId_>(0);	//return address
+	}
+	else
+	{
+		//x86AlignExecutable(4);
+		//x86Align(64);
+		execohax_pos=x86SetPtr(mod);
+		write8<_EmitterId_>(execohax_pos-mod-5);
+		return (u32*)x86SetPtr(saved);
+	}
+
+	return 0;
+}
+
+uptr _vtlb_HandleRewrite(uptr block)
+{
+	u8 size=*(u8*)block;
+	u32 ra=*(u32*)(block+1);
+	u8* pcode=(u8*)(block+5);
+
+	execuCode(true);
+	uptr rv=(uptr)code_pos;
+
+	while(size--)
+	{
+		write8<_EmitterId_>(*pcode++);
+	}
+	JMP32(ra-(uptr)x86Ptr[_EmitterId_]-5);
+
+	execuCode(false);
+	//do magic
+	return rv;
+}
+
+PCSX2_ALIGNED16( static u64 g_globalXMMData[2*XMMREGS] );
+
+void MOVx_SSE( x86IntRegType destRm, x86IntRegType srcRm,u32 srcAddr=0,u32 dstAddr=0,bool half=false )
+{
+	int reg;
+	bool free_reg=false;
+	if( _hasFreeXMMreg() )
+	{
+		free_reg=true;
+		reg=_allocTempXMMreg( XMMT_INT, -1 );
+	}
+	else
+	{
+		SSE2_MOVDQA_XMM_to_M128((uptr)g_globalXMMData,XMM0);
+		reg=XMM0;
+	}
+
+	if (half)
+	{
+		if (srcAddr)
+			SSE_MOVLPS_M64_to_XMM(reg,srcAddr);
+		else
+			SSE_MOVLPS_RmOffset_to_XMM(reg,srcRm,0);
+
+		if (dstAddr)
+			SSE_MOVLPS_XMM_to_M64(dstAddr,reg);
+		else
+			SSE_MOVLPS_XMM_to_RmOffset(destRm,reg,0);
+	}
+	else
+	{
+		if (srcAddr)
+			SSE2_MOVDQA_M128_to_XMM(reg,srcAddr);
+		else
+			SSE2_MOVDQARmtoROffset(reg,srcRm,0);
+
+		if (dstAddr)
+			SSE2_MOVDQA_XMM_to_M128(dstAddr,reg);
+		else
+			SSE2_MOVDQARtoRmOffset(destRm,reg,0);
+	}
+
+
+	if (free_reg)
+		_freeXMMreg(reg);
+	else
+	{
+		SSE2_MOVDQA_M128_to_XMM(XMM0,(uptr)g_globalXMMData);
+	}
+}
+
+void MOV64_MMX( x86IntRegType destRm, x86IntRegType srcRm,u32 srcAddr=0,u32 dstAddr=0)
+{
+	//if free xmm && fpu state then we use the SSE version.
+	if( !(_hasFreeXMMreg() && (x86FpuState == FPU_STATE)) && _hasFreeMMXreg() )
+	{
+		const int freereg = _allocMMXreg(-1, MMX_TEMP, 0);
+		if (srcAddr)
+			MOVQMtoR(freereg,srcAddr);
+		else
+			MOVQRmtoROffset(freereg,srcRm,0);
+
+		if (dstAddr)
+			MOVQRtoM(dstAddr,freereg);
+		else
+			MOVQRtoRmOffset(destRm,freereg,0);
+
+		_freeMMXreg(freereg);
+	}
+	else
+	{
+		MOVx_SSE(destRm,srcRm,srcAddr,dstAddr,true);
+	}
 }
 
 /*
 	// Pseudo-Code For the following Dynarec Implementations -->
@@ -118,38 +273,11 @@ static void _vtlb_DynGen_DirectRead( u32 bits, bool sign )
 			break;
 
 		case 64:
-			if( _hasFreeMMXreg() )
-			{
-				const int freereg = _allocMMXreg(-1, MMX_TEMP, 0);
-				MOVQRmtoROffset(freereg,ECX,0);
-				MOVQRtoRmOffset(EDX,freereg,0);
-				_freeMMXreg(freereg);
-			}
-			else
-			{
-				MOV32RmtoR(EAX,ECX);
-				MOV32RtoRm(EDX,EAX);
-
-				MOV32RmtoROffset(EAX,ECX,4);
-				MOV32RtoRmOffset(EDX,EAX,4);
-			}
+			MOV64_MMX(EDX,ECX);
 			break;
 
 		case 128:
-			if( _hasFreeXMMreg() )
-			{
-				const int freereg = _allocTempXMMreg( XMMT_INT, -1 );
-				SSE2_MOVDQARmtoROffset(freereg,ECX,0);
-				SSE2_MOVDQARtoRmOffset(EDX,freereg,0);
-				_freeXMMreg(freereg);
-			}
-			else
-			{
-				// Could put in an MMX optimization here as well, but no point really.
-				// It's almost never used since there's almost always a free XMM reg.
-
-				MOV128_MtoM( EDX, ECX );	// dest <- src!
-			}
+			MOVx_SSE(EDX,ECX);
 			break;
 
 		jNO_DEFAULT
@@ -189,15 +317,16 @@ void vtlb_DynGenRead64(u32 bits)
 	SHR32ItoR(EAX,VTLB_PAGE_BITS);
 	MOV32RmSOffsettoR(EAX,EAX,(int)vtlbdata.vmap,2);
 	ADD32RtoR(ECX,EAX);
-	u8* _fullread = JS8(0);
+	//u8* _direct = JMP8(0);
+	execohaxme(true);
 
-	_vtlb_DynGen_DirectRead( bits, false );
-	u8* cont = JMP8(0);
-
-	x86SetJ8(_fullread);
 	_vtlb_DynGen_IndirectRead( bits );
-
-	x86SetJ8(cont);
+
+	u32* patch=execohaxme(false);
+
+	_vtlb_DynGen_DirectRead( bits, false );
+
+	*patch=(uptr)x86Ptr[_EmitterId_];
 }
 
 // Recompiled input registers:
@@ -211,12 +340,9 @@ void vtlb_DynGenRead32(u32 bits, bool sign)
 	SHR32ItoR(EAX,VTLB_PAGE_BITS);
 	MOV32RmSOffsettoR(EAX,EAX,(int)vtlbdata.vmap,2);
 	ADD32RtoR(ECX,EAX);
-	u8* _fullread = JS8(0);
+	//u8* _direct = JMP8(0);
+	execohaxme(true);
 
-	_vtlb_DynGen_DirectRead( bits, sign );
-	u8* cont = JMP8(0);
-
-	x86SetJ8(_fullread);
 	_vtlb_DynGen_IndirectRead( bits );
 
 	// perform sign extension on the result:
@@ -236,7 +362,11 @@ void vtlb_DynGenRead32(u32 bits, bool sign)
 			MOVZX32R16toR(EAX,EAX);
 	}
 
-	x86SetJ8(cont);
+	u32* patch=execohaxme(false);
+
+	_vtlb_DynGen_DirectRead( bits, sign );
+
+	*patch=(uptr)x86Ptr[_EmitterId_];
 }
 
 //
@@ -251,39 +381,11 @@ void vtlb_DynGenRead64_Const( u32 bits, u32 addr_const )
 		switch( bits )
 		{
 			case 64:
-				if( _hasFreeMMXreg() )
-				{
-					const int freereg = _allocMMXreg(-1, MMX_TEMP, 0);
-					MOVQMtoR(freereg,ppf);
-					MOVQRtoRmOffset(EDX,freereg,0);
-					_freeMMXreg(freereg);
-				}
-				else
-				{
-					MOV32MtoR(EAX,ppf);
-					MOV32RtoRm(EDX,EAX);
-
-					MOV32MtoR(EAX,ppf+4);
-					MOV32RtoRmOffset(EDX,EAX,4);
-				}
+				MOV64_MMX( EDX, ECX,ppf );	// dest <- src!
 			break;
 
 			case 128:
-				if( _hasFreeXMMreg() )
-				{
-					const int freereg = _allocTempXMMreg( XMMT_INT, -1 );
-					SSE2_MOVDQA_M128_to_XMM( freereg, ppf );
-					SSE2_MOVDQARtoRmOffset(EDX,freereg,0);
-					_freeXMMreg(freereg);
-				}
-				else
-				{
-					// Could put in an MMX optimization here as well, but no point really.
-					// It's almost never used since there's almost always a free XMM reg.
-
-					MOV32ItoR( ECX, ppf );
-					MOV128_MtoM( EDX, ECX );	// dest <- src!
-				}
+				MOVx_SSE( EDX, ECX,ppf );	// dest <- src!
 			break;
 
 			jNO_DEFAULT
@@ -403,40 +505,16 @@ static void _vtlb_DynGen_DirectWrite( u32 bits )
 			break;
 
 		case 64:
-			if( _hasFreeMMXreg() )
-			{
-				const int freereg = _allocMMXreg(-1, MMX_TEMP, 0);
-				MOVQRmtoROffset(freereg,EDX,0);
-				MOVQRtoRmOffset(ECX,freereg,0);
-				_freeMMXreg( freereg );
-			}
-			else
-			{
-				MOV32RmtoR(EAX,EDX);
-				MOV32RtoRm(ECX,EAX);
-
-				MOV32RmtoROffset(EAX,EDX,4);
-				MOV32RtoRmOffset(ECX,EAX,4);
-			}
+			MOV64_MMX( ECX, EDX );
 			break;
 
 		case 128:
-			if( _hasFreeXMMreg() )
-			{
-				const int freereg = _allocTempXMMreg( XMMT_INT, -1 );
-				SSE2_MOVDQARmtoROffset(freereg,EDX,0);
-				SSE2_MOVDQARtoRmOffset(ECX,freereg,0);
-				_freeXMMreg( freereg );
-			}
-			else
-			{
-				// Could put in an MMX optimization here as well, but no point really.
-				// It's almost never used since there's almost always a free XMM reg.
-
-				MOV128_MtoM( ECX, EDX );	// dest <- src!
-			}
+			MOVx_SSE( ECX, EDX );
 			break;
 	}
+
+//	SHR32ItoR(ECX,4);// do /16
+//	BTS_wtf(asdasd,ECX);
 }
 
 static void _vtlb_DynGen_IndirectWrite( u32 bits )
@@ -464,15 +542,17 @@ void vtlb_DynGenWrite(u32 sz)
 	SHR32ItoR(EAX,VTLB_PAGE_BITS);
 	MOV32RmSOffsettoR(EAX,EAX,(int)vtlbdata.vmap,2);
 	ADD32RtoR(ECX,EAX);
-	u8* _full=JS8(0);
-	_vtlb_DynGen_DirectWrite( sz );
-	u8* cont = JMP8(0);
+	//u8* _direct=JMP8(0);
 
-	x86SetJ8(_full);
+	execohaxme(true);
+
 	_vtlb_DynGen_IndirectWrite( sz );
-
-	x86SetJ8(cont);
+
+	u32* patch=execohaxme(false);
+	_vtlb_DynGen_DirectWrite( sz );
+
+	*patch=(uptr)x86Ptr[_EmitterId_];
 }
 
 
@@ -499,39 +579,11 @@ void vtlb_DynGenWrite_Const( u32 bits, u32 addr_const )
 			break;
 
 			case 64:
-				if( _hasFreeMMXreg() )
-				{
-					const int freereg = _allocMMXreg(-1, MMX_TEMP, 0);
-					MOVQRmtoROffset(freereg,EDX,0);
-					MOVQRtoM(ppf,freereg);
-					_freeMMXreg( freereg );
-				}
-				else
-				{
-					MOV32RmtoR(EAX,EDX);
-					MOV32RtoM(ppf,EAX);
-
-					MOV32RmtoROffset(EAX,EDX,4);
-					MOV32RtoM(ppf+4,EAX);
-				}
+				MOV64_MMX( ECX, EDX,0,ppf);	// dest <- src!
 			break;
 
 			case 128:
-				if( _hasFreeXMMreg() )
-				{
-					const int freereg = _allocTempXMMreg( XMMT_INT, -1 );
-					SSE2_MOVDQARmtoROffset(freereg,EDX,0);
-					SSE2_MOVDQA_XMM_to_M128(ppf,freereg);
-					_freeXMMreg( freereg );
-				}
-				else
-				{
-					// Could put in an MMX optimization here as well, but no point really.
-					// It's almost never used since there's almost always a free XMM reg.
-
-					MOV32ItoR( ECX, ppf );
-					MOV128_MtoM( ECX, EDX );	// dest <- src!
-				}
+				MOVx_SSE( ECX, EDX,0,ppf);	// dest <- src!
 			break;
 		}
diff --git a/pcsx2/x86/ix86/ix86.h b/pcsx2/x86/ix86/ix86.h
index 797dc2eaf7..51a65d712a 100644
--- a/pcsx2/x86/ix86/ix86.h
+++ b/pcsx2/x86/ix86/ix86.h
@@ -79,7 +79,7 @@ emitterT void write64( u64 val ){
 //------------------------------------------------------------------
 // jump/align functions
 //------------------------------------------------------------------
-emitterT void ex86SetPtr( u8 *ptr );
+emitterT u8* ex86SetPtr( u8 *ptr );
 emitterT void ex86SetJ8( u8 *j8 );
 emitterT void ex86SetJ8A( u8 *j8 );
 emitterT void ex86SetJ16( u16 *j16 );
diff --git a/pcsx2/x86/ix86/ix86.inl b/pcsx2/x86/ix86/ix86.inl
index ae4f5829af..7394bd6a3a 100644
--- a/pcsx2/x86/ix86/ix86.inl
+++ b/pcsx2/x86/ix86/ix86.inl
@@ -159,9 +159,12 @@ emitterT void CMOV32MtoR( int cc, int to, uptr from )
 }
 
 ////////////////////////////////////////////////////
-emitterT void ex86SetPtr( u8* ptr )
+emitterT u8* ex86SetPtr( u8* ptr )
 {
-	x86Ptr[I] = ptr;
+	u8* rv= x86Ptr[I];
+	if (ptr!=0)
+		x86Ptr[I] = ptr;
+	return rv;
 }
 
 ////////////////////////////////////////////////////