Turned the x86 emitter into a class, so the code pointer is no longer a global, yay! Created XCodeBlock, which derives from XEmitter, and the Jit now derives from XCodeBlock so it can call ADD, SUB, JNZ, etc. without having to prefix them with "emit.". I think someone's gonna like this.

There's some cleanup still to be done, but hey, it works. There shouldn't be a noticeable speed difference.

I hope GCC doesn't have a problem with the "member function pointers" I used.

git-svn-id: https://dolphin-emu.googlecode.com/svn/trunk@1594 8ced0084-cf51-0410-be5f-012b33b47a6e
hrydgard 2008-12-19 21:24:52 +00:00
parent b5dcdcf779
commit 104acd5bc1
31 changed files with 1297 additions and 1153 deletions
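
A minimal, hypothetical sketch of the pattern the message describes (ExampleBlock, EmitBinaryOp and the constants are invented; it assumes the XEmitter/XCodeBlock declarations from x64Emitter.h further down in this diff):

#include "x64Emitter.h"

// A code generator that inherits the emitter: ADD, SUB, MOV etc. can be called
// directly, and the code pointer lives in the object instead of a global.
class ExampleBlock : public Gen::XCodeBlock
{
public:
    // 'op' is a pointer to an XEmitter member, e.g. &Gen::XEmitter::ADD or &Gen::XEmitter::SUB.
    void EmitBinaryOp(void (Gen::XEmitter::*op)(int, const Gen::OpArg &, const Gen::OpArg &))
    {
        using namespace Gen;
        MOV(32, R(EAX), Imm32(1));           // no "emit." prefix needed
        (this->*op)(32, R(EAX), Imm32(2));   // dispatch through the member function pointer
    }
};

// Usage sketch:
//   ExampleBlock block;
//   block.AllocCodeSpace(4096);   // from XCodeBlock - sets up the code pointer
//   block.EmitBinaryOp(&Gen::XEmitter::ADD);
//   block.EmitBinaryOp(&Gen::XEmitter::SUB);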

View File

@ -25,7 +25,7 @@ using namespace Gen;
// ====================================
// Sets up a __cdecl function.
void ABI_EmitPrologue(int maxCallParams)
void XEmitter::ABI_EmitPrologue(int maxCallParams)
{
#ifdef _M_IX86
// Don't really need to do anything
@ -40,7 +40,8 @@ void ABI_EmitPrologue(int maxCallParams)
#error Arch not supported
#endif
}
void ABI_EmitEpilogue(int maxCallParams)
void XEmitter::ABI_EmitEpilogue(int maxCallParams)
{
#ifdef _M_IX86
RET();
@ -60,14 +61,14 @@ void ABI_EmitEpilogue(int maxCallParams)
// Shared code between Win32 and Unix32
// ====================================
void ABI_CallFunctionC(void *func, u32 param1) {
void XEmitter::ABI_CallFunctionC(void *func, u32 param1) {
ABI_AlignStack(1 * 4);
PUSH(32, Imm32(param1));
CALL(func);
ABI_RestoreStack(1 * 4);
}
void ABI_CallFunctionCC(void *func, u32 param1, u32 param2) {
void XEmitter::ABI_CallFunctionCC(void *func, u32 param1, u32 param2) {
ABI_AlignStack(2 * 4);
PUSH(32, Imm32(param2));
PUSH(32, Imm32(param1));
@ -76,14 +77,14 @@ void ABI_CallFunctionCC(void *func, u32 param1, u32 param2) {
}
// Pass a register as a parameter.
void ABI_CallFunctionR(void *func, X64Reg reg1) {
void XEmitter::ABI_CallFunctionR(void *func, X64Reg reg1) {
ABI_AlignStack(1 * 4);
PUSH(32, R(reg1));
CALL(func);
ABI_RestoreStack(1 * 4);
}
void ABI_CallFunctionRR(void *func, Gen::X64Reg reg1, Gen::X64Reg reg2)
void XEmitter::ABI_CallFunctionRR(void *func, Gen::X64Reg reg1, Gen::X64Reg reg2)
{
ABI_AlignStack(2 * 4);
PUSH(32, R(reg2));
@ -92,7 +93,7 @@ void ABI_CallFunctionRR(void *func, Gen::X64Reg reg1, Gen::X64Reg reg2)
ABI_RestoreStack(2 * 4);
}
void ABI_CallFunctionAC(void *func, const Gen::OpArg &arg1, u32 param2)
void XEmitter::ABI_CallFunctionAC(void *func, const Gen::OpArg &arg1, u32 param2)
{
ABI_AlignStack(2 * 4);
PUSH(32, arg1);
@ -101,7 +102,7 @@ void ABI_CallFunctionAC(void *func, const Gen::OpArg &arg1, u32 param2)
ABI_RestoreStack(2 * 4);
}
void ABI_PushAllCalleeSavedRegsAndAdjustStack() {
void XEmitter::ABI_PushAllCalleeSavedRegsAndAdjustStack() {
// Note: 4 * 4 = 16 bytes, so alignment is preserved.
PUSH(EBP);
PUSH(EBX);
@ -109,14 +110,14 @@ void ABI_PushAllCalleeSavedRegsAndAdjustStack() {
PUSH(EDI);
}
void ABI_PopAllCalleeSavedRegsAndAdjustStack() {
void XEmitter::ABI_PopAllCalleeSavedRegsAndAdjustStack() {
POP(EDI);
POP(ESI);
POP(EBX);
POP(EBP);
}
unsigned int ABI_GetAlignedFrameSize(unsigned int frameSize) {
unsigned int XEmitter::ABI_GetAlignedFrameSize(unsigned int frameSize) {
frameSize += 4; // reserve space for return address
unsigned int alignedSize =
#ifdef __GNUC__
@ -128,7 +129,7 @@ unsigned int ABI_GetAlignedFrameSize(unsigned int frameSize) {
}
void ABI_AlignStack(unsigned int frameSize) {
void XEmitter::ABI_AlignStack(unsigned int frameSize) {
// Mac OS X requires the stack to be 16-byte aligned before every call.
// Linux requires the stack to be 16-byte aligned before calls that put SSE
// vectors on the stack, but since we do not keep track of which calls do that,
@ -145,7 +146,7 @@ void ABI_AlignStack(unsigned int frameSize) {
#endif
}
void ABI_RestoreStack(unsigned int frameSize) {
void XEmitter::ABI_RestoreStack(unsigned int frameSize) {
unsigned int alignedSize = ABI_GetAlignedFrameSize(frameSize);
alignedSize -= 4; // return address is POPped at end of call
if (alignedSize != 0) {
@ -155,26 +156,26 @@ void ABI_RestoreStack(unsigned int frameSize) {
#else
void ABI_CallFunctionC(void *func, u32 param1) {
void XEmitter::ABI_CallFunctionC(void *func, u32 param1) {
MOV(32, R(ABI_PARAM1), Imm32(param1));
CALL(func);
}
void ABI_CallFunctionCC(void *func, u32 param1, u32 param2) {
void XEmitter::ABI_CallFunctionCC(void *func, u32 param1, u32 param2) {
MOV(32, R(ABI_PARAM1), Imm32(param1));
MOV(32, R(ABI_PARAM2), Imm32(param2));
CALL(func);
}
// Pass a register as a parameter.
void ABI_CallFunctionR(void *func, X64Reg reg1) {
void XEmitter::ABI_CallFunctionR(void *func, X64Reg reg1) {
if (reg1 != ABI_PARAM1)
MOV(32, R(ABI_PARAM1), R(reg1));
CALL(func);
}
// Pass a register as a parameter.
void ABI_CallFunctionRR(void *func, X64Reg reg1, X64Reg reg2) {
void XEmitter::ABI_CallFunctionRR(void *func, X64Reg reg1, X64Reg reg2) {
if (reg1 != ABI_PARAM1)
MOV(32, R(ABI_PARAM1), R(reg1));
if (reg2 != ABI_PARAM2)
@ -182,7 +183,7 @@ void ABI_CallFunctionRR(void *func, X64Reg reg1, X64Reg reg2) {
CALL(func);
}
void ABI_CallFunctionAC(void *func, const Gen::OpArg &arg1, u32 param2)
void XEmitter::ABI_CallFunctionAC(void *func, const Gen::OpArg &arg1, u32 param2)
{
if (!arg1.IsSimpleReg(ABI_PARAM1))
MOV(32, R(ABI_PARAM1), arg1);
@ -190,21 +191,21 @@ void ABI_CallFunctionAC(void *func, const Gen::OpArg &arg1, u32 param2)
CALL(func);
}
unsigned int ABI_GetAlignedFrameSize(unsigned int frameSize) {
unsigned int XEmitter::ABI_GetAlignedFrameSize(unsigned int frameSize) {
return frameSize;
}
void ABI_AlignStack(unsigned int /*frameSize*/) {
void XEmitter::ABI_AlignStack(unsigned int /*frameSize*/) {
}
void ABI_RestoreStack(unsigned int /*frameSize*/) {
void XEmitter::ABI_RestoreStack(unsigned int /*frameSize*/) {
}
#ifdef _WIN32
// Win64 Specific Code
// ====================================
void ABI_PushAllCalleeSavedRegsAndAdjustStack() {
void XEmitter::ABI_PushAllCalleeSavedRegsAndAdjustStack() {
//we only want to do this once
PUSH(RBX);
PUSH(RSI);
@ -218,7 +219,7 @@ void ABI_PushAllCalleeSavedRegsAndAdjustStack() {
SUB(64, R(RSP), Imm8(0x28));
}
void ABI_PopAllCalleeSavedRegsAndAdjustStack() {
void XEmitter::ABI_PopAllCalleeSavedRegsAndAdjustStack() {
ADD(64, R(RSP), Imm8(0x28));
POP(R15);
POP(R14);
@ -232,7 +233,7 @@ void ABI_PopAllCalleeSavedRegsAndAdjustStack() {
// Win64 Specific Code
// ====================================
void ABI_PushAllCallerSavedRegsAndAdjustStack() {
void XEmitter::ABI_PushAllCallerSavedRegsAndAdjustStack() {
PUSH(RCX);
PUSH(RDX);
PUSH(RSI);
@ -245,7 +246,7 @@ void ABI_PushAllCallerSavedRegsAndAdjustStack() {
SUB(64, R(RSP), Imm8(0x28));
}
void ABI_PopAllCallerSavedRegsAndAdjustStack() {
void XEmitter::ABI_PopAllCallerSavedRegsAndAdjustStack() {
ADD(64, R(RSP), Imm8(0x28));
POP(R11);
POP(R10);
@ -260,7 +261,7 @@ void ABI_PopAllCallerSavedRegsAndAdjustStack() {
#else
// Unix64 Specific Code
// ====================================
void ABI_PushAllCalleeSavedRegsAndAdjustStack() {
void XEmitter::ABI_PushAllCalleeSavedRegsAndAdjustStack() {
PUSH(RBX);
PUSH(RBP);
PUSH(R12);
@ -270,7 +271,7 @@ void ABI_PushAllCalleeSavedRegsAndAdjustStack() {
PUSH(R15); //just to align stack. duped push/pop doesn't hurt.
}
void ABI_PopAllCalleeSavedRegsAndAdjustStack() {
void XEmitter::ABI_PopAllCalleeSavedRegsAndAdjustStack() {
POP(R15);
POP(R15);
POP(R14);
@ -280,7 +281,7 @@ void ABI_PopAllCalleeSavedRegsAndAdjustStack() {
POP(RBX);
}
void ABI_PushAllCallerSavedRegsAndAdjustStack() {
void XEmitter::ABI_PushAllCallerSavedRegsAndAdjustStack() {
PUSH(RCX);
PUSH(RDX);
PUSH(RSI);
@ -292,7 +293,7 @@ void ABI_PushAllCallerSavedRegsAndAdjustStack() {
PUSH(R11);
}
void ABI_PopAllCallerSavedRegsAndAdjustStack() {
void XEmitter::ABI_PopAllCallerSavedRegsAndAdjustStack() {
POP(R11);
POP(R11);
POP(R10);

View File

@ -18,8 +18,6 @@
#ifndef _JIT_ABI_H
#define _JIT_ABI_H
#include "x64Emitter.h"
// x86/x64 ABIs, and helpers for following them when JITing code.
// All conventions return values in EAX (+ possibly EDX).
@ -81,42 +79,5 @@
#endif
// Utility functions
// These only support u32 parameters, but that's enough for a lot of uses.
// These will destroy the first 1 or 2 "parameter regs".
void ABI_CallFunctionC(void *func, u32 param1);
void ABI_CallFunctionCC(void *func, u32 param1, u32 param2);
void ABI_CallFunctionAC(void *func, const Gen::OpArg &arg1, u32 param2);
// Pass a register as a parameter.
void ABI_CallFunctionR(void *func, Gen::X64Reg reg1);
void ABI_CallFunctionRR(void *func, Gen::X64Reg reg1, Gen::X64Reg reg2);
// A function that doesn't have any control over what it will do to regs,
// such as the dispatcher, should be surrounded by these.
void ABI_PushAllCalleeSavedRegsAndAdjustStack();
void ABI_PopAllCalleeSavedRegsAndAdjustStack();
// A function that doesn't know anything about its surroundings should
// be surrounded by these to establish a safe environment where it can roam free.
// An example is a backpatch injected function.
void ABI_PushAllCallerSavedRegsAndAdjustStack();
void ABI_PopAllCallerSavedRegsAndAdjustStack();
unsigned int ABI_GetAlignedFrameSize(unsigned int frameSize);
void ABI_AlignStack(unsigned int frameSize);
void ABI_RestoreStack(unsigned int frameSize);
// Sets up a __cdecl function.
// Only x64 really needs the parameter.
void ABI_EmitPrologue(int maxCallParams);
void ABI_EmitEpilogue(int maxCallParams);
#ifdef _M_IX86
inline int ABI_GetNumXMMRegs() { return 8; }
#else
inline int ABI_GetNumXMMRegs() { return 16; }
#endif
#endif // _JIT_ABI_H
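
A minimal usage sketch of these helpers now that they are emitter members; MyLogger and LogBlock are hypothetical names, assuming the XEmitter/XCodeBlock declarations from x64Emitter.h in this commit:

#include "x64Emitter.h"

static void MyLogger(u32 value) {}   // ordinary C/C++ function we want generated code to call

class LogBlock : public Gen::XCodeBlock
{
public:
    void EmitLogCall()
    {
        // Emits the argument setup, the CALL and the stack cleanup for the current ABI:
        // on x86 this pushes the immediate and restores the stack afterwards,
        // on x64 it just loads ABI_PARAM1 and CALLs.
        ABI_CallFunctionC((void *)&MyLogger, 0x12345678);
    }
};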

View File

@ -38,7 +38,7 @@
// This is purposely not a full wrapper for VirtualAlloc/mmap, but it
// provides exactly the primitive operations that Dolphin needs.
void* AllocateExecutableMemory(int size, bool low)
void* AllocateExecutableMemory(size_t size, bool low)
{
#ifdef _WIN32
void* ptr = VirtualAlloc(0, size, MEM_COMMIT, PAGE_EXECUTE_READWRITE);
@ -71,7 +71,7 @@ void* AllocateExecutableMemory(int size, bool low)
}
void* AllocateMemoryPages(int size)
void* AllocateMemoryPages(size_t size)
{
#ifdef _WIN32
void* ptr = VirtualAlloc(0, size, MEM_COMMIT, PAGE_READWRITE);
@ -99,7 +99,7 @@ void* AllocateMemoryPages(int size)
}
void FreeMemoryPages(void* ptr, int size)
void FreeMemoryPages(void* ptr, size_t size)
{
#ifdef _WIN32
if (ptr)
@ -113,7 +113,7 @@ void FreeMemoryPages(void* ptr, int size)
}
void WriteProtectMemory(void* ptr, int size, bool allowExecute)
void WriteProtectMemory(void* ptr, size_t size, bool allowExecute)
{
#ifdef _WIN32
VirtualProtect(ptr, size, allowExecute ? PAGE_EXECUTE_READ : PAGE_READONLY, 0);
@ -123,7 +123,7 @@ void WriteProtectMemory(void* ptr, int size, bool allowExecute)
}
void UnWriteProtectMemory(void* ptr, int size, bool allowExecute)
void UnWriteProtectMemory(void* ptr, size_t size, bool allowExecute)
{
#ifdef _WIN32
VirtualProtect(ptr, size, allowExecute ? PAGE_EXECUTE_READWRITE : PAGE_READONLY, 0);

View File

@ -18,14 +18,14 @@
#ifndef _MEMORYUTIL_H
#define _MEMORYUTIL_H
void* AllocateExecutableMemory(int size, bool low = true);
void* AllocateMemoryPages(int size);
void FreeMemoryPages(void* ptr, int size);
void WriteProtectMemory(void* ptr, int size, bool executable = false);
void UnWriteProtectMemory(void* ptr, int size, bool allowExecute);
void* AllocateExecutableMemory(size_t size, bool low = true);
void* AllocateMemoryPages(size_t size);
void FreeMemoryPages(void* ptr, size_t size);
void WriteProtectMemory(void* ptr, size_t size, bool executable = false);
void UnWriteProtectMemory(void* ptr, size_t size, bool allowExecute);
inline int GetPageSize() {return(4096);}
inline int GetPageSize() {return 4096;}
#endif

View File

@ -18,33 +18,29 @@
#include <map>
#include "Common.h"
#include "Thunk.h"
#include "x64Emitter.h"
#include "MemoryUtil.h"
#include "ABI.h"
#include "Thunk.h"
using namespace Gen;
ThunkManager thunks;
#define THUNK_ARENA_SIZE 1024*1024*1
namespace {
static std::map<void *, const u8 *> thunks;
u8 GC_ALIGNED32(saved_fp_state[16 * 4 * 4]);
u8 GC_ALIGNED32(saved_gpr_state[16 * 8]);
static u8 *thunk_memory;
static u8 *thunk_code;
static const u8 *save_regs;
static const u8 *load_regs;
static u16 saved_mxcsr;
}
void Thunk_Init()
namespace
{
thunk_memory = (u8 *)AllocateExecutableMemory(THUNK_ARENA_SIZE);
thunk_code = thunk_memory;
GenContext ctx(&thunk_code);
static u8 GC_ALIGNED32(saved_fp_state[16 * 4 * 4]);
static u8 GC_ALIGNED32(saved_gpr_state[16 * 8]);
static u16 saved_mxcsr;
} // namespace
using namespace Gen;
void ThunkManager::Init()
{
AllocCodeSpace(THUNK_ARENA_SIZE);
save_regs = GetCodePtr();
for (int i = 2; i < ABI_GetNumXMMRegs(); i++)
MOVAPS(M(saved_fp_state + i * 16), (X64Reg)(XMM0 + i));
@ -89,31 +85,27 @@ void Thunk_Init()
RET();
}
void Thunk_Reset()
void ThunkManager::Reset()
{
thunks.clear();
thunk_code = thunk_memory;
ResetCodePtr();
}
void Thunk_Shutdown()
void ThunkManager::Shutdown()
{
Thunk_Reset();
FreeMemoryPages(thunk_memory, THUNK_ARENA_SIZE);
thunk_memory = 0;
thunk_code = 0;
Reset();
FreeCodeSpace();
}
void *ProtectFunction(void *function, int num_params)
void *ThunkManager::ProtectFunction(void *function, int num_params)
{
std::map<void *, const u8 *>::iterator iter;
iter = thunks.find(function);
if (iter != thunks.end())
return (void *)iter->second;
if (!thunk_memory)
if (!region)
PanicAlert("Trying to protect functions before the emu is started. Bad bad bad.");
GenContext gen(&thunk_code);
const u8 *call_point = GetCodePtr();
// Make sure to align stack.
#ifdef _M_X64

View File

@ -18,6 +18,11 @@
#ifndef _THUNK_H
#define _THUNK_H
#include <map>
#include "Common.h"
#include "x64Emitter.h"
// This simple class creates a wrapper around a C/C++ function that saves all fp state
// before entering it, and restores it upon exit. This is required to be able to selectively
// call functions from generated code, without inflicting the performance hit and increase
@ -30,10 +35,21 @@
// NOT THREAD SAFE. This may only be used from the CPU thread.
// Any other thread using this stuff will be FATAL.
void Thunk_Init();
void Thunk_Reset();
void Thunk_Shutdown();
class ThunkManager : public Gen::XCodeBlock
{
std::map<void *, const u8 *> thunks;
void *ProtectFunction(void *function, int num_params);
const u8 *save_regs;
const u8 *load_regs;
public:
void Init();
void Reset();
void Shutdown();
void *ProtectFunction(void *function, int num_params);
};
extern ThunkManager thunks;
#endif
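
For context, a minimal usage sketch: ExampleJitBlock and SomeHardwareAccess are made up, but the CALL(thunks.ProtectFunction(...)) pattern is how the Jit uses this class elsewhere in this commit.

#include "Thunk.h"

static void SomeHardwareAccess(u32 address) {}   // stand-in for e.g. a hardware I/O handler

class ExampleJitBlock : public Gen::XCodeBlock   // hypothetical
{
public:
    void EmitSafeCall()
    {
        // ProtectFunction returns a small generated wrapper that saves and restores
        // FP/SSE state around the real function, so JIT code can CALL it freely.
        CALL(thunks.ProtectFunction((void *)&SomeHardwareAccess, 1));
    }
};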

File diff suppressed because it is too large.

View File

@ -21,217 +21,264 @@
#define _DOLPHIN_INTEL_CODEGEN
#include "Common.h"
#include "MemoryUtil.h"
namespace Gen
{
enum X64Reg
enum X64Reg
{
EAX = 0, EBX = 3, ECX = 1, EDX = 2,
ESI = 6, EDI = 7, EBP = 5, ESP = 4,
RAX = 0, RBX = 3, RCX = 1, RDX = 2,
RSI = 6, RDI = 7, RBP = 5, RSP = 4,
R8 = 8, R9 = 9, R10 = 10,R11 = 11,
R12 = 12,R13 = 13,R14 = 14,R15 = 15,
AL = 0, BL = 3, CL = 1, DL = 2,
AH = 4, BH = 7, CH = 5, DH = 6,
AX = 0, BX = 3, CX = 1, DX = 2,
SI = 6, DI = 7, BP = 5, SP = 4,
XMM0=0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7,
XMM8, XMM9, XMM10, XMM11, XMM12, XMM13, XMM14, XMM15,
INVALID_REG = 0xFFFFFFFF
};
enum CCFlags
{
CC_O = 0,
CC_NO = 1,
CC_B = 2, CC_C = 2, CC_NAE = 2,
CC_NB = 3, CC_NC = 3, CC_AE = 3,
CC_Z = 4, CC_E = 4,
CC_NZ = 5, CC_NE = 5,
CC_BE = 6, CC_NA = 6,
CC_NBE = 7, CC_A = 7,
CC_S = 8,
CC_NS = 9,
CC_P = 0xA, CC_PE = 0xA,
CC_NP = 0xB, CC_PO = 0xB,
CC_L = 0xC, CC_NGE = 0xC,
CC_NL = 0xD, CC_GE = 0xD,
CC_LE = 0xE, CC_NG = 0xE,
CC_NLE = 0xF, CC_G = 0xF
};
enum
{
NUMGPRs = 16,
NUMXMMs = 16,
};
enum
{
SCALE_NONE = 0,
SCALE_1 = 1,
SCALE_2 = 2,
SCALE_4 = 4,
SCALE_8 = 8,
SCALE_ATREG = 16,
SCALE_RIP = 0xFF,
SCALE_IMM8 = 0xF0,
SCALE_IMM16 = 0xF1,
SCALE_IMM32 = 0xF2,
SCALE_IMM64 = 0xF3,
};
enum NormalOp {
nrmADD,
nrmADC,
nrmSUB,
nrmSBB,
nrmAND,
nrmOR ,
nrmXOR,
nrmMOV,
nrmTEST,
nrmCMP,
nrmXCHG,
};
class XEmitter;
// RIP addressing does not benefit from micro op fusion on Core arch
struct OpArg
{
OpArg() {} // dummy op arg, used for storage
OpArg(u64 _offset, int _scale, X64Reg rmReg = RAX, X64Reg scaledReg = RAX)
{
EAX = 0, EBX = 3, ECX = 1, EDX = 2,
ESI = 6, EDI = 7, EBP = 5, ESP = 4,
RAX = 0, RBX = 3, RCX = 1, RDX = 2,
RSI = 6, RDI = 7, RBP = 5, RSP = 4,
R8 = 8, R9 = 9, R10 = 10,R11 = 11,
R12 = 12,R13 = 13,R14 = 14,R15 = 15,
operandReg = 0;
scale = (u8)_scale;
offsetOrBaseReg = (u8)rmReg;
indexReg = (u8)scaledReg;
//if scale == 0 never mind offseting
offset = _offset;
}
void WriteRex(XEmitter *emit, bool op64, int customOp = -1) const;
void WriteRest(XEmitter *emit, int extraBytes=0, X64Reg operandReg=(X64Reg)0xFF) const;
void WriteSingleByteOp(XEmitter *emit, u8 op, X64Reg operandReg, int bits);
// This one is public - must be written to
u64 offset; // use RIP-relative as much as possible - 64-bit immediates are not available.
u8 operandReg;
AL = 0, BL = 3, CL = 1, DL = 2,
AH = 4, BH = 7, CH = 5, DH = 6,
void WriteNormalOp(XEmitter *emit, bool toRM, NormalOp op, const OpArg &operand, int bits) const;
bool IsImm() const {return scale == SCALE_IMM8 || scale == SCALE_IMM16 || scale == SCALE_IMM32 || scale == SCALE_IMM64;}
bool IsSimpleReg() const {return scale == SCALE_NONE;}
bool IsSimpleReg(X64Reg reg) const {
if (!IsSimpleReg())
return false;
return GetSimpleReg() == reg;
}
AX = 0, BX = 3, CX = 1, DX = 2,
SI = 6, DI = 7, BP = 5, SP = 4,
XMM0=0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7,
XMM8, XMM9, XMM10, XMM11, XMM12, XMM13, XMM14, XMM15,
INVALID_REG = 0xFFFFFFFF
};
enum CCFlags
bool CanDoOpWith(const OpArg &other) const
{
CC_O = 0,
CC_NO = 1,
CC_B = 2, CC_C = 2, CC_NAE = 2,
CC_NB = 3, CC_NC = 3, CC_AE = 3,
CC_Z = 4, CC_E = 4,
CC_NZ = 5, CC_NE = 5,
CC_BE = 6, CC_NA = 6,
CC_NBE = 7, CC_A = 7,
CC_S = 8,
CC_NS = 9,
CC_P = 0xA, CC_PE = 0xA,
CC_NP = 0xB, CC_PO = 0xB,
CC_L = 0xC, CC_NGE = 0xC,
CC_NL = 0xD, CC_GE = 0xD,
CC_LE = 0xE, CC_NG = 0xE,
CC_NLE = 0xF, CC_G = 0xF
};
if (IsSimpleReg()) return true;
if (!IsSimpleReg() && !other.IsSimpleReg() && !other.IsImm()) return false;
return true;
}
enum
int GetImmBits() const
{
NUMGPRs = 16,
NUMXMMs = 16,
};
switch (scale)
{
case SCALE_IMM8: return 8;
case SCALE_IMM16: return 16;
case SCALE_IMM32: return 32;
case SCALE_IMM64: return 64;
default: return -1;
}
}
enum
X64Reg GetSimpleReg() const
{
SCALE_NONE = 0,
SCALE_1 = 1,
SCALE_2 = 2,
SCALE_4 = 4,
SCALE_8 = 8,
SCALE_ATREG = 16,
SCALE_RIP = 0xFF,
SCALE_IMM8 = 0xF0,
SCALE_IMM16 = 0xF1,
SCALE_IMM32 = 0xF2,
SCALE_IMM64 = 0xF3,
};
if (scale == SCALE_NONE)
return (X64Reg)offsetOrBaseReg;
else
return INVALID_REG;
}
private:
u8 scale;
u8 offsetOrBaseReg;
u8 indexReg;
};
inline OpArg M(void *ptr) {return OpArg((u64)ptr, (int)SCALE_RIP);}
inline OpArg R(X64Reg value) {return OpArg(0, SCALE_NONE, value);}
inline OpArg MatR(X64Reg value) {return OpArg(0, SCALE_ATREG, value);}
inline OpArg MDisp(X64Reg value, int offset) {
return OpArg((u32)offset, SCALE_ATREG, value); }
inline OpArg MComplex(X64Reg base, X64Reg scaled, int scale, int offset)
{
return OpArg(offset, scale, base, scaled);
}
inline OpArg Imm8 (u8 imm) {return OpArg(imm, SCALE_IMM8);}
inline OpArg Imm16(u16 imm) {return OpArg(imm, SCALE_IMM16);} //rarely used
inline OpArg Imm32(u32 imm) {return OpArg(imm, SCALE_IMM32);}
inline OpArg Imm64(u64 imm) {return OpArg(imm, SCALE_IMM64);}
#ifdef _M_X64
inline OpArg ImmPtr(void* imm) {return Imm64((u64)imm);}
#else
inline OpArg ImmPtr(void* imm) {return Imm32((u32)imm);}
#endif
struct FixupBranch
{
u8 *ptr;
int type; //0 = 8bit 1 = 32bit
};
enum SSECompare
{
EQ = 0,
LT,
LE,
UNORD,
NEQ,
NLT,
NLE,
ORD,
};
typedef const u8* JumpTarget;
class XEmitter
{
friend struct OpArg; // for Write8 etc
private:
u8 *code;
void Rex(int w, int r, int x, int b);
void WriteSimple1Byte(int bits, u8 byte, X64Reg reg);
void WriteSimple2Byte(int bits, u8 byte1, u8 byte2, X64Reg reg);
void WriteMulDivType(int bits, OpArg src, int ext);
void WriteBitSearchType(int bits, X64Reg dest, OpArg src, u8 byte2);
void WriteShift(int bits, OpArg dest, OpArg &shift, int ext);
void WriteMXCSR(OpArg arg, int ext);
void WriteSSEOp(int size, u8 sseOp, bool packed, X64Reg regOp, OpArg arg, int extrabytes = 0);
void WriteNormalOp(XEmitter *emit, int bits, NormalOp op, const OpArg &a1, const OpArg &a2);
protected:
inline void Write8(u8 value) {*code++ = value;}
inline void Write16(u16 value) {*(u16*)code = (value); code += 2;}
inline void Write32(u32 value) {*(u32*)code = (value); code += 4;}
inline void Write64(u64 value) {*(u64*)code = (value); code += 8;}
public:
XEmitter() { code = NULL; }
XEmitter(u8 *code_ptr) { code = code_ptr; }
void WriteModRM(int mod, int rm, int reg);
void WriteSIB(int scale, int index, int base);
void SetCodePtr(u8 *ptr);
void ReserveCodeSpace(int bytes);
const u8 *AlignCode4();
const u8 *AlignCode16();
const u8 *AlignCodePage();
const u8 *GetCodePtr();
const u8 *GetCodePtr() const;
u8 *GetWritableCodePtr();
// Looking for one of these? It's BANNED!! Some instructions are slow on modern CPUs:
// INC, DEC, LOOP, LOOPNE, LOOPE, ENTER, LEAVE, XCHG, XLAT, REP MOVSB/MOVSD, REP SCASD + other string instr.,
// INC and DEC are slow on Intel Core, but not on AMD. They create a
// false flag dependency because they only update a subset of the flags.
// XCHG is SLOW and should be avoided.
// Safe way to temporarily redirect the code generator.
class GenContext
{
u8 **code_ptr_ptr;
u8 *saved_ptr;
public:
GenContext(u8 **code_ptr_ptr_)
{
saved_ptr = GetWritableCodePtr();
code_ptr_ptr = code_ptr_ptr_;
SetCodePtr(*code_ptr_ptr);
}
~GenContext()
{
*code_ptr_ptr = GetWritableCodePtr();
SetCodePtr(saved_ptr);
}
};
enum NormalOp {
nrmADD,
nrmADC,
nrmSUB,
nrmSBB,
nrmAND,
nrmOR ,
nrmXOR,
nrmMOV,
nrmTEST,
nrmCMP,
nrmXCHG,
};
// Make the generation routine examine which direction to go
// probably has to be a static
// RIP addressing does not benefit from micro op fusion on Core arch
struct OpArg
{
OpArg() {} //dummy op arg, used for storage
OpArg(u64 _offset, int _scale, X64Reg rmReg = RAX, X64Reg scaledReg = RAX)
{
operandReg = 0;
scale = (u8)_scale;
offsetOrBaseReg = (u8)rmReg;
indexReg = (u8)scaledReg;
//if scale == 0 never mind offseting
offset = _offset;
}
void WriteRex(bool op64, int customOp = -1) const;
void WriteRest(int extraBytes=0, X64Reg operandReg=(X64Reg)0xFF) const;
void WriteSingleByteOp(u8 op, X64Reg operandReg, int bits);
//This one is public - must be written to
u64 offset; //use RIP-relative as much as possible - avoid 64-bit immediates at all costs
u8 operandReg;
void WriteNormalOp(bool toRM, NormalOp op, const OpArg &operand, int bits) const;
bool IsImm() const {return scale == SCALE_IMM8 || scale == SCALE_IMM16 || scale == SCALE_IMM32 || scale == SCALE_IMM64;}
bool IsSimpleReg() const {return scale == SCALE_NONE;}
bool IsSimpleReg(X64Reg reg) const {
if (!IsSimpleReg())
return false;
return GetSimpleReg() == reg;
}
bool CanDoOpWith(const OpArg &other) const
{
if (IsSimpleReg()) return true;
if (!IsSimpleReg() && !other.IsSimpleReg() && !other.IsImm()) return false;
return true;
}
int GetImmBits() const
{
switch (scale)
{
case SCALE_IMM8: return 8;
case SCALE_IMM16: return 16;
case SCALE_IMM32: return 32;
case SCALE_IMM64: return 64;
default: return -1;
}
}
X64Reg GetSimpleReg() const
{
if (scale == SCALE_NONE)
return (X64Reg)offsetOrBaseReg;
else
return INVALID_REG;
}
private:
u8 scale;
u8 offsetOrBaseReg;
u8 indexReg;
};
inline OpArg M(void *ptr) {return OpArg((u64)ptr, (int)SCALE_RIP);}
inline OpArg R(X64Reg value) {return OpArg(0, SCALE_NONE, value);}
inline OpArg MatR(X64Reg value) {return OpArg(0, SCALE_ATREG, value);}
inline OpArg MDisp(X64Reg value, int offset) {
return OpArg((u32)offset, SCALE_ATREG, value); }
inline OpArg MComplex(X64Reg base, X64Reg scaled, int scale, int offset)
{
return OpArg(offset, scale, base, scaled);
}
inline OpArg Imm8 (u8 imm) {return OpArg(imm, SCALE_IMM8);}
inline OpArg Imm16(u16 imm) {return OpArg(imm, SCALE_IMM16);} //rarely used
inline OpArg Imm32(u32 imm) {return OpArg(imm, SCALE_IMM32);}
inline OpArg Imm64(u64 imm) {return OpArg(imm, SCALE_IMM64);}
#ifdef _M_X64
inline OpArg ImmPtr(void* imm) {return Imm64((u64)imm);}
#else
inline OpArg ImmPtr(void* imm) {return Imm32((u32)imm);}
#endif
// Debug breakpoint
void INT3();
// Do nothing
void NOP(int count = 1); //nop padding - TODO: fast nop slides, for amd and intel (check their manuals)
// Save energy in wait-loops on P4 only. Probably not too useful.
void PAUSE();
void RET();
// Flag control
void STC();
void CLC();
void CMC();
// These two cannot be executed in 64-bit mode on early Intel 64-bit CPUs, only on Core 2 and AMD!
void LAHF(); // 3 cycle vector path
void SAHF(); // direct path fast
// Stack control
void PUSH(X64Reg reg);
void POP(X64Reg reg);
void PUSH(int bits, const OpArg &reg);
void POP(int bits, const OpArg &reg);
void PUSHF();
void POPF();
typedef const u8* JumpTarget;
struct FixupBranch
{
u8 *ptr;
int type; //0 = 8bit 1 = 32bit
};
// Flow control
void RET();
void RET_FAST();
void UD2();
FixupBranch J(bool force5bytes = false);
void JMP(const u8 * addr, bool force5Bytes = false);
@ -239,7 +286,7 @@ namespace Gen
void JMPptr(const OpArg &arg);
void JMPself(); //infinite loop!
void CALL(void *fnptr);
void CALL(const void *fnptr);
void CALLptr(OpArg arg);
FixupBranch J_CC(CCFlags conditionCode, bool force5bytes = false);
@ -248,66 +295,20 @@ namespace Gen
void SetJumpTarget(const FixupBranch &branch);
//WARNING - INC and DEC are slow on Intel Core, but not on AMD, since they create
//false flag dependencies because they only update a subset of the flags
// ector - I hereby BAN inc and dec due to their horribleness :P
// void INC(int bits, OpArg arg);
// void DEC(int bits, OpArg arg);
void SETcc(CCFlags flag, OpArg dest);
// Note: CMOV brings small if any benefit on current cpus, unfortunately.
// Note: CMOV brings small if any benefit on current cpus.
void CMOVcc(int bits, X64Reg dest, OpArg src, CCFlags flag);
// Fences
void LFENCE();
void MFENCE();
void SFENCE();
// Bit scan
void BSF(int bits, X64Reg dest, OpArg src); //bottom bit to top bit
void BSR(int bits, X64Reg dest, OpArg src); //top bit to bottom bit
//These two cannot be executed on early Intel 64-bit CPUs, only on AMD!
void LAHF(); // 3 cycle vector path
void SAHF(); // direct path fast
//Looking for one of these? It's BANNED!! Some instructions are slow on modern CPUs:
//LOOP, LOOPNE, LOOPE, ENTER, LEAVE, XLAT, REP MOVSB/MOVSD, REP SCASD + other string instr.,
//Actually REP MOVSD could be useful :P
void MOVNTI(int bits, OpArg dest, X64Reg src);
void MUL(int bits, OpArg src); //UNSIGNED
void DIV(int bits, OpArg src);
void IMUL(int bits, OpArg src); //SIGNED
void IDIV(int bits, OpArg src);
void IMUL(int bits, X64Reg regOp, OpArg src);
void IMUL(int bits, X64Reg regOp, OpArg src, OpArg imm);
void NEG(int bits, OpArg src);
void NOT(int bits, OpArg src);
void ROL(int bits, OpArg dest, OpArg shift);
void ROR(int bits, OpArg dest, OpArg shift);
void RCL(int bits, OpArg dest, OpArg shift);
void RCR(int bits, OpArg dest, OpArg shift);
void SHL(int bits, OpArg dest, OpArg shift);
void SHR(int bits, OpArg dest, OpArg shift);
void SAR(int bits, OpArg dest, OpArg shift);
void CWD(int bits = 16);
inline void CDQ() {CWD(32);}
inline void CQO() {CWD(64);}
void CBW(int bits = 8);
inline void CWDE() {CBW(16);}
inline void CDQE() {CBW(32);}
void LEA(int bits, X64Reg dest, OpArg src);
// Cache control
enum PrefetchLevel
{
PF_NTA, //Non-temporal (data used once and only once)
@ -316,58 +317,82 @@ namespace Gen
PF_T2, //Levels 3+ (aliased to T0 on AMD)
};
void PREFETCH(PrefetchLevel level, OpArg arg);
void MOVNTI(int bits, OpArg dest, X64Reg src);
void MOVNTDQ(OpArg arg, X64Reg regOp);
void MOVNTPS(OpArg arg, X64Reg regOp);
void MOVNTPD(OpArg arg, X64Reg regOp);
// Multiplication / division
void MUL(int bits, OpArg src); //UNSIGNED
void IMUL(int bits, OpArg src); //SIGNED
void IMUL(int bits, X64Reg regOp, OpArg src);
void IMUL(int bits, X64Reg regOp, OpArg src, OpArg imm);
void DIV(int bits, OpArg src);
void IDIV(int bits, OpArg src);
// Shift
void ROL(int bits, OpArg dest, OpArg shift);
void ROR(int bits, OpArg dest, OpArg shift);
void RCL(int bits, OpArg dest, OpArg shift);
void RCR(int bits, OpArg dest, OpArg shift);
void SHL(int bits, OpArg dest, OpArg shift);
void SHR(int bits, OpArg dest, OpArg shift);
void SAR(int bits, OpArg dest, OpArg shift);
// Extend EAX into EDX in various ways
void CWD(int bits = 16);
inline void CDQ() {CWD(32);}
inline void CQO() {CWD(64);}
void CBW(int bits = 8);
inline void CWDE() {CBW(16);}
inline void CDQE() {CBW(32);}
// Load effective address
void LEA(int bits, X64Reg dest, OpArg src);
// Integer arithmetic
void NEG (int bits, OpArg src);
void ADD (int bits, const OpArg &a1, const OpArg &a2);
void ADC (int bits, const OpArg &a1, const OpArg &a2);
void SUB (int bits, const OpArg &a1, const OpArg &a2);
void SBB (int bits, const OpArg &a1, const OpArg &a2);
void AND (int bits, const OpArg &a1, const OpArg &a2);
void CMP (int bits, const OpArg &a1, const OpArg &a2);
// Bit operations
void NOT (int bits, OpArg src);
void OR (int bits, const OpArg &a1, const OpArg &a2);
void XOR (int bits, const OpArg &a1, const OpArg &a2);
void MOV (int bits, const OpArg &a1, const OpArg &a2);
void TEST(int bits, const OpArg &a1, const OpArg &a2);
void CMP (int bits, const OpArg &a1, const OpArg &a2);
// XCHG is SLOW and should be avoided.
//void XCHG(int bits, const OpArg &a1, const OpArg &a2);
// Are these useful at all? Consider removing.
void XCHG(int bits, const OpArg &a1, const OpArg &a2);
void XCHG_AHAL();
// Byte swapping (32 and 64-bit only).
void BSWAP(int bits, X64Reg reg);
// Sign/zero extension
void MOVSX(int dbits, int sbits, X64Reg dest, OpArg src); //automatically uses MOVSXD if necessary
void MOVZX(int dbits, int sbits, X64Reg dest, OpArg src);
enum SSECompare
{
EQ = 0,
LT,
LE,
UNORD,
NEQ,
NLT,
NLE,
ORD,
};
// WARNING - These two take 11-13 cycles and are VectorPath! (AMD64)
void STMXCSR(OpArg memloc);
void LDMXCSR(OpArg memloc);
// Regular SSE/SSE2 instructions
// Prefixes
void LOCK();
void REP();
void REPNE();
void FWAIT();
// SSE/SSE2: Floating point arithmetic
void ADDSS(X64Reg regOp, OpArg arg);
void ADDSD(X64Reg regOp, OpArg arg);
void SUBSS(X64Reg regOp, OpArg arg);
void SUBSD(X64Reg regOp, OpArg arg);
void CMPSS(X64Reg regOp, OpArg arg, u8 compare);
void CMPSD(X64Reg regOp, OpArg arg, u8 compare);
void ANDSS(X64Reg regOp, OpArg arg);
void ANDSD(X64Reg regOp, OpArg arg);
void ANDNSS(X64Reg regOp, OpArg arg);
void ANDNSD(X64Reg regOp, OpArg arg);
void ORSS(X64Reg regOp, OpArg arg);
void ORSD(X64Reg regOp, OpArg arg);
void XORSS(X64Reg regOp, OpArg arg);
void XORSD(X64Reg regOp, OpArg arg);
void MULSS(X64Reg regOp, OpArg arg);
void MULSD(X64Reg regOp, OpArg arg);
void DIVSS(X64Reg regOp, OpArg arg);
@ -381,45 +406,65 @@ namespace Gen
void RSQRTSS(X64Reg regOp, OpArg arg);
void RSQRTSD(X64Reg regOp, OpArg arg);
void COMISS(X64Reg regOp, OpArg arg);
void COMISD(X64Reg regOp, OpArg arg);
// SSE/SSE2: Floating point bitwise (yes)
void CMPSS(X64Reg regOp, OpArg arg, u8 compare);
void CMPSD(X64Reg regOp, OpArg arg, u8 compare);
void ANDSS(X64Reg regOp, OpArg arg);
void ANDSD(X64Reg regOp, OpArg arg);
void ANDNSS(X64Reg regOp, OpArg arg);
void ANDNSD(X64Reg regOp, OpArg arg);
void ORSS(X64Reg regOp, OpArg arg);
void ORSD(X64Reg regOp, OpArg arg);
void XORSS(X64Reg regOp, OpArg arg);
void XORSD(X64Reg regOp, OpArg arg);
// SSE/SSE2: Floating point packed arithmetic (x4 for float, x2 for double)
void ADDPS(X64Reg regOp, OpArg arg);
void ADDPD(X64Reg regOp, OpArg arg);
void SUBPS(X64Reg regOp, OpArg arg);
void SUBPD(X64Reg regOp, OpArg arg);
void CMPPS(X64Reg regOp, OpArg arg, u8 compare);
void CMPPD(X64Reg regOp, OpArg arg, u8 compare);
void ANDPS(X64Reg regOp, OpArg arg);
void ANDPD(X64Reg regOp, OpArg arg);
void ANDNPS(X64Reg regOp, OpArg arg);
void ANDNPD(X64Reg regOp, OpArg arg);
void ORPS(X64Reg regOp, OpArg arg);
void ORPD(X64Reg regOp, OpArg arg);
void XORPS(X64Reg regOp, OpArg arg);
void XORPD(X64Reg regOp, OpArg arg);
void MULPS(X64Reg regOp, OpArg arg);
void MULPD(X64Reg regOp, OpArg arg);
void DIVPS(X64Reg regOp, OpArg arg);
void DIVPD(X64Reg regOp, OpArg arg);
void MINPS(X64Reg regOp, OpArg arg);
void MINPD(X64Reg regOp, OpArg arg);
void MAXPS(X64Reg regOp, OpArg arg);
void MAXPD(X64Reg regOp, OpArg arg);
void CMPPD(X64Reg regOp, OpArg arg, u8 compare);
void MULPS(X64Reg regOp, OpArg arg);
void MULPD(X64Reg regOp, OpArg arg);
void DIVPS(X64Reg regOp, OpArg arg);
void DIVPD(X64Reg regOp, OpArg arg);
void MINPS(X64Reg regOp, OpArg arg);
void MINPD(X64Reg regOp, OpArg arg);
void MAXPS(X64Reg regOp, OpArg arg);
void MAXPD(X64Reg regOp, OpArg arg);
void SQRTPS(X64Reg regOp, OpArg arg);
void SQRTPD(X64Reg regOp, OpArg arg);
void RSQRTPS(X64Reg regOp, OpArg arg);
void RSQRTPD(X64Reg regOp, OpArg arg);
// SSE/SSE2: Floating point packed bitwise (x4 for float, x2 for double)
void ANDPS(X64Reg regOp, OpArg arg);
void ANDPD(X64Reg regOp, OpArg arg);
void ANDNPS(X64Reg regOp, OpArg arg);
void ANDNPD(X64Reg regOp, OpArg arg);
void ORPS(X64Reg regOp, OpArg arg);
void ORPD(X64Reg regOp, OpArg arg);
void XORPS(X64Reg regOp, OpArg arg);
void XORPD(X64Reg regOp, OpArg arg);
// SSE/SSE2: Shuffle components. These are tricky - see Intel documentation.
void SHUFPS(X64Reg regOp, OpArg arg, u8 shuffle);
void SHUFPD(X64Reg regOp, OpArg arg, u8 shuffle);
// SSE/SSE2: Useful alternative to shuffle in some cases.
void MOVDDUP(X64Reg regOp, OpArg arg);
void UNPCKLPD(X64Reg dest, OpArg src);
void UNPCKHPD(X64Reg dest, OpArg src);
// SSE/SSE2: Compares.
void COMISS(X64Reg regOp, OpArg arg);
void COMISD(X64Reg regOp, OpArg arg);
void UCOMISS(X64Reg regOp, OpArg arg);
void UCOMISD(X64Reg regOp, OpArg arg);
// SSE/SSE2: Moves. Use the right data type for your data, in most cases.
void MOVAPS(X64Reg regOp, OpArg arg);
void MOVAPD(X64Reg regOp, OpArg arg);
void MOVAPS(OpArg arg, X64Reg regOp);
@ -435,20 +480,20 @@ namespace Gen
void MOVSS(OpArg arg, X64Reg regOp);
void MOVSD(OpArg arg, X64Reg regOp);
void MOVMSKPS(X64Reg dest, OpArg arg);
void MOVMSKPD(X64Reg dest, OpArg arg);
void MOVD_xmm(X64Reg dest, const OpArg &arg);
void MOVQ_xmm(X64Reg dest, OpArg arg);
void MOVD_xmm(const OpArg &arg, X64Reg src);
void MOVQ_xmm(OpArg arg, X64Reg src);
// SSE/SSE2: Generates a mask from the high bits of the components of the packed register in question.
void MOVMSKPS(X64Reg dest, OpArg arg);
void MOVMSKPD(X64Reg dest, OpArg arg);
// SSE2: Selective byte store, mask in src register. EDI/RDI specifies store address. This is a weird one.
void MASKMOVDQU(X64Reg dest, X64Reg src);
void LDDQU(X64Reg dest, OpArg src);
void UNPCKLPD(X64Reg dest, OpArg src);
void UNPCKHPD(X64Reg dest, OpArg src);
// SSE/SSE2: Data type conversions.
void CVTPS2PD(X64Reg dest, OpArg src);
void CVTPD2PS(X64Reg dest, OpArg src);
void CVTSS2SD(X64Reg dest, OpArg src);
@ -458,7 +503,7 @@ namespace Gen
void CVTPD2DQ(X64Reg regOp, OpArg arg);
void CVTDQ2PS(X64Reg regOp, const OpArg &arg);
//Integer SSE instructions
// SSE2: Packed integer instructions
void PACKSSDW(X64Reg dest, OpArg arg);
void PACKSSWB(X64Reg dest, OpArg arg);
//void PACKUSDW(X64Reg dest, OpArg arg);
@ -528,42 +573,138 @@ namespace Gen
void RTDSC();
void CallCdeclFunction3(void* fnptr, u32 arg0, u32 arg1, u32 arg2);
void CallCdeclFunction4(void* fnptr, u32 arg0, u32 arg1, u32 arg2, u32 arg3);
void CallCdeclFunction5(void* fnptr, u32 arg0, u32 arg1, u32 arg2, u32 arg3, u32 arg4);
void CallCdeclFunction6(void* fnptr, u32 arg0, u32 arg1, u32 arg2, u32 arg3, u32 arg4, u32 arg5);
// Utility functions
// These only support u32 parameters, but that's enough for a lot of uses.
// These will destroy the first 1 or 2 "parameter regs".
void ABI_CallFunctionC(void *func, u32 param1);
void ABI_CallFunctionCC(void *func, u32 param1, u32 param2);
void ABI_CallFunctionAC(void *func, const Gen::OpArg &arg1, u32 param2);
// Pass a register as a parameter.
void ABI_CallFunctionR(void *func, Gen::X64Reg reg1);
void ABI_CallFunctionRR(void *func, Gen::X64Reg reg1, Gen::X64Reg reg2);
// A function that doesn't have any control over what it will do to regs,
// such as the dispatcher, should be surrounded by these.
void ABI_PushAllCalleeSavedRegsAndAdjustStack();
void ABI_PopAllCalleeSavedRegsAndAdjustStack();
// A function that doesn't know anything about its surroundings should
// be surrounded by these to establish a safe environment where it can roam free.
// An example is a backpatch injected function.
void ABI_PushAllCallerSavedRegsAndAdjustStack();
void ABI_PopAllCallerSavedRegsAndAdjustStack();
unsigned int ABI_GetAlignedFrameSize(unsigned int frameSize);
void ABI_AlignStack(unsigned int frameSize);
void ABI_RestoreStack(unsigned int frameSize);
// Sets up a __cdecl function.
// Only x64 really needs the parameter.
void ABI_EmitPrologue(int maxCallParams);
void ABI_EmitEpilogue(int maxCallParams);
#ifdef _M_IX86
inline int ABI_GetNumXMMRegs() { return 8; }
#else
inline int ABI_GetNumXMMRegs() { return 16; }
#endif
// Strange call wrappers.
void CallCdeclFunction3(void* fnptr, u32 arg0, u32 arg1, u32 arg2);
void CallCdeclFunction4(void* fnptr, u32 arg0, u32 arg1, u32 arg2, u32 arg3);
void CallCdeclFunction5(void* fnptr, u32 arg0, u32 arg1, u32 arg2, u32 arg3, u32 arg4);
void CallCdeclFunction6(void* fnptr, u32 arg0, u32 arg1, u32 arg2, u32 arg3, u32 arg4, u32 arg5);
#if defined(_M_IX86) || !defined(_WIN32)
#define CallCdeclFunction3_I(a,b,c,d) CallCdeclFunction3((void *)(a), (b), (c), (d))
#define CallCdeclFunction4_I(a,b,c,d,e) CallCdeclFunction4((void *)(a), (b), (c), (d), (e))
#define CallCdeclFunction5_I(a,b,c,d,e,f) CallCdeclFunction5((void *)(a), (b), (c), (d), (e), (f))
#define CallCdeclFunction6_I(a,b,c,d,e,f,g) CallCdeclFunction6((void *)(a), (b), (c), (d), (e), (f), (g))
#define CallCdeclFunction3_I(a,b,c,d) CallCdeclFunction3((void *)(a), (b), (c), (d))
#define CallCdeclFunction4_I(a,b,c,d,e) CallCdeclFunction4((void *)(a), (b), (c), (d), (e))
#define CallCdeclFunction5_I(a,b,c,d,e,f) CallCdeclFunction5((void *)(a), (b), (c), (d), (e), (f))
#define CallCdeclFunction6_I(a,b,c,d,e,f,g) CallCdeclFunction6((void *)(a), (b), (c), (d), (e), (f), (g))
#define DECLARE_IMPORT(x)
#define DECLARE_IMPORT(x)
#else
// Comments from VertexLoader.cpp about these horrors:
// Comments from VertexLoader.cpp about these horrors:
// This is a horrible hack that is necessary in 64-bit mode because Opengl32.dll is based way, way above the 32-bit
// address space that is within reach of a CALL, and just doing &fn gives us these high uncallable addresses. So we
// want to grab the function pointers from the import table instead.
// This is a horrible hack that is necessary in 64-bit mode because Opengl32.dll is based way, way above the 32-bit
// address space that is within reach of a CALL, and just doing &fn gives us these high uncallable addresses. So we
// want to grab the function pointers from the import table instead.
void ___CallCdeclImport3(void* impptr, u32 arg0, u32 arg1, u32 arg2);
void ___CallCdeclImport4(void* impptr, u32 arg0, u32 arg1, u32 arg2, u32 arg3);
void ___CallCdeclImport5(void* impptr, u32 arg0, u32 arg1, u32 arg2, u32 arg3, u32 arg4);
void ___CallCdeclImport6(void* impptr, u32 arg0, u32 arg1, u32 arg2, u32 arg3, u32 arg4, u32 arg5);
void ___CallCdeclImport3(void* impptr, u32 arg0, u32 arg1, u32 arg2);
void ___CallCdeclImport4(void* impptr, u32 arg0, u32 arg1, u32 arg2, u32 arg3);
void ___CallCdeclImport5(void* impptr, u32 arg0, u32 arg1, u32 arg2, u32 arg3, u32 arg4);
void ___CallCdeclImport6(void* impptr, u32 arg0, u32 arg1, u32 arg2, u32 arg3, u32 arg4, u32 arg5);
#define CallCdeclFunction3_I(a,b,c,d) ___CallCdeclImport3(&__imp_##a,b,c,d)
#define CallCdeclFunction4_I(a,b,c,d,e) ___CallCdeclImport4(&__imp_##a,b,c,d,e)
#define CallCdeclFunction5_I(a,b,c,d,e,f) ___CallCdeclImport5(&__imp_##a,b,c,d,e,f)
#define CallCdeclFunction6_I(a,b,c,d,e,f,g) ___CallCdeclImport6(&__imp_##a,b,c,d,e,f,g)
#define CallCdeclFunction3_I(a,b,c,d) ___CallCdeclImport3(&__imp_##a,b,c,d)
#define CallCdeclFunction4_I(a,b,c,d,e) ___CallCdeclImport4(&__imp_##a,b,c,d,e)
#define CallCdeclFunction5_I(a,b,c,d,e,f) ___CallCdeclImport5(&__imp_##a,b,c,d,e,f)
#define CallCdeclFunction6_I(a,b,c,d,e,f,g) ___CallCdeclImport6(&__imp_##a,b,c,d,e,f,g)
#define DECLARE_IMPORT(x) extern "C" void *__imp_##x
#define DECLARE_IMPORT(x) extern "C" void *__imp_##x
#endif
}; // class XEmitter
}
// Everything that needs to generate X86 code should inherit from this.
// You get memory management for free, plus, you can use all the MOV etc functions without
// having to prefix them with gen-> or something similar.
class XCodeBlock : public XEmitter
{
protected:
u8 *region;
size_t region_size;
public:
XCodeBlock() : region(NULL), region_size(0) {}
virtual ~XCodeBlock() { if (region) FreeCodeSpace(); }
// Call this before you generate any code.
void AllocCodeSpace(int size)
{
region_size = size;
region = (u8*)AllocateExecutableMemory(region_size);
SetCodePtr(region);
}
// Always clear code space with breakpoints, so that if someone accidentally executes
// uninitialized memory, it just breaks into the debugger.
void ClearCodeSpace()
{
// x86/64: 0xCC = breakpoint
memset(region, 0xCC, region_size);
ResetCodePtr();
}
// Call this when shutting down. Don't rely on the destructor, even though it'll do the job.
void FreeCodeSpace()
{
FreeMemoryPages(region, region_size);
region = NULL;
region_size = 0;
}
// Cannot currently be undone. Will write protect the entire code region.
// Start over if you need to change the code (call FreeCodeSpace(), AllocCodeSpace()).
void WriteProtect()
{
WriteProtectMemory(region, region_size, true);
}
void ResetCodePtr()
{
SetCodePtr(region);
}
size_t GetSpaceLeft() const
{
return region_size - (GetCodePtr() - region);
}
};
} // namespace
#endif
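
A minimal lifecycle sketch for XCodeBlock, assuming the declarations above; ConstBlock, its routine and the function-pointer cast are hypothetical, but the allocate/emit/WriteProtect/free sequence mirrors how AsmRoutineManager and the Jit use the class in this commit.

#include "x64Emitter.h"

class ConstBlock : public Gen::XCodeBlock   // hypothetical
{
public:
    const u8 *fortyTwo;

    void Init()
    {
        using namespace Gen;
        AllocCodeSpace(4096);         // grab executable memory and point the emitter at it
        fortyTwo = GetCodePtr();      // remember where this routine starts
        MOV(32, R(EAX), Imm32(42));   // return value goes in EAX on both x86 and x64
        RET();
        WriteProtect();               // done emitting - make the region read/execute only
    }
    void Shutdown() { FreeCodeSpace(); }
};

// Usage sketch:
//   ConstBlock block;
//   block.Init();
//   int value = ((int (*)())block.fortyTwo)();   // value == 42
//   block.Shutdown();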

View File

@ -46,7 +46,7 @@ namespace HW
{
CoreTiming::Init();
Thunk_Init(); // not really hw, but this way we know it's inited early :P
thunks.Init(); // not really hw, but this way we know it's inited early :P
State_Init();
// Init the whole Hardware
@ -88,7 +88,7 @@ namespace HW
}
State_Shutdown();
Thunk_Shutdown();
thunks.Shutdown();
CoreTiming::Shutdown();
}

View File

@ -104,7 +104,7 @@ LONG NTAPI Handler(PEXCEPTION_POINTERS pPtrs)
//We could emulate the memory accesses here, but then they would still be around to take up
//execution resources. Instead, we backpatch into a generic memory call and retry.
u8 *new_rip = jit.BackPatch(codePtr, accessType, emAddress, ctx);
const u8 *new_rip = jit.BackPatch(codePtr, accessType, emAddress, ctx);
// Rip/Eip needs to be updated.
if (new_rip)

View File

@ -164,6 +164,8 @@ ps_adds1
Jit64 jit;
PPCAnalyst::CodeBuffer code_buffer(32000);
int CODE_SIZE = 1024*1024*16;
namespace CPUCompare
{
extern u32 m_BlockStart;
@ -171,6 +173,11 @@ namespace CPUCompare
void Jit64::Init()
{
if (Core::g_CoreStartupParameter.bJITUnlimitedCache)
{
CODE_SIZE = 1024*1024*8*8;
}
jo.optimizeStack = true;
jo.enableBlocklink = true; // Speed boost, but not 100% safe
#ifdef _M_X64
@ -182,6 +189,23 @@ namespace CPUCompare
jo.fpAccurateFlags = true;
jo.optimizeGatherPipe = true;
jo.fastInterrupts = false;
gpr.SetEmitter(this);
fpr.SetEmitter(this);
trampolines.Init();
AllocCodeSpace(CODE_SIZE);
InitCache();
asm_routines.Init();
}
void Jit64::Shutdown()
{
FreeCodeSpace();
ShutdownCache();
trampolines.Shutdown();
asm_routines.Shutdown();
}
void Jit64::WriteCallInterpreter(UGeckoInstruction _inst)
@ -271,7 +295,7 @@ namespace CPUCompare
else
{
MOV(32, M(&PC), Imm32(destination));
JMP(Asm::dispatcher, true);
JMP(asm_routines.dispatcher, true);
}
}
@ -280,7 +304,7 @@ namespace CPUCompare
MOV(32, M(&PC), R(EAX));
Cleanup();
SUB(32, M(&CoreTiming::downcount), js.downcountAmount > 127 ? Imm32(js.downcountAmount) : Imm8(js.downcountAmount));
JMP(Asm::dispatcher, true);
JMP(asm_routines.dispatcher, true);
}
void Jit64::WriteRfiExitDestInEAX()
@ -288,7 +312,7 @@ namespace CPUCompare
MOV(32, M(&PC), R(EAX));
Cleanup();
SUB(32, M(&CoreTiming::downcount), js.downcountAmount > 127 ? Imm32(js.downcountAmount) : Imm8(js.downcountAmount));
JMP(Asm::testExceptions, true);
JMP(asm_routines.testExceptions, true);
}
void Jit64::WriteExceptionExit(u32 exception)
@ -296,7 +320,7 @@ namespace CPUCompare
Cleanup();
OR(32, M(&PowerPC::ppcState.Exceptions), Imm32(exception));
MOV(32, M(&PC), Imm32(js.compilerPC + 4));
JMP(Asm::testExceptions, true);
JMP(asm_routines.testExceptions, true);
}
const u8* Jit64::DoJit(u32 emaddress, JitBlock &b)
@ -326,11 +350,13 @@ namespace CPUCompare
// Downcount flag check. The last block decremented downcounter, and the flag should still be available.
FixupBranch skip = J_CC(CC_NBE);
MOV(32, M(&PC), Imm32(js.blockStart));
JMP(Asm::doTiming, true); // downcount hit zero - go doTiming.
JMP(asm_routines.doTiming, true); // downcount hit zero - go doTiming.
SetJumpTarget(skip);
const u8 *normalEntry = GetCodePtr();
if (ImHereDebug) CALL((void *)&ImHere); //Used to get a trace of the last few blocks before a crash, sometimes VERY useful
if (ImHereDebug)
CALL((void *)&ImHere); //Used to get a trace of the last few blocks before a crash, sometimes VERY useful
if (js.fpa.any)
{
@ -338,7 +364,7 @@ namespace CPUCompare
TEST(32, M(&PowerPC::ppcState.msr), Imm32(1 << 13)); //Test FP enabled bit
FixupBranch b1 = J_CC(CC_NZ);
MOV(32, M(&PC), Imm32(js.blockStart));
JMP(Asm::fpException, true);
JMP(asm_routines.fpException, true);
SetJumpTarget(b1);
}
@ -348,7 +374,7 @@ namespace CPUCompare
TEST(32, M(&PowerPC::ppcState.Exceptions), Imm32(0xFFFFFFFF));
FixupBranch b1 = J_CC(CC_Z);
MOV(32, M(&PC), Imm32(js.blockStart));
JMP(Asm::testExceptions, true);
JMP(asm_routines.testExceptions, true);
SetJumpTarget(b1);
}
@ -404,7 +430,7 @@ namespace CPUCompare
if (jo.optimizeGatherPipe && js.fifoBytesThisBlock >= 32)
{
js.fifoBytesThisBlock -= 32;
CALL(ProtectFunction((void *)&GPFifo::CheckGatherPipe, 0));
CALL(thunks.ProtectFunction((void *)&GPFifo::CheckGatherPipe, 0));
}
PPCTables::CompileInstruction(ops[i].inst);

View File

@ -24,7 +24,9 @@
#include "../PPCAnalyst.h"
#include "JitCache.h"
#include "JitRegCache.h"
#include "x64Emitter.h"
#include "x64Analyzer.h"
#ifdef _WIN32
@ -47,8 +49,24 @@ struct CONTEXT
#endif
class Jit64
class TrampolineCache : public Gen::XCodeBlock
{
public:
void Init();
void Shutdown();
const u8 *GetReadTrampoline(const InstructionInfo &info);
const u8 *GetWriteTrampoline(const InstructionInfo &info);
};
class Jit64 : public Gen::XCodeBlock
{
TrampolineCache trampolines;
GPRRegCache gpr;
FPURegCache fpr;
public:
typedef void (*CompiledCode)();
@ -157,7 +175,7 @@ public:
bool RangeIntersect(int s1, int e1, int s2, int e2) const;
bool IsInJitCode(const u8 *codePtr);
u8 *BackPatch(u8 *codePtr, int accessType, u32 emAddress, CONTEXT *ctx);
const u8 *BackPatch(u8 *codePtr, int accessType, u32 emAddress, CONTEXT *ctx);
#define JIT_OPCODE 0
@ -165,6 +183,7 @@ public:
const u8* DoJit(u32 emaddress, JitBlock &b);
void Init();
void Shutdown();
// Utilities for use by opcodes
@ -188,10 +207,10 @@ public:
void ForceSinglePrecisionP(Gen::X64Reg xmm);
void JitClearCA();
void JitSetCA();
void tri_op(int d, int a, int b, bool reversible, void (*op)(Gen::X64Reg, Gen::OpArg));
void tri_op(int d, int a, int b, bool reversible, void (XEmitter::*op)(Gen::X64Reg, Gen::OpArg));
typedef u32 (*Operation)(u32 a, u32 b);
void regimmop(int d, int a, bool binary, u32 value, Operation doop, void(*op)(int, const Gen::OpArg&, const Gen::OpArg&), bool Rc = false, bool carry = false);
void fp_tri_op(int d, int a, int b, bool reversible, bool dupe, void (*op)(Gen::X64Reg, Gen::OpArg));
void regimmop(int d, int a, bool binary, u32 value, Operation doop, void (XEmitter::*op)(int, const Gen::OpArg&, const Gen::OpArg&), bool Rc = false, bool carry = false);
void fp_tri_op(int d, int a, int b, bool reversible, bool dupe, void (XEmitter::*op)(Gen::X64Reg, Gen::OpArg));
// OPCODES

View File

@ -31,27 +31,12 @@
#include "../../HW/CPUCompare.h"
#include "../../HW/GPFifo.h"
#include "../../Core.h"
#include "JitAsm.h"
using namespace Gen;
int blocksExecuted;
namespace Asm
{
const u8 *enterCode;
const u8 *testExceptions;
const u8 *fpException;
const u8 *doTiming;
const u8 *dispatcher;
const u8 *dispatcherNoCheck;
const u8 *dispatcherPcInEAX;
const u8 *computeRc;
const u8 *computeRcFp;
const u8 *fifoDirectWrite8;
const u8 *fifoDirectWrite16;
const u8 *fifoDirectWrite32;
const u8 *fifoDirectWriteFloat;
const u8 *fifoDirectWriteXmm64;
static int temp32;
bool compareEnabled = false;
@ -72,16 +57,15 @@ static bool enableStatistics = false;
//RBX - Base pointer of memory
//R15 - Pointer to array of block pointers
AsmRoutineManager asm_routines;
// PLAN: no more block numbers - crazy opcodes just contain offset within
// dynarec buffer
// At this offset - 4, there is an int specifying the block number.
void GenerateCommon();
#ifdef _M_IX86
void Generate()
void AsmRoutineManager::Generate()
{
enterCode = AlignCode16();
PUSH(EBP);
@ -129,7 +113,6 @@ void Generate()
ADD(32, M(&PowerPC::ppcState.DebugCount), Imm8(1));
}
//grab from list and jump to it
//INT3();
MOV(32, R(EDX), ImmPtr(jit.GetCodePointers()));
JMPptr(MComplex(EDX, EAX, 4, 0));
SetJumpTarget(notfound);
@ -180,12 +163,14 @@ void Generate()
#elif defined(_M_X64)
void Generate()
void AsmRoutineManager::Generate()
{
enterCode = AlignCode16();
ABI_PushAllCalleeSavedRegsAndAdjustStack();
if (!jit.GetCodePointers() || !Memory::base)
PanicAlert("Memory::base and jit.GetCodePointers() must return valid values");
MOV(64, R(RBX), Imm64((u64)Memory::base));
MOV(64, R(R15), Imm64((u64)jit.GetCodePointers())); //It's below 2GB so 32 bits are good enough
const u8 *outerLoop = GetCodePtr();
@ -264,7 +249,7 @@ void Generate()
}
#endif
void GenFifoWrite(int size)
void AsmRoutineManager::GenFifoWrite(int size)
{
// Assume value in ABI_PARAM1
PUSH(ESI);
@ -287,8 +272,7 @@ void GenFifoWrite(int size)
RET();
}
static int temp32;
void GenFifoFloatWrite()
void AsmRoutineManager::GenFifoFloatWrite()
{
// Assume value in XMM0
PUSH(ESI);
@ -306,7 +290,7 @@ void GenFifoFloatWrite()
RET();
}
void GenFifoXmm64Write()
void AsmRoutineManager::GenFifoXmm64Write()
{
// Assume value in XMM0. Assume pre-byteswapped (unlike the others here!)
PUSH(ESI);
@ -319,7 +303,7 @@ void GenFifoXmm64Write()
RET();
}
void GenerateCommon()
void AsmRoutineManager::GenerateCommon()
{
// USES_CR
computeRc = AlignCode16();
@ -364,5 +348,3 @@ void GenerateCommon()
SetJumpTarget(skip_fast_write);
CALL((void *)&Memory::Write_U8);*/
}
} // namespace Asm

View File

@ -14,33 +14,71 @@
// Official SVN repository and contact information can be found at
// http://code.google.com/p/dolphin-emu/
#ifndef _JITASM_H
#define _JITASM_H
namespace Asm
#include "x64Emitter.h"
// In Dolphin, we don't use inline assembly. Instead, we generate all machine-near
// code at runtime. In the case of fixed code like this, after writing it, we write
// protect the memory, essentially making it work just like precompiled code.
// There are some advantages to this approach:
// 1) No need to setup an external assembler in the build.
// 2) Cross platform, as long as it's x86/x64.
// 3) Can optimize code at runtime for the specific CPU model.
// There aren't really any disadvantages other than having to maintain an x86 emitter,
// which we have to do anyway :)
//
// To add a new asm routine, just add another const here, and add the code to Generate.
// Also, possibly increase the size of the code buffer.
class AsmRoutineManager : public Gen::XCodeBlock
{
extern const u8 *enterCode;
extern const u8 *dispatcher;
extern const u8 *dispatcherNoCheck;
extern const u8 *dispatcherPcInEAX;
extern const u8 *fpException;
extern const u8 *computeRc;
extern const u8 *computeRcFp;
extern const u8 *testExceptions;
extern const u8 *dispatchPcInEAX;
extern const u8 *doTiming;
extern const u8 *fifoDirectWrite8;
extern const u8 *fifoDirectWrite16;
extern const u8 *fifoDirectWrite32;
extern const u8 *fifoDirectWriteFloat;
extern const u8 *fifoDirectWriteXmm64;
extern bool compareEnabled;
private:
void Generate();
}
void GenerateCommon();
void GenFifoWrite(int size);
void GenFifoFloatWrite();
void GenFifoXmm64Write();
public:
void Init() {
AllocCodeSpace(8192);
Generate();
WriteProtect();
}
void Shutdown() {
FreeCodeSpace();
}
// Public generated functions. Just CALL(M((void*)func)) them.
const u8 *enterCode;
const u8 *dispatcher;
const u8 *dispatcherNoCheck;
const u8 *dispatcherPcInEAX;
const u8 *fpException;
const u8 *computeRc;
const u8 *computeRcFp;
const u8 *testExceptions;
const u8 *dispatchPcInEAX;
const u8 *doTiming;
const u8 *fifoDirectWrite8;
const u8 *fifoDirectWrite16;
const u8 *fifoDirectWrite32;
const u8 *fifoDirectWriteFloat;
const u8 *fifoDirectWriteXmm64;
bool compareEnabled;
};
extern AsmRoutineManager asm_routines;
#endif
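
As the comment above suggests, adding a routine is just a new member plus a few emitter calls in Generate(); a hypothetical sketch (clearEAX is made up):

// In AsmRoutineManager, add a member:
//     const u8 *clearEAX;
// In AsmRoutineManager::Generate() (or GenerateCommon()):
//     clearEAX = AlignCode16();
//     XOR(32, R(EAX), R(EAX));
//     RET();
// Then, from JIT code:
//     CALL((void *)asm_routines.clearEAX);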

View File

@ -33,7 +33,7 @@
using namespace Gen;
extern u8 *trampolineCodePtr;
void BackPatchError(const std::string &text, u8 *codePtr, u32 emAddress) {
u64 code_addr = (u64)codePtr;
disassembler disasm;
@ -51,17 +51,105 @@ void BackPatchError(const std::string &text, u8 *codePtr, u32 emAddress) {
return;
}
void TrampolineCache::Init()
{
AllocCodeSpace(1024 * 1024);
}
void TrampolineCache::Shutdown()
{
FreeCodeSpace();
}
// Extremely simplistic - just generate the requested trampoline. May reuse them in the future.
const u8 *TrampolineCache::GetReadTrampoline(const InstructionInfo &info)
{
if (GetSpaceLeft() < 1024)
PanicAlert("Trampoline cache full");
X64Reg addrReg = (X64Reg)info.scaledReg;
X64Reg dataReg = (X64Reg)info.regOperandReg;
const u8 *trampoline = GetCodePtr();
#ifdef _M_X64
// It's a read. Easy.
ABI_PushAllCallerSavedRegsAndAdjustStack();
if (addrReg != ABI_PARAM1)
MOV(32, R(ABI_PARAM1), R((X64Reg)addrReg));
if (info.displacement) {
ADD(32, R(ABI_PARAM1), Imm32(info.displacement));
}
switch (info.operandSize) {
case 4:
CALL(thunks.ProtectFunction((void *)&Memory::Read_U32, 1));
break;
}
ABI_PopAllCallerSavedRegsAndAdjustStack();
MOV(32, R(dataReg), R(EAX));
RET();
#endif
return trampoline;
}
// Extremely simplistic - just generate the requested trampoline. May reuse them in the future.
const u8 *TrampolineCache::GetWriteTrampoline(const InstructionInfo &info)
{
if (GetSpaceLeft() < 1024)
PanicAlert("Trampoline cache full");
X64Reg addrReg = (X64Reg)info.scaledReg;
X64Reg dataReg = (X64Reg)info.regOperandReg;
if (dataReg != EAX)
PanicAlert("Backpatch write - not through EAX");
const u8 *trampoline = GetCodePtr();
#ifdef _M_X64
// It's a write. Yay. Remember that we don't have to be super efficient since it's "just" a
// hardware access - we can take shortcuts.
//if (emAddress == 0xCC008000)
// PanicAlert("caught a fifo write");
CMP(32, R(addrReg), Imm32(0xCC008000));
FixupBranch skip_fast = J_CC(CC_NE, false);
MOV(32, R(ABI_PARAM1), R((X64Reg)dataReg));
CALL((void*)asm_routines.fifoDirectWrite32);
RET();
SetJumpTarget(skip_fast);
ABI_PushAllCallerSavedRegsAndAdjustStack();
if (addrReg != ABI_PARAM1) {
MOV(32, R(ABI_PARAM1), R((X64Reg)dataReg));
MOV(32, R(ABI_PARAM2), R((X64Reg)addrReg));
} else {
MOV(32, R(ABI_PARAM2), R((X64Reg)addrReg));
MOV(32, R(ABI_PARAM1), R((X64Reg)dataReg));
}
if (info.displacement) {
ADD(32, R(ABI_PARAM2), Imm32(info.displacement));
}
switch (info.operandSize) {
case 4:
CALL(thunks.ProtectFunction((void *)&Memory::Write_U32, 2));
break;
}
ABI_PopAllCallerSavedRegsAndAdjustStack();
RET();
#endif
return trampoline;
}
// This generates some fairly heavy trampolines, but:
// 1) It's really necessary. We don't know anything about the context.
// 2) It doesn't really hurt. Only instructions that access I/O will get these, and there won't be
// that many of them in a typical program/game.
u8 *Jit64::BackPatch(u8 *codePtr, int accessType, u32 emAddress, CONTEXT *ctx)
const u8 *Jit64::BackPatch(u8 *codePtr, int accessType, u32 emAddress, CONTEXT *ctx)
{
#ifdef _M_X64
if (!IsInJitCode(codePtr))
return 0; // this will become a regular crash real soon after this
u8 *oldCodePtr = GetWritableCodePtr();
InstructionInfo info;
if (!DisassembleMov(codePtr, info, accessType)) {
BackPatchError("BackPatch - failed to disassemble MOV instruction", codePtr, emAddress);
@ -81,108 +169,42 @@ u8 *Jit64::BackPatch(u8 *codePtr, int accessType, u32 emAddress, CONTEXT *ctx)
BackPatchError(StringFromFormat("BackPatch - no support for operand size %i", info.operandSize), codePtr, emAddress);
}
X64Reg addrReg = (X64Reg)info.scaledReg;
X64Reg dataReg = (X64Reg)info.regOperandReg;
if (info.otherReg != RBX)
PanicAlert("BackPatch : Base reg not RBX."
"\n\nAttempted to access %08x.", emAddress);
//if (accessType == OP_ACCESS_WRITE)
// PanicAlert("BackPatch : Currently only supporting reads."
// "\n\nAttempted to write to %08x.", emAddress);
// OK, let's write a trampoline, and a jump to it.
// Later, let's share trampolines.
if (accessType == OP_ACCESS_WRITE)
PanicAlert("BackPatch : Currently only supporting reads."
"\n\nAttempted to write to %08x.", emAddress);
// In the first iteration, we assume that all accesses are 32-bit. We also only deal with reads.
// Next step - support writes, special case FIFO writes. Also, support 32-bit mode.
u8 *trampoline = trampolineCodePtr;
SetCodePtr(trampolineCodePtr);
if (accessType == 0)
{
// It's a read. Easy.
ABI_PushAllCallerSavedRegsAndAdjustStack();
if (addrReg != ABI_PARAM1)
MOV(32, R(ABI_PARAM1), R((X64Reg)addrReg));
if (info.displacement) {
ADD(32, R(ABI_PARAM1), Imm32(info.displacement));
}
switch (info.operandSize) {
case 4:
CALL(ProtectFunction((void *)&Memory::Read_U32, 1));
break;
default:
BackPatchError(StringFromFormat("We don't handle the size %i yet in backpatch", info.operandSize), codePtr, emAddress);
break;
}
ABI_PopAllCallerSavedRegsAndAdjustStack();
MOV(32, R(dataReg), R(EAX));
RET();
trampolineCodePtr = GetWritableCodePtr();
SetCodePtr(codePtr);
XEmitter emitter(codePtr);
int bswapNopCount;
// Check the following BSWAP for REX byte
if ((GetCodePtr()[info.instructionSize] & 0xF0) == 0x40)
if ((codePtr[info.instructionSize] & 0xF0) == 0x40)
bswapNopCount = 3;
else
bswapNopCount = 2;
CALL(trampoline);
NOP((int)info.instructionSize + bswapNopCount - 5);
SetCodePtr(oldCodePtr);
const u8 *trampoline = trampolines.GetReadTrampoline(info);
emitter.CALL((void *)trampoline);
emitter.NOP((int)info.instructionSize + bswapNopCount - 5);
return codePtr;
}
else if (accessType == 1)
{
// It's a write. Yay. Remember that we don't have to be super efficient since it's "just" a
// hardware access - we can take shortcuts.
//if (emAddress == 0xCC008000)
// PanicAlert("caught a fifo write");
if (dataReg != EAX)
PanicAlert("Backpatch write - not through EAX");
CMP(32, R(addrReg), Imm32(0xCC008000));
FixupBranch skip_fast = J_CC(CC_NE, false);
MOV(32, R(ABI_PARAM1), R((X64Reg)dataReg));
CALL((void*)Asm::fifoDirectWrite32);
RET();
SetJumpTarget(skip_fast);
ABI_PushAllCallerSavedRegsAndAdjustStack();
if (addrReg != ABI_PARAM1) {
//INT3();
MOV(32, R(ABI_PARAM1), R((X64Reg)dataReg));
MOV(32, R(ABI_PARAM2), R((X64Reg)addrReg));
} else {
MOV(32, R(ABI_PARAM2), R((X64Reg)addrReg));
MOV(32, R(ABI_PARAM1), R((X64Reg)dataReg));
}
if (info.displacement) {
ADD(32, R(ABI_PARAM2), Imm32(info.displacement));
}
switch (info.operandSize) {
case 4:
CALL(ProtectFunction((void *)&Memory::Write_U32, 2));
break;
default:
BackPatchError(StringFromFormat("We don't handle the size %i yet in backpatch", info.operandSize), codePtr, emAddress);
break;
}
ABI_PopAllCallerSavedRegsAndAdjustStack();
RET();
trampolineCodePtr = GetWritableCodePtr();
// TODO: special case FIFO writes. Also, support 32-bit mode.
// Also, debug this so that it actually works correctly :P
XEmitter emitter(codePtr - 2);
// We know it's EAX so the BSWAP before will be two byte. Overwrite it.
SetCodePtr(codePtr - 2);
CALL(trampoline);
NOP((int)info.instructionSize - 3);
const u8 *trampoline = trampolines.GetWriteTrampoline(info);
emitter.CALL((void *)trampoline);
emitter.NOP((int)info.instructionSize - 3);
if (info.instructionSize < 3)
PanicAlert("instruction too small");
SetCodePtr(oldCodePtr);
// We entered here with a BSWAP-ed EAX. We'll have to swap it back.
ctx->Rax = Common::swap32(ctx->Rax);
return codePtr - 2;
}
return 0;
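
For reference, the NOP padding math above works out like this: a rel32 CALL is 5 bytes, the faulting MOV is info.instructionSize bytes, and the BSWAP that follows it is 2 bytes, or 3 when it carries a REX prefix (any byte with high nibble 0x4, which is what the 0xF0 mask test checks). A small standalone helper, not part of the source, that just restates the read-path arithmetic:

// Illustration only - mirrors NOP(info.instructionSize + bswapNopCount - 5).
static int ReadPatchPadding(int movSize, bool bswapHasRexPrefix)
{
    const int bswapSize = bswapHasRexPrefix ? 3 : 2;  // e.g. 0F C8 vs. 41 0F C8
    const int callSize  = 5;                          // E8 rel32
    return movSize + bswapSize - callSize;            // NOP bytes to emit
}
// Example: mov eax,[rbx+rcx] (3 bytes) + bswap eax (2 bytes) needs 0 NOP bytes;
// with a REX-prefixed bswap r8d (3 bytes) one NOP byte of padding is emitted.

The write path is the mirror image: the patch starts two bytes earlier, over the known 2-byte BSWAP of EAX, so the covered region is instructionSize + 2 bytes and the padding becomes instructionSize - 3; and because the BSWAP had already executed before the MOV faulted, RAX is swapped back in the context so the trampoline sees the unswapped value, as the comment notes.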

View File

@ -56,19 +56,15 @@ using namespace Gen;
op_agent_t agent;
#endif
static u8 *codeCache;
static u8 *genFunctions;
static u8 *trampolineCache;
u8 *trampolineCodePtr;
#define INVALID_EXIT 0xFFFFFFFF
enum
{
//CODE_SIZE = 1024*1024*8,
GEN_SIZE = 4096,
TRAMPOLINE_SIZE = 1024*1024,
//MAX_NUM_BLOCKS = 65536,
};
int CODE_SIZE = 1024*1024*16;
int MAX_NUM_BLOCKS = 65536*2;
static u8 **blockCodePointers;
@ -89,36 +85,22 @@ using namespace Gen;
void Jit64::InitCache()
{
if(Core::g_CoreStartupParameter.bJITUnlimitedCache)
if (Core::g_CoreStartupParameter.bJITUnlimitedCache)
{
CODE_SIZE = 1024*1024*8*8;
MAX_NUM_BLOCKS = 65536*8;
}
codeCache = (u8*)AllocateExecutableMemory(CODE_SIZE);
genFunctions = (u8*)AllocateExecutableMemory(GEN_SIZE);
trampolineCache = (u8*)AllocateExecutableMemory(TRAMPOLINE_SIZE);
trampolineCodePtr = trampolineCache;
#ifdef OPROFILE_REPORT
agent = op_open_agent();
#endif
blocks = new JitBlock[MAX_NUM_BLOCKS];
blockCodePointers = new u8*[MAX_NUM_BLOCKS];
ClearCache();
SetCodePtr(genFunctions);
Asm::Generate();
// Protect the generated functions
WriteProtectMemory(genFunctions, GEN_SIZE, true);
SetCodePtr(codeCache);
}
void Jit64::ShutdownCache()
{
UnWriteProtectMemory(genFunctions, GEN_SIZE, true);
FreeMemoryPages(codeCache, CODE_SIZE);
FreeMemoryPages(genFunctions, GEN_SIZE);
FreeMemoryPages(trampolineCache, TRAMPOLINE_SIZE);
delete [] blocks;
delete [] blockCodePointers;
blocks = 0;
@ -135,21 +117,23 @@ using namespace Gen;
{
Core::DisplayMessage("Cleared code cache.", 3000);
// Is destroying the blocks really necessary?
for (int i = 0; i < numBlocks; i++) {
for (int i = 0; i < numBlocks; i++)
{
DestroyBlock(i, false);
}
links_to.clear();
trampolineCodePtr = trampolineCache;
numBlocks = 0;
memset(blockCodePointers, 0, sizeof(u8*)*MAX_NUM_BLOCKS);
memset(codeCache, 0xCC, CODE_SIZE);
SetCodePtr(codeCache);
trampolines.ClearCodeSpace();
}
void Jit64::DestroyBlocksWithFlag(BlockFlag death_flag)
{
for (int i = 0; i < numBlocks; i++) {
if (blocks[i].flags & death_flag) {
for (int i = 0; i < numBlocks; i++)
{
if (blocks[i].flags & death_flag)
{
DestroyBlock(i, false);
}
}
@ -190,10 +174,10 @@ using namespace Gen;
const u8 *Jit64::Jit(u32 emAddress)
{
if (GetCodePtr() >= codeCache + CODE_SIZE - 0x10000 || numBlocks >= MAX_NUM_BLOCKS - 1)
if (GetSpaceLeft() < 0x10000 || numBlocks >= MAX_NUM_BLOCKS - 1)
{
LOG(DYNA_REC, "JIT cache full - clearing.")
if(Core::g_CoreStartupParameter.bJITUnlimitedCache)
if (Core::g_CoreStartupParameter.bJITUnlimitedCache)
{
PanicAlert("What? JIT cache still full - clearing.");
}
@ -221,10 +205,8 @@ using namespace Gen;
}
}
u8 *oldCodePtr = GetWritableCodePtr();
LinkBlock(numBlocks);
LinkBlockExits(numBlocks);
SetCodePtr(oldCodePtr);
}
#ifdef OPROFILE_REPORT
@ -257,7 +239,7 @@ using namespace Gen;
void Jit64::EnterFastRun()
{
CompiledCode pExecAddr = (CompiledCode)Asm::enterCode;
CompiledCode pExecAddr = (CompiledCode)asm_routines.enterCode;
pExecAddr();
//Will return when PowerPC::state changes
}
@ -336,8 +318,8 @@ using namespace Gen;
int destinationBlock = GetBlockNumberFromAddress(b.exitAddress[e]);
if (destinationBlock != -1)
{
SetCodePtr(b.exitPtrs[e]);
JMP(blocks[destinationBlock].checkedEntry, true);
XEmitter emit(b.exitPtrs[e]);
emit.JMP(blocks[destinationBlock].checkedEntry, true);
b.linkStatus[e] = true;
}
}
@ -345,6 +327,7 @@ using namespace Gen;
}
using namespace std;
void Jit64::LinkBlock(int i)
{
LinkBlockExits(i);
@ -386,15 +369,15 @@ using namespace Gen;
// Not entirely ideal, but .. pretty good.
// TODO - make sure that the below stuff really is safe.
u8 *prev_code = GetWritableCodePtr();
// Spurious entrances from previously linked blocks can only come through checkedEntry
SetCodePtr((u8*)b.checkedEntry);
MOV(32, M(&PC), Imm32(b.originalAddress));
JMP(Asm::dispatcher, true);
SetCodePtr(blockCodePointers[blocknum]);
MOV(32, M(&PC), Imm32(b.originalAddress));
JMP(Asm::dispatcher, true);
SetCodePtr(prev_code); // reset code pointer
XEmitter emit((u8*)b.checkedEntry);
emit.MOV(32, M(&PC), Imm32(b.originalAddress));
emit.JMP(asm_routines.dispatcher, true);
emit.SetCodePtr(blockCodePointers[blocknum]);
emit.MOV(32, M(&PC), Imm32(b.originalAddress));
emit.JMP(asm_routines.dispatcher, true);
}
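
The hunk above is the pattern this commit is really about: instead of saving the global code pointer, pointing it at the block to patch and restoring it afterwards, a short-lived XEmitter is constructed directly over the bytes to overwrite. A minimal sketch of the idiom (the function name is made up; the calls are the same ones used above):

// Redirect an already-compiled block back to the dispatcher, in place,
// without touching any global emitter state.
void RedirectToDispatcher(u8 *target, u32 originalAddress, const u8 *dispatcher)
{
    Gen::XEmitter emit(target);                     // emit over the old code
    emit.MOV(32, M(&PC), Imm32(originalAddress));   // restore the emulated PC
    emit.JMP(dispatcher, true);                     // and bail out to the dispatcher
}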

View File

@ -19,6 +19,6 @@
#include "../Gekko.h"
// Will soon introduced the JitBlockCache class here.
// Will soon introduce the JitBlockCache class here.
#endif

View File

@ -34,13 +34,12 @@ namespace JitCore
void Init()
{
jit.Init();
jit.InitCache();
Asm::compareEnabled = ::Core::g_CoreStartupParameter.bRunCompareClient;
asm_routines.compareEnabled = ::Core::g_CoreStartupParameter.bRunCompareClient;
}
void Shutdown()
{
jit.ShutdownCache();
jit.Shutdown();
}
void SingleStep()

View File

@ -27,8 +27,6 @@ using namespace Gen;
using namespace PowerPC;
GPRRegCache gpr;
FPURegCache fpr;
void RegCache::Start(PPCAnalyst::BlockRegStats &stats)
{
@ -267,7 +265,7 @@ using namespace PowerPC;
xregs[xr].dirty = makeDirty || regs[i].location.IsImm();
OpArg newloc = ::Gen::R(xr);
if (doLoad || regs[i].location.IsImm())
MOV(32, newloc, regs[i].location);
emit->MOV(32, newloc, regs[i].location);
for (int j = 0; j < 32; j++)
{
if (i != j && regs[j].location.IsSimpleReg() && regs[j].location.GetSimpleReg() == xr)
@ -309,7 +307,7 @@ using namespace PowerPC;
}
OpArg newLoc = GetDefaultLocation(i);
// if (doStore) //<-- Breaks JIT compilation
MOV(32, newLoc, regs[i].location);
emit->MOV(32, newLoc, regs[i].location);
regs[i].location = newLoc;
regs[i].away = false;
}
@ -327,11 +325,13 @@ using namespace PowerPC;
xregs[xr].free = false;
xregs[xr].dirty = makeDirty;
OpArg newloc = ::Gen::R(xr);
if (doLoad) {
if (!regs[i].location.IsImm() && (regs[i].location.offset & 0xF)) {
if (doLoad)
{
if (!regs[i].location.IsImm() && (regs[i].location.offset & 0xF))
{
PanicAlert("WARNING - misaligned fp register location %i", i);
}
MOVAPD(xr, regs[i].location);
emit->MOVAPD(xr, regs[i].location);
}
regs[i].location = newloc;
regs[i].away = true;
@ -352,7 +352,7 @@ using namespace PowerPC;
xregs[xr].dirty = false;
xregs[xr].ppcReg = -1;
OpArg newLoc = GetDefaultLocation(i);
MOVAPD(newLoc, xr);
emit->MOVAPD(newLoc, xr);
regs[i].location = newLoc;
regs[i].away = false;
}

View File

@ -72,10 +72,15 @@
void DiscardRegContentsIfCached(int preg);
virtual const int *GetAllocationOrder(int &count) = 0;
XEmitter *emit;
public:
virtual ~RegCache() {}
virtual void Start(PPCAnalyst::BlockRegStats &stats) = 0;
void SetEmitter(XEmitter *emitter) {emit = emitter;}
void FlushR(X64Reg reg);
void FlushR(X64Reg reg, X64Reg reg2) {FlushR(reg); FlushR(reg2);}
void FlushLockX(X64Reg reg) {
@ -142,8 +147,5 @@
OpArg GetDefaultLocation(int reg) const;
};
extern GPRRegCache gpr;
extern FPURegCache fpr;
#endif
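
The header change above is what lets the register caches write emit->MOV(...) instead of calling the old global emit functions: the owning JIT hands itself to each cache through SetEmitter(). A toy, self-contained illustration of that wiring (ToyEmitter and ToyRegCache are invented for the example; the real RegCache and XEmitter are of course much richer):

#include <cstdio>

struct ToyEmitter {
    void MOV32(const char *dst, const char *src) { std::printf("mov %s, %s\n", dst, src); }
};

struct ToyRegCache {
    ToyEmitter *emit = nullptr;
    void SetEmitter(ToyEmitter *e) { emit = e; }
    void Flush() { emit->MOV32("[ppcState.gpr+0]", "eax"); }  // spill through the emitter
};

int main() {
    ToyEmitter jit;          // in Dolphin this is the Jit itself (it is-a XEmitter)
    ToyRegCache gpr;
    gpr.SetEmitter(&jit);    // wire the cache to the JIT's emitter
    gpr.Flush();             // stores go via emit->, no globals involved
}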

View File

@ -33,39 +33,39 @@
const u64 GC_ALIGNED16(psAbsMask2[2]) = {0x7FFFFFFFFFFFFFFFULL, 0x7FFFFFFFFFFFFFFFULL};
const double GC_ALIGNED16(psOneOne2[2]) = {1.0, 1.0};
void Jit64::fp_tri_op(int d, int a, int b, bool reversible, bool dupe, void (*op)(Gen::X64Reg, Gen::OpArg))
void Jit64::fp_tri_op(int d, int a, int b, bool reversible, bool dupe, void (XEmitter::*op)(Gen::X64Reg, Gen::OpArg))
{
fpr.Lock(d, a, b);
if (d == a)
{
fpr.LoadToX64(d, true);
op(fpr.RX(d), fpr.R(b));
(this->*op)(fpr.RX(d), fpr.R(b));
}
else if (d == b && reversible)
{
fpr.LoadToX64(d, true);
op(fpr.RX(d), fpr.R(a));
(this->*op)(fpr.RX(d), fpr.R(a));
}
else if (a != d && b != d)
{
// Sources different from d, can use rather quick solution
fpr.LoadToX64(d, !dupe);
MOVSD(fpr.RX(d), fpr.R(a));
op(fpr.RX(d), fpr.R(b));
(this->*op)(fpr.RX(d), fpr.R(b));
}
else if (b != d)
{
fpr.LoadToX64(d, !dupe);
MOVSD(XMM0, fpr.R(b));
MOVSD(fpr.RX(d), fpr.R(a));
op(fpr.RX(d), Gen::R(XMM0));
(this->*op)(fpr.RX(d), Gen::R(XMM0));
}
else // Other combo, must use two temps :(
{
MOVSD(XMM0, fpr.R(a));
MOVSD(XMM1, fpr.R(b));
fpr.LoadToX64(d, !dupe);
op(XMM0, Gen::R(XMM1));
(this->*op)(XMM0, Gen::R(XMM1));
MOVSD(fpr.RX(d), Gen::R(XMM0));
}
if (dupe) {
@ -86,16 +86,16 @@
bool dupe = inst.OPCD == 59;
switch (inst.SUBOP5)
{
case 18: fp_tri_op(inst.FD, inst.FA, inst.FB, false, dupe, &DIVSD); break; //div
case 20: fp_tri_op(inst.FD, inst.FA, inst.FB, false, dupe, &SUBSD); break; //sub
case 21: fp_tri_op(inst.FD, inst.FA, inst.FB, true, dupe, &ADDSD); break; //add
case 18: fp_tri_op(inst.FD, inst.FA, inst.FB, false, dupe, &XEmitter::DIVSD); break; //div
case 20: fp_tri_op(inst.FD, inst.FA, inst.FB, false, dupe, &XEmitter::SUBSD); break; //sub
case 21: fp_tri_op(inst.FD, inst.FA, inst.FB, true, dupe, &XEmitter::ADDSD); break; //add
case 23: //sel
Default(inst);
break;
case 24: //res
Default(inst);
break;
case 25: fp_tri_op(inst.FD, inst.FA, inst.FC, true, dupe, &MULSD); break; //mul
case 25: fp_tri_op(inst.FD, inst.FA, inst.FC, true, dupe, &XEmitter::MULSD); break; //mul
default:
_assert_msg_(DYNA_REC, 0, "fp_arith_s WTF!!!");
}
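
The fp_tri_op change above (and the matching tri_op and regimmop changes further down) swaps plain function pointers for pointers to XEmitter member functions - the "member function pointers" mentioned in the commit message. Since Jit64 ultimately derives from XEmitter, the chosen callee can be invoked directly on this with (this->*op)(...). A standalone sketch of the same pattern with made-up types:

#include <cstdio>

struct Emitter {
    void ADDSD(int d, int s) { std::printf("addsd xmm%d, xmm%d\n", d, s); }
    void MULSD(int d, int s) { std::printf("mulsd xmm%d, xmm%d\n", d, s); }
};

struct Jit : public Emitter {
    // op is a pointer to a member of the base class; because Jit is-a Emitter,
    // it can be called straight on this.
    void tri_op(int d, int s, void (Emitter::*op)(int, int)) {
        (this->*op)(d, s);
    }
};

int main() {
    Jit jit;
    jit.tri_op(0, 1, &Emitter::ADDSD);  // prints "addsd xmm0, xmm1"
    jit.tri_op(2, 3, &Emitter::MULSD);  // prints "mulsd xmm2, xmm3"
}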

View File

@ -42,7 +42,7 @@
u32 And(u32 a, u32 b) {return a & b;}
u32 Xor(u32 a, u32 b) {return a ^ b;}
void Jit64::regimmop(int d, int a, bool binary, u32 value, Operation doop, void(*op)(int, const Gen::OpArg&, const Gen::OpArg&), bool Rc, bool carry)
void Jit64::regimmop(int d, int a, bool binary, u32 value, Operation doop, void (XEmitter::*op)(int, const Gen::OpArg&, const Gen::OpArg&), bool Rc, bool carry)
{
gpr.Lock(d, a);
if (a || binary || carry) // yeh nasty special case addic
@ -57,7 +57,7 @@
{
if (gpr.R(d).IsImm())
gpr.LoadToX64(d, false);
op(32, gpr.R(d), Imm32(value)); //m_GPR[d] = m_GPR[_inst.RA] + _inst.SIMM_16;
(this->*op)(32, gpr.R(d), Imm32(value)); //m_GPR[d] = m_GPR[_inst.RA] + _inst.SIMM_16;
if (carry)
GenerateCarry(EAX);
}
@ -66,7 +66,7 @@
{
gpr.LoadToX64(d, false);
MOV(32, gpr.R(d), gpr.R(a));
op(32, gpr.R(d), Imm32(value)); //m_GPR[d] = m_GPR[_inst.RA] + _inst.SIMM_16;
(this->*op)(32, gpr.R(d), Imm32(value)); //m_GPR[d] = m_GPR[_inst.RA] + _inst.SIMM_16;
if (carry)
GenerateCarry(EAX);
}
@ -84,7 +84,7 @@
{
// Todo - special case immediates.
MOV(32, R(EAX), gpr.R(d));
CALL((u8*)Asm::computeRc);
CALL((u8*)asm_routines.computeRc);
}
gpr.UnlockAll();
}
@ -109,22 +109,22 @@
MOV(32, gpr.R(d), gpr.R(a));
gpr.UnlockAll();
} else {
regimmop(d, a, false, (u32)(s32)inst.SIMM_16, Add, ADD); //addi
regimmop(d, a, false, (u32)(s32)inst.SIMM_16, Add, &XEmitter::ADD); //addi
}
break;
case 15: regimmop(d, a, false, (u32)inst.SIMM_16 << 16, Add, ADD); break; //addis
case 15: regimmop(d, a, false, (u32)inst.SIMM_16 << 16, Add, &XEmitter::ADD); break; //addis
case 24:
if (a == 0 && s == 0 && inst.UIMM == 0 && !inst.Rc) //check for nop
{NOP(); return;} //make the nop visible in the generated code. not much use but interesting if we see one.
regimmop(a, s, true, inst.UIMM, Or, OR);
regimmop(a, s, true, inst.UIMM, Or, &XEmitter::OR);
break; //ori
case 25: regimmop(a, s, true, inst.UIMM << 16, Or, OR, false); break;//oris
case 28: regimmop(a, s, true, inst.UIMM, And, AND, true); break;
case 29: regimmop(a, s, true, inst.UIMM << 16, And, AND, true); break;
case 26: regimmop(a, s, true, inst.UIMM, Xor, XOR, false); break; //xori
case 27: regimmop(a, s, true, inst.UIMM << 16, Xor, XOR, false); break; //xoris
case 12: //regimmop(d, a, false, (u32)(s32)inst.SIMM_16, Add, ADD, false, true); //addic
case 13: //regimmop(d, a, true, (u32)(s32)inst.SIMM_16, Add, ADD, true, true); //addic_rc
case 25: regimmop(a, s, true, inst.UIMM << 16, Or, &XEmitter::OR, false); break;//oris
case 28: regimmop(a, s, true, inst.UIMM, And, &XEmitter::AND, true); break;
case 29: regimmop(a, s, true, inst.UIMM << 16, And, &XEmitter::AND, true); break;
case 26: regimmop(a, s, true, inst.UIMM, Xor, &XEmitter::XOR, false); break; //xori
case 27: regimmop(a, s, true, inst.UIMM << 16, Xor, &XEmitter::XOR, false); break; //xoris
case 12: //regimmop(d, a, false, (u32)(s32)inst.SIMM_16, Add, XEmitter::ADD, false, true); //addic
case 13: //regimmop(d, a, true, (u32)(s32)inst.SIMM_16, Add, XEmitter::ADD, true, true); //addic_rc
default:
Default(inst);
break;
@ -295,7 +295,7 @@
if (inst.Rc)
{
MOV(32, R(EAX), gpr.R(a));
CALL((u8*)Asm::computeRc);
CALL((u8*)asm_routines.computeRc);
}
}
@ -328,7 +328,7 @@
if (inst.Rc)
{
MOV(32, R(EAX), gpr.R(a));
CALL((u8*)Asm::computeRc);
CALL((u8*)asm_routines.computeRc);
}
}
@ -353,7 +353,7 @@
if (inst.Rc) {
// result is already in eax
CALL((u8*)Asm::computeRc);
CALL((u8*)asm_routines.computeRc);
}
}
@ -374,7 +374,7 @@
MOVSX(32, 8, gpr.RX(a), R(AL)); // watch out for ah and friends
if (inst.Rc) {
MOV(32, R(EAX), gpr.R(a));
CALL((u8*)Asm::computeRc);
CALL((u8*)asm_routines.computeRc);
}
}
@ -394,7 +394,7 @@
MOVSX(32, 16, gpr.RX(a), gpr.R(s));
if (inst.Rc) {
MOV(32, R(EAX), gpr.R(a));
CALL((u8*)Asm::computeRc);
CALL((u8*)asm_routines.computeRc);
}
}
@ -474,7 +474,7 @@
if (inst.OE) PanicAlert("OE: subfx");
if (inst.Rc) {
// result is already in eax
CALL((u8*)Asm::computeRc);
CALL((u8*)asm_routines.computeRc);
}
}
@ -514,7 +514,7 @@
gpr.UnlockAll();
if (inst.Rc) {
MOV(32, R(EAX), gpr.R(d));
CALL((u8*)Asm::computeRc);
CALL((u8*)asm_routines.computeRc);
}
}
@ -544,7 +544,7 @@
MOV(32, R(EAX), R(EDX));
MOV(32, gpr.R(d), R(EDX));
// result is already in eax
CALL((u8*)Asm::computeRc);
CALL((u8*)asm_routines.computeRc);
} else {
MOV(32, gpr.R(d), R(EDX));
}
@ -570,7 +570,7 @@
gpr.UnlockAll();
gpr.UnlockAllX();
if (inst.Rc) {
CALL((u8*)Asm::computeRc);
CALL((u8*)asm_routines.computeRc);
}
}
@ -606,7 +606,7 @@
if (inst.Rc)
{
MOV(32, R(EAX), gpr.R(d));
CALL((u8*)Asm::computeRc);
CALL((u8*)asm_routines.computeRc);
}
gpr.UnlockAll();
}
@ -618,7 +618,7 @@
if (inst.Rc)
{
MOV(32, R(EAX), gpr.R(d));
CALL((u8*)Asm::computeRc);
CALL((u8*)asm_routines.computeRc);
}
gpr.UnlockAll();
}
@ -630,7 +630,7 @@
if (inst.Rc)
{
MOV(32, R(EAX), gpr.R(d));
CALL((u8*)Asm::computeRc);
CALL((u8*)asm_routines.computeRc);
}
gpr.UnlockAll();
}
@ -666,7 +666,7 @@
gpr.UnlockAllX();
if (inst.Rc)
{
CALL((u8*)Asm::computeRc);
CALL((u8*)asm_routines.computeRc);
}
}
@ -730,7 +730,7 @@
if (inst.Rc)
{
MOV(32, R(EAX), gpr.R(a));
CALL((u8*)Asm::computeRc);
CALL((u8*)asm_routines.computeRc);
}
}
@ -767,7 +767,7 @@
if (inst.Rc)
{
MOV(32, R(EAX), gpr.R(a));
CALL((u8*)Asm::computeRc);
CALL((u8*)asm_routines.computeRc);
}
}
@ -799,7 +799,7 @@
if (inst.Rc)
{
MOV(32, R(EAX), gpr.R(a));
CALL((u8*)Asm::computeRc);
CALL((u8*)asm_routines.computeRc);
}
}
@ -821,7 +821,7 @@
if (inst.Rc)
{
MOV(32, R(EAX), gpr.R(a));
CALL((u8*)Asm::computeRc);
CALL((u8*)asm_routines.computeRc);
}
}
@ -851,7 +851,7 @@
if (inst.Rc)
{
MOV(32, R(EAX), gpr.R(a));
CALL((u8*)Asm::computeRc);
CALL((u8*)asm_routines.computeRc);
}
}
@ -881,7 +881,7 @@
if (inst.Rc)
{
MOV(32, R(EAX), gpr.R(a));
CALL((u8*)Asm::computeRc);
CALL((u8*)asm_routines.computeRc);
}
}
@ -929,7 +929,7 @@
if (inst.Rc) {
MOV(32, R(EAX), gpr.R(a));
CALL((u8*)Asm::computeRc);
CALL((u8*)asm_routines.computeRc);
}
}
@ -975,7 +975,7 @@
if (inst.Rc) {
MOV(32, R(EAX), gpr.R(a));
CALL((u8*)Asm::computeRc);
CALL((u8*)asm_routines.computeRc);
}
}
@ -1006,7 +1006,7 @@
if (inst.Rc)
{
MOV(32, R(EAX), gpr.R(a));
CALL((u8*)Asm::computeRc);
CALL((u8*)asm_routines.computeRc);
// TODO: Check PPC manual too
}
}

View File

@ -144,7 +144,7 @@
fpr.Flush(FLUSH_ALL);
ABI_CallFunctionC((void *)&PowerPC::OnIdle, PowerPC::ppcState.gpr[a] + (s32)(s16)inst.SIMM_16);
MOV(32, M(&PowerPC::ppcState.pc), Imm32(js.compilerPC + 12));
JMP(Asm::testExceptions, true);
JMP(asm_routines.testExceptions, true);
js.compilerPC += 8;
return;
}
@ -287,14 +287,13 @@
gpr.SetImmediate32(a, addr);
gpr.FlushLockX(ABI_PARAM1);
MOV(32, R(ABI_PARAM1), gpr.R(s));
// INT3();
switch (accessSize)
{
// No need to protect these, they don't touch any state
// question - should we inline them instead? Pro: Lose a CALL Con: Code bloat
case 8: CALL((void *)Asm::fifoDirectWrite8); break;
case 16: CALL((void *)Asm::fifoDirectWrite16); break;
case 32: CALL((void *)Asm::fifoDirectWrite32); break;
case 8: CALL((void *)asm_routines.fifoDirectWrite8); break;
case 16: CALL((void *)asm_routines.fifoDirectWrite16); break;
case 32: CALL((void *)asm_routines.fifoDirectWrite32); break;
}
js.fifoBytesThisBlock += accessSize >> 3;
gpr.UnlockAllX();
@ -377,9 +376,9 @@
SetJumpTarget(unsafe_addr);
switch (accessSize)
{
case 32: ABI_CallFunctionRR(ProtectFunction((void *)&Memory::Write_U32, 2), ABI_PARAM1, ABI_PARAM2); break;
case 16: ABI_CallFunctionRR(ProtectFunction((void *)&Memory::Write_U16, 2), ABI_PARAM1, ABI_PARAM2); break;
case 8: ABI_CallFunctionRR(ProtectFunction((void *)&Memory::Write_U8, 2), ABI_PARAM1, ABI_PARAM2); break;
case 32: ABI_CallFunctionRR(thunks.ProtectFunction((void *)&Memory::Write_U32, 2), ABI_PARAM1, ABI_PARAM2); break;
case 16: ABI_CallFunctionRR(thunks.ProtectFunction((void *)&Memory::Write_U16, 2), ABI_PARAM1, ABI_PARAM2); break;
case 8: ABI_CallFunctionRR(thunks.ProtectFunction((void *)&Memory::Write_U8, 2), ABI_PARAM1, ABI_PARAM2); break;
}
SetJumpTarget(skip_call);
gpr.UnlockAll();
@ -402,7 +401,6 @@
//return _inst.RA ? (m_GPR[_inst.RA] + _inst.SIMM_16) : _inst.SIMM_16;
gpr.FlushLockX(ECX, EDX);
gpr.FlushLockX(ESI);
//INT3();
MOV(32, R(EAX), Imm32((u32)(s32)inst.SIMM_16));
if (inst.RA)
ADD(32, R(EAX), gpr.R(inst.RA));

View File

@ -242,7 +242,7 @@ void Jit64::stfs(UGeckoInstruction inst)
{
// Float directly to write gather pipe! Fun!
CVTSD2SS(XMM0, fpr.R(s));
CALL((void*)Asm::fifoDirectWriteFloat);
CALL((void*)asm_routines.fifoDirectWriteFloat);
// TODO
js.fifoBytesThisBlock += 4;
return;

View File

@ -161,7 +161,7 @@ void Jit64::psq_st(UGeckoInstruction inst)
#endif
FixupBranch skip_call = J();
SetJumpTarget(argh);
ABI_CallFunctionRR(ProtectFunction((void *)&Memory::Write_U32, 2), ABI_PARAM1, ABI_PARAM2);
ABI_CallFunctionRR(thunks.ProtectFunction((void *)&Memory::Write_U32, 2), ABI_PARAM1, ABI_PARAM2);
SetJumpTarget(skip_call);
gpr.UnlockAll();
gpr.UnlockAllX();
@ -184,7 +184,7 @@ void Jit64::psq_st(UGeckoInstruction inst)
// Writing to FIFO. Let's do fast method.
CVTPD2PS(XMM0, fpr.R(s));
PSHUFB(XMM0, M((void*)&pbswapShuffle2x4));
CALL((void*)Asm::fifoDirectWriteXmm64);
CALL((void*)asm_routines.fifoDirectWriteXmm64);
js.fifoBytesThisBlock += 8;
return;
}
@ -211,7 +211,7 @@ void Jit64::psq_st(UGeckoInstruction inst)
MOV(64, MComplex(RBX, ABI_PARAM2, SCALE_1, 0), R(ABI_PARAM1));
FixupBranch arg2 = J();
SetJumpTarget(argh);
CALL(ProtectFunction((void *)&WriteDual32, 0));
CALL(thunks.ProtectFunction((void *)&WriteDual32, 0));
#else
FixupBranch argh = J_CC(CC_NZ);
MOV(32, R(ABI_PARAM1), M(((char*)&temp64) + 4));
@ -224,10 +224,10 @@ void Jit64::psq_st(UGeckoInstruction inst)
FixupBranch arg2 = J();
SetJumpTarget(argh);
MOV(32, R(ABI_PARAM1), M(((char*)&temp64) + 4));
ABI_CallFunctionRR(ProtectFunction((void *)&Memory::Write_U32, 2), ABI_PARAM1, ABI_PARAM2);
ABI_CallFunctionRR(thunks.ProtectFunction((void *)&Memory::Write_U32, 2), ABI_PARAM1, ABI_PARAM2);
MOV(32, R(ABI_PARAM1), M(((char*)&temp64)));
ADD(32, R(ABI_PARAM2), Imm32(4));
ABI_CallFunctionRR(ProtectFunction((void *)&Memory::Write_U32, 2), ABI_PARAM1, ABI_PARAM2);
ABI_CallFunctionRR(thunks.ProtectFunction((void *)&Memory::Write_U32, 2), ABI_PARAM1, ABI_PARAM2);
#endif
SetJumpTarget(arg2);
gpr.UnlockAll();
@ -424,7 +424,6 @@ void Jit64::psq_l(UGeckoInstruction inst)
#endif
BSWAP(32, EAX);
MOV(32, M(&temp64), R(EAX));
//INT3();
fpr.LoadToX64(inst.RS, false, true);
X64Reg r = fpr.R(inst.RS).GetSimpleReg();
MOVD_xmm(XMM0, M(&temp64));

View File

@ -163,40 +163,40 @@
*/
//There's still a little bit more optimization that can be squeezed out of this
void Jit64::tri_op(int d, int a, int b, bool reversible, void (*op)(X64Reg, OpArg))
void Jit64::tri_op(int d, int a, int b, bool reversible, void (XEmitter::*op)(X64Reg, OpArg))
{
fpr.Lock(d, a, b);
if (d == a)
{
fpr.LoadToX64(d, true);
op(fpr.RX(d), fpr.R(b));
(this->*op)(fpr.RX(d), fpr.R(b));
}
else if (d == b && reversible)
{
fpr.LoadToX64(d, true);
op(fpr.RX(d), fpr.R(a));
(this->*op)(fpr.RX(d), fpr.R(a));
}
else if (a != d && b != d)
{
//sources different from d, can use rather quick solution
fpr.LoadToX64(d, false);
MOVAPD(fpr.RX(d), fpr.R(a));
op(fpr.RX(d), fpr.R(b));
(this->*op)(fpr.RX(d), fpr.R(b));
}
else if (b != d)
{
fpr.LoadToX64(d, false);
MOVAPD(XMM0, fpr.R(b));
MOVAPD(fpr.RX(d), fpr.R(a));
op(fpr.RX(d), Gen::R(XMM0));
(this->*op)(fpr.RX(d), Gen::R(XMM0));
}
else //Other combo, must use two temps :(
{
MOVAPD(XMM0, fpr.R(a));
MOVAPD(XMM1, fpr.R(b));
fpr.LoadToX64(d, false);
op(XMM0, Gen::R(XMM1));
(this->*op)(XMM0, Gen::R(XMM1));
MOVAPD(fpr.RX(d), Gen::R(XMM0));
}
ForceSinglePrecisionP(fpr.RX(d));
@ -213,16 +213,16 @@
}
switch (inst.SUBOP5)
{
case 18: tri_op(inst.FD, inst.FA, inst.FB, false, &DIVPD); break; //div
case 20: tri_op(inst.FD, inst.FA, inst.FB, false, &SUBPD); break; //sub
case 21: tri_op(inst.FD, inst.FA, inst.FB, true, &ADDPD); break; //add
case 18: tri_op(inst.FD, inst.FA, inst.FB, false, &XEmitter::DIVPD); break; //div
case 20: tri_op(inst.FD, inst.FA, inst.FB, false, &XEmitter::SUBPD); break; //sub
case 21: tri_op(inst.FD, inst.FA, inst.FB, true, &XEmitter::ADDPD); break; //add
case 23://sel
Default(inst);
break;
case 24://res
Default(inst);
break;
case 25: tri_op(inst.FD, inst.FA, inst.FC, true, &MULPD); break; //mul
case 25: tri_op(inst.FD, inst.FA, inst.FC, true, &XEmitter::MULPD); break; //mul
default:
_assert_msg_(DYNA_REC, 0, "ps_arith WTF!!!");
}

View File

@ -76,9 +76,9 @@ void Jit64::SafeLoadRegToEAX(X64Reg reg, int accessSize, s32 offset, bool signEx
FixupBranch argh = J_CC(CC_Z);
switch (accessSize)
{
case 32: ABI_CallFunctionR(ProtectFunction((void *)&Memory::Read_U32, 1), reg); break;
case 16: ABI_CallFunctionR(ProtectFunction((void *)&Memory::Read_U16, 1), reg); break;
case 8: ABI_CallFunctionR(ProtectFunction((void *)&Memory::Read_U8, 1), reg); break;
case 32: ABI_CallFunctionR(thunks.ProtectFunction((void *)&Memory::Read_U32, 1), reg); break;
case 16: ABI_CallFunctionR(thunks.ProtectFunction((void *)&Memory::Read_U16, 1), reg); break;
case 8: ABI_CallFunctionR(thunks.ProtectFunction((void *)&Memory::Read_U8, 1), reg); break;
}
if (signExtend && accessSize < 32) {
// Need to sign extend values coming from the Read_U* functions.
@ -114,7 +114,7 @@ void Jit64::SafeWriteRegToReg(X64Reg reg_value, X64Reg reg_addr, int accessSize,
UnsafeWriteRegToReg(reg_value, reg_addr, accessSize, 0);
FixupBranch skip_call = J();
SetJumpTarget(unsafe_addr);
ABI_CallFunctionRR(ProtectFunction((void *)&Memory::Write_U32, 2), ABI_PARAM1, ABI_PARAM2);
ABI_CallFunctionRR(thunks.ProtectFunction((void *)&Memory::Write_U32, 2), ABI_PARAM1, ABI_PARAM2);
SetJumpTarget(skip_call);
}

View File

@ -463,7 +463,7 @@ void BPWritten(int addr, int changes, int newval)
{
// the number of lines copied is determined by the y scale * source efb height
float yScale = bpmem.dispcopyyscale / 256.0f;
float xfbLines = bpmem.copyTexSrcWH.y + 1.0 * yScale;
float xfbLines = (bpmem.copyTexSrcWH.y + 1.0f) * yScale;

XFB_Write(Memory_GetPtr(bpmem.copyTexDest<<5), multirc, (bpmem.copyMipMapStrideChannels << 4), (int)xfbLines);
}
else

View File

@ -82,68 +82,68 @@ void NativeVertexFormat::Initialize(const PortableVertexDeclaration &_vtx_decl)
}
#ifdef USE_JIT
Gen::XEmitter emit(m_compiledCode);
// Alright, we have our vertex declaration. Compile some crazy code to set it quickly using GL.
u8 *old_code_ptr = GetWritableCodePtr();
SetCodePtr(m_compiledCode);
ABI_EmitPrologue(6);
emit.ABI_EmitPrologue(6);
CallCdeclFunction4_I(glVertexPointer, 3, GL_FLOAT, _vtx_decl.stride, 0);
emit.CallCdeclFunction4_I(glVertexPointer, 3, GL_FLOAT, _vtx_decl.stride, 0);
if (_vtx_decl.num_normals >= 1) {
CallCdeclFunction3_I(glNormalPointer, VarToGL(_vtx_decl.normal_gl_type), _vtx_decl.stride, _vtx_decl.normal_offset[0]);
emit.CallCdeclFunction3_I(glNormalPointer, VarToGL(_vtx_decl.normal_gl_type), _vtx_decl.stride, _vtx_decl.normal_offset[0]);
if (_vtx_decl.num_normals == 3) {
CallCdeclFunction6((void *)glVertexAttribPointer, SHADER_NORM1_ATTRIB, _vtx_decl.normal_gl_size, VarToGL(_vtx_decl.normal_gl_type), GL_TRUE, _vtx_decl.stride, _vtx_decl.normal_offset[1]);
CallCdeclFunction6((void *)glVertexAttribPointer, SHADER_NORM2_ATTRIB, _vtx_decl.normal_gl_size, VarToGL(_vtx_decl.normal_gl_type), GL_TRUE, _vtx_decl.stride, _vtx_decl.normal_offset[2]);
emit.CallCdeclFunction6((void *)glVertexAttribPointer, SHADER_NORM1_ATTRIB, _vtx_decl.normal_gl_size, VarToGL(_vtx_decl.normal_gl_type), GL_TRUE, _vtx_decl.stride, _vtx_decl.normal_offset[1]);
emit.CallCdeclFunction6((void *)glVertexAttribPointer, SHADER_NORM2_ATTRIB, _vtx_decl.normal_gl_size, VarToGL(_vtx_decl.normal_gl_type), GL_TRUE, _vtx_decl.stride, _vtx_decl.normal_offset[2]);
}
}
for (int i = 0; i < 2; i++) {
if (_vtx_decl.color_offset[i] != -1) {
if (i == 0)
CallCdeclFunction4_I(glColorPointer, 4, GL_UNSIGNED_BYTE, _vtx_decl.stride, _vtx_decl.color_offset[i]);
emit.CallCdeclFunction4_I(glColorPointer, 4, GL_UNSIGNED_BYTE, _vtx_decl.stride, _vtx_decl.color_offset[i]);
else
CallCdeclFunction4((void *)glSecondaryColorPointer, 4, GL_UNSIGNED_BYTE, _vtx_decl.stride, _vtx_decl.color_offset[i]);
emit.CallCdeclFunction4((void *)glSecondaryColorPointer, 4, GL_UNSIGNED_BYTE, _vtx_decl.stride, _vtx_decl.color_offset[i]);
}
}
for (int i = 0; i < 8; i++) {
if (_vtx_decl.texcoord_offset[i] != -1) {
for (int i = 0; i < 8; i++)
{
if (_vtx_decl.texcoord_offset[i] != -1)
{
int id = GL_TEXTURE0 + i;
#ifdef _M_X64
#ifdef _MSC_VER
MOV(32, R(RCX), Imm32(id));
emit.MOV(32, R(RCX), Imm32(id));
#else
MOV(32, R(RDI), Imm32(id));
emit.MOV(32, R(RDI), Imm32(id));
#endif
#else
ABI_AlignStack(1 * 4);
PUSH(32, Imm32(id));
emit.ABI_AlignStack(1 * 4);
emit.PUSH(32, Imm32(id));
#endif
CALL((void *)glClientActiveTexture);
emit.CALL((void *)glClientActiveTexture);
#ifndef _M_X64
#ifdef _WIN32
// don't inc stack on windows, stdcall
#else
ABI_RestoreStack(1 * 4);
emit.ABI_RestoreStack(1 * 4);
#endif
#endif
CallCdeclFunction4_I(
emit.CallCdeclFunction4_I(
glTexCoordPointer, _vtx_decl.texcoord_size[i], VarToGL(_vtx_decl.texcoord_gl_type[i]),
_vtx_decl.stride, _vtx_decl.texcoord_offset[i]);
}
}
if (_vtx_decl.posmtx_offset != -1) {
CallCdeclFunction6((void *)glVertexAttribPointer, SHADER_POSMTX_ATTRIB, 4, GL_UNSIGNED_BYTE, GL_FALSE, _vtx_decl.stride, _vtx_decl.posmtx_offset);
emit.CallCdeclFunction6((void *)glVertexAttribPointer, SHADER_POSMTX_ATTRIB, 4, GL_UNSIGNED_BYTE, GL_FALSE, _vtx_decl.stride, _vtx_decl.posmtx_offset);
}
ABI_EmitEpilogue(6);
if (Gen::GetCodePtr() - (u8*)m_compiledCode > COMPILED_CODE_SIZE)
emit.ABI_EmitEpilogue(6);
if (emit.GetCodePtr() - (u8*)m_compiledCode > COMPILED_CODE_SIZE)
{
Crash();
}
SetCodePtr(old_code_ptr);
#endif
this->vtx_decl = _vtx_decl;
}

View File

@ -44,7 +44,7 @@
#define USE_JIT
#define COMPILED_CODE_SIZE 4096*4
#define COMPILED_CODE_SIZE 4096
NativeVertexFormat *g_nativeVertexFmt;
@ -116,6 +116,7 @@ void LOADERDECL TexMtx_Write_Short3()
VertexLoader::VertexLoader(const TVtxDesc &vtx_desc, const VAT &vtx_attr)
{
m_compiledCode = NULL;
m_numLoadedVertices = 0;
m_VertexSize = 0;
m_numPipelineStages = 0;
@ -126,16 +127,14 @@ VertexLoader::VertexLoader(const TVtxDesc &vtx_desc, const VAT &vtx_attr)
m_VtxDesc = vtx_desc;
SetVAT(vtx_attr.g0.Hex, vtx_attr.g1.Hex, vtx_attr.g2.Hex);
m_compiledCode = (u8 *)AllocateExecutableMemory(COMPILED_CODE_SIZE, false);
if (m_compiledCode) {
memset(m_compiledCode, 0, COMPILED_CODE_SIZE);
}
AllocCodeSpace(COMPILED_CODE_SIZE);
CompileVertexTranslator();
WriteProtect();
}
VertexLoader::~VertexLoader()
{
FreeMemoryPages(m_compiledCode, COMPILED_CODE_SIZE);
FreeCodeSpace();
delete m_NativeFmt;
}
@ -143,13 +142,14 @@ void VertexLoader::CompileVertexTranslator()
{
m_VertexSize = 0;
const TVtxAttr &vtx_attr = m_VtxAttr;
//const TVtxDesc &vtx_desc = m_VtxDesc;
#ifdef USE_JIT
u8 *old_code_ptr = GetWritableCodePtr();
SetCodePtr(m_compiledCode);
if (m_compiledCode)
PanicAlert("trying to recompile a vtx translator");
m_compiledCode = GetCodePtr();
ABI_EmitPrologue(4);
// MOV(32, R(EBX), M(&loop_counter));
// Start loop here
const u8 *loop_start = GetCodePtr();
@ -477,7 +477,6 @@ void VertexLoader::CompileVertexTranslator()
//SUB(32, R(EBX), Imm8(1));
J_CC(CC_NZ, loop_start, true);
ABI_EmitEpilogue(4);
SetCodePtr(old_code_ptr);
#endif
m_NativeFmt->Initialize(vtx_decl);
}
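
The VertexLoader changes above follow the same recipe as the JIT itself: derive from Gen::XCodeBlock, allocate a private executable region, emit into it with the inherited XEmitter methods, then write-protect it. A rough sketch of that lifecycle using only calls visible in this diff (the class name, size and body are illustrative):

#include "x64Emitter.h"

class TinyGenerator : public Gen::XCodeBlock
{
public:
    TinyGenerator()  { AllocCodeSpace(4096); }   // own a private code region
    ~TinyGenerator() { FreeCodeSpace(); }

    const u8 *Compile()
    {
        const u8 *entry = GetCodePtr();          // emission starts at our own pointer
        ABI_EmitPrologue(4);
        // ... emit the body here ...
        ABI_EmitEpilogue(4);
        WriteProtect();                          // lock the finished code
        return entry;
    }
};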

View File

@ -22,9 +22,10 @@
#include "CPMemory.h"
#include "DataReader.h"
#include "NativeVertexFormat.h"
#include "x64Emitter.h"
class VertexLoaderUID
{
u32 vid[5];
@ -52,7 +53,7 @@ public:
}
};
class VertexLoader
class VertexLoader : public Gen::XCodeBlock
{
public:
VertexLoader(const TVtxDesc &vtx_desc, const VAT &vtx_attr);
@ -86,7 +87,7 @@ private:
TPipelineFunction m_PipelineStages[64]; // TODO - figure out real max. it's lower.
int m_numPipelineStages;
u8 *m_compiledCode;
const u8 *m_compiledCode;
int m_numLoadedVertices;