From 104acd5bc186f1623306af6f1c6ddb233fad0780 Mon Sep 17 00:00:00 2001 From: hrydgard Date: Fri, 19 Dec 2008 21:24:52 +0000 Subject: [PATCH] Turn the X86 emitter into a class, so the code pointer is no longer a global, yay! Created XCodeBlock that derives from XEmitter, and the Jit now derives from XCodeBlock so it can call all ADD SUB JNZ etc without having to prefix them with "emit.". I think someone's gonna like this. There's some cleanup still to be done, but hey, it works. There shouldn't be a noticable speed difference. I hope GCC doesn't have a problem with the "member function pointers" I used. git-svn-id: https://dolphin-emu.googlecode.com/svn/trunk@1594 8ced0084-cf51-0410-be5f-012b33b47a6e --- Source/Core/Common/Src/ABI.cpp | 57 +- Source/Core/Common/Src/ABI.h | 39 - Source/Core/Common/Src/MemoryUtil.cpp | 10 +- Source/Core/Common/Src/MemoryUtil.h | 12 +- Source/Core/Common/Src/Thunk.cpp | 50 +- Source/Core/Common/Src/Thunk.h | 24 +- Source/Core/Common/Src/x64Emitter.cpp | 789 +++++++++--------- Source/Core/Common/Src/x64Emitter.h | 755 ++++++++++------- Source/Core/Core/Src/HW/HW.cpp | 4 +- Source/Core/Core/Src/MemTools.cpp | 2 +- Source/Core/Core/Src/PowerPC/Jit64/Jit.cpp | 44 +- Source/Core/Core/Src/PowerPC/Jit64/Jit.h | 29 +- Source/Core/Core/Src/PowerPC/Jit64/JitAsm.cpp | 40 +- Source/Core/Core/Src/PowerPC/Jit64/JitAsm.h | 84 +- .../Core/Src/PowerPC/Jit64/JitBackpatch.cpp | 190 +++-- .../Core/Core/Src/PowerPC/Jit64/JitCache.cpp | 67 +- Source/Core/Core/Src/PowerPC/Jit64/JitCache.h | 2 +- .../Core/Core/Src/PowerPC/Jit64/JitCore.cpp | 5 +- .../Core/Src/PowerPC/Jit64/JitRegCache.cpp | 16 +- .../Core/Core/Src/PowerPC/Jit64/JitRegCache.h | 8 +- .../Src/PowerPC/Jit64/Jit_FloatingPoint.cpp | 20 +- .../Core/Src/PowerPC/Jit64/Jit_Integer.cpp | 72 +- .../Core/Src/PowerPC/Jit64/Jit_LoadStore.cpp | 16 +- .../PowerPC/Jit64/Jit_LoadStoreFloating.cpp | 2 +- .../Src/PowerPC/Jit64/Jit_LoadStorePaired.cpp | 11 +- .../Core/Src/PowerPC/Jit64/Jit_Paired.cpp | 20 +- .../Core/Core/Src/PowerPC/Jit64/Jit_Util.cpp | 8 +- .../Plugins/Plugin_VideoOGL/Src/BPStructs.cpp | 2 +- .../Src/NativeVertexFormat.cpp | 44 +- .../Plugin_VideoOGL/Src/VertexLoader.cpp | 21 +- .../Plugin_VideoOGL/Src/VertexLoader.h | 7 +- 31 files changed, 1297 insertions(+), 1153 deletions(-) diff --git a/Source/Core/Common/Src/ABI.cpp b/Source/Core/Common/Src/ABI.cpp index 2fc6cd81cd..941a1e9ff8 100644 --- a/Source/Core/Common/Src/ABI.cpp +++ b/Source/Core/Common/Src/ABI.cpp @@ -25,7 +25,7 @@ using namespace Gen; // ==================================== // Sets up a __cdecl function. 
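The pattern this change introduces is easiest to see in miniature: XEmitter owns the code write pointer as instance state, XCodeBlock adds ownership of an executable region, and a JIT class derives from XCodeBlock so every opcode helper can be called without an "emit." prefix. The sketch below is illustrative only; ExampleJit and the 4 KB region size are assumptions, while AllocCodeSpace, FreeCodeSpace, GetCodePtr, MOV, RET and the Gen types are the names used in this patch.

#include "x64Emitter.h"   // provides Gen::XEmitter and Gen::XCodeBlock after this change

using namespace Gen;

class ExampleJit : public XCodeBlock   // hypothetical class, for illustration only
{
public:
    void Init()     { AllocCodeSpace(4096); }   // the XCodeBlock owns the executable region
    void Shutdown() { FreeCodeSpace(); }

    const u8 *CompileStub()
    {
        const u8 *start = GetCodePtr();
        // Inherited from XEmitter, so no "emit." prefix is needed:
        MOV(32, R(EAX), Imm32(42));
        RET();
        return start;
    }
};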
-void ABI_EmitPrologue(int maxCallParams) +void XEmitter::ABI_EmitPrologue(int maxCallParams) { #ifdef _M_IX86 // Don't really need to do anything @@ -40,7 +40,8 @@ void ABI_EmitPrologue(int maxCallParams) #error Arch not supported #endif } -void ABI_EmitEpilogue(int maxCallParams) + +void XEmitter::ABI_EmitEpilogue(int maxCallParams) { #ifdef _M_IX86 RET(); @@ -60,14 +61,14 @@ void ABI_EmitEpilogue(int maxCallParams) // Shared code between Win32 and Unix32 // ==================================== -void ABI_CallFunctionC(void *func, u32 param1) { +void XEmitter::ABI_CallFunctionC(void *func, u32 param1) { ABI_AlignStack(1 * 4); PUSH(32, Imm32(param1)); CALL(func); ABI_RestoreStack(1 * 4); } -void ABI_CallFunctionCC(void *func, u32 param1, u32 param2) { +void XEmitter::ABI_CallFunctionCC(void *func, u32 param1, u32 param2) { ABI_AlignStack(2 * 4); PUSH(32, Imm32(param2)); PUSH(32, Imm32(param1)); @@ -76,14 +77,14 @@ void ABI_CallFunctionCC(void *func, u32 param1, u32 param2) { } // Pass a register as a paremeter. -void ABI_CallFunctionR(void *func, X64Reg reg1) { +void XEmitter::ABI_CallFunctionR(void *func, X64Reg reg1) { ABI_AlignStack(1 * 4); PUSH(32, R(reg1)); CALL(func); ABI_RestoreStack(1 * 4); } -void ABI_CallFunctionRR(void *func, Gen::X64Reg reg1, Gen::X64Reg reg2) +void XEmitter::ABI_CallFunctionRR(void *func, Gen::X64Reg reg1, Gen::X64Reg reg2) { ABI_AlignStack(2 * 4); PUSH(32, R(reg2)); @@ -92,7 +93,7 @@ void ABI_CallFunctionRR(void *func, Gen::X64Reg reg1, Gen::X64Reg reg2) ABI_RestoreStack(2 * 4); } -void ABI_CallFunctionAC(void *func, const Gen::OpArg &arg1, u32 param2) +void XEmitter::ABI_CallFunctionAC(void *func, const Gen::OpArg &arg1, u32 param2) { ABI_AlignStack(2 * 4); PUSH(32, arg1); @@ -101,7 +102,7 @@ void ABI_CallFunctionAC(void *func, const Gen::OpArg &arg1, u32 param2) ABI_RestoreStack(2 * 4); } -void ABI_PushAllCalleeSavedRegsAndAdjustStack() { +void XEmitter::ABI_PushAllCalleeSavedRegsAndAdjustStack() { // Note: 4 * 4 = 16 bytes, so alignment is preserved. PUSH(EBP); PUSH(EBX); @@ -109,14 +110,14 @@ void ABI_PushAllCalleeSavedRegsAndAdjustStack() { PUSH(EDI); } -void ABI_PopAllCalleeSavedRegsAndAdjustStack() { +void XEmitter::ABI_PopAllCalleeSavedRegsAndAdjustStack() { POP(EDI); POP(ESI); POP(EBX); POP(EBP); } -unsigned int ABI_GetAlignedFrameSize(unsigned int frameSize) { +unsigned int XEmitter::ABI_GetAlignedFrameSize(unsigned int frameSize) { frameSize += 4; // reserve space for return address unsigned int alignedSize = #ifdef __GNUC__ @@ -128,7 +129,7 @@ unsigned int ABI_GetAlignedFrameSize(unsigned int frameSize) { } -void ABI_AlignStack(unsigned int frameSize) { +void XEmitter::ABI_AlignStack(unsigned int frameSize) { // Mac OS X requires the stack to be 16-byte aligned before every call. 
// Linux requires the stack to be 16-byte aligned before calls that put SSE // vectors on the stack, but since we do not keep track of which calls do that, @@ -145,7 +146,7 @@ void ABI_AlignStack(unsigned int frameSize) { #endif } -void ABI_RestoreStack(unsigned int frameSize) { +void XEmitter::ABI_RestoreStack(unsigned int frameSize) { unsigned int alignedSize = ABI_GetAlignedFrameSize(frameSize); alignedSize -= 4; // return address is POPped at end of call if (alignedSize != 0) { @@ -155,26 +156,26 @@ void ABI_RestoreStack(unsigned int frameSize) { #else -void ABI_CallFunctionC(void *func, u32 param1) { +void XEmitter::ABI_CallFunctionC(void *func, u32 param1) { MOV(32, R(ABI_PARAM1), Imm32(param1)); CALL(func); } -void ABI_CallFunctionCC(void *func, u32 param1, u32 param2) { +void XEmitter::ABI_CallFunctionCC(void *func, u32 param1, u32 param2) { MOV(32, R(ABI_PARAM1), Imm32(param1)); MOV(32, R(ABI_PARAM2), Imm32(param2)); CALL(func); } // Pass a register as a paremeter. -void ABI_CallFunctionR(void *func, X64Reg reg1) { +void XEmitter::ABI_CallFunctionR(void *func, X64Reg reg1) { if (reg1 != ABI_PARAM1) MOV(32, R(ABI_PARAM1), R(reg1)); CALL(func); } // Pass a register as a paremeter. -void ABI_CallFunctionRR(void *func, X64Reg reg1, X64Reg reg2) { +void XEmitter::ABI_CallFunctionRR(void *func, X64Reg reg1, X64Reg reg2) { if (reg1 != ABI_PARAM1) MOV(32, R(ABI_PARAM1), R(reg1)); if (reg2 != ABI_PARAM2) @@ -182,7 +183,7 @@ void ABI_CallFunctionRR(void *func, X64Reg reg1, X64Reg reg2) { CALL(func); } -void ABI_CallFunctionAC(void *func, const Gen::OpArg &arg1, u32 param2) +void XEmitter::ABI_CallFunctionAC(void *func, const Gen::OpArg &arg1, u32 param2) { if (!arg1.IsSimpleReg(ABI_PARAM1)) MOV(32, R(ABI_PARAM1), arg1); @@ -190,21 +191,21 @@ void ABI_CallFunctionAC(void *func, const Gen::OpArg &arg1, u32 param2) CALL(func); } -unsigned int ABI_GetAlignedFrameSize(unsigned int frameSize) { +unsigned int XEmitter::ABI_GetAlignedFrameSize(unsigned int frameSize) { return frameSize; } -void ABI_AlignStack(unsigned int /*frameSize*/) { +void XEmitter::ABI_AlignStack(unsigned int /*frameSize*/) { } -void ABI_RestoreStack(unsigned int /*frameSize*/) { +void XEmitter::ABI_RestoreStack(unsigned int /*frameSize*/) { } #ifdef _WIN32 // Win64 Specific Code // ==================================== -void ABI_PushAllCalleeSavedRegsAndAdjustStack() { +void XEmitter::ABI_PushAllCalleeSavedRegsAndAdjustStack() { //we only want to do this once PUSH(RBX); PUSH(RSI); @@ -218,7 +219,7 @@ void ABI_PushAllCalleeSavedRegsAndAdjustStack() { SUB(64, R(RSP), Imm8(0x28)); } -void ABI_PopAllCalleeSavedRegsAndAdjustStack() { +void XEmitter::ABI_PopAllCalleeSavedRegsAndAdjustStack() { ADD(64, R(RSP), Imm8(0x28)); POP(R15); POP(R14); @@ -232,7 +233,7 @@ void ABI_PopAllCalleeSavedRegsAndAdjustStack() { // Win64 Specific Code // ==================================== -void ABI_PushAllCallerSavedRegsAndAdjustStack() { +void XEmitter::ABI_PushAllCallerSavedRegsAndAdjustStack() { PUSH(RCX); PUSH(RDX); PUSH(RSI); @@ -245,7 +246,7 @@ void ABI_PushAllCallerSavedRegsAndAdjustStack() { SUB(64, R(RSP), Imm8(0x28)); } -void ABI_PopAllCallerSavedRegsAndAdjustStack() { +void XEmitter::ABI_PopAllCallerSavedRegsAndAdjustStack() { ADD(64, R(RSP), Imm8(0x28)); POP(R11); POP(R10); @@ -260,7 +261,7 @@ void ABI_PopAllCallerSavedRegsAndAdjustStack() { #else // Unix64 Specific Code // ==================================== -void ABI_PushAllCalleeSavedRegsAndAdjustStack() { +void XEmitter::ABI_PushAllCalleeSavedRegsAndAdjustStack() { PUSH(RBX); 
PUSH(RBP); PUSH(R12); @@ -270,7 +271,7 @@ void ABI_PushAllCalleeSavedRegsAndAdjustStack() { PUSH(R15); //just to align stack. duped push/pop doesn't hurt. } -void ABI_PopAllCalleeSavedRegsAndAdjustStack() { +void XEmitter::ABI_PopAllCalleeSavedRegsAndAdjustStack() { POP(R15); POP(R15); POP(R14); @@ -280,7 +281,7 @@ void ABI_PopAllCalleeSavedRegsAndAdjustStack() { POP(RBX); } -void ABI_PushAllCallerSavedRegsAndAdjustStack() { +void XEmitter::ABI_PushAllCallerSavedRegsAndAdjustStack() { PUSH(RCX); PUSH(RDX); PUSH(RSI); @@ -292,7 +293,7 @@ void ABI_PushAllCallerSavedRegsAndAdjustStack() { PUSH(R11); } -void ABI_PopAllCallerSavedRegsAndAdjustStack() { +void XEmitter::ABI_PopAllCallerSavedRegsAndAdjustStack() { POP(R11); POP(R11); POP(R10); diff --git a/Source/Core/Common/Src/ABI.h b/Source/Core/Common/Src/ABI.h index e29c333d5b..177a1254e4 100644 --- a/Source/Core/Common/Src/ABI.h +++ b/Source/Core/Common/Src/ABI.h @@ -18,8 +18,6 @@ #ifndef _JIT_ABI_H #define _JIT_ABI_H -#include "x64Emitter.h" - // x86/x64 ABI:s, and helpers to help follow them when JIT-ing code. // All convensions return values in EAX (+ possibly EDX). @@ -81,42 +79,5 @@ #endif -// Utility functions -// These only support u32 parameters, but that's enough for a lot of uses. -// These will destroy the 1 or 2 first "parameter regs". -void ABI_CallFunctionC(void *func, u32 param1); -void ABI_CallFunctionCC(void *func, u32 param1, u32 param2); -void ABI_CallFunctionAC(void *func, const Gen::OpArg &arg1, u32 param2); - -// Pass a register as a paremeter. -void ABI_CallFunctionR(void *func, Gen::X64Reg reg1); -void ABI_CallFunctionRR(void *func, Gen::X64Reg reg1, Gen::X64Reg reg2); - -// A function that doesn't have any control over what it will do to regs, -// such as the dispatcher, should be surrounded by these. -void ABI_PushAllCalleeSavedRegsAndAdjustStack(); -void ABI_PopAllCalleeSavedRegsAndAdjustStack(); - -// A function that doesn't know anything about it's surroundings, should -// be surrounded by these to establish a safe environment, where it can roam free. -// An example is a backpatch injected function. -void ABI_PushAllCallerSavedRegsAndAdjustStack(); -void ABI_PopAllCallerSavedRegsAndAdjustStack(); - -unsigned int ABI_GetAlignedFrameSize(unsigned int frameSize); -void ABI_AlignStack(unsigned int frameSize); -void ABI_RestoreStack(unsigned int frameSize); - -// Sets up a __cdecl function. -// Only x64 really needs the parameter. -void ABI_EmitPrologue(int maxCallParams); -void ABI_EmitEpilogue(int maxCallParams); - -#ifdef _M_IX86 -inline int ABI_GetNumXMMRegs() { return 8; } -#else -inline int ABI_GetNumXMMRegs() { return 16; } -#endif - #endif // _JIT_ABI_H diff --git a/Source/Core/Common/Src/MemoryUtil.cpp b/Source/Core/Common/Src/MemoryUtil.cpp index 10cf448f06..7dc0d787e4 100644 --- a/Source/Core/Common/Src/MemoryUtil.cpp +++ b/Source/Core/Common/Src/MemoryUtil.cpp @@ -38,7 +38,7 @@ // This is purposedely not a full wrapper for virtualalloc/mmap, but it // provides exactly the primitive operations that Dolphin needs. 
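Since the ABI_* declarations are removed from ABI.h above and now live on XEmitter, generated code that calls back into C++ goes through a specific emitter instance instead of the old global code pointer. A minimal sketch of the new call style, assuming a hypothetical host function LogValue and helper EmitLogCall; ABI_CallFunctionC itself is the member shown earlier in this patch.

#include "x64Emitter.h"

void LogValue(u32 value);   // hypothetical C++ function the generated code will call

// Emit an ABI-correct call to LogValue(value) into whatever block 'emit' writes to.
static void EmitLogCall(Gen::XEmitter &emit, u32 value)
{
    emit.ABI_CallFunctionC((void *)&LogValue, value);   // a free function before this change
}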
-void* AllocateExecutableMemory(int size, bool low) +void* AllocateExecutableMemory(size_t size, bool low) { #ifdef _WIN32 void* ptr = VirtualAlloc(0, size, MEM_COMMIT, PAGE_EXECUTE_READWRITE); @@ -71,7 +71,7 @@ void* AllocateExecutableMemory(int size, bool low) } -void* AllocateMemoryPages(int size) +void* AllocateMemoryPages(size_t size) { #ifdef _WIN32 void* ptr = VirtualAlloc(0, size, MEM_COMMIT, PAGE_READWRITE); @@ -99,7 +99,7 @@ void* AllocateMemoryPages(int size) } -void FreeMemoryPages(void* ptr, int size) +void FreeMemoryPages(void* ptr, size_t size) { #ifdef _WIN32 if (ptr) @@ -113,7 +113,7 @@ void FreeMemoryPages(void* ptr, int size) } -void WriteProtectMemory(void* ptr, int size, bool allowExecute) +void WriteProtectMemory(void* ptr, size_t size, bool allowExecute) { #ifdef _WIN32 VirtualProtect(ptr, size, allowExecute ? PAGE_EXECUTE_READ : PAGE_READONLY, 0); @@ -123,7 +123,7 @@ void WriteProtectMemory(void* ptr, int size, bool allowExecute) } -void UnWriteProtectMemory(void* ptr, int size, bool allowExecute) +void UnWriteProtectMemory(void* ptr, size_t size, bool allowExecute) { #ifdef _WIN32 VirtualProtect(ptr, size, allowExecute ? PAGE_EXECUTE_READWRITE : PAGE_READONLY, 0); diff --git a/Source/Core/Common/Src/MemoryUtil.h b/Source/Core/Common/Src/MemoryUtil.h index a59fe2dc29..29bd911c9d 100644 --- a/Source/Core/Common/Src/MemoryUtil.h +++ b/Source/Core/Common/Src/MemoryUtil.h @@ -18,14 +18,14 @@ #ifndef _MEMORYUTIL_H #define _MEMORYUTIL_H -void* AllocateExecutableMemory(int size, bool low = true); -void* AllocateMemoryPages(int size); -void FreeMemoryPages(void* ptr, int size); -void WriteProtectMemory(void* ptr, int size, bool executable = false); -void UnWriteProtectMemory(void* ptr, int size, bool allowExecute); +void* AllocateExecutableMemory(size_t size, bool low = true); +void* AllocateMemoryPages(size_t size); +void FreeMemoryPages(void* ptr, size_t size); +void WriteProtectMemory(void* ptr, size_t size, bool executable = false); +void UnWriteProtectMemory(void* ptr, size_t size, bool allowExecute); -inline int GetPageSize() {return(4096);} +inline int GetPageSize() {return 4096;} #endif diff --git a/Source/Core/Common/Src/Thunk.cpp b/Source/Core/Common/Src/Thunk.cpp index a5ba2c9b8d..a37d88b24c 100644 --- a/Source/Core/Common/Src/Thunk.cpp +++ b/Source/Core/Common/Src/Thunk.cpp @@ -18,33 +18,29 @@ #include #include "Common.h" -#include "Thunk.h" #include "x64Emitter.h" #include "MemoryUtil.h" #include "ABI.h" +#include "Thunk.h" -using namespace Gen; +ThunkManager thunks; #define THUNK_ARENA_SIZE 1024*1024*1 -namespace { -static std::map thunks; -u8 GC_ALIGNED32(saved_fp_state[16 * 4 * 4]); -u8 GC_ALIGNED32(saved_gpr_state[16 * 8]); - -static u8 *thunk_memory; -static u8 *thunk_code; -static const u8 *save_regs; -static const u8 *load_regs; -static u16 saved_mxcsr; -} - -void Thunk_Init() +namespace { - thunk_memory = (u8 *)AllocateExecutableMemory(THUNK_ARENA_SIZE); - thunk_code = thunk_memory; - GenContext ctx(&thunk_code); +static u8 GC_ALIGNED32(saved_fp_state[16 * 4 * 4]); +static u8 GC_ALIGNED32(saved_gpr_state[16 * 8]); +static u16 saved_mxcsr; + +} // namespace + +using namespace Gen; + +void ThunkManager::Init() +{ + AllocCodeSpace(THUNK_ARENA_SIZE); save_regs = GetCodePtr(); for (int i = 2; i < ABI_GetNumXMMRegs(); i++) MOVAPS(M(saved_fp_state + i * 16), (X64Reg)(XMM0 + i)); @@ -89,31 +85,27 @@ void Thunk_Init() RET(); } -void Thunk_Reset() +void ThunkManager::Reset() { thunks.clear(); - thunk_code = thunk_memory; + ResetCodePtr(); } -void Thunk_Shutdown() 
+void ThunkManager::Shutdown() { - Thunk_Reset(); - FreeMemoryPages(thunk_memory, THUNK_ARENA_SIZE); - thunk_memory = 0; - thunk_code = 0; + Reset(); + FreeCodeSpace(); } -void *ProtectFunction(void *function, int num_params) +void *ThunkManager::ProtectFunction(void *function, int num_params) { std::map::iterator iter; iter = thunks.find(function); if (iter != thunks.end()) return (void *)iter->second; - - if (!thunk_memory) + if (!region) PanicAlert("Trying to protect functions before the emu is started. Bad bad bad."); - GenContext gen(&thunk_code); const u8 *call_point = GetCodePtr(); // Make sure to align stack. #ifdef _M_X64 diff --git a/Source/Core/Common/Src/Thunk.h b/Source/Core/Common/Src/Thunk.h index 5d438e15e8..05960d19eb 100644 --- a/Source/Core/Common/Src/Thunk.h +++ b/Source/Core/Common/Src/Thunk.h @@ -18,6 +18,11 @@ #ifndef _THUNK_H #define _THUNK_H +#include + +#include "Common.h" +#include "x64Emitter.h" + // This simple class creates a wrapper around a C/C++ function that saves all fp state // before entering it, and restores it upon exit. This is required to be able to selectively // call functions from generated code, without inflicting the performance hit and increase @@ -30,10 +35,21 @@ // NOT THREAD SAFE. This may only be used from the CPU thread. // Any other thread using this stuff will be FATAL. -void Thunk_Init(); -void Thunk_Reset(); -void Thunk_Shutdown(); +class ThunkManager : public Gen::XCodeBlock +{ + std::map thunks; -void *ProtectFunction(void *function, int num_params); + const u8 *save_regs; + const u8 *load_regs; + +public: + void Init(); + void Reset(); + void Shutdown(); + + void *ProtectFunction(void *function, int num_params); +}; + +extern ThunkManager thunks; #endif diff --git a/Source/Core/Common/Src/x64Emitter.cpp b/Source/Core/Common/Src/x64Emitter.cpp index dd6c35918f..a489d42b26 100644 --- a/Source/Core/Common/Src/x64Emitter.cpp +++ b/Source/Core/Common/Src/x64Emitter.cpp @@ -14,6 +14,7 @@ // Official SVN repository and contact information can be found at // http://code.google.com/p/dolphin-emu/ + #include "Common.h" #include "x64Emitter.h" #include "ABI.h" @@ -21,31 +22,86 @@ namespace Gen { - static u8 *code; - static bool enableBranchHints = false; - void SetCodePtr(u8 *ptr) +static bool enableBranchHints = false; + +// TODO(ector): Add EAX special casing, for ever so slightly smaller code. +struct NormalOpDef +{ + u8 toRm8, toRm32, fromRm8, fromRm32, imm8, imm32, simm8, ext; +}; + +static const NormalOpDef nops[11] = +{ + {0x00, 0x01, 0x02, 0x03, 0x80, 0x81, 0x83, 0}, //ADD + {0x10, 0x11, 0x12, 0x13, 0x80, 0x81, 0x83, 2}, //ADC + + {0x28, 0x29, 0x2A, 0x2B, 0x80, 0x81, 0x83, 5}, //SUB + {0x18, 0x19, 0x1A, 0x1B, 0x80, 0x81, 0x83, 3}, //SBB + + {0x20, 0x21, 0x22, 0x23, 0x80, 0x81, 0x83, 4}, //AND + {0x08, 0x09, 0x0A, 0x0B, 0x80, 0x81, 0x83, 1}, //OR + + {0x30, 0x31, 0x32, 0x33, 0x80, 0x81, 0x83, 6}, //XOR + {0x88, 0x89, 0x8A, 0x8B, 0xC6, 0xC7, 0xCC, 0}, //MOV + + {0x84, 0x85, 0x84, 0x85, 0xF6, 0xF7, 0xCC, 0}, //TEST (to == from) + {0x38, 0x39, 0x3A, 0x3B, 0x80, 0x81, 0x83, 7}, //CMP + + {0x86, 0x87, 0x86, 0x87, 0xCC, 0xCC, 0xCC, 7}, //XCHG +}; + +enum NormalSSEOps +{ + sseCMP = 0xC2, + sseADD = 0x58, //ADD + sseSUB = 0x5C, //SUB + sseAND = 0x54, //AND + sseANDN = 0x55, //ANDN + sseOR = 0x56, + sseXOR = 0x57, + sseMUL = 0x59, //MUL, + sseDIV = 0x5E, //DIV + sseMIN = 0x5D, //MIN + sseMAX = 0x5F, //MAX + sseCOMIS = 0x2F, //COMIS + sseUCOMIS = 0x2E, //UCOMIS + sseSQRT = 0x51, //SQRT + sseRSQRT = 0x52, //RSQRT (NO DOUBLE PRECISION!!!) 
+ sseMOVAPfromRM = 0x28, //MOVAP from RM + sseMOVAPtoRM = 0x29, //MOVAP to RM + sseMOVUPfromRM = 0x10, //MOVUP from RM + sseMOVUPtoRM = 0x11, //MOVUP to RM + sseMASKMOVDQU = 0xF7, + sseLDDQU = 0xF0, + sseSHUF = 0xC6, + sseMOVNTDQ = 0xE7, + sseMOVNTP = 0x2B, +}; + + + void XEmitter::SetCodePtr(u8 *ptr) { code = ptr; } - const u8 *GetCodePtr() + const u8 *XEmitter::GetCodePtr() const { return code; } - u8 *GetWritableCodePtr() + u8 *XEmitter::GetWritableCodePtr() { return code; } - void ReserveCodeSpace(int bytes) + void XEmitter::ReserveCodeSpace(int bytes) { for (int i = 0; i < bytes; i++) *code++ = 0xCC; } - const u8 *AlignCode4() + const u8 *XEmitter::AlignCode4() { int c = int((u64)code & 3); if (c) @@ -53,7 +109,7 @@ namespace Gen return code; } - const u8 *AlignCode16() + const u8 *XEmitter::AlignCode16() { int c = int((u64)code & 15); if (c) @@ -61,7 +117,7 @@ namespace Gen return code; } - const u8 *AlignCodePage() + const u8 *XEmitter::AlignCodePage() { int c = int((u64)code & 4095); if (c) @@ -69,40 +125,17 @@ namespace Gen return code; } - inline void Write8(u8 value) - { - *code++ = value; - } - - inline void Write16(u16 value) - { - *(u16*)code = value; - code += 2; - } - - inline void Write32(u32 value) - { - *(u32*)code = value; - code += 4; - } - - inline void Write64(u64 value) - { - *(u64*)code = value; - code += 8; - } - - void WriteModRM( int mod, int rm, int reg ) + void XEmitter::WriteModRM(int mod, int rm, int reg) { Write8((u8)((mod << 6) | ((rm & 7) << 3) | (reg & 7))); } - void WriteSIB(int scale, int index, int base) + void XEmitter::WriteSIB(int scale, int index, int base) { Write8((u8)((scale << 6) | ((index & 7) << 3) | (base & 7))); } - void OpArg::WriteRex(bool op64, int customOp) const + void OpArg::WriteRex(XEmitter *emit, bool op64, int customOp) const { #ifdef _M_X64 u8 op = 0x40; @@ -112,7 +145,7 @@ namespace Gen if (indexReg >> 3) op |= 2; if (offsetOrBaseReg >> 3) op |= 1; //TODO investigate if this is dangerous if (op != 0x40) - Write8(op); + emit->Write8(op); #else _dbg_assert_(DYNA_REC, (operandReg >> 3) == 0); _dbg_assert_(DYNA_REC, (indexReg >> 3) == 0); @@ -120,7 +153,7 @@ namespace Gen #endif } - void OpArg::WriteRest(int extraBytes, X64Reg _operandReg) const + void OpArg::WriteRest(XEmitter *emit, int extraBytes, X64Reg _operandReg) const { if (_operandReg == 0xff) _operandReg = (X64Reg)this->operandReg; @@ -133,14 +166,14 @@ namespace Gen { // Oh, RIP addressing. 
_offsetOrBaseReg = 5; - WriteModRM(0, _operandReg&7, 5); + emit->WriteModRM(0, _operandReg&7, 5); //TODO : add some checks #ifdef _M_X64 - u64 ripAddr = (u64)code + 4 + extraBytes; + u64 ripAddr = (u64)emit->GetCodePtr() + 4 + extraBytes; s32 offs = (s32)((s64)offset - (s64)ripAddr); - Write32((u32)offs); + emit->Write32((u32)offs); #else - Write32((u32)offset); + emit->Write32((u32)offset); #endif return; } @@ -213,7 +246,7 @@ namespace Gen //if (RIP) // oreg = 5; - WriteModRM(mod, _operandReg&7, oreg&7); + emit->WriteModRM(mod, _operandReg&7, oreg&7); if (SIB) { @@ -229,16 +262,16 @@ namespace Gen case SCALE_ATREG: ss = 0; break; default: _assert_msg_(DYNA_REC, 0, "Invalid scale for SIB byte"); ss = 0; break; } - Write8((u8)((ss << 6) | ((ireg&7)<<3) | (_offsetOrBaseReg&7))); + emit->Write8((u8)((ss << 6) | ((ireg&7)<<3) | (_offsetOrBaseReg&7))); } if (mod == 1) //8-bit disp { - Write8((u8)(s8)(s32)offset); + emit->Write8((u8)(s8)(s32)offset); } else if (mod == 2) //32-bit disp { - Write32((u32)offset); + emit->Write32((u32)offset); } } @@ -247,7 +280,7 @@ namespace Gen // R = register# upper bit // X = scale amnt upper bit // B = base register# upper bit - void Rex(int w, int r, int x, int b) + void XEmitter::Rex(int w, int r, int x, int b) { w = w ? 1 : 0; r = r ? 1 : 0; @@ -258,7 +291,7 @@ namespace Gen Write8(rx); } - void JMP(const u8 *addr, bool force5Bytes) + void XEmitter::JMP(const u8 *addr, bool force5Bytes) { u64 fn = (u64)addr; if (!force5Bytes) @@ -276,34 +309,34 @@ namespace Gen } } - void JMPptr(const OpArg &arg2) + void XEmitter::JMPptr(const OpArg &arg2) { OpArg arg = arg2; if (arg.IsImm()) _assert_msg_(DYNA_REC, 0, "JMPptr - Imm argument"); arg.operandReg = 4; - arg.WriteRex(false); + arg.WriteRex(this, false); Write8(0xFF); - arg.WriteRest(); + arg.WriteRest(this); } //Can be used to trap other processors, before overwriting their code // not used in dolphin - void JMPself() + void XEmitter::JMPself() { Write8(0xEB); Write8(0xFE); } - void CALLptr(OpArg arg) + void XEmitter::CALLptr(OpArg arg) { if (arg.IsImm()) _assert_msg_(DYNA_REC, 0, "CALLptr - Imm argument"); arg.operandReg = 2; - arg.WriteRex(false); + arg.WriteRex(this, false); Write8(0xFF); - arg.WriteRest(); + arg.WriteRest(this); } - void CALL(void *fnptr) + void XEmitter::CALL(const void *fnptr) { u64 distance = u64(fnptr) - (u64(code) + 5); if (distance >= 0x0000000080000000ULL @@ -314,7 +347,7 @@ namespace Gen Write32(u32(distance)); } - FixupBranch J(bool force5bytes) + FixupBranch XEmitter::J(bool force5bytes) { FixupBranch branch; branch.type = force5bytes ? 1 : 0; @@ -333,14 +366,7 @@ namespace Gen return branch; } - // These are to be used with Jcc only. - // Found in intel manual 2A - // These do not really make a difference for any current X86 CPU, - // but are provided here for future use - void HINT_NOT_TAKEN() { if (enableBranchHints) Write8(0x2E); } - void HINT_TAKEN() { if (enableBranchHints) Write8(0x3E); } - - FixupBranch J_CC(CCFlags conditionCode, bool force5bytes) + FixupBranch XEmitter::J_CC(CCFlags conditionCode, bool force5bytes) { FixupBranch branch; branch.type = force5bytes ? 
1 : 0; @@ -360,7 +386,7 @@ namespace Gen return branch; } - void J_CC(CCFlags conditionCode, const u8 * addr, bool force5Bytes) + void XEmitter::J_CC(CCFlags conditionCode, const u8 * addr, bool force5Bytes) { u64 fn = (u64)addr; if (!force5Bytes) @@ -379,7 +405,7 @@ namespace Gen } } - void SetJumpTarget(const FixupBranch &branch) + void XEmitter::SetJumpTarget(const FixupBranch &branch) { if (branch.type == 0) { @@ -392,33 +418,33 @@ namespace Gen } /* - void INC(int bits, OpArg arg) + void XEmitter::INC(int bits, OpArg arg) { if (arg.IsImm()) _assert_msg_(DYNA_REC, 0, "INC - Imm argument"); arg.operandReg = 0; if (bits == 16) {Write8(0x66);} - arg.WriteRex(bits == 64); + arg.WriteRex(this, bits == 64); Write8(bits == 8 ? 0xFE : 0xFF); - arg.WriteRest(); + arg.WriteRest(this); } - void DEC(int bits, OpArg arg) + void XEmitter::DEC(int bits, OpArg arg) { if (arg.IsImm()) _assert_msg_(DYNA_REC, 0, "DEC - Imm argument"); arg.operandReg = 1; if (bits == 16) {Write8(0x66);} - arg.WriteRex(bits == 64); + arg.WriteRex(this, bits == 64); Write8(bits == 8 ? 0xFE : 0xFF); - arg.WriteRest(); + arg.WriteRest(this); } */ //Single byte opcodes //There is no PUSHAD/POPAD in 64-bit mode. - void INT3() {Write8(0xCC);} - void RET() {Write8(0xC3);} - void RET_FAST() {Write8(0xF3); Write8(0xC3);} //two-byte return (rep ret) - recommended by AMD optimization manual for the case of jumping to a ret + void XEmitter::INT3() {Write8(0xCC);} + void XEmitter::RET() {Write8(0xC3);} + void XEmitter::RET_FAST() {Write8(0xF3); Write8(0xC3);} //two-byte return (rep ret) - recommended by AMD optimization manual for the case of jumping to a ret - void NOP(int count) + void XEmitter::NOP(int count) { // TODO: look up the fastest nop sleds for various sizes int i; @@ -438,13 +464,13 @@ namespace Gen } } - void PAUSE() {Write8(0xF3); NOP();} //use in tight spinloops for energy saving on some cpu - void CLC() {Write8(0xF8);} //clear carry - void CMC() {Write8(0xF5);} //flip carry - void STC() {Write8(0xF9);} //set carry + void XEmitter::PAUSE() {Write8(0xF3); NOP();} //use in tight spinloops for energy saving on some cpu + void XEmitter::CLC() {Write8(0xF8);} //clear carry + void XEmitter::CMC() {Write8(0xF5);} //flip carry + void XEmitter::STC() {Write8(0xF9);} //set carry //TODO: xchg ah, al ??? - void XCHG_AHAL() + void XEmitter::XCHG_AHAL() { Write8(0x86); Write8(0xe0); @@ -452,24 +478,24 @@ namespace Gen } //These two can not be executed on early Intel 64-bit CPU:s, only on AMD! 
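J_CC together with FixupBranch and SetJumpTarget is how the Jit emits a forward branch whose target is not known yet: the branch is written with a placeholder displacement and patched once the target address exists. A small sketch of that pattern with the member-function emitter; ExampleSkip is a hypothetical helper, and the condition name CC_NZ is assumed from the emitter's CCFlags enum, which is declared in the header rather than in these hunks.

#include "x64Emitter.h"

using namespace Gen;

// Emit: if (EAX != 0) skip the MOV; the branch displacement is back-patched afterwards.
static void ExampleSkip(XEmitter &emit)
{
    emit.TEST(32, R(EAX), R(EAX));
    FixupBranch skip = emit.J_CC(CC_NZ, false);   // short form, 8-bit displacement placeholder
    emit.MOV(32, R(EAX), Imm32(1));               // only runs when EAX was zero
    emit.SetJumpTarget(skip);                     // patch the branch to land here
    emit.RET();
}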
- void LAHF() {Write8(0x9F);} - void SAHF() {Write8(0x9E);} + void XEmitter::LAHF() {Write8(0x9F);} + void XEmitter::SAHF() {Write8(0x9E);} - void PUSHF() {Write8(0x9C);} - void POPF() {Write8(0x9D);} + void XEmitter::PUSHF() {Write8(0x9C);} + void XEmitter::POPF() {Write8(0x9D);} - void LFENCE() {Write8(0x0F); Write8(0xAE); Write8(0xE8);} - void MFENCE() {Write8(0x0F); Write8(0xAE); Write8(0xF0);} - void SFENCE() {Write8(0x0F); Write8(0xAE); Write8(0xF8);} + void XEmitter::LFENCE() {Write8(0x0F); Write8(0xAE); Write8(0xE8);} + void XEmitter::MFENCE() {Write8(0x0F); Write8(0xAE); Write8(0xF0);} + void XEmitter::SFENCE() {Write8(0x0F); Write8(0xAE); Write8(0xF8);} - void WriteSimple1Byte(int bits, u8 byte, X64Reg reg) + void XEmitter::WriteSimple1Byte(int bits, u8 byte, X64Reg reg) { if (bits == 16) {Write8(0x66);} Rex(bits == 64, 0, 0, (int)reg >> 3); Write8(byte + ((int)reg & 7)); } - void WriteSimple2Byte(int bits, u8 byte1, u8 byte2, X64Reg reg) + void XEmitter::WriteSimple2Byte(int bits, u8 byte1, u8 byte2, X64Reg reg) { if (bits == 16) {Write8(0x66);} Rex(bits==64, 0, 0, (int)reg >> 3); @@ -477,14 +503,14 @@ namespace Gen Write8(byte2 + ((int)reg & 7)); } - void CWD(int bits) + void XEmitter::CWD(int bits) { if (bits == 16) {Write8(0x66);} Rex(bits == 64, 0, 0, 0); Write8(0x99); } - void CBW(int bits) + void XEmitter::CBW(int bits) { if (bits == 8) {Write8(0x66);} Rex(bits == 32, 0, 0, 0); @@ -495,10 +521,10 @@ namespace Gen //push/pop do not need wide to be 64-bit - void PUSH(X64Reg reg) {WriteSimple1Byte(32, 0x50, reg);} - void POP(X64Reg reg) {WriteSimple1Byte(32, 0x58, reg);} + void XEmitter::PUSH(X64Reg reg) {WriteSimple1Byte(32, 0x50, reg);} + void XEmitter::POP(X64Reg reg) {WriteSimple1Byte(32, 0x58, reg);} - void PUSH(int bits, const OpArg ®) + void XEmitter::PUSH(int bits, const OpArg ®) { if (reg.IsSimpleReg()) PUSH(reg.GetSimpleReg()); @@ -526,16 +552,15 @@ namespace Gen } else { - //INT3(); if (bits == 16) Write8(0x66); - reg.WriteRex(bits == 64); + reg.WriteRex(this, bits == 64); Write8(0xFF); - reg.WriteRest(0,(X64Reg)6); + reg.WriteRest(this, 0, (X64Reg)6); } } - void POP(int /*bits*/, const OpArg ®) + void XEmitter::POP(int /*bits*/, const OpArg ®) { if (reg.IsSimpleReg()) POP(reg.GetSimpleReg()); @@ -543,7 +568,7 @@ namespace Gen INT3(); } - void BSWAP(int bits, X64Reg reg) + void XEmitter::BSWAP(int bits, X64Reg reg) { if (bits >= 32) { @@ -567,48 +592,48 @@ namespace Gen // Undefined opcode - reserved // If we ever need a way to always cause a non-breakpoint hard exception... 
- void UD2() + void XEmitter::UD2() { Write8(0x0F); Write8(0x0B); } - void PREFETCH(PrefetchLevel level, OpArg arg) + void XEmitter::PREFETCH(PrefetchLevel level, OpArg arg) { if (arg.IsImm()) _assert_msg_(DYNA_REC, 0, "PREFETCH - Imm argument");; arg.operandReg = (u8)level; - arg.WriteRex(false); + arg.WriteRex(this, false); Write8(0x0F); Write8(0x18); - arg.WriteRest(); + arg.WriteRest(this); } - void SETcc(CCFlags flag, OpArg dest) + void XEmitter::SETcc(CCFlags flag, OpArg dest) { if (dest.IsImm()) _assert_msg_(DYNA_REC, 0, "SETcc - Imm argument"); dest.operandReg = 0; - dest.WriteRex(false); + dest.WriteRex(this, false); Write8(0x0F); Write8(0x90 + (u8)flag); - dest.WriteRest(); + dest.WriteRest(this); } - void CMOVcc(int bits, X64Reg dest, OpArg src, CCFlags flag) + void XEmitter::CMOVcc(int bits, X64Reg dest, OpArg src, CCFlags flag) { if (src.IsImm()) _assert_msg_(DYNA_REC, 0, "CMOVcc - Imm argument"); src.operandReg = dest; - src.WriteRex(bits == 64); + src.WriteRex(this, bits == 64); Write8(0x0F); Write8(0x40 + (u8)flag); - src.WriteRest(); + src.WriteRest(this); } - void WriteMulDivType(int bits, OpArg src, int ext) + void XEmitter::WriteMulDivType(int bits, OpArg src, int ext) { if (src.IsImm()) _assert_msg_(DYNA_REC, 0, "WriteMulDivType - Imm argument"); src.operandReg = ext; if (bits == 16) Write8(0x66); - src.WriteRex(bits == 64); + src.WriteRex(this, bits == 64); if (bits == 8) { Write8(0xF6); @@ -617,37 +642,37 @@ namespace Gen { Write8(0xF7); } - src.WriteRest(); + src.WriteRest(this); } - void MUL(int bits, OpArg src) {WriteMulDivType(bits, src, 4);} - void DIV(int bits, OpArg src) {WriteMulDivType(bits, src, 6);} - void IMUL(int bits, OpArg src) {WriteMulDivType(bits, src, 5);} - void IDIV(int bits, OpArg src) {WriteMulDivType(bits, src, 7);} - void NEG(int bits, OpArg src) {WriteMulDivType(bits, src, 3);} - void NOT(int bits, OpArg src) {WriteMulDivType(bits, src, 2);} + void XEmitter::MUL(int bits, OpArg src) {WriteMulDivType(bits, src, 4);} + void XEmitter::DIV(int bits, OpArg src) {WriteMulDivType(bits, src, 6);} + void XEmitter::IMUL(int bits, OpArg src) {WriteMulDivType(bits, src, 5);} + void XEmitter::IDIV(int bits, OpArg src) {WriteMulDivType(bits, src, 7);} + void XEmitter::NEG(int bits, OpArg src) {WriteMulDivType(bits, src, 3);} + void XEmitter::NOT(int bits, OpArg src) {WriteMulDivType(bits, src, 2);} - void WriteBitSearchType(int bits, X64Reg dest, OpArg src, u8 byte2) + void XEmitter::WriteBitSearchType(int bits, X64Reg dest, OpArg src, u8 byte2) { if (src.IsImm()) _assert_msg_(DYNA_REC, 0, "WriteBitSearchType - Imm argument"); src.operandReg = (u8)dest; if (bits == 16) Write8(0x66); - src.WriteRex(bits == 64); + src.WriteRex(this, bits == 64); Write8(0x0F); Write8(byte2); - src.WriteRest(); + src.WriteRest(this); } - void MOVNTI(int bits, OpArg dest, X64Reg src) + void XEmitter::MOVNTI(int bits, OpArg dest, X64Reg src) { if (bits <= 16) _assert_msg_(DYNA_REC, 0, "MOVNTI - bits<=16"); WriteBitSearchType(bits, src, dest, 0xC3); } - void BSF(int bits, X64Reg dest, OpArg src) {WriteBitSearchType(bits,dest,src,0xBC);} //bottom bit to top bit - void BSR(int bits, X64Reg dest, OpArg src) {WriteBitSearchType(bits,dest,src,0xBD);} //top bit to bottom bit + void XEmitter::BSF(int bits, X64Reg dest, OpArg src) {WriteBitSearchType(bits,dest,src,0xBC);} //bottom bit to top bit + void XEmitter::BSR(int bits, X64Reg dest, OpArg src) {WriteBitSearchType(bits,dest,src,0xBD);} //top bit to bottom bit - void MOVSX(int dbits, int sbits, X64Reg dest, OpArg src) + void 
XEmitter::MOVSX(int dbits, int sbits, X64Reg dest, OpArg src) { if (src.IsImm()) _assert_msg_(DYNA_REC, 0, "MOVSX - Imm argument"); if (dbits == sbits) { @@ -656,7 +681,7 @@ namespace Gen } src.operandReg = (u8)dest; if (dbits == 16) Write8(0x66); - src.WriteRex(dbits == 64); + src.WriteRex(this, dbits == 64); if (sbits == 8) { Write8(0x0F); @@ -675,10 +700,10 @@ namespace Gen { Crash(); } - src.WriteRest(); + src.WriteRest(this); } - void MOVZX(int dbits, int sbits, X64Reg dest, OpArg src) + void XEmitter::MOVZX(int dbits, int sbits, X64Reg dest, OpArg src) { if (src.IsImm()) _assert_msg_(DYNA_REC, 0, "MOVZX - Imm argument"); if (dbits == sbits) { @@ -687,7 +712,7 @@ namespace Gen } src.operandReg = (u8)dest; if (dbits == 16) Write8(0x66); - src.WriteRex(dbits == 64); + src.WriteRex(this, dbits == 64); if (sbits == 8) { Write8(0x0F); @@ -702,22 +727,22 @@ namespace Gen { Crash(); } - src.WriteRest(); + src.WriteRest(this); } - void LEA(int bits, X64Reg dest, OpArg src) + void XEmitter::LEA(int bits, X64Reg dest, OpArg src) { if (src.IsImm()) _assert_msg_(DYNA_REC, 0, "LEA - Imm argument"); src.operandReg = (u8)dest; if (bits == 16) Write8(0x66); //TODO: performance warning - src.WriteRex(bits == 64); + src.WriteRex(this, bits == 64); Write8(0x8D); - src.WriteRest(); + src.WriteRest(this); } //shift can be either imm8 or cl - void WriteShift(int bits, OpArg dest, OpArg &shift, int ext) + void XEmitter::WriteShift(int bits, OpArg dest, OpArg &shift, int ext) { bool writeImm = false; if (dest.IsImm()) @@ -730,7 +755,7 @@ namespace Gen } dest.operandReg = ext; if (bits == 16) Write8(0x66); - dest.WriteRex(bits == 64); + dest.WriteRex(this, bits == 64); if (shift.GetImmBits() == 8) { //ok an imm @@ -749,60 +774,34 @@ namespace Gen { Write8(bits == 8 ? 0xD2 : 0xD3); } - dest.WriteRest(writeImm ? 1 : 0); + dest.WriteRest(this, writeImm ? 
1 : 0); if (writeImm) Write8((u8)shift.offset); } // large rotates and shift are slower on intel than amd // intel likes to rotate by 1, and the op is smaller too - void ROL(int bits, OpArg dest, OpArg shift) {WriteShift(bits, dest, shift, 0);} - void ROR(int bits, OpArg dest, OpArg shift) {WriteShift(bits, dest, shift, 1);} - void RCL(int bits, OpArg dest, OpArg shift) {WriteShift(bits, dest, shift, 2);} - void RCR(int bits, OpArg dest, OpArg shift) {WriteShift(bits, dest, shift, 3);} - void SHL(int bits, OpArg dest, OpArg shift) {WriteShift(bits, dest, shift, 4);} - void SHR(int bits, OpArg dest, OpArg shift) {WriteShift(bits, dest, shift, 5);} - void SAR(int bits, OpArg dest, OpArg shift) {WriteShift(bits, dest, shift, 7);} + void XEmitter::ROL(int bits, OpArg dest, OpArg shift) {WriteShift(bits, dest, shift, 0);} + void XEmitter::ROR(int bits, OpArg dest, OpArg shift) {WriteShift(bits, dest, shift, 1);} + void XEmitter::RCL(int bits, OpArg dest, OpArg shift) {WriteShift(bits, dest, shift, 2);} + void XEmitter::RCR(int bits, OpArg dest, OpArg shift) {WriteShift(bits, dest, shift, 3);} + void XEmitter::SHL(int bits, OpArg dest, OpArg shift) {WriteShift(bits, dest, shift, 4);} + void XEmitter::SHR(int bits, OpArg dest, OpArg shift) {WriteShift(bits, dest, shift, 5);} + void XEmitter::SAR(int bits, OpArg dest, OpArg shift) {WriteShift(bits, dest, shift, 7);} - void OpArg::WriteSingleByteOp(u8 op, X64Reg _operandReg, int bits) + void OpArg::WriteSingleByteOp(XEmitter *emit, u8 op, X64Reg _operandReg, int bits) { if (bits == 16) - Write8(0x66); + emit->Write8(0x66); this->operandReg = (u8)_operandReg; - WriteRex(bits == 64); - Write8(op); - WriteRest(); + WriteRex(emit, bits == 64); + emit->Write8(op); + WriteRest(emit); } - //todo : add eax special casing - struct NormalOpDef - { - u8 toRm8, toRm32, fromRm8, fromRm32, imm8, imm32, simm8, ext; - }; - - const NormalOpDef nops[11] = - { - {0x00, 0x01, 0x02, 0x03, 0x80, 0x81, 0x83, 0}, //ADD - {0x10, 0x11, 0x12, 0x13, 0x80, 0x81, 0x83, 2}, //ADC - - {0x28, 0x29, 0x2A, 0x2B, 0x80, 0x81, 0x83, 5}, //SUB - {0x18, 0x19, 0x1A, 0x1B, 0x80, 0x81, 0x83, 3}, //SBB - - {0x20, 0x21, 0x22, 0x23, 0x80, 0x81, 0x83, 4}, //AND - {0x08, 0x09, 0x0A, 0x0B, 0x80, 0x81, 0x83, 1}, //OR - - {0x30, 0x31, 0x32, 0x33, 0x80, 0x81, 0x83, 6}, //XOR - {0x88, 0x89, 0x8A, 0x8B, 0xC6, 0xC7, 0xCC, 0}, //MOV - - {0x84, 0x85, 0x84, 0x85, 0xF6, 0xF7, 0xCC, 0}, //TEST (to == from) - {0x38, 0x39, 0x3A, 0x3B, 0x80, 0x81, 0x83, 7}, //CMP - - {0x86, 0x87, 0x86, 0x87, 0xCC, 0xCC, 0xCC, 7}, //XCHG - }; - //operand can either be immediate or register - void OpArg::WriteNormalOp(bool toRM, NormalOp op, const OpArg &operand, int bits) const + void OpArg::WriteNormalOp(XEmitter *emit, bool toRM, NormalOp op, const OpArg &operand, int bits) const { X64Reg _operandReg = (X64Reg)this->operandReg; if (IsImm()) @@ -811,14 +810,14 @@ namespace Gen } if (bits == 16) - Write8(0x66); + emit->Write8(0x66); int immToWrite = 0; if (operand.IsImm()) { _operandReg = (X64Reg)0; - WriteRex(bits == 64); + WriteRex(emit, bits == 64); if (!toRM) { @@ -827,32 +826,29 @@ namespace Gen if (operand.scale == SCALE_IMM8 && bits == 8) { - Write8(nops[op].imm8); - _assert_msg_(DYNA_REC, code[-1] != 0xCC, "ARGH1"); + emit->Write8(nops[op].imm8); immToWrite = 8; } else if ((operand.scale == SCALE_IMM16 && bits == 16) || (operand.scale == SCALE_IMM32 && bits == 32) || (operand.scale == SCALE_IMM32 && bits == 64)) { - Write8(nops[op].imm32); - _assert_msg_(DYNA_REC, code[-1] != 0xCC, "ARGH2"); + 
emit->Write8(nops[op].imm32); immToWrite = 32; } else if ((operand.scale == SCALE_IMM8 && bits == 16) || (operand.scale == SCALE_IMM8 && bits == 32) || (operand.scale == SCALE_IMM8 && bits == 64)) { - Write8(nops[op].simm8); - _assert_msg_(DYNA_REC, code[-1] != 0xCC, "ARGH3"); + emit->Write8(nops[op].simm8); immToWrite = 8; } else if (operand.scale == SCALE_IMM64 && bits == 64) { if (op == nrmMOV) { - Write8(0xB8 + (offsetOrBaseReg & 7)); - Write64((u64)operand.offset); + emit->Write8(0xB8 + (offsetOrBaseReg & 7)); + emit->Write64((u64)operand.offset); return; } _assert_msg_(DYNA_REC, 0, "WriteNormalOp - Only MOV can take 64-bit imm"); @@ -866,36 +862,36 @@ namespace Gen else { _operandReg = (X64Reg)operand.offsetOrBaseReg; - WriteRex(bits == 64, _operandReg); + WriteRex(emit, bits == 64, _operandReg); // mem/reg or reg/reg op if (toRM) { - Write8(bits == 8 ? nops[op].toRm8 : nops[op].toRm32); - _assert_msg_(DYNA_REC, code[-1] != 0xCC, "ARGH4"); + emit->Write8(bits == 8 ? nops[op].toRm8 : nops[op].toRm32); + // _assert_msg_(DYNA_REC, code[-1] != 0xCC, "ARGH4"); } else { - Write8(bits == 8 ? nops[op].fromRm8 : nops[op].fromRm32); - _assert_msg_(DYNA_REC, code[-1] != 0xCC, "ARGH5"); + emit->Write8(bits == 8 ? nops[op].fromRm8 : nops[op].fromRm32); + // _assert_msg_(DYNA_REC, code[-1] != 0xCC, "ARGH5"); } } - WriteRest(immToWrite>>3, _operandReg); + WriteRest(emit, immToWrite>>3, _operandReg); switch (immToWrite) { case 0: break; case 8: - Write8((u8)operand.offset); + emit->Write8((u8)operand.offset); break; case 32: - Write32((u32)operand.offset); + emit->Write32((u32)operand.offset); break; default: _assert_msg_(DYNA_REC, 0, "WriteNormalOp - Unhandled case"); } } - void WriteNormalOp(int bits, NormalOp op, const OpArg &a1, const OpArg &a2) + void XEmitter::WriteNormalOp(XEmitter *emit, int bits, NormalOp op, const OpArg &a1, const OpArg &a2) { if (a1.IsImm()) { @@ -905,39 +901,39 @@ namespace Gen } if (a2.IsImm()) { - a1.WriteNormalOp(true, op, a2, bits); + a1.WriteNormalOp(emit, true, op, a2, bits); } else { if (a1.IsSimpleReg()) { - a2.WriteNormalOp(false, op, a1, bits); + a2.WriteNormalOp(emit, false, op, a1, bits); } else { - a1.WriteNormalOp(true, op, a2, bits); + a1.WriteNormalOp(emit, true, op, a2, bits); } } } - void ADD (int bits, const OpArg &a1, const OpArg &a2) {WriteNormalOp(bits, nrmADD, a1, a2);} - void ADC (int bits, const OpArg &a1, const OpArg &a2) {WriteNormalOp(bits, nrmADC, a1, a2);} - void SUB (int bits, const OpArg &a1, const OpArg &a2) {WriteNormalOp(bits, nrmSUB, a1, a2);} - void SBB (int bits, const OpArg &a1, const OpArg &a2) {WriteNormalOp(bits, nrmSBB, a1, a2);} - void AND (int bits, const OpArg &a1, const OpArg &a2) {WriteNormalOp(bits, nrmAND, a1, a2);} - void OR (int bits, const OpArg &a1, const OpArg &a2) {WriteNormalOp(bits, nrmOR , a1, a2);} - void XOR (int bits, const OpArg &a1, const OpArg &a2) {WriteNormalOp(bits, nrmXOR, a1, a2);} - void MOV (int bits, const OpArg &a1, const OpArg &a2) + void XEmitter::ADD (int bits, const OpArg &a1, const OpArg &a2) {WriteNormalOp(this, bits, nrmADD, a1, a2);} + void XEmitter::ADC (int bits, const OpArg &a1, const OpArg &a2) {WriteNormalOp(this, bits, nrmADC, a1, a2);} + void XEmitter::SUB (int bits, const OpArg &a1, const OpArg &a2) {WriteNormalOp(this, bits, nrmSUB, a1, a2);} + void XEmitter::SBB (int bits, const OpArg &a1, const OpArg &a2) {WriteNormalOp(this, bits, nrmSBB, a1, a2);} + void XEmitter::AND (int bits, const OpArg &a1, const OpArg &a2) {WriteNormalOp(this, bits, nrmAND, a1, a2);} + void XEmitter::OR 
(int bits, const OpArg &a1, const OpArg &a2) {WriteNormalOp(this, bits, nrmOR , a1, a2);} + void XEmitter::XOR (int bits, const OpArg &a1, const OpArg &a2) {WriteNormalOp(this, bits, nrmXOR, a1, a2);} + void XEmitter::MOV (int bits, const OpArg &a1, const OpArg &a2) { _assert_msg_(DYNA_REC, !a1.IsSimpleReg() || !a2.IsSimpleReg() || a1.GetSimpleReg() != a2.GetSimpleReg(), "Redundant MOV @ %p", code); - WriteNormalOp(bits, nrmMOV, a1, a2); + WriteNormalOp(this, bits, nrmMOV, a1, a2); } - void TEST(int bits, const OpArg &a1, const OpArg &a2) {WriteNormalOp(bits, nrmTEST, a1, a2);} - void CMP (int bits, const OpArg &a1, const OpArg &a2) {WriteNormalOp(bits, nrmCMP, a1, a2);} - void XCHG(int bits, const OpArg &a1, const OpArg &a2) {WriteNormalOp(bits, nrmXCHG, a1, a2);} + void XEmitter::TEST(int bits, const OpArg &a1, const OpArg &a2) {WriteNormalOp(this, bits, nrmTEST, a1, a2);} + void XEmitter::CMP (int bits, const OpArg &a1, const OpArg &a2) {WriteNormalOp(this, bits, nrmCMP, a1, a2);} + void XEmitter::XCHG(int bits, const OpArg &a1, const OpArg &a2) {WriteNormalOp(this, bits, nrmXCHG, a1, a2);} - void IMUL(int bits, X64Reg regOp, OpArg a1, OpArg a2) + void XEmitter::IMUL(int bits, X64Reg regOp, OpArg a1, OpArg a2) { if (bits == 8) { _assert_msg_(DYNA_REC, 0, "IMUL - illegal bit size!"); @@ -955,20 +951,20 @@ namespace Gen if (bits == 16) Write8(0x66); - a1.WriteRex(bits == 64, regOp); + a1.WriteRex(this, bits == 64, regOp); if (a2.GetImmBits() == 8) { Write8(0x6B); - a1.WriteRest(1, regOp); + a1.WriteRest(this, 1, regOp); Write8((u8)a2.offset); } else { Write8(0x69); if (a2.GetImmBits() == 16 && bits == 16) { - a1.WriteRest(2, regOp); + a1.WriteRest(this, 2, regOp); Write16((u16)a2.offset); } else if (a2.GetImmBits() == 32 && (bits == 32 || bits == 64)) { - a1.WriteRest(4, regOp); + a1.WriteRest(this, 4, regOp); Write32((u32)a2.offset); } else { _assert_msg_(DYNA_REC, 0, "IMUL - unhandled case!"); @@ -976,7 +972,7 @@ namespace Gen } } - void IMUL(int bits, X64Reg regOp, OpArg a) + void XEmitter::IMUL(int bits, X64Reg regOp, OpArg a) { if (bits == 8) { _assert_msg_(DYNA_REC, 0, "IMUL - illegal bit size!"); @@ -990,209 +986,178 @@ namespace Gen if (bits == 16) Write8(0x66); - a.WriteRex(bits == 64, regOp); + a.WriteRex(this, bits == 64, regOp); Write8(0x0F); Write8(0xAF); - a.WriteRest(0, regOp); + a.WriteRest(this, 0, regOp); } - void WriteSSEOp(int size, u8 sseOp, bool packed, X64Reg regOp, OpArg arg, int extrabytes = 0) + void XEmitter::WriteSSEOp(int size, u8 sseOp, bool packed, X64Reg regOp, OpArg arg, int extrabytes) { if (size == 64 && packed) Write8(0x66); //this time, override goes upwards if (!packed) Write8(size == 64 ? 
0xF2 : 0xF3); arg.operandReg = regOp; - arg.WriteRex(false); + arg.WriteRex(this, false); Write8(0x0F); Write8(sseOp); - arg.WriteRest(extrabytes); + arg.WriteRest(this, extrabytes); } - void MOVD_xmm(X64Reg dest, const OpArg &arg) {WriteSSEOp(64, 0x6E, true, dest, arg, 0);} + void XEmitter::MOVD_xmm(X64Reg dest, const OpArg &arg) {WriteSSEOp(64, 0x6E, true, dest, arg, 0);} + void XEmitter::MOVD_xmm(const OpArg &arg, X64Reg src) {WriteSSEOp(64, 0x7E, true, src, arg, 0);} - void MOVQ_xmm(X64Reg dest, OpArg arg) { + void XEmitter::MOVQ_xmm(X64Reg dest, OpArg arg) { #ifdef _M_X64 // Alternate encoding // This does not display correctly in MSVC's debugger, it thinks it's a MOVD arg.operandReg = dest; Write8(0x66); - arg.WriteRex(true); + arg.WriteRex(this, true); Write8(0x0f); Write8(0x6E); - arg.WriteRest(0); + arg.WriteRest(this, 0); #else arg.operandReg = dest; Write8(0xF3); Write8(0x0f); Write8(0x7E); - arg.WriteRest(0); + arg.WriteRest(this, 0); #endif } - void MOVD_xmm(const OpArg &arg, X64Reg src) {WriteSSEOp(64, 0x7E, true, src, arg, 0);} - void MOVQ_xmm(OpArg arg, X64Reg src) { + void XEmitter::MOVQ_xmm(OpArg arg, X64Reg src) { if (src > 7) { // Alternate encoding // This does not display correctly in MSVC's debugger, it thinks it's a MOVD arg.operandReg = src; Write8(0x66); - arg.WriteRex(true); + arg.WriteRex(this, true); Write8(0x0f); Write8(0x7E); - arg.WriteRest(0); + arg.WriteRest(this, 0); } else { - // INT3(); arg.operandReg = src; - arg.WriteRex(false); + arg.WriteRex(this, false); Write8(0x66); Write8(0x0f); Write8(0xD6); - arg.WriteRest(0); + arg.WriteRest(this, 0); } } - - - void WriteMXCSR(OpArg arg, int ext) + void XEmitter::WriteMXCSR(OpArg arg, int ext) { if (arg.IsImm() || arg.IsSimpleReg()) _assert_msg_(DYNA_REC, 0, "MXCSR - invalid operand"); arg.operandReg = ext; - arg.WriteRex(false); + arg.WriteRex(this, false); Write8(0x0F); Write8(0xAE); - arg.WriteRest(); + arg.WriteRest(this); } - enum NormalSSEOps - { - sseCMP = 0xC2, - sseADD = 0x58, //ADD - sseSUB = 0x5C, //SUB - sseAND = 0x54, //AND - sseANDN = 0x55, //ANDN - sseOR = 0x56, - sseXOR = 0x57, - sseMUL = 0x59, //MUL, - sseDIV = 0x5E, //DIV - sseMIN = 0x5D, //MIN - sseMAX = 0x5F, //MAX - sseCOMIS = 0x2F, //COMIS - sseUCOMIS = 0x2E, //UCOMIS - sseSQRT = 0x51, //SQRT - sseRSQRT = 0x52, //RSQRT (NO DOUBLE PRECISION!!!) 
- sseMOVAPfromRM = 0x28, //MOVAP from RM - sseMOVAPtoRM = 0x29, //MOVAP to RM - sseMOVUPfromRM = 0x10, //MOVUP from RM - sseMOVUPtoRM = 0x11, //MOVUP to RM - sseMASKMOVDQU = 0xF7, - sseLDDQU = 0xF0, - sseSHUF = 0xC6, - sseMOVNTDQ = 0xE7, - sseMOVNTP = 0x2B, - }; + void XEmitter::STMXCSR(OpArg memloc) {WriteMXCSR(memloc, 3);} + void XEmitter::LDMXCSR(OpArg memloc) {WriteMXCSR(memloc, 2);} - void STMXCSR(OpArg memloc) {WriteMXCSR(memloc, 3);} - void LDMXCSR(OpArg memloc) {WriteMXCSR(memloc, 2);} + void XEmitter::MOVNTDQ(OpArg arg, X64Reg regOp) {WriteSSEOp(64, sseMOVNTDQ, true, regOp, arg);} + void XEmitter::MOVNTPS(OpArg arg, X64Reg regOp) {WriteSSEOp(32, sseMOVNTP, true, regOp, arg);} + void XEmitter::MOVNTPD(OpArg arg, X64Reg regOp) {WriteSSEOp(64, sseMOVNTP, true, regOp, arg);} - void MOVNTDQ(OpArg arg, X64Reg regOp) {WriteSSEOp(64, sseMOVNTDQ, true, regOp, arg);} - void MOVNTPS(OpArg arg, X64Reg regOp) {WriteSSEOp(32, sseMOVNTP, true, regOp, arg);} - void MOVNTPD(OpArg arg, X64Reg regOp) {WriteSSEOp(64, sseMOVNTP, true, regOp, arg);} + void XEmitter::ADDSS(X64Reg regOp, OpArg arg) {WriteSSEOp(32, sseADD, false, regOp, arg);} + void XEmitter::ADDSD(X64Reg regOp, OpArg arg) {WriteSSEOp(64, sseADD, false, regOp, arg);} + void XEmitter::SUBSS(X64Reg regOp, OpArg arg) {WriteSSEOp(32, sseSUB, false, regOp, arg);} + void XEmitter::SUBSD(X64Reg regOp, OpArg arg) {WriteSSEOp(64, sseSUB, false, regOp, arg);} + void XEmitter::CMPSS(X64Reg regOp, OpArg arg, u8 compare) {WriteSSEOp(32, sseCMP, false, regOp, arg,1); Write8(compare);} + void XEmitter::CMPSD(X64Reg regOp, OpArg arg, u8 compare) {WriteSSEOp(64, sseCMP, false, regOp, arg,1); Write8(compare);} + void XEmitter::MULSS(X64Reg regOp, OpArg arg) {WriteSSEOp(32, sseMUL, false, regOp, arg);} + void XEmitter::MULSD(X64Reg regOp, OpArg arg) {WriteSSEOp(64, sseMUL, false, regOp, arg);} + void XEmitter::DIVSS(X64Reg regOp, OpArg arg) {WriteSSEOp(32, sseDIV, false, regOp, arg);} + void XEmitter::DIVSD(X64Reg regOp, OpArg arg) {WriteSSEOp(64, sseDIV, false, regOp, arg);} + void XEmitter::MINSS(X64Reg regOp, OpArg arg) {WriteSSEOp(32, sseMIN, false, regOp, arg);} + void XEmitter::MINSD(X64Reg regOp, OpArg arg) {WriteSSEOp(64, sseMIN, false, regOp, arg);} + void XEmitter::MAXSS(X64Reg regOp, OpArg arg) {WriteSSEOp(32, sseMAX, false, regOp, arg);} + void XEmitter::MAXSD(X64Reg regOp, OpArg arg) {WriteSSEOp(64, sseMAX, false, regOp, arg);} + void XEmitter::SQRTSS(X64Reg regOp, OpArg arg) {WriteSSEOp(32, sseSQRT, false, regOp, arg);} + void XEmitter::SQRTSD(X64Reg regOp, OpArg arg) {WriteSSEOp(64, sseSQRT, false, regOp, arg);} + void XEmitter::RSQRTSS(X64Reg regOp, OpArg arg) {WriteSSEOp(32, sseRSQRT, false, regOp, arg);} + void XEmitter::RSQRTSD(X64Reg regOp, OpArg arg) {WriteSSEOp(64, sseRSQRT, false, regOp, arg);} - void ADDSS(X64Reg regOp, OpArg arg) {WriteSSEOp(32, sseADD, false, regOp, arg);} - void ADDSD(X64Reg regOp, OpArg arg) {WriteSSEOp(64, sseADD, false, regOp, arg);} - void SUBSS(X64Reg regOp, OpArg arg) {WriteSSEOp(32, sseSUB, false, regOp, arg);} - void SUBSD(X64Reg regOp, OpArg arg) {WriteSSEOp(64, sseSUB, false, regOp, arg);} - void CMPSS(X64Reg regOp, OpArg arg, u8 compare) {WriteSSEOp(32, sseCMP, false, regOp, arg,1); Write8(compare);} - void CMPSD(X64Reg regOp, OpArg arg, u8 compare) {WriteSSEOp(64, sseCMP, false, regOp, arg,1); Write8(compare);} - void MULSS(X64Reg regOp, OpArg arg) {WriteSSEOp(32, sseMUL, false, regOp, arg);} - void MULSD(X64Reg regOp, OpArg arg) {WriteSSEOp(64, sseMUL, false, regOp, arg);} - void DIVSS(X64Reg regOp, 
OpArg arg) {WriteSSEOp(32, sseDIV, false, regOp, arg);} - void DIVSD(X64Reg regOp, OpArg arg) {WriteSSEOp(64, sseDIV, false, regOp, arg);} - void MINSS(X64Reg regOp, OpArg arg) {WriteSSEOp(32, sseMIN, false, regOp, arg);} - void MINSD(X64Reg regOp, OpArg arg) {WriteSSEOp(64, sseMIN, false, regOp, arg);} - void MAXSS(X64Reg regOp, OpArg arg) {WriteSSEOp(32, sseMAX, false, regOp, arg);} - void MAXSD(X64Reg regOp, OpArg arg) {WriteSSEOp(64, sseMAX, false, regOp, arg);} - void SQRTSS(X64Reg regOp, OpArg arg) {WriteSSEOp(32, sseSQRT, false, regOp, arg);} - void SQRTSD(X64Reg regOp, OpArg arg) {WriteSSEOp(64, sseSQRT, false, regOp, arg);} - void RSQRTSS(X64Reg regOp, OpArg arg) {WriteSSEOp(32, sseRSQRT, false, regOp, arg);} - void RSQRTSD(X64Reg regOp, OpArg arg) {WriteSSEOp(64, sseRSQRT, false, regOp, arg);} + void XEmitter::ADDPS(X64Reg regOp, OpArg arg) {WriteSSEOp(32, sseADD, true, regOp, arg);} + void XEmitter::ADDPD(X64Reg regOp, OpArg arg) {WriteSSEOp(64, sseADD, true, regOp, arg);} + void XEmitter::SUBPS(X64Reg regOp, OpArg arg) {WriteSSEOp(32, sseSUB, true, regOp, arg);} + void XEmitter::SUBPD(X64Reg regOp, OpArg arg) {WriteSSEOp(64, sseSUB, true, regOp, arg);} + void XEmitter::CMPPS(X64Reg regOp, OpArg arg, u8 compare) {WriteSSEOp(32, sseCMP, true, regOp, arg,1); Write8(compare);} + void XEmitter::CMPPD(X64Reg regOp, OpArg arg, u8 compare) {WriteSSEOp(64, sseCMP, true, regOp, arg,1); Write8(compare);} + void XEmitter::ANDPS(X64Reg regOp, OpArg arg) {WriteSSEOp(32, sseAND, true, regOp, arg);} + void XEmitter::ANDPD(X64Reg regOp, OpArg arg) {WriteSSEOp(64, sseAND, true, regOp, arg);} + void XEmitter::ANDNPS(X64Reg regOp, OpArg arg) {WriteSSEOp(32, sseANDN, true, regOp, arg);} + void XEmitter::ANDNPD(X64Reg regOp, OpArg arg) {WriteSSEOp(64, sseANDN, true, regOp, arg);} + void XEmitter::ORPS(X64Reg regOp, OpArg arg) {WriteSSEOp(32, sseOR, true, regOp, arg);} + void XEmitter::ORPD(X64Reg regOp, OpArg arg) {WriteSSEOp(64, sseOR, true, regOp, arg);} + void XEmitter::XORPS(X64Reg regOp, OpArg arg) {WriteSSEOp(32, sseXOR, true, regOp, arg);} + void XEmitter::XORPD(X64Reg regOp, OpArg arg) {WriteSSEOp(64, sseXOR, true, regOp, arg);} + void XEmitter::MULPS(X64Reg regOp, OpArg arg) {WriteSSEOp(32, sseMUL, true, regOp, arg);} + void XEmitter::MULPD(X64Reg regOp, OpArg arg) {WriteSSEOp(64, sseMUL, true, regOp, arg);} + void XEmitter::DIVPS(X64Reg regOp, OpArg arg) {WriteSSEOp(32, sseDIV, true, regOp, arg);} + void XEmitter::DIVPD(X64Reg regOp, OpArg arg) {WriteSSEOp(64, sseDIV, true, regOp, arg);} + void XEmitter::MINPS(X64Reg regOp, OpArg arg) {WriteSSEOp(32, sseMIN, true, regOp, arg);} + void XEmitter::MINPD(X64Reg regOp, OpArg arg) {WriteSSEOp(64, sseMIN, true, regOp, arg);} + void XEmitter::MAXPS(X64Reg regOp, OpArg arg) {WriteSSEOp(32, sseMAX, true, regOp, arg);} + void XEmitter::MAXPD(X64Reg regOp, OpArg arg) {WriteSSEOp(64, sseMAX, true, regOp, arg);} + void XEmitter::SQRTPS(X64Reg regOp, OpArg arg) {WriteSSEOp(32, sseSQRT, true, regOp, arg);} + void XEmitter::SQRTPD(X64Reg regOp, OpArg arg) {WriteSSEOp(64, sseSQRT, true, regOp, arg);} + void XEmitter::RSQRTPS(X64Reg regOp, OpArg arg) {WriteSSEOp(32, sseRSQRT, true, regOp, arg);} + void XEmitter::RSQRTPD(X64Reg regOp, OpArg arg) {WriteSSEOp(64, sseRSQRT, true, regOp, arg);} + void XEmitter::SHUFPS(X64Reg regOp, OpArg arg, u8 shuffle) {WriteSSEOp(32, sseSHUF, true, regOp, arg,1); Write8(shuffle);} + void XEmitter::SHUFPD(X64Reg regOp, OpArg arg, u8 shuffle) {WriteSSEOp(64, sseSHUF, true, regOp, arg,1); Write8(shuffle);} - void ADDPS(X64Reg 
regOp, OpArg arg) {WriteSSEOp(32, sseADD, true, regOp, arg);} - void ADDPD(X64Reg regOp, OpArg arg) {WriteSSEOp(64, sseADD, true, regOp, arg);} - void SUBPS(X64Reg regOp, OpArg arg) {WriteSSEOp(32, sseSUB, true, regOp, arg);} - void SUBPD(X64Reg regOp, OpArg arg) {WriteSSEOp(64, sseSUB, true, regOp, arg);} - void CMPPS(X64Reg regOp, OpArg arg, u8 compare) {WriteSSEOp(32, sseCMP, true, regOp, arg,1); Write8(compare);} - void CMPPD(X64Reg regOp, OpArg arg, u8 compare) {WriteSSEOp(64, sseCMP, true, regOp, arg,1); Write8(compare);} - void ANDPS(X64Reg regOp, OpArg arg) {WriteSSEOp(32, sseAND, true, regOp, arg);} - void ANDPD(X64Reg regOp, OpArg arg) {WriteSSEOp(64, sseAND, true, regOp, arg);} - void ANDNPS(X64Reg regOp, OpArg arg) {WriteSSEOp(32, sseANDN, true, regOp, arg);} - void ANDNPD(X64Reg regOp, OpArg arg) {WriteSSEOp(64, sseANDN, true, regOp, arg);} - void ORPS(X64Reg regOp, OpArg arg) {WriteSSEOp(32, sseOR, true, regOp, arg);} - void ORPD(X64Reg regOp, OpArg arg) {WriteSSEOp(64, sseOR, true, regOp, arg);} - void XORPS(X64Reg regOp, OpArg arg) {WriteSSEOp(32, sseXOR, true, regOp, arg);} - void XORPD(X64Reg regOp, OpArg arg) {WriteSSEOp(64, sseXOR, true, regOp, arg);} - void MULPS(X64Reg regOp, OpArg arg) {WriteSSEOp(32, sseMUL, true, regOp, arg);} - void MULPD(X64Reg regOp, OpArg arg) {WriteSSEOp(64, sseMUL, true, regOp, arg);} - void DIVPS(X64Reg regOp, OpArg arg) {WriteSSEOp(32, sseDIV, true, regOp, arg);} - void DIVPD(X64Reg regOp, OpArg arg) {WriteSSEOp(64, sseDIV, true, regOp, arg);} - void MINPS(X64Reg regOp, OpArg arg) {WriteSSEOp(32, sseMIN, true, regOp, arg);} - void MINPD(X64Reg regOp, OpArg arg) {WriteSSEOp(64, sseMIN, true, regOp, arg);} - void MAXPS(X64Reg regOp, OpArg arg) {WriteSSEOp(32, sseMAX, true, regOp, arg);} - void MAXPD(X64Reg regOp, OpArg arg) {WriteSSEOp(64, sseMAX, true, regOp, arg);} - void SQRTPS(X64Reg regOp, OpArg arg) {WriteSSEOp(32, sseSQRT, true, regOp, arg);} - void SQRTPD(X64Reg regOp, OpArg arg) {WriteSSEOp(64, sseSQRT, true, regOp, arg);} - void RSQRTPS(X64Reg regOp, OpArg arg) {WriteSSEOp(32, sseRSQRT, true, regOp, arg);} - void RSQRTPD(X64Reg regOp, OpArg arg) {WriteSSEOp(64, sseRSQRT, true, regOp, arg);} - void SHUFPS(X64Reg regOp, OpArg arg, u8 shuffle) {WriteSSEOp(32, sseSHUF, true, regOp, arg,1); Write8(shuffle);} - void SHUFPD(X64Reg regOp, OpArg arg, u8 shuffle) {WriteSSEOp(64, sseSHUF, true, regOp, arg,1); Write8(shuffle);} + void XEmitter::COMISS(X64Reg regOp, OpArg arg) {WriteSSEOp(32, sseCOMIS, true, regOp, arg);} //weird that these should be packed + void XEmitter::COMISD(X64Reg regOp, OpArg arg) {WriteSSEOp(64, sseCOMIS, true, regOp, arg);} //ordered + void XEmitter::UCOMISS(X64Reg regOp, OpArg arg) {WriteSSEOp(32, sseUCOMIS, true, regOp, arg);} //unordered + void XEmitter::UCOMISD(X64Reg regOp, OpArg arg) {WriteSSEOp(64, sseUCOMIS, true, regOp, arg);} - void COMISS(X64Reg regOp, OpArg arg) {WriteSSEOp(32, sseCOMIS, true, regOp, arg);} //weird that these should be packed - void COMISD(X64Reg regOp, OpArg arg) {WriteSSEOp(64, sseCOMIS, true, regOp, arg);} //ordered - void UCOMISS(X64Reg regOp, OpArg arg) {WriteSSEOp(32, sseUCOMIS, true, regOp, arg);} //unordered - void UCOMISD(X64Reg regOp, OpArg arg) {WriteSSEOp(64, sseUCOMIS, true, regOp, arg);} + void XEmitter::MOVAPS(X64Reg regOp, OpArg arg) {WriteSSEOp(32, sseMOVAPfromRM, true, regOp, arg);} + void XEmitter::MOVAPD(X64Reg regOp, OpArg arg) {WriteSSEOp(64, sseMOVAPfromRM, true, regOp, arg);} + void XEmitter::MOVAPS(OpArg arg, X64Reg regOp) {WriteSSEOp(32, sseMOVAPtoRM, true, regOp, 
arg);} + void XEmitter::MOVAPD(OpArg arg, X64Reg regOp) {WriteSSEOp(64, sseMOVAPtoRM, true, regOp, arg);} - void MOVAPS(X64Reg regOp, OpArg arg) {WriteSSEOp(32, sseMOVAPfromRM, true, regOp, arg);} - void MOVAPD(X64Reg regOp, OpArg arg) {WriteSSEOp(64, sseMOVAPfromRM, true, regOp, arg);} - void MOVAPS(OpArg arg, X64Reg regOp) {WriteSSEOp(32, sseMOVAPtoRM, true, regOp, arg);} - void MOVAPD(OpArg arg, X64Reg regOp) {WriteSSEOp(64, sseMOVAPtoRM, true, regOp, arg);} + void XEmitter::MOVUPS(X64Reg regOp, OpArg arg) {WriteSSEOp(32, sseMOVUPfromRM, true, regOp, arg);} + void XEmitter::MOVUPD(X64Reg regOp, OpArg arg) {WriteSSEOp(64, sseMOVUPfromRM, true, regOp, arg);} + void XEmitter::MOVUPS(OpArg arg, X64Reg regOp) {WriteSSEOp(32, sseMOVUPtoRM, true, regOp, arg);} + void XEmitter::MOVUPD(OpArg arg, X64Reg regOp) {WriteSSEOp(64, sseMOVUPtoRM, true, regOp, arg);} - void MOVUPS(X64Reg regOp, OpArg arg) {WriteSSEOp(32, sseMOVUPfromRM, true, regOp, arg);} - void MOVUPD(X64Reg regOp, OpArg arg) {WriteSSEOp(64, sseMOVUPfromRM, true, regOp, arg);} - void MOVUPS(OpArg arg, X64Reg regOp) {WriteSSEOp(32, sseMOVUPtoRM, true, regOp, arg);} - void MOVUPD(OpArg arg, X64Reg regOp) {WriteSSEOp(64, sseMOVUPtoRM, true, regOp, arg);} + void XEmitter::MOVSS(X64Reg regOp, OpArg arg) {WriteSSEOp(32, sseMOVUPfromRM, false, regOp, arg);} + void XEmitter::MOVSD(X64Reg regOp, OpArg arg) {WriteSSEOp(64, sseMOVUPfromRM, false, regOp, arg);} + void XEmitter::MOVSS(OpArg arg, X64Reg regOp) {WriteSSEOp(32, sseMOVUPtoRM, false, regOp, arg);} + void XEmitter::MOVSD(OpArg arg, X64Reg regOp) {WriteSSEOp(64, sseMOVUPtoRM, false, regOp, arg);} - void MOVSS(X64Reg regOp, OpArg arg) {WriteSSEOp(32, sseMOVUPfromRM, false, regOp, arg);} - void MOVSD(X64Reg regOp, OpArg arg) {WriteSSEOp(64, sseMOVUPfromRM, false, regOp, arg);} - void MOVSS(OpArg arg, X64Reg regOp) {WriteSSEOp(32, sseMOVUPtoRM, false, regOp, arg);} - void MOVSD(OpArg arg, X64Reg regOp) {WriteSSEOp(64, sseMOVUPtoRM, false, regOp, arg);} + void XEmitter::CVTPS2PD(X64Reg regOp, OpArg arg) {WriteSSEOp(32, 0x5A, true, regOp, arg);} + void XEmitter::CVTPD2PS(X64Reg regOp, OpArg arg) {WriteSSEOp(64, 0x5A, true, regOp, arg);} - void CVTPS2PD(X64Reg regOp, OpArg arg) {WriteSSEOp(32, 0x5A, true, regOp, arg);} - void CVTPD2PS(X64Reg regOp, OpArg arg) {WriteSSEOp(64, 0x5A, true, regOp, arg);} + void XEmitter::CVTSD2SS(X64Reg regOp, OpArg arg) {WriteSSEOp(64, 0x5A, false, regOp, arg);} + void XEmitter::CVTSS2SD(X64Reg regOp, OpArg arg) {WriteSSEOp(32, 0x5A, false, regOp, arg);} + void XEmitter::CVTSD2SI(X64Reg regOp, OpArg arg) {WriteSSEOp(32, 0xF2, false, regOp, arg);} - void CVTSD2SS(X64Reg regOp, OpArg arg) {WriteSSEOp(64, 0x5A, false, regOp, arg);} - void CVTSS2SD(X64Reg regOp, OpArg arg) {WriteSSEOp(32, 0x5A, false, regOp, arg);} - void CVTSD2SI(X64Reg regOp, OpArg arg) {WriteSSEOp(32, 0xF2, false, regOp, arg);} + void XEmitter::CVTDQ2PD(X64Reg regOp, OpArg arg) {WriteSSEOp(32, 0xE6, false, regOp, arg);} + void XEmitter::CVTDQ2PS(X64Reg regOp, const OpArg &arg) {WriteSSEOp(32, 0x5B, true, regOp, arg);} + void XEmitter::CVTPD2DQ(X64Reg regOp, OpArg arg) {WriteSSEOp(64, 0xE6, false, regOp, arg);} - void CVTDQ2PD(X64Reg regOp, OpArg arg) {WriteSSEOp(32, 0xE6, false, regOp, arg);} - void CVTDQ2PS(X64Reg regOp, const OpArg &arg) {WriteSSEOp(32, 0x5B, true, regOp, arg);} - void CVTPD2DQ(X64Reg regOp, OpArg arg) {WriteSSEOp(64, 0xE6, false, regOp, arg);} + void XEmitter::MASKMOVDQU(X64Reg dest, X64Reg src) {WriteSSEOp(64, sseMASKMOVDQU, true, dest, R(src));} - void MASKMOVDQU(X64Reg dest, 
X64Reg src) {WriteSSEOp(64, sseMASKMOVDQU, true, dest, R(src));} + void XEmitter::MOVMSKPS(X64Reg dest, OpArg arg) {WriteSSEOp(32, 0x50, true, dest, arg);} + void XEmitter::MOVMSKPD(X64Reg dest, OpArg arg) {WriteSSEOp(64, 0x50, true, dest, arg);} - void MOVMSKPS(X64Reg dest, OpArg arg) {WriteSSEOp(32, 0x50, true, dest, arg);} - void MOVMSKPD(X64Reg dest, OpArg arg) {WriteSSEOp(64, 0x50, true, dest, arg);} + void XEmitter::LDDQU(X64Reg dest, OpArg arg) {WriteSSEOp(64, sseLDDQU, false, dest, arg);} // For integer data only - void LDDQU(X64Reg dest, OpArg arg) {WriteSSEOp(64, sseLDDQU, false, dest, arg);} // For integer data only - - void UNPCKLPD(X64Reg dest, OpArg arg) {WriteSSEOp(64, 0x14, true, dest, arg);} - void UNPCKHPD(X64Reg dest, OpArg arg) {WriteSSEOp(64, 0x15, true, dest, arg);} + void XEmitter::UNPCKLPD(X64Reg dest, OpArg arg) {WriteSSEOp(64, 0x14, true, dest, arg);} + void XEmitter::UNPCKHPD(X64Reg dest, OpArg arg) {WriteSSEOp(64, 0x15, true, dest, arg);} - void MOVDDUP(X64Reg regOp, OpArg arg) + void XEmitter::MOVDDUP(X64Reg regOp, OpArg arg) { if (cpu_info.bSSE3) { @@ -1210,18 +1175,18 @@ namespace Gen //There are a few more left // Also some integer instrucitons are missing - void PACKSSDW(X64Reg dest, OpArg arg) {WriteSSEOp(64, 0x6B, true, dest, arg);} - void PACKSSWB(X64Reg dest, OpArg arg) {WriteSSEOp(64, 0x63, true, dest, arg);} + void XEmitter::PACKSSDW(X64Reg dest, OpArg arg) {WriteSSEOp(64, 0x6B, true, dest, arg);} + void XEmitter::PACKSSWB(X64Reg dest, OpArg arg) {WriteSSEOp(64, 0x63, true, dest, arg);} //void PACKUSDW(X64Reg dest, OpArg arg) {WriteSSEOp(64, 0x66, true, dest, arg);} // WRONG - void PACKUSWB(X64Reg dest, OpArg arg) {WriteSSEOp(64, 0x67, true, dest, arg);} + void XEmitter::PACKUSWB(X64Reg dest, OpArg arg) {WriteSSEOp(64, 0x67, true, dest, arg);} - void PUNPCKLBW(X64Reg dest, const OpArg &arg) {WriteSSEOp(64, 0x60, true, dest, arg);} - void PUNPCKLWD(X64Reg dest, const OpArg &arg) {WriteSSEOp(64, 0x61, true, dest, arg);} - void PUNPCKLDQ(X64Reg dest, const OpArg &arg) {WriteSSEOp(64, 0x62, true, dest, arg);} + void XEmitter::PUNPCKLBW(X64Reg dest, const OpArg &arg) {WriteSSEOp(64, 0x60, true, dest, arg);} + void XEmitter::PUNPCKLWD(X64Reg dest, const OpArg &arg) {WriteSSEOp(64, 0x61, true, dest, arg);} + void XEmitter::PUNPCKLDQ(X64Reg dest, const OpArg &arg) {WriteSSEOp(64, 0x62, true, dest, arg);} //void PUNPCKLQDQ(X64Reg dest, OpArg arg) {WriteSSEOp(64, 0x60, true, dest, arg);} // WARNING not REX compatible - void PSRAW(X64Reg reg, int shift) { + void XEmitter::PSRAW(X64Reg reg, int shift) { if (reg > 7) PanicAlert("The PSRAW-emitter does not support regs above 7"); Write8(0x66); @@ -1231,18 +1196,18 @@ namespace Gen Write8(shift); } - void PSRLW(X64Reg reg, int shift) { + void XEmitter::PSRLW(X64Reg reg, int shift) { WriteSSEOp(64, 0x71, true, (X64Reg)2, R(reg)); Write8(shift); } - void PSLLW(X64Reg reg, int shift) { + void XEmitter::PSLLW(X64Reg reg, int shift) { WriteSSEOp(64, 0x71, true, (X64Reg)6, R(reg)); Write8(shift); } // WARNING not REX compatible - void PSRAD(X64Reg reg, int shift) { + void XEmitter::PSRAD(X64Reg reg, int shift) { if (reg > 7) PanicAlert("The PSRAD-emitter does not support regs above 7"); Write8(0x66); @@ -1252,85 +1217,85 @@ namespace Gen Write8(shift); } - void PSHUFB(X64Reg dest, OpArg arg) { + void XEmitter::PSHUFB(X64Reg dest, OpArg arg) { if (!cpu_info.bSSSE3) { PanicAlert("Trying to use PSHUFB on a system that doesn't support it. 
Bad programmer."); } Write8(0x66); arg.operandReg = dest; - arg.WriteRex(false); + arg.WriteRex(this, false); Write8(0x0f); Write8(0x38); Write8(0x00); - arg.WriteRest(0); + arg.WriteRest(this, 0); } - void PAND(X64Reg dest, OpArg arg) {WriteSSEOp(64, 0xDB, true, dest, arg);} - void PANDN(X64Reg dest, OpArg arg) {WriteSSEOp(64, 0xDF, true, dest, arg);} - void PXOR(X64Reg dest, OpArg arg) {WriteSSEOp(64, 0xEF, true, dest, arg);} - void POR(X64Reg dest, OpArg arg) {WriteSSEOp(64, 0xEB, true, dest, arg);} + void XEmitter::PAND(X64Reg dest, OpArg arg) {WriteSSEOp(64, 0xDB, true, dest, arg);} + void XEmitter::PANDN(X64Reg dest, OpArg arg) {WriteSSEOp(64, 0xDF, true, dest, arg);} + void XEmitter::PXOR(X64Reg dest, OpArg arg) {WriteSSEOp(64, 0xEF, true, dest, arg);} + void XEmitter::POR(X64Reg dest, OpArg arg) {WriteSSEOp(64, 0xEB, true, dest, arg);} - void PADDB(X64Reg dest, OpArg arg) {WriteSSEOp(64, 0xFC, true, dest, arg);} - void PADDW(X64Reg dest, OpArg arg) {WriteSSEOp(64, 0xFD, true, dest, arg);} - void PADDD(X64Reg dest, OpArg arg) {WriteSSEOp(64, 0xFE, true, dest, arg);} - void PADDQ(X64Reg dest, OpArg arg) {WriteSSEOp(64, 0xD4, true, dest, arg);} + void XEmitter::PADDB(X64Reg dest, OpArg arg) {WriteSSEOp(64, 0xFC, true, dest, arg);} + void XEmitter::PADDW(X64Reg dest, OpArg arg) {WriteSSEOp(64, 0xFD, true, dest, arg);} + void XEmitter::PADDD(X64Reg dest, OpArg arg) {WriteSSEOp(64, 0xFE, true, dest, arg);} + void XEmitter::PADDQ(X64Reg dest, OpArg arg) {WriteSSEOp(64, 0xD4, true, dest, arg);} - void PADDSB(X64Reg dest, OpArg arg) {WriteSSEOp(64, 0xEC, true, dest, arg);} - void PADDSW(X64Reg dest, OpArg arg) {WriteSSEOp(64, 0xED, true, dest, arg);} - void PADDUSB(X64Reg dest, OpArg arg) {WriteSSEOp(64, 0xDC, true, dest, arg);} - void PADDUSW(X64Reg dest, OpArg arg) {WriteSSEOp(64, 0xDD, true, dest, arg);} + void XEmitter::PADDSB(X64Reg dest, OpArg arg) {WriteSSEOp(64, 0xEC, true, dest, arg);} + void XEmitter::PADDSW(X64Reg dest, OpArg arg) {WriteSSEOp(64, 0xED, true, dest, arg);} + void XEmitter::PADDUSB(X64Reg dest, OpArg arg) {WriteSSEOp(64, 0xDC, true, dest, arg);} + void XEmitter::PADDUSW(X64Reg dest, OpArg arg) {WriteSSEOp(64, 0xDD, true, dest, arg);} - void PSUBB(X64Reg dest, OpArg arg) {WriteSSEOp(64, 0xF8, true, dest, arg);} - void PSUBW(X64Reg dest, OpArg arg) {WriteSSEOp(64, 0xF9, true, dest, arg);} - void PSUBD(X64Reg dest, OpArg arg) {WriteSSEOp(64, 0xFA, true, dest, arg);} - void PSUBQ(X64Reg dest, OpArg arg) {WriteSSEOp(64, 0xDB, true, dest, arg);} + void XEmitter::PSUBB(X64Reg dest, OpArg arg) {WriteSSEOp(64, 0xF8, true, dest, arg);} + void XEmitter::PSUBW(X64Reg dest, OpArg arg) {WriteSSEOp(64, 0xF9, true, dest, arg);} + void XEmitter::PSUBD(X64Reg dest, OpArg arg) {WriteSSEOp(64, 0xFA, true, dest, arg);} + void XEmitter::PSUBQ(X64Reg dest, OpArg arg) {WriteSSEOp(64, 0xDB, true, dest, arg);} - void PSUBSB(X64Reg dest, OpArg arg) {WriteSSEOp(64, 0xE8, true, dest, arg);} - void PSUBSW(X64Reg dest, OpArg arg) {WriteSSEOp(64, 0xE9, true, dest, arg);} - void PSUBUSB(X64Reg dest, OpArg arg) {WriteSSEOp(64, 0xD8, true, dest, arg);} - void PSUBUSW(X64Reg dest, OpArg arg) {WriteSSEOp(64, 0xD9, true, dest, arg);} + void XEmitter::PSUBSB(X64Reg dest, OpArg arg) {WriteSSEOp(64, 0xE8, true, dest, arg);} + void XEmitter::PSUBSW(X64Reg dest, OpArg arg) {WriteSSEOp(64, 0xE9, true, dest, arg);} + void XEmitter::PSUBUSB(X64Reg dest, OpArg arg) {WriteSSEOp(64, 0xD8, true, dest, arg);} + void XEmitter::PSUBUSW(X64Reg dest, OpArg arg) {WriteSSEOp(64, 0xD9, true, dest, arg);} - void 
PAVGB(X64Reg dest, OpArg arg) {WriteSSEOp(64, 0xE0, true, dest, arg);} - void PAVGW(X64Reg dest, OpArg arg) {WriteSSEOp(64, 0xE3, true, dest, arg);} + void XEmitter::PAVGB(X64Reg dest, OpArg arg) {WriteSSEOp(64, 0xE0, true, dest, arg);} + void XEmitter::PAVGW(X64Reg dest, OpArg arg) {WriteSSEOp(64, 0xE3, true, dest, arg);} - void PCMPEQB(X64Reg dest, OpArg arg) {WriteSSEOp(64, 0x74, true, dest, arg);} - void PCMPEQW(X64Reg dest, OpArg arg) {WriteSSEOp(64, 0x75, true, dest, arg);} - void PCMPEQD(X64Reg dest, OpArg arg) {WriteSSEOp(64, 0x76, true, dest, arg);} + void XEmitter::PCMPEQB(X64Reg dest, OpArg arg) {WriteSSEOp(64, 0x74, true, dest, arg);} + void XEmitter::PCMPEQW(X64Reg dest, OpArg arg) {WriteSSEOp(64, 0x75, true, dest, arg);} + void XEmitter::PCMPEQD(X64Reg dest, OpArg arg) {WriteSSEOp(64, 0x76, true, dest, arg);} - void PCMPGTB(X64Reg dest, OpArg arg) {WriteSSEOp(64, 0x64, true, dest, arg);} - void PCMPGTW(X64Reg dest, OpArg arg) {WriteSSEOp(64, 0x65, true, dest, arg);} - void PCMPGTD(X64Reg dest, OpArg arg) {WriteSSEOp(64, 0x66, true, dest, arg);} + void XEmitter::PCMPGTB(X64Reg dest, OpArg arg) {WriteSSEOp(64, 0x64, true, dest, arg);} + void XEmitter::PCMPGTW(X64Reg dest, OpArg arg) {WriteSSEOp(64, 0x65, true, dest, arg);} + void XEmitter::PCMPGTD(X64Reg dest, OpArg arg) {WriteSSEOp(64, 0x66, true, dest, arg);} - void PEXTRW(X64Reg dest, OpArg arg, u8 subreg) {WriteSSEOp(64, 0x64, true, dest, arg); Write8(subreg);} - void PINSRW(X64Reg dest, OpArg arg, u8 subreg) {WriteSSEOp(64, 0x64, true, dest, arg); Write8(subreg);} + void XEmitter::PEXTRW(X64Reg dest, OpArg arg, u8 subreg) {WriteSSEOp(64, 0x64, true, dest, arg); Write8(subreg);} + void XEmitter::PINSRW(X64Reg dest, OpArg arg, u8 subreg) {WriteSSEOp(64, 0x64, true, dest, arg); Write8(subreg);} - void PMADDWD(X64Reg dest, OpArg arg) {WriteSSEOp(64, 0xF5, true, dest, arg); } - void PSADBW(X64Reg dest, OpArg arg) {WriteSSEOp(64, 0xF6, true, dest, arg);} + void XEmitter::PMADDWD(X64Reg dest, OpArg arg) {WriteSSEOp(64, 0xF5, true, dest, arg); } + void XEmitter::PSADBW(X64Reg dest, OpArg arg) {WriteSSEOp(64, 0xF6, true, dest, arg);} - void PMAXSW(X64Reg dest, OpArg arg) {WriteSSEOp(64, 0xEE, true, dest, arg); } - void PMAXUB(X64Reg dest, OpArg arg) {WriteSSEOp(64, 0xDE, true, dest, arg); } - void PMINSW(X64Reg dest, OpArg arg) {WriteSSEOp(64, 0xEA, true, dest, arg); } - void PMINUB(X64Reg dest, OpArg arg) {WriteSSEOp(64, 0xDA, true, dest, arg); } + void XEmitter::PMAXSW(X64Reg dest, OpArg arg) {WriteSSEOp(64, 0xEE, true, dest, arg); } + void XEmitter::PMAXUB(X64Reg dest, OpArg arg) {WriteSSEOp(64, 0xDE, true, dest, arg); } + void XEmitter::PMINSW(X64Reg dest, OpArg arg) {WriteSSEOp(64, 0xEA, true, dest, arg); } + void XEmitter::PMINUB(X64Reg dest, OpArg arg) {WriteSSEOp(64, 0xDA, true, dest, arg); } - void PMOVMSKB(X64Reg dest, OpArg arg) {WriteSSEOp(64, 0xD7, true, dest, arg); } + void XEmitter::PMOVMSKB(X64Reg dest, OpArg arg) {WriteSSEOp(64, 0xD7, true, dest, arg); } - void PSHUFLW(X64Reg regOp, OpArg arg, u8 shuffle) {WriteSSEOp(64, 0x70, false, regOp, arg, 1); Write8(shuffle);} + void XEmitter::PSHUFLW(X64Reg regOp, OpArg arg, u8 shuffle) {WriteSSEOp(64, 0x70, false, regOp, arg, 1); Write8(shuffle);} // Prefixes - void LOCK() { Write8(0xF0); } - void REP() { Write8(0xF3); } - void REPNE(){ Write8(0xF2); } + void XEmitter::LOCK() { Write8(0xF0); } + void XEmitter::REP() { Write8(0xF3); } + void XEmitter::REPNE() { Write8(0xF2); } - void FWAIT() + void XEmitter::FWAIT() { Write8(0x9B); } - void RTDSC() { Write8(0x0F); 
Write8(0x31); } + void XEmitter::RTDSC() { Write8(0x0F); Write8(0x31); } // helper routines for setting pointers -void CallCdeclFunction3(void* fnptr, u32 arg0, u32 arg1, u32 arg2) +void XEmitter::CallCdeclFunction3(void* fnptr, u32 arg0, u32 arg1, u32 arg2) { using namespace Gen; #ifdef _M_X64 @@ -1361,7 +1326,7 @@ void CallCdeclFunction3(void* fnptr, u32 arg0, u32 arg1, u32 arg2) #endif } -void CallCdeclFunction4(void* fnptr, u32 arg0, u32 arg1, u32 arg2, u32 arg3) +void XEmitter::CallCdeclFunction4(void* fnptr, u32 arg0, u32 arg1, u32 arg2, u32 arg3) { using namespace Gen; #ifdef _M_X64 @@ -1395,7 +1360,7 @@ void CallCdeclFunction4(void* fnptr, u32 arg0, u32 arg1, u32 arg2, u32 arg3) #endif } -void CallCdeclFunction5(void* fnptr, u32 arg0, u32 arg1, u32 arg2, u32 arg3, u32 arg4) +void XEmitter::CallCdeclFunction5(void* fnptr, u32 arg0, u32 arg1, u32 arg2, u32 arg3, u32 arg4) { using namespace Gen; #ifdef _M_X64 @@ -1432,7 +1397,7 @@ void CallCdeclFunction5(void* fnptr, u32 arg0, u32 arg1, u32 arg2, u32 arg3, u32 #endif } -void CallCdeclFunction6(void* fnptr, u32 arg0, u32 arg1, u32 arg2, u32 arg3, u32 arg4, u32 arg5) +void XEmitter::CallCdeclFunction6(void* fnptr, u32 arg0, u32 arg1, u32 arg2, u32 arg3, u32 arg4, u32 arg5) { using namespace Gen; #ifdef _M_X64 @@ -1475,20 +1440,20 @@ void CallCdeclFunction6(void* fnptr, u32 arg0, u32 arg1, u32 arg2, u32 arg3, u32 #ifdef _M_X64 // See header -void ___CallCdeclImport3(void* impptr, u32 arg0, u32 arg1, u32 arg2) { +void XEmitter::___CallCdeclImport3(void* impptr, u32 arg0, u32 arg1, u32 arg2) { MOV(32, R(RCX), Imm32(arg0)); MOV(32, R(RDX), Imm32(arg1)); MOV(32, R(R8), Imm32(arg2)); CALLptr(M(impptr)); } -void ___CallCdeclImport4(void* impptr, u32 arg0, u32 arg1, u32 arg2, u32 arg3) { +void XEmitter::___CallCdeclImport4(void* impptr, u32 arg0, u32 arg1, u32 arg2, u32 arg3) { MOV(32, R(RCX), Imm32(arg0)); MOV(32, R(RDX), Imm32(arg1)); MOV(32, R(R8), Imm32(arg2)); MOV(32, R(R9), Imm32(arg3)); CALLptr(M(impptr)); } -void ___CallCdeclImport5(void* impptr, u32 arg0, u32 arg1, u32 arg2, u32 arg3, u32 arg4) { +void XEmitter::___CallCdeclImport5(void* impptr, u32 arg0, u32 arg1, u32 arg2, u32 arg3, u32 arg4) { MOV(32, R(RCX), Imm32(arg0)); MOV(32, R(RDX), Imm32(arg1)); MOV(32, R(R8), Imm32(arg2)); @@ -1496,7 +1461,7 @@ void ___CallCdeclImport5(void* impptr, u32 arg0, u32 arg1, u32 arg2, u32 arg3, u MOV(32, MDisp(RSP, 0x20), Imm32(arg4)); CALLptr(M(impptr)); } -void ___CallCdeclImport6(void* impptr, u32 arg0, u32 arg1, u32 arg2, u32 arg3, u32 arg4, u32 arg5) { +void XEmitter::___CallCdeclImport6(void* impptr, u32 arg0, u32 arg1, u32 arg2, u32 arg3, u32 arg4, u32 arg5) { MOV(32, R(RCX), Imm32(arg0)); MOV(32, R(RDX), Imm32(arg1)); MOV(32, R(R8), Imm32(arg2)); diff --git a/Source/Core/Common/Src/x64Emitter.h b/Source/Core/Common/Src/x64Emitter.h index bf32b10c9b..b05b0812c8 100644 --- a/Source/Core/Common/Src/x64Emitter.h +++ b/Source/Core/Common/Src/x64Emitter.h @@ -21,217 +21,264 @@ #define _DOLPHIN_INTEL_CODEGEN #include "Common.h" +#include "MemoryUtil.h" namespace Gen { - enum X64Reg + +enum X64Reg +{ + EAX = 0, EBX = 3, ECX = 1, EDX = 2, + ESI = 6, EDI = 7, EBP = 5, ESP = 4, + + RAX = 0, RBX = 3, RCX = 1, RDX = 2, + RSI = 6, RDI = 7, RBP = 5, RSP = 4, + R8 = 8, R9 = 9, R10 = 10,R11 = 11, + R12 = 12,R13 = 13,R14 = 14,R15 = 15, + + AL = 0, BL = 3, CL = 1, DL = 2, + AH = 4, BH = 7, CH = 5, DH = 6, + + AX = 0, BX = 3, CX = 1, DX = 2, + SI = 6, DI = 7, BP = 5, SP = 4, + + XMM0=0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7, + XMM8, XMM9, XMM10, XMM11, 
XMM12, XMM13, XMM14, XMM15, + + INVALID_REG = 0xFFFFFFFF +}; + +enum CCFlags +{ + CC_O = 0, + CC_NO = 1, + CC_B = 2, CC_C = 2, CC_NAE = 2, + CC_NB = 3, CC_NC = 3, CC_AE = 3, + CC_Z = 4, CC_E = 4, + CC_NZ = 5, CC_NE = 5, + CC_BE = 6, CC_NA = 6, + CC_NBE = 7, CC_A = 7, + CC_S = 8, + CC_NS = 9, + CC_P = 0xA, CC_PE = 0xA, + CC_NP = 0xB, CC_PO = 0xB, + CC_L = 0xC, CC_NGE = 0xC, + CC_NL = 0xD, CC_GE = 0xD, + CC_LE = 0xE, CC_NG = 0xE, + CC_NLE = 0xF, CC_G = 0xF +}; + +enum +{ + NUMGPRs = 16, + NUMXMMs = 16, +}; + +enum +{ + SCALE_NONE = 0, + SCALE_1 = 1, + SCALE_2 = 2, + SCALE_4 = 4, + SCALE_8 = 8, + SCALE_ATREG = 16, + SCALE_RIP = 0xFF, + SCALE_IMM8 = 0xF0, + SCALE_IMM16 = 0xF1, + SCALE_IMM32 = 0xF2, + SCALE_IMM64 = 0xF3, +}; + +enum NormalOp { + nrmADD, + nrmADC, + nrmSUB, + nrmSBB, + nrmAND, + nrmOR , + nrmXOR, + nrmMOV, + nrmTEST, + nrmCMP, + nrmXCHG, +}; + +class XEmitter; + +// RIP addressing does not benefit from micro op fusion on Core arch +struct OpArg +{ + OpArg() {} // dummy op arg, used for storage + OpArg(u64 _offset, int _scale, X64Reg rmReg = RAX, X64Reg scaledReg = RAX) { - EAX = 0, EBX = 3, ECX = 1, EDX = 2, - ESI = 6, EDI = 7, EBP = 5, ESP = 4, - - RAX = 0, RBX = 3, RCX = 1, RDX = 2, - RSI = 6, RDI = 7, RBP = 5, RSP = 4, - R8 = 8, R9 = 9, R10 = 10,R11 = 11, - R12 = 12,R13 = 13,R14 = 14,R15 = 15, + operandReg = 0; + scale = (u8)_scale; + offsetOrBaseReg = (u8)rmReg; + indexReg = (u8)scaledReg; + //if scale == 0 never mind offseting + offset = _offset; + } + void WriteRex(XEmitter *emit, bool op64, int customOp = -1) const; + void WriteRest(XEmitter *emit, int extraBytes=0, X64Reg operandReg=(X64Reg)0xFF) const; + void WriteSingleByteOp(XEmitter *emit, u8 op, X64Reg operandReg, int bits); + // This one is public - must be written to + u64 offset; // use RIP-relative as much as possible - 64-bit immediates are not available. 
+ u8 operandReg; - AL = 0, BL = 3, CL = 1, DL = 2, - AH = 4, BH = 7, CH = 5, DH = 6, + void WriteNormalOp(XEmitter *emit, bool toRM, NormalOp op, const OpArg &operand, int bits) const; + bool IsImm() const {return scale == SCALE_IMM8 || scale == SCALE_IMM16 || scale == SCALE_IMM32 || scale == SCALE_IMM64;} + bool IsSimpleReg() const {return scale == SCALE_NONE;} + bool IsSimpleReg(X64Reg reg) const { + if (!IsSimpleReg()) + return false; + return GetSimpleReg() == reg; + } - AX = 0, BX = 3, CX = 1, DX = 2, - SI = 6, DI = 7, BP = 5, SP = 4, - - XMM0=0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7, - XMM8, XMM9, XMM10, XMM11, XMM12, XMM13, XMM14, XMM15, - - INVALID_REG = 0xFFFFFFFF - }; - - enum CCFlags + bool CanDoOpWith(const OpArg &other) const { - CC_O = 0, - CC_NO = 1, - CC_B = 2, CC_C = 2, CC_NAE = 2, - CC_NB = 3, CC_NC = 3, CC_AE = 3, - CC_Z = 4, CC_E = 4, - CC_NZ = 5, CC_NE = 5, - CC_BE = 6, CC_NA = 6, - CC_NBE = 7, CC_A = 7, - CC_S = 8, - CC_NS = 9, - CC_P = 0xA, CC_PE = 0xA, - CC_NP = 0xB, CC_PO = 0xB, - CC_L = 0xC, CC_NGE = 0xC, - CC_NL = 0xD, CC_GE = 0xD, - CC_LE = 0xE, CC_NG = 0xE, - CC_NLE = 0xF, CC_G = 0xF - }; + if (IsSimpleReg()) return true; + if (!IsSimpleReg() && !other.IsSimpleReg() && !other.IsImm()) return false; + return true; + } - enum + int GetImmBits() const { - NUMGPRs = 16, - NUMXMMs = 16, - }; + switch (scale) + { + case SCALE_IMM8: return 8; + case SCALE_IMM16: return 16; + case SCALE_IMM32: return 32; + case SCALE_IMM64: return 64; + default: return -1; + } + } - enum + X64Reg GetSimpleReg() const { - SCALE_NONE = 0, - SCALE_1 = 1, - SCALE_2 = 2, - SCALE_4 = 4, - SCALE_8 = 8, - SCALE_ATREG = 16, - SCALE_RIP = 0xFF, - SCALE_IMM8 = 0xF0, - SCALE_IMM16 = 0xF1, - SCALE_IMM32 = 0xF2, - SCALE_IMM64 = 0xF3, - }; + if (scale == SCALE_NONE) + return (X64Reg)offsetOrBaseReg; + else + return INVALID_REG; + } +private: + u8 scale; + u8 offsetOrBaseReg; + u8 indexReg; +}; + +inline OpArg M(void *ptr) {return OpArg((u64)ptr, (int)SCALE_RIP);} +inline OpArg R(X64Reg value) {return OpArg(0, SCALE_NONE, value);} +inline OpArg MatR(X64Reg value) {return OpArg(0, SCALE_ATREG, value);} +inline OpArg MDisp(X64Reg value, int offset) { + return OpArg((u32)offset, SCALE_ATREG, value); } +inline OpArg MComplex(X64Reg base, X64Reg scaled, int scale, int offset) +{ + return OpArg(offset, scale, base, scaled); +} +inline OpArg Imm8 (u8 imm) {return OpArg(imm, SCALE_IMM8);} +inline OpArg Imm16(u16 imm) {return OpArg(imm, SCALE_IMM16);} //rarely used +inline OpArg Imm32(u32 imm) {return OpArg(imm, SCALE_IMM32);} +inline OpArg Imm64(u64 imm) {return OpArg(imm, SCALE_IMM64);} +#ifdef _M_X64 +inline OpArg ImmPtr(void* imm) {return Imm64((u64)imm);} +#else +inline OpArg ImmPtr(void* imm) {return Imm32((u32)imm);} +#endif + +struct FixupBranch +{ + u8 *ptr; + int type; //0 = 8bit 1 = 32bit +}; + +enum SSECompare +{ + EQ = 0, + LT, + LE, + UNORD, + NEQ, + NLT, + NLE, + ORD, +}; + +typedef const u8* JumpTarget; + +class XEmitter +{ + friend struct OpArg; // for Write8 etc +private: + u8 *code; + + void Rex(int w, int r, int x, int b); + void WriteSimple1Byte(int bits, u8 byte, X64Reg reg); + void WriteSimple2Byte(int bits, u8 byte1, u8 byte2, X64Reg reg); + void WriteMulDivType(int bits, OpArg src, int ext); + void WriteBitSearchType(int bits, X64Reg dest, OpArg src, u8 byte2); + void WriteShift(int bits, OpArg dest, OpArg &shift, int ext); + void WriteMXCSR(OpArg arg, int ext); + void WriteSSEOp(int size, u8 sseOp, bool packed, X64Reg regOp, OpArg arg, int extrabytes = 0); + void 
WriteNormalOp(XEmitter *emit, int bits, NormalOp op, const OpArg &a1, const OpArg &a2); + +protected: + inline void Write8(u8 value) {*code++ = value;} + inline void Write16(u16 value) {*(u16*)code = (value); code += 2;} + inline void Write32(u32 value) {*(u32*)code = (value); code += 4;} + inline void Write64(u64 value) {*(u64*)code = (value); code += 8;} + +public: + XEmitter() { code = NULL; } + XEmitter(u8 *code_ptr) { code = code_ptr; } + + void WriteModRM(int mod, int rm, int reg); + void WriteSIB(int scale, int index, int base); void SetCodePtr(u8 *ptr); void ReserveCodeSpace(int bytes); const u8 *AlignCode4(); const u8 *AlignCode16(); const u8 *AlignCodePage(); - const u8 *GetCodePtr(); + const u8 *GetCodePtr() const; u8 *GetWritableCodePtr(); + // Looking for one of these? It's BANNED!! Some instructions are slow on modern CPU + // INC, DEC, LOOP, LOOPNE, LOOPE, ENTER, LEAVE, XCHG, XLAT, REP MOVSB/MOVSD, REP SCASD + other string instr., + // INC and DEC are slow on Intel Core, but not on AMD. They create a + // false flag dependency because they only update a subset of the flags. + // XCHG is SLOW and should be avoided. - // Safe way to temporarily redirect the code generator. - class GenContext - { - u8 **code_ptr_ptr; - u8 *saved_ptr; - public: - GenContext(u8 **code_ptr_ptr_) - { - saved_ptr = GetWritableCodePtr(); - code_ptr_ptr = code_ptr_ptr_; - SetCodePtr(*code_ptr_ptr); - } - ~GenContext() - { - *code_ptr_ptr = GetWritableCodePtr(); - SetCodePtr(saved_ptr); - } - }; - - enum NormalOp { - nrmADD, - nrmADC, - nrmSUB, - nrmSBB, - nrmAND, - nrmOR , - nrmXOR, - nrmMOV, - nrmTEST, - nrmCMP, - nrmXCHG, - }; - - // Make the generation routine examine which direction to go - // probably has to be a static - - // RIP addressing does not benefit from micro op fusion on Core arch - struct OpArg - { - OpArg() {} //dummy op arg, used for storage - OpArg(u64 _offset, int _scale, X64Reg rmReg = RAX, X64Reg scaledReg = RAX) - { - operandReg = 0; - scale = (u8)_scale; - offsetOrBaseReg = (u8)rmReg; - indexReg = (u8)scaledReg; - //if scale == 0 never mind offseting - offset = _offset; - } - void WriteRex(bool op64, int customOp = -1) const; - void WriteRest(int extraBytes=0, X64Reg operandReg=(X64Reg)0xFF) const; - void WriteSingleByteOp(u8 op, X64Reg operandReg, int bits); - //This one is public - must be written to - u64 offset; //use RIP-relative as much as possible - avoid 64-bit immediates at all costs - u8 operandReg; - - void WriteNormalOp(bool toRM, NormalOp op, const OpArg &operand, int bits) const; - bool IsImm() const {return scale == SCALE_IMM8 || scale == SCALE_IMM16 || scale == SCALE_IMM32 || scale == SCALE_IMM64;} - bool IsSimpleReg() const {return scale == SCALE_NONE;} - bool IsSimpleReg(X64Reg reg) const { - if (!IsSimpleReg()) - return false; - return GetSimpleReg() == reg; - } - bool CanDoOpWith(const OpArg &other) const - { - if (IsSimpleReg()) return true; - if (!IsSimpleReg() && !other.IsSimpleReg() && !other.IsImm()) return false; - return true; - } - - int GetImmBits() const - { - switch (scale) - { - case SCALE_IMM8: return 8; - case SCALE_IMM16: return 16; - case SCALE_IMM32: return 32; - case SCALE_IMM64: return 64; - default: return -1; - } - } - X64Reg GetSimpleReg() const - { - if (scale == SCALE_NONE) - return (X64Reg)offsetOrBaseReg; - else - return INVALID_REG; - } - private: - u8 scale; - u8 offsetOrBaseReg; - u8 indexReg; - }; - - inline OpArg M(void *ptr) {return OpArg((u64)ptr, (int)SCALE_RIP);} - inline OpArg R(X64Reg value) {return OpArg(0, SCALE_NONE, 
value);} - inline OpArg MatR(X64Reg value) {return OpArg(0, SCALE_ATREG, value);} - inline OpArg MDisp(X64Reg value, int offset) { - return OpArg((u32)offset, SCALE_ATREG, value); } - inline OpArg MComplex(X64Reg base, X64Reg scaled, int scale, int offset) - { - return OpArg(offset, scale, base, scaled); - } - inline OpArg Imm8 (u8 imm) {return OpArg(imm, SCALE_IMM8);} - inline OpArg Imm16(u16 imm) {return OpArg(imm, SCALE_IMM16);} //rarely used - inline OpArg Imm32(u32 imm) {return OpArg(imm, SCALE_IMM32);} - inline OpArg Imm64(u64 imm) {return OpArg(imm, SCALE_IMM64);} -#ifdef _M_X64 - inline OpArg ImmPtr(void* imm) {return Imm64((u64)imm);} -#else - inline OpArg ImmPtr(void* imm) {return Imm32((u32)imm);} -#endif - + // Debug breakpoint void INT3(); + + // Do nothing void NOP(int count = 1); //nop padding - TODO: fast nop slides, for amd and intel (check their manuals) + + // Save energy in wait-loops on P4 only. Probably not too useful. void PAUSE(); - void RET(); + + // Flag control void STC(); void CLC(); void CMC(); + + // These two can not be executed in 64-bit mode on early Intel 64-bit CPU:s, only on Core2 and AMD! + void LAHF(); // 3 cycle vector path + void SAHF(); // direct path fast + + + // Stack control void PUSH(X64Reg reg); void POP(X64Reg reg); void PUSH(int bits, const OpArg ®); void POP(int bits, const OpArg ®); void PUSHF(); void POPF(); - - typedef const u8* JumpTarget; - - struct FixupBranch - { - u8 *ptr; - int type; //0 = 8bit 1 = 32bit - }; + // Flow control + void RET(); + void RET_FAST(); + void UD2(); FixupBranch J(bool force5bytes = false); void JMP(const u8 * addr, bool force5Bytes = false); @@ -239,7 +286,7 @@ namespace Gen void JMPptr(const OpArg &arg); void JMPself(); //infinite loop! - void CALL(void *fnptr); + void CALL(const void *fnptr); void CALLptr(OpArg arg); FixupBranch J_CC(CCFlags conditionCode, bool force5bytes = false); @@ -248,66 +295,20 @@ namespace Gen void SetJumpTarget(const FixupBranch &branch); - //WARNING - INC and DEC slow on Intel Core, but not on AMD, since it creates - //false flags dependencies because they only update a subset of the flags - - // ector - I hereby BAN inc and dec due to their horribleness :P - // void INC(int bits, OpArg arg); - // void DEC(int bits, OpArg arg); - void SETcc(CCFlags flag, OpArg dest); - // Note: CMOV brings small if any benefit on current cpus, unfortunately. + // Note: CMOV brings small if any benefit on current cpus. void CMOVcc(int bits, X64Reg dest, OpArg src, CCFlags flag); + // Fences void LFENCE(); void MFENCE(); void SFENCE(); + // Bit scan void BSF(int bits, X64Reg dest, OpArg src); //bottom bit to top bit void BSR(int bits, X64Reg dest, OpArg src); //top bit to bottom bit - //These two can not be executed on early Intel 64-bit CPU:s, only on AMD! - - void LAHF(); // 3 cycle vector path - void SAHF(); // direct path fast - - //Looking for one of these? It's BANNED!! 
Some instructions are slow on modern CPU - //LOOP, LOOPNE, LOOPE, ENTER, LEAVE, XLAT, REP MOVSB/MOVSD, REP SCASD + other string instr., - - //Actually REP MOVSD could be useful :P - - void MOVNTI(int bits, OpArg dest, X64Reg src); - - void MUL(int bits, OpArg src); //UNSIGNED - void DIV(int bits, OpArg src); - void IMUL(int bits, OpArg src); //SIGNED - void IDIV(int bits, OpArg src); - void IMUL(int bits, X64Reg regOp, OpArg src); - void IMUL(int bits, X64Reg regOp, OpArg src, OpArg imm); - - - void NEG(int bits, OpArg src); - void NOT(int bits, OpArg src); - - void ROL(int bits, OpArg dest, OpArg shift); - void ROR(int bits, OpArg dest, OpArg shift); - void RCL(int bits, OpArg dest, OpArg shift); - void RCR(int bits, OpArg dest, OpArg shift); - void SHL(int bits, OpArg dest, OpArg shift); - void SHR(int bits, OpArg dest, OpArg shift); - void SAR(int bits, OpArg dest, OpArg shift); - - - void CWD(int bits = 16); - inline void CDQ() {CWD(32);} - inline void CQO() {CWD(64);} - void CBW(int bits = 8); - inline void CWDE() {CBW(16);} - inline void CDQE() {CBW(32);} - - void LEA(int bits, X64Reg dest, OpArg src); - - + // Cache control enum PrefetchLevel { PF_NTA, //Non-temporal (data used once and only once) @@ -316,58 +317,82 @@ namespace Gen PF_T2, //Levels 3+ (aliased to T0 on AMD) }; void PREFETCH(PrefetchLevel level, OpArg arg); - + void MOVNTI(int bits, OpArg dest, X64Reg src); + void MOVNTDQ(OpArg arg, X64Reg regOp); + void MOVNTPS(OpArg arg, X64Reg regOp); + void MOVNTPD(OpArg arg, X64Reg regOp); + // Multiplication / division + void MUL(int bits, OpArg src); //UNSIGNED + void IMUL(int bits, OpArg src); //SIGNED + void IMUL(int bits, X64Reg regOp, OpArg src); + void IMUL(int bits, X64Reg regOp, OpArg src, OpArg imm); + void DIV(int bits, OpArg src); + void IDIV(int bits, OpArg src); + + // Shift + void ROL(int bits, OpArg dest, OpArg shift); + void ROR(int bits, OpArg dest, OpArg shift); + void RCL(int bits, OpArg dest, OpArg shift); + void RCR(int bits, OpArg dest, OpArg shift); + void SHL(int bits, OpArg dest, OpArg shift); + void SHR(int bits, OpArg dest, OpArg shift); + void SAR(int bits, OpArg dest, OpArg shift); + + // Extend EAX into EDX in various ways + void CWD(int bits = 16); + inline void CDQ() {CWD(32);} + inline void CQO() {CWD(64);} + void CBW(int bits = 8); + inline void CWDE() {CBW(16);} + inline void CDQE() {CBW(32);} + + // Load effective address + void LEA(int bits, X64Reg dest, OpArg src); + + // Integer arithmetic + void NEG (int bits, OpArg src); void ADD (int bits, const OpArg &a1, const OpArg &a2); void ADC (int bits, const OpArg &a1, const OpArg &a2); void SUB (int bits, const OpArg &a1, const OpArg &a2); void SBB (int bits, const OpArg &a1, const OpArg &a2); void AND (int bits, const OpArg &a1, const OpArg &a2); + void CMP (int bits, const OpArg &a1, const OpArg &a2); + + // Bit operations + void NOT (int bits, OpArg src); void OR (int bits, const OpArg &a1, const OpArg &a2); void XOR (int bits, const OpArg &a1, const OpArg &a2); void MOV (int bits, const OpArg &a1, const OpArg &a2); void TEST(int bits, const OpArg &a1, const OpArg &a2); - void CMP (int bits, const OpArg &a1, const OpArg &a2); - - // XCHG is SLOW and should be avoided. - //void XCHG(int bits, const OpArg &a1, const OpArg &a2); + // Are these useful at all? Consider removing. + void XCHG(int bits, const OpArg &a1, const OpArg &a2); void XCHG_AHAL(); + + // Byte swapping (32 and 64-bit only). 
void BSWAP(int bits, X64Reg reg); + + // Sign/zero extension void MOVSX(int dbits, int sbits, X64Reg dest, OpArg src); //automatically uses MOVSXD if necessary void MOVZX(int dbits, int sbits, X64Reg dest, OpArg src); - enum SSECompare - { - EQ = 0, - LT, - LE, - UNORD, - NEQ, - NLT, - NLE, - ORD, - }; - // WARNING - These two take 11-13 cycles and are VectorPath! (AMD64) void STMXCSR(OpArg memloc); void LDMXCSR(OpArg memloc); - // Regular SSE/SSE2 instructions + // Prefixes + void LOCK(); + void REP(); + void REPNE(); + + void FWAIT(); + + // SSE/SSE2: Floating point arithmetic void ADDSS(X64Reg regOp, OpArg arg); void ADDSD(X64Reg regOp, OpArg arg); void SUBSS(X64Reg regOp, OpArg arg); void SUBSD(X64Reg regOp, OpArg arg); - void CMPSS(X64Reg regOp, OpArg arg, u8 compare); - void CMPSD(X64Reg regOp, OpArg arg, u8 compare); - void ANDSS(X64Reg regOp, OpArg arg); - void ANDSD(X64Reg regOp, OpArg arg); - void ANDNSS(X64Reg regOp, OpArg arg); - void ANDNSD(X64Reg regOp, OpArg arg); - void ORSS(X64Reg regOp, OpArg arg); - void ORSD(X64Reg regOp, OpArg arg); - void XORSS(X64Reg regOp, OpArg arg); - void XORSD(X64Reg regOp, OpArg arg); void MULSS(X64Reg regOp, OpArg arg); void MULSD(X64Reg regOp, OpArg arg); void DIVSS(X64Reg regOp, OpArg arg); @@ -381,45 +406,65 @@ namespace Gen void RSQRTSS(X64Reg regOp, OpArg arg); void RSQRTSD(X64Reg regOp, OpArg arg); - void COMISS(X64Reg regOp, OpArg arg); - void COMISD(X64Reg regOp, OpArg arg); + // SSE/SSE2: Floating point bitwise (yes) + void CMPSS(X64Reg regOp, OpArg arg, u8 compare); + void CMPSD(X64Reg regOp, OpArg arg, u8 compare); + void ANDSS(X64Reg regOp, OpArg arg); + void ANDSD(X64Reg regOp, OpArg arg); + void ANDNSS(X64Reg regOp, OpArg arg); + void ANDNSD(X64Reg regOp, OpArg arg); + void ORSS(X64Reg regOp, OpArg arg); + void ORSD(X64Reg regOp, OpArg arg); + void XORSS(X64Reg regOp, OpArg arg); + void XORSD(X64Reg regOp, OpArg arg); + // SSE/SSE2: Floating point packed arithmetic (x4 for float, x2 for double) void ADDPS(X64Reg regOp, OpArg arg); void ADDPD(X64Reg regOp, OpArg arg); void SUBPS(X64Reg regOp, OpArg arg); void SUBPD(X64Reg regOp, OpArg arg); void CMPPS(X64Reg regOp, OpArg arg, u8 compare); - void CMPPD(X64Reg regOp, OpArg arg, u8 compare); - void ANDPS(X64Reg regOp, OpArg arg); - void ANDPD(X64Reg regOp, OpArg arg); - void ANDNPS(X64Reg regOp, OpArg arg); - void ANDNPD(X64Reg regOp, OpArg arg); - void ORPS(X64Reg regOp, OpArg arg); - void ORPD(X64Reg regOp, OpArg arg); - void XORPS(X64Reg regOp, OpArg arg); - void XORPD(X64Reg regOp, OpArg arg); - void MULPS(X64Reg regOp, OpArg arg); - void MULPD(X64Reg regOp, OpArg arg); - void DIVPS(X64Reg regOp, OpArg arg); - void DIVPD(X64Reg regOp, OpArg arg); - void MINPS(X64Reg regOp, OpArg arg); - void MINPD(X64Reg regOp, OpArg arg); - void MAXPS(X64Reg regOp, OpArg arg); - void MAXPD(X64Reg regOp, OpArg arg); + void CMPPD(X64Reg regOp, OpArg arg, u8 compare); + void MULPS(X64Reg regOp, OpArg arg); + void MULPD(X64Reg regOp, OpArg arg); + void DIVPS(X64Reg regOp, OpArg arg); + void DIVPD(X64Reg regOp, OpArg arg); + void MINPS(X64Reg regOp, OpArg arg); + void MINPD(X64Reg regOp, OpArg arg); + void MAXPS(X64Reg regOp, OpArg arg); + void MAXPD(X64Reg regOp, OpArg arg); void SQRTPS(X64Reg regOp, OpArg arg); void SQRTPD(X64Reg regOp, OpArg arg); void RSQRTPS(X64Reg regOp, OpArg arg); void RSQRTPD(X64Reg regOp, OpArg arg); + + // SSE/SSE2: Floating point packed bitwise (x4 for float, x2 for double) + void ANDPS(X64Reg regOp, OpArg arg); + void ANDPD(X64Reg regOp, OpArg arg); + void ANDNPS(X64Reg 
regOp, OpArg arg); + void ANDNPD(X64Reg regOp, OpArg arg); + void ORPS(X64Reg regOp, OpArg arg); + void ORPD(X64Reg regOp, OpArg arg); + void XORPS(X64Reg regOp, OpArg arg); + void XORPD(X64Reg regOp, OpArg arg); + + // SSE/SSE2: Shuffle components. These are tricky - see Intel documentation. void SHUFPS(X64Reg regOp, OpArg arg, u8 shuffle); void SHUFPD(X64Reg regOp, OpArg arg, u8 shuffle); - + + // SSE/SSE2: Useful alternative to shuffle in some cases. void MOVDDUP(X64Reg regOp, OpArg arg); + void UNPCKLPD(X64Reg dest, OpArg src); + void UNPCKHPD(X64Reg dest, OpArg src); + + // SSE/SSE2: Compares. void COMISS(X64Reg regOp, OpArg arg); void COMISD(X64Reg regOp, OpArg arg); void UCOMISS(X64Reg regOp, OpArg arg); void UCOMISD(X64Reg regOp, OpArg arg); + // SSE/SSE2: Moves. Use the right data type for your data, in most cases. void MOVAPS(X64Reg regOp, OpArg arg); void MOVAPD(X64Reg regOp, OpArg arg); void MOVAPS(OpArg arg, X64Reg regOp); @@ -435,20 +480,20 @@ namespace Gen void MOVSS(OpArg arg, X64Reg regOp); void MOVSD(OpArg arg, X64Reg regOp); - void MOVMSKPS(X64Reg dest, OpArg arg); - void MOVMSKPD(X64Reg dest, OpArg arg); - void MOVD_xmm(X64Reg dest, const OpArg &arg); void MOVQ_xmm(X64Reg dest, OpArg arg); void MOVD_xmm(const OpArg &arg, X64Reg src); void MOVQ_xmm(OpArg arg, X64Reg src); + // SSE/SSE2: Generates a mask from the high bits of the components of the packed register in question. + void MOVMSKPS(X64Reg dest, OpArg arg); + void MOVMSKPD(X64Reg dest, OpArg arg); + + // SSE2: Selective byte store, mask in src register. EDI/RDI specifies store address. This is a weird one. void MASKMOVDQU(X64Reg dest, X64Reg src); void LDDQU(X64Reg dest, OpArg src); - void UNPCKLPD(X64Reg dest, OpArg src); - void UNPCKHPD(X64Reg dest, OpArg src); - + // SSE/SSE2: Data type conversions. void CVTPS2PD(X64Reg dest, OpArg src); void CVTPD2PS(X64Reg dest, OpArg src); void CVTSS2SD(X64Reg dest, OpArg src); @@ -458,7 +503,7 @@ namespace Gen void CVTPD2DQ(X64Reg regOp, OpArg arg); void CVTDQ2PS(X64Reg regOp, const OpArg &arg); - //Integer SSE instructions + // SSE2: Packed integer instructions void PACKSSDW(X64Reg dest, OpArg arg); void PACKSSWB(X64Reg dest, OpArg arg); //void PACKUSDW(X64Reg dest, OpArg arg); @@ -528,42 +573,138 @@ namespace Gen void RTDSC(); -void CallCdeclFunction3(void* fnptr, u32 arg0, u32 arg1, u32 arg2); -void CallCdeclFunction4(void* fnptr, u32 arg0, u32 arg1, u32 arg2, u32 arg3); -void CallCdeclFunction5(void* fnptr, u32 arg0, u32 arg1, u32 arg2, u32 arg3, u32 arg4); -void CallCdeclFunction6(void* fnptr, u32 arg0, u32 arg1, u32 arg2, u32 arg3, u32 arg4, u32 arg5); + // Utility functions + // These only support u32 parameters, but that's enough for a lot of uses. + // These will destroy the 1 or 2 first "parameter regs". + void ABI_CallFunctionC(void *func, u32 param1); + void ABI_CallFunctionCC(void *func, u32 param1, u32 param2); + void ABI_CallFunctionAC(void *func, const Gen::OpArg &arg1, u32 param2); + + // Pass a register as a paremeter. + void ABI_CallFunctionR(void *func, Gen::X64Reg reg1); + void ABI_CallFunctionRR(void *func, Gen::X64Reg reg1, Gen::X64Reg reg2); + + // A function that doesn't have any control over what it will do to regs, + // such as the dispatcher, should be surrounded by these. + void ABI_PushAllCalleeSavedRegsAndAdjustStack(); + void ABI_PopAllCalleeSavedRegsAndAdjustStack(); + + // A function that doesn't know anything about it's surroundings, should + // be surrounded by these to establish a safe environment, where it can roam free. 
+ // An example is a backpatch injected function. + void ABI_PushAllCallerSavedRegsAndAdjustStack(); + void ABI_PopAllCallerSavedRegsAndAdjustStack(); + + unsigned int ABI_GetAlignedFrameSize(unsigned int frameSize); + void ABI_AlignStack(unsigned int frameSize); + void ABI_RestoreStack(unsigned int frameSize); + + // Sets up a __cdecl function. + // Only x64 really needs the parameter. + void ABI_EmitPrologue(int maxCallParams); + void ABI_EmitEpilogue(int maxCallParams); + + #ifdef _M_IX86 + inline int ABI_GetNumXMMRegs() { return 8; } + #else + inline int ABI_GetNumXMMRegs() { return 16; } + #endif + + // Strange call wrappers. + void CallCdeclFunction3(void* fnptr, u32 arg0, u32 arg1, u32 arg2); + void CallCdeclFunction4(void* fnptr, u32 arg0, u32 arg1, u32 arg2, u32 arg3); + void CallCdeclFunction5(void* fnptr, u32 arg0, u32 arg1, u32 arg2, u32 arg3, u32 arg4); + void CallCdeclFunction6(void* fnptr, u32 arg0, u32 arg1, u32 arg2, u32 arg3, u32 arg4, u32 arg5); #if defined(_M_IX86) || !defined(_WIN32) -#define CallCdeclFunction3_I(a,b,c,d) CallCdeclFunction3((void *)(a), (b), (c), (d)) -#define CallCdeclFunction4_I(a,b,c,d,e) CallCdeclFunction4((void *)(a), (b), (c), (d), (e)) -#define CallCdeclFunction5_I(a,b,c,d,e,f) CallCdeclFunction5((void *)(a), (b), (c), (d), (e), (f)) -#define CallCdeclFunction6_I(a,b,c,d,e,f,g) CallCdeclFunction6((void *)(a), (b), (c), (d), (e), (f), (g)) + #define CallCdeclFunction3_I(a,b,c,d) CallCdeclFunction3((void *)(a), (b), (c), (d)) + #define CallCdeclFunction4_I(a,b,c,d,e) CallCdeclFunction4((void *)(a), (b), (c), (d), (e)) + #define CallCdeclFunction5_I(a,b,c,d,e,f) CallCdeclFunction5((void *)(a), (b), (c), (d), (e), (f)) + #define CallCdeclFunction6_I(a,b,c,d,e,f,g) CallCdeclFunction6((void *)(a), (b), (c), (d), (e), (f), (g)) -#define DECLARE_IMPORT(x) + #define DECLARE_IMPORT(x) #else -// Comments from VertexLoader.cpp about these horrors: + // Comments from VertexLoader.cpp about these horrors: -// This is a horrible hack that is necessary in 64-bit mode because Opengl32.dll is based way, way above the 32-bit -// address space that is within reach of a CALL, and just doing &fn gives us these high uncallable addresses. So we -// want to grab the function pointers from the import table instead. + // This is a horrible hack that is necessary in 64-bit mode because Opengl32.dll is based way, way above the 32-bit + // address space that is within reach of a CALL, and just doing &fn gives us these high uncallable addresses. So we + // want to grab the function pointers from the import table instead. 
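To make the intent of these call wrappers a little more concrete, here is a rough usage sketch. It is not part of the patch; SomeImportedFunction and MyGenerator are invented names, and the real callers (the OpenGL plugin's vertex loader, per the comment below) presumably follow the same shape from inside a Gen::XCodeBlock-derived class. The 64-bit import wrappers and macros themselves follow right below.

    // Illustrative sketch only - names marked "hypothetical" do not exist in the tree.
    extern "C" void SomeImportedFunction(u32 a, u32 b, u32 c);  // hypothetical DLL export (used on the x86 path)
    DECLARE_IMPORT(SomeImportedFunction);  // no-op on x86/non-Windows; on Win64 declares extern "C" void *__imp_SomeImportedFunction

    class MyGenerator : public Gen::XCodeBlock   // hypothetical code generator
    {
    public:
        void EmitTheCall();
    };

    void MyGenerator::EmitTheCall()
    {
        // On x86/non-Windows this expands to a direct CallCdeclFunction3((void *)SomeImportedFunction, ...).
        // On 64-bit Windows it expands to ___CallCdeclImport3(&__imp_SomeImportedFunction, ...), which
        // calls indirectly through the import-table slot instead of emitting an out-of-range direct CALL.
        CallCdeclFunction3_I(SomeImportedFunction, 1, 2, 3);
    }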
-void ___CallCdeclImport3(void* impptr, u32 arg0, u32 arg1, u32 arg2); -void ___CallCdeclImport4(void* impptr, u32 arg0, u32 arg1, u32 arg2, u32 arg3); -void ___CallCdeclImport5(void* impptr, u32 arg0, u32 arg1, u32 arg2, u32 arg3, u32 arg4); -void ___CallCdeclImport6(void* impptr, u32 arg0, u32 arg1, u32 arg2, u32 arg3, u32 arg4, u32 arg5); + void ___CallCdeclImport3(void* impptr, u32 arg0, u32 arg1, u32 arg2); + void ___CallCdeclImport4(void* impptr, u32 arg0, u32 arg1, u32 arg2, u32 arg3); + void ___CallCdeclImport5(void* impptr, u32 arg0, u32 arg1, u32 arg2, u32 arg3, u32 arg4); + void ___CallCdeclImport6(void* impptr, u32 arg0, u32 arg1, u32 arg2, u32 arg3, u32 arg4, u32 arg5); -#define CallCdeclFunction3_I(a,b,c,d) ___CallCdeclImport3(&__imp_##a,b,c,d) -#define CallCdeclFunction4_I(a,b,c,d,e) ___CallCdeclImport4(&__imp_##a,b,c,d,e) -#define CallCdeclFunction5_I(a,b,c,d,e,f) ___CallCdeclImport5(&__imp_##a,b,c,d,e,f) -#define CallCdeclFunction6_I(a,b,c,d,e,f,g) ___CallCdeclImport6(&__imp_##a,b,c,d,e,f,g) + #define CallCdeclFunction3_I(a,b,c,d) ___CallCdeclImport3(&__imp_##a,b,c,d) + #define CallCdeclFunction4_I(a,b,c,d,e) ___CallCdeclImport4(&__imp_##a,b,c,d,e) + #define CallCdeclFunction5_I(a,b,c,d,e,f) ___CallCdeclImport5(&__imp_##a,b,c,d,e,f) + #define CallCdeclFunction6_I(a,b,c,d,e,f,g) ___CallCdeclImport6(&__imp_##a,b,c,d,e,f,g) -#define DECLARE_IMPORT(x) extern "C" void *__imp_##x + #define DECLARE_IMPORT(x) extern "C" void *__imp_##x #endif +}; // class XEmitter -} + +// Everything that needs to generate X86 code should inherit from this. +// You get memory management for free, plus, you can use all the MOV etc functions without +// having to prefix them with gen-> or something similar. +class XCodeBlock : public XEmitter +{ +protected: + u8 *region; + size_t region_size; + +public: + XCodeBlock() : region(NULL), region_size(0) {} + virtual ~XCodeBlock() { if (region) FreeCodeSpace(); } + + // Call this before you generate any code. + void AllocCodeSpace(int size) + { + region_size = size; + region = (u8*)AllocateExecutableMemory(region_size); + SetCodePtr(region); + } + + // Always clear code space with breakpoints, so that if someone accidentally executes + // uninitialized, it just breaks into the debugger. + void ClearCodeSpace() + { + // x86/64: 0xCC = breakpoint + memset(region, 0xCC, region_size); + ResetCodePtr(); + } + + // Call this when shutting down. Don't rely on the destructor, even though it'll do the job. + void FreeCodeSpace() + { + FreeMemoryPages(region, region_size); + region = NULL; + region_size = 0; + } + + // Cannot currently be undone. Will write protect the entire code region. + // Start over if you need to change the code (call FreeCodeSpace(), AllocCodeSpace()). 
+ void WriteProtect() + { + WriteProtectMemory(region, region_size, true); + } + + void ResetCodePtr() + { + SetCodePtr(region); + } + + size_t GetSpaceLeft() const + { + return region_size - (GetCodePtr() - region); + } +}; + +} // namespace #endif diff --git a/Source/Core/Core/Src/HW/HW.cpp b/Source/Core/Core/Src/HW/HW.cpp index 879999bc31..1321e6786f 100644 --- a/Source/Core/Core/Src/HW/HW.cpp +++ b/Source/Core/Core/Src/HW/HW.cpp @@ -46,7 +46,7 @@ namespace HW { CoreTiming::Init(); - Thunk_Init(); // not really hw, but this way we know it's inited early :P + thunks.Init(); // not really hw, but this way we know it's inited early :P State_Init(); // Init the whole Hardware @@ -88,7 +88,7 @@ namespace HW } State_Shutdown(); - Thunk_Shutdown(); + thunks.Shutdown(); CoreTiming::Shutdown(); } diff --git a/Source/Core/Core/Src/MemTools.cpp b/Source/Core/Core/Src/MemTools.cpp index 9123186186..b9e1539352 100644 --- a/Source/Core/Core/Src/MemTools.cpp +++ b/Source/Core/Core/Src/MemTools.cpp @@ -104,7 +104,7 @@ LONG NTAPI Handler(PEXCEPTION_POINTERS pPtrs) //We could emulate the memory accesses here, but then they would still be around to take up //execution resources. Instead, we backpatch into a generic memory call and retry. - u8 *new_rip = jit.BackPatch(codePtr, accessType, emAddress, ctx); + const u8 *new_rip = jit.BackPatch(codePtr, accessType, emAddress, ctx); // Rip/Eip needs to be updated. if (new_rip) diff --git a/Source/Core/Core/Src/PowerPC/Jit64/Jit.cpp b/Source/Core/Core/Src/PowerPC/Jit64/Jit.cpp index 21427735cc..ff27283b82 100644 --- a/Source/Core/Core/Src/PowerPC/Jit64/Jit.cpp +++ b/Source/Core/Core/Src/PowerPC/Jit64/Jit.cpp @@ -164,6 +164,8 @@ ps_adds1 Jit64 jit; PPCAnalyst::CodeBuffer code_buffer(32000); +int CODE_SIZE = 1024*1024*16; + namespace CPUCompare { extern u32 m_BlockStart; @@ -171,6 +173,11 @@ namespace CPUCompare void Jit64::Init() { + if (Core::g_CoreStartupParameter.bJITUnlimitedCache) + { + CODE_SIZE = 1024*1024*8*8; + } + jo.optimizeStack = true; jo.enableBlocklink = true; // Speed boost, but not 100% safe #ifdef _M_X64 @@ -182,6 +189,23 @@ namespace CPUCompare jo.fpAccurateFlags = true; jo.optimizeGatherPipe = true; jo.fastInterrupts = false; + + gpr.SetEmitter(this); + fpr.SetEmitter(this); + + trampolines.Init(); + AllocCodeSpace(CODE_SIZE); + InitCache(); + asm_routines.Init(); + } + + void Jit64::Shutdown() + { + FreeCodeSpace(); + ShutdownCache(); + + trampolines.Shutdown(); + asm_routines.Shutdown(); } void Jit64::WriteCallInterpreter(UGeckoInstruction _inst) @@ -271,7 +295,7 @@ namespace CPUCompare else { MOV(32, M(&PC), Imm32(destination)); - JMP(Asm::dispatcher, true); + JMP(asm_routines.dispatcher, true); } } @@ -280,7 +304,7 @@ namespace CPUCompare MOV(32, M(&PC), R(EAX)); Cleanup(); SUB(32, M(&CoreTiming::downcount), js.downcountAmount > 127 ? Imm32(js.downcountAmount) : Imm8(js.downcountAmount)); - JMP(Asm::dispatcher, true); + JMP(asm_routines.dispatcher, true); } void Jit64::WriteRfiExitDestInEAX() @@ -288,7 +312,7 @@ namespace CPUCompare MOV(32, M(&PC), R(EAX)); Cleanup(); SUB(32, M(&CoreTiming::downcount), js.downcountAmount > 127 ? 
Imm32(js.downcountAmount) : Imm8(js.downcountAmount)); - JMP(Asm::testExceptions, true); + JMP(asm_routines.testExceptions, true); } void Jit64::WriteExceptionExit(u32 exception) @@ -296,7 +320,7 @@ namespace CPUCompare Cleanup(); OR(32, M(&PowerPC::ppcState.Exceptions), Imm32(exception)); MOV(32, M(&PC), Imm32(js.compilerPC + 4)); - JMP(Asm::testExceptions, true); + JMP(asm_routines.testExceptions, true); } const u8* Jit64::DoJit(u32 emaddress, JitBlock &b) @@ -326,11 +350,13 @@ namespace CPUCompare // Downcount flag check. The last block decremented downcounter, and the flag should still be available. FixupBranch skip = J_CC(CC_NBE); MOV(32, M(&PC), Imm32(js.blockStart)); - JMP(Asm::doTiming, true); // downcount hit zero - go doTiming. + JMP(asm_routines.doTiming, true); // downcount hit zero - go doTiming. SetJumpTarget(skip); const u8 *normalEntry = GetCodePtr(); - if (ImHereDebug) CALL((void *)&ImHere); //Used to get a trace of the last few blocks before a crash, sometimes VERY useful + + if (ImHereDebug) + CALL((void *)&ImHere); //Used to get a trace of the last few blocks before a crash, sometimes VERY useful if (js.fpa.any) { @@ -338,7 +364,7 @@ namespace CPUCompare TEST(32, M(&PowerPC::ppcState.msr), Imm32(1 << 13)); //Test FP enabled bit FixupBranch b1 = J_CC(CC_NZ); MOV(32, M(&PC), Imm32(js.blockStart)); - JMP(Asm::fpException, true); + JMP(asm_routines.fpException, true); SetJumpTarget(b1); } @@ -348,7 +374,7 @@ namespace CPUCompare TEST(32, M(&PowerPC::ppcState.Exceptions), Imm32(0xFFFFFFFF)); FixupBranch b1 = J_CC(CC_Z); MOV(32, M(&PC), Imm32(js.blockStart)); - JMP(Asm::testExceptions, true); + JMP(asm_routines.testExceptions, true); SetJumpTarget(b1); } @@ -404,7 +430,7 @@ namespace CPUCompare if (jo.optimizeGatherPipe && js.fifoBytesThisBlock >= 32) { js.fifoBytesThisBlock -= 32; - CALL(ProtectFunction((void *)&GPFifo::CheckGatherPipe, 0)); + CALL(thunks.ProtectFunction((void *)&GPFifo::CheckGatherPipe, 0)); } PPCTables::CompileInstruction(ops[i].inst); diff --git a/Source/Core/Core/Src/PowerPC/Jit64/Jit.h b/Source/Core/Core/Src/PowerPC/Jit64/Jit.h index c93dbf1a58..55e06fe4ba 100644 --- a/Source/Core/Core/Src/PowerPC/Jit64/Jit.h +++ b/Source/Core/Core/Src/PowerPC/Jit64/Jit.h @@ -24,7 +24,9 @@ #include "../PPCAnalyst.h" #include "JitCache.h" +#include "JitRegCache.h" #include "x64Emitter.h" +#include "x64Analyzer.h" #ifdef _WIN32 @@ -47,8 +49,24 @@ struct CONTEXT #endif -class Jit64 + +class TrampolineCache : public Gen::XCodeBlock { +public: + void Init(); + void Shutdown(); + + const u8 *GetReadTrampoline(const InstructionInfo &info); + const u8 *GetWriteTrampoline(const InstructionInfo &info); +}; + + +class Jit64 : public Gen::XCodeBlock +{ + TrampolineCache trampolines; + GPRRegCache gpr; + FPURegCache fpr; + public: typedef void (*CompiledCode)(); @@ -157,7 +175,7 @@ public: bool RangeIntersect(int s1, int e1, int s2, int e2) const; bool IsInJitCode(const u8 *codePtr); - u8 *BackPatch(u8 *codePtr, int accessType, u32 emAddress, CONTEXT *ctx); + const u8 *BackPatch(u8 *codePtr, int accessType, u32 emAddress, CONTEXT *ctx); #define JIT_OPCODE 0 @@ -165,6 +183,7 @@ public: const u8* DoJit(u32 emaddress, JitBlock &b); void Init(); + void Shutdown(); // Utilities for use by opcodes @@ -188,10 +207,10 @@ public: void ForceSinglePrecisionP(Gen::X64Reg xmm); void JitClearCA(); void JitSetCA(); - void tri_op(int d, int a, int b, bool reversible, void (*op)(Gen::X64Reg, Gen::OpArg)); + void tri_op(int d, int a, int b, bool reversible, void (XEmitter::*op)(Gen::X64Reg, 
Gen::OpArg)); typedef u32 (*Operation)(u32 a, u32 b); - void regimmop(int d, int a, bool binary, u32 value, Operation doop, void(*op)(int, const Gen::OpArg&, const Gen::OpArg&), bool Rc = false, bool carry = false); - void fp_tri_op(int d, int a, int b, bool reversible, bool dupe, void (*op)(Gen::X64Reg, Gen::OpArg)); + void regimmop(int d, int a, bool binary, u32 value, Operation doop, void (XEmitter::*op)(int, const Gen::OpArg&, const Gen::OpArg&), bool Rc = false, bool carry = false); + void fp_tri_op(int d, int a, int b, bool reversible, bool dupe, void (XEmitter::*op)(Gen::X64Reg, Gen::OpArg)); // OPCODES diff --git a/Source/Core/Core/Src/PowerPC/Jit64/JitAsm.cpp b/Source/Core/Core/Src/PowerPC/Jit64/JitAsm.cpp index 35c33ca3a8..0877ab9a05 100644 --- a/Source/Core/Core/Src/PowerPC/Jit64/JitAsm.cpp +++ b/Source/Core/Core/Src/PowerPC/Jit64/JitAsm.cpp @@ -31,27 +31,12 @@ #include "../../HW/CPUCompare.h" #include "../../HW/GPFifo.h" #include "../../Core.h" +#include "JitAsm.h" using namespace Gen; int blocksExecuted; -namespace Asm -{ -const u8 *enterCode; -const u8 *testExceptions; -const u8 *fpException; -const u8 *doTiming; -const u8 *dispatcher; -const u8 *dispatcherNoCheck; -const u8 *dispatcherPcInEAX; -const u8 *computeRc; -const u8 *computeRcFp; - -const u8 *fifoDirectWrite8; -const u8 *fifoDirectWrite16; -const u8 *fifoDirectWrite32; -const u8 *fifoDirectWriteFloat; -const u8 *fifoDirectWriteXmm64; +static int temp32; bool compareEnabled = false; @@ -72,16 +57,15 @@ static bool enableStatistics = false; //RBX - Base pointer of memory //R15 - Pointer to array of block pointers +AsmRoutineManager asm_routines; // PLAN: no more block numbers - crazy opcodes just contain offset within // dynarec buffer // At this offset - 4, there is an int specifying the block number. -void GenerateCommon(); - #ifdef _M_IX86 -void Generate() +void AsmRoutineManager::Generate() { enterCode = AlignCode16(); PUSH(EBP); @@ -129,7 +113,6 @@ void Generate() ADD(32, M(&PowerPC::ppcState.DebugCount), Imm8(1)); } //grab from list and jump to it - //INT3(); MOV(32, R(EDX), ImmPtr(jit.GetCodePointers())); JMPptr(MComplex(EDX, EAX, 4, 0)); SetJumpTarget(notfound); @@ -180,12 +163,14 @@ void Generate() #elif defined(_M_X64) -void Generate() +void AsmRoutineManager::Generate() { enterCode = AlignCode16(); ABI_PushAllCalleeSavedRegsAndAdjustStack(); + if (!jit.GetCodePointers() || !Memory::base) + PanicAlert("Memory::base and jit.GetCodePointers() must return valid values"); MOV(64, R(RBX), Imm64((u64)Memory::base)); MOV(64, R(R15), Imm64((u64)jit.GetCodePointers())); //It's below 2GB so 32 bits are good enough const u8 *outerLoop = GetCodePtr(); @@ -264,7 +249,7 @@ void Generate() } #endif -void GenFifoWrite(int size) +void AsmRoutineManager::GenFifoWrite(int size) { // Assume value in ABI_PARAM1 PUSH(ESI); @@ -287,8 +272,7 @@ void GenFifoWrite(int size) RET(); } -static int temp32; -void GenFifoFloatWrite() +void AsmRoutineManager::GenFifoFloatWrite() { // Assume value in XMM0 PUSH(ESI); @@ -306,7 +290,7 @@ void GenFifoFloatWrite() RET(); } -void GenFifoXmm64Write() +void AsmRoutineManager::GenFifoXmm64Write() { // Assume value in XMM0. Assume pre-byteswapped (unlike the others here!) 
PUSH(ESI); @@ -319,7 +303,7 @@ void GenFifoXmm64Write() RET(); } -void GenerateCommon() +void AsmRoutineManager::GenerateCommon() { // USES_CR computeRc = AlignCode16(); @@ -364,5 +348,3 @@ void GenerateCommon() SetJumpTarget(skip_fast_write); CALL((void *)&Memory::Write_U8);*/ } - -} // namespace Asm diff --git a/Source/Core/Core/Src/PowerPC/Jit64/JitAsm.h b/Source/Core/Core/Src/PowerPC/Jit64/JitAsm.h index b2d0b4620a..4eac598057 100644 --- a/Source/Core/Core/Src/PowerPC/Jit64/JitAsm.h +++ b/Source/Core/Core/Src/PowerPC/Jit64/JitAsm.h @@ -14,33 +14,71 @@ // Official SVN repository and contact information can be found at // http://code.google.com/p/dolphin-emu/ + #ifndef _JITASM_H #define _JITASM_H -namespace Asm +#include "x64Emitter.h" + +// In Dolphin, we don't use inline assembly. Instead, we generate all machine-near +// code at runtime. In the case of fixed code like this, after writing it, we write +// protect the memory, essentially making it work just like precompiled code. + +// There are some advantages to this approach: +// 1) No need to setup an external assembler in the build. +// 2) Cross platform, as long as it's x86/x64. +// 3) Can optimize code at runtime for the specific CPU model. +// There aren't really any disadvantages other than having to maintain a x86 emitter, +// which we have to do anyway :) +// +// To add a new asm routine, just add another const here, and add the code to Generate. +// Also, possibly increase the size of the code buffer. + +class AsmRoutineManager : public Gen::XCodeBlock { - extern const u8 *enterCode; - - extern const u8 *dispatcher; - extern const u8 *dispatcherNoCheck; - extern const u8 *dispatcherPcInEAX; - - extern const u8 *fpException; - extern const u8 *computeRc; - extern const u8 *computeRcFp; - extern const u8 *testExceptions; - extern const u8 *dispatchPcInEAX; - extern const u8 *doTiming; - - extern const u8 *fifoDirectWrite8; - extern const u8 *fifoDirectWrite16; - extern const u8 *fifoDirectWrite32; - extern const u8 *fifoDirectWriteFloat; - extern const u8 *fifoDirectWriteXmm64; - - extern bool compareEnabled; +private: void Generate(); -} + void GenerateCommon(); + void GenFifoWrite(int size); + void GenFifoFloatWrite(); + void GenFifoXmm64Write(); + +public: + void Init() { + AllocCodeSpace(8192); + Generate(); + WriteProtect(); + } + + void Shutdown() { + FreeCodeSpace(); + } + + + // Public generated functions. Just CALL(M((void*)func)) them. 
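For concreteness, a minimal sketch of what the comment above describes, before the list of generated-routine pointers: add a pointer member, emit the routine's body, then call the stored pointer from JIT code. This is illustrative only and not part of the patch; GenerateExtra and myRoutine are invented names, while AlignCode16, NOP, RET, CALL and asm_routines come from x64Emitter.h / JitAsm.h as shown elsewhere in this change.

    // Hypothetical sketch: assumes "const u8 *myRoutine;" and a GenerateExtra() declaration
    // were added to AsmRoutineManager alongside the members below.
    void AsmRoutineManager::GenerateExtra()
    {
        myRoutine = AlignCode16();   // remember where the routine starts in the code block
        NOP();                       // placeholder body - a real routine emits useful instructions here
        RET();
    }

    // ...and from generated JIT code, call it through the stored pointer:
    //     CALL((void *)asm_routines.myRoutine);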
+
+	const u8 *enterCode;
+
+	const u8 *dispatcher;
+	const u8 *dispatcherNoCheck;
+	const u8 *dispatcherPcInEAX;
+
+	const u8 *fpException;
+	const u8 *computeRc;
+	const u8 *computeRcFp;
+	const u8 *testExceptions;
+	const u8 *dispatchPcInEAX;
+	const u8 *doTiming;
+
+	const u8 *fifoDirectWrite8;
+	const u8 *fifoDirectWrite16;
+	const u8 *fifoDirectWrite32;
+	const u8 *fifoDirectWriteFloat;
+	const u8 *fifoDirectWriteXmm64;
+
+	bool compareEnabled;
+};
+
+extern AsmRoutineManager asm_routines;
 
 #endif
-
diff --git a/Source/Core/Core/Src/PowerPC/Jit64/JitBackpatch.cpp b/Source/Core/Core/Src/PowerPC/Jit64/JitBackpatch.cpp
index b0e1f5dfa2..73b6175f45 100644
--- a/Source/Core/Core/Src/PowerPC/Jit64/JitBackpatch.cpp
+++ b/Source/Core/Core/Src/PowerPC/Jit64/JitBackpatch.cpp
@@ -33,7 +33,7 @@ using namespace Gen;
 
 extern u8 *trampolineCodePtr;
- 
+
 void BackPatchError(const std::string &text, u8 *codePtr, u32 emAddress) {
 	u64 code_addr = (u64)codePtr;
 	disassembler disasm;
@@ -51,17 +51,105 @@ void BackPatchError(const std::string &text, u8 *codePtr, u32 emAddress) {
 	return;
 }
+
+void TrampolineCache::Init()
+{
+	AllocCodeSpace(1024 * 1024);
+}
+
+void TrampolineCache::Shutdown()
+{
+	FreeCodeSpace();
+}
+
+// Extremely simplistic - just generate the requested trampoline. May reuse them in the future.
+const u8 *TrampolineCache::GetReadTrampoline(const InstructionInfo &info)
+{
+	if (GetSpaceLeft() < 1024)
+		PanicAlert("Trampoline cache full");
+
+	X64Reg addrReg = (X64Reg)info.scaledReg;
+	X64Reg dataReg = (X64Reg)info.regOperandReg;
+	const u8 *trampoline = GetCodePtr();
+#ifdef _M_X64
+	// It's a read. Easy.
+	ABI_PushAllCallerSavedRegsAndAdjustStack();
+	if (addrReg != ABI_PARAM1)
+		MOV(32, R(ABI_PARAM1), R((X64Reg)addrReg));
+	if (info.displacement) {
+		ADD(32, R(ABI_PARAM1), Imm32(info.displacement));
+	}
+	switch (info.operandSize) {
+	case 4:
+		CALL(thunks.ProtectFunction((void *)&Memory::Read_U32, 1));
+		break;
+	}
+	ABI_PopAllCallerSavedRegsAndAdjustStack();
+	MOV(32, R(dataReg), R(EAX));
+	RET();
+#endif
+	return trampoline;
+}
+
+// Extremely simplistic - just generate the requested trampoline. May reuse them in the future.
+const u8 *TrampolineCache::GetWriteTrampoline(const InstructionInfo &info)
+{
+	if (GetSpaceLeft() < 1024)
+		PanicAlert("Trampoline cache full");
+
+	X64Reg addrReg = (X64Reg)info.scaledReg;
+	X64Reg dataReg = (X64Reg)info.regOperandReg;
+	if (dataReg != EAX)
+		PanicAlert("Backpatch write - not through EAX");
+
+	const u8 *trampoline = GetCodePtr();
+
+#ifdef _M_X64
+
+	// It's a write. Yay. Remember that we don't have to be super efficient since it's "just" a
+	// hardware access - we can take shortcuts.
+ //if (emAddress == 0xCC008000) + // PanicAlert("caught a fifo write"); + CMP(32, R(addrReg), Imm32(0xCC008000)); + FixupBranch skip_fast = J_CC(CC_NE, false); + MOV(32, R(ABI_PARAM1), R((X64Reg)dataReg)); + CALL((void*)asm_routines.fifoDirectWrite32); + RET(); + SetJumpTarget(skip_fast); + ABI_PushAllCallerSavedRegsAndAdjustStack(); + if (addrReg != ABI_PARAM1) { + MOV(32, R(ABI_PARAM1), R((X64Reg)dataReg)); + MOV(32, R(ABI_PARAM2), R((X64Reg)addrReg)); + } else { + MOV(32, R(ABI_PARAM2), R((X64Reg)addrReg)); + MOV(32, R(ABI_PARAM1), R((X64Reg)dataReg)); + } + if (info.displacement) { + ADD(32, R(ABI_PARAM2), Imm32(info.displacement)); + } + switch (info.operandSize) { + case 4: + CALL(thunks.ProtectFunction((void *)&Memory::Write_U32, 2)); + break; + } + ABI_PopAllCallerSavedRegsAndAdjustStack(); + RET(); +#endif + + return trampoline; +} + + // This generates some fairly heavy trampolines, but: // 1) It's really necessary. We don't know anything about the context. // 2) It doesn't really hurt. Only instructions that access I/O will get these, and there won't be // that many of them in a typical program/game. -u8 *Jit64::BackPatch(u8 *codePtr, int accessType, u32 emAddress, CONTEXT *ctx) +const u8 *Jit64::BackPatch(u8 *codePtr, int accessType, u32 emAddress, CONTEXT *ctx) { #ifdef _M_X64 if (!IsInJitCode(codePtr)) return 0; // this will become a regular crash real soon after this - u8 *oldCodePtr = GetWritableCodePtr(); InstructionInfo info; if (!DisassembleMov(codePtr, info, accessType)) { BackPatchError("BackPatch - failed to disassemble MOV instruction", codePtr, emAddress); @@ -81,108 +169,42 @@ u8 *Jit64::BackPatch(u8 *codePtr, int accessType, u32 emAddress, CONTEXT *ctx) BackPatchError(StringFromFormat("BackPatch - no support for operand size %i", info.operandSize), codePtr, emAddress); } - X64Reg addrReg = (X64Reg)info.scaledReg; - X64Reg dataReg = (X64Reg)info.regOperandReg; if (info.otherReg != RBX) PanicAlert("BackPatch : Base reg not RBX." "\n\nAttempted to access %08x.", emAddress); - //if (accessType == OP_ACCESS_WRITE) - // PanicAlert("BackPatch : Currently only supporting reads." - // "\n\nAttempted to write to %08x.", emAddress); - - // OK, let's write a trampoline, and a jump to it. - // Later, let's share trampolines. + + if (accessType == OP_ACCESS_WRITE) + PanicAlert("BackPatch : Currently only supporting reads." + "\n\nAttempted to write to %08x.", emAddress); // In the first iteration, we assume that all accesses are 32-bit. We also only deal with reads. - // Next step - support writes, special case FIFO writes. Also, support 32-bit mode. - u8 *trampoline = trampolineCodePtr; - SetCodePtr(trampolineCodePtr); - if (accessType == 0) { - // It's a read. Easy. 
- ABI_PushAllCallerSavedRegsAndAdjustStack(); - if (addrReg != ABI_PARAM1) - MOV(32, R(ABI_PARAM1), R((X64Reg)addrReg)); - if (info.displacement) { - ADD(32, R(ABI_PARAM1), Imm32(info.displacement)); - } - switch (info.operandSize) { - case 4: - CALL(ProtectFunction((void *)&Memory::Read_U32, 1)); - break; - default: - BackPatchError(StringFromFormat("We don't handle the size %i yet in backpatch", info.operandSize), codePtr, emAddress); - break; - } - ABI_PopAllCallerSavedRegsAndAdjustStack(); - MOV(32, R(dataReg), R(EAX)); - RET(); - trampolineCodePtr = GetWritableCodePtr(); - - SetCodePtr(codePtr); + XEmitter emitter(codePtr); int bswapNopCount; // Check the following BSWAP for REX byte - if ((GetCodePtr()[info.instructionSize] & 0xF0) == 0x40) + if ((codePtr[info.instructionSize] & 0xF0) == 0x40) bswapNopCount = 3; else bswapNopCount = 2; - CALL(trampoline); - NOP((int)info.instructionSize + bswapNopCount - 5); - SetCodePtr(oldCodePtr); - + const u8 *trampoline = trampolines.GetReadTrampoline(info); + emitter.CALL((void *)trampoline); + emitter.NOP((int)info.instructionSize + bswapNopCount - 5); return codePtr; } else if (accessType == 1) { - // It's a write. Yay. Remember that we don't have to be super efficient since it's "just" a - // hardware access - we can take shortcuts. - //if (emAddress == 0xCC008000) - // PanicAlert("caught a fifo write"); - if (dataReg != EAX) - PanicAlert("Backpatch write - not through EAX"); - CMP(32, R(addrReg), Imm32(0xCC008000)); - FixupBranch skip_fast = J_CC(CC_NE, false); - MOV(32, R(ABI_PARAM1), R((X64Reg)dataReg)); - CALL((void*)Asm::fifoDirectWrite32); - RET(); - SetJumpTarget(skip_fast); - ABI_PushAllCallerSavedRegsAndAdjustStack(); - if (addrReg != ABI_PARAM1) { - //INT3(); - MOV(32, R(ABI_PARAM1), R((X64Reg)dataReg)); - MOV(32, R(ABI_PARAM2), R((X64Reg)addrReg)); - } else { - MOV(32, R(ABI_PARAM2), R((X64Reg)addrReg)); - MOV(32, R(ABI_PARAM1), R((X64Reg)dataReg)); - } - if (info.displacement) { - ADD(32, R(ABI_PARAM2), Imm32(info.displacement)); - } - switch (info.operandSize) { - case 4: - CALL(ProtectFunction((void *)&Memory::Write_U32, 2)); - break; - default: - BackPatchError(StringFromFormat("We don't handle the size %i yet in backpatch", info.operandSize), codePtr, emAddress); - break; - } - ABI_PopAllCallerSavedRegsAndAdjustStack(); - RET(); - - trampolineCodePtr = GetWritableCodePtr(); - + // TODO: special case FIFO writes. Also, support 32-bit mode. + // Also, debug this so that it actually works correctly :P + XEmitter emitter(codePtr - 2); // We know it's EAX so the BSWAP before will be two byte. Overwrite it. - SetCodePtr(codePtr - 2); - CALL(trampoline); - NOP((int)info.instructionSize - 3); + const u8 *trampoline = trampolines.GetWriteTrampoline(info); + emitter.CALL((void *)trampoline); + emitter.NOP((int)info.instructionSize - 3); if (info.instructionSize < 3) PanicAlert("instruction too small"); - SetCodePtr(oldCodePtr); - // We entered here with a BSWAP-ed EAX. We'll have to swap it back. 
ctx->Rax = Common::swap32(ctx->Rax); - return codePtr - 2; } return 0; diff --git a/Source/Core/Core/Src/PowerPC/Jit64/JitCache.cpp b/Source/Core/Core/Src/PowerPC/Jit64/JitCache.cpp index 7b69b34457..2220665667 100644 --- a/Source/Core/Core/Src/PowerPC/Jit64/JitCache.cpp +++ b/Source/Core/Core/Src/PowerPC/Jit64/JitCache.cpp @@ -56,19 +56,15 @@ using namespace Gen; op_agent_t agent; #endif static u8 *codeCache; - static u8 *genFunctions; static u8 *trampolineCache; u8 *trampolineCodePtr; #define INVALID_EXIT 0xFFFFFFFF enum { - //CODE_SIZE = 1024*1024*8, - GEN_SIZE = 4096, TRAMPOLINE_SIZE = 1024*1024, - //MAX_NUM_BLOCKS = 65536, }; - int CODE_SIZE = 1024*1024*16; + int MAX_NUM_BLOCKS = 65536*2; static u8 **blockCodePointers; @@ -89,36 +85,22 @@ using namespace Gen; void Jit64::InitCache() { - if(Core::g_CoreStartupParameter.bJITUnlimitedCache) + if (Core::g_CoreStartupParameter.bJITUnlimitedCache) { - CODE_SIZE = 1024*1024*8*8; MAX_NUM_BLOCKS = 65536*8; } - codeCache = (u8*)AllocateExecutableMemory(CODE_SIZE); - genFunctions = (u8*)AllocateExecutableMemory(GEN_SIZE); - trampolineCache = (u8*)AllocateExecutableMemory(TRAMPOLINE_SIZE); - trampolineCodePtr = trampolineCache; - #ifdef OPROFILE_REPORT agent = op_open_agent(); #endif blocks = new JitBlock[MAX_NUM_BLOCKS]; blockCodePointers = new u8*[MAX_NUM_BLOCKS]; + ClearCache(); - SetCodePtr(genFunctions); - Asm::Generate(); - // Protect the generated functions - WriteProtectMemory(genFunctions, GEN_SIZE, true); - SetCodePtr(codeCache); } void Jit64::ShutdownCache() { - UnWriteProtectMemory(genFunctions, GEN_SIZE, true); - FreeMemoryPages(codeCache, CODE_SIZE); - FreeMemoryPages(genFunctions, GEN_SIZE); - FreeMemoryPages(trampolineCache, TRAMPOLINE_SIZE); delete [] blocks; delete [] blockCodePointers; blocks = 0; @@ -135,21 +117,23 @@ using namespace Gen; { Core::DisplayMessage("Cleared code cache.", 3000); // Is destroying the blocks really necessary? - for (int i = 0; i < numBlocks; i++) { + for (int i = 0; i < numBlocks; i++) + { DestroyBlock(i, false); } links_to.clear(); - trampolineCodePtr = trampolineCache; numBlocks = 0; memset(blockCodePointers, 0, sizeof(u8*)*MAX_NUM_BLOCKS); - memset(codeCache, 0xCC, CODE_SIZE); - SetCodePtr(codeCache); + + trampolines.ClearCodeSpace(); } void Jit64::DestroyBlocksWithFlag(BlockFlag death_flag) { - for (int i = 0; i < numBlocks; i++) { - if (blocks[i].flags & death_flag) { + for (int i = 0; i < numBlocks; i++) + { + if (blocks[i].flags & death_flag) + { DestroyBlock(i, false); } } @@ -190,10 +174,10 @@ using namespace Gen; const u8 *Jit64::Jit(u32 emAddress) { - if (GetCodePtr() >= codeCache + CODE_SIZE - 0x10000 || numBlocks >= MAX_NUM_BLOCKS - 1) + if (GetSpaceLeft() < 0x10000 || numBlocks >= MAX_NUM_BLOCKS - 1) { LOG(DYNA_REC, "JIT cache full - clearing.") - if(Core::g_CoreStartupParameter.bJITUnlimitedCache) + if (Core::g_CoreStartupParameter.bJITUnlimitedCache) { PanicAlert("What? 
JIT cache still full - clearing."); } @@ -221,10 +205,8 @@ using namespace Gen; } } - u8 *oldCodePtr = GetWritableCodePtr(); LinkBlock(numBlocks); LinkBlockExits(numBlocks); - SetCodePtr(oldCodePtr); } #ifdef OPROFILE_REPORT @@ -257,7 +239,7 @@ using namespace Gen; void Jit64::EnterFastRun() { - CompiledCode pExecAddr = (CompiledCode)Asm::enterCode; + CompiledCode pExecAddr = (CompiledCode)asm_routines.enterCode; pExecAddr(); //Will return when PowerPC::state changes } @@ -336,8 +318,8 @@ using namespace Gen; int destinationBlock = GetBlockNumberFromAddress(b.exitAddress[e]); if (destinationBlock != -1) { - SetCodePtr(b.exitPtrs[e]); - JMP(blocks[destinationBlock].checkedEntry, true); + XEmitter emit(b.exitPtrs[e]); + emit.JMP(blocks[destinationBlock].checkedEntry, true); b.linkStatus[e] = true; } } @@ -345,6 +327,7 @@ using namespace Gen; } using namespace std; + void Jit64::LinkBlock(int i) { LinkBlockExits(i); @@ -386,15 +369,15 @@ using namespace Gen; // Not entirely ideal, but .. pretty good. // TODO - make sure that the below stuff really is safe. - u8 *prev_code = GetWritableCodePtr(); + // Spurious entrances from previously linked blocks can only come through checkedEntry - SetCodePtr((u8*)b.checkedEntry); - MOV(32, M(&PC), Imm32(b.originalAddress)); - JMP(Asm::dispatcher, true); - SetCodePtr(blockCodePointers[blocknum]); - MOV(32, M(&PC), Imm32(b.originalAddress)); - JMP(Asm::dispatcher, true); - SetCodePtr(prev_code); // reset code pointer + XEmitter emit((u8*)b.checkedEntry); + emit.MOV(32, M(&PC), Imm32(b.originalAddress)); + emit.JMP(asm_routines.dispatcher, true); + + emit.SetCodePtr(blockCodePointers[blocknum]); + emit.MOV(32, M(&PC), Imm32(b.originalAddress)); + emit.JMP(asm_routines.dispatcher, true); } diff --git a/Source/Core/Core/Src/PowerPC/Jit64/JitCache.h b/Source/Core/Core/Src/PowerPC/Jit64/JitCache.h index 87a8814536..49f865fde4 100644 --- a/Source/Core/Core/Src/PowerPC/Jit64/JitCache.h +++ b/Source/Core/Core/Src/PowerPC/Jit64/JitCache.h @@ -19,6 +19,6 @@ #include "../Gekko.h" -// Will soon introduced the JitBlockCache class here. +// Will soon introduce the JitBlockCache class here. 
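[Illustrative sketch - not part of the patch.] The JitCache hunks above replace the old save/patch/restore dance around the global code pointer with a short-lived XEmitter constructed directly on the address being modified, so no global emitter state is touched. A minimal helper in that style, assuming only the XEmitter(u8*) constructor and the JMP overload this commit already uses:

    // Hypothetical helper showing the "temporary emitter over existing code" idiom
    // from LinkBlockExits/DestroyBlock. 'location' points into already-emitted,
    // still-writable JIT code; 'target' is the new jump destination.
    static void RepointExit(u8 *location, const u8 *target)
    {
        Gen::XEmitter emit(location);  // bind a local emitter to the existing code
        emit.JMP(target, true);        // overwrite the exit with a (long-form) jump
    }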
#endif diff --git a/Source/Core/Core/Src/PowerPC/Jit64/JitCore.cpp b/Source/Core/Core/Src/PowerPC/Jit64/JitCore.cpp index 5b4635a439..9f60331fae 100644 --- a/Source/Core/Core/Src/PowerPC/Jit64/JitCore.cpp +++ b/Source/Core/Core/Src/PowerPC/Jit64/JitCore.cpp @@ -34,13 +34,12 @@ namespace JitCore void Init() { jit.Init(); - jit.InitCache(); - Asm::compareEnabled = ::Core::g_CoreStartupParameter.bRunCompareClient; + asm_routines.compareEnabled = ::Core::g_CoreStartupParameter.bRunCompareClient; } void Shutdown() { - jit.ShutdownCache(); + jit.Shutdown(); } void SingleStep() diff --git a/Source/Core/Core/Src/PowerPC/Jit64/JitRegCache.cpp b/Source/Core/Core/Src/PowerPC/Jit64/JitRegCache.cpp index 976eba58c6..11080033df 100644 --- a/Source/Core/Core/Src/PowerPC/Jit64/JitRegCache.cpp +++ b/Source/Core/Core/Src/PowerPC/Jit64/JitRegCache.cpp @@ -27,8 +27,6 @@ using namespace Gen; using namespace PowerPC; - GPRRegCache gpr; - FPURegCache fpr; void RegCache::Start(PPCAnalyst::BlockRegStats &stats) { @@ -267,7 +265,7 @@ using namespace PowerPC; xregs[xr].dirty = makeDirty || regs[i].location.IsImm(); OpArg newloc = ::Gen::R(xr); if (doLoad || regs[i].location.IsImm()) - MOV(32, newloc, regs[i].location); + emit->MOV(32, newloc, regs[i].location); for (int j = 0; j < 32; j++) { if (i != j && regs[j].location.IsSimpleReg() && regs[j].location.GetSimpleReg() == xr) @@ -309,7 +307,7 @@ using namespace PowerPC; } OpArg newLoc = GetDefaultLocation(i); // if (doStore) //<-- Breaks JIT compilation - MOV(32, newLoc, regs[i].location); + emit->MOV(32, newLoc, regs[i].location); regs[i].location = newLoc; regs[i].away = false; } @@ -327,11 +325,13 @@ using namespace PowerPC; xregs[xr].free = false; xregs[xr].dirty = makeDirty; OpArg newloc = ::Gen::R(xr); - if (doLoad) { - if (!regs[i].location.IsImm() && (regs[i].location.offset & 0xF)) { + if (doLoad) + { + if (!regs[i].location.IsImm() && (regs[i].location.offset & 0xF)) + { PanicAlert("WARNING - misaligned fp register location %i", i); } - MOVAPD(xr, regs[i].location); + emit->MOVAPD(xr, regs[i].location); } regs[i].location = newloc; regs[i].away = true; @@ -352,7 +352,7 @@ using namespace PowerPC; xregs[xr].dirty = false; xregs[xr].ppcReg = -1; OpArg newLoc = GetDefaultLocation(i); - MOVAPD(newLoc, xr); + emit->MOVAPD(newLoc, xr); regs[i].location = newLoc; regs[i].away = false; } diff --git a/Source/Core/Core/Src/PowerPC/Jit64/JitRegCache.h b/Source/Core/Core/Src/PowerPC/Jit64/JitRegCache.h index c3a0fa07a6..4888c7019c 100644 --- a/Source/Core/Core/Src/PowerPC/Jit64/JitRegCache.h +++ b/Source/Core/Core/Src/PowerPC/Jit64/JitRegCache.h @@ -72,10 +72,15 @@ void DiscardRegContentsIfCached(int preg); virtual const int *GetAllocationOrder(int &count) = 0; + + XEmitter *emit; public: virtual ~RegCache() {} virtual void Start(PPCAnalyst::BlockRegStats &stats) = 0; + + void SetEmitter(XEmitter *emitter) {emit = emitter;} + void FlushR(X64Reg reg); void FlushR(X64Reg reg, X64Reg reg2) {FlushR(reg); FlushR(reg2);} void FlushLockX(X64Reg reg) { @@ -142,8 +147,5 @@ OpArg GetDefaultLocation(int reg) const; }; - extern GPRRegCache gpr; - extern FPURegCache fpr; - #endif diff --git a/Source/Core/Core/Src/PowerPC/Jit64/Jit_FloatingPoint.cpp b/Source/Core/Core/Src/PowerPC/Jit64/Jit_FloatingPoint.cpp index fa3e003d51..64bb657a40 100644 --- a/Source/Core/Core/Src/PowerPC/Jit64/Jit_FloatingPoint.cpp +++ b/Source/Core/Core/Src/PowerPC/Jit64/Jit_FloatingPoint.cpp @@ -33,39 +33,39 @@ const u64 GC_ALIGNED16(psAbsMask2[2]) = {0x7FFFFFFFFFFFFFFFULL, 0x7FFFFFFFFFFFFFFFULL}; const 
double GC_ALIGNED16(psOneOne2[2]) = {1.0, 1.0}; - void Jit64::fp_tri_op(int d, int a, int b, bool reversible, bool dupe, void (*op)(Gen::X64Reg, Gen::OpArg)) + void Jit64::fp_tri_op(int d, int a, int b, bool reversible, bool dupe, void (XEmitter::*op)(Gen::X64Reg, Gen::OpArg)) { fpr.Lock(d, a, b); if (d == a) { fpr.LoadToX64(d, true); - op(fpr.RX(d), fpr.R(b)); + (this->*op)(fpr.RX(d), fpr.R(b)); } else if (d == b && reversible) { fpr.LoadToX64(d, true); - op(fpr.RX(d), fpr.R(a)); + (this->*op)(fpr.RX(d), fpr.R(a)); } else if (a != d && b != d) { // Sources different from d, can use rather quick solution fpr.LoadToX64(d, !dupe); MOVSD(fpr.RX(d), fpr.R(a)); - op(fpr.RX(d), fpr.R(b)); + (this->*op)(fpr.RX(d), fpr.R(b)); } else if (b != d) { fpr.LoadToX64(d, !dupe); MOVSD(XMM0, fpr.R(b)); MOVSD(fpr.RX(d), fpr.R(a)); - op(fpr.RX(d), Gen::R(XMM0)); + (this->*op)(fpr.RX(d), Gen::R(XMM0)); } else // Other combo, must use two temps :( { MOVSD(XMM0, fpr.R(a)); MOVSD(XMM1, fpr.R(b)); fpr.LoadToX64(d, !dupe); - op(XMM0, Gen::R(XMM1)); + (this->*op)(XMM0, Gen::R(XMM1)); MOVSD(fpr.RX(d), Gen::R(XMM0)); } if (dupe) { @@ -86,16 +86,16 @@ bool dupe = inst.OPCD == 59; switch (inst.SUBOP5) { - case 18: fp_tri_op(inst.FD, inst.FA, inst.FB, false, dupe, &DIVSD); break; //div - case 20: fp_tri_op(inst.FD, inst.FA, inst.FB, false, dupe, &SUBSD); break; //sub - case 21: fp_tri_op(inst.FD, inst.FA, inst.FB, true, dupe, &ADDSD); break; //add + case 18: fp_tri_op(inst.FD, inst.FA, inst.FB, false, dupe, &XEmitter::DIVSD); break; //div + case 20: fp_tri_op(inst.FD, inst.FA, inst.FB, false, dupe, &XEmitter::SUBSD); break; //sub + case 21: fp_tri_op(inst.FD, inst.FA, inst.FB, true, dupe, &XEmitter::ADDSD); break; //add case 23: //sel Default(inst); break; case 24: //res Default(inst); break; - case 25: fp_tri_op(inst.FD, inst.FA, inst.FC, true, dupe, &MULSD); break; //mul + case 25: fp_tri_op(inst.FD, inst.FA, inst.FC, true, dupe, &XEmitter::MULSD); break; //mul default: _assert_msg_(DYNA_REC, 0, "fp_arith_s WTF!!!"); } diff --git a/Source/Core/Core/Src/PowerPC/Jit64/Jit_Integer.cpp b/Source/Core/Core/Src/PowerPC/Jit64/Jit_Integer.cpp index 68d374136c..a30bcdf71e 100644 --- a/Source/Core/Core/Src/PowerPC/Jit64/Jit_Integer.cpp +++ b/Source/Core/Core/Src/PowerPC/Jit64/Jit_Integer.cpp @@ -42,7 +42,7 @@ u32 And(u32 a, u32 b) {return a & b;} u32 Xor(u32 a, u32 b) {return a ^ b;} - void Jit64::regimmop(int d, int a, bool binary, u32 value, Operation doop, void(*op)(int, const Gen::OpArg&, const Gen::OpArg&), bool Rc, bool carry) + void Jit64::regimmop(int d, int a, bool binary, u32 value, Operation doop, void (XEmitter::*op)(int, const Gen::OpArg&, const Gen::OpArg&), bool Rc, bool carry) { gpr.Lock(d, a); if (a || binary || carry) // yeh nasty special case addic @@ -57,7 +57,7 @@ { if (gpr.R(d).IsImm()) gpr.LoadToX64(d, false); - op(32, gpr.R(d), Imm32(value)); //m_GPR[d] = m_GPR[_inst.RA] + _inst.SIMM_16; + (this->*op)(32, gpr.R(d), Imm32(value)); //m_GPR[d] = m_GPR[_inst.RA] + _inst.SIMM_16; if (carry) GenerateCarry(EAX); } @@ -66,7 +66,7 @@ { gpr.LoadToX64(d, false); MOV(32, gpr.R(d), gpr.R(a)); - op(32, gpr.R(d), Imm32(value)); //m_GPR[d] = m_GPR[_inst.RA] + _inst.SIMM_16; + (this->*op)(32, gpr.R(d), Imm32(value)); //m_GPR[d] = m_GPR[_inst.RA] + _inst.SIMM_16; if (carry) GenerateCarry(EAX); } @@ -84,7 +84,7 @@ { // Todo - special case immediates. 
MOV(32, R(EAX), gpr.R(d)); - CALL((u8*)Asm::computeRc); + CALL((u8*)asm_routines.computeRc); } gpr.UnlockAll(); } @@ -109,22 +109,22 @@ MOV(32, gpr.R(d), gpr.R(a)); gpr.UnlockAll(); } else { - regimmop(d, a, false, (u32)(s32)inst.SIMM_16, Add, ADD); //addi + regimmop(d, a, false, (u32)(s32)inst.SIMM_16, Add, &XEmitter::ADD); //addi } break; - case 15: regimmop(d, a, false, (u32)inst.SIMM_16 << 16, Add, ADD); break; //addis + case 15: regimmop(d, a, false, (u32)inst.SIMM_16 << 16, Add, &XEmitter::ADD); break; //addis case 24: if (a == 0 && s == 0 && inst.UIMM == 0 && !inst.Rc) //check for nop {NOP(); return;} //make the nop visible in the generated code. not much use but interesting if we see one. - regimmop(a, s, true, inst.UIMM, Or, OR); + regimmop(a, s, true, inst.UIMM, Or, &XEmitter::OR); break; //ori - case 25: regimmop(a, s, true, inst.UIMM << 16, Or, OR, false); break;//oris - case 28: regimmop(a, s, true, inst.UIMM, And, AND, true); break; - case 29: regimmop(a, s, true, inst.UIMM << 16, And, AND, true); break; - case 26: regimmop(a, s, true, inst.UIMM, Xor, XOR, false); break; //xori - case 27: regimmop(a, s, true, inst.UIMM << 16, Xor, XOR, false); break; //xoris - case 12: //regimmop(d, a, false, (u32)(s32)inst.SIMM_16, Add, ADD, false, true); //addic - case 13: //regimmop(d, a, true, (u32)(s32)inst.SIMM_16, Add, ADD, true, true); //addic_rc + case 25: regimmop(a, s, true, inst.UIMM << 16, Or, &XEmitter::OR, false); break;//oris + case 28: regimmop(a, s, true, inst.UIMM, And, &XEmitter::AND, true); break; + case 29: regimmop(a, s, true, inst.UIMM << 16, And, &XEmitter::AND, true); break; + case 26: regimmop(a, s, true, inst.UIMM, Xor, &XEmitter::XOR, false); break; //xori + case 27: regimmop(a, s, true, inst.UIMM << 16, Xor, &XEmitter::XOR, false); break; //xoris + case 12: //regimmop(d, a, false, (u32)(s32)inst.SIMM_16, Add, XEmitter::ADD, false, true); //addic + case 13: //regimmop(d, a, true, (u32)(s32)inst.SIMM_16, Add, XEmitter::ADD, true, true); //addic_rc default: Default(inst); break; @@ -295,7 +295,7 @@ if (inst.Rc) { MOV(32, R(EAX), gpr.R(a)); - CALL((u8*)Asm::computeRc); + CALL((u8*)asm_routines.computeRc); } } @@ -328,7 +328,7 @@ if (inst.Rc) { MOV(32, R(EAX), gpr.R(a)); - CALL((u8*)Asm::computeRc); + CALL((u8*)asm_routines.computeRc); } } @@ -353,7 +353,7 @@ if (inst.Rc) { // result is already in eax - CALL((u8*)Asm::computeRc); + CALL((u8*)asm_routines.computeRc); } } @@ -374,7 +374,7 @@ MOVSX(32, 8, gpr.RX(a), R(AL)); // watch out for ah and friends if (inst.Rc) { MOV(32, R(EAX), gpr.R(a)); - CALL((u8*)Asm::computeRc); + CALL((u8*)asm_routines.computeRc); } } @@ -394,7 +394,7 @@ MOVSX(32, 16, gpr.RX(a), gpr.R(s)); if (inst.Rc) { MOV(32, R(EAX), gpr.R(a)); - CALL((u8*)Asm::computeRc); + CALL((u8*)asm_routines.computeRc); } } @@ -474,7 +474,7 @@ if (inst.OE) PanicAlert("OE: subfx"); if (inst.Rc) { // result is already in eax - CALL((u8*)Asm::computeRc); + CALL((u8*)asm_routines.computeRc); } } @@ -514,7 +514,7 @@ gpr.UnlockAll(); if (inst.Rc) { MOV(32, R(EAX), gpr.R(d)); - CALL((u8*)Asm::computeRc); + CALL((u8*)asm_routines.computeRc); } } @@ -544,7 +544,7 @@ MOV(32, R(EAX), R(EDX)); MOV(32, gpr.R(d), R(EDX)); // result is already in eax - CALL((u8*)Asm::computeRc); + CALL((u8*)asm_routines.computeRc); } else { MOV(32, gpr.R(d), R(EDX)); } @@ -570,7 +570,7 @@ gpr.UnlockAll(); gpr.UnlockAllX(); if (inst.Rc) { - CALL((u8*)Asm::computeRc); + CALL((u8*)asm_routines.computeRc); } } @@ -606,7 +606,7 @@ if (inst.Rc) { MOV(32, R(EAX), gpr.R(d)); - CALL((u8*)Asm::computeRc); 
+ CALL((u8*)asm_routines.computeRc); } gpr.UnlockAll(); } @@ -618,7 +618,7 @@ if (inst.Rc) { MOV(32, R(EAX), gpr.R(d)); - CALL((u8*)Asm::computeRc); + CALL((u8*)asm_routines.computeRc); } gpr.UnlockAll(); } @@ -630,7 +630,7 @@ if (inst.Rc) { MOV(32, R(EAX), gpr.R(d)); - CALL((u8*)Asm::computeRc); + CALL((u8*)asm_routines.computeRc); } gpr.UnlockAll(); } @@ -666,7 +666,7 @@ gpr.UnlockAllX(); if (inst.Rc) { - CALL((u8*)Asm::computeRc); + CALL((u8*)asm_routines.computeRc); } } @@ -730,7 +730,7 @@ if (inst.Rc) { MOV(32, R(EAX), gpr.R(a)); - CALL((u8*)Asm::computeRc); + CALL((u8*)asm_routines.computeRc); } } @@ -767,7 +767,7 @@ if (inst.Rc) { MOV(32, R(EAX), gpr.R(a)); - CALL((u8*)Asm::computeRc); + CALL((u8*)asm_routines.computeRc); } } @@ -799,7 +799,7 @@ if (inst.Rc) { MOV(32, R(EAX), gpr.R(a)); - CALL((u8*)Asm::computeRc); + CALL((u8*)asm_routines.computeRc); } } @@ -821,7 +821,7 @@ if (inst.Rc) { MOV(32, R(EAX), gpr.R(a)); - CALL((u8*)Asm::computeRc); + CALL((u8*)asm_routines.computeRc); } } @@ -851,7 +851,7 @@ if (inst.Rc) { MOV(32, R(EAX), gpr.R(a)); - CALL((u8*)Asm::computeRc); + CALL((u8*)asm_routines.computeRc); } } @@ -881,7 +881,7 @@ if (inst.Rc) { MOV(32, R(EAX), gpr.R(a)); - CALL((u8*)Asm::computeRc); + CALL((u8*)asm_routines.computeRc); } } @@ -929,7 +929,7 @@ if (inst.Rc) { MOV(32, R(EAX), gpr.R(a)); - CALL((u8*)Asm::computeRc); + CALL((u8*)asm_routines.computeRc); } } @@ -975,7 +975,7 @@ if (inst.Rc) { MOV(32, R(EAX), gpr.R(a)); - CALL((u8*)Asm::computeRc); + CALL((u8*)asm_routines.computeRc); } } @@ -1006,7 +1006,7 @@ if (inst.Rc) { MOV(32, R(EAX), gpr.R(a)); - CALL((u8*)Asm::computeRc); + CALL((u8*)asm_routines.computeRc); // TODO: Check PPC manual too } } diff --git a/Source/Core/Core/Src/PowerPC/Jit64/Jit_LoadStore.cpp b/Source/Core/Core/Src/PowerPC/Jit64/Jit_LoadStore.cpp index 2e3156eea1..6beddc1f8a 100644 --- a/Source/Core/Core/Src/PowerPC/Jit64/Jit_LoadStore.cpp +++ b/Source/Core/Core/Src/PowerPC/Jit64/Jit_LoadStore.cpp @@ -144,7 +144,7 @@ fpr.Flush(FLUSH_ALL); ABI_CallFunctionC((void *)&PowerPC::OnIdle, PowerPC::ppcState.gpr[a] + (s32)(s16)inst.SIMM_16); MOV(32, M(&PowerPC::ppcState.pc), Imm32(js.compilerPC + 12)); - JMP(Asm::testExceptions, true); + JMP(asm_routines.testExceptions, true); js.compilerPC += 8; return; } @@ -287,14 +287,13 @@ gpr.SetImmediate32(a, addr); gpr.FlushLockX(ABI_PARAM1); MOV(32, R(ABI_PARAM1), gpr.R(s)); - // INT3(); switch (accessSize) { // No need to protect these, they don't touch any state // question - should we inline them instead? 
Pro: Lose a CALL Con: Code bloat - case 8: CALL((void *)Asm::fifoDirectWrite8); break; - case 16: CALL((void *)Asm::fifoDirectWrite16); break; - case 32: CALL((void *)Asm::fifoDirectWrite32); break; + case 8: CALL((void *)asm_routines.fifoDirectWrite8); break; + case 16: CALL((void *)asm_routines.fifoDirectWrite16); break; + case 32: CALL((void *)asm_routines.fifoDirectWrite32); break; } js.fifoBytesThisBlock += accessSize >> 3; gpr.UnlockAllX(); @@ -377,9 +376,9 @@ SetJumpTarget(unsafe_addr); switch (accessSize) { - case 32: ABI_CallFunctionRR(ProtectFunction((void *)&Memory::Write_U32, 2), ABI_PARAM1, ABI_PARAM2); break; - case 16: ABI_CallFunctionRR(ProtectFunction((void *)&Memory::Write_U16, 2), ABI_PARAM1, ABI_PARAM2); break; - case 8: ABI_CallFunctionRR(ProtectFunction((void *)&Memory::Write_U8, 2), ABI_PARAM1, ABI_PARAM2); break; + case 32: ABI_CallFunctionRR(thunks.ProtectFunction((void *)&Memory::Write_U32, 2), ABI_PARAM1, ABI_PARAM2); break; + case 16: ABI_CallFunctionRR(thunks.ProtectFunction((void *)&Memory::Write_U16, 2), ABI_PARAM1, ABI_PARAM2); break; + case 8: ABI_CallFunctionRR(thunks.ProtectFunction((void *)&Memory::Write_U8, 2), ABI_PARAM1, ABI_PARAM2); break; } SetJumpTarget(skip_call); gpr.UnlockAll(); @@ -402,7 +401,6 @@ //return _inst.RA ? (m_GPR[_inst.RA] + _inst.SIMM_16) : _inst.SIMM_16; gpr.FlushLockX(ECX, EDX); gpr.FlushLockX(ESI); - //INT3(); MOV(32, R(EAX), Imm32((u32)(s32)inst.SIMM_16)); if (inst.RA) ADD(32, R(EAX), gpr.R(inst.RA)); diff --git a/Source/Core/Core/Src/PowerPC/Jit64/Jit_LoadStoreFloating.cpp b/Source/Core/Core/Src/PowerPC/Jit64/Jit_LoadStoreFloating.cpp index bdeb188d40..33d6e7703c 100644 --- a/Source/Core/Core/Src/PowerPC/Jit64/Jit_LoadStoreFloating.cpp +++ b/Source/Core/Core/Src/PowerPC/Jit64/Jit_LoadStoreFloating.cpp @@ -242,7 +242,7 @@ void Jit64::stfs(UGeckoInstruction inst) { // Float directly to write gather pipe! Fun! CVTSD2SS(XMM0, fpr.R(s)); - CALL((void*)Asm::fifoDirectWriteFloat); + CALL((void*)asm_routines.fifoDirectWriteFloat); // TODO js.fifoBytesThisBlock += 4; return; diff --git a/Source/Core/Core/Src/PowerPC/Jit64/Jit_LoadStorePaired.cpp b/Source/Core/Core/Src/PowerPC/Jit64/Jit_LoadStorePaired.cpp index ad225244aa..d98a0f9ece 100644 --- a/Source/Core/Core/Src/PowerPC/Jit64/Jit_LoadStorePaired.cpp +++ b/Source/Core/Core/Src/PowerPC/Jit64/Jit_LoadStorePaired.cpp @@ -161,7 +161,7 @@ void Jit64::psq_st(UGeckoInstruction inst) #endif FixupBranch skip_call = J(); SetJumpTarget(argh); - ABI_CallFunctionRR(ProtectFunction((void *)&Memory::Write_U32, 2), ABI_PARAM1, ABI_PARAM2); + ABI_CallFunctionRR(thunks.ProtectFunction((void *)&Memory::Write_U32, 2), ABI_PARAM1, ABI_PARAM2); SetJumpTarget(skip_call); gpr.UnlockAll(); gpr.UnlockAllX(); @@ -184,7 +184,7 @@ void Jit64::psq_st(UGeckoInstruction inst) // Writing to FIFO. Let's do fast method. 
CVTPD2PS(XMM0, fpr.R(s)); PSHUFB(XMM0, M((void*)&pbswapShuffle2x4)); - CALL((void*)Asm::fifoDirectWriteXmm64); + CALL((void*)asm_routines.fifoDirectWriteXmm64); js.fifoBytesThisBlock += 8; return; } @@ -211,7 +211,7 @@ void Jit64::psq_st(UGeckoInstruction inst) MOV(64, MComplex(RBX, ABI_PARAM2, SCALE_1, 0), R(ABI_PARAM1)); FixupBranch arg2 = J(); SetJumpTarget(argh); - CALL(ProtectFunction((void *)&WriteDual32, 0)); + CALL(thunks.ProtectFunction((void *)&WriteDual32, 0)); #else FixupBranch argh = J_CC(CC_NZ); MOV(32, R(ABI_PARAM1), M(((char*)&temp64) + 4)); @@ -224,10 +224,10 @@ void Jit64::psq_st(UGeckoInstruction inst) FixupBranch arg2 = J(); SetJumpTarget(argh); MOV(32, R(ABI_PARAM1), M(((char*)&temp64) + 4)); - ABI_CallFunctionRR(ProtectFunction((void *)&Memory::Write_U32, 2), ABI_PARAM1, ABI_PARAM2); + ABI_CallFunctionRR(thunks.ProtectFunction((void *)&Memory::Write_U32, 2), ABI_PARAM1, ABI_PARAM2); MOV(32, R(ABI_PARAM1), M(((char*)&temp64))); ADD(32, R(ABI_PARAM2), Imm32(4)); - ABI_CallFunctionRR(ProtectFunction((void *)&Memory::Write_U32, 2), ABI_PARAM1, ABI_PARAM2); + ABI_CallFunctionRR(thunks.ProtectFunction((void *)&Memory::Write_U32, 2), ABI_PARAM1, ABI_PARAM2); #endif SetJumpTarget(arg2); gpr.UnlockAll(); @@ -424,7 +424,6 @@ void Jit64::psq_l(UGeckoInstruction inst) #endif BSWAP(32, EAX); MOV(32, M(&temp64), R(EAX)); - //INT3(); fpr.LoadToX64(inst.RS, false, true); X64Reg r = fpr.R(inst.RS).GetSimpleReg(); MOVD_xmm(XMM0, M(&temp64)); diff --git a/Source/Core/Core/Src/PowerPC/Jit64/Jit_Paired.cpp b/Source/Core/Core/Src/PowerPC/Jit64/Jit_Paired.cpp index 5deda52833..91ca5829e5 100644 --- a/Source/Core/Core/Src/PowerPC/Jit64/Jit_Paired.cpp +++ b/Source/Core/Core/Src/PowerPC/Jit64/Jit_Paired.cpp @@ -163,40 +163,40 @@ */ //There's still a little bit more optimization that can be squeezed out of this - void Jit64::tri_op(int d, int a, int b, bool reversible, void (*op)(X64Reg, OpArg)) + void Jit64::tri_op(int d, int a, int b, bool reversible, void (XEmitter::*op)(X64Reg, OpArg)) { fpr.Lock(d, a, b); if (d == a) { fpr.LoadToX64(d, true); - op(fpr.RX(d), fpr.R(b)); + (this->*op)(fpr.RX(d), fpr.R(b)); } else if (d == b && reversible) { fpr.LoadToX64(d, true); - op(fpr.RX(d), fpr.R(a)); + (this->*op)(fpr.RX(d), fpr.R(a)); } else if (a != d && b != d) { //sources different from d, can use rather quick solution fpr.LoadToX64(d, false); MOVAPD(fpr.RX(d), fpr.R(a)); - op(fpr.RX(d), fpr.R(b)); + (this->*op)(fpr.RX(d), fpr.R(b)); } else if (b != d) { fpr.LoadToX64(d, false); MOVAPD(XMM0, fpr.R(b)); MOVAPD(fpr.RX(d), fpr.R(a)); - op(fpr.RX(d), Gen::R(XMM0)); + (this->*op)(fpr.RX(d), Gen::R(XMM0)); } else //Other combo, must use two temps :( { MOVAPD(XMM0, fpr.R(a)); MOVAPD(XMM1, fpr.R(b)); fpr.LoadToX64(d, false); - op(XMM0, Gen::R(XMM1)); + (this->*op)(XMM0, Gen::R(XMM1)); MOVAPD(fpr.RX(d), Gen::R(XMM0)); } ForceSinglePrecisionP(fpr.RX(d)); @@ -213,16 +213,16 @@ } switch (inst.SUBOP5) { - case 18: tri_op(inst.FD, inst.FA, inst.FB, false, &DIVPD); break; //div - case 20: tri_op(inst.FD, inst.FA, inst.FB, false, &SUBPD); break; //sub - case 21: tri_op(inst.FD, inst.FA, inst.FB, true, &ADDPD); break; //add + case 18: tri_op(inst.FD, inst.FA, inst.FB, false, &XEmitter::DIVPD); break; //div + case 20: tri_op(inst.FD, inst.FA, inst.FB, false, &XEmitter::SUBPD); break; //sub + case 21: tri_op(inst.FD, inst.FA, inst.FB, true, &XEmitter::ADDPD); break; //add case 23://sel Default(inst); break; case 24://res Default(inst); break; - case 25: tri_op(inst.FD, inst.FA, inst.FC, true, &MULPD); break; //mul + 
case 25: tri_op(inst.FD, inst.FA, inst.FC, true, &XEmitter::MULPD); break; //mul default: _assert_msg_(DYNA_REC, 0, "ps_arith WTF!!!"); } diff --git a/Source/Core/Core/Src/PowerPC/Jit64/Jit_Util.cpp b/Source/Core/Core/Src/PowerPC/Jit64/Jit_Util.cpp index fb3d535812..45d6721515 100644 --- a/Source/Core/Core/Src/PowerPC/Jit64/Jit_Util.cpp +++ b/Source/Core/Core/Src/PowerPC/Jit64/Jit_Util.cpp @@ -76,9 +76,9 @@ void Jit64::SafeLoadRegToEAX(X64Reg reg, int accessSize, s32 offset, bool signEx FixupBranch argh = J_CC(CC_Z); switch (accessSize) { - case 32: ABI_CallFunctionR(ProtectFunction((void *)&Memory::Read_U32, 1), reg); break; - case 16: ABI_CallFunctionR(ProtectFunction((void *)&Memory::Read_U16, 1), reg); break; - case 8: ABI_CallFunctionR(ProtectFunction((void *)&Memory::Read_U8, 1), reg); break; + case 32: ABI_CallFunctionR(thunks.ProtectFunction((void *)&Memory::Read_U32, 1), reg); break; + case 16: ABI_CallFunctionR(thunks.ProtectFunction((void *)&Memory::Read_U16, 1), reg); break; + case 8: ABI_CallFunctionR(thunks.ProtectFunction((void *)&Memory::Read_U8, 1), reg); break; } if (signExtend && accessSize < 32) { // Need to sign extend values coming from the Read_U* functions. @@ -114,7 +114,7 @@ void Jit64::SafeWriteRegToReg(X64Reg reg_value, X64Reg reg_addr, int accessSize, UnsafeWriteRegToReg(reg_value, reg_addr, accessSize, 0); FixupBranch skip_call = J(); SetJumpTarget(unsafe_addr); - ABI_CallFunctionRR(ProtectFunction((void *)&Memory::Write_U32, 2), ABI_PARAM1, ABI_PARAM2); + ABI_CallFunctionRR(thunks.ProtectFunction((void *)&Memory::Write_U32, 2), ABI_PARAM1, ABI_PARAM2); SetJumpTarget(skip_call); } diff --git a/Source/Plugins/Plugin_VideoOGL/Src/BPStructs.cpp b/Source/Plugins/Plugin_VideoOGL/Src/BPStructs.cpp index 2d3ba92d5c..022d770cae 100644 --- a/Source/Plugins/Plugin_VideoOGL/Src/BPStructs.cpp +++ b/Source/Plugins/Plugin_VideoOGL/Src/BPStructs.cpp @@ -463,7 +463,7 @@ void BPWritten(int addr, int changes, int newval) { // the number of lines copied is determined by the y scale * source efb height float yScale = bpmem.dispcopyyscale / 256.0f; - float xfbLines = bpmem.copyTexSrcWH.y + 1.0 * yScale; + float xfbLines = bpmem.copyTexSrcWH.y + 1.0f * yScale; XFB_Write(Memory_GetPtr(bpmem.copyTexDest<<5), multirc, (bpmem.copyMipMapStrideChannels << 4), (int)xfbLines); } else diff --git a/Source/Plugins/Plugin_VideoOGL/Src/NativeVertexFormat.cpp b/Source/Plugins/Plugin_VideoOGL/Src/NativeVertexFormat.cpp index ebe1ae70b6..3243092dab 100644 --- a/Source/Plugins/Plugin_VideoOGL/Src/NativeVertexFormat.cpp +++ b/Source/Plugins/Plugin_VideoOGL/Src/NativeVertexFormat.cpp @@ -82,68 +82,68 @@ void NativeVertexFormat::Initialize(const PortableVertexDeclaration &_vtx_decl) } #ifdef USE_JIT + Gen::XEmitter emit(m_compiledCode); // Alright, we have our vertex declaration. Compile some crazy code to set it quickly using GL. 
- u8 *old_code_ptr = GetWritableCodePtr(); - SetCodePtr(m_compiledCode); - ABI_EmitPrologue(6); + emit.ABI_EmitPrologue(6); - CallCdeclFunction4_I(glVertexPointer, 3, GL_FLOAT, _vtx_decl.stride, 0); + emit.CallCdeclFunction4_I(glVertexPointer, 3, GL_FLOAT, _vtx_decl.stride, 0); if (_vtx_decl.num_normals >= 1) { - CallCdeclFunction3_I(glNormalPointer, VarToGL(_vtx_decl.normal_gl_type), _vtx_decl.stride, _vtx_decl.normal_offset[0]); + emit.CallCdeclFunction3_I(glNormalPointer, VarToGL(_vtx_decl.normal_gl_type), _vtx_decl.stride, _vtx_decl.normal_offset[0]); if (_vtx_decl.num_normals == 3) { - CallCdeclFunction6((void *)glVertexAttribPointer, SHADER_NORM1_ATTRIB, _vtx_decl.normal_gl_size, VarToGL(_vtx_decl.normal_gl_type), GL_TRUE, _vtx_decl.stride, _vtx_decl.normal_offset[1]); - CallCdeclFunction6((void *)glVertexAttribPointer, SHADER_NORM2_ATTRIB, _vtx_decl.normal_gl_size, VarToGL(_vtx_decl.normal_gl_type), GL_TRUE, _vtx_decl.stride, _vtx_decl.normal_offset[2]); + emit.CallCdeclFunction6((void *)glVertexAttribPointer, SHADER_NORM1_ATTRIB, _vtx_decl.normal_gl_size, VarToGL(_vtx_decl.normal_gl_type), GL_TRUE, _vtx_decl.stride, _vtx_decl.normal_offset[1]); + emit.CallCdeclFunction6((void *)glVertexAttribPointer, SHADER_NORM2_ATTRIB, _vtx_decl.normal_gl_size, VarToGL(_vtx_decl.normal_gl_type), GL_TRUE, _vtx_decl.stride, _vtx_decl.normal_offset[2]); } } for (int i = 0; i < 2; i++) { if (_vtx_decl.color_offset[i] != -1) { if (i == 0) - CallCdeclFunction4_I(glColorPointer, 4, GL_UNSIGNED_BYTE, _vtx_decl.stride, _vtx_decl.color_offset[i]); + emit.CallCdeclFunction4_I(glColorPointer, 4, GL_UNSIGNED_BYTE, _vtx_decl.stride, _vtx_decl.color_offset[i]); else - CallCdeclFunction4((void *)glSecondaryColorPointer, 4, GL_UNSIGNED_BYTE, _vtx_decl.stride, _vtx_decl.color_offset[i]); + emit.CallCdeclFunction4((void *)glSecondaryColorPointer, 4, GL_UNSIGNED_BYTE, _vtx_decl.stride, _vtx_decl.color_offset[i]); } } - for (int i = 0; i < 8; i++) { - if (_vtx_decl.texcoord_offset[i] != -1) { + for (int i = 0; i < 8; i++) + { + if (_vtx_decl.texcoord_offset[i] != -1) + { int id = GL_TEXTURE0 + i; #ifdef _M_X64 #ifdef _MSC_VER - MOV(32, R(RCX), Imm32(id)); + emit.MOV(32, R(RCX), Imm32(id)); #else - MOV(32, R(RDI), Imm32(id)); + emit.MOV(32, R(RDI), Imm32(id)); #endif #else - ABI_AlignStack(1 * 4); - PUSH(32, Imm32(id)); + emit.ABI_AlignStack(1 * 4); + emit.PUSH(32, Imm32(id)); #endif - CALL((void *)glClientActiveTexture); + emit.CALL((void *)glClientActiveTexture); #ifndef _M_X64 #ifdef _WIN32 // don't inc stack on windows, stdcall #else - ABI_RestoreStack(1 * 4); + emit.ABI_RestoreStack(1 * 4); #endif #endif - CallCdeclFunction4_I( + emit.CallCdeclFunction4_I( glTexCoordPointer, _vtx_decl.texcoord_size[i], VarToGL(_vtx_decl.texcoord_gl_type[i]), _vtx_decl.stride, _vtx_decl.texcoord_offset[i]); } } if (_vtx_decl.posmtx_offset != -1) { - CallCdeclFunction6((void *)glVertexAttribPointer, SHADER_POSMTX_ATTRIB, 4, GL_UNSIGNED_BYTE, GL_FALSE, _vtx_decl.stride, _vtx_decl.posmtx_offset); + emit.CallCdeclFunction6((void *)glVertexAttribPointer, SHADER_POSMTX_ATTRIB, 4, GL_UNSIGNED_BYTE, GL_FALSE, _vtx_decl.stride, _vtx_decl.posmtx_offset); } - ABI_EmitEpilogue(6); - if (Gen::GetCodePtr() - (u8*)m_compiledCode > COMPILED_CODE_SIZE) + emit.ABI_EmitEpilogue(6); + if (emit.GetCodePtr() - (u8*)m_compiledCode > COMPILED_CODE_SIZE) { Crash(); } - SetCodePtr(old_code_ptr); #endif this->vtx_decl = _vtx_decl; } diff --git a/Source/Plugins/Plugin_VideoOGL/Src/VertexLoader.cpp b/Source/Plugins/Plugin_VideoOGL/Src/VertexLoader.cpp index 
fba64ea3f3..8207794088 100644 --- a/Source/Plugins/Plugin_VideoOGL/Src/VertexLoader.cpp +++ b/Source/Plugins/Plugin_VideoOGL/Src/VertexLoader.cpp @@ -44,7 +44,7 @@ #define USE_JIT -#define COMPILED_CODE_SIZE 4096*4 +#define COMPILED_CODE_SIZE 4096 NativeVertexFormat *g_nativeVertexFmt; @@ -116,6 +116,7 @@ void LOADERDECL TexMtx_Write_Short3() VertexLoader::VertexLoader(const TVtxDesc &vtx_desc, const VAT &vtx_attr) { + m_compiledCode = NULL; m_numLoadedVertices = 0; m_VertexSize = 0; m_numPipelineStages = 0; @@ -126,16 +127,14 @@ VertexLoader::VertexLoader(const TVtxDesc &vtx_desc, const VAT &vtx_attr) m_VtxDesc = vtx_desc; SetVAT(vtx_attr.g0.Hex, vtx_attr.g1.Hex, vtx_attr.g2.Hex); - m_compiledCode = (u8 *)AllocateExecutableMemory(COMPILED_CODE_SIZE, false); - if (m_compiledCode) { - memset(m_compiledCode, 0, COMPILED_CODE_SIZE); - } + AllocCodeSpace(COMPILED_CODE_SIZE); CompileVertexTranslator(); + WriteProtect(); } VertexLoader::~VertexLoader() { - FreeMemoryPages(m_compiledCode, COMPILED_CODE_SIZE); + FreeCodeSpace(); delete m_NativeFmt; } @@ -143,13 +142,14 @@ void VertexLoader::CompileVertexTranslator() { m_VertexSize = 0; const TVtxAttr &vtx_attr = m_VtxAttr; - //const TVtxDesc &vtx_desc = m_VtxDesc; #ifdef USE_JIT - u8 *old_code_ptr = GetWritableCodePtr(); - SetCodePtr(m_compiledCode); + if (m_compiledCode) + PanicAlert("trying to recompile a vtx translator"); + + m_compiledCode = GetCodePtr(); ABI_EmitPrologue(4); - // MOV(32, R(EBX), M(&loop_counter)); + // Start loop here const u8 *loop_start = GetCodePtr(); @@ -477,7 +477,6 @@ void VertexLoader::CompileVertexTranslator() //SUB(32, R(EBX), Imm8(1)); J_CC(CC_NZ, loop_start, true); ABI_EmitEpilogue(4); - SetCodePtr(old_code_ptr); #endif m_NativeFmt->Initialize(vtx_decl); } diff --git a/Source/Plugins/Plugin_VideoOGL/Src/VertexLoader.h b/Source/Plugins/Plugin_VideoOGL/Src/VertexLoader.h index 3f430144eb..34ead98b88 100644 --- a/Source/Plugins/Plugin_VideoOGL/Src/VertexLoader.h +++ b/Source/Plugins/Plugin_VideoOGL/Src/VertexLoader.h @@ -22,9 +22,10 @@ #include "CPMemory.h" #include "DataReader.h" - #include "NativeVertexFormat.h" +#include "x64Emitter.h" + class VertexLoaderUID { u32 vid[5]; @@ -52,7 +53,7 @@ public: } }; -class VertexLoader +class VertexLoader : public Gen::XCodeBlock { public: VertexLoader(const TVtxDesc &vtx_desc, const VAT &vtx_attr); @@ -86,7 +87,7 @@ private: TPipelineFunction m_PipelineStages[64]; // TODO - figure out real max. it's lower. int m_numPipelineStages; - u8 *m_compiledCode; + const u8 *m_compiledCode; int m_numLoadedVertices;
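[Illustrative sketch - not part of the patch.] VertexLoader above now follows the same Gen::XCodeBlock lifecycle as AsmRoutineManager: derive from XCodeBlock, allocate executable space, emit the routine, write-protect it, and free the space on teardown. A minimal hypothetical owner of generated code, using only calls that appear in this commit:

    // Hypothetical example of the XCodeBlock lifecycle
    // (AllocCodeSpace -> emit -> WriteProtect -> FreeCodeSpace).
    class MyRoutines : public Gen::XCodeBlock
    {
    public:
        const u8 *doNothing;  // entry point of a generated routine

        void Init()
        {
            AllocCodeSpace(4096);      // reserve executable memory
            doNothing = GetCodePtr();  // remember where this routine starts
            RET();                     // emit the routine body (here: just a return)
            WriteProtect();            // fixed code: make it read/execute-only afterwards
        }

        void Shutdown()
        {
            FreeCodeSpace();           // release the executable memory
        }
    };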