Turned the x86 emitter into a class, so the code pointer is no longer a global, yay! Created XCodeBlock, which derives from XEmitter, and the Jit now derives from XCodeBlock so it can call ADD, SUB, JNZ, etc. without having to prefix them with "emit.". I think someone's gonna like this.

There's some cleanup still to be done, but hey, it works. There shouldn't be a noticeable speed difference.

I hope GCC doesn't have a problem with the "member function pointers" I used.

git-svn-id: https://dolphin-emu.googlecode.com/svn/trunk@1594 8ced0084-cf51-0410-be5f-012b33b47a6e
hrydgard 2008-12-19 21:24:52 +00:00
parent b5dcdcf779
commit 104acd5bc1
31 changed files with 1297 additions and 1153 deletions
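
A minimal, hypothetical sketch of the pattern the message describes (ExampleBlock, EmitBinaryOp and the constants are invented; it assumes the XEmitter/XCodeBlock declarations from x64Emitter.h further down in this diff):

#include "x64Emitter.h"

// A code generator that inherits the emitter: ADD, SUB, MOV etc. can be called
// directly, and the code pointer lives in the object instead of a global.
class ExampleBlock : public Gen::XCodeBlock
{
public:
    // 'op' is a pointer to an XEmitter member, e.g. &Gen::XEmitter::ADD or &Gen::XEmitter::SUB.
    void EmitBinaryOp(void (Gen::XEmitter::*op)(int, const Gen::OpArg &, const Gen::OpArg &))
    {
        using namespace Gen;
        MOV(32, R(EAX), Imm32(1));           // no "emit." prefix needed
        (this->*op)(32, R(EAX), Imm32(2));   // dispatch through the member function pointer
    }
};

// Usage sketch:
//   ExampleBlock block;
//   block.AllocCodeSpace(4096);   // from XCodeBlock - sets up the code pointer
//   block.EmitBinaryOp(&Gen::XEmitter::ADD);
//   block.EmitBinaryOp(&Gen::XEmitter::SUB);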

View File

@ -25,7 +25,7 @@ using namespace Gen;
// ====================================
// Sets up a __cdecl function.
void ABI_EmitPrologue(int maxCallParams)
void XEmitter::ABI_EmitPrologue(int maxCallParams)
{
#ifdef _M_IX86
// Don't really need to do anything
@ -40,7 +40,8 @@ void ABI_EmitPrologue(int maxCallParams)
#error Arch not supported
#endif
}
void ABI_EmitEpilogue(int maxCallParams)
void XEmitter::ABI_EmitEpilogue(int maxCallParams)
{
#ifdef _M_IX86
RET();
@ -60,14 +61,14 @@ void ABI_EmitEpilogue(int maxCallParams)
// Shared code between Win32 and Unix32
// ====================================
void ABI_CallFunctionC(void *func, u32 param1) {
void XEmitter::ABI_CallFunctionC(void *func, u32 param1) {
ABI_AlignStack(1 * 4);
PUSH(32, Imm32(param1));
CALL(func);
ABI_RestoreStack(1 * 4);
}
void ABI_CallFunctionCC(void *func, u32 param1, u32 param2) {
void XEmitter::ABI_CallFunctionCC(void *func, u32 param1, u32 param2) {
ABI_AlignStack(2 * 4);
PUSH(32, Imm32(param2));
PUSH(32, Imm32(param1));
@ -76,14 +77,14 @@ void ABI_CallFunctionCC(void *func, u32 param1, u32 param2) {
}
// Pass a register as a parameter.
void ABI_CallFunctionR(void *func, X64Reg reg1) {
void XEmitter::ABI_CallFunctionR(void *func, X64Reg reg1) {
ABI_AlignStack(1 * 4);
PUSH(32, R(reg1));
CALL(func);
ABI_RestoreStack(1 * 4);
}
void ABI_CallFunctionRR(void *func, Gen::X64Reg reg1, Gen::X64Reg reg2)
void XEmitter::ABI_CallFunctionRR(void *func, Gen::X64Reg reg1, Gen::X64Reg reg2)
{
ABI_AlignStack(2 * 4);
PUSH(32, R(reg2));
@ -92,7 +93,7 @@ void ABI_CallFunctionRR(void *func, Gen::X64Reg reg1, Gen::X64Reg reg2)
ABI_RestoreStack(2 * 4);
}
void ABI_CallFunctionAC(void *func, const Gen::OpArg &arg1, u32 param2)
void XEmitter::ABI_CallFunctionAC(void *func, const Gen::OpArg &arg1, u32 param2)
{
ABI_AlignStack(2 * 4);
PUSH(32, arg1);
@ -101,7 +102,7 @@ void ABI_CallFunctionAC(void *func, const Gen::OpArg &arg1, u32 param2)
ABI_RestoreStack(2 * 4);
}
void ABI_PushAllCalleeSavedRegsAndAdjustStack() {
void XEmitter::ABI_PushAllCalleeSavedRegsAndAdjustStack() {
// Note: 4 * 4 = 16 bytes, so alignment is preserved.
PUSH(EBP);
PUSH(EBX);
@ -109,14 +110,14 @@ void ABI_PushAllCalleeSavedRegsAndAdjustStack() {
PUSH(EDI);
}
void ABI_PopAllCalleeSavedRegsAndAdjustStack() {
void XEmitter::ABI_PopAllCalleeSavedRegsAndAdjustStack() {
POP(EDI);
POP(ESI);
POP(EBX);
POP(EBP);
}
unsigned int ABI_GetAlignedFrameSize(unsigned int frameSize) {
unsigned int XEmitter::ABI_GetAlignedFrameSize(unsigned int frameSize) {
frameSize += 4; // reserve space for return address
unsigned int alignedSize =
#ifdef __GNUC__
@ -128,7 +129,7 @@ unsigned int ABI_GetAlignedFrameSize(unsigned int frameSize) {
}
void ABI_AlignStack(unsigned int frameSize) {
void XEmitter::ABI_AlignStack(unsigned int frameSize) {
// Mac OS X requires the stack to be 16-byte aligned before every call.
// Linux requires the stack to be 16-byte aligned before calls that put SSE
// vectors on the stack, but since we do not keep track of which calls do that,
@ -145,7 +146,7 @@ void ABI_AlignStack(unsigned int frameSize) {
#endif
}
void ABI_RestoreStack(unsigned int frameSize) {
void XEmitter::ABI_RestoreStack(unsigned int frameSize) {
unsigned int alignedSize = ABI_GetAlignedFrameSize(frameSize);
alignedSize -= 4; // return address is POPped at end of call
if (alignedSize != 0) {
@ -155,26 +156,26 @@ void ABI_RestoreStack(unsigned int frameSize) {
#else
void ABI_CallFunctionC(void *func, u32 param1) {
void XEmitter::ABI_CallFunctionC(void *func, u32 param1) {
MOV(32, R(ABI_PARAM1), Imm32(param1));
CALL(func);
}
void ABI_CallFunctionCC(void *func, u32 param1, u32 param2) {
void XEmitter::ABI_CallFunctionCC(void *func, u32 param1, u32 param2) {
MOV(32, R(ABI_PARAM1), Imm32(param1));
MOV(32, R(ABI_PARAM2), Imm32(param2));
CALL(func);
}
// Pass a register as a parameter.
void ABI_CallFunctionR(void *func, X64Reg reg1) {
void XEmitter::ABI_CallFunctionR(void *func, X64Reg reg1) {
if (reg1 != ABI_PARAM1)
MOV(32, R(ABI_PARAM1), R(reg1));
CALL(func);
}
// Pass a register as a parameter.
void ABI_CallFunctionRR(void *func, X64Reg reg1, X64Reg reg2) {
void XEmitter::ABI_CallFunctionRR(void *func, X64Reg reg1, X64Reg reg2) {
if (reg1 != ABI_PARAM1)
MOV(32, R(ABI_PARAM1), R(reg1));
if (reg2 != ABI_PARAM2)
@ -182,7 +183,7 @@ void ABI_CallFunctionRR(void *func, X64Reg reg1, X64Reg reg2) {
CALL(func);
}
void ABI_CallFunctionAC(void *func, const Gen::OpArg &arg1, u32 param2)
void XEmitter::ABI_CallFunctionAC(void *func, const Gen::OpArg &arg1, u32 param2)
{
if (!arg1.IsSimpleReg(ABI_PARAM1))
MOV(32, R(ABI_PARAM1), arg1);
@ -190,21 +191,21 @@ void ABI_CallFunctionAC(void *func, const Gen::OpArg &arg1, u32 param2)
CALL(func);
}
unsigned int ABI_GetAlignedFrameSize(unsigned int frameSize) {
unsigned int XEmitter::ABI_GetAlignedFrameSize(unsigned int frameSize) {
return frameSize;
}
void ABI_AlignStack(unsigned int /*frameSize*/) {
void XEmitter::ABI_AlignStack(unsigned int /*frameSize*/) {
}
void ABI_RestoreStack(unsigned int /*frameSize*/) {
void XEmitter::ABI_RestoreStack(unsigned int /*frameSize*/) {
}
#ifdef _WIN32
// Win64 Specific Code
// ====================================
void ABI_PushAllCalleeSavedRegsAndAdjustStack() {
void XEmitter::ABI_PushAllCalleeSavedRegsAndAdjustStack() {
//we only want to do this once
PUSH(RBX);
PUSH(RSI);
@ -218,7 +219,7 @@ void ABI_PushAllCalleeSavedRegsAndAdjustStack() {
SUB(64, R(RSP), Imm8(0x28));
}
void ABI_PopAllCalleeSavedRegsAndAdjustStack() {
void XEmitter::ABI_PopAllCalleeSavedRegsAndAdjustStack() {
ADD(64, R(RSP), Imm8(0x28));
POP(R15);
POP(R14);
@ -232,7 +233,7 @@ void ABI_PopAllCalleeSavedRegsAndAdjustStack() {
// Win64 Specific Code
// ====================================
void ABI_PushAllCallerSavedRegsAndAdjustStack() {
void XEmitter::ABI_PushAllCallerSavedRegsAndAdjustStack() {
PUSH(RCX);
PUSH(RDX);
PUSH(RSI);
@ -245,7 +246,7 @@ void ABI_PushAllCallerSavedRegsAndAdjustStack() {
SUB(64, R(RSP), Imm8(0x28));
}
void ABI_PopAllCallerSavedRegsAndAdjustStack() {
void XEmitter::ABI_PopAllCallerSavedRegsAndAdjustStack() {
ADD(64, R(RSP), Imm8(0x28));
POP(R11);
POP(R10);
@ -260,7 +261,7 @@ void ABI_PopAllCallerSavedRegsAndAdjustStack() {
#else
// Unix64 Specific Code
// ====================================
void ABI_PushAllCalleeSavedRegsAndAdjustStack() {
void XEmitter::ABI_PushAllCalleeSavedRegsAndAdjustStack() {
PUSH(RBX);
PUSH(RBP);
PUSH(R12);
@ -270,7 +271,7 @@ void ABI_PushAllCalleeSavedRegsAndAdjustStack() {
PUSH(R15); //just to align stack. duped push/pop doesn't hurt.
}
void ABI_PopAllCalleeSavedRegsAndAdjustStack() {
void XEmitter::ABI_PopAllCalleeSavedRegsAndAdjustStack() {
POP(R15);
POP(R15);
POP(R14);
@ -280,7 +281,7 @@ void ABI_PopAllCalleeSavedRegsAndAdjustStack() {
POP(RBX);
}
void ABI_PushAllCallerSavedRegsAndAdjustStack() {
void XEmitter::ABI_PushAllCallerSavedRegsAndAdjustStack() {
PUSH(RCX);
PUSH(RDX);
PUSH(RSI);
@ -292,7 +293,7 @@ void ABI_PushAllCallerSavedRegsAndAdjustStack() {
PUSH(R11);
}
void ABI_PopAllCallerSavedRegsAndAdjustStack() {
void XEmitter::ABI_PopAllCallerSavedRegsAndAdjustStack() {
POP(R11);
POP(R11);
POP(R10);

View File

@ -18,8 +18,6 @@
#ifndef _JIT_ABI_H
#define _JIT_ABI_H
#include "x64Emitter.h"
// x86/x64 ABIs, and helpers for following them when JITing code.
// All conventions return values in EAX (+ possibly EDX).
@ -81,42 +79,5 @@
#endif
// Utility functions
// These only support u32 parameters, but that's enough for a lot of uses.
// These will destroy the first 1 or 2 "parameter regs".
void ABI_CallFunctionC(void *func, u32 param1);
void ABI_CallFunctionCC(void *func, u32 param1, u32 param2);
void ABI_CallFunctionAC(void *func, const Gen::OpArg &arg1, u32 param2);
// Pass a register as a parameter.
void ABI_CallFunctionR(void *func, Gen::X64Reg reg1);
void ABI_CallFunctionRR(void *func, Gen::X64Reg reg1, Gen::X64Reg reg2);
// A function that doesn't have any control over what it will do to regs,
// such as the dispatcher, should be surrounded by these.
void ABI_PushAllCalleeSavedRegsAndAdjustStack();
void ABI_PopAllCalleeSavedRegsAndAdjustStack();
// A function that doesn't know anything about its surroundings should
// be surrounded by these to establish a safe environment where it can roam free.
// An example is a backpatch injected function.
void ABI_PushAllCallerSavedRegsAndAdjustStack();
void ABI_PopAllCallerSavedRegsAndAdjustStack();
unsigned int ABI_GetAlignedFrameSize(unsigned int frameSize);
void ABI_AlignStack(unsigned int frameSize);
void ABI_RestoreStack(unsigned int frameSize);
// Sets up a __cdecl function.
// Only x64 really needs the parameter.
void ABI_EmitPrologue(int maxCallParams);
void ABI_EmitEpilogue(int maxCallParams);
#ifdef _M_IX86
inline int ABI_GetNumXMMRegs() { return 8; }
#else
inline int ABI_GetNumXMMRegs() { return 16; }
#endif
#endif // _JIT_ABI_H
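
A minimal usage sketch of these helpers now that they are emitter members; MyLogger and LogBlock are hypothetical names, assuming the XEmitter/XCodeBlock declarations from x64Emitter.h in this commit:

#include "x64Emitter.h"

static void MyLogger(u32 value) {}   // ordinary C/C++ function we want generated code to call

class LogBlock : public Gen::XCodeBlock
{
public:
    void EmitLogCall()
    {
        // Emits the argument setup, the CALL and the stack cleanup for the current ABI:
        // on x86 this pushes the immediate and restores the stack afterwards,
        // on x64 it just loads ABI_PARAM1 and CALLs.
        ABI_CallFunctionC((void *)&MyLogger, 0x12345678);
    }
};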

View File

@ -38,7 +38,7 @@
// This is purposely not a full wrapper for VirtualAlloc/mmap, but it
// provides exactly the primitive operations that Dolphin needs.
void* AllocateExecutableMemory(int size, bool low)
void* AllocateExecutableMemory(size_t size, bool low)
{
#ifdef _WIN32
void* ptr = VirtualAlloc(0, size, MEM_COMMIT, PAGE_EXECUTE_READWRITE);
@ -71,7 +71,7 @@ void* AllocateExecutableMemory(int size, bool low)
}
void* AllocateMemoryPages(int size)
void* AllocateMemoryPages(size_t size)
{
#ifdef _WIN32
void* ptr = VirtualAlloc(0, size, MEM_COMMIT, PAGE_READWRITE);
@ -99,7 +99,7 @@ void* AllocateMemoryPages(int size)
}
void FreeMemoryPages(void* ptr, int size)
void FreeMemoryPages(void* ptr, size_t size)
{
#ifdef _WIN32
if (ptr)
@ -113,7 +113,7 @@ void FreeMemoryPages(void* ptr, int size)
}
void WriteProtectMemory(void* ptr, int size, bool allowExecute)
void WriteProtectMemory(void* ptr, size_t size, bool allowExecute)
{
#ifdef _WIN32
VirtualProtect(ptr, size, allowExecute ? PAGE_EXECUTE_READ : PAGE_READONLY, 0);
@ -123,7 +123,7 @@ void WriteProtectMemory(void* ptr, int size, bool allowExecute)
}
void UnWriteProtectMemory(void* ptr, int size, bool allowExecute)
void UnWriteProtectMemory(void* ptr, size_t size, bool allowExecute)
{
#ifdef _WIN32
VirtualProtect(ptr, size, allowExecute ? PAGE_EXECUTE_READWRITE : PAGE_READONLY, 0);

View File

@ -18,14 +18,14 @@
#ifndef _MEMORYUTIL_H
#define _MEMORYUTIL_H
void* AllocateExecutableMemory(int size, bool low = true);
void* AllocateMemoryPages(int size);
void FreeMemoryPages(void* ptr, int size);
void WriteProtectMemory(void* ptr, int size, bool executable = false);
void UnWriteProtectMemory(void* ptr, int size, bool allowExecute);
void* AllocateExecutableMemory(size_t size, bool low = true);
void* AllocateMemoryPages(size_t size);
void FreeMemoryPages(void* ptr, size_t size);
void WriteProtectMemory(void* ptr, size_t size, bool executable = false);
void UnWriteProtectMemory(void* ptr, size_t size, bool allowExecute);
inline int GetPageSize() {return(4096);}
inline int GetPageSize() {return 4096;}
#endif

View File

@ -18,33 +18,29 @@
#include <map>
#include "Common.h"
#include "Thunk.h"
#include "x64Emitter.h"
#include "MemoryUtil.h"
#include "ABI.h"
#include "Thunk.h"
using namespace Gen;
ThunkManager thunks;
#define THUNK_ARENA_SIZE 1024*1024*1
namespace {
static std::map<void *, const u8 *> thunks;
u8 GC_ALIGNED32(saved_fp_state[16 * 4 * 4]);
u8 GC_ALIGNED32(saved_gpr_state[16 * 8]);
static u8 *thunk_memory;
static u8 *thunk_code;
static const u8 *save_regs;
static const u8 *load_regs;
static u16 saved_mxcsr;
}
void Thunk_Init()
namespace
{
thunk_memory = (u8 *)AllocateExecutableMemory(THUNK_ARENA_SIZE);
thunk_code = thunk_memory;
GenContext ctx(&thunk_code);
static u8 GC_ALIGNED32(saved_fp_state[16 * 4 * 4]);
static u8 GC_ALIGNED32(saved_gpr_state[16 * 8]);
static u16 saved_mxcsr;
} // namespace
using namespace Gen;
void ThunkManager::Init()
{
AllocCodeSpace(THUNK_ARENA_SIZE);
save_regs = GetCodePtr();
for (int i = 2; i < ABI_GetNumXMMRegs(); i++)
MOVAPS(M(saved_fp_state + i * 16), (X64Reg)(XMM0 + i));
@ -89,31 +85,27 @@ void Thunk_Init()
RET();
}
void Thunk_Reset()
void ThunkManager::Reset()
{
thunks.clear();
thunk_code = thunk_memory;
ResetCodePtr();
}
void Thunk_Shutdown()
void ThunkManager::Shutdown()
{
Thunk_Reset();
FreeMemoryPages(thunk_memory, THUNK_ARENA_SIZE);
thunk_memory = 0;
thunk_code = 0;
Reset();
FreeCodeSpace();
}
void *ProtectFunction(void *function, int num_params)
void *ThunkManager::ProtectFunction(void *function, int num_params)
{
std::map<void *, const u8 *>::iterator iter;
iter = thunks.find(function);
if (iter != thunks.end())
return (void *)iter->second;
if (!thunk_memory)
if (!region)
PanicAlert("Trying to protect functions before the emu is started. Bad bad bad.");
GenContext gen(&thunk_code);
const u8 *call_point = GetCodePtr();
// Make sure to align stack.
#ifdef _M_X64

View File

@ -18,6 +18,11 @@
#ifndef _THUNK_H
#define _THUNK_H
#include <map>
#include "Common.h"
#include "x64Emitter.h"
// This simple class creates a wrapper around a C/C++ function that saves all fp state
// before entering it, and restores it upon exit. This is required to be able to selectively
// call functions from generated code, without inflicting the performance hit and increase
@ -30,10 +35,21 @@
// NOT THREAD SAFE. This may only be used from the CPU thread.
// Any other thread using this stuff will be FATAL.
void Thunk_Init();
void Thunk_Reset();
void Thunk_Shutdown();
class ThunkManager : public Gen::XCodeBlock
{
std::map<void *, const u8 *> thunks;
void *ProtectFunction(void *function, int num_params);
const u8 *save_regs;
const u8 *load_regs;
public:
void Init();
void Reset();
void Shutdown();
void *ProtectFunction(void *function, int num_params);
};
extern ThunkManager thunks;
#endif
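
For context, a minimal usage sketch: ExampleJitBlock and SomeHardwareAccess are made up, but the CALL(thunks.ProtectFunction(...)) pattern is how the Jit uses this class elsewhere in this commit.

#include "Thunk.h"

static void SomeHardwareAccess(u32 address) {}   // stand-in for e.g. a hardware I/O handler

class ExampleJitBlock : public Gen::XCodeBlock   // hypothetical
{
public:
    void EmitSafeCall()
    {
        // ProtectFunction returns a small generated wrapper that saves and restores
        // FP/SSE state around the real function, so JIT code can CALL it freely.
        CALL(thunks.ProtectFunction((void *)&SomeHardwareAccess, 1));
    }
};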

File diff suppressed because it is too large.

View File

@ -21,217 +21,264 @@
#define _DOLPHIN_INTEL_CODEGEN
#include "Common.h"
#include "MemoryUtil.h"
namespace Gen
{
enum X64Reg
enum X64Reg
{
EAX = 0, EBX = 3, ECX = 1, EDX = 2,
ESI = 6, EDI = 7, EBP = 5, ESP = 4,
RAX = 0, RBX = 3, RCX = 1, RDX = 2,
RSI = 6, RDI = 7, RBP = 5, RSP = 4,
R8 = 8, R9 = 9, R10 = 10,R11 = 11,
R12 = 12,R13 = 13,R14 = 14,R15 = 15,
AL = 0, BL = 3, CL = 1, DL = 2,
AH = 4, BH = 7, CH = 5, DH = 6,
AX = 0, BX = 3, CX = 1, DX = 2,
SI = 6, DI = 7, BP = 5, SP = 4,
XMM0=0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7,
XMM8, XMM9, XMM10, XMM11, XMM12, XMM13, XMM14, XMM15,
INVALID_REG = 0xFFFFFFFF
};
enum CCFlags
{
CC_O = 0,
CC_NO = 1,
CC_B = 2, CC_C = 2, CC_NAE = 2,
CC_NB = 3, CC_NC = 3, CC_AE = 3,
CC_Z = 4, CC_E = 4,
CC_NZ = 5, CC_NE = 5,
CC_BE = 6, CC_NA = 6,
CC_NBE = 7, CC_A = 7,
CC_S = 8,
CC_NS = 9,
CC_P = 0xA, CC_PE = 0xA,
CC_NP = 0xB, CC_PO = 0xB,
CC_L = 0xC, CC_NGE = 0xC,
CC_NL = 0xD, CC_GE = 0xD,
CC_LE = 0xE, CC_NG = 0xE,
CC_NLE = 0xF, CC_G = 0xF
};
enum
{
NUMGPRs = 16,
NUMXMMs = 16,
};
enum
{
SCALE_NONE = 0,
SCALE_1 = 1,
SCALE_2 = 2,
SCALE_4 = 4,
SCALE_8 = 8,
SCALE_ATREG = 16,
SCALE_RIP = 0xFF,
SCALE_IMM8 = 0xF0,
SCALE_IMM16 = 0xF1,
SCALE_IMM32 = 0xF2,
SCALE_IMM64 = 0xF3,
};
enum NormalOp {
nrmADD,
nrmADC,
nrmSUB,
nrmSBB,
nrmAND,
nrmOR ,
nrmXOR,
nrmMOV,
nrmTEST,
nrmCMP,
nrmXCHG,
};
class XEmitter;
// RIP addressing does not benefit from micro op fusion on Core arch
struct OpArg
{
OpArg() {} // dummy op arg, used for storage
OpArg(u64 _offset, int _scale, X64Reg rmReg = RAX, X64Reg scaledReg = RAX)
{
EAX = 0, EBX = 3, ECX = 1, EDX = 2,
ESI = 6, EDI = 7, EBP = 5, ESP = 4,
RAX = 0, RBX = 3, RCX = 1, RDX = 2,
RSI = 6, RDI = 7, RBP = 5, RSP = 4,
R8 = 8, R9 = 9, R10 = 10,R11 = 11,
R12 = 12,R13 = 13,R14 = 14,R15 = 15,
operandReg = 0;
scale = (u8)_scale;
offsetOrBaseReg = (u8)rmReg;
indexReg = (u8)scaledReg;
//if scale == 0 never mind offseting
offset = _offset;
}
void WriteRex(XEmitter *emit, bool op64, int customOp = -1) const;
void WriteRest(XEmitter *emit, int extraBytes=0, X64Reg operandReg=(X64Reg)0xFF) const;
void WriteSingleByteOp(XEmitter *emit, u8 op, X64Reg operandReg, int bits);
// This one is public - must be written to
u64 offset; // use RIP-relative as much as possible - 64-bit immediates are not available.
u8 operandReg;
AL = 0, BL = 3, CL = 1, DL = 2,
AH = 4, BH = 7, CH = 5, DH = 6,
void WriteNormalOp(XEmitter *emit, bool toRM, NormalOp op, const OpArg &operand, int bits) const;
bool IsImm() const {return scale == SCALE_IMM8 || scale == SCALE_IMM16 || scale == SCALE_IMM32 || scale == SCALE_IMM64;}
bool IsSimpleReg() const {return scale == SCALE_NONE;}
bool IsSimpleReg(X64Reg reg) const {
if (!IsSimpleReg())
return false;
return GetSimpleReg() == reg;
}
AX = 0, BX = 3, CX = 1, DX = 2,
SI = 6, DI = 7, BP = 5, SP = 4,
XMM0=0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7,
XMM8, XMM9, XMM10, XMM11, XMM12, XMM13, XMM14, XMM15,
INVALID_REG = 0xFFFFFFFF
};
enum CCFlags
bool CanDoOpWith(const OpArg &other) const
{
CC_O = 0,
CC_NO = 1,
CC_B = 2, CC_C = 2, CC_NAE = 2,
CC_NB = 3, CC_NC = 3, CC_AE = 3,
CC_Z = 4, CC_E = 4,
CC_NZ = 5, CC_NE = 5,
CC_BE = 6, CC_NA = 6,
CC_NBE = 7, CC_A = 7,
CC_S = 8,
CC_NS = 9,
CC_P = 0xA, CC_PE = 0xA,
CC_NP = 0xB, CC_PO = 0xB,
CC_L = 0xC, CC_NGE = 0xC,
CC_NL = 0xD, CC_GE = 0xD,
CC_LE = 0xE, CC_NG = 0xE,
CC_NLE = 0xF, CC_G = 0xF
};
if (IsSimpleReg()) return true;
if (!IsSimpleReg() && !other.IsSimpleReg() && !other.IsImm()) return false;
return true;
}
enum
int GetImmBits() const
{
NUMGPRs = 16,
NUMXMMs = 16,
};
switch (scale)
{
case SCALE_IMM8: return 8;
case SCALE_IMM16: return 16;
case SCALE_IMM32: return 32;
case SCALE_IMM64: return 64;
default: return -1;
}
}
enum
X64Reg GetSimpleReg() const
{
SCALE_NONE = 0,
SCALE_1 = 1,
SCALE_2 = 2,
SCALE_4 = 4,
SCALE_8 = 8,
SCALE_ATREG = 16,
SCALE_RIP = 0xFF,
SCALE_IMM8 = 0xF0,
SCALE_IMM16 = 0xF1,
SCALE_IMM32 = 0xF2,
SCALE_IMM64 = 0xF3,
};
if (scale == SCALE_NONE)
return (X64Reg)offsetOrBaseReg;
else
return INVALID_REG;
}
private:
u8 scale;
u8 offsetOrBaseReg;
u8 indexReg;
};
inline OpArg M(void *ptr) {return OpArg((u64)ptr, (int)SCALE_RIP);}
inline OpArg R(X64Reg value) {return OpArg(0, SCALE_NONE, value);}
inline OpArg MatR(X64Reg value) {return OpArg(0, SCALE_ATREG, value);}
inline OpArg MDisp(X64Reg value, int offset) {
return OpArg((u32)offset, SCALE_ATREG, value); }
inline OpArg MComplex(X64Reg base, X64Reg scaled, int scale, int offset)
{
return OpArg(offset, scale, base, scaled);
}
inline OpArg Imm8 (u8 imm) {return OpArg(imm, SCALE_IMM8);}
inline OpArg Imm16(u16 imm) {return OpArg(imm, SCALE_IMM16);} //rarely used
inline OpArg Imm32(u32 imm) {return OpArg(imm, SCALE_IMM32);}
inline OpArg Imm64(u64 imm) {return OpArg(imm, SCALE_IMM64);}
#ifdef _M_X64
inline OpArg ImmPtr(void* imm) {return Imm64((u64)imm);}
#else
inline OpArg ImmPtr(void* imm) {return Imm32((u32)imm);}
#endif
struct FixupBranch
{
u8 *ptr;
int type; //0 = 8bit 1 = 32bit
};
enum SSECompare
{
EQ = 0,
LT,
LE,
UNORD,
NEQ,
NLT,
NLE,
ORD,
};
typedef const u8* JumpTarget;
class XEmitter
{
friend struct OpArg; // for Write8 etc
private:
u8 *code;
void Rex(int w, int r, int x, int b);
void WriteSimple1Byte(int bits, u8 byte, X64Reg reg);
void WriteSimple2Byte(int bits, u8 byte1, u8 byte2, X64Reg reg);
void WriteMulDivType(int bits, OpArg src, int ext);
void WriteBitSearchType(int bits, X64Reg dest, OpArg src, u8 byte2);
void WriteShift(int bits, OpArg dest, OpArg &shift, int ext);
void WriteMXCSR(OpArg arg, int ext);
void WriteSSEOp(int size, u8 sseOp, bool packed, X64Reg regOp, OpArg arg, int extrabytes = 0);
void WriteNormalOp(XEmitter *emit, int bits, NormalOp op, const OpArg &a1, const OpArg &a2);
protected:
inline void Write8(u8 value) {*code++ = value;}
inline void Write16(u16 value) {*(u16*)code = (value); code += 2;}
inline void Write32(u32 value) {*(u32*)code = (value); code += 4;}
inline void Write64(u64 value) {*(u64*)code = (value); code += 8;}
public:
XEmitter() { code = NULL; }
XEmitter(u8 *code_ptr) { code = code_ptr; }
void WriteModRM(int mod, int rm, int reg);
void WriteSIB(int scale, int index, int base);
void SetCodePtr(u8 *ptr);
void ReserveCodeSpace(int bytes);
const u8 *AlignCode4();
const u8 *AlignCode16();
const u8 *AlignCodePage();
const u8 *GetCodePtr();
const u8 *GetCodePtr() const;
u8 *GetWritableCodePtr();
// Looking for one of these? It's BANNED!! Some instructions are slow on modern CPUs:
// INC, DEC, LOOP, LOOPNE, LOOPE, ENTER, LEAVE, XCHG, XLAT, REP MOVSB/MOVSD, REP SCASD + other string instr.,
// INC and DEC are slow on Intel Core, but not on AMD. They create a
// false flag dependency because they only update a subset of the flags.
// XCHG is SLOW and should be avoided.
// Safe way to temporarily redirect the code generator.
class GenContext
{
u8 **code_ptr_ptr;
u8 *saved_ptr;
public:
GenContext(u8 **code_ptr_ptr_)
{
saved_ptr = GetWritableCodePtr();
code_ptr_ptr = code_ptr_ptr_;
SetCodePtr(*code_ptr_ptr);
}
~GenContext()
{
*code_ptr_ptr = GetWritableCodePtr();
SetCodePtr(saved_ptr);
}
};
enum NormalOp {
nrmADD,
nrmADC,
nrmSUB,
nrmSBB,
nrmAND,
nrmOR ,
nrmXOR,
nrmMOV,
nrmTEST,
nrmCMP,
nrmXCHG,
};
// Make the generation routine examine which direction to go
// probably has to be a static
// RIP addressing does not benefit from micro op fusion on Core arch
struct OpArg
{
OpArg() {} //dummy op arg, used for storage
OpArg(u64 _offset, int _scale, X64Reg rmReg = RAX, X64Reg scaledReg = RAX)
{
operandReg = 0;
scale = (u8)_scale;
offsetOrBaseReg = (u8)rmReg;
indexReg = (u8)scaledReg;
//if scale == 0 never mind offseting
offset = _offset;
}
void WriteRex(bool op64, int customOp = -1) const;
void WriteRest(int extraBytes=0, X64Reg operandReg=(X64Reg)0xFF) const;
void WriteSingleByteOp(u8 op, X64Reg operandReg, int bits);
//This one is public - must be written to
u64 offset; //use RIP-relative as much as possible - avoid 64-bit immediates at all costs
u8 operandReg;
void WriteNormalOp(bool toRM, NormalOp op, const OpArg &operand, int bits) const;
bool IsImm() const {return scale == SCALE_IMM8 || scale == SCALE_IMM16 || scale == SCALE_IMM32 || scale == SCALE_IMM64;}
bool IsSimpleReg() const {return scale == SCALE_NONE;}
bool IsSimpleReg(X64Reg reg) const {
if (!IsSimpleReg())
return false;
return GetSimpleReg() == reg;
}
bool CanDoOpWith(const OpArg &other) const
{
if (IsSimpleReg()) return true;
if (!IsSimpleReg() && !other.IsSimpleReg() && !other.IsImm()) return false;
return true;
}
int GetImmBits() const
{
switch (scale)
{
case SCALE_IMM8: return 8;
case SCALE_IMM16: return 16;
case SCALE_IMM32: return 32;
case SCALE_IMM64: return 64;
default: return -1;
}
}
X64Reg GetSimpleReg() const
{
if (scale == SCALE_NONE)
return (X64Reg)offsetOrBaseReg;
else
return INVALID_REG;
}
private:
u8 scale;
u8 offsetOrBaseReg;
u8 indexReg;
};
inline OpArg M(void *ptr) {return OpArg((u64)ptr, (int)SCALE_RIP);}
inline OpArg R(X64Reg value) {return OpArg(0, SCALE_NONE, value);}
inline OpArg MatR(X64Reg value) {return OpArg(0, SCALE_ATREG, value);}
inline OpArg MDisp(X64Reg value, int offset) {
return OpArg((u32)offset, SCALE_ATREG, value); }
inline OpArg MComplex(X64Reg base, X64Reg scaled, int scale, int offset)
{
return OpArg(offset, scale, base, scaled);
}
inline OpArg Imm8 (u8 imm) {return OpArg(imm, SCALE_IMM8);}
inline OpArg Imm16(u16 imm) {return OpArg(imm, SCALE_IMM16);} //rarely used
inline OpArg Imm32(u32 imm) {return OpArg(imm, SCALE_IMM32);}
inline OpArg Imm64(u64 imm) {return OpArg(imm, SCALE_IMM64);}
#ifdef _M_X64
inline OpArg ImmPtr(void* imm) {return Imm64((u64)imm);}
#else
inline OpArg ImmPtr(void* imm) {return Imm32((u32)imm);}
#endif
// Debug breakpoint
void INT3();
// Do nothing
void NOP(int count = 1); //nop padding - TODO: fast nop slides, for amd and intel (check their manuals)
// Save energy in wait-loops on P4 only. Probably not too useful.
void PAUSE();
void RET();
// Flag control
void STC();
void CLC();
void CMC();
// These two cannot be executed in 64-bit mode on early Intel 64-bit CPUs, only on Core 2 and AMD!
void LAHF(); // 3 cycle vector path
void SAHF(); // direct path fast
// Stack control
void PUSH(X64Reg reg);
void POP(X64Reg reg);
void PUSH(int bits, const OpArg &reg);
void POP(int bits, const OpArg &reg);
void PUSHF();
void POPF();
typedef const u8* JumpTarget;
struct FixupBranch
{
u8 *ptr;
int type; //0 = 8bit 1 = 32bit
};
// Flow control
void RET();
void RET_FAST();
void UD2();
FixupBranch J(bool force5bytes = false);
void JMP(const u8 * addr, bool force5Bytes = false);
@ -239,7 +286,7 @@ namespace Gen
void JMPptr(const OpArg &arg);
void JMPself(); //infinite loop!
void CALL(void *fnptr);
void CALL(const void *fnptr);
void CALLptr(OpArg arg);
FixupBranch J_CC(CCFlags conditionCode, bool force5bytes = false);
@ -248,66 +295,20 @@ namespace Gen
void SetJumpTarget(const FixupBranch &branch);
//WARNING - INC and DEC are slow on Intel Core, but not on AMD, since they create
//false flag dependencies because they only update a subset of the flags
// ector - I hereby BAN inc and dec due to their horribleness :P
// void INC(int bits, OpArg arg);
// void DEC(int bits, OpArg arg);
void SETcc(CCFlags flag, OpArg dest);
// Note: CMOV brings small if any benefit on current cpus, unfortunately.
// Note: CMOV brings small if any benefit on current cpus.
void CMOVcc(int bits, X64Reg dest, OpArg src, CCFlags flag);
// Fences
void LFENCE();
void MFENCE();
void SFENCE();
// Bit scan
void BSF(int bits, X64Reg dest, OpArg src); //bottom bit to top bit
void BSR(int bits, X64Reg dest, OpArg src); //top bit to bottom bit
//These two cannot be executed on early Intel 64-bit CPUs, only on AMD!
void LAHF(); // 3 cycle vector path
void SAHF(); // direct path fast
//Looking for one of these? It's BANNED!! Some instructions are slow on modern CPUs:
//LOOP, LOOPNE, LOOPE, ENTER, LEAVE, XLAT, REP MOVSB/MOVSD, REP SCASD + other string instr.,
//Actually REP MOVSD could be useful :P
void MOVNTI(int bits, OpArg dest, X64Reg src);
void MUL(int bits, OpArg src); //UNSIGNED
void DIV(int bits, OpArg src);
void IMUL(int bits, OpArg src); //SIGNED
void IDIV(int bits, OpArg src);
void IMUL(int bits, X64Reg regOp, OpArg src);
void IMUL(int bits, X64Reg regOp, OpArg src, OpArg imm);
void NEG(int bits, OpArg src);
void NOT(int bits, OpArg src);
void ROL(int bits, OpArg dest, OpArg shift);
void ROR(int bits, OpArg dest, OpArg shift);
void RCL(int bits, OpArg dest, OpArg shift);
void RCR(int bits, OpArg dest, OpArg shift);
void SHL(int bits, OpArg dest, OpArg shift);
void SHR(int bits, OpArg dest, OpArg shift);
void SAR(int bits, OpArg dest, OpArg shift);
void CWD(int bits = 16);
inline void CDQ() {CWD(32);}
inline void CQO() {CWD(64);}
void CBW(int bits = 8);
inline void CWDE() {CBW(16);}
inline void CDQE() {CBW(32);}
void LEA(int bits, X64Reg dest, OpArg src);
// Cache control
enum PrefetchLevel
{
PF_NTA, //Non-temporal (data used once and only once)
@ -316,58 +317,82 @@ namespace Gen
PF_T2, //Levels 3+ (aliased to T0 on AMD)
};
void PREFETCH(PrefetchLevel level, OpArg arg);
void MOVNTI(int bits, OpArg dest, X64Reg src);
void MOVNTDQ(OpArg arg, X64Reg regOp);
void MOVNTPS(OpArg arg, X64Reg regOp);
void MOVNTPD(OpArg arg, X64Reg regOp);
// Multiplication / division
void MUL(int bits, OpArg src); //UNSIGNED
void IMUL(int bits, OpArg src); //SIGNED
void IMUL(int bits, X64Reg regOp, OpArg src);
void IMUL(int bits, X64Reg regOp, OpArg src, OpArg imm);
void DIV(int bits, OpArg src);
void IDIV(int bits, OpArg src);
// Shift
void ROL(int bits, OpArg dest, OpArg shift);
void ROR(int bits, OpArg dest, OpArg shift);
void RCL(int bits, OpArg dest, OpArg shift);
void RCR(int bits, OpArg dest, OpArg shift);
void SHL(int bits, OpArg dest, OpArg shift);
void SHR(int bits, OpArg dest, OpArg shift);
void SAR(int bits, OpArg dest, OpArg shift);
// Extend EAX into EDX in various ways
void CWD(int bits = 16);
inline void CDQ() {CWD(32);}
inline void CQO() {CWD(64);}
void CBW(int bits = 8);
inline void CWDE() {CBW(16);}
inline void CDQE() {CBW(32);}
// Load effective address
void LEA(int bits, X64Reg dest, OpArg src);
// Integer arithmetic
void NEG (int bits, OpArg src);
void ADD (int bits, const OpArg &a1, const OpArg &a2);
void ADC (int bits, const OpArg &a1, const OpArg &a2);
void SUB (int bits, const OpArg &a1, const OpArg &a2);
void SBB (int bits, const OpArg &a1, const OpArg &a2);
void AND (int bits, const OpArg &a1, const OpArg &a2);
void CMP (int bits, const OpArg &a1, const OpArg &a2);
// Bit operations
void NOT (int bits, OpArg src);
void OR (int bits, const OpArg &a1, const OpArg &a2);
void XOR (int bits, const OpArg &a1, const OpArg &a2);
void MOV (int bits, const OpArg &a1, const OpArg &a2);
void TEST(int bits, const OpArg &a1, const OpArg &a2);
void CMP (int bits, const OpArg &a1, const OpArg &a2);
// XCHG is SLOW and should be avoided.
//void XCHG(int bits, const OpArg &a1, const OpArg &a2);
// Are these useful at all? Consider removing.
void XCHG(int bits, const OpArg &a1, const OpArg &a2);
void XCHG_AHAL();
// Byte swapping (32 and 64-bit only).
void BSWAP(int bits, X64Reg reg);
// Sign/zero extension
void MOVSX(int dbits, int sbits, X64Reg dest, OpArg src); //automatically uses MOVSXD if necessary
void MOVZX(int dbits, int sbits, X64Reg dest, OpArg src);
enum SSECompare
{
EQ = 0,
LT,
LE,
UNORD,
NEQ,
NLT,
NLE,
ORD,
};
// WARNING - These two take 11-13 cycles and are VectorPath! (AMD64)
void STMXCSR(OpArg memloc);
void LDMXCSR(OpArg memloc);
// Regular SSE/SSE2 instructions
// Prefixes
void LOCK();
void REP();
void REPNE();
void FWAIT();
// SSE/SSE2: Floating point arithmetic
void ADDSS(X64Reg regOp, OpArg arg);
void ADDSD(X64Reg regOp, OpArg arg);
void SUBSS(X64Reg regOp, OpArg arg);
void SUBSD(X64Reg regOp, OpArg arg);
void CMPSS(X64Reg regOp, OpArg arg, u8 compare);
void CMPSD(X64Reg regOp, OpArg arg, u8 compare);
void ANDSS(X64Reg regOp, OpArg arg);
void ANDSD(X64Reg regOp, OpArg arg);
void ANDNSS(X64Reg regOp, OpArg arg);
void ANDNSD(X64Reg regOp, OpArg arg);
void ORSS(X64Reg regOp, OpArg arg);
void ORSD(X64Reg regOp, OpArg arg);
void XORSS(X64Reg regOp, OpArg arg);
void XORSD(X64Reg regOp, OpArg arg);
void MULSS(X64Reg regOp, OpArg arg);
void MULSD(X64Reg regOp, OpArg arg);
void DIVSS(X64Reg regOp, OpArg arg);
@ -381,45 +406,65 @@ namespace Gen
void RSQRTSS(X64Reg regOp, OpArg arg);
void RSQRTSD(X64Reg regOp, OpArg arg);
void COMISS(X64Reg regOp, OpArg arg);
void COMISD(X64Reg regOp, OpArg arg);
// SSE/SSE2: Floating point bitwise (yes)
void CMPSS(X64Reg regOp, OpArg arg, u8 compare);
void CMPSD(X64Reg regOp, OpArg arg, u8 compare);
void ANDSS(X64Reg regOp, OpArg arg);
void ANDSD(X64Reg regOp, OpArg arg);
void ANDNSS(X64Reg regOp, OpArg arg);
void ANDNSD(X64Reg regOp, OpArg arg);
void ORSS(X64Reg regOp, OpArg arg);
void ORSD(X64Reg regOp, OpArg arg);
void XORSS(X64Reg regOp, OpArg arg);
void XORSD(X64Reg regOp, OpArg arg);
// SSE/SSE2: Floating point packed arithmetic (x4 for float, x2 for double)
void ADDPS(X64Reg regOp, OpArg arg);
void ADDPD(X64Reg regOp, OpArg arg);
void SUBPS(X64Reg regOp, OpArg arg);
void SUBPD(X64Reg regOp, OpArg arg);
void CMPPS(X64Reg regOp, OpArg arg, u8 compare);
void CMPPD(X64Reg regOp, OpArg arg, u8 compare);
void ANDPS(X64Reg regOp, OpArg arg);
void ANDPD(X64Reg regOp, OpArg arg);
void ANDNPS(X64Reg regOp, OpArg arg);
void ANDNPD(X64Reg regOp, OpArg arg);
void ORPS(X64Reg regOp, OpArg arg);
void ORPD(X64Reg regOp, OpArg arg);
void XORPS(X64Reg regOp, OpArg arg);
void XORPD(X64Reg regOp, OpArg arg);
void MULPS(X64Reg regOp, OpArg arg);
void MULPD(X64Reg regOp, OpArg arg);
void DIVPS(X64Reg regOp, OpArg arg);
void DIVPD(X64Reg regOp, OpArg arg);
void MINPS(X64Reg regOp, OpArg arg);
void MINPD(X64Reg regOp, OpArg arg);
void MAXPS(X64Reg regOp, OpArg arg);
void MAXPD(X64Reg regOp, OpArg arg);
void CMPPD(X64Reg regOp, OpArg arg, u8 compare);
void MULPS(X64Reg regOp, OpArg arg);
void MULPD(X64Reg regOp, OpArg arg);
void DIVPS(X64Reg regOp, OpArg arg);
void DIVPD(X64Reg regOp, OpArg arg);
void MINPS(X64Reg regOp, OpArg arg);
void MINPD(X64Reg regOp, OpArg arg);
void MAXPS(X64Reg regOp, OpArg arg);
void MAXPD(X64Reg regOp, OpArg arg);
void SQRTPS(X64Reg regOp, OpArg arg);
void SQRTPD(X64Reg regOp, OpArg arg);
void RSQRTPS(X64Reg regOp, OpArg arg);
void RSQRTPD(X64Reg regOp, OpArg arg);
// SSE/SSE2: Floating point packed bitwise (x4 for float, x2 for double)
void ANDPS(X64Reg regOp, OpArg arg);
void ANDPD(X64Reg regOp, OpArg arg);
void ANDNPS(X64Reg regOp, OpArg arg);
void ANDNPD(X64Reg regOp, OpArg arg);
void ORPS(X64Reg regOp, OpArg arg);
void ORPD(X64Reg regOp, OpArg arg);
void XORPS(X64Reg regOp, OpArg arg);
void XORPD(X64Reg regOp, OpArg arg);
// SSE/SSE2: Shuffle components. These are tricky - see Intel documentation.
void SHUFPS(X64Reg regOp, OpArg arg, u8 shuffle);
void SHUFPD(X64Reg regOp, OpArg arg, u8 shuffle);
// SSE/SSE2: Useful alternative to shuffle in some cases.
void MOVDDUP(X64Reg regOp, OpArg arg);
void UNPCKLPD(X64Reg dest, OpArg src);
void UNPCKHPD(X64Reg dest, OpArg src);
// SSE/SSE2: Compares.
void COMISS(X64Reg regOp, OpArg arg);
void COMISD(X64Reg regOp, OpArg arg);
void UCOMISS(X64Reg regOp, OpArg arg);
void UCOMISD(X64Reg regOp, OpArg arg);
// SSE/SSE2: Moves. Use the right data type for your data, in most cases.
void MOVAPS(X64Reg regOp, OpArg arg);
void MOVAPD(X64Reg regOp, OpArg arg);
void MOVAPS(OpArg arg, X64Reg regOp);
@ -435,20 +480,20 @@ namespace Gen
void MOVSS(OpArg arg, X64Reg regOp);
void MOVSD(OpArg arg, X64Reg regOp);
void MOVMSKPS(X64Reg dest, OpArg arg);
void MOVMSKPD(X64Reg dest, OpArg arg);
void MOVD_xmm(X64Reg dest, const OpArg &arg);
void MOVQ_xmm(X64Reg dest, OpArg arg);
void MOVD_xmm(const OpArg &arg, X64Reg src);
void MOVQ_xmm(OpArg arg, X64Reg src);
// SSE/SSE2: Generates a mask from the high bits of the components of the packed register in question.
void MOVMSKPS(X64Reg dest, OpArg arg);
void MOVMSKPD(X64Reg dest, OpArg arg);
// SSE2: Selective byte store, mask in src register. EDI/RDI specifies store address. This is a weird one.
void MASKMOVDQU(X64Reg dest, X64Reg src);
void LDDQU(X64Reg dest, OpArg src);
void UNPCKLPD(X64Reg dest, OpArg src);
void UNPCKHPD(X64Reg dest, OpArg src);
// SSE/SSE2: Data type conversions.
void CVTPS2PD(X64Reg dest, OpArg src);
void CVTPD2PS(X64Reg dest, OpArg src);
void CVTSS2SD(X64Reg dest, OpArg src);
@ -458,7 +503,7 @@ namespace Gen
void CVTPD2DQ(X64Reg regOp, OpArg arg);
void CVTDQ2PS(X64Reg regOp, const OpArg &arg);
//Integer SSE instructions
// SSE2: Packed integer instructions
void PACKSSDW(X64Reg dest, OpArg arg);
void PACKSSWB(X64Reg dest, OpArg arg);
//void PACKUSDW(X64Reg dest, OpArg arg);
@ -528,42 +573,138 @@ namespace Gen
void RTDSC();
void CallCdeclFunction3(void* fnptr, u32 arg0, u32 arg1, u32 arg2);
void CallCdeclFunction4(void* fnptr, u32 arg0, u32 arg1, u32 arg2, u32 arg3);
void CallCdeclFunction5(void* fnptr, u32 arg0, u32 arg1, u32 arg2, u32 arg3, u32 arg4);
void CallCdeclFunction6(void* fnptr, u32 arg0, u32 arg1, u32 arg2, u32 arg3, u32 arg4, u32 arg5);
// Utility functions
// These only support u32 parameters, but that's enough for a lot of uses.
// These will destroy the first 1 or 2 "parameter regs".
void ABI_CallFunctionC(void *func, u32 param1);
void ABI_CallFunctionCC(void *func, u32 param1, u32 param2);
void ABI_CallFunctionAC(void *func, const Gen::OpArg &arg1, u32 param2);
// Pass a register as a parameter.
void ABI_CallFunctionR(void *func, Gen::X64Reg reg1);
void ABI_CallFunctionRR(void *func, Gen::X64Reg reg1, Gen::X64Reg reg2);
// A function that doesn't have any control over what it will do to regs,
// such as the dispatcher, should be surrounded by these.
void ABI_PushAllCalleeSavedRegsAndAdjustStack();
void ABI_PopAllCalleeSavedRegsAndAdjustStack();
// A function that doesn't know anything about its surroundings should
// be surrounded by these to establish a safe environment where it can roam free.
// An example is a backpatch injected function.
void ABI_PushAllCallerSavedRegsAndAdjustStack();
void ABI_PopAllCallerSavedRegsAndAdjustStack();
unsigned int ABI_GetAlignedFrameSize(unsigned int frameSize);
void ABI_AlignStack(unsigned int frameSize);
void ABI_RestoreStack(unsigned int frameSize);
// Sets up a __cdecl function.
// Only x64 really needs the parameter.
void ABI_EmitPrologue(int maxCallParams);
void ABI_EmitEpilogue(int maxCallParams);
#ifdef _M_IX86
inline int ABI_GetNumXMMRegs() { return 8; }
#else
inline int ABI_GetNumXMMRegs() { return 16; }
#endif
// Strange call wrappers.
void CallCdeclFunction3(void* fnptr, u32 arg0, u32 arg1, u32 arg2);
void CallCdeclFunction4(void* fnptr, u32 arg0, u32 arg1, u32 arg2, u32 arg3);
void CallCdeclFunction5(void* fnptr, u32 arg0, u32 arg1, u32 arg2, u32 arg3, u32 arg4);
void CallCdeclFunction6(void* fnptr, u32 arg0, u32 arg1, u32 arg2, u32 arg3, u32 arg4, u32 arg5);
#if defined(_M_IX86) || !defined(_WIN32)
#define CallCdeclFunction3_I(a,b,c,d) CallCdeclFunction3((void *)(a), (b), (c), (d))
#define CallCdeclFunction4_I(a,b,c,d,e) CallCdeclFunction4((void *)(a), (b), (c), (d), (e))
#define CallCdeclFunction5_I(a,b,c,d,e,f) CallCdeclFunction5((void *)(a), (b), (c), (d), (e), (f))
#define CallCdeclFunction6_I(a,b,c,d,e,f,g) CallCdeclFunction6((void *)(a), (b), (c), (d), (e), (f), (g))
#define CallCdeclFunction3_I(a,b,c,d) CallCdeclFunction3((void *)(a), (b), (c), (d))
#define CallCdeclFunction4_I(a,b,c,d,e) CallCdeclFunction4((void *)(a), (b), (c), (d), (e))
#define CallCdeclFunction5_I(a,b,c,d,e,f) CallCdeclFunction5((void *)(a), (b), (c), (d), (e), (f))
#define CallCdeclFunction6_I(a,b,c,d,e,f,g) CallCdeclFunction6((void *)(a), (b), (c), (d), (e), (f), (g))
#define DECLARE_IMPORT(x)
#define DECLARE_IMPORT(x)
#else
// Comments from VertexLoader.cpp about these horrors:
// Comments from VertexLoader.cpp about these horrors:
// This is a horrible hack that is necessary in 64-bit mode because Opengl32.dll is based way, way above the 32-bit
// address space that is within reach of a CALL, and just doing &fn gives us these high uncallable addresses. So we
// want to grab the function pointers from the import table instead.
// This is a horrible hack that is necessary in 64-bit mode because Opengl32.dll is based way, way above the 32-bit
// address space that is within reach of a CALL, and just doing &fn gives us these high uncallable addresses. So we
// want to grab the function pointers from the import table instead.
void ___CallCdeclImport3(void* impptr, u32 arg0, u32 arg1, u32 arg2);
void ___CallCdeclImport4(void* impptr, u32 arg0, u32 arg1, u32 arg2, u32 arg3);
void ___CallCdeclImport5(void* impptr, u32 arg0, u32 arg1, u32 arg2, u32 arg3, u32 arg4);
void ___CallCdeclImport6(void* impptr, u32 arg0, u32 arg1, u32 arg2, u32 arg3, u32 arg4, u32 arg5);
void ___CallCdeclImport3(void* impptr, u32 arg0, u32 arg1, u32 arg2);
void ___CallCdeclImport4(void* impptr, u32 arg0, u32 arg1, u32 arg2, u32 arg3);
void ___CallCdeclImport5(void* impptr, u32 arg0, u32 arg1, u32 arg2, u32 arg3, u32 arg4);
void ___CallCdeclImport6(void* impptr, u32 arg0, u32 arg1, u32 arg2, u32 arg3, u32 arg4, u32 arg5);
#define CallCdeclFunction3_I(a,b,c,d) ___CallCdeclImport3(&__imp_##a,b,c,d)
#define CallCdeclFunction4_I(a,b,c,d,e) ___CallCdeclImport4(&__imp_##a,b,c,d,e)
#define CallCdeclFunction5_I(a,b,c,d,e,f) ___CallCdeclImport5(&__imp_##a,b,c,d,e,f)
#define CallCdeclFunction6_I(a,b,c,d,e,f,g) ___CallCdeclImport6(&__imp_##a,b,c,d,e,f,g)
#define CallCdeclFunction3_I(a,b,c,d) ___CallCdeclImport3(&__imp_##a,b,c,d)
#define CallCdeclFunction4_I(a,b,c,d,e) ___CallCdeclImport4(&__imp_##a,b,c,d,e)
#define CallCdeclFunction5_I(a,b,c,d,e,f) ___CallCdeclImport5(&__imp_##a,b,c,d,e,f)
#define CallCdeclFunction6_I(a,b,c,d,e,f,g) ___CallCdeclImport6(&__imp_##a,b,c,d,e,f,g)
#define DECLARE_IMPORT(x) extern "C" void *__imp_##x
#define DECLARE_IMPORT(x) extern "C" void *__imp_##x
#endif
}; // class XEmitter
}
// Everything that needs to generate X86 code should inherit from this.
// You get memory management for free, plus, you can use all the MOV etc functions without
// having to prefix them with gen-> or something similar.
class XCodeBlock : public XEmitter
{
protected:
u8 *region;
size_t region_size;
public:
XCodeBlock() : region(NULL), region_size(0) {}
virtual ~XCodeBlock() { if (region) FreeCodeSpace(); }
// Call this before you generate any code.
void AllocCodeSpace(int size)
{
region_size = size;
region = (u8*)AllocateExecutableMemory(region_size);
SetCodePtr(region);
}
// Always clear code space with breakpoints, so that if someone accidentally executes
// uninitialized memory, it just breaks into the debugger.
void ClearCodeSpace()
{
// x86/64: 0xCC = breakpoint
memset(region, 0xCC, region_size);
ResetCodePtr();
}
// Call this when shutting down. Don't rely on the destructor, even though it'll do the job.
void FreeCodeSpace()
{
FreeMemoryPages(region, region_size);
region = NULL;
region_size = 0;
}
// Cannot currently be undone. Will write protect the entire code region.
// Start over if you need to change the code (call FreeCodeSpace(), AllocCodeSpace()).
void WriteProtect()
{
WriteProtectMemory(region, region_size, true);
}
void ResetCodePtr()
{
SetCodePtr(region);
}
size_t GetSpaceLeft() const
{
return region_size - (GetCodePtr() - region);
}
};
} // namespace
#endif
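
A minimal lifecycle sketch for XCodeBlock, assuming the declarations above; ConstBlock, its routine and the function-pointer cast are hypothetical, but the allocate/emit/WriteProtect/free sequence mirrors how AsmRoutineManager and the Jit use the class in this commit.

#include "x64Emitter.h"

class ConstBlock : public Gen::XCodeBlock   // hypothetical
{
public:
    const u8 *fortyTwo;

    void Init()
    {
        using namespace Gen;
        AllocCodeSpace(4096);         // grab executable memory and point the emitter at it
        fortyTwo = GetCodePtr();      // remember where this routine starts
        MOV(32, R(EAX), Imm32(42));   // return value goes in EAX on both x86 and x64
        RET();
        WriteProtect();               // done emitting - make the region read/execute only
    }
    void Shutdown() { FreeCodeSpace(); }
};

// Usage sketch:
//   ConstBlock block;
//   block.Init();
//   int value = ((int (*)())block.fortyTwo)();   // value == 42
//   block.Shutdown();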

View File

@ -46,7 +46,7 @@ namespace HW
{
CoreTiming::Init();
Thunk_Init(); // not really hw, but this way we know it's inited early :P
thunks.Init(); // not really hw, but this way we know it's inited early :P
State_Init();
// Init the whole Hardware
@ -88,7 +88,7 @@ namespace HW
}
State_Shutdown();
Thunk_Shutdown();
thunks.Shutdown();
CoreTiming::Shutdown();
}

View File

@ -104,7 +104,7 @@ LONG NTAPI Handler(PEXCEPTION_POINTERS pPtrs)
//We could emulate the memory accesses here, but then they would still be around to take up
//execution resources. Instead, we backpatch into a generic memory call and retry.
u8 *new_rip = jit.BackPatch(codePtr, accessType, emAddress, ctx);
const u8 *new_rip = jit.BackPatch(codePtr, accessType, emAddress, ctx);
// Rip/Eip needs to be updated.
if (new_rip)

View File

@ -164,6 +164,8 @@ ps_adds1
Jit64 jit;
PPCAnalyst::CodeBuffer code_buffer(32000);
int CODE_SIZE = 1024*1024*16;
namespace CPUCompare
{
extern u32 m_BlockStart;
@ -171,6 +173,11 @@ namespace CPUCompare
void Jit64::Init()
{
if (Core::g_CoreStartupParameter.bJITUnlimitedCache)
{
CODE_SIZE = 1024*1024*8*8;
}
jo.optimizeStack = true;
jo.enableBlocklink = true; // Speed boost, but not 100% safe
#ifdef _M_X64
@ -182,6 +189,23 @@ namespace CPUCompare
jo.fpAccurateFlags = true;
jo.optimizeGatherPipe = true;
jo.fastInterrupts = false;
gpr.SetEmitter(this);
fpr.SetEmitter(this);
trampolines.Init();
AllocCodeSpace(CODE_SIZE);
InitCache();
asm_routines.Init();
}
void Jit64::Shutdown()
{
FreeCodeSpace();
ShutdownCache();
trampolines.Shutdown();
asm_routines.Shutdown();
}
void Jit64::WriteCallInterpreter(UGeckoInstruction _inst)
@ -271,7 +295,7 @@ namespace CPUCompare
else
{
MOV(32, M(&PC), Imm32(destination));
JMP(Asm::dispatcher, true);
JMP(asm_routines.dispatcher, true);
}
}
@ -280,7 +304,7 @@ namespace CPUCompare
MOV(32, M(&PC), R(EAX));
Cleanup();
SUB(32, M(&CoreTiming::downcount), js.downcountAmount > 127 ? Imm32(js.downcountAmount) : Imm8(js.downcountAmount));
JMP(Asm::dispatcher, true);
JMP(asm_routines.dispatcher, true);
}
void Jit64::WriteRfiExitDestInEAX()
@ -288,7 +312,7 @@ namespace CPUCompare
MOV(32, M(&PC), R(EAX));
Cleanup();
SUB(32, M(&CoreTiming::downcount), js.downcountAmount > 127 ? Imm32(js.downcountAmount) : Imm8(js.downcountAmount));
JMP(Asm::testExceptions, true);
JMP(asm_routines.testExceptions, true);
}
void Jit64::WriteExceptionExit(u32 exception)
@ -296,7 +320,7 @@ namespace CPUCompare
Cleanup();
OR(32, M(&PowerPC::ppcState.Exceptions), Imm32(exception));
MOV(32, M(&PC), Imm32(js.compilerPC + 4));
JMP(Asm::testExceptions, true);
JMP(asm_routines.testExceptions, true);
}
const u8* Jit64::DoJit(u32 emaddress, JitBlock &b)
@ -326,11 +350,13 @@ namespace CPUCompare
// Downcount flag check. The last block decremented downcounter, and the flag should still be available.
FixupBranch skip = J_CC(CC_NBE);
MOV(32, M(&PC), Imm32(js.blockStart));
JMP(Asm::doTiming, true); // downcount hit zero - go doTiming.
JMP(asm_routines.doTiming, true); // downcount hit zero - go doTiming.
SetJumpTarget(skip);
const u8 *normalEntry = GetCodePtr();
if (ImHereDebug) CALL((void *)&ImHere); //Used to get a trace of the last few blocks before a crash, sometimes VERY useful
if (ImHereDebug)
CALL((void *)&ImHere); //Used to get a trace of the last few blocks before a crash, sometimes VERY useful
if (js.fpa.any)
{
@ -338,7 +364,7 @@ namespace CPUCompare
TEST(32, M(&PowerPC::ppcState.msr), Imm32(1 << 13)); //Test FP enabled bit
FixupBranch b1 = J_CC(CC_NZ);
MOV(32, M(&PC), Imm32(js.blockStart));
JMP(Asm::fpException, true);
JMP(asm_routines.fpException, true);
SetJumpTarget(b1);
}
@ -348,7 +374,7 @@ namespace CPUCompare
TEST(32, M(&PowerPC::ppcState.Exceptions), Imm32(0xFFFFFFFF));
FixupBranch b1 = J_CC(CC_Z);
MOV(32, M(&PC), Imm32(js.blockStart));
JMP(Asm::testExceptions, true);
JMP(asm_routines.testExceptions, true);
SetJumpTarget(b1);
}
@ -404,7 +430,7 @@ namespace CPUCompare
if (jo.optimizeGatherPipe && js.fifoBytesThisBlock >= 32)
{
js.fifoBytesThisBlock -= 32;
CALL(ProtectFunction((void *)&GPFifo::CheckGatherPipe, 0));
CALL(thunks.ProtectFunction((void *)&GPFifo::CheckGatherPipe, 0));
}
PPCTables::CompileInstruction(ops[i].inst);

View File

@ -24,7 +24,9 @@
#include "../PPCAnalyst.h"
#include "JitCache.h"
#include "JitRegCache.h"
#include "x64Emitter.h"
#include "x64Analyzer.h"
#ifdef _WIN32
@ -47,8 +49,24 @@ struct CONTEXT
#endif
class Jit64
class TrampolineCache : public Gen::XCodeBlock
{
public:
void Init();
void Shutdown();
const u8 *GetReadTrampoline(const InstructionInfo &info);
const u8 *GetWriteTrampoline(const InstructionInfo &info);
};
class Jit64 : public Gen::XCodeBlock
{
TrampolineCache trampolines;
GPRRegCache gpr;
FPURegCache fpr;
public:
typedef void (*CompiledCode)();
@ -157,7 +175,7 @@ public:
bool RangeIntersect(int s1, int e1, int s2, int e2) const;
bool IsInJitCode(const u8 *codePtr);
u8 *BackPatch(u8 *codePtr, int accessType, u32 emAddress, CONTEXT *ctx);
const u8 *BackPatch(u8 *codePtr, int accessType, u32 emAddress, CONTEXT *ctx);
#define JIT_OPCODE 0
@ -165,6 +183,7 @@ public:
const u8* DoJit(u32 emaddress, JitBlock &b);
void Init();
void Shutdown();
// Utilities for use by opcodes
@ -188,10 +207,10 @@ public:
void ForceSinglePrecisionP(Gen::X64Reg xmm);
void JitClearCA();
void JitSetCA();
void tri_op(int d, int a, int b, bool reversible, void (*op)(Gen::X64Reg, Gen::OpArg));
void tri_op(int d, int a, int b, bool reversible, void (XEmitter::*op)(Gen::X64Reg, Gen::OpArg));
typedef u32 (*Operation)(u32 a, u32 b);
void regimmop(int d, int a, bool binary, u32 value, Operation doop, void(*op)(int, const Gen::OpArg&, const Gen::OpArg&), bool Rc = false, bool carry = false);
void fp_tri_op(int d, int a, int b, bool reversible, bool dupe, void (*op)(Gen::X64Reg, Gen::OpArg));
void regimmop(int d, int a, bool binary, u32 value, Operation doop, void (XEmitter::*op)(int, const Gen::OpArg&, const Gen::OpArg&), bool Rc = false, bool carry = false);
void fp_tri_op(int d, int a, int b, bool reversible, bool dupe, void (XEmitter::*op)(Gen::X64Reg, Gen::OpArg));
// OPCODES

View File

@ -31,27 +31,12 @@
#include "../../HW/CPUCompare.h"
#include "../../HW/GPFifo.h"
#include "../../Core.h"
#include "JitAsm.h"
using namespace Gen;
int blocksExecuted;
namespace Asm
{
const u8 *enterCode;
const u8 *testExceptions;
const u8 *fpException;
const u8 *doTiming;
const u8 *dispatcher;
const u8 *dispatcherNoCheck;
const u8 *dispatcherPcInEAX;
const u8 *computeRc;
const u8 *computeRcFp;
const u8 *fifoDirectWrite8;
const u8 *fifoDirectWrite16;
const u8 *fifoDirectWrite32;
const u8 *fifoDirectWriteFloat;
const u8 *fifoDirectWriteXmm64;
static int temp32;
bool compareEnabled = false;
@ -72,16 +57,15 @@ static bool enableStatistics = false;
//RBX - Base pointer of memory
//R15 - Pointer to array of block pointers
AsmRoutineManager asm_routines;
// PLAN: no more block numbers - crazy opcodes just contain offset within
// dynarec buffer
// At this offset - 4, there is an int specifying the block number.
void GenerateCommon();
#ifdef _M_IX86
void Generate()
void AsmRoutineManager::Generate()
{
enterCode = AlignCode16();
PUSH(EBP);
@ -129,7 +113,6 @@ void Generate()
ADD(32, M(&PowerPC::ppcState.DebugCount), Imm8(1));
}
//grab from list and jump to it
//INT3();
MOV(32, R(EDX), ImmPtr(jit.GetCodePointers()));
JMPptr(MComplex(EDX, EAX, 4, 0));
SetJumpTarget(notfound);
@ -180,12 +163,14 @@ void Generate()
#elif defined(_M_X64)
void Generate()
void AsmRoutineManager::Generate()
{
enterCode = AlignCode16();
ABI_PushAllCalleeSavedRegsAndAdjustStack();
if (!jit.GetCodePointers() || !Memory::base)
PanicAlert("Memory::base and jit.GetCodePointers() must return valid values");
MOV(64, R(RBX), Imm64((u64)Memory::base));
MOV(64, R(R15), Imm64((u64)jit.GetCodePointers())); //It's below 2GB so 32 bits are good enough
const u8 *outerLoop = GetCodePtr();
@ -264,7 +249,7 @@ void Generate()
}
#endif
void GenFifoWrite(int size)
void AsmRoutineManager::GenFifoWrite(int size)
{
// Assume value in ABI_PARAM1
PUSH(ESI);
@ -287,8 +272,7 @@ void GenFifoWrite(int size)
RET();
}
static int temp32;
void GenFifoFloatWrite()
void AsmRoutineManager::GenFifoFloatWrite()
{
// Assume value in XMM0
PUSH(ESI);
@ -306,7 +290,7 @@ void GenFifoFloatWrite()
RET();
}
void GenFifoXmm64Write()
void AsmRoutineManager::GenFifoXmm64Write()
{
// Assume value in XMM0. Assume pre-byteswapped (unlike the others here!)
PUSH(ESI);
@ -319,7 +303,7 @@ void GenFifoXmm64Write()
RET();
}
void GenerateCommon()
void AsmRoutineManager::GenerateCommon()
{
// USES_CR
computeRc = AlignCode16();
@ -364,5 +348,3 @@ void GenerateCommon()
SetJumpTarget(skip_fast_write);
CALL((void *)&Memory::Write_U8);*/
}
} // namespace Asm

View File

@ -14,33 +14,71 @@
// Official SVN repository and contact information can be found at
// http://code.google.com/p/dolphin-emu/
#ifndef _JITASM_H
#define _JITASM_H
namespace Asm
#include "x64Emitter.h"
// In Dolphin, we don't use inline assembly. Instead, we generate all machine-near
// code at runtime. In the case of fixed code like this, after writing it, we write
// protect the memory, essentially making it work just like precompiled code.
// There are some advantages to this approach:
// 1) No need to setup an external assembler in the build.
// 2) Cross platform, as long as it's x86/x64.
// 3) Can optimize code at runtime for the specific CPU model.
// There aren't really any disadvantages other than having to maintain an x86 emitter,
// which we have to do anyway :)
//
// To add a new asm routine, just add another const here, and add the code to Generate.
// Also, possibly increase the size of the code buffer.
class AsmRoutineManager : public Gen::XCodeBlock
{
extern const u8 *enterCode;
extern const u8 *dispatcher;
extern const u8 *dispatcherNoCheck;
extern const u8 *dispatcherPcInEAX;
extern const u8 *fpException;
extern const u8 *computeRc;
extern const u8 *computeRcFp;
extern const u8 *testExceptions;
extern const u8 *dispatchPcInEAX;
extern const u8 *doTiming;
extern const u8 *fifoDirectWrite8;
extern const u8 *fifoDirectWrite16;
extern const u8 *fifoDirectWrite32;
extern const u8 *fifoDirectWriteFloat;
extern const u8 *fifoDirectWriteXmm64;
extern bool compareEnabled;
private:
void Generate();
}
void GenerateCommon();
void GenFifoWrite(int size);
void GenFifoFloatWrite();
void GenFifoXmm64Write();
public:
void Init() {
AllocCodeSpace(8192);
Generate();
WriteProtect();
}
void Shutdown() {
FreeCodeSpace();
}
// Public generated functions. Just CALL(M((void*)func)) them.
const u8 *enterCode;
const u8 *dispatcher;
const u8 *dispatcherNoCheck;
const u8 *dispatcherPcInEAX;
const u8 *fpException;
const u8 *computeRc;
const u8 *computeRcFp;
const u8 *testExceptions;
const u8 *dispatchPcInEAX;
const u8 *doTiming;
const u8 *fifoDirectWrite8;
const u8 *fifoDirectWrite16;
const u8 *fifoDirectWrite32;
const u8 *fifoDirectWriteFloat;
const u8 *fifoDirectWriteXmm64;
bool compareEnabled;
};
extern AsmRoutineManager asm_routines;
#endif
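
As the comment above suggests, adding a routine is just a new member plus a few emitter calls in Generate(); a hypothetical sketch (clearEAX is made up):

// In AsmRoutineManager, add a member:
//     const u8 *clearEAX;
// In AsmRoutineManager::Generate() (or GenerateCommon()):
//     clearEAX = AlignCode16();
//     XOR(32, R(EAX), R(EAX));
//     RET();
// Then, from JIT code:
//     CALL((void *)asm_routines.clearEAX);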

View File

@ -33,7 +33,7 @@
using namespace Gen;
extern u8 *trampolineCodePtr;
void BackPatchError(const std::string &text, u8 *codePtr, u32 emAddress) {
u64 code_addr = (u64)codePtr;
disassembler disasm;
@ -51,17 +51,105 @@ void BackPatchError(const std::string &text, u8 *codePtr, u32 emAddress) {
return;
}
void TrampolineCache::Init()
{
AllocCodeSpace(1024 * 1024);
}
void TrampolineCache::Shutdown()
{
FreeCodeSpace();
}
// Extremely simplistic - just generate the requested trampoline. May reuse them in the future.
const u8 *TrampolineCache::GetReadTrampoline(const InstructionInfo &info)
{
if (GetSpaceLeft() < 1024)
PanicAlert("Trampoline cache full");
X64Reg addrReg = (X64Reg)info.scaledReg;
X64Reg dataReg = (X64Reg)info.regOperandReg;
const u8 *trampoline = GetCodePtr();
#ifdef _M_X64
// It's a read. Easy.
ABI_PushAllCallerSavedRegsAndAdjustStack();
if (addrReg != ABI_PARAM1)
MOV(32, R(ABI_PARAM1), R((X64Reg)addrReg));
if (info.displacement) {
ADD(32, R(ABI_PARAM1), Imm32(info.displacement));
}
switch (info.operandSize) {
case 4:
CALL(thunks.ProtectFunction((void *)&Memory::Read_U32, 1));
break;
}
ABI_PopAllCallerSavedRegsAndAdjustStack();
MOV(32, R(dataReg), R(EAX));
RET();
#endif
return trampoline;
}
// Extremely simplistic - just generate the requested trampoline. May reuse them in the future.
const u8 *TrampolineCache::GetWriteTrampoline(const InstructionInfo &info)
{
if (GetSpaceLeft() < 1024)
PanicAlert("Trampoline cache full");
X64Reg addrReg = (X64Reg)info.scaledReg;
X64Reg dataReg = (X64Reg)info.regOperandReg;
if (dataReg != EAX)
PanicAlert("Backpatch write - not through EAX");
const u8 *trampoline = GetCodePtr();
#ifdef _M_X64
// It's a write. Yay. Remember that we don't have to be super efficient since it's "just" a
// hardware access - we can take shortcuts.
//if (emAddress == 0xCC008000)
// PanicAlert("caught a fifo write");
CMP(32, R(addrReg), Imm32(0xCC008000));
FixupBranch skip_fast = J_CC(CC_NE, false);
MOV(32, R(ABI_PARAM1), R((X64Reg)dataReg));
CALL((void*)asm_routines.fifoDirectWrite32);
RET();
SetJumpTarget(skip_fast);
ABI_PushAllCallerSavedRegsAndAdjustStack();
if (addrReg != ABI_PARAM1) {
MOV(32, R(ABI_PARAM1), R((X64Reg)dataReg));
MOV(32, R(ABI_PARAM2), R((X64Reg)addrReg));
} else {
MOV(32, R(ABI_PARAM2), R((X64Reg)addrReg));
MOV(32, R(ABI_PARAM1), R((X64Reg)dataReg));
}
if (info.displacement) {
ADD(32, R(ABI_PARAM2), Imm32(info.displacement));
}
switch (info.operandSize) {
case 4:
CALL(thunks.ProtectFunction((void *)&Memory::Write_U32, 2));
break;
}
ABI_PopAllCallerSavedRegsAndAdjustStack();
RET();
#endif
return trampoline;
}
// This generates some fairly heavy trampolines, but:
// 1) It's really necessary. We don't know anything about the context.
// 2) It doesn't really hurt. Only instructions that access I/O will get these, and there won't be
// that many of them in a typical program/game.
u8 *Jit64::BackPatch(u8 *codePtr, int accessType, u32 emAddress, CONTEXT *ctx)
const u8 *Jit64::BackPatch(u8 *codePtr, int accessType, u32 emAddress, CONTEXT *ctx)
{
#ifdef _M_X64
if (!IsInJitCode(codePtr))
return 0; // this will become a regular crash real soon after this
u8 *oldCodePtr = GetWritableCodePtr();
InstructionInfo info;
if (!DisassembleMov(codePtr, info, accessType)) {
BackPatchError("BackPatch - failed to disassemble MOV instruction", codePtr, emAddress);
@ -81,108 +169,42 @@ u8 *Jit64::BackPatch(u8 *codePtr, int accessType, u32 emAddress, CONTEXT *ctx)
BackPatchError(StringFromFormat("BackPatch - no support for operand size %i", info.operandSize), codePtr, emAddress);
}
X64Reg addrReg = (X64Reg)info.scaledReg;
X64Reg dataReg = (X64Reg)info.regOperandReg;
if (info.otherReg != RBX)
PanicAlert("BackPatch : Base reg not RBX."
"\n\nAttempted to access %08x.", emAddress);
//if (accessType == OP_ACCESS_WRITE)
// PanicAlert("BackPatch : Currently only supporting reads."
// "\n\nAttempted to write to %08x.", emAddress);
// OK, let's write a trampoline, and a jump to it.
// Later, let's share trampolines.
if (accessType == OP_ACCESS_WRITE)
PanicAlert("BackPatch : Currently only supporting reads."
"\n\nAttempted to write to %08x.", emAddress);
// In the first iteration, we assume that all accesses are 32-bit. We also only deal with reads.
// Next step - support writes, special case FIFO writes. Also, support 32-bit mode.
u8 *trampoline = trampolineCodePtr;
SetCodePtr(trampolineCodePtr);
if (accessType == 0)
{
// It's a read. Easy.
ABI_PushAllCallerSavedRegsAndAdjustStack();
if (addrReg != ABI_PARAM1)
MOV(32, R(ABI_PARAM1), R((X64Reg)addrReg));
if (info.displacement) {
ADD(32, R(ABI_PARAM1), Imm32(info.displacement));
}
switch (info.operandSize) {
case 4:
CALL(ProtectFunction((void *)&Memory::Read_U32, 1));
break;
default:
BackPatchError(StringFromFormat("We don't handle the size %i yet in backpatch", info.operandSize), codePtr, emAddress);
break;
}
ABI_PopAllCallerSavedRegsAndAdjustStack();
MOV(32, R(dataReg), R(EAX));
RET();
trampolineCodePtr = GetWritableCodePtr();
SetCodePtr(codePtr);
XEmitter emitter(codePtr);
int bswapNopCount;
// Check the following BSWAP for REX byte
if ((GetCodePtr()[info.instructionSize] & 0xF0) == 0x40)
if ((codePtr[info.instructionSize] & 0xF0) == 0x40)
bswapNopCount = 3;
else
bswapNopCount = 2;
CALL(trampoline);
NOP((int)info.instructionSize + bswapNopCount - 5);
SetCodePtr(oldCodePtr);
const u8 *trampoline = trampolines.GetReadTrampoline(info);
emitter.CALL((void *)trampoline);
emitter.NOP((int)info.instructionSize + bswapNopCount - 5);
return codePtr;
}
else if (accessType == 1)
{
// It's a write. Yay. Remember that we don't have to be super efficient since it's "just" a
// hardware access - we can take shortcuts.
//if (emAddress == 0xCC008000)
// PanicAlert("caught a fifo write");
if (dataReg != EAX)
PanicAlert("Backpatch write - not through EAX");
CMP(32, R(addrReg), Imm32(0xCC008000));
FixupBranch skip_fast = J_CC(CC_NE, false);
MOV(32, R(ABI_PARAM1), R((X64Reg)dataReg));
CALL((void*)Asm::fifoDirectWrite32);
RET();
SetJumpTarget(skip_fast);
ABI_PushAllCallerSavedRegsAndAdjustStack();
if (addrReg != ABI_PARAM1) {
//INT3();
MOV(32, R(ABI_PARAM1), R((X64Reg)dataReg));
MOV(32, R(ABI_PARAM2), R((X64Reg)addrReg));
} else {
MOV(32, R(ABI_PARAM2), R((X64Reg)addrReg));
MOV(32, R(ABI_PARAM1), R((X64Reg)dataReg));
}
if (info.displacement) {
ADD(32, R(ABI_PARAM2), Imm32(info.displacement));
}
switch (info.operandSize) {
case 4:
CALL(ProtectFunction((void *)&Memory::Write_U32, 2));
break;
default:
BackPatchError(StringFromFormat("We don't handle the size %i yet in backpatch", info.operandSize), codePtr, emAddress);
break;
}
ABI_PopAllCallerSavedRegsAndAdjustStack();
RET();
trampolineCodePtr = GetWritableCodePtr();
// TODO: special case FIFO writes. Also, support 32-bit mode.
// Also, debug this so that it actually works correctly :P
XEmitter emitter(codePtr - 2);
// We know it's EAX so the BSWAP before will be two byte. Overwrite it.
SetCodePtr(codePtr - 2);
CALL(trampoline);
NOP((int)info.instructionSize - 3);
const u8 *trampoline = trampolines.GetWriteTrampoline(info);
emitter.CALL((void *)trampoline);
emitter.NOP((int)info.instructionSize - 3);
if (info.instructionSize < 3)
PanicAlert("instruction too small");
SetCodePtr(oldCodePtr);
// We entered here with a BSWAP-ed EAX. We'll have to swap it back.
ctx->Rax = Common::swap32(ctx->Rax);
return codePtr - 2;
}
return 0;
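
For reference, the NOP padding math above works out like this: a rel32 CALL is 5 bytes, the faulting MOV is info.instructionSize bytes, and the BSWAP that follows it is 2 bytes, or 3 when it carries a REX prefix (any byte with high nibble 0x4, which is what the 0xF0 mask test checks). A small standalone helper, not part of the source, that just restates the read-path arithmetic:

// Illustration only - mirrors NOP(info.instructionSize + bswapNopCount - 5).
static int ReadPatchPadding(int movSize, bool bswapHasRexPrefix)
{
    const int bswapSize = bswapHasRexPrefix ? 3 : 2;  // e.g. 0F C8 vs. 41 0F C8
    const int callSize  = 5;                          // E8 rel32
    return movSize + bswapSize - callSize;            // NOP bytes to emit
}
// Example: mov eax,[rbx+rcx] (3 bytes) + bswap eax (2 bytes) needs 0 NOP bytes;
// with a REX-prefixed bswap r8d (3 bytes) one NOP byte of padding is emitted.

The write path is the mirror image: the patch starts two bytes earlier, over the known 2-byte BSWAP of EAX, so the covered region is instructionSize + 2 bytes and the padding becomes instructionSize - 3; and because the BSWAP had already executed before the MOV faulted, RAX is swapped back in the context so the trampoline sees the unswapped value, as the comment notes.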

View File

@ -56,19 +56,15 @@ using namespace Gen;
op_agent_t agent;
#endif
static u8 *codeCache;
static u8 *genFunctions;
static u8 *trampolineCache;
u8 *trampolineCodePtr;
#define INVALID_EXIT 0xFFFFFFFF
enum
{
//CODE_SIZE = 1024*1024*8,
GEN_SIZE = 4096,
TRAMPOLINE_SIZE = 1024*1024,
//MAX_NUM_BLOCKS = 65536,
};
int CODE_SIZE = 1024*1024*16;
int MAX_NUM_BLOCKS = 65536*2;
static u8 **blockCodePointers;
@ -89,36 +85,22 @@ using namespace Gen;
void Jit64::InitCache()
{
if(Core::g_CoreStartupParameter.bJITUnlimitedCache)
if (Core::g_CoreStartupParameter.bJITUnlimitedCache)
{
CODE_SIZE = 1024*1024*8*8;
MAX_NUM_BLOCKS = 65536*8;
}
codeCache = (u8*)AllocateExecutableMemory(CODE_SIZE);
genFunctions = (u8*)AllocateExecutableMemory(GEN_SIZE);
trampolineCache = (u8*)AllocateExecutableMemory(TRAMPOLINE_SIZE);
trampolineCodePtr = trampolineCache;
#ifdef OPROFILE_REPORT
agent = op_open_agent();
#endif
blocks = new JitBlock[MAX_NUM_BLOCKS];
blockCodePointers = new u8*[MAX_NUM_BLOCKS];
ClearCache();
SetCodePtr(genFunctions);
Asm::Generate();
// Protect the generated functions
WriteProtectMemory(genFunctions, GEN_SIZE, true);
SetCodePtr(codeCache);
}
void Jit64::ShutdownCache()
{
UnWriteProtectMemory(genFunctions, GEN_SIZE, true);
FreeMemoryPages(codeCache, CODE_SIZE);
FreeMemoryPages(genFunctions, GEN_SIZE);
FreeMemoryPages(trampolineCache, TRAMPOLINE_SIZE);
delete [] blocks;
delete [] blockCodePointers;
blocks = 0;
@ -135,21 +117,23 @@ using namespace Gen;
{
Core::DisplayMessage("Cleared code cache.", 3000);
// Is destroying the blocks really necessary?
for (int i = 0; i < numBlocks; i++) {
for (int i = 0; i < numBlocks; i++)
{
DestroyBlock(i, false);
}
links_to.clear();
trampolineCodePtr = trampolineCache;
numBlocks = 0;
memset(blockCodePointers, 0, sizeof(u8*)*MAX_NUM_BLOCKS);
memset(codeCache, 0xCC, CODE_SIZE);
SetCodePtr(codeCache);
trampolines.ClearCodeSpace();
}
void Jit64::DestroyBlocksWithFlag(BlockFlag death_flag)
{
for (int i = 0; i < numBlocks; i++) {
if (blocks[i].flags & death_flag) {
for (int i = 0; i < numBlocks; i++)
{
if (blocks[i].flags & death_flag)
{
DestroyBlock(i, false);
}
}
@ -190,10 +174,10 @@ using namespace Gen;
const u8 *Jit64::Jit(u32 emAddress)
{
if (GetCodePtr() >= codeCache + CODE_SIZE - 0x10000 || numBlocks >= MAX_NUM_BLOCKS - 1)
if (GetSpaceLeft() < 0x10000 || numBlocks >= MAX_NUM_BLOCKS - 1)
{
LOG(DYNA_REC, "JIT cache full - clearing.")
if(Core::g_CoreStartupParameter.bJITUnlimitedCache)
if (Core::g_CoreStartupParameter.bJITUnlimitedCache)
{
PanicAlert("What? JIT cache still full - clearing.");
}
@ -221,10 +205,8 @@ using namespace Gen;
}
}
u8 *oldCodePtr = GetWritableCodePtr();
LinkBlock(numBlocks);
LinkBlockExits(numBlocks);
SetCodePtr(oldCodePtr);
}
#ifdef OPROFILE_REPORT
@ -257,7 +239,7 @@ using namespace Gen;
void Jit64::EnterFastRun()
{
CompiledCode pExecAddr = (CompiledCode)Asm::enterCode;
CompiledCode pExecAddr = (CompiledCode)asm_routines.enterCode;
pExecAddr();
//Will return when PowerPC::state changes
}
@ -336,8 +318,8 @@ using namespace Gen;
int destinationBlock = GetBlockNumberFromAddress(b.exitAddress[e]);
if (destinationBlock != -1)
{
SetCodePtr(b.exitPtrs[e]);
JMP(blocks[destinationBlock].checkedEntry, true);
XEmitter emit(b.exitPtrs[e]);
emit.JMP(blocks[destinationBlock].checkedEntry, true);
b.linkStatus[e] = true;
}
}
@ -345,6 +327,7 @@ using namespace Gen;
}
using namespace std;
void Jit64::LinkBlock(int i)
{
LinkBlockExits(i);
@ -386,15 +369,15 @@ using namespace Gen;
// Not entirely ideal, but .. pretty good.
// TODO - make sure that the below stuff really is safe.
u8 *prev_code = GetWritableCodePtr();
// Spurious entrances from previously linked blocks can only come through checkedEntry
SetCodePtr((u8*)b.checkedEntry);
MOV(32, M(&PC), Imm32(b.originalAddress));
JMP(Asm::dispatcher, true);
SetCodePtr(blockCodePointers[blocknum]);
MOV(32, M(&PC), Imm32(b.originalAddress));
JMP(Asm::dispatcher, true);
SetCodePtr(prev_code); // reset code pointer
XEmitter emit((u8*)b.checkedEntry);
emit.MOV(32, M(&PC), Imm32(b.originalAddress));
emit.JMP(asm_routines.dispatcher, true);
emit.SetCodePtr(blockCodePointers[blocknum]);
emit.MOV(32, M(&PC), Imm32(b.originalAddress));
emit.JMP(asm_routines.dispatcher, true);
}
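
The hunk above is the pattern this commit is really about: instead of saving the global code pointer, pointing it at the block to patch and restoring it afterwards, a short-lived XEmitter is constructed directly over the bytes to overwrite. A minimal sketch of the idiom (the function name is made up; the calls are the same ones used above):

// Redirect an already-compiled block back to the dispatcher, in place,
// without touching any global emitter state.
void RedirectToDispatcher(u8 *target, u32 originalAddress, const u8 *dispatcher)
{
    Gen::XEmitter emit(target);                     // emit over the old code
    emit.MOV(32, M(&PC), Imm32(originalAddress));   // restore the emulated PC
    emit.JMP(dispatcher, true);                     // and bail out to the dispatcher
}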

View File

@ -19,6 +19,6 @@
#include "../Gekko.h"
// Will soon introduced the JitBlockCache class here.
// Will soon introduce the JitBlockCache class here.
#endif

View File

@ -34,13 +34,12 @@ namespace JitCore
void Init()
{
jit.Init();
jit.InitCache();
Asm::compareEnabled = ::Core::g_CoreStartupParameter.bRunCompareClient;
asm_routines.compareEnabled = ::Core::g_CoreStartupParameter.bRunCompareClient;
}
void Shutdown()
{
jit.ShutdownCache();
jit.Shutdown();
}
void SingleStep()

View File

@ -27,8 +27,6 @@ using namespace Gen;
using namespace PowerPC;
GPRRegCache gpr;
FPURegCache fpr;
void RegCache::Start(PPCAnalyst::BlockRegStats &stats)
{
@ -267,7 +265,7 @@ using namespace PowerPC;
xregs[xr].dirty = makeDirty || regs[i].location.IsImm();
OpArg newloc = ::Gen::R(xr);
if (doLoad || regs[i].location.IsImm())
MOV(32, newloc, regs[i].location);
emit->MOV(32, newloc, regs[i].location);
for (int j = 0; j < 32; j++)
{
if (i != j && regs[j].location.IsSimpleReg() && regs[j].location.GetSimpleReg() == xr)
@ -309,7 +307,7 @@ using namespace PowerPC;
}
OpArg newLoc = GetDefaultLocation(i);
// if (doStore) //<-- Breaks JIT compilation
MOV(32, newLoc, regs[i].location);
emit->MOV(32, newLoc, regs[i].location);
regs[i].location = newLoc;
regs[i].away = false;
}
@ -327,11 +325,13 @@ using namespace PowerPC;
xregs[xr].free = false;
xregs[xr].dirty = makeDirty;
OpArg newloc = ::Gen::R(xr);
if (doLoad) {
if (!regs[i].location.IsImm() && (regs[i].location.offset & 0xF)) {
if (doLoad)
{
if (!regs[i].location.IsImm() && (regs[i].location.offset & 0xF))
{
PanicAlert("WARNING - misaligned fp register location %i", i);
}
MOVAPD(xr, regs[i].location);
emit->MOVAPD(xr, regs[i].location);
}
regs[i].location = newloc;
regs[i].away = true;
@ -352,7 +352,7 @@ using namespace PowerPC;
xregs[xr].dirty = false;
xregs[xr].ppcReg = -1;
OpArg newLoc = GetDefaultLocation(i);
MOVAPD(newLoc, xr);
emit->MOVAPD(newLoc, xr);
regs[i].location = newLoc;
regs[i].away = false;
}

View File

@ -72,10 +72,15 @@
void DiscardRegContentsIfCached(int preg);
virtual const int *GetAllocationOrder(int &count) = 0;
XEmitter *emit;
public:
virtual ~RegCache() {}
virtual void Start(PPCAnalyst::BlockRegStats &stats) = 0;
void SetEmitter(XEmitter *emitter) {emit = emitter;}
void FlushR(X64Reg reg);
void FlushR(X64Reg reg, X64Reg reg2) {FlushR(reg); FlushR(reg2);}
void FlushLockX(X64Reg reg) {
@ -142,8 +147,5 @@
OpArg GetDefaultLocation(int reg) const;
};
extern GPRRegCache gpr;
extern FPURegCache fpr;
#endif
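
The header change above is what lets the register caches write emit->MOV(...) instead of calling the old global emit functions: the owning JIT hands itself to each cache through SetEmitter(). A toy, self-contained illustration of that wiring (ToyEmitter and ToyRegCache are invented for the example; the real RegCache and XEmitter are of course much richer):

#include <cstdio>

struct ToyEmitter {
    void MOV32(const char *dst, const char *src) { std::printf("mov %s, %s\n", dst, src); }
};

struct ToyRegCache {
    ToyEmitter *emit = nullptr;
    void SetEmitter(ToyEmitter *e) { emit = e; }
    void Flush() { emit->MOV32("[ppcState.gpr+0]", "eax"); }  // spill through the emitter
};

int main() {
    ToyEmitter jit;          // in Dolphin this is the Jit itself (it is-a XEmitter)
    ToyRegCache gpr;
    gpr.SetEmitter(&jit);    // wire the cache to the JIT's emitter
    gpr.Flush();             // stores go via emit->, no globals involved
}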

View File

@ -33,39 +33,39 @@
const u64 GC_ALIGNED16(psAbsMask2[2]) = {0x7FFFFFFFFFFFFFFFULL, 0x7FFFFFFFFFFFFFFFULL};
const double GC_ALIGNED16(psOneOne2[2]) = {1.0, 1.0};
void Jit64::fp_tri_op(int d, int a, int b, bool reversible, bool dupe, void (*op)(Gen::X64Reg, Gen::OpArg))
void Jit64::fp_tri_op(int d, int a, int b, bool reversible, bool dupe, void (XEmitter::*op)(Gen::X64Reg, Gen::OpArg))
{
fpr.Lock(d, a, b);
if (d == a)
{
fpr.LoadToX64(d, true);
op(fpr.RX(d), fpr.R(b));
(this->*op)(fpr.RX(d), fpr.R(b));
}
else if (d == b && reversible)
{
fpr.LoadToX64(d, true);
op(fpr.RX(d), fpr.R(a));
(this->*op)(fpr.RX(d), fpr.R(a));
}
else if (a != d && b != d)
{
// Sources different from d, can use rather quick solution
fpr.LoadToX64(d, !dupe);
MOVSD(fpr.RX(d), fpr.R(a));
op(fpr.RX(d), fpr.R(b));
(this->*op)(fpr.RX(d), fpr.R(b));
}
else if (b != d)
{
fpr.LoadToX64(d, !dupe);
MOVSD(XMM0, fpr.R(b));
MOVSD(fpr.RX(d), fpr.R(a));
op(fpr.RX(d), Gen::R(XMM0));
(this->*op)(fpr.RX(d), Gen::R(XMM0));
}
else // Other combo, must use two temps :(
{
MOVSD(XMM0, fpr.R(a));
MOVSD(XMM1, fpr.R(b));
fpr.LoadToX64(d, !dupe);
op(XMM0, Gen::R(XMM1));
(this->*op)(XMM0, Gen::R(XMM1));
MOVSD(fpr.RX(d), Gen::R(XMM0));
}
if (dupe) {
@ -86,16 +86,16 @@
bool dupe = inst.OPCD == 59;
switch (inst.SUBOP5)
{
case 18: fp_tri_op(inst.FD, inst.FA, inst.FB, false, dupe, &DIVSD); break; //div
case 20: fp_tri_op(inst.FD, inst.FA, inst.FB, false, dupe, &SUBSD); break; //sub
case 21: fp_tri_op(inst.FD, inst.FA, inst.FB, true, dupe, &ADDSD); break; //add
case 18: fp_tri_op(inst.FD, inst.FA, inst.FB, false, dupe, &XEmitter::DIVSD); break; //div
case 20: fp_tri_op(inst.FD, inst.FA, inst.FB, false, dupe, &XEmitter::SUBSD); break; //sub
case 21: fp_tri_op(inst.FD, inst.FA, inst.FB, true, dupe, &XEmitter::ADDSD); break; //add
case 23: //sel
Default(inst);
break;
case 24: //res
Default(inst);
break;
case 25: fp_tri_op(inst.FD, inst.FA, inst.FC, true, dupe, &MULSD); break; //mul
case 25: fp_tri_op(inst.FD, inst.FA, inst.FC, true, dupe, &XEmitter::MULSD); break; //mul
default:
_assert_msg_(DYNA_REC, 0, "fp_arith_s WTF!!!");
}
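
The fp_tri_op change above (and the matching tri_op and regimmop changes further down) swaps plain function pointers for pointers to XEmitter member functions - the "member function pointers" mentioned in the commit message. Since Jit64 ultimately derives from XEmitter, the chosen callee can be invoked directly on this with (this->*op)(...). A standalone sketch of the same pattern with made-up types:

#include <cstdio>

struct Emitter {
    void ADDSD(int d, int s) { std::printf("addsd xmm%d, xmm%d\n", d, s); }
    void MULSD(int d, int s) { std::printf("mulsd xmm%d, xmm%d\n", d, s); }
};

struct Jit : public Emitter {
    // op is a pointer to a member of the base class; because Jit is-a Emitter,
    // it can be called straight on this.
    void tri_op(int d, int s, void (Emitter::*op)(int, int)) {
        (this->*op)(d, s);
    }
};

int main() {
    Jit jit;
    jit.tri_op(0, 1, &Emitter::ADDSD);  // prints "addsd xmm0, xmm1"
    jit.tri_op(2, 3, &Emitter::MULSD);  // prints "mulsd xmm2, xmm3"
}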

View File

@ -42,7 +42,7 @@
u32 And(u32 a, u32 b) {return a & b;}
u32 Xor(u32 a, u32 b) {return a ^ b;}
void Jit64::regimmop(int d, int a, bool binary, u32 value, Operation doop, void(*op)(int, const Gen::OpArg&, const Gen::OpArg&), bool Rc, bool carry)
void Jit64::regimmop(int d, int a, bool binary, u32 value, Operation doop, void (XEmitter::*op)(int, const Gen::OpArg&, const Gen::OpArg&), bool Rc, bool carry)
{
gpr.Lock(d, a);
if (a || binary || carry) // yeh nasty special case addic
@ -57,7 +57,7 @@
{
if (gpr.R(d).IsImm())
gpr.LoadToX64(d, false);
op(32, gpr.R(d), Imm32(value)); //m_GPR[d] = m_GPR[_inst.RA] + _inst.SIMM_16;
(this->*op)(32, gpr.R(d), Imm32(value)); //m_GPR[d] = m_GPR[_inst.RA] + _inst.SIMM_16;
if (carry)
GenerateCarry(EAX);
}
@ -66,7 +66,7 @@
{
gpr.LoadToX64(d, false);
MOV(32, gpr.R(d), gpr.R(a));
op(32, gpr.R(d), Imm32(value)); //m_GPR[d] = m_GPR[_inst.RA] + _inst.SIMM_16;
(this->*op)(32, gpr.R(d), Imm32(value)); //m_GPR[d] = m_GPR[_inst.RA] + _inst.SIMM_16;
if (carry)
GenerateCarry(EAX);
}
@ -84,7 +84,7 @@
{
// Todo - special case immediates.
MOV(32, R(EAX), gpr.R(d));
CALL((u8*)Asm::computeRc);
CALL((u8*)asm_routines.computeRc);
}
gpr.UnlockAll();
}
@ -109,22 +109,22 @@
MOV(32, gpr.R(d), gpr.R(a));
gpr.UnlockAll();
} else {
regimmop(d, a, false, (u32)(s32)inst.SIMM_16, Add, ADD); //addi
regimmop(d, a, false, (u32)(s32)inst.SIMM_16, Add, &XEmitter::ADD); //addi
}
break;
case 15: regimmop(d, a, false, (u32)inst.SIMM_16 << 16, Add, ADD); break; //addis
case 15: regimmop(d, a, false, (u32)inst.SIMM_16 << 16, Add, &XEmitter::ADD); break; //addis
case 24:
if (a == 0 && s == 0 && inst.UIMM == 0 && !inst.Rc) //check for nop
{NOP(); return;} //make the nop visible in the generated code. not much use but interesting if we see one.
regimmop(a, s, true, inst.UIMM, Or, OR);
regimmop(a, s, true, inst.UIMM, Or, &XEmitter::OR);
break; //ori
case 25: regimmop(a, s, true, inst.UIMM << 16, Or, OR, false); break;//oris
case 28: regimmop(a, s, true, inst.UIMM, And, AND, true); break;
case 29: regimmop(a, s, true, inst.UIMM << 16, And, AND, true); break;
case 26: regimmop(a, s, true, inst.UIMM, Xor, XOR, false); break; //xori
case 27: regimmop(a, s, true, inst.UIMM << 16, Xor, XOR, false); break; //xoris
case 12: //regimmop(d, a, false, (u32)(s32)inst.SIMM_16, Add, ADD, false, true); //addic
case 13: //regimmop(d, a, true, (u32)(s32)inst.SIMM_16, Add, ADD, true, true); //addic_rc
case 25: regimmop(a, s, true, inst.UIMM << 16, Or, &XEmitter::OR, false); break;//oris
case 28: regimmop(a, s, true, inst.UIMM, And, &XEmitter::AND, true); break;
case 29: regimmop(a, s, true, inst.UIMM << 16, And, &XEmitter::AND, true); break;
case 26: regimmop(a, s, true, inst.UIMM, Xor, &XEmitter::XOR, false); break; //xori
case 27: regimmop(a, s, true, inst.UIMM << 16, Xor, &XEmitter::XOR, false); break; //xoris
case 12: //regimmop(d, a, false, (u32)(s32)inst.SIMM_16, Add, XEmitter::ADD, false, true); //addic
case 13: //regimmop(d, a, true, (u32)(s32)inst.SIMM_16, Add, XEmitter::ADD, true, true); //addic_rc
default:
Default(inst);
break;
@ -295,7 +295,7 @@
if (inst.Rc)
{
MOV(32, R(EAX), gpr.R(a));
CALL((u8*)Asm::computeRc);
CALL((u8*)asm_routines.computeRc);
}
}
@ -328,7 +328,7 @@
if (inst.Rc)
{
MOV(32, R(EAX), gpr.R(a));
CALL((u8*)Asm::computeRc);
CALL((u8*)asm_routines.computeRc);
}
}
@ -353,7 +353,7 @@
if (inst.Rc) {
// result is already in eax
CALL((u8*)Asm::computeRc);
CALL((u8*)asm_routines.computeRc);
}
}
@ -374,7 +374,7 @@
MOVSX(32, 8, gpr.RX(a), R(AL)); // watch out for ah and friends
if (inst.Rc) {
MOV(32, R(EAX), gpr.R(a));
CALL((u8*)Asm::computeRc);
CALL((u8*)asm_routines.computeRc);
}
}
@ -394,7 +394,7 @@
MOVSX(32, 16, gpr.RX(a), gpr.R(s));
if (inst.Rc) {
MOV(32, R(EAX), gpr.R(a));
CALL((u8*)Asm::computeRc);
CALL((u8*)asm_routines.computeRc);
}
}
@ -474,7 +474,7 @@
if (inst.OE) PanicAlert("OE: subfx");
if (inst.Rc) {
// result is already in eax
CALL((u8*)Asm::computeRc);
CALL((u8*)asm_routines.computeRc);
}
}
@ -514,7 +514,7 @@
gpr.UnlockAll();
if (inst.Rc) {
MOV(32, R(EAX), gpr.R(d));
CALL((u8*)Asm::computeRc);
CALL((u8*)asm_routines.computeRc);
}
}
@ -544,7 +544,7 @@
MOV(32, R(EAX), R(EDX));
MOV(32, gpr.R(d), R(EDX));
// result is already in eax
CALL((u8*)Asm::computeRc);
CALL((u8*)asm_routines.computeRc);
} else {
MOV(32, gpr.R(d), R(EDX));
}
@ -570,7 +570,7 @@
gpr.UnlockAll();
gpr.UnlockAllX();
if (inst.Rc) {
CALL((u8*)Asm::computeRc);
CALL((u8*)asm_routines.computeRc);
}
}
@ -606,7 +606,7 @@
if (inst.Rc)
{
MOV(32, R(EAX), gpr.R(d));
CALL((u8*)Asm::computeRc);
CALL((u8*)asm_routines.computeRc);
}
gpr.UnlockAll();
}
@ -618,7 +618,7 @@
if (inst.Rc)
{
MOV(32, R(EAX), gpr.R(d));
CALL((u8*)Asm::computeRc);
CALL((u8*)asm_routines.computeRc);
}
gpr.UnlockAll();
}
@ -630,7 +630,7 @@
if (inst.Rc)
{
MOV(32, R(EAX), gpr.R(d));
CALL((u8*)Asm::computeRc);
CALL((u8*)asm_routines.computeRc);
}
gpr.UnlockAll();
}
@ -666,7 +666,7 @@
gpr.UnlockAllX();
if (inst.Rc)
{
CALL((u8*)Asm::computeRc);
CALL((u8*)asm_routines.computeRc);
}
}
@ -730,7 +730,7 @@
if (inst.Rc)
{
MOV(32, R(EAX), gpr.R(a));
CALL((u8*)Asm::computeRc);
CALL((u8*)asm_routines.computeRc);
}
}
@ -767,7 +767,7 @@
if (inst.Rc)
{
MOV(32, R(EAX), gpr.R(a));
CALL((u8*)Asm::computeRc);
CALL((u8*)asm_routines.computeRc);
}
}
@ -799,7 +799,7 @@
if (inst.Rc)
{
MOV(32, R(EAX), gpr.R(a));
CALL((u8*)Asm::computeRc);
CALL((u8*)asm_routines.computeRc);
}
}
@ -821,7 +821,7 @@
if (inst.Rc)
{
MOV(32, R(EAX), gpr.R(a));
CALL((u8*)Asm::computeRc);
CALL((u8*)asm_routines.computeRc);
}
}
@ -851,7 +851,7 @@
if (inst.Rc)
{
MOV(32, R(EAX), gpr.R(a));
CALL((u8*)Asm::computeRc);
CALL((u8*)asm_routines.computeRc);
}
}
@ -881,7 +881,7 @@
if (inst.Rc)
{
MOV(32, R(EAX), gpr.R(a));
CALL((u8*)Asm::computeRc);
CALL((u8*)asm_routines.computeRc);
}
}
@ -929,7 +929,7 @@
if (inst.Rc) {
MOV(32, R(EAX), gpr.R(a));
CALL((u8*)Asm::computeRc);
CALL((u8*)asm_routines.computeRc);
}
}
@ -975,7 +975,7 @@
if (inst.Rc) {
MOV(32, R(EAX), gpr.R(a));
CALL((u8*)Asm::computeRc);
CALL((u8*)asm_routines.computeRc);
}
}
@ -1006,7 +1006,7 @@
if (inst.Rc)
{
MOV(32, R(EAX), gpr.R(a));
CALL((u8*)Asm::computeRc);
CALL((u8*)asm_routines.computeRc);
// TODO: Check PPC manual too
}
}

View File

@ -144,7 +144,7 @@
fpr.Flush(FLUSH_ALL);
ABI_CallFunctionC((void *)&PowerPC::OnIdle, PowerPC::ppcState.gpr[a] + (s32)(s16)inst.SIMM_16);
MOV(32, M(&PowerPC::ppcState.pc), Imm32(js.compilerPC + 12));
JMP(Asm::testExceptions, true);
JMP(asm_routines.testExceptions, true);
js.compilerPC += 8;
return;
}
@ -287,14 +287,13 @@
gpr.SetImmediate32(a, addr);
gpr.FlushLockX(ABI_PARAM1);
MOV(32, R(ABI_PARAM1), gpr.R(s));
// INT3();
switch (accessSize)
{
// No need to protect these, they don't touch any state
// question - should we inline them instead? Pro: Lose a CALL Con: Code bloat
case 8: CALL((void *)Asm::fifoDirectWrite8); break;
case 16: CALL((void *)Asm::fifoDirectWrite16); break;
case 32: CALL((void *)Asm::fifoDirectWrite32); break;
case 8: CALL((void *)asm_routines.fifoDirectWrite8); break;
case 16: CALL((void *)asm_routines.fifoDirectWrite16); break;
case 32: CALL((void *)asm_routines.fifoDirectWrite32); break;
}
js.fifoBytesThisBlock += accessSize >> 3;
gpr.UnlockAllX();
@ -377,9 +376,9 @@
SetJumpTarget(unsafe_addr);
switch (accessSize)
{
case 32: ABI_CallFunctionRR(ProtectFunction((void *)&Memory::Write_U32, 2), ABI_PARAM1, ABI_PARAM2); break;
case 16: ABI_CallFunctionRR(ProtectFunction((void *)&Memory::Write_U16, 2), ABI_PARAM1, ABI_PARAM2); break;
case 8: ABI_CallFunctionRR(ProtectFunction((void *)&Memory::Write_U8, 2), ABI_PARAM1, ABI_PARAM2); break;
case 32: ABI_CallFunctionRR(thunks.ProtectFunction((void *)&Memory::Write_U32, 2), ABI_PARAM1, ABI_PARAM2); break;
case 16: ABI_CallFunctionRR(thunks.ProtectFunction((void *)&Memory::Write_U16, 2), ABI_PARAM1, ABI_PARAM2); break;
case 8: ABI_CallFunctionRR(thunks.ProtectFunction((void *)&Memory::Write_U8, 2), ABI_PARAM1, ABI_PARAM2); break;
}
SetJumpTarget(skip_call);
gpr.UnlockAll();
@ -402,7 +401,6 @@
//return _inst.RA ? (m_GPR[_inst.RA] + _inst.SIMM_16) : _inst.SIMM_16;
gpr.FlushLockX(ECX, EDX);
gpr.FlushLockX(ESI);
//INT3();
MOV(32, R(EAX), Imm32((u32)(s32)inst.SIMM_16));
if (inst.RA)
ADD(32, R(EAX), gpr.R(inst.RA));

View File

@ -242,7 +242,7 @@ void Jit64::stfs(UGeckoInstruction inst)
{
// Float directly to write gather pipe! Fun!
CVTSD2SS(XMM0, fpr.R(s));
CALL((void*)Asm::fifoDirectWriteFloat);
CALL((void*)asm_routines.fifoDirectWriteFloat);
// TODO
js.fifoBytesThisBlock += 4;
return;

View File

@ -161,7 +161,7 @@ void Jit64::psq_st(UGeckoInstruction inst)
#endif
FixupBranch skip_call = J();
SetJumpTarget(argh);
ABI_CallFunctionRR(ProtectFunction((void *)&Memory::Write_U32, 2), ABI_PARAM1, ABI_PARAM2);
ABI_CallFunctionRR(thunks.ProtectFunction((void *)&Memory::Write_U32, 2), ABI_PARAM1, ABI_PARAM2);
SetJumpTarget(skip_call);
gpr.UnlockAll();
gpr.UnlockAllX();
@ -184,7 +184,7 @@ void Jit64::psq_st(UGeckoInstruction inst)
// Writing to FIFO. Let's do fast method.
CVTPD2PS(XMM0, fpr.R(s));
PSHUFB(XMM0, M((void*)&pbswapShuffle2x4));
CALL((void*)Asm::fifoDirectWriteXmm64);
CALL((void*)asm_routines.fifoDirectWriteXmm64);
js.fifoBytesThisBlock += 8;
return;
}
@ -211,7 +211,7 @@ void Jit64::psq_st(UGeckoInstruction inst)
MOV(64, MComplex(RBX, ABI_PARAM2, SCALE_1, 0), R(ABI_PARAM1));
FixupBranch arg2 = J();
SetJumpTarget(argh);
CALL(ProtectFunction((void *)&WriteDual32, 0));
CALL(thunks.ProtectFunction((void *)&WriteDual32, 0));
#else
FixupBranch argh = J_CC(CC_NZ);
MOV(32, R(ABI_PARAM1), M(((char*)&temp64) + 4));
@ -224,10 +224,10 @@ void Jit64::psq_st(UGeckoInstruction inst)
FixupBranch arg2 = J();
SetJumpTarget(argh);
MOV(32, R(ABI_PARAM1), M(((char*)&temp64) + 4));
ABI_CallFunctionRR(ProtectFunction((void *)&Memory::Write_U32, 2), ABI_PARAM1, ABI_PARAM2);
ABI_CallFunctionRR(thunks.ProtectFunction((void *)&Memory::Write_U32, 2), ABI_PARAM1, ABI_PARAM2);
MOV(32, R(ABI_PARAM1), M(((char*)&temp64)));
ADD(32, R(ABI_PARAM2), Imm32(4));
ABI_CallFunctionRR(ProtectFunction((void *)&Memory::Write_U32, 2), ABI_PARAM1, ABI_PARAM2);
ABI_CallFunctionRR(thunks.ProtectFunction((void *)&Memory::Write_U32, 2), ABI_PARAM1, ABI_PARAM2);
#endif
SetJumpTarget(arg2);
gpr.UnlockAll();
@ -424,7 +424,6 @@ void Jit64::psq_l(UGeckoInstruction inst)
#endif
BSWAP(32, EAX);
MOV(32, M(&temp64), R(EAX));
//INT3();
fpr.LoadToX64(inst.RS, false, true);
X64Reg r = fpr.R(inst.RS).GetSimpleReg();
MOVD_xmm(XMM0, M(&temp64));

View File

@ -163,40 +163,40 @@
*/
//There's still a little bit more optimization that can be squeezed out of this
void Jit64::tri_op(int d, int a, int b, bool reversible, void (*op)(X64Reg, OpArg))
void Jit64::tri_op(int d, int a, int b, bool reversible, void (XEmitter::*op)(X64Reg, OpArg))
{
fpr.Lock(d, a, b);
if (d == a)
{
fpr.LoadToX64(d, true);
op(fpr.RX(d), fpr.R(b));
(this->*op)(fpr.RX(d), fpr.R(b));
}
else if (d == b && reversible)
{
fpr.LoadToX64(d, true);
op(fpr.RX(d), fpr.R(a));
(this->*op)(fpr.RX(d), fpr.R(a));
}
else if (a != d && b != d)
{
//sources different from d, can use rather quick solution
fpr.LoadToX64(d, false);
MOVAPD(fpr.RX(d), fpr.R(a));
op(fpr.RX(d), fpr.R(b));
(this->*op)(fpr.RX(d), fpr.R(b));
}
else if (b != d)
{
fpr.LoadToX64(d, false);
MOVAPD(XMM0, fpr.R(b));
MOVAPD(fpr.RX(d), fpr.R(a));
op(fpr.RX(d), Gen::R(XMM0));
(this->*op)(fpr.RX(d), Gen::R(XMM0));
}
else //Other combo, must use two temps :(
{
MOVAPD(XMM0, fpr.R(a));
MOVAPD(XMM1, fpr.R(b));
fpr.LoadToX64(d, false);
op(XMM0, Gen::R(XMM1));
(this->*op)(XMM0, Gen::R(XMM1));
MOVAPD(fpr.RX(d), Gen::R(XMM0));
}
ForceSinglePrecisionP(fpr.RX(d));
@ -213,16 +213,16 @@
}
switch (inst.SUBOP5)
{
case 18: tri_op(inst.FD, inst.FA, inst.FB, false, &DIVPD); break; //div
case 20: tri_op(inst.FD, inst.FA, inst.FB, false, &SUBPD); break; //sub
case 21: tri_op(inst.FD, inst.FA, inst.FB, true, &ADDPD); break; //add
case 18: tri_op(inst.FD, inst.FA, inst.FB, false, &XEmitter::DIVPD); break; //div
case 20: tri_op(inst.FD, inst.FA, inst.FB, false, &XEmitter::SUBPD); break; //sub
case 21: tri_op(inst.FD, inst.FA, inst.FB, true, &XEmitter::ADDPD); break; //add
case 23://sel
Default(inst);
break;
case 24://res
Default(inst);
break;
case 25: tri_op(inst.FD, inst.FA, inst.FC, true, &MULPD); break; //mul
case 25: tri_op(inst.FD, inst.FA, inst.FC, true, &XEmitter::MULPD); break; //mul
default:
_assert_msg_(DYNA_REC, 0, "ps_arith WTF!!!");
}

View File

@ -76,9 +76,9 @@ void Jit64::SafeLoadRegToEAX(X64Reg reg, int accessSize, s32 offset, bool signEx
FixupBranch argh = J_CC(CC_Z);
switch (accessSize)
{
case 32: ABI_CallFunctionR(ProtectFunction((void *)&Memory::Read_U32, 1), reg); break;
case 16: ABI_CallFunctionR(ProtectFunction((void *)&Memory::Read_U16, 1), reg); break;
case 8: ABI_CallFunctionR(ProtectFunction((void *)&Memory::Read_U8, 1), reg); break;
case 32: ABI_CallFunctionR(thunks.ProtectFunction((void *)&Memory::Read_U32, 1), reg); break;
case 16: ABI_CallFunctionR(thunks.ProtectFunction((void *)&Memory::Read_U16, 1), reg); break;
case 8: ABI_CallFunctionR(thunks.ProtectFunction((void *)&Memory::Read_U8, 1), reg); break;
}
if (signExtend && accessSize < 32) {
// Need to sign extend values coming from the Read_U* functions.
@ -114,7 +114,7 @@ void Jit64::SafeWriteRegToReg(X64Reg reg_value, X64Reg reg_addr, int accessSize,
UnsafeWriteRegToReg(reg_value, reg_addr, accessSize, 0);
FixupBranch skip_call = J();
SetJumpTarget(unsafe_addr);
ABI_CallFunctionRR(ProtectFunction((void *)&Memory::Write_U32, 2), ABI_PARAM1, ABI_PARAM2);
ABI_CallFunctionRR(thunks.ProtectFunction((void *)&Memory::Write_U32, 2), ABI_PARAM1, ABI_PARAM2);
SetJumpTarget(skip_call);
}

View File

@ -463,7 +463,7 @@ void BPWritten(int addr, int changes, int newval)
{
// the number of lines copied is determined by the y scale * source efb height
float yScale = bpmem.dispcopyyscale / 256.0f;
float xfbLines = bpmem.copyTexSrcWH.y + 1.0 * yScale;
float xfbLines = (bpmem.copyTexSrcWH.y + 1.0f) * yScale;

XFB_Write(Memory_GetPtr(bpmem.copyTexDest<<5), multirc, (bpmem.copyMipMapStrideChannels << 4), (int)xfbLines);
}
else

View File

@ -82,68 +82,68 @@ void NativeVertexFormat::Initialize(const PortableVertexDeclaration &_vtx_decl)
}
#ifdef USE_JIT
Gen::XEmitter emit(m_compiledCode);
// Alright, we have our vertex declaration. Compile some crazy code to set it quickly using GL.
u8 *old_code_ptr = GetWritableCodePtr();
SetCodePtr(m_compiledCode);
ABI_EmitPrologue(6);
emit.ABI_EmitPrologue(6);
CallCdeclFunction4_I(glVertexPointer, 3, GL_FLOAT, _vtx_decl.stride, 0);
emit.CallCdeclFunction4_I(glVertexPointer, 3, GL_FLOAT, _vtx_decl.stride, 0);
if (_vtx_decl.num_normals >= 1) {
CallCdeclFunction3_I(glNormalPointer, VarToGL(_vtx_decl.normal_gl_type), _vtx_decl.stride, _vtx_decl.normal_offset[0]);
emit.CallCdeclFunction3_I(glNormalPointer, VarToGL(_vtx_decl.normal_gl_type), _vtx_decl.stride, _vtx_decl.normal_offset[0]);
if (_vtx_decl.num_normals == 3) {
CallCdeclFunction6((void *)glVertexAttribPointer, SHADER_NORM1_ATTRIB, _vtx_decl.normal_gl_size, VarToGL(_vtx_decl.normal_gl_type), GL_TRUE, _vtx_decl.stride, _vtx_decl.normal_offset[1]);
CallCdeclFunction6((void *)glVertexAttribPointer, SHADER_NORM2_ATTRIB, _vtx_decl.normal_gl_size, VarToGL(_vtx_decl.normal_gl_type), GL_TRUE, _vtx_decl.stride, _vtx_decl.normal_offset[2]);
emit.CallCdeclFunction6((void *)glVertexAttribPointer, SHADER_NORM1_ATTRIB, _vtx_decl.normal_gl_size, VarToGL(_vtx_decl.normal_gl_type), GL_TRUE, _vtx_decl.stride, _vtx_decl.normal_offset[1]);
emit.CallCdeclFunction6((void *)glVertexAttribPointer, SHADER_NORM2_ATTRIB, _vtx_decl.normal_gl_size, VarToGL(_vtx_decl.normal_gl_type), GL_TRUE, _vtx_decl.stride, _vtx_decl.normal_offset[2]);
}
}
for (int i = 0; i < 2; i++) {
if (_vtx_decl.color_offset[i] != -1) {
if (i == 0)
CallCdeclFunction4_I(glColorPointer, 4, GL_UNSIGNED_BYTE, _vtx_decl.stride, _vtx_decl.color_offset[i]);
emit.CallCdeclFunction4_I(glColorPointer, 4, GL_UNSIGNED_BYTE, _vtx_decl.stride, _vtx_decl.color_offset[i]);
else
CallCdeclFunction4((void *)glSecondaryColorPointer, 4, GL_UNSIGNED_BYTE, _vtx_decl.stride, _vtx_decl.color_offset[i]);
emit.CallCdeclFunction4((void *)glSecondaryColorPointer, 4, GL_UNSIGNED_BYTE, _vtx_decl.stride, _vtx_decl.color_offset[i]);
}
}
for (int i = 0; i < 8; i++) {
if (_vtx_decl.texcoord_offset[i] != -1) {
for (int i = 0; i < 8; i++)
{
if (_vtx_decl.texcoord_offset[i] != -1)
{
int id = GL_TEXTURE0 + i;
#ifdef _M_X64
#ifdef _MSC_VER
MOV(32, R(RCX), Imm32(id));
emit.MOV(32, R(RCX), Imm32(id));
#else
MOV(32, R(RDI), Imm32(id));
emit.MOV(32, R(RDI), Imm32(id));
#endif
#else
ABI_AlignStack(1 * 4);
PUSH(32, Imm32(id));
emit.ABI_AlignStack(1 * 4);
emit.PUSH(32, Imm32(id));
#endif
CALL((void *)glClientActiveTexture);
emit.CALL((void *)glClientActiveTexture);
#ifndef _M_X64
#ifdef _WIN32
// don't inc stack on windows, stdcall
#else
ABI_RestoreStack(1 * 4);
emit.ABI_RestoreStack(1 * 4);
#endif
#endif
CallCdeclFunction4_I(
emit.CallCdeclFunction4_I(
glTexCoordPointer, _vtx_decl.texcoord_size[i], VarToGL(_vtx_decl.texcoord_gl_type[i]),
_vtx_decl.stride, _vtx_decl.texcoord_offset[i]);
}
}
if (_vtx_decl.posmtx_offset != -1) {
CallCdeclFunction6((void *)glVertexAttribPointer, SHADER_POSMTX_ATTRIB, 4, GL_UNSIGNED_BYTE, GL_FALSE, _vtx_decl.stride, _vtx_decl.posmtx_offset);
emit.CallCdeclFunction6((void *)glVertexAttribPointer, SHADER_POSMTX_ATTRIB, 4, GL_UNSIGNED_BYTE, GL_FALSE, _vtx_decl.stride, _vtx_decl.posmtx_offset);
}
ABI_EmitEpilogue(6);
if (Gen::GetCodePtr() - (u8*)m_compiledCode > COMPILED_CODE_SIZE)
emit.ABI_EmitEpilogue(6);
if (emit.GetCodePtr() - (u8*)m_compiledCode > COMPILED_CODE_SIZE)
{
Crash();
}
SetCodePtr(old_code_ptr);
#endif
this->vtx_decl = _vtx_decl;
}

View File

@ -44,7 +44,7 @@
#define USE_JIT
#define COMPILED_CODE_SIZE 4096*4
#define COMPILED_CODE_SIZE 4096
NativeVertexFormat *g_nativeVertexFmt;
@ -116,6 +116,7 @@ void LOADERDECL TexMtx_Write_Short3()
VertexLoader::VertexLoader(const TVtxDesc &vtx_desc, const VAT &vtx_attr)
{
m_compiledCode = NULL;
m_numLoadedVertices = 0;
m_VertexSize = 0;
m_numPipelineStages = 0;
@ -126,16 +127,14 @@ VertexLoader::VertexLoader(const TVtxDesc &vtx_desc, const VAT &vtx_attr)
m_VtxDesc = vtx_desc;
SetVAT(vtx_attr.g0.Hex, vtx_attr.g1.Hex, vtx_attr.g2.Hex);
m_compiledCode = (u8 *)AllocateExecutableMemory(COMPILED_CODE_SIZE, false);
if (m_compiledCode) {
memset(m_compiledCode, 0, COMPILED_CODE_SIZE);
}
AllocCodeSpace(COMPILED_CODE_SIZE);
CompileVertexTranslator();
WriteProtect();
}
VertexLoader::~VertexLoader()
{
FreeMemoryPages(m_compiledCode, COMPILED_CODE_SIZE);
FreeCodeSpace();
delete m_NativeFmt;
}
@ -143,13 +142,14 @@ void VertexLoader::CompileVertexTranslator()
{
m_VertexSize = 0;
const TVtxAttr &vtx_attr = m_VtxAttr;
//const TVtxDesc &vtx_desc = m_VtxDesc;
#ifdef USE_JIT
u8 *old_code_ptr = GetWritableCodePtr();
SetCodePtr(m_compiledCode);
if (m_compiledCode)
PanicAlert("trying to recompile a vtx translator");
m_compiledCode = GetCodePtr();
ABI_EmitPrologue(4);
// MOV(32, R(EBX), M(&loop_counter));
// Start loop here
const u8 *loop_start = GetCodePtr();
@ -477,7 +477,6 @@ void VertexLoader::CompileVertexTranslator()
//SUB(32, R(EBX), Imm8(1));
J_CC(CC_NZ, loop_start, true);
ABI_EmitEpilogue(4);
SetCodePtr(old_code_ptr);
#endif
m_NativeFmt->Initialize(vtx_decl);
}
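
The VertexLoader changes above follow the same recipe as the JIT itself: derive from Gen::XCodeBlock, allocate a private executable region, emit into it with the inherited XEmitter methods, then write-protect it. A rough sketch of that lifecycle using only calls visible in this diff (the class name, size and body are illustrative):

#include "x64Emitter.h"

class TinyGenerator : public Gen::XCodeBlock
{
public:
    TinyGenerator()  { AllocCodeSpace(4096); }   // own a private code region
    ~TinyGenerator() { FreeCodeSpace(); }

    const u8 *Compile()
    {
        const u8 *entry = GetCodePtr();          // emission starts at our own pointer
        ABI_EmitPrologue(4);
        // ... emit the body here ...
        ABI_EmitEpilogue(4);
        WriteProtect();                          // lock the finished code
        return entry;
    }
};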

View File

@ -22,9 +22,10 @@
#include "CPMemory.h"
#include "DataReader.h"
#include "NativeVertexFormat.h"
#include "x64Emitter.h"
class VertexLoaderUID
{
u32 vid[5];
@ -52,7 +53,7 @@ public:
}
};
class VertexLoader
class VertexLoader : public Gen::XCodeBlock
{
public:
VertexLoader(const TVtxDesc &vtx_desc, const VAT &vtx_attr);
@ -86,7 +87,7 @@ private:
TPipelineFunction m_PipelineStages[64]; // TODO - figure out real max. it's lower.
int m_numPipelineStages;
u8 *m_compiledCode;
const u8 *m_compiledCode;
int m_numLoadedVertices;