Turn the X86 emitter into a class, so the code pointer is no longer a global. Yay! Created XCodeBlock, which derives from XEmitter; the Jit now derives from XCodeBlock, so it can call ADD, SUB, JNZ, etc. without having to prefix them with "emit.". I think someone's gonna like this.
There's still some cleanup to be done, but hey, it works. There shouldn't be a noticeable speed difference. I hope GCC doesn't have a problem with the "member function pointers" I used.

git-svn-id: https://dolphin-emu.googlecode.com/svn/trunk@1594 8ced0084-cf51-0410-be5f-012b33b47a6e
parent b5dcdcf779
commit 104acd5bc1
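For orientation before the diff: a minimal, hypothetical sketch of the layout this commit describes, using simplified names and toy emitters rather than the real Dolphin classes. XEmitter owns the write cursor that used to be a global, XCodeBlock adds ownership of the code region, and a Jit-style class inherits from XCodeBlock so it can emit without any prefix.

```cpp
#include <cstddef>
#include <cstdint>

class XEmitter {                      // owns the code pointer (no more global)
protected:
    uint8_t *code = nullptr;
public:
    void SetCodePtr(uint8_t *ptr) { code = ptr; }
    const uint8_t *GetCodePtr() const { return code; }
    void RET() { *code++ = 0xC3; }    // real one-byte encoding
    void NOP() { *code++ = 0x90; }    // real too; stands in for ADD/SUB/JNZ/...
};

class XCodeBlock : public XEmitter {  // adds memory management for the region
protected:
    uint8_t *region = nullptr;
    size_t   region_size = 0;
public:
    void AllocCodeSpace(size_t size) {
        region_size = size;
        region = new uint8_t[size];   // the real code uses AllocateExecutableMemory()
        SetCodePtr(region);
    }
    void FreeCodeSpace() { delete[] region; region = nullptr; region_size = 0; }
    ~XCodeBlock() { FreeCodeSpace(); }
};

class Jit : public XCodeBlock {       // emits directly, no "emit." prefix
public:
    void Compile() {
        NOP();                        // would be ADD(...), SUB(...), JNZ(...) in the JIT
        RET();
    }
};

int main() {
    Jit jit;
    jit.AllocCodeSpace(4096);
    jit.Compile();                    // writes two bytes into the block
    return 0;
}
```

The real classes emit actual x86 encodings and allocate executable pages, of course; the shape of the hierarchy is the point here.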
@ -25,7 +25,7 @@ using namespace Gen;
|
|||
// ====================================
|
||||
|
||||
// Sets up a __cdecl function.
|
||||
void ABI_EmitPrologue(int maxCallParams)
|
||||
void XEmitter::ABI_EmitPrologue(int maxCallParams)
|
||||
{
|
||||
#ifdef _M_IX86
|
||||
// Don't really need to do anything
|
||||
|
@ -40,7 +40,8 @@ void ABI_EmitPrologue(int maxCallParams)
|
|||
#error Arch not supported
|
||||
#endif
|
||||
}
|
||||
void ABI_EmitEpilogue(int maxCallParams)
|
||||
|
||||
void XEmitter::ABI_EmitEpilogue(int maxCallParams)
|
||||
{
|
||||
#ifdef _M_IX86
|
||||
RET();
|
||||
|
@ -60,14 +61,14 @@ void ABI_EmitEpilogue(int maxCallParams)
|
|||
// Shared code between Win32 and Unix32
|
||||
// ====================================
|
||||
|
||||
void ABI_CallFunctionC(void *func, u32 param1) {
|
||||
void XEmitter::ABI_CallFunctionC(void *func, u32 param1) {
|
||||
ABI_AlignStack(1 * 4);
|
||||
PUSH(32, Imm32(param1));
|
||||
CALL(func);
|
||||
ABI_RestoreStack(1 * 4);
|
||||
}
|
||||
|
||||
void ABI_CallFunctionCC(void *func, u32 param1, u32 param2) {
|
||||
void XEmitter::ABI_CallFunctionCC(void *func, u32 param1, u32 param2) {
|
||||
ABI_AlignStack(2 * 4);
|
||||
PUSH(32, Imm32(param2));
|
||||
PUSH(32, Imm32(param1));
|
||||
|
@ -76,14 +77,14 @@ void ABI_CallFunctionCC(void *func, u32 param1, u32 param2) {
|
|||
}
|
||||
|
||||
// Pass a register as a paremeter.
|
||||
void ABI_CallFunctionR(void *func, X64Reg reg1) {
|
||||
void XEmitter::ABI_CallFunctionR(void *func, X64Reg reg1) {
|
||||
ABI_AlignStack(1 * 4);
|
||||
PUSH(32, R(reg1));
|
||||
CALL(func);
|
||||
ABI_RestoreStack(1 * 4);
|
||||
}
|
||||
|
||||
void ABI_CallFunctionRR(void *func, Gen::X64Reg reg1, Gen::X64Reg reg2)
|
||||
void XEmitter::ABI_CallFunctionRR(void *func, Gen::X64Reg reg1, Gen::X64Reg reg2)
|
||||
{
|
||||
ABI_AlignStack(2 * 4);
|
||||
PUSH(32, R(reg2));
|
||||
|
@ -92,7 +93,7 @@ void ABI_CallFunctionRR(void *func, Gen::X64Reg reg1, Gen::X64Reg reg2)
|
|||
ABI_RestoreStack(2 * 4);
|
||||
}
|
||||
|
||||
void ABI_CallFunctionAC(void *func, const Gen::OpArg &arg1, u32 param2)
|
||||
void XEmitter::ABI_CallFunctionAC(void *func, const Gen::OpArg &arg1, u32 param2)
|
||||
{
|
||||
ABI_AlignStack(2 * 4);
|
||||
PUSH(32, arg1);
|
||||
|
@ -101,7 +102,7 @@ void ABI_CallFunctionAC(void *func, const Gen::OpArg &arg1, u32 param2)
|
|||
ABI_RestoreStack(2 * 4);
|
||||
}
|
||||
|
||||
void ABI_PushAllCalleeSavedRegsAndAdjustStack() {
|
||||
void XEmitter::ABI_PushAllCalleeSavedRegsAndAdjustStack() {
|
||||
// Note: 4 * 4 = 16 bytes, so alignment is preserved.
|
||||
PUSH(EBP);
|
||||
PUSH(EBX);
|
||||
|
@ -109,14 +110,14 @@ void ABI_PushAllCalleeSavedRegsAndAdjustStack() {
|
|||
PUSH(EDI);
|
||||
}
|
||||
|
||||
void ABI_PopAllCalleeSavedRegsAndAdjustStack() {
|
||||
void XEmitter::ABI_PopAllCalleeSavedRegsAndAdjustStack() {
|
||||
POP(EDI);
|
||||
POP(ESI);
|
||||
POP(EBX);
|
||||
POP(EBP);
|
||||
}
|
||||
|
||||
unsigned int ABI_GetAlignedFrameSize(unsigned int frameSize) {
|
||||
unsigned int XEmitter::ABI_GetAlignedFrameSize(unsigned int frameSize) {
|
||||
frameSize += 4; // reserve space for return address
|
||||
unsigned int alignedSize =
|
||||
#ifdef __GNUC__
|
||||
|
@ -128,7 +129,7 @@ unsigned int ABI_GetAlignedFrameSize(unsigned int frameSize) {
|
|||
}
|
||||
|
||||
|
||||
void ABI_AlignStack(unsigned int frameSize) {
|
||||
void XEmitter::ABI_AlignStack(unsigned int frameSize) {
|
||||
// Mac OS X requires the stack to be 16-byte aligned before every call.
|
||||
// Linux requires the stack to be 16-byte aligned before calls that put SSE
|
||||
// vectors on the stack, but since we do not keep track of which calls do that,
|
||||
|
@ -145,7 +146,7 @@ void ABI_AlignStack(unsigned int frameSize) {
|
|||
#endif
|
||||
}
|
||||
|
||||
void ABI_RestoreStack(unsigned int frameSize) {
|
||||
void XEmitter::ABI_RestoreStack(unsigned int frameSize) {
|
||||
unsigned int alignedSize = ABI_GetAlignedFrameSize(frameSize);
|
||||
alignedSize -= 4; // return address is POPped at end of call
|
||||
if (alignedSize != 0) {
|
||||
|
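An aside on the two hunks above: the rule the comments describe is that the stack must be 16-byte aligned at the CALL on Mac OS X (and for SSE spills on Linux), so the frame, parameters plus the 4-byte return address, gets rounded up to the next 16-byte boundary. A hedged sketch of that arithmetic, not the literal Dolphin code (part of it sits behind an #ifdef that is cut off in these hunks):

```cpp
// Round a frame up to 16 bytes, counting the return address pushed by CALL.
// Presumably close to what ABI_GetAlignedFrameSize computes on GCC targets;
// treat the exact expression as an assumption.
unsigned int AlignedFrameSize(unsigned int frameSize)
{
    frameSize += 4;                  // reserve space for the return address
    return (frameSize + 15) & ~15u;  // round up to a multiple of 16
}
// ABI_AlignStack then presumably SUBs ESP by the padding before the parameter
// pushes (see ABI_CallFunctionC earlier in this file), and ABI_RestoreStack
// ADDs the aligned size minus the POPped return address back after the call,
// which matches the "alignedSize -= 4" visible above.
```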
@ -155,26 +156,26 @@ void ABI_RestoreStack(unsigned int frameSize) {
|
|||
|
||||
#else
|
||||
|
||||
void ABI_CallFunctionC(void *func, u32 param1) {
|
||||
void XEmitter::ABI_CallFunctionC(void *func, u32 param1) {
|
||||
MOV(32, R(ABI_PARAM1), Imm32(param1));
|
||||
CALL(func);
|
||||
}
|
||||
|
||||
void ABI_CallFunctionCC(void *func, u32 param1, u32 param2) {
|
||||
void XEmitter::ABI_CallFunctionCC(void *func, u32 param1, u32 param2) {
|
||||
MOV(32, R(ABI_PARAM1), Imm32(param1));
|
||||
MOV(32, R(ABI_PARAM2), Imm32(param2));
|
||||
CALL(func);
|
||||
}
|
||||
|
||||
// Pass a register as a paremeter.
|
||||
void ABI_CallFunctionR(void *func, X64Reg reg1) {
|
||||
void XEmitter::ABI_CallFunctionR(void *func, X64Reg reg1) {
|
||||
if (reg1 != ABI_PARAM1)
|
||||
MOV(32, R(ABI_PARAM1), R(reg1));
|
||||
CALL(func);
|
||||
}
|
||||
|
||||
// Pass a register as a paremeter.
|
||||
void ABI_CallFunctionRR(void *func, X64Reg reg1, X64Reg reg2) {
|
||||
void XEmitter::ABI_CallFunctionRR(void *func, X64Reg reg1, X64Reg reg2) {
|
||||
if (reg1 != ABI_PARAM1)
|
||||
MOV(32, R(ABI_PARAM1), R(reg1));
|
||||
if (reg2 != ABI_PARAM2)
|
||||
|
@ -182,7 +183,7 @@ void ABI_CallFunctionRR(void *func, X64Reg reg1, X64Reg reg2) {
|
|||
CALL(func);
|
||||
}
|
||||
|
||||
void ABI_CallFunctionAC(void *func, const Gen::OpArg &arg1, u32 param2)
|
||||
void XEmitter::ABI_CallFunctionAC(void *func, const Gen::OpArg &arg1, u32 param2)
|
||||
{
|
||||
if (!arg1.IsSimpleReg(ABI_PARAM1))
|
||||
MOV(32, R(ABI_PARAM1), arg1);
|
||||
|
@ -190,21 +191,21 @@ void ABI_CallFunctionAC(void *func, const Gen::OpArg &arg1, u32 param2)
|
|||
CALL(func);
|
||||
}
|
||||
|
||||
unsigned int ABI_GetAlignedFrameSize(unsigned int frameSize) {
|
||||
unsigned int XEmitter::ABI_GetAlignedFrameSize(unsigned int frameSize) {
|
||||
return frameSize;
|
||||
}
|
||||
|
||||
void ABI_AlignStack(unsigned int /*frameSize*/) {
|
||||
void XEmitter::ABI_AlignStack(unsigned int /*frameSize*/) {
|
||||
}
|
||||
|
||||
void ABI_RestoreStack(unsigned int /*frameSize*/) {
|
||||
void XEmitter::ABI_RestoreStack(unsigned int /*frameSize*/) {
|
||||
}
|
||||
|
||||
#ifdef _WIN32
|
||||
|
||||
// Win64 Specific Code
|
||||
// ====================================
|
||||
void ABI_PushAllCalleeSavedRegsAndAdjustStack() {
|
||||
void XEmitter::ABI_PushAllCalleeSavedRegsAndAdjustStack() {
|
||||
//we only want to do this once
|
||||
PUSH(RBX);
|
||||
PUSH(RSI);
|
||||
|
@ -218,7 +219,7 @@ void ABI_PushAllCalleeSavedRegsAndAdjustStack() {
|
|||
SUB(64, R(RSP), Imm8(0x28));
|
||||
}
|
||||
|
||||
void ABI_PopAllCalleeSavedRegsAndAdjustStack() {
|
||||
void XEmitter::ABI_PopAllCalleeSavedRegsAndAdjustStack() {
|
||||
ADD(64, R(RSP), Imm8(0x28));
|
||||
POP(R15);
|
||||
POP(R14);
|
||||
|
@ -232,7 +233,7 @@ void ABI_PopAllCalleeSavedRegsAndAdjustStack() {
|
|||
|
||||
// Win64 Specific Code
|
||||
// ====================================
|
||||
void ABI_PushAllCallerSavedRegsAndAdjustStack() {
|
||||
void XEmitter::ABI_PushAllCallerSavedRegsAndAdjustStack() {
|
||||
PUSH(RCX);
|
||||
PUSH(RDX);
|
||||
PUSH(RSI);
|
||||
|
@ -245,7 +246,7 @@ void ABI_PushAllCallerSavedRegsAndAdjustStack() {
|
|||
SUB(64, R(RSP), Imm8(0x28));
|
||||
}
|
||||
|
||||
void ABI_PopAllCallerSavedRegsAndAdjustStack() {
|
||||
void XEmitter::ABI_PopAllCallerSavedRegsAndAdjustStack() {
|
||||
ADD(64, R(RSP), Imm8(0x28));
|
||||
POP(R11);
|
||||
POP(R10);
|
||||
|
@ -260,7 +261,7 @@ void ABI_PopAllCallerSavedRegsAndAdjustStack() {
|
|||
#else
|
||||
// Unix64 Specific Code
|
||||
// ====================================
|
||||
void ABI_PushAllCalleeSavedRegsAndAdjustStack() {
|
||||
void XEmitter::ABI_PushAllCalleeSavedRegsAndAdjustStack() {
|
||||
PUSH(RBX);
|
||||
PUSH(RBP);
|
||||
PUSH(R12);
|
||||
|
@ -270,7 +271,7 @@ void ABI_PushAllCalleeSavedRegsAndAdjustStack() {
|
|||
PUSH(R15); //just to align stack. duped push/pop doesn't hurt.
|
||||
}
|
||||
|
||||
void ABI_PopAllCalleeSavedRegsAndAdjustStack() {
|
||||
void XEmitter::ABI_PopAllCalleeSavedRegsAndAdjustStack() {
|
||||
POP(R15);
|
||||
POP(R15);
|
||||
POP(R14);
|
||||
|
@ -280,7 +281,7 @@ void ABI_PopAllCalleeSavedRegsAndAdjustStack() {
|
|||
POP(RBX);
|
||||
}
|
||||
|
||||
void ABI_PushAllCallerSavedRegsAndAdjustStack() {
|
||||
void XEmitter::ABI_PushAllCallerSavedRegsAndAdjustStack() {
|
||||
PUSH(RCX);
|
||||
PUSH(RDX);
|
||||
PUSH(RSI);
|
||||
|
@ -292,7 +293,7 @@ void ABI_PushAllCallerSavedRegsAndAdjustStack() {
|
|||
PUSH(R11);
|
||||
}
|
||||
|
||||
void ABI_PopAllCallerSavedRegsAndAdjustStack() {
|
||||
void XEmitter::ABI_PopAllCallerSavedRegsAndAdjustStack() {
|
||||
POP(R11);
|
||||
POP(R11);
|
||||
POP(R10);
|
||||
|
|
|
@ -18,8 +18,6 @@
|
|||
#ifndef _JIT_ABI_H
|
||||
#define _JIT_ABI_H
|
||||
|
||||
#include "x64Emitter.h"
|
||||
|
||||
// x86/x64 ABI:s, and helpers to help follow them when JIT-ing code.
|
||||
// All convensions return values in EAX (+ possibly EDX).
|
||||
|
||||
|
@ -81,42 +79,5 @@
|
|||
|
||||
#endif
|
||||
|
||||
// Utility functions
|
||||
// These only support u32 parameters, but that's enough for a lot of uses.
|
||||
// These will destroy the 1 or 2 first "parameter regs".
|
||||
void ABI_CallFunctionC(void *func, u32 param1);
|
||||
void ABI_CallFunctionCC(void *func, u32 param1, u32 param2);
|
||||
void ABI_CallFunctionAC(void *func, const Gen::OpArg &arg1, u32 param2);
|
||||
|
||||
// Pass a register as a paremeter.
|
||||
void ABI_CallFunctionR(void *func, Gen::X64Reg reg1);
|
||||
void ABI_CallFunctionRR(void *func, Gen::X64Reg reg1, Gen::X64Reg reg2);
|
||||
|
||||
// A function that doesn't have any control over what it will do to regs,
|
||||
// such as the dispatcher, should be surrounded by these.
|
||||
void ABI_PushAllCalleeSavedRegsAndAdjustStack();
|
||||
void ABI_PopAllCalleeSavedRegsAndAdjustStack();
|
||||
|
||||
// A function that doesn't know anything about it's surroundings, should
|
||||
// be surrounded by these to establish a safe environment, where it can roam free.
|
||||
// An example is a backpatch injected function.
|
||||
void ABI_PushAllCallerSavedRegsAndAdjustStack();
|
||||
void ABI_PopAllCallerSavedRegsAndAdjustStack();
|
||||
|
||||
unsigned int ABI_GetAlignedFrameSize(unsigned int frameSize);
|
||||
void ABI_AlignStack(unsigned int frameSize);
|
||||
void ABI_RestoreStack(unsigned int frameSize);
|
||||
|
||||
// Sets up a __cdecl function.
|
||||
// Only x64 really needs the parameter.
|
||||
void ABI_EmitPrologue(int maxCallParams);
|
||||
void ABI_EmitEpilogue(int maxCallParams);
|
||||
|
||||
#ifdef _M_IX86
|
||||
inline int ABI_GetNumXMMRegs() { return 8; }
|
||||
#else
|
||||
inline int ABI_GetNumXMMRegs() { return 16; }
|
||||
#endif
|
||||
|
||||
#endif // _JIT_ABI_H
|
||||
|
||||
|
|
|
@ -38,7 +38,7 @@
|
|||
// This is purposedely not a full wrapper for virtualalloc/mmap, but it
|
||||
// provides exactly the primitive operations that Dolphin needs.
|
||||
|
||||
void* AllocateExecutableMemory(int size, bool low)
|
||||
void* AllocateExecutableMemory(size_t size, bool low)
|
||||
{
|
||||
#ifdef _WIN32
|
||||
void* ptr = VirtualAlloc(0, size, MEM_COMMIT, PAGE_EXECUTE_READWRITE);
|
||||
|
@ -71,7 +71,7 @@ void* AllocateExecutableMemory(int size, bool low)
|
|||
}
|
||||
|
||||
|
||||
void* AllocateMemoryPages(int size)
|
||||
void* AllocateMemoryPages(size_t size)
|
||||
{
|
||||
#ifdef _WIN32
|
||||
void* ptr = VirtualAlloc(0, size, MEM_COMMIT, PAGE_READWRITE);
|
||||
|
@ -99,7 +99,7 @@ void* AllocateMemoryPages(int size)
|
|||
}
|
||||
|
||||
|
||||
void FreeMemoryPages(void* ptr, int size)
|
||||
void FreeMemoryPages(void* ptr, size_t size)
|
||||
{
|
||||
#ifdef _WIN32
|
||||
if (ptr)
|
||||
|
@ -113,7 +113,7 @@ void FreeMemoryPages(void* ptr, int size)
|
|||
}
|
||||
|
||||
|
||||
void WriteProtectMemory(void* ptr, int size, bool allowExecute)
|
||||
void WriteProtectMemory(void* ptr, size_t size, bool allowExecute)
|
||||
{
|
||||
#ifdef _WIN32
|
||||
VirtualProtect(ptr, size, allowExecute ? PAGE_EXECUTE_READ : PAGE_READONLY, 0);
|
||||
|
@ -123,7 +123,7 @@ void WriteProtectMemory(void* ptr, int size, bool allowExecute)
|
|||
}
|
||||
|
||||
|
||||
void UnWriteProtectMemory(void* ptr, int size, bool allowExecute)
|
||||
void UnWriteProtectMemory(void* ptr, size_t size, bool allowExecute)
|
||||
{
|
||||
#ifdef _WIN32
|
||||
VirtualProtect(ptr, size, allowExecute ? PAGE_EXECUTE_READWRITE : PAGE_READONLY, 0);
|
||||
|
|
|
@ -18,14 +18,14 @@
|
|||
#ifndef _MEMORYUTIL_H
|
||||
#define _MEMORYUTIL_H
|
||||
|
||||
void* AllocateExecutableMemory(int size, bool low = true);
|
||||
void* AllocateMemoryPages(int size);
|
||||
void FreeMemoryPages(void* ptr, int size);
|
||||
void WriteProtectMemory(void* ptr, int size, bool executable = false);
|
||||
void UnWriteProtectMemory(void* ptr, int size, bool allowExecute);
|
||||
void* AllocateExecutableMemory(size_t size, bool low = true);
|
||||
void* AllocateMemoryPages(size_t size);
|
||||
void FreeMemoryPages(void* ptr, size_t size);
|
||||
void WriteProtectMemory(void* ptr, size_t size, bool executable = false);
|
||||
void UnWriteProtectMemory(void* ptr, size_t size, bool allowExecute);
|
||||
|
||||
|
||||
inline int GetPageSize() {return(4096);}
|
||||
inline int GetPageSize() {return 4096;}
|
||||
|
||||
|
||||
#endif
|
||||
|
|
|
@ -18,33 +18,29 @@
|
|||
#include <map>
|
||||
|
||||
#include "Common.h"
|
||||
#include "Thunk.h"
|
||||
#include "x64Emitter.h"
|
||||
#include "MemoryUtil.h"
|
||||
#include "ABI.h"
|
||||
#include "Thunk.h"
|
||||
|
||||
using namespace Gen;
|
||||
ThunkManager thunks;
|
||||
|
||||
#define THUNK_ARENA_SIZE 1024*1024*1
|
||||
|
||||
namespace {
|
||||
static std::map<void *, const u8 *> thunks;
|
||||
u8 GC_ALIGNED32(saved_fp_state[16 * 4 * 4]);
|
||||
u8 GC_ALIGNED32(saved_gpr_state[16 * 8]);
|
||||
|
||||
static u8 *thunk_memory;
|
||||
static u8 *thunk_code;
|
||||
static const u8 *save_regs;
|
||||
static const u8 *load_regs;
|
||||
static u16 saved_mxcsr;
|
||||
}
|
||||
|
||||
void Thunk_Init()
|
||||
namespace
|
||||
{
|
||||
thunk_memory = (u8 *)AllocateExecutableMemory(THUNK_ARENA_SIZE);
|
||||
thunk_code = thunk_memory;
|
||||
|
||||
GenContext ctx(&thunk_code);
|
||||
static u8 GC_ALIGNED32(saved_fp_state[16 * 4 * 4]);
|
||||
static u8 GC_ALIGNED32(saved_gpr_state[16 * 8]);
|
||||
static u16 saved_mxcsr;
|
||||
|
||||
} // namespace
|
||||
|
||||
using namespace Gen;
|
||||
|
||||
void ThunkManager::Init()
|
||||
{
|
||||
AllocCodeSpace(THUNK_ARENA_SIZE);
|
||||
save_regs = GetCodePtr();
|
||||
for (int i = 2; i < ABI_GetNumXMMRegs(); i++)
|
||||
MOVAPS(M(saved_fp_state + i * 16), (X64Reg)(XMM0 + i));
|
||||
|
@ -89,31 +85,27 @@ void Thunk_Init()
|
|||
RET();
|
||||
}
|
||||
|
||||
void Thunk_Reset()
|
||||
void ThunkManager::Reset()
|
||||
{
|
||||
thunks.clear();
|
||||
thunk_code = thunk_memory;
|
||||
ResetCodePtr();
|
||||
}
|
||||
|
||||
void Thunk_Shutdown()
|
||||
void ThunkManager::Shutdown()
|
||||
{
|
||||
Thunk_Reset();
|
||||
FreeMemoryPages(thunk_memory, THUNK_ARENA_SIZE);
|
||||
thunk_memory = 0;
|
||||
thunk_code = 0;
|
||||
Reset();
|
||||
FreeCodeSpace();
|
||||
}
|
||||
|
||||
void *ProtectFunction(void *function, int num_params)
|
||||
void *ThunkManager::ProtectFunction(void *function, int num_params)
|
||||
{
|
||||
std::map<void *, const u8 *>::iterator iter;
|
||||
iter = thunks.find(function);
|
||||
if (iter != thunks.end())
|
||||
return (void *)iter->second;
|
||||
|
||||
if (!thunk_memory)
|
||||
if (!region)
|
||||
PanicAlert("Trying to protect functions before the emu is started. Bad bad bad.");
|
||||
|
||||
GenContext gen(&thunk_code);
|
||||
const u8 *call_point = GetCodePtr();
|
||||
// Make sure to align stack.
|
||||
#ifdef _M_X64
|
||||
|
|
|
@ -18,6 +18,11 @@
|
|||
#ifndef _THUNK_H
|
||||
#define _THUNK_H
|
||||
|
||||
#include <map>
|
||||
|
||||
#include "Common.h"
|
||||
#include "x64Emitter.h"
|
||||
|
||||
// This simple class creates a wrapper around a C/C++ function that saves all fp state
|
||||
// before entering it, and restores it upon exit. This is required to be able to selectively
|
||||
// call functions from generated code, without inflicting the performance hit and increase
|
||||
|
@ -30,10 +35,21 @@
|
|||
// NOT THREAD SAFE. This may only be used from the CPU thread.
|
||||
// Any other thread using this stuff will be FATAL.
|
||||
|
||||
void Thunk_Init();
|
||||
void Thunk_Reset();
|
||||
void Thunk_Shutdown();
|
||||
class ThunkManager : public Gen::XCodeBlock
|
||||
{
|
||||
std::map<void *, const u8 *> thunks;
|
||||
|
||||
void *ProtectFunction(void *function, int num_params);
|
||||
const u8 *save_regs;
|
||||
const u8 *load_regs;
|
||||
|
||||
public:
|
||||
void Init();
|
||||
void Reset();
|
||||
void Shutdown();
|
||||
|
||||
void *ProtectFunction(void *function, int num_params);
|
||||
};
|
||||
|
||||
extern ThunkManager thunks;
|
||||
|
||||
#endif
|
||||
|
|
File diff suppressed because it is too large
|
@ -21,217 +21,264 @@
|
|||
#define _DOLPHIN_INTEL_CODEGEN
|
||||
|
||||
#include "Common.h"
|
||||
#include "MemoryUtil.h"
|
||||
|
||||
namespace Gen
|
||||
{
|
||||
enum X64Reg
|
||||
|
||||
enum X64Reg
|
||||
{
|
||||
EAX = 0, EBX = 3, ECX = 1, EDX = 2,
|
||||
ESI = 6, EDI = 7, EBP = 5, ESP = 4,
|
||||
|
||||
RAX = 0, RBX = 3, RCX = 1, RDX = 2,
|
||||
RSI = 6, RDI = 7, RBP = 5, RSP = 4,
|
||||
R8 = 8, R9 = 9, R10 = 10,R11 = 11,
|
||||
R12 = 12,R13 = 13,R14 = 14,R15 = 15,
|
||||
|
||||
AL = 0, BL = 3, CL = 1, DL = 2,
|
||||
AH = 4, BH = 7, CH = 5, DH = 6,
|
||||
|
||||
AX = 0, BX = 3, CX = 1, DX = 2,
|
||||
SI = 6, DI = 7, BP = 5, SP = 4,
|
||||
|
||||
XMM0=0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7,
|
||||
XMM8, XMM9, XMM10, XMM11, XMM12, XMM13, XMM14, XMM15,
|
||||
|
||||
INVALID_REG = 0xFFFFFFFF
|
||||
};
|
||||
|
||||
enum CCFlags
|
||||
{
|
||||
CC_O = 0,
|
||||
CC_NO = 1,
|
||||
CC_B = 2, CC_C = 2, CC_NAE = 2,
|
||||
CC_NB = 3, CC_NC = 3, CC_AE = 3,
|
||||
CC_Z = 4, CC_E = 4,
|
||||
CC_NZ = 5, CC_NE = 5,
|
||||
CC_BE = 6, CC_NA = 6,
|
||||
CC_NBE = 7, CC_A = 7,
|
||||
CC_S = 8,
|
||||
CC_NS = 9,
|
||||
CC_P = 0xA, CC_PE = 0xA,
|
||||
CC_NP = 0xB, CC_PO = 0xB,
|
||||
CC_L = 0xC, CC_NGE = 0xC,
|
||||
CC_NL = 0xD, CC_GE = 0xD,
|
||||
CC_LE = 0xE, CC_NG = 0xE,
|
||||
CC_NLE = 0xF, CC_G = 0xF
|
||||
};
|
||||
|
||||
enum
|
||||
{
|
||||
NUMGPRs = 16,
|
||||
NUMXMMs = 16,
|
||||
};
|
||||
|
||||
enum
|
||||
{
|
||||
SCALE_NONE = 0,
|
||||
SCALE_1 = 1,
|
||||
SCALE_2 = 2,
|
||||
SCALE_4 = 4,
|
||||
SCALE_8 = 8,
|
||||
SCALE_ATREG = 16,
|
||||
SCALE_RIP = 0xFF,
|
||||
SCALE_IMM8 = 0xF0,
|
||||
SCALE_IMM16 = 0xF1,
|
||||
SCALE_IMM32 = 0xF2,
|
||||
SCALE_IMM64 = 0xF3,
|
||||
};
|
||||
|
||||
enum NormalOp {
|
||||
nrmADD,
|
||||
nrmADC,
|
||||
nrmSUB,
|
||||
nrmSBB,
|
||||
nrmAND,
|
||||
nrmOR ,
|
||||
nrmXOR,
|
||||
nrmMOV,
|
||||
nrmTEST,
|
||||
nrmCMP,
|
||||
nrmXCHG,
|
||||
};
|
||||
|
||||
class XEmitter;
|
||||
|
||||
// RIP addressing does not benefit from micro op fusion on Core arch
|
||||
struct OpArg
|
||||
{
|
||||
OpArg() {} // dummy op arg, used for storage
|
||||
OpArg(u64 _offset, int _scale, X64Reg rmReg = RAX, X64Reg scaledReg = RAX)
|
||||
{
|
||||
EAX = 0, EBX = 3, ECX = 1, EDX = 2,
|
||||
ESI = 6, EDI = 7, EBP = 5, ESP = 4,
|
||||
|
||||
RAX = 0, RBX = 3, RCX = 1, RDX = 2,
|
||||
RSI = 6, RDI = 7, RBP = 5, RSP = 4,
|
||||
R8 = 8, R9 = 9, R10 = 10,R11 = 11,
|
||||
R12 = 12,R13 = 13,R14 = 14,R15 = 15,
|
||||
operandReg = 0;
|
||||
scale = (u8)_scale;
|
||||
offsetOrBaseReg = (u8)rmReg;
|
||||
indexReg = (u8)scaledReg;
|
||||
//if scale == 0 never mind offseting
|
||||
offset = _offset;
|
||||
}
|
||||
void WriteRex(XEmitter *emit, bool op64, int customOp = -1) const;
|
||||
void WriteRest(XEmitter *emit, int extraBytes=0, X64Reg operandReg=(X64Reg)0xFF) const;
|
||||
void WriteSingleByteOp(XEmitter *emit, u8 op, X64Reg operandReg, int bits);
|
||||
// This one is public - must be written to
|
||||
u64 offset; // use RIP-relative as much as possible - 64-bit immediates are not available.
|
||||
u8 operandReg;
|
||||
|
||||
AL = 0, BL = 3, CL = 1, DL = 2,
|
||||
AH = 4, BH = 7, CH = 5, DH = 6,
|
||||
void WriteNormalOp(XEmitter *emit, bool toRM, NormalOp op, const OpArg &operand, int bits) const;
|
||||
bool IsImm() const {return scale == SCALE_IMM8 || scale == SCALE_IMM16 || scale == SCALE_IMM32 || scale == SCALE_IMM64;}
|
||||
bool IsSimpleReg() const {return scale == SCALE_NONE;}
|
||||
bool IsSimpleReg(X64Reg reg) const {
|
||||
if (!IsSimpleReg())
|
||||
return false;
|
||||
return GetSimpleReg() == reg;
|
||||
}
|
||||
|
||||
AX = 0, BX = 3, CX = 1, DX = 2,
|
||||
SI = 6, DI = 7, BP = 5, SP = 4,
|
||||
|
||||
XMM0=0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7,
|
||||
XMM8, XMM9, XMM10, XMM11, XMM12, XMM13, XMM14, XMM15,
|
||||
|
||||
INVALID_REG = 0xFFFFFFFF
|
||||
};
|
||||
|
||||
enum CCFlags
|
||||
bool CanDoOpWith(const OpArg &other) const
|
||||
{
|
||||
CC_O = 0,
|
||||
CC_NO = 1,
|
||||
CC_B = 2, CC_C = 2, CC_NAE = 2,
|
||||
CC_NB = 3, CC_NC = 3, CC_AE = 3,
|
||||
CC_Z = 4, CC_E = 4,
|
||||
CC_NZ = 5, CC_NE = 5,
|
||||
CC_BE = 6, CC_NA = 6,
|
||||
CC_NBE = 7, CC_A = 7,
|
||||
CC_S = 8,
|
||||
CC_NS = 9,
|
||||
CC_P = 0xA, CC_PE = 0xA,
|
||||
CC_NP = 0xB, CC_PO = 0xB,
|
||||
CC_L = 0xC, CC_NGE = 0xC,
|
||||
CC_NL = 0xD, CC_GE = 0xD,
|
||||
CC_LE = 0xE, CC_NG = 0xE,
|
||||
CC_NLE = 0xF, CC_G = 0xF
|
||||
};
|
||||
if (IsSimpleReg()) return true;
|
||||
if (!IsSimpleReg() && !other.IsSimpleReg() && !other.IsImm()) return false;
|
||||
return true;
|
||||
}
|
||||
|
||||
enum
|
||||
int GetImmBits() const
|
||||
{
|
||||
NUMGPRs = 16,
|
||||
NUMXMMs = 16,
|
||||
};
|
||||
switch (scale)
|
||||
{
|
||||
case SCALE_IMM8: return 8;
|
||||
case SCALE_IMM16: return 16;
|
||||
case SCALE_IMM32: return 32;
|
||||
case SCALE_IMM64: return 64;
|
||||
default: return -1;
|
||||
}
|
||||
}
|
||||
|
||||
enum
|
||||
X64Reg GetSimpleReg() const
|
||||
{
|
||||
SCALE_NONE = 0,
|
||||
SCALE_1 = 1,
|
||||
SCALE_2 = 2,
|
||||
SCALE_4 = 4,
|
||||
SCALE_8 = 8,
|
||||
SCALE_ATREG = 16,
|
||||
SCALE_RIP = 0xFF,
|
||||
SCALE_IMM8 = 0xF0,
|
||||
SCALE_IMM16 = 0xF1,
|
||||
SCALE_IMM32 = 0xF2,
|
||||
SCALE_IMM64 = 0xF3,
|
||||
};
|
||||
if (scale == SCALE_NONE)
|
||||
return (X64Reg)offsetOrBaseReg;
|
||||
else
|
||||
return INVALID_REG;
|
||||
}
|
||||
private:
|
||||
u8 scale;
|
||||
u8 offsetOrBaseReg;
|
||||
u8 indexReg;
|
||||
};
|
||||
|
||||
inline OpArg M(void *ptr) {return OpArg((u64)ptr, (int)SCALE_RIP);}
|
||||
inline OpArg R(X64Reg value) {return OpArg(0, SCALE_NONE, value);}
|
||||
inline OpArg MatR(X64Reg value) {return OpArg(0, SCALE_ATREG, value);}
|
||||
inline OpArg MDisp(X64Reg value, int offset) {
|
||||
return OpArg((u32)offset, SCALE_ATREG, value); }
|
||||
inline OpArg MComplex(X64Reg base, X64Reg scaled, int scale, int offset)
|
||||
{
|
||||
return OpArg(offset, scale, base, scaled);
|
||||
}
|
||||
inline OpArg Imm8 (u8 imm) {return OpArg(imm, SCALE_IMM8);}
|
||||
inline OpArg Imm16(u16 imm) {return OpArg(imm, SCALE_IMM16);} //rarely used
|
||||
inline OpArg Imm32(u32 imm) {return OpArg(imm, SCALE_IMM32);}
|
||||
inline OpArg Imm64(u64 imm) {return OpArg(imm, SCALE_IMM64);}
|
||||
#ifdef _M_X64
|
||||
inline OpArg ImmPtr(void* imm) {return Imm64((u64)imm);}
|
||||
#else
|
||||
inline OpArg ImmPtr(void* imm) {return Imm32((u32)imm);}
|
||||
#endif
|
||||
|
||||
struct FixupBranch
|
||||
{
|
||||
u8 *ptr;
|
||||
int type; //0 = 8bit 1 = 32bit
|
||||
};
|
||||
|
||||
enum SSECompare
|
||||
{
|
||||
EQ = 0,
|
||||
LT,
|
||||
LE,
|
||||
UNORD,
|
||||
NEQ,
|
||||
NLT,
|
||||
NLE,
|
||||
ORD,
|
||||
};
|
||||
|
||||
typedef const u8* JumpTarget;
|
||||
|
||||
class XEmitter
|
||||
{
|
||||
friend struct OpArg; // for Write8 etc
|
||||
private:
|
||||
u8 *code;
|
||||
|
||||
void Rex(int w, int r, int x, int b);
|
||||
void WriteSimple1Byte(int bits, u8 byte, X64Reg reg);
|
||||
void WriteSimple2Byte(int bits, u8 byte1, u8 byte2, X64Reg reg);
|
||||
void WriteMulDivType(int bits, OpArg src, int ext);
|
||||
void WriteBitSearchType(int bits, X64Reg dest, OpArg src, u8 byte2);
|
||||
void WriteShift(int bits, OpArg dest, OpArg &shift, int ext);
|
||||
void WriteMXCSR(OpArg arg, int ext);
|
||||
void WriteSSEOp(int size, u8 sseOp, bool packed, X64Reg regOp, OpArg arg, int extrabytes = 0);
|
||||
void WriteNormalOp(XEmitter *emit, int bits, NormalOp op, const OpArg &a1, const OpArg &a2);
|
||||
|
||||
protected:
|
||||
inline void Write8(u8 value) {*code++ = value;}
|
||||
inline void Write16(u16 value) {*(u16*)code = (value); code += 2;}
|
||||
inline void Write32(u32 value) {*(u32*)code = (value); code += 4;}
|
||||
inline void Write64(u64 value) {*(u64*)code = (value); code += 8;}
|
||||
|
||||
public:
|
||||
XEmitter() { code = NULL; }
|
||||
XEmitter(u8 *code_ptr) { code = code_ptr; }
|
||||
|
||||
void WriteModRM(int mod, int rm, int reg);
|
||||
void WriteSIB(int scale, int index, int base);
|
||||
|
||||
void SetCodePtr(u8 *ptr);
|
||||
void ReserveCodeSpace(int bytes);
|
||||
const u8 *AlignCode4();
|
||||
const u8 *AlignCode16();
|
||||
const u8 *AlignCodePage();
|
||||
const u8 *GetCodePtr();
|
||||
const u8 *GetCodePtr() const;
|
||||
u8 *GetWritableCodePtr();
|
||||
|
||||
// Looking for one of these? It's BANNED!! Some instructions are slow on modern CPU
|
||||
// INC, DEC, LOOP, LOOPNE, LOOPE, ENTER, LEAVE, XCHG, XLAT, REP MOVSB/MOVSD, REP SCASD + other string instr.,
|
||||
// INC and DEC are slow on Intel Core, but not on AMD. They create a
|
||||
// false flag dependency because they only update a subset of the flags.
|
||||
// XCHG is SLOW and should be avoided.
|
||||
|
||||
// Safe way to temporarily redirect the code generator.
|
||||
class GenContext
|
||||
{
|
||||
u8 **code_ptr_ptr;
|
||||
u8 *saved_ptr;
|
||||
public:
|
||||
GenContext(u8 **code_ptr_ptr_)
|
||||
{
|
||||
saved_ptr = GetWritableCodePtr();
|
||||
code_ptr_ptr = code_ptr_ptr_;
|
||||
SetCodePtr(*code_ptr_ptr);
|
||||
}
|
||||
~GenContext()
|
||||
{
|
||||
*code_ptr_ptr = GetWritableCodePtr();
|
||||
SetCodePtr(saved_ptr);
|
||||
}
|
||||
};
|
||||
|
||||
enum NormalOp {
|
||||
nrmADD,
|
||||
nrmADC,
|
||||
nrmSUB,
|
||||
nrmSBB,
|
||||
nrmAND,
|
||||
nrmOR ,
|
||||
nrmXOR,
|
||||
nrmMOV,
|
||||
nrmTEST,
|
||||
nrmCMP,
|
||||
nrmXCHG,
|
||||
};
|
||||
|
||||
// Make the generation routine examine which direction to go
|
||||
// probably has to be a static
|
||||
|
||||
// RIP addressing does not benefit from micro op fusion on Core arch
|
||||
struct OpArg
|
||||
{
|
||||
OpArg() {} //dummy op arg, used for storage
|
||||
OpArg(u64 _offset, int _scale, X64Reg rmReg = RAX, X64Reg scaledReg = RAX)
|
||||
{
|
||||
operandReg = 0;
|
||||
scale = (u8)_scale;
|
||||
offsetOrBaseReg = (u8)rmReg;
|
||||
indexReg = (u8)scaledReg;
|
||||
//if scale == 0 never mind offseting
|
||||
offset = _offset;
|
||||
}
|
||||
void WriteRex(bool op64, int customOp = -1) const;
|
||||
void WriteRest(int extraBytes=0, X64Reg operandReg=(X64Reg)0xFF) const;
|
||||
void WriteSingleByteOp(u8 op, X64Reg operandReg, int bits);
|
||||
//This one is public - must be written to
|
||||
u64 offset; //use RIP-relative as much as possible - avoid 64-bit immediates at all costs
|
||||
u8 operandReg;
|
||||
|
||||
void WriteNormalOp(bool toRM, NormalOp op, const OpArg &operand, int bits) const;
|
||||
bool IsImm() const {return scale == SCALE_IMM8 || scale == SCALE_IMM16 || scale == SCALE_IMM32 || scale == SCALE_IMM64;}
|
||||
bool IsSimpleReg() const {return scale == SCALE_NONE;}
|
||||
bool IsSimpleReg(X64Reg reg) const {
|
||||
if (!IsSimpleReg())
|
||||
return false;
|
||||
return GetSimpleReg() == reg;
|
||||
}
|
||||
bool CanDoOpWith(const OpArg &other) const
|
||||
{
|
||||
if (IsSimpleReg()) return true;
|
||||
if (!IsSimpleReg() && !other.IsSimpleReg() && !other.IsImm()) return false;
|
||||
return true;
|
||||
}
|
||||
|
||||
int GetImmBits() const
|
||||
{
|
||||
switch (scale)
|
||||
{
|
||||
case SCALE_IMM8: return 8;
|
||||
case SCALE_IMM16: return 16;
|
||||
case SCALE_IMM32: return 32;
|
||||
case SCALE_IMM64: return 64;
|
||||
default: return -1;
|
||||
}
|
||||
}
|
||||
X64Reg GetSimpleReg() const
|
||||
{
|
||||
if (scale == SCALE_NONE)
|
||||
return (X64Reg)offsetOrBaseReg;
|
||||
else
|
||||
return INVALID_REG;
|
||||
}
|
||||
private:
|
||||
u8 scale;
|
||||
u8 offsetOrBaseReg;
|
||||
u8 indexReg;
|
||||
};
|
||||
|
||||
inline OpArg M(void *ptr) {return OpArg((u64)ptr, (int)SCALE_RIP);}
|
||||
inline OpArg R(X64Reg value) {return OpArg(0, SCALE_NONE, value);}
|
||||
inline OpArg MatR(X64Reg value) {return OpArg(0, SCALE_ATREG, value);}
|
||||
inline OpArg MDisp(X64Reg value, int offset) {
|
||||
return OpArg((u32)offset, SCALE_ATREG, value); }
|
||||
inline OpArg MComplex(X64Reg base, X64Reg scaled, int scale, int offset)
|
||||
{
|
||||
return OpArg(offset, scale, base, scaled);
|
||||
}
|
||||
inline OpArg Imm8 (u8 imm) {return OpArg(imm, SCALE_IMM8);}
|
||||
inline OpArg Imm16(u16 imm) {return OpArg(imm, SCALE_IMM16);} //rarely used
|
||||
inline OpArg Imm32(u32 imm) {return OpArg(imm, SCALE_IMM32);}
|
||||
inline OpArg Imm64(u64 imm) {return OpArg(imm, SCALE_IMM64);}
|
||||
#ifdef _M_X64
|
||||
inline OpArg ImmPtr(void* imm) {return Imm64((u64)imm);}
|
||||
#else
|
||||
inline OpArg ImmPtr(void* imm) {return Imm32((u32)imm);}
|
||||
#endif
|
||||
|
||||
// Debug breakpoint
|
||||
void INT3();
|
||||
|
||||
// Do nothing
|
||||
void NOP(int count = 1); //nop padding - TODO: fast nop slides, for amd and intel (check their manuals)
|
||||
|
||||
// Save energy in wait-loops on P4 only. Probably not too useful.
|
||||
void PAUSE();
|
||||
void RET();
|
||||
|
||||
// Flag control
|
||||
void STC();
|
||||
void CLC();
|
||||
void CMC();
|
||||
|
||||
// These two can not be executed in 64-bit mode on early Intel 64-bit CPU:s, only on Core2 and AMD!
|
||||
void LAHF(); // 3 cycle vector path
|
||||
void SAHF(); // direct path fast
|
||||
|
||||
|
||||
// Stack control
|
||||
void PUSH(X64Reg reg);
|
||||
void POP(X64Reg reg);
|
||||
void PUSH(int bits, const OpArg ®);
|
||||
void POP(int bits, const OpArg ®);
|
||||
void PUSHF();
|
||||
void POPF();
|
||||
|
||||
typedef const u8* JumpTarget;
|
||||
|
||||
struct FixupBranch
|
||||
{
|
||||
u8 *ptr;
|
||||
int type; //0 = 8bit 1 = 32bit
|
||||
};
|
||||
|
||||
// Flow control
|
||||
void RET();
|
||||
void RET_FAST();
|
||||
void UD2();
|
||||
FixupBranch J(bool force5bytes = false);
|
||||
|
||||
void JMP(const u8 * addr, bool force5Bytes = false);
|
||||
|
@ -239,7 +286,7 @@ namespace Gen
|
|||
void JMPptr(const OpArg &arg);
|
||||
void JMPself(); //infinite loop!
|
||||
|
||||
void CALL(void *fnptr);
|
||||
void CALL(const void *fnptr);
|
||||
void CALLptr(OpArg arg);
|
||||
|
||||
FixupBranch J_CC(CCFlags conditionCode, bool force5bytes = false);
|
||||
|
@ -248,66 +295,20 @@ namespace Gen
|
|||
|
||||
void SetJumpTarget(const FixupBranch &branch);
|
||||
|
||||
//WARNING - INC and DEC slow on Intel Core, but not on AMD, since it creates
|
||||
//false flags dependencies because they only update a subset of the flags
|
||||
|
||||
// ector - I hereby BAN inc and dec due to their horribleness :P
|
||||
// void INC(int bits, OpArg arg);
|
||||
// void DEC(int bits, OpArg arg);
|
||||
|
||||
void SETcc(CCFlags flag, OpArg dest);
|
||||
// Note: CMOV brings small if any benefit on current cpus, unfortunately.
|
||||
// Note: CMOV brings small if any benefit on current cpus.
|
||||
void CMOVcc(int bits, X64Reg dest, OpArg src, CCFlags flag);
|
||||
|
||||
// Fences
|
||||
void LFENCE();
|
||||
void MFENCE();
|
||||
void SFENCE();
|
||||
|
||||
// Bit scan
|
||||
void BSF(int bits, X64Reg dest, OpArg src); //bottom bit to top bit
|
||||
void BSR(int bits, X64Reg dest, OpArg src); //top bit to bottom bit
|
||||
|
||||
//These two can not be executed on early Intel 64-bit CPU:s, only on AMD!
|
||||
|
||||
void LAHF(); // 3 cycle vector path
|
||||
void SAHF(); // direct path fast
|
||||
|
||||
//Looking for one of these? It's BANNED!! Some instructions are slow on modern CPU
|
||||
//LOOP, LOOPNE, LOOPE, ENTER, LEAVE, XLAT, REP MOVSB/MOVSD, REP SCASD + other string instr.,
|
||||
|
||||
//Actually REP MOVSD could be useful :P
|
||||
|
||||
void MOVNTI(int bits, OpArg dest, X64Reg src);
|
||||
|
||||
void MUL(int bits, OpArg src); //UNSIGNED
|
||||
void DIV(int bits, OpArg src);
|
||||
void IMUL(int bits, OpArg src); //SIGNED
|
||||
void IDIV(int bits, OpArg src);
|
||||
void IMUL(int bits, X64Reg regOp, OpArg src);
|
||||
void IMUL(int bits, X64Reg regOp, OpArg src, OpArg imm);
|
||||
|
||||
|
||||
void NEG(int bits, OpArg src);
|
||||
void NOT(int bits, OpArg src);
|
||||
|
||||
void ROL(int bits, OpArg dest, OpArg shift);
|
||||
void ROR(int bits, OpArg dest, OpArg shift);
|
||||
void RCL(int bits, OpArg dest, OpArg shift);
|
||||
void RCR(int bits, OpArg dest, OpArg shift);
|
||||
void SHL(int bits, OpArg dest, OpArg shift);
|
||||
void SHR(int bits, OpArg dest, OpArg shift);
|
||||
void SAR(int bits, OpArg dest, OpArg shift);
|
||||
|
||||
|
||||
void CWD(int bits = 16);
|
||||
inline void CDQ() {CWD(32);}
|
||||
inline void CQO() {CWD(64);}
|
||||
void CBW(int bits = 8);
|
||||
inline void CWDE() {CBW(16);}
|
||||
inline void CDQE() {CBW(32);}
|
||||
|
||||
void LEA(int bits, X64Reg dest, OpArg src);
|
||||
|
||||
|
||||
// Cache control
|
||||
enum PrefetchLevel
|
||||
{
|
||||
PF_NTA, //Non-temporal (data used once and only once)
|
||||
|
@ -316,58 +317,82 @@ namespace Gen
|
|||
PF_T2, //Levels 3+ (aliased to T0 on AMD)
|
||||
};
|
||||
void PREFETCH(PrefetchLevel level, OpArg arg);
|
||||
|
||||
void MOVNTI(int bits, OpArg dest, X64Reg src);
|
||||
void MOVNTDQ(OpArg arg, X64Reg regOp);
|
||||
void MOVNTPS(OpArg arg, X64Reg regOp);
|
||||
void MOVNTPD(OpArg arg, X64Reg regOp);
|
||||
|
||||
// Multiplication / division
|
||||
void MUL(int bits, OpArg src); //UNSIGNED
|
||||
void IMUL(int bits, OpArg src); //SIGNED
|
||||
void IMUL(int bits, X64Reg regOp, OpArg src);
|
||||
void IMUL(int bits, X64Reg regOp, OpArg src, OpArg imm);
|
||||
void DIV(int bits, OpArg src);
|
||||
void IDIV(int bits, OpArg src);
|
||||
|
||||
// Shift
|
||||
void ROL(int bits, OpArg dest, OpArg shift);
|
||||
void ROR(int bits, OpArg dest, OpArg shift);
|
||||
void RCL(int bits, OpArg dest, OpArg shift);
|
||||
void RCR(int bits, OpArg dest, OpArg shift);
|
||||
void SHL(int bits, OpArg dest, OpArg shift);
|
||||
void SHR(int bits, OpArg dest, OpArg shift);
|
||||
void SAR(int bits, OpArg dest, OpArg shift);
|
||||
|
||||
// Extend EAX into EDX in various ways
|
||||
void CWD(int bits = 16);
|
||||
inline void CDQ() {CWD(32);}
|
||||
inline void CQO() {CWD(64);}
|
||||
void CBW(int bits = 8);
|
||||
inline void CWDE() {CBW(16);}
|
||||
inline void CDQE() {CBW(32);}
|
||||
|
||||
// Load effective address
|
||||
void LEA(int bits, X64Reg dest, OpArg src);
|
||||
|
||||
// Integer arithmetic
|
||||
void NEG (int bits, OpArg src);
|
||||
void ADD (int bits, const OpArg &a1, const OpArg &a2);
|
||||
void ADC (int bits, const OpArg &a1, const OpArg &a2);
|
||||
void SUB (int bits, const OpArg &a1, const OpArg &a2);
|
||||
void SBB (int bits, const OpArg &a1, const OpArg &a2);
|
||||
void AND (int bits, const OpArg &a1, const OpArg &a2);
|
||||
void CMP (int bits, const OpArg &a1, const OpArg &a2);
|
||||
|
||||
// Bit operations
|
||||
void NOT (int bits, OpArg src);
|
||||
void OR (int bits, const OpArg &a1, const OpArg &a2);
|
||||
void XOR (int bits, const OpArg &a1, const OpArg &a2);
|
||||
void MOV (int bits, const OpArg &a1, const OpArg &a2);
|
||||
void TEST(int bits, const OpArg &a1, const OpArg &a2);
|
||||
void CMP (int bits, const OpArg &a1, const OpArg &a2);
|
||||
|
||||
// XCHG is SLOW and should be avoided.
|
||||
//void XCHG(int bits, const OpArg &a1, const OpArg &a2);
|
||||
|
||||
// Are these useful at all? Consider removing.
|
||||
void XCHG(int bits, const OpArg &a1, const OpArg &a2);
|
||||
void XCHG_AHAL();
|
||||
|
||||
// Byte swapping (32 and 64-bit only).
|
||||
void BSWAP(int bits, X64Reg reg);
|
||||
|
||||
// Sign/zero extension
|
||||
void MOVSX(int dbits, int sbits, X64Reg dest, OpArg src); //automatically uses MOVSXD if necessary
|
||||
void MOVZX(int dbits, int sbits, X64Reg dest, OpArg src);
|
||||
|
||||
enum SSECompare
|
||||
{
|
||||
EQ = 0,
|
||||
LT,
|
||||
LE,
|
||||
UNORD,
|
||||
NEQ,
|
||||
NLT,
|
||||
NLE,
|
||||
ORD,
|
||||
};
|
||||
|
||||
// WARNING - These two take 11-13 cycles and are VectorPath! (AMD64)
|
||||
void STMXCSR(OpArg memloc);
|
||||
void LDMXCSR(OpArg memloc);
|
||||
|
||||
// Regular SSE/SSE2 instructions
|
||||
// Prefixes
|
||||
void LOCK();
|
||||
void REP();
|
||||
void REPNE();
|
||||
|
||||
void FWAIT();
|
||||
|
||||
// SSE/SSE2: Floating point arithmetic
|
||||
void ADDSS(X64Reg regOp, OpArg arg);
|
||||
void ADDSD(X64Reg regOp, OpArg arg);
|
||||
void SUBSS(X64Reg regOp, OpArg arg);
|
||||
void SUBSD(X64Reg regOp, OpArg arg);
|
||||
void CMPSS(X64Reg regOp, OpArg arg, u8 compare);
|
||||
void CMPSD(X64Reg regOp, OpArg arg, u8 compare);
|
||||
void ANDSS(X64Reg regOp, OpArg arg);
|
||||
void ANDSD(X64Reg regOp, OpArg arg);
|
||||
void ANDNSS(X64Reg regOp, OpArg arg);
|
||||
void ANDNSD(X64Reg regOp, OpArg arg);
|
||||
void ORSS(X64Reg regOp, OpArg arg);
|
||||
void ORSD(X64Reg regOp, OpArg arg);
|
||||
void XORSS(X64Reg regOp, OpArg arg);
|
||||
void XORSD(X64Reg regOp, OpArg arg);
|
||||
void MULSS(X64Reg regOp, OpArg arg);
|
||||
void MULSD(X64Reg regOp, OpArg arg);
|
||||
void DIVSS(X64Reg regOp, OpArg arg);
|
||||
|
@ -381,45 +406,65 @@ namespace Gen
|
|||
void RSQRTSS(X64Reg regOp, OpArg arg);
|
||||
void RSQRTSD(X64Reg regOp, OpArg arg);
|
||||
|
||||
void COMISS(X64Reg regOp, OpArg arg);
|
||||
void COMISD(X64Reg regOp, OpArg arg);
|
||||
// SSE/SSE2: Floating point bitwise (yes)
|
||||
void CMPSS(X64Reg regOp, OpArg arg, u8 compare);
|
||||
void CMPSD(X64Reg regOp, OpArg arg, u8 compare);
|
||||
void ANDSS(X64Reg regOp, OpArg arg);
|
||||
void ANDSD(X64Reg regOp, OpArg arg);
|
||||
void ANDNSS(X64Reg regOp, OpArg arg);
|
||||
void ANDNSD(X64Reg regOp, OpArg arg);
|
||||
void ORSS(X64Reg regOp, OpArg arg);
|
||||
void ORSD(X64Reg regOp, OpArg arg);
|
||||
void XORSS(X64Reg regOp, OpArg arg);
|
||||
void XORSD(X64Reg regOp, OpArg arg);
|
||||
|
||||
// SSE/SSE2: Floating point packed arithmetic (x4 for float, x2 for double)
|
||||
void ADDPS(X64Reg regOp, OpArg arg);
|
||||
void ADDPD(X64Reg regOp, OpArg arg);
|
||||
void SUBPS(X64Reg regOp, OpArg arg);
|
||||
void SUBPD(X64Reg regOp, OpArg arg);
|
||||
void CMPPS(X64Reg regOp, OpArg arg, u8 compare);
|
||||
void CMPPD(X64Reg regOp, OpArg arg, u8 compare);
|
||||
void ANDPS(X64Reg regOp, OpArg arg);
|
||||
void ANDPD(X64Reg regOp, OpArg arg);
|
||||
void ANDNPS(X64Reg regOp, OpArg arg);
|
||||
void ANDNPD(X64Reg regOp, OpArg arg);
|
||||
void ORPS(X64Reg regOp, OpArg arg);
|
||||
void ORPD(X64Reg regOp, OpArg arg);
|
||||
void XORPS(X64Reg regOp, OpArg arg);
|
||||
void XORPD(X64Reg regOp, OpArg arg);
|
||||
void MULPS(X64Reg regOp, OpArg arg);
|
||||
void MULPD(X64Reg regOp, OpArg arg);
|
||||
void DIVPS(X64Reg regOp, OpArg arg);
|
||||
void DIVPD(X64Reg regOp, OpArg arg);
|
||||
void MINPS(X64Reg regOp, OpArg arg);
|
||||
void MINPD(X64Reg regOp, OpArg arg);
|
||||
void MAXPS(X64Reg regOp, OpArg arg);
|
||||
void MAXPD(X64Reg regOp, OpArg arg);
|
||||
void CMPPD(X64Reg regOp, OpArg arg, u8 compare);
|
||||
void MULPS(X64Reg regOp, OpArg arg);
|
||||
void MULPD(X64Reg regOp, OpArg arg);
|
||||
void DIVPS(X64Reg regOp, OpArg arg);
|
||||
void DIVPD(X64Reg regOp, OpArg arg);
|
||||
void MINPS(X64Reg regOp, OpArg arg);
|
||||
void MINPD(X64Reg regOp, OpArg arg);
|
||||
void MAXPS(X64Reg regOp, OpArg arg);
|
||||
void MAXPD(X64Reg regOp, OpArg arg);
|
||||
void SQRTPS(X64Reg regOp, OpArg arg);
|
||||
void SQRTPD(X64Reg regOp, OpArg arg);
|
||||
void RSQRTPS(X64Reg regOp, OpArg arg);
|
||||
void RSQRTPD(X64Reg regOp, OpArg arg);
|
||||
|
||||
// SSE/SSE2: Floating point packed bitwise (x4 for float, x2 for double)
|
||||
void ANDPS(X64Reg regOp, OpArg arg);
|
||||
void ANDPD(X64Reg regOp, OpArg arg);
|
||||
void ANDNPS(X64Reg regOp, OpArg arg);
|
||||
void ANDNPD(X64Reg regOp, OpArg arg);
|
||||
void ORPS(X64Reg regOp, OpArg arg);
|
||||
void ORPD(X64Reg regOp, OpArg arg);
|
||||
void XORPS(X64Reg regOp, OpArg arg);
|
||||
void XORPD(X64Reg regOp, OpArg arg);
|
||||
|
||||
// SSE/SSE2: Shuffle components. These are tricky - see Intel documentation.
|
||||
void SHUFPS(X64Reg regOp, OpArg arg, u8 shuffle);
|
||||
void SHUFPD(X64Reg regOp, OpArg arg, u8 shuffle);
|
||||
|
||||
|
||||
// SSE/SSE2: Useful alternative to shuffle in some cases.
|
||||
void MOVDDUP(X64Reg regOp, OpArg arg);
|
||||
|
||||
void UNPCKLPD(X64Reg dest, OpArg src);
|
||||
void UNPCKHPD(X64Reg dest, OpArg src);
|
||||
|
||||
// SSE/SSE2: Compares.
|
||||
void COMISS(X64Reg regOp, OpArg arg);
|
||||
void COMISD(X64Reg regOp, OpArg arg);
|
||||
void UCOMISS(X64Reg regOp, OpArg arg);
|
||||
void UCOMISD(X64Reg regOp, OpArg arg);
|
||||
|
||||
// SSE/SSE2: Moves. Use the right data type for your data, in most cases.
|
||||
void MOVAPS(X64Reg regOp, OpArg arg);
|
||||
void MOVAPD(X64Reg regOp, OpArg arg);
|
||||
void MOVAPS(OpArg arg, X64Reg regOp);
|
||||
|
@ -435,20 +480,20 @@ namespace Gen
|
|||
void MOVSS(OpArg arg, X64Reg regOp);
|
||||
void MOVSD(OpArg arg, X64Reg regOp);
|
||||
|
||||
void MOVMSKPS(X64Reg dest, OpArg arg);
|
||||
void MOVMSKPD(X64Reg dest, OpArg arg);
|
||||
|
||||
void MOVD_xmm(X64Reg dest, const OpArg &arg);
|
||||
void MOVQ_xmm(X64Reg dest, OpArg arg);
|
||||
void MOVD_xmm(const OpArg &arg, X64Reg src);
|
||||
void MOVQ_xmm(OpArg arg, X64Reg src);
|
||||
|
||||
// SSE/SSE2: Generates a mask from the high bits of the components of the packed register in question.
|
||||
void MOVMSKPS(X64Reg dest, OpArg arg);
|
||||
void MOVMSKPD(X64Reg dest, OpArg arg);
|
||||
|
||||
// SSE2: Selective byte store, mask in src register. EDI/RDI specifies store address. This is a weird one.
|
||||
void MASKMOVDQU(X64Reg dest, X64Reg src);
|
||||
void LDDQU(X64Reg dest, OpArg src);
|
||||
|
||||
void UNPCKLPD(X64Reg dest, OpArg src);
|
||||
void UNPCKHPD(X64Reg dest, OpArg src);
|
||||
|
||||
// SSE/SSE2: Data type conversions.
|
||||
void CVTPS2PD(X64Reg dest, OpArg src);
|
||||
void CVTPD2PS(X64Reg dest, OpArg src);
|
||||
void CVTSS2SD(X64Reg dest, OpArg src);
|
||||
|
@ -458,7 +503,7 @@ namespace Gen
|
|||
void CVTPD2DQ(X64Reg regOp, OpArg arg);
|
||||
void CVTDQ2PS(X64Reg regOp, const OpArg &arg);
|
||||
|
||||
//Integer SSE instructions
|
||||
// SSE2: Packed integer instructions
|
||||
void PACKSSDW(X64Reg dest, OpArg arg);
|
||||
void PACKSSWB(X64Reg dest, OpArg arg);
|
||||
//void PACKUSDW(X64Reg dest, OpArg arg);
|
||||
|
@ -528,42 +573,138 @@ namespace Gen
|
|||
|
||||
void RTDSC();
|
||||
|
||||
void CallCdeclFunction3(void* fnptr, u32 arg0, u32 arg1, u32 arg2);
|
||||
void CallCdeclFunction4(void* fnptr, u32 arg0, u32 arg1, u32 arg2, u32 arg3);
|
||||
void CallCdeclFunction5(void* fnptr, u32 arg0, u32 arg1, u32 arg2, u32 arg3, u32 arg4);
|
||||
void CallCdeclFunction6(void* fnptr, u32 arg0, u32 arg1, u32 arg2, u32 arg3, u32 arg4, u32 arg5);
|
||||
// Utility functions
|
||||
// These only support u32 parameters, but that's enough for a lot of uses.
|
||||
// These will destroy the 1 or 2 first "parameter regs".
|
||||
void ABI_CallFunctionC(void *func, u32 param1);
|
||||
void ABI_CallFunctionCC(void *func, u32 param1, u32 param2);
|
||||
void ABI_CallFunctionAC(void *func, const Gen::OpArg &arg1, u32 param2);
|
||||
|
||||
// Pass a register as a paremeter.
|
||||
void ABI_CallFunctionR(void *func, Gen::X64Reg reg1);
|
||||
void ABI_CallFunctionRR(void *func, Gen::X64Reg reg1, Gen::X64Reg reg2);
|
||||
|
||||
// A function that doesn't have any control over what it will do to regs,
|
||||
// such as the dispatcher, should be surrounded by these.
|
||||
void ABI_PushAllCalleeSavedRegsAndAdjustStack();
|
||||
void ABI_PopAllCalleeSavedRegsAndAdjustStack();
|
||||
|
||||
// A function that doesn't know anything about it's surroundings, should
|
||||
// be surrounded by these to establish a safe environment, where it can roam free.
|
||||
// An example is a backpatch injected function.
|
||||
void ABI_PushAllCallerSavedRegsAndAdjustStack();
|
||||
void ABI_PopAllCallerSavedRegsAndAdjustStack();
|
||||
|
||||
unsigned int ABI_GetAlignedFrameSize(unsigned int frameSize);
|
||||
void ABI_AlignStack(unsigned int frameSize);
|
||||
void ABI_RestoreStack(unsigned int frameSize);
|
||||
|
||||
// Sets up a __cdecl function.
|
||||
// Only x64 really needs the parameter.
|
||||
void ABI_EmitPrologue(int maxCallParams);
|
||||
void ABI_EmitEpilogue(int maxCallParams);
|
||||
|
||||
#ifdef _M_IX86
|
||||
inline int ABI_GetNumXMMRegs() { return 8; }
|
||||
#else
|
||||
inline int ABI_GetNumXMMRegs() { return 16; }
|
||||
#endif
|
||||
|
||||
// Strange call wrappers.
|
||||
void CallCdeclFunction3(void* fnptr, u32 arg0, u32 arg1, u32 arg2);
|
||||
void CallCdeclFunction4(void* fnptr, u32 arg0, u32 arg1, u32 arg2, u32 arg3);
|
||||
void CallCdeclFunction5(void* fnptr, u32 arg0, u32 arg1, u32 arg2, u32 arg3, u32 arg4);
|
||||
void CallCdeclFunction6(void* fnptr, u32 arg0, u32 arg1, u32 arg2, u32 arg3, u32 arg4, u32 arg5);
|
||||
|
||||
#if defined(_M_IX86) || !defined(_WIN32)
|
||||
|
||||
#define CallCdeclFunction3_I(a,b,c,d) CallCdeclFunction3((void *)(a), (b), (c), (d))
|
||||
#define CallCdeclFunction4_I(a,b,c,d,e) CallCdeclFunction4((void *)(a), (b), (c), (d), (e))
|
||||
#define CallCdeclFunction5_I(a,b,c,d,e,f) CallCdeclFunction5((void *)(a), (b), (c), (d), (e), (f))
|
||||
#define CallCdeclFunction6_I(a,b,c,d,e,f,g) CallCdeclFunction6((void *)(a), (b), (c), (d), (e), (f), (g))
|
||||
#define CallCdeclFunction3_I(a,b,c,d) CallCdeclFunction3((void *)(a), (b), (c), (d))
|
||||
#define CallCdeclFunction4_I(a,b,c,d,e) CallCdeclFunction4((void *)(a), (b), (c), (d), (e))
|
||||
#define CallCdeclFunction5_I(a,b,c,d,e,f) CallCdeclFunction5((void *)(a), (b), (c), (d), (e), (f))
|
||||
#define CallCdeclFunction6_I(a,b,c,d,e,f,g) CallCdeclFunction6((void *)(a), (b), (c), (d), (e), (f), (g))
|
||||
|
||||
#define DECLARE_IMPORT(x)
|
||||
#define DECLARE_IMPORT(x)
|
||||
|
||||
#else
|
||||
|
||||
// Comments from VertexLoader.cpp about these horrors:
|
||||
// Comments from VertexLoader.cpp about these horrors:
|
||||
|
||||
// This is a horrible hack that is necessary in 64-bit mode because Opengl32.dll is based way, way above the 32-bit
|
||||
// address space that is within reach of a CALL, and just doing &fn gives us these high uncallable addresses. So we
|
||||
// want to grab the function pointers from the import table instead.
|
||||
// This is a horrible hack that is necessary in 64-bit mode because Opengl32.dll is based way, way above the 32-bit
|
||||
// address space that is within reach of a CALL, and just doing &fn gives us these high uncallable addresses. So we
|
||||
// want to grab the function pointers from the import table instead.
|
||||
|
||||
void ___CallCdeclImport3(void* impptr, u32 arg0, u32 arg1, u32 arg2);
|
||||
void ___CallCdeclImport4(void* impptr, u32 arg0, u32 arg1, u32 arg2, u32 arg3);
|
||||
void ___CallCdeclImport5(void* impptr, u32 arg0, u32 arg1, u32 arg2, u32 arg3, u32 arg4);
|
||||
void ___CallCdeclImport6(void* impptr, u32 arg0, u32 arg1, u32 arg2, u32 arg3, u32 arg4, u32 arg5);
|
||||
void ___CallCdeclImport3(void* impptr, u32 arg0, u32 arg1, u32 arg2);
|
||||
void ___CallCdeclImport4(void* impptr, u32 arg0, u32 arg1, u32 arg2, u32 arg3);
|
||||
void ___CallCdeclImport5(void* impptr, u32 arg0, u32 arg1, u32 arg2, u32 arg3, u32 arg4);
|
||||
void ___CallCdeclImport6(void* impptr, u32 arg0, u32 arg1, u32 arg2, u32 arg3, u32 arg4, u32 arg5);
|
||||
|
||||
#define CallCdeclFunction3_I(a,b,c,d) ___CallCdeclImport3(&__imp_##a,b,c,d)
|
||||
#define CallCdeclFunction4_I(a,b,c,d,e) ___CallCdeclImport4(&__imp_##a,b,c,d,e)
|
||||
#define CallCdeclFunction5_I(a,b,c,d,e,f) ___CallCdeclImport5(&__imp_##a,b,c,d,e,f)
|
||||
#define CallCdeclFunction6_I(a,b,c,d,e,f,g) ___CallCdeclImport6(&__imp_##a,b,c,d,e,f,g)
|
||||
#define CallCdeclFunction3_I(a,b,c,d) ___CallCdeclImport3(&__imp_##a,b,c,d)
|
||||
#define CallCdeclFunction4_I(a,b,c,d,e) ___CallCdeclImport4(&__imp_##a,b,c,d,e)
|
||||
#define CallCdeclFunction5_I(a,b,c,d,e,f) ___CallCdeclImport5(&__imp_##a,b,c,d,e,f)
|
||||
#define CallCdeclFunction6_I(a,b,c,d,e,f,g) ___CallCdeclImport6(&__imp_##a,b,c,d,e,f,g)
|
||||
|
||||
#define DECLARE_IMPORT(x) extern "C" void *__imp_##x
|
||||
#define DECLARE_IMPORT(x) extern "C" void *__imp_##x
|
||||
|
||||
#endif
|
||||
}; // class XEmitter
|
||||
|
||||
}
|
||||
|
||||
// Everything that needs to generate X86 code should inherit from this.
|
||||
// You get memory management for free, plus, you can use all the MOV etc functions without
|
||||
// having to prefix them with gen-> or something similar.
|
||||
class XCodeBlock : public XEmitter
|
||||
{
|
||||
protected:
|
||||
u8 *region;
|
||||
size_t region_size;
|
||||
|
||||
public:
|
||||
XCodeBlock() : region(NULL), region_size(0) {}
|
||||
virtual ~XCodeBlock() { if (region) FreeCodeSpace(); }
|
||||
|
||||
// Call this before you generate any code.
|
||||
void AllocCodeSpace(int size)
|
||||
{
|
||||
region_size = size;
|
||||
region = (u8*)AllocateExecutableMemory(region_size);
|
||||
SetCodePtr(region);
|
||||
}
|
||||
|
||||
// Always clear code space with breakpoints, so that if someone accidentally executes
|
||||
// uninitialized, it just breaks into the debugger.
|
||||
void ClearCodeSpace()
|
||||
{
|
||||
// x86/64: 0xCC = breakpoint
|
||||
memset(region, 0xCC, region_size);
|
||||
ResetCodePtr();
|
||||
}
|
||||
|
||||
// Call this when shutting down. Don't rely on the destructor, even though it'll do the job.
|
||||
void FreeCodeSpace()
|
||||
{
|
||||
FreeMemoryPages(region, region_size);
|
||||
region = NULL;
|
||||
region_size = 0;
|
||||
}
|
||||
|
||||
// Cannot currently be undone. Will write protect the entire code region.
|
||||
// Start over if you need to change the code (call FreeCodeSpace(), AllocCodeSpace()).
|
||||
void WriteProtect()
|
||||
{
|
||||
WriteProtectMemory(region, region_size, true);
|
||||
}
|
||||
|
||||
void ResetCodePtr()
|
||||
{
|
||||
SetCodePtr(region);
|
||||
}
|
||||
|
||||
size_t GetSpaceLeft() const
|
||||
{
|
||||
return region_size - (GetCodePtr() - region);
|
||||
}
|
||||
};
|
||||
|
||||
} // namespace
|
||||
|
||||
#endif
|
||||
|
|
|
@ -46,7 +46,7 @@ namespace HW
|
|||
{
|
||||
CoreTiming::Init();
|
||||
|
||||
Thunk_Init(); // not really hw, but this way we know it's inited early :P
|
||||
thunks.Init(); // not really hw, but this way we know it's inited early :P
|
||||
State_Init();
|
||||
|
||||
// Init the whole Hardware
|
||||
|
@ -88,7 +88,7 @@ namespace HW
|
|||
}
|
||||
|
||||
State_Shutdown();
|
||||
Thunk_Shutdown();
|
||||
thunks.Shutdown();
|
||||
CoreTiming::Shutdown();
|
||||
}
|
||||
|
||||
|
|
|
@ -104,7 +104,7 @@ LONG NTAPI Handler(PEXCEPTION_POINTERS pPtrs)
|
|||
|
||||
//We could emulate the memory accesses here, but then they would still be around to take up
|
||||
//execution resources. Instead, we backpatch into a generic memory call and retry.
|
||||
u8 *new_rip = jit.BackPatch(codePtr, accessType, emAddress, ctx);
|
||||
const u8 *new_rip = jit.BackPatch(codePtr, accessType, emAddress, ctx);
|
||||
|
||||
// Rip/Eip needs to be updated.
|
||||
if (new_rip)
|
||||
|
|
|
@ -164,6 +164,8 @@ ps_adds1
|
|||
Jit64 jit;
|
||||
PPCAnalyst::CodeBuffer code_buffer(32000);
|
||||
|
||||
int CODE_SIZE = 1024*1024*16;
|
||||
|
||||
namespace CPUCompare
|
||||
{
|
||||
extern u32 m_BlockStart;
|
||||
|
@ -171,6 +173,11 @@ namespace CPUCompare
|
|||
|
||||
void Jit64::Init()
|
||||
{
|
||||
if (Core::g_CoreStartupParameter.bJITUnlimitedCache)
|
||||
{
|
||||
CODE_SIZE = 1024*1024*8*8;
|
||||
}
|
||||
|
||||
jo.optimizeStack = true;
|
||||
jo.enableBlocklink = true; // Speed boost, but not 100% safe
|
||||
#ifdef _M_X64
|
||||
|
@ -182,6 +189,23 @@ namespace CPUCompare
|
|||
jo.fpAccurateFlags = true;
|
||||
jo.optimizeGatherPipe = true;
|
||||
jo.fastInterrupts = false;
|
||||
|
||||
gpr.SetEmitter(this);
|
||||
fpr.SetEmitter(this);
|
||||
|
||||
trampolines.Init();
|
||||
AllocCodeSpace(CODE_SIZE);
|
||||
InitCache();
|
||||
asm_routines.Init();
|
||||
}
|
||||
|
||||
void Jit64::Shutdown()
|
||||
{
|
||||
FreeCodeSpace();
|
||||
ShutdownCache();
|
||||
|
||||
trampolines.Shutdown();
|
||||
asm_routines.Shutdown();
|
||||
}
|
||||
|
||||
void Jit64::WriteCallInterpreter(UGeckoInstruction _inst)
|
||||
|
@ -271,7 +295,7 @@ namespace CPUCompare
|
|||
else
|
||||
{
|
||||
MOV(32, M(&PC), Imm32(destination));
|
||||
JMP(Asm::dispatcher, true);
|
||||
JMP(asm_routines.dispatcher, true);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -280,7 +304,7 @@ namespace CPUCompare
|
|||
MOV(32, M(&PC), R(EAX));
|
||||
Cleanup();
|
||||
SUB(32, M(&CoreTiming::downcount), js.downcountAmount > 127 ? Imm32(js.downcountAmount) : Imm8(js.downcountAmount));
|
||||
JMP(Asm::dispatcher, true);
|
||||
JMP(asm_routines.dispatcher, true);
|
||||
}
|
||||
|
||||
void Jit64::WriteRfiExitDestInEAX()
|
||||
|
@ -288,7 +312,7 @@ namespace CPUCompare
|
|||
MOV(32, M(&PC), R(EAX));
|
||||
Cleanup();
|
||||
SUB(32, M(&CoreTiming::downcount), js.downcountAmount > 127 ? Imm32(js.downcountAmount) : Imm8(js.downcountAmount));
|
||||
JMP(Asm::testExceptions, true);
|
||||
JMP(asm_routines.testExceptions, true);
|
||||
}
|
||||
|
||||
void Jit64::WriteExceptionExit(u32 exception)
|
||||
|
@ -296,7 +320,7 @@ namespace CPUCompare
|
|||
Cleanup();
|
||||
OR(32, M(&PowerPC::ppcState.Exceptions), Imm32(exception));
|
||||
MOV(32, M(&PC), Imm32(js.compilerPC + 4));
|
||||
JMP(Asm::testExceptions, true);
|
||||
JMP(asm_routines.testExceptions, true);
|
||||
}
|
||||
|
||||
const u8* Jit64::DoJit(u32 emaddress, JitBlock &b)
|
||||
|
@ -326,11 +350,13 @@ namespace CPUCompare
|
|||
// Downcount flag check. The last block decremented downcounter, and the flag should still be available.
|
||||
FixupBranch skip = J_CC(CC_NBE);
|
||||
MOV(32, M(&PC), Imm32(js.blockStart));
|
||||
JMP(Asm::doTiming, true); // downcount hit zero - go doTiming.
|
||||
JMP(asm_routines.doTiming, true); // downcount hit zero - go doTiming.
|
||||
SetJumpTarget(skip);
|
||||
|
||||
const u8 *normalEntry = GetCodePtr();
|
||||
if (ImHereDebug) CALL((void *)&ImHere); //Used to get a trace of the last few blocks before a crash, sometimes VERY useful
|
||||
|
||||
if (ImHereDebug)
|
||||
CALL((void *)&ImHere); //Used to get a trace of the last few blocks before a crash, sometimes VERY useful
|
||||
|
||||
if (js.fpa.any)
|
||||
{
|
||||
|
@ -338,7 +364,7 @@ namespace CPUCompare
|
|||
TEST(32, M(&PowerPC::ppcState.msr), Imm32(1 << 13)); //Test FP enabled bit
|
||||
FixupBranch b1 = J_CC(CC_NZ);
|
||||
MOV(32, M(&PC), Imm32(js.blockStart));
|
||||
JMP(Asm::fpException, true);
|
||||
JMP(asm_routines.fpException, true);
|
||||
SetJumpTarget(b1);
|
||||
}
|
||||
|
||||
|
@ -348,7 +374,7 @@ namespace CPUCompare
|
|||
TEST(32, M(&PowerPC::ppcState.Exceptions), Imm32(0xFFFFFFFF));
|
||||
FixupBranch b1 = J_CC(CC_Z);
|
||||
MOV(32, M(&PC), Imm32(js.blockStart));
|
||||
JMP(Asm::testExceptions, true);
|
||||
JMP(asm_routines.testExceptions, true);
|
||||
SetJumpTarget(b1);
|
||||
}
|
||||
|
||||
|
@ -404,7 +430,7 @@ namespace CPUCompare
|
|||
if (jo.optimizeGatherPipe && js.fifoBytesThisBlock >= 32)
|
||||
{
|
||||
js.fifoBytesThisBlock -= 32;
|
||||
CALL(ProtectFunction((void *)&GPFifo::CheckGatherPipe, 0));
|
||||
CALL(thunks.ProtectFunction((void *)&GPFifo::CheckGatherPipe, 0));
|
||||
}
|
||||
|
||||
PPCTables::CompileInstruction(ops[i].inst);
|
||||
|
|
|
@ -24,7 +24,9 @@
|
|||
|
||||
#include "../PPCAnalyst.h"
|
||||
#include "JitCache.h"
|
||||
#include "JitRegCache.h"
|
||||
#include "x64Emitter.h"
|
||||
#include "x64Analyzer.h"
|
||||
|
||||
#ifdef _WIN32
|
||||
|
||||
|
@ -47,8 +49,24 @@ struct CONTEXT
|
|||
|
||||
#endif
|
||||
|
||||
class Jit64
|
||||
|
||||
class TrampolineCache : public Gen::XCodeBlock
|
||||
{
|
||||
public:
|
||||
void Init();
|
||||
void Shutdown();
|
||||
|
||||
const u8 *GetReadTrampoline(const InstructionInfo &info);
|
||||
const u8 *GetWriteTrampoline(const InstructionInfo &info);
|
||||
};
|
||||
|
||||
|
||||
class Jit64 : public Gen::XCodeBlock
|
||||
{
|
||||
TrampolineCache trampolines;
|
||||
GPRRegCache gpr;
|
||||
FPURegCache fpr;
|
||||
|
||||
public:
|
||||
typedef void (*CompiledCode)();
|
||||
|
||||
|
@ -157,7 +175,7 @@ public:
|
|||
bool RangeIntersect(int s1, int e1, int s2, int e2) const;
|
||||
bool IsInJitCode(const u8 *codePtr);
|
||||
|
||||
u8 *BackPatch(u8 *codePtr, int accessType, u32 emAddress, CONTEXT *ctx);
|
||||
const u8 *BackPatch(u8 *codePtr, int accessType, u32 emAddress, CONTEXT *ctx);
|
||||
|
||||
#define JIT_OPCODE 0
|
||||
|
||||
|
@ -165,6 +183,7 @@ public:
|
|||
const u8* DoJit(u32 emaddress, JitBlock &b);
|
||||
|
||||
void Init();
|
||||
void Shutdown();
|
||||
|
||||
// Utilities for use by opcodes
|
||||
|
||||
|
@ -188,10 +207,10 @@ public:
|
|||
void ForceSinglePrecisionP(Gen::X64Reg xmm);
|
||||
void JitClearCA();
|
||||
void JitSetCA();
|
||||
void tri_op(int d, int a, int b, bool reversible, void (*op)(Gen::X64Reg, Gen::OpArg));
|
||||
void tri_op(int d, int a, int b, bool reversible, void (XEmitter::*op)(Gen::X64Reg, Gen::OpArg));
|
||||
typedef u32 (*Operation)(u32 a, u32 b);
|
||||
void regimmop(int d, int a, bool binary, u32 value, Operation doop, void(*op)(int, const Gen::OpArg&, const Gen::OpArg&), bool Rc = false, bool carry = false);
|
||||
void fp_tri_op(int d, int a, int b, bool reversible, bool dupe, void (*op)(Gen::X64Reg, Gen::OpArg));
|
||||
void regimmop(int d, int a, bool binary, u32 value, Operation doop, void (XEmitter::*op)(int, const Gen::OpArg&, const Gen::OpArg&), bool Rc = false, bool carry = false);
|
||||
void fp_tri_op(int d, int a, int b, bool reversible, bool dupe, void (XEmitter::*op)(Gen::X64Reg, Gen::OpArg));
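These helpers now take pointer-to-member functions of XEmitter instead of plain function pointers, which is why every call site switches to the (this->*op)(...) syntax and every argument becomes &XEmitter::ADD and friends. A minimal, self-contained illustration of the C++ mechanism (Emitter, Jit, ADD and SUB here are invented for the example, not Dolphin's classes):

#include <cstdio>

struct Emitter {
    void ADD(int d, int s) { std::printf("add r%d, r%d\n", d, s); }
    void SUB(int d, int s) { std::printf("sub r%d, r%d\n", d, s); }
};

struct Jit : Emitter {
    // Store an operation as a pointer-to-member of Emitter.
    typedef void (Emitter::*Op)(int, int);
    void tri_op(int d, int s, Op op) {
        (this->*op)(d, s);   // dispatch through the member pointer
    }
};

int main() {
    Jit jit;
    jit.tri_op(0, 1, &Emitter::ADD);  // note the &Class::Member form, no parentheses
    jit.tri_op(2, 3, &Emitter::SUB);
    return 0;
}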
|
||||
|
||||
|
||||
// OPCODES
|
||||
|
|
|
@ -31,27 +31,12 @@
|
|||
#include "../../HW/CPUCompare.h"
|
||||
#include "../../HW/GPFifo.h"
|
||||
#include "../../Core.h"
|
||||
#include "JitAsm.h"
|
||||
|
||||
using namespace Gen;
|
||||
int blocksExecuted;
|
||||
|
||||
namespace Asm
|
||||
{
|
||||
const u8 *enterCode;
|
||||
const u8 *testExceptions;
|
||||
const u8 *fpException;
|
||||
const u8 *doTiming;
|
||||
const u8 *dispatcher;
|
||||
const u8 *dispatcherNoCheck;
|
||||
const u8 *dispatcherPcInEAX;
|
||||
const u8 *computeRc;
|
||||
const u8 *computeRcFp;
|
||||
|
||||
const u8 *fifoDirectWrite8;
|
||||
const u8 *fifoDirectWrite16;
|
||||
const u8 *fifoDirectWrite32;
|
||||
const u8 *fifoDirectWriteFloat;
|
||||
const u8 *fifoDirectWriteXmm64;
|
||||
static int temp32;
|
||||
|
||||
bool compareEnabled = false;
|
||||
|
||||
|
@ -72,16 +57,15 @@ static bool enableStatistics = false;
|
|||
//RBX - Base pointer of memory
|
||||
//R15 - Pointer to array of block pointers
|
||||
|
||||
AsmRoutineManager asm_routines;
|
||||
|
||||
// PLAN: no more block numbers - crazy opcodes just contain offset within
|
||||
// dynarec buffer
|
||||
// At this offset - 4, there is an int specifying the block number.
|
||||
|
||||
|
||||
void GenerateCommon();
|
||||
|
||||
#ifdef _M_IX86
|
||||
void Generate()
|
||||
void AsmRoutineManager::Generate()
|
||||
{
|
||||
enterCode = AlignCode16();
|
||||
PUSH(EBP);
|
||||
|
@ -129,7 +113,6 @@ void Generate()
|
|||
ADD(32, M(&PowerPC::ppcState.DebugCount), Imm8(1));
|
||||
}
|
||||
//grab from list and jump to it
|
||||
//INT3();
|
||||
MOV(32, R(EDX), ImmPtr(jit.GetCodePointers()));
|
||||
JMPptr(MComplex(EDX, EAX, 4, 0));
|
||||
SetJumpTarget(notfound);
|
||||
|
@ -180,12 +163,14 @@ void Generate()
|
|||
|
||||
#elif defined(_M_X64)
|
||||
|
||||
void Generate()
|
||||
void AsmRoutineManager::Generate()
|
||||
{
|
||||
enterCode = AlignCode16();
|
||||
|
||||
ABI_PushAllCalleeSavedRegsAndAdjustStack();
|
||||
|
||||
if (!jit.GetCodePointers() || !Memory::base)
|
||||
PanicAlert("Memory::base and jit.GetCodePointers() must return valid values");
|
||||
MOV(64, R(RBX), Imm64((u64)Memory::base));
|
||||
MOV(64, R(R15), Imm64((u64)jit.GetCodePointers())); //It's below 2GB so 32 bits are good enough
|
||||
const u8 *outerLoop = GetCodePtr();
|
||||
|
@ -264,7 +249,7 @@ void Generate()
|
|||
}
|
||||
#endif
|
||||
|
||||
void GenFifoWrite(int size)
|
||||
void AsmRoutineManager::GenFifoWrite(int size)
|
||||
{
|
||||
// Assume value in ABI_PARAM1
|
||||
PUSH(ESI);
|
||||
|
@ -287,8 +272,7 @@ void GenFifoWrite(int size)
|
|||
RET();
|
||||
}
|
||||
|
||||
static int temp32;
|
||||
void GenFifoFloatWrite()
|
||||
void AsmRoutineManager::GenFifoFloatWrite()
|
||||
{
|
||||
// Assume value in XMM0
|
||||
PUSH(ESI);
|
||||
|
@ -306,7 +290,7 @@ void GenFifoFloatWrite()
|
|||
RET();
|
||||
}
|
||||
|
||||
void GenFifoXmm64Write()
|
||||
void AsmRoutineManager::GenFifoXmm64Write()
|
||||
{
|
||||
// Assume value in XMM0. Assume pre-byteswapped (unlike the others here!)
|
||||
PUSH(ESI);
|
||||
|
@ -319,7 +303,7 @@ void GenFifoXmm64Write()
|
|||
RET();
|
||||
}
|
||||
|
||||
void GenerateCommon()
|
||||
void AsmRoutineManager::GenerateCommon()
|
||||
{
|
||||
// USES_CR
|
||||
computeRc = AlignCode16();
|
||||
|
@ -364,5 +348,3 @@ void GenerateCommon()
|
|||
SetJumpTarget(skip_fast_write);
|
||||
CALL((void *)&Memory::Write_U8);*/
|
||||
}
|
||||
|
||||
} // namespace Asm
|
||||
|
|
|
@ -14,33 +14,71 @@
|
|||
|
||||
// Official SVN repository and contact information can be found at
|
||||
// http://code.google.com/p/dolphin-emu/
|
||||
|
||||
#ifndef _JITASM_H
|
||||
#define _JITASM_H
|
||||
|
||||
namespace Asm
|
||||
#include "x64Emitter.h"
|
||||
|
||||
// In Dolphin, we don't use inline assembly. Instead, we generate all machine-near
|
||||
// code at runtime. In the case of fixed code like this, after writing it, we write
|
||||
// protect the memory, essentially making it work just like precompiled code.
|
||||
|
||||
// There are some advantages to this approach:
|
||||
// 1) No need to set up an external assembler in the build.
|
||||
// 2) Cross platform, as long as it's x86/x64.
|
||||
// 3) Can optimize code at runtime for the specific CPU model.
|
||||
// There aren't really any disadvantages other than having to maintain an x86 emitter,
|
||||
// which we have to do anyway :)
|
||||
//
|
||||
// To add a new asm routine, just add another const here, and add the code to Generate.
|
||||
// Also, possibly increase the size of the code buffer.
|
||||
|
||||
class AsmRoutineManager : public Gen::XCodeBlock
|
||||
{
|
||||
extern const u8 *enterCode;
|
||||
|
||||
extern const u8 *dispatcher;
|
||||
extern const u8 *dispatcherNoCheck;
|
||||
extern const u8 *dispatcherPcInEAX;
|
||||
|
||||
extern const u8 *fpException;
|
||||
extern const u8 *computeRc;
|
||||
extern const u8 *computeRcFp;
|
||||
extern const u8 *testExceptions;
|
||||
extern const u8 *dispatchPcInEAX;
|
||||
extern const u8 *doTiming;
|
||||
|
||||
extern const u8 *fifoDirectWrite8;
|
||||
extern const u8 *fifoDirectWrite16;
|
||||
extern const u8 *fifoDirectWrite32;
|
||||
extern const u8 *fifoDirectWriteFloat;
|
||||
extern const u8 *fifoDirectWriteXmm64;
|
||||
|
||||
extern bool compareEnabled;
|
||||
private:
|
||||
void Generate();
|
||||
}
|
||||
void GenerateCommon();
|
||||
void GenFifoWrite(int size);
|
||||
void GenFifoFloatWrite();
|
||||
void GenFifoXmm64Write();
|
||||
|
||||
public:
|
||||
void Init() {
|
||||
AllocCodeSpace(8192);
|
||||
Generate();
|
||||
WriteProtect();
|
||||
}
|
||||
|
||||
void Shutdown() {
|
||||
FreeCodeSpace();
|
||||
}
|
||||
|
||||
|
||||
// Public generated functions. Just CALL(M((void*)func)) them.
|
||||
|
||||
const u8 *enterCode;
|
||||
|
||||
const u8 *dispatcher;
|
||||
const u8 *dispatcherNoCheck;
|
||||
const u8 *dispatcherPcInEAX;
|
||||
|
||||
const u8 *fpException;
|
||||
const u8 *computeRc;
|
||||
const u8 *computeRcFp;
|
||||
const u8 *testExceptions;
|
||||
const u8 *dispatchPcInEAX;
|
||||
const u8 *doTiming;
|
||||
|
||||
const u8 *fifoDirectWrite8;
|
||||
const u8 *fifoDirectWrite16;
|
||||
const u8 *fifoDirectWrite32;
|
||||
const u8 *fifoDirectWriteFloat;
|
||||
const u8 *fifoDirectWriteXmm64;
|
||||
|
||||
bool compareEnabled;
|
||||
};
|
||||
|
||||
extern AsmRoutineManager asm_routines;
|
||||
|
||||
#endif
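The comment block above doubles as the extension recipe: declare another pointer member, emit its body inside Generate(), and let callers CALL it through asm_routines. A hedged sketch of the three touch points, using only emitter calls that already appear in this diff (myNewRoutine and its body are invented for illustration):

// JitAsm.h -- add the pointer next to the existing ones:
//     const u8 *myNewRoutine;

// JitAsm.cpp, inside AsmRoutineManager::Generate():
myNewRoutine = AlignCode16();
ADD(32, M(&PowerPC::ppcState.DebugCount), Imm8(1));   // whatever the routine should do
RET();

// Call sites then emit, exactly as the header's comment says:
//     CALL((void *)asm_routines.myNewRoutine);
// and Init()'s AllocCodeSpace(8192) may need to grow if the buffer gets tight.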
|
||||
|
||||
|
|
|
@ -33,7 +33,7 @@
|
|||
using namespace Gen;
|
||||
|
||||
extern u8 *trampolineCodePtr;
|
||||
|
||||
|
||||
void BackPatchError(const std::string &text, u8 *codePtr, u32 emAddress) {
|
||||
u64 code_addr = (u64)codePtr;
|
||||
disassembler disasm;
|
||||
|
@ -51,17 +51,105 @@ void BackPatchError(const std::string &text, u8 *codePtr, u32 emAddress) {
|
|||
return;
|
||||
}
|
||||
|
||||
|
||||
void TrampolineCache::Init()
|
||||
{
|
||||
AllocCodeSpace(1024 * 1024);
|
||||
}
|
||||
|
||||
void TrampolineCache::Shutdown()
|
||||
{
|
||||
FreeCodeSpace();
|
||||
}
|
||||
|
||||
// Extremely simplistic - just generate the requested trampoline. May reuse them in the future.
|
||||
const u8 *TrampolineCache::GetReadTrampoline(const InstructionInfo &info)
|
||||
{
|
||||
if (GetSpaceLeft() < 1024)
|
||||
PanicAlert("Trampoline cache full");
|
||||
|
||||
X64Reg addrReg = (X64Reg)info.scaledReg;
|
||||
X64Reg dataReg = (X64Reg)info.regOperandReg;
|
||||
const u8 *trampoline = GetCodePtr();
|
||||
#ifdef _M_X64
|
||||
// It's a read. Easy.
|
||||
ABI_PushAllCallerSavedRegsAndAdjustStack();
|
||||
if (addrReg != ABI_PARAM1)
|
||||
MOV(32, R(ABI_PARAM1), R((X64Reg)addrReg));
|
||||
if (info.displacement) {
|
||||
ADD(32, R(ABI_PARAM1), Imm32(info.displacement));
|
||||
}
|
||||
switch (info.operandSize) {
|
||||
case 4:
|
||||
CALL(thunks.ProtectFunction((void *)&Memory::Read_U32, 1));
|
||||
break;
|
||||
}
|
||||
ABI_PopAllCallerSavedRegsAndAdjustStack();
|
||||
MOV(32, R(dataReg), R(EAX));
|
||||
RET();
|
||||
#endif
|
||||
return trampoline;
|
||||
}
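The "may reuse them in the future" note suggests an obvious follow-up: since a trampoline only depends on a handful of InstructionInfo fields, the cache could be keyed on those and consulted before generating. This is not in the commit; a hypothetical sketch of the key:

#include <map>

// Hypothetical: the only inputs GetReadTrampoline actually consumes.
struct TrampolineKey
{
    int addrReg, dataReg, displacement, operandSize;
    bool operator<(const TrampolineKey &other) const
    {
        if (addrReg != other.addrReg) return addrReg < other.addrReg;
        if (dataReg != other.dataReg) return dataReg < other.dataReg;
        if (displacement != other.displacement) return displacement < other.displacement;
        return operandSize < other.operandSize;
    }
};

// Sketch of the lookup wrapped around the existing generation code:
//     static std::map<TrampolineKey, const u8 *> cache;
//     std::map<TrampolineKey, const u8 *>::iterator it = cache.find(key);
//     if (it != cache.end()) return it->second;
//     ... generate as above ...
//     cache[key] = trampoline;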
|
||||
|
||||
// Extremely simplistic - just generate the requested trampoline. May reuse them in the future.
|
||||
const u8 *TrampolineCache::GetWriteTrampoline(const InstructionInfo &info)
|
||||
{
|
||||
if (GetSpaceLeft() < 1024)
|
||||
PanicAlert("Trampoline cache full");
|
||||
|
||||
X64Reg addrReg = (X64Reg)info.scaledReg;
|
||||
X64Reg dataReg = (X64Reg)info.regOperandReg;
|
||||
if (dataReg != EAX)
|
||||
PanicAlert("Backpatch write - not through EAX");
|
||||
|
||||
const u8 *trampoline = GetCodePtr();
|
||||
|
||||
#ifdef _M_X64
|
||||
|
||||
// It's a write. Yay. Remember that we don't have to be super efficient since it's "just" a
|
||||
// hardware access - we can take shortcuts.
|
||||
//if (emAddress == 0xCC008000)
|
||||
// PanicAlert("caught a fifo write");
|
||||
CMP(32, R(addrReg), Imm32(0xCC008000));
|
||||
FixupBranch skip_fast = J_CC(CC_NE, false);
|
||||
MOV(32, R(ABI_PARAM1), R((X64Reg)dataReg));
|
||||
CALL((void*)asm_routines.fifoDirectWrite32);
|
||||
RET();
|
||||
SetJumpTarget(skip_fast);
|
||||
ABI_PushAllCallerSavedRegsAndAdjustStack();
|
||||
if (addrReg != ABI_PARAM1) {
|
||||
MOV(32, R(ABI_PARAM1), R((X64Reg)dataReg));
|
||||
MOV(32, R(ABI_PARAM2), R((X64Reg)addrReg));
|
||||
} else {
|
||||
MOV(32, R(ABI_PARAM2), R((X64Reg)addrReg));
|
||||
MOV(32, R(ABI_PARAM1), R((X64Reg)dataReg));
|
||||
}
|
||||
if (info.displacement) {
|
||||
ADD(32, R(ABI_PARAM2), Imm32(info.displacement));
|
||||
}
|
||||
switch (info.operandSize) {
|
||||
case 4:
|
||||
CALL(thunks.ProtectFunction((void *)&Memory::Write_U32, 2));
|
||||
break;
|
||||
}
|
||||
ABI_PopAllCallerSavedRegsAndAdjustStack();
|
||||
RET();
|
||||
#endif
|
||||
|
||||
return trampoline;
|
||||
}
|
||||
|
||||
|
||||
// This generates some fairly heavy trampolines, but:
|
||||
// 1) It's really necessary. We don't know anything about the context.
|
||||
// 2) It doesn't really hurt. Only instructions that access I/O will get these, and there won't be
|
||||
// that many of them in a typical program/game.
|
||||
u8 *Jit64::BackPatch(u8 *codePtr, int accessType, u32 emAddress, CONTEXT *ctx)
|
||||
const u8 *Jit64::BackPatch(u8 *codePtr, int accessType, u32 emAddress, CONTEXT *ctx)
|
||||
{
|
||||
#ifdef _M_X64
|
||||
if (!IsInJitCode(codePtr))
|
||||
return 0; // this will become a regular crash real soon after this
|
||||
|
||||
u8 *oldCodePtr = GetWritableCodePtr();
|
||||
InstructionInfo info;
|
||||
if (!DisassembleMov(codePtr, info, accessType)) {
|
||||
BackPatchError("BackPatch - failed to disassemble MOV instruction", codePtr, emAddress);
|
||||
|
@ -81,108 +169,42 @@ u8 *Jit64::BackPatch(u8 *codePtr, int accessType, u32 emAddress, CONTEXT *ctx)
|
|||
BackPatchError(StringFromFormat("BackPatch - no support for operand size %i", info.operandSize), codePtr, emAddress);
|
||||
}
|
||||
|
||||
X64Reg addrReg = (X64Reg)info.scaledReg;
|
||||
X64Reg dataReg = (X64Reg)info.regOperandReg;
|
||||
if (info.otherReg != RBX)
|
||||
PanicAlert("BackPatch : Base reg not RBX."
|
||||
"\n\nAttempted to access %08x.", emAddress);
|
||||
//if (accessType == OP_ACCESS_WRITE)
|
||||
// PanicAlert("BackPatch : Currently only supporting reads."
|
||||
// "\n\nAttempted to write to %08x.", emAddress);
|
||||
|
||||
// OK, let's write a trampoline, and a jump to it.
|
||||
// Later, let's share trampolines.
|
||||
|
||||
if (accessType == OP_ACCESS_WRITE)
|
||||
PanicAlert("BackPatch : Currently only supporting reads."
|
||||
"\n\nAttempted to write to %08x.", emAddress);
|
||||
|
||||
// In the first iteration, we assume that all accesses are 32-bit. We also only deal with reads.
|
||||
// Next step - support writes, special case FIFO writes. Also, support 32-bit mode.
|
||||
u8 *trampoline = trampolineCodePtr;
|
||||
SetCodePtr(trampolineCodePtr);
|
||||
|
||||
if (accessType == 0)
|
||||
{
|
||||
// It's a read. Easy.
|
||||
ABI_PushAllCallerSavedRegsAndAdjustStack();
|
||||
if (addrReg != ABI_PARAM1)
|
||||
MOV(32, R(ABI_PARAM1), R((X64Reg)addrReg));
|
||||
if (info.displacement) {
|
||||
ADD(32, R(ABI_PARAM1), Imm32(info.displacement));
|
||||
}
|
||||
switch (info.operandSize) {
|
||||
case 4:
|
||||
CALL(ProtectFunction((void *)&Memory::Read_U32, 1));
|
||||
break;
|
||||
default:
|
||||
BackPatchError(StringFromFormat("We don't handle the size %i yet in backpatch", info.operandSize), codePtr, emAddress);
|
||||
break;
|
||||
}
|
||||
ABI_PopAllCallerSavedRegsAndAdjustStack();
|
||||
MOV(32, R(dataReg), R(EAX));
|
||||
RET();
|
||||
trampolineCodePtr = GetWritableCodePtr();
|
||||
|
||||
SetCodePtr(codePtr);
|
||||
XEmitter emitter(codePtr);
|
||||
int bswapNopCount;
|
||||
// Check the following BSWAP for REX byte
|
||||
if ((GetCodePtr()[info.instructionSize] & 0xF0) == 0x40)
|
||||
if ((codePtr[info.instructionSize] & 0xF0) == 0x40)
|
||||
bswapNopCount = 3;
|
||||
else
|
||||
bswapNopCount = 2;
|
||||
CALL(trampoline);
|
||||
NOP((int)info.instructionSize + bswapNopCount - 5);
|
||||
SetCodePtr(oldCodePtr);
|
||||
|
||||
const u8 *trampoline = trampolines.GetReadTrampoline(info);
|
||||
emitter.CALL((void *)trampoline);
|
||||
emitter.NOP((int)info.instructionSize + bswapNopCount - 5);
|
||||
return codePtr;
|
||||
}
|
||||
else if (accessType == 1)
|
||||
{
|
||||
// It's a write. Yay. Remember that we don't have to be super efficient since it's "just" a
|
||||
// hardware access - we can take shortcuts.
|
||||
//if (emAddress == 0xCC008000)
|
||||
// PanicAlert("caught a fifo write");
|
||||
if (dataReg != EAX)
|
||||
PanicAlert("Backpatch write - not through EAX");
|
||||
CMP(32, R(addrReg), Imm32(0xCC008000));
|
||||
FixupBranch skip_fast = J_CC(CC_NE, false);
|
||||
MOV(32, R(ABI_PARAM1), R((X64Reg)dataReg));
|
||||
CALL((void*)Asm::fifoDirectWrite32);
|
||||
RET();
|
||||
SetJumpTarget(skip_fast);
|
||||
ABI_PushAllCallerSavedRegsAndAdjustStack();
|
||||
if (addrReg != ABI_PARAM1) {
|
||||
//INT3();
|
||||
MOV(32, R(ABI_PARAM1), R((X64Reg)dataReg));
|
||||
MOV(32, R(ABI_PARAM2), R((X64Reg)addrReg));
|
||||
} else {
|
||||
MOV(32, R(ABI_PARAM2), R((X64Reg)addrReg));
|
||||
MOV(32, R(ABI_PARAM1), R((X64Reg)dataReg));
|
||||
}
|
||||
if (info.displacement) {
|
||||
ADD(32, R(ABI_PARAM2), Imm32(info.displacement));
|
||||
}
|
||||
switch (info.operandSize) {
|
||||
case 4:
|
||||
CALL(ProtectFunction((void *)&Memory::Write_U32, 2));
|
||||
break;
|
||||
default:
|
||||
BackPatchError(StringFromFormat("We don't handle the size %i yet in backpatch", info.operandSize), codePtr, emAddress);
|
||||
break;
|
||||
}
|
||||
ABI_PopAllCallerSavedRegsAndAdjustStack();
|
||||
RET();
|
||||
|
||||
trampolineCodePtr = GetWritableCodePtr();
|
||||
|
||||
// TODO: special case FIFO writes. Also, support 32-bit mode.
|
||||
// Also, debug this so that it actually works correctly :P
|
||||
XEmitter emitter(codePtr - 2);
|
||||
// We know it's EAX so the BSWAP before will be two byte. Overwrite it.
|
||||
SetCodePtr(codePtr - 2);
|
||||
CALL(trampoline);
|
||||
NOP((int)info.instructionSize - 3);
|
||||
const u8 *trampoline = trampolines.GetWriteTrampoline(info);
|
||||
emitter.CALL((void *)trampoline);
|
||||
emitter.NOP((int)info.instructionSize - 3);
|
||||
if (info.instructionSize < 3)
|
||||
PanicAlert("instruction too small");
|
||||
SetCodePtr(oldCodePtr);
|
||||
|
||||
// We entered here with a BSWAP-ed EAX. We'll have to swap it back.
|
||||
ctx->Rax = Common::swap32(ctx->Rax);
|
||||
|
||||
return codePtr - 2;
|
||||
}
|
||||
return 0;
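To summarize the read path: the faulting MOV (and the BSWAP that follows it) is overwritten in place with a CALL to the freshly generated trampoline, and NOPs fill whatever is left so no instruction boundary is broken. The arithmetic has to come out exact, since a CALL rel32 is 5 bytes; a small worked example with one plausible instruction size (the concrete sizes come from DisassembleMov at runtime):

#include <cassert>

int main()
{
    const int instructionSize = 7;   // illustrative size of the faulting MOV
    const int bswapNopCount   = 3;   // 3 when the following BSWAP carries a REX prefix, else 2
    const int callSize        = 5;   // CALL rel32

    const int padding = instructionSize + bswapNopCount - callSize;   // the NOP(...) argument
    assert(callSize + padding == instructionSize + bswapNopCount);    // patch exactly covers MOV + BSWAP
    return padding;   // 5 NOPs in this example
}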
|
||||
|
|
|
@ -56,19 +56,15 @@ using namespace Gen;
|
|||
op_agent_t agent;
|
||||
#endif
|
||||
static u8 *codeCache;
|
||||
static u8 *genFunctions;
|
||||
static u8 *trampolineCache;
|
||||
u8 *trampolineCodePtr;
|
||||
#define INVALID_EXIT 0xFFFFFFFF
|
||||
|
||||
enum
|
||||
{
|
||||
//CODE_SIZE = 1024*1024*8,
|
||||
GEN_SIZE = 4096,
|
||||
TRAMPOLINE_SIZE = 1024*1024,
|
||||
//MAX_NUM_BLOCKS = 65536,
|
||||
};
|
||||
int CODE_SIZE = 1024*1024*16;
|
||||
|
||||
int MAX_NUM_BLOCKS = 65536*2;
|
||||
|
||||
static u8 **blockCodePointers;
|
||||
|
@ -89,36 +85,22 @@ using namespace Gen;
|
|||
|
||||
void Jit64::InitCache()
|
||||
{
|
||||
if(Core::g_CoreStartupParameter.bJITUnlimitedCache)
|
||||
if (Core::g_CoreStartupParameter.bJITUnlimitedCache)
|
||||
{
|
||||
CODE_SIZE = 1024*1024*8*8;
|
||||
MAX_NUM_BLOCKS = 65536*8;
|
||||
}
|
||||
|
||||
codeCache = (u8*)AllocateExecutableMemory(CODE_SIZE);
|
||||
genFunctions = (u8*)AllocateExecutableMemory(GEN_SIZE);
|
||||
trampolineCache = (u8*)AllocateExecutableMemory(TRAMPOLINE_SIZE);
|
||||
trampolineCodePtr = trampolineCache;
|
||||
|
||||
#ifdef OPROFILE_REPORT
|
||||
agent = op_open_agent();
|
||||
#endif
|
||||
blocks = new JitBlock[MAX_NUM_BLOCKS];
|
||||
blockCodePointers = new u8*[MAX_NUM_BLOCKS];
|
||||
|
||||
ClearCache();
|
||||
SetCodePtr(genFunctions);
|
||||
Asm::Generate();
|
||||
// Protect the generated functions
|
||||
WriteProtectMemory(genFunctions, GEN_SIZE, true);
|
||||
SetCodePtr(codeCache);
|
||||
}
|
||||
|
||||
void Jit64::ShutdownCache()
|
||||
{
|
||||
UnWriteProtectMemory(genFunctions, GEN_SIZE, true);
|
||||
FreeMemoryPages(codeCache, CODE_SIZE);
|
||||
FreeMemoryPages(genFunctions, GEN_SIZE);
|
||||
FreeMemoryPages(trampolineCache, TRAMPOLINE_SIZE);
|
||||
delete [] blocks;
|
||||
delete [] blockCodePointers;
|
||||
blocks = 0;
|
||||
|
@ -135,21 +117,23 @@ using namespace Gen;
|
|||
{
|
||||
Core::DisplayMessage("Cleared code cache.", 3000);
|
||||
// Is destroying the blocks really necessary?
|
||||
for (int i = 0; i < numBlocks; i++) {
|
||||
for (int i = 0; i < numBlocks; i++)
|
||||
{
|
||||
DestroyBlock(i, false);
|
||||
}
|
||||
links_to.clear();
|
||||
trampolineCodePtr = trampolineCache;
|
||||
numBlocks = 0;
|
||||
memset(blockCodePointers, 0, sizeof(u8*)*MAX_NUM_BLOCKS);
|
||||
memset(codeCache, 0xCC, CODE_SIZE);
|
||||
SetCodePtr(codeCache);
|
||||
|
||||
trampolines.ClearCodeSpace();
|
||||
}
|
||||
|
||||
void Jit64::DestroyBlocksWithFlag(BlockFlag death_flag)
|
||||
{
|
||||
for (int i = 0; i < numBlocks; i++) {
|
||||
if (blocks[i].flags & death_flag) {
|
||||
for (int i = 0; i < numBlocks; i++)
|
||||
{
|
||||
if (blocks[i].flags & death_flag)
|
||||
{
|
||||
DestroyBlock(i, false);
|
||||
}
|
||||
}
|
||||
|
@ -190,10 +174,10 @@ using namespace Gen;
|
|||
|
||||
const u8 *Jit64::Jit(u32 emAddress)
|
||||
{
|
||||
if (GetCodePtr() >= codeCache + CODE_SIZE - 0x10000 || numBlocks >= MAX_NUM_BLOCKS - 1)
|
||||
if (GetSpaceLeft() < 0x10000 || numBlocks >= MAX_NUM_BLOCKS - 1)
|
||||
{
|
||||
LOG(DYNA_REC, "JIT cache full - clearing.")
|
||||
if(Core::g_CoreStartupParameter.bJITUnlimitedCache)
|
||||
if (Core::g_CoreStartupParameter.bJITUnlimitedCache)
|
||||
{
|
||||
PanicAlert("What? JIT cache still full - clearing.");
|
||||
}
|
||||
|
@ -221,10 +205,8 @@ using namespace Gen;
|
|||
}
|
||||
}
|
||||
|
||||
u8 *oldCodePtr = GetWritableCodePtr();
|
||||
LinkBlock(numBlocks);
|
||||
LinkBlockExits(numBlocks);
|
||||
SetCodePtr(oldCodePtr);
|
||||
}
|
||||
|
||||
#ifdef OPROFILE_REPORT
|
||||
|
@ -257,7 +239,7 @@ using namespace Gen;
|
|||
|
||||
void Jit64::EnterFastRun()
|
||||
{
|
||||
CompiledCode pExecAddr = (CompiledCode)Asm::enterCode;
|
||||
CompiledCode pExecAddr = (CompiledCode)asm_routines.enterCode;
|
||||
pExecAddr();
|
||||
//Will return when PowerPC::state changes
|
||||
}
|
||||
|
@ -336,8 +318,8 @@ using namespace Gen;
|
|||
int destinationBlock = GetBlockNumberFromAddress(b.exitAddress[e]);
|
||||
if (destinationBlock != -1)
|
||||
{
|
||||
SetCodePtr(b.exitPtrs[e]);
|
||||
JMP(blocks[destinationBlock].checkedEntry, true);
|
||||
XEmitter emit(b.exitPtrs[e]);
|
||||
emit.JMP(blocks[destinationBlock].checkedEntry, true);
|
||||
b.linkStatus[e] = true;
|
||||
}
|
||||
}
|
||||
|
@ -345,6 +327,7 @@ using namespace Gen;
|
|||
}
|
||||
|
||||
using namespace std;
|
||||
|
||||
void Jit64::LinkBlock(int i)
|
||||
{
|
||||
LinkBlockExits(i);
|
||||
|
@ -386,15 +369,15 @@ using namespace Gen;
|
|||
// Not entirely ideal, but .. pretty good.
|
||||
|
||||
// TODO - make sure that the below stuff really is safe.
|
||||
u8 *prev_code = GetWritableCodePtr();
|
||||
|
||||
// Spurious entrances from previously linked blocks can only come through checkedEntry
|
||||
SetCodePtr((u8*)b.checkedEntry);
|
||||
MOV(32, M(&PC), Imm32(b.originalAddress));
|
||||
JMP(Asm::dispatcher, true);
|
||||
SetCodePtr(blockCodePointers[blocknum]);
|
||||
MOV(32, M(&PC), Imm32(b.originalAddress));
|
||||
JMP(Asm::dispatcher, true);
|
||||
SetCodePtr(prev_code); // reset code pointer
|
||||
XEmitter emit((u8*)b.checkedEntry);
|
||||
emit.MOV(32, M(&PC), Imm32(b.originalAddress));
|
||||
emit.JMP(asm_routines.dispatcher, true);
|
||||
|
||||
emit.SetCodePtr(blockCodePointers[blocknum]);
|
||||
emit.MOV(32, M(&PC), Imm32(b.originalAddress));
|
||||
emit.JMP(asm_routines.dispatcher, true);
|
||||
}
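This is the new idiom the class-based emitter makes possible: rather than saving the global code pointer, repointing it, emitting, and restoring it, a throwaway XEmitter is constructed directly over the bytes to patch, so the Jit's own code cursor is never disturbed. A minimal sketch of the pattern (UnlinkOneExit is an invented helper name; target is any writable pointer into previously emitted code):

void UnlinkOneExit(u8 *target, u32 originalAddress)
{
    Gen::XEmitter emit(target);                   // independent cursor over the patch site
    emit.MOV(32, M(&PC), Imm32(originalAddress));
    emit.JMP(asm_routines.dispatcher, true);
}                                                 // the Jit's GetCodePtr() is left untouched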
|
||||
|
||||
|
||||
|
|
|
@ -19,6 +19,6 @@
|
|||
|
||||
#include "../Gekko.h"
|
||||
|
||||
// Will soon introduced the JitBlockCache class here.
|
||||
// Will soon introduce the JitBlockCache class here.
|
||||
|
||||
#endif
|
||||
|
|
|
@ -34,13 +34,12 @@ namespace JitCore
|
|||
void Init()
|
||||
{
|
||||
jit.Init();
|
||||
jit.InitCache();
|
||||
Asm::compareEnabled = ::Core::g_CoreStartupParameter.bRunCompareClient;
|
||||
asm_routines.compareEnabled = ::Core::g_CoreStartupParameter.bRunCompareClient;
|
||||
}
|
||||
|
||||
void Shutdown()
|
||||
{
|
||||
jit.ShutdownCache();
|
||||
jit.Shutdown();
|
||||
}
|
||||
|
||||
void SingleStep()
|
||||
|
|
|
@ -27,8 +27,6 @@ using namespace Gen;
|
|||
using namespace PowerPC;
|
||||
|
||||
|
||||
GPRRegCache gpr;
|
||||
FPURegCache fpr;
|
||||
|
||||
void RegCache::Start(PPCAnalyst::BlockRegStats &stats)
|
||||
{
|
||||
|
@ -267,7 +265,7 @@ using namespace PowerPC;
|
|||
xregs[xr].dirty = makeDirty || regs[i].location.IsImm();
|
||||
OpArg newloc = ::Gen::R(xr);
|
||||
if (doLoad || regs[i].location.IsImm())
|
||||
MOV(32, newloc, regs[i].location);
|
||||
emit->MOV(32, newloc, regs[i].location);
|
||||
for (int j = 0; j < 32; j++)
|
||||
{
|
||||
if (i != j && regs[j].location.IsSimpleReg() && regs[j].location.GetSimpleReg() == xr)
|
||||
|
@ -309,7 +307,7 @@ using namespace PowerPC;
|
|||
}
|
||||
OpArg newLoc = GetDefaultLocation(i);
|
||||
// if (doStore) //<-- Breaks JIT compilation
|
||||
MOV(32, newLoc, regs[i].location);
|
||||
emit->MOV(32, newLoc, regs[i].location);
|
||||
regs[i].location = newLoc;
|
||||
regs[i].away = false;
|
||||
}
|
||||
|
@ -327,11 +325,13 @@ using namespace PowerPC;
|
|||
xregs[xr].free = false;
|
||||
xregs[xr].dirty = makeDirty;
|
||||
OpArg newloc = ::Gen::R(xr);
|
||||
if (doLoad) {
|
||||
if (!regs[i].location.IsImm() && (regs[i].location.offset & 0xF)) {
|
||||
if (doLoad)
|
||||
{
|
||||
if (!regs[i].location.IsImm() && (regs[i].location.offset & 0xF))
|
||||
{
|
||||
PanicAlert("WARNING - misaligned fp register location %i", i);
|
||||
}
|
||||
MOVAPD(xr, regs[i].location);
|
||||
emit->MOVAPD(xr, regs[i].location);
|
||||
}
|
||||
regs[i].location = newloc;
|
||||
regs[i].away = true;
|
||||
|
@ -352,7 +352,7 @@ using namespace PowerPC;
|
|||
xregs[xr].dirty = false;
|
||||
xregs[xr].ppcReg = -1;
|
||||
OpArg newLoc = GetDefaultLocation(i);
|
||||
MOVAPD(newLoc, xr);
|
||||
emit->MOVAPD(newLoc, xr);
|
||||
regs[i].location = newLoc;
|
||||
regs[i].away = false;
|
||||
}
|
||||
|
|
|
@ -72,10 +72,15 @@
|
|||
|
||||
void DiscardRegContentsIfCached(int preg);
|
||||
virtual const int *GetAllocationOrder(int &count) = 0;
|
||||
|
||||
XEmitter *emit;
|
||||
|
||||
public:
|
||||
virtual ~RegCache() {}
|
||||
virtual void Start(PPCAnalyst::BlockRegStats &stats) = 0;
|
||||
|
||||
void SetEmitter(XEmitter *emitter) {emit = emitter;}
|
||||
|
||||
void FlushR(X64Reg reg);
|
||||
void FlushR(X64Reg reg, X64Reg reg2) {FlushR(reg); FlushR(reg2);}
|
||||
void FlushLockX(X64Reg reg) {
|
||||
|
@ -142,8 +147,5 @@
|
|||
OpArg GetDefaultLocation(int reg) const;
|
||||
};
|
||||
|
||||
extern GPRRegCache gpr;
|
||||
extern FPURegCache fpr;
|
||||
|
||||
#endif
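With the global gpr/fpr instances and the bare MOV/MOVAPD wrappers gone, each register cache now emits through a stored XEmitter*, so the Jit has to hand itself over once during setup. A hedged sketch of that wiring (the exact place this commit does it is not shown in these hunks; it assumes the Jit object is itself usable as an XEmitter):

// Somewhere in Jit64's initialization:
gpr.SetEmitter(this);   // the caches then emit via emit->MOV(...), emit->MOVAPD(...)
fpr.SetEmitter(this);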
|
||||
|
||||
|
|
|
@ -33,39 +33,39 @@
|
|||
const u64 GC_ALIGNED16(psAbsMask2[2]) = {0x7FFFFFFFFFFFFFFFULL, 0x7FFFFFFFFFFFFFFFULL};
|
||||
const double GC_ALIGNED16(psOneOne2[2]) = {1.0, 1.0};
|
||||
|
||||
void Jit64::fp_tri_op(int d, int a, int b, bool reversible, bool dupe, void (*op)(Gen::X64Reg, Gen::OpArg))
|
||||
void Jit64::fp_tri_op(int d, int a, int b, bool reversible, bool dupe, void (XEmitter::*op)(Gen::X64Reg, Gen::OpArg))
|
||||
{
|
||||
fpr.Lock(d, a, b);
|
||||
if (d == a)
|
||||
{
|
||||
fpr.LoadToX64(d, true);
|
||||
op(fpr.RX(d), fpr.R(b));
|
||||
(this->*op)(fpr.RX(d), fpr.R(b));
|
||||
}
|
||||
else if (d == b && reversible)
|
||||
{
|
||||
fpr.LoadToX64(d, true);
|
||||
op(fpr.RX(d), fpr.R(a));
|
||||
(this->*op)(fpr.RX(d), fpr.R(a));
|
||||
}
|
||||
else if (a != d && b != d)
|
||||
{
|
||||
// Sources different from d, can use rather quick solution
|
||||
fpr.LoadToX64(d, !dupe);
|
||||
MOVSD(fpr.RX(d), fpr.R(a));
|
||||
op(fpr.RX(d), fpr.R(b));
|
||||
(this->*op)(fpr.RX(d), fpr.R(b));
|
||||
}
|
||||
else if (b != d)
|
||||
{
|
||||
fpr.LoadToX64(d, !dupe);
|
||||
MOVSD(XMM0, fpr.R(b));
|
||||
MOVSD(fpr.RX(d), fpr.R(a));
|
||||
op(fpr.RX(d), Gen::R(XMM0));
|
||||
(this->*op)(fpr.RX(d), Gen::R(XMM0));
|
||||
}
|
||||
else // Other combo, must use two temps :(
|
||||
{
|
||||
MOVSD(XMM0, fpr.R(a));
|
||||
MOVSD(XMM1, fpr.R(b));
|
||||
fpr.LoadToX64(d, !dupe);
|
||||
op(XMM0, Gen::R(XMM1));
|
||||
(this->*op)(XMM0, Gen::R(XMM1));
|
||||
MOVSD(fpr.RX(d), Gen::R(XMM0));
|
||||
}
|
||||
if (dupe) {
|
||||
|
@ -86,16 +86,16 @@
|
|||
bool dupe = inst.OPCD == 59;
|
||||
switch (inst.SUBOP5)
|
||||
{
|
||||
case 18: fp_tri_op(inst.FD, inst.FA, inst.FB, false, dupe, &DIVSD); break; //div
|
||||
case 20: fp_tri_op(inst.FD, inst.FA, inst.FB, false, dupe, &SUBSD); break; //sub
|
||||
case 21: fp_tri_op(inst.FD, inst.FA, inst.FB, true, dupe, &ADDSD); break; //add
|
||||
case 18: fp_tri_op(inst.FD, inst.FA, inst.FB, false, dupe, &XEmitter::DIVSD); break; //div
|
||||
case 20: fp_tri_op(inst.FD, inst.FA, inst.FB, false, dupe, &XEmitter::SUBSD); break; //sub
|
||||
case 21: fp_tri_op(inst.FD, inst.FA, inst.FB, true, dupe, &XEmitter::ADDSD); break; //add
|
||||
case 23: //sel
|
||||
Default(inst);
|
||||
break;
|
||||
case 24: //res
|
||||
Default(inst);
|
||||
break;
|
||||
case 25: fp_tri_op(inst.FD, inst.FA, inst.FC, true, dupe, &MULSD); break; //mul
|
||||
case 25: fp_tri_op(inst.FD, inst.FA, inst.FC, true, dupe, &XEmitter::MULSD); break; //mul
|
||||
default:
|
||||
_assert_msg_(DYNA_REC, 0, "fp_arith_s WTF!!!");
|
||||
}
|
||||
|
|
|
@ -42,7 +42,7 @@
|
|||
u32 And(u32 a, u32 b) {return a & b;}
|
||||
u32 Xor(u32 a, u32 b) {return a ^ b;}
|
||||
|
||||
void Jit64::regimmop(int d, int a, bool binary, u32 value, Operation doop, void(*op)(int, const Gen::OpArg&, const Gen::OpArg&), bool Rc, bool carry)
|
||||
void Jit64::regimmop(int d, int a, bool binary, u32 value, Operation doop, void (XEmitter::*op)(int, const Gen::OpArg&, const Gen::OpArg&), bool Rc, bool carry)
|
||||
{
|
||||
gpr.Lock(d, a);
|
||||
if (a || binary || carry) // yeh nasty special case addic
|
||||
|
@ -57,7 +57,7 @@
|
|||
{
|
||||
if (gpr.R(d).IsImm())
|
||||
gpr.LoadToX64(d, false);
|
||||
op(32, gpr.R(d), Imm32(value)); //m_GPR[d] = m_GPR[_inst.RA] + _inst.SIMM_16;
|
||||
(this->*op)(32, gpr.R(d), Imm32(value)); //m_GPR[d] = m_GPR[_inst.RA] + _inst.SIMM_16;
|
||||
if (carry)
|
||||
GenerateCarry(EAX);
|
||||
}
|
||||
|
@ -66,7 +66,7 @@
|
|||
{
|
||||
gpr.LoadToX64(d, false);
|
||||
MOV(32, gpr.R(d), gpr.R(a));
|
||||
op(32, gpr.R(d), Imm32(value)); //m_GPR[d] = m_GPR[_inst.RA] + _inst.SIMM_16;
|
||||
(this->*op)(32, gpr.R(d), Imm32(value)); //m_GPR[d] = m_GPR[_inst.RA] + _inst.SIMM_16;
|
||||
if (carry)
|
||||
GenerateCarry(EAX);
|
||||
}
|
||||
|
@ -84,7 +84,7 @@
|
|||
{
|
||||
// Todo - special case immediates.
|
||||
MOV(32, R(EAX), gpr.R(d));
|
||||
CALL((u8*)Asm::computeRc);
|
||||
CALL((u8*)asm_routines.computeRc);
|
||||
}
|
||||
gpr.UnlockAll();
|
||||
}
|
||||
|
@ -109,22 +109,22 @@
|
|||
MOV(32, gpr.R(d), gpr.R(a));
|
||||
gpr.UnlockAll();
|
||||
} else {
|
||||
regimmop(d, a, false, (u32)(s32)inst.SIMM_16, Add, ADD); //addi
|
||||
regimmop(d, a, false, (u32)(s32)inst.SIMM_16, Add, &XEmitter::ADD); //addi
|
||||
}
|
||||
break;
|
||||
case 15: regimmop(d, a, false, (u32)inst.SIMM_16 << 16, Add, ADD); break; //addis
|
||||
case 15: regimmop(d, a, false, (u32)inst.SIMM_16 << 16, Add, &XEmitter::ADD); break; //addis
|
||||
case 24:
|
||||
if (a == 0 && s == 0 && inst.UIMM == 0 && !inst.Rc) //check for nop
|
||||
{NOP(); return;} //make the nop visible in the generated code. not much use but interesting if we see one.
|
||||
regimmop(a, s, true, inst.UIMM, Or, OR);
|
||||
regimmop(a, s, true, inst.UIMM, Or, &XEmitter::OR);
|
||||
break; //ori
|
||||
case 25: regimmop(a, s, true, inst.UIMM << 16, Or, OR, false); break;//oris
|
||||
case 28: regimmop(a, s, true, inst.UIMM, And, AND, true); break;
|
||||
case 29: regimmop(a, s, true, inst.UIMM << 16, And, AND, true); break;
|
||||
case 26: regimmop(a, s, true, inst.UIMM, Xor, XOR, false); break; //xori
|
||||
case 27: regimmop(a, s, true, inst.UIMM << 16, Xor, XOR, false); break; //xoris
|
||||
case 12: //regimmop(d, a, false, (u32)(s32)inst.SIMM_16, Add, ADD, false, true); //addic
|
||||
case 13: //regimmop(d, a, true, (u32)(s32)inst.SIMM_16, Add, ADD, true, true); //addic_rc
|
||||
case 25: regimmop(a, s, true, inst.UIMM << 16, Or, &XEmitter::OR, false); break;//oris
|
||||
case 28: regimmop(a, s, true, inst.UIMM, And, &XEmitter::AND, true); break;
|
||||
case 29: regimmop(a, s, true, inst.UIMM << 16, And, &XEmitter::AND, true); break;
|
||||
case 26: regimmop(a, s, true, inst.UIMM, Xor, &XEmitter::XOR, false); break; //xori
|
||||
case 27: regimmop(a, s, true, inst.UIMM << 16, Xor, &XEmitter::XOR, false); break; //xoris
|
||||
case 12: //regimmop(d, a, false, (u32)(s32)inst.SIMM_16, Add, XEmitter::ADD, false, true); //addic
|
||||
case 13: //regimmop(d, a, true, (u32)(s32)inst.SIMM_16, Add, XEmitter::ADD, true, true); //addic_rc
|
||||
default:
|
||||
Default(inst);
|
||||
break;
|
||||
|
@ -295,7 +295,7 @@
|
|||
if (inst.Rc)
|
||||
{
|
||||
MOV(32, R(EAX), gpr.R(a));
|
||||
CALL((u8*)Asm::computeRc);
|
||||
CALL((u8*)asm_routines.computeRc);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -328,7 +328,7 @@
|
|||
if (inst.Rc)
|
||||
{
|
||||
MOV(32, R(EAX), gpr.R(a));
|
||||
CALL((u8*)Asm::computeRc);
|
||||
CALL((u8*)asm_routines.computeRc);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -353,7 +353,7 @@
|
|||
|
||||
if (inst.Rc) {
|
||||
// result is already in eax
|
||||
CALL((u8*)Asm::computeRc);
|
||||
CALL((u8*)asm_routines.computeRc);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -374,7 +374,7 @@
|
|||
MOVSX(32, 8, gpr.RX(a), R(AL)); // watch out for ah and friends
|
||||
if (inst.Rc) {
|
||||
MOV(32, R(EAX), gpr.R(a));
|
||||
CALL((u8*)Asm::computeRc);
|
||||
CALL((u8*)asm_routines.computeRc);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -394,7 +394,7 @@
|
|||
MOVSX(32, 16, gpr.RX(a), gpr.R(s));
|
||||
if (inst.Rc) {
|
||||
MOV(32, R(EAX), gpr.R(a));
|
||||
CALL((u8*)Asm::computeRc);
|
||||
CALL((u8*)asm_routines.computeRc);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -474,7 +474,7 @@
|
|||
if (inst.OE) PanicAlert("OE: subfx");
|
||||
if (inst.Rc) {
|
||||
// result is already in eax
|
||||
CALL((u8*)Asm::computeRc);
|
||||
CALL((u8*)asm_routines.computeRc);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -514,7 +514,7 @@
|
|||
gpr.UnlockAll();
|
||||
if (inst.Rc) {
|
||||
MOV(32, R(EAX), gpr.R(d));
|
||||
CALL((u8*)Asm::computeRc);
|
||||
CALL((u8*)asm_routines.computeRc);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -544,7 +544,7 @@
|
|||
MOV(32, R(EAX), R(EDX));
|
||||
MOV(32, gpr.R(d), R(EDX));
|
||||
// result is already in eax
|
||||
CALL((u8*)Asm::computeRc);
|
||||
CALL((u8*)asm_routines.computeRc);
|
||||
} else {
|
||||
MOV(32, gpr.R(d), R(EDX));
|
||||
}
|
||||
|
@ -570,7 +570,7 @@
|
|||
gpr.UnlockAll();
|
||||
gpr.UnlockAllX();
|
||||
if (inst.Rc) {
|
||||
CALL((u8*)Asm::computeRc);
|
||||
CALL((u8*)asm_routines.computeRc);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -606,7 +606,7 @@
|
|||
if (inst.Rc)
|
||||
{
|
||||
MOV(32, R(EAX), gpr.R(d));
|
||||
CALL((u8*)Asm::computeRc);
|
||||
CALL((u8*)asm_routines.computeRc);
|
||||
}
|
||||
gpr.UnlockAll();
|
||||
}
|
||||
|
@ -618,7 +618,7 @@
|
|||
if (inst.Rc)
|
||||
{
|
||||
MOV(32, R(EAX), gpr.R(d));
|
||||
CALL((u8*)Asm::computeRc);
|
||||
CALL((u8*)asm_routines.computeRc);
|
||||
}
|
||||
gpr.UnlockAll();
|
||||
}
|
||||
|
@ -630,7 +630,7 @@
|
|||
if (inst.Rc)
|
||||
{
|
||||
MOV(32, R(EAX), gpr.R(d));
|
||||
CALL((u8*)Asm::computeRc);
|
||||
CALL((u8*)asm_routines.computeRc);
|
||||
}
|
||||
gpr.UnlockAll();
|
||||
}
|
||||
|
@ -666,7 +666,7 @@
|
|||
gpr.UnlockAllX();
|
||||
if (inst.Rc)
|
||||
{
|
||||
CALL((u8*)Asm::computeRc);
|
||||
CALL((u8*)asm_routines.computeRc);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -730,7 +730,7 @@
|
|||
if (inst.Rc)
|
||||
{
|
||||
MOV(32, R(EAX), gpr.R(a));
|
||||
CALL((u8*)Asm::computeRc);
|
||||
CALL((u8*)asm_routines.computeRc);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -767,7 +767,7 @@
|
|||
if (inst.Rc)
|
||||
{
|
||||
MOV(32, R(EAX), gpr.R(a));
|
||||
CALL((u8*)Asm::computeRc);
|
||||
CALL((u8*)asm_routines.computeRc);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -799,7 +799,7 @@
|
|||
if (inst.Rc)
|
||||
{
|
||||
MOV(32, R(EAX), gpr.R(a));
|
||||
CALL((u8*)Asm::computeRc);
|
||||
CALL((u8*)asm_routines.computeRc);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -821,7 +821,7 @@
|
|||
if (inst.Rc)
|
||||
{
|
||||
MOV(32, R(EAX), gpr.R(a));
|
||||
CALL((u8*)Asm::computeRc);
|
||||
CALL((u8*)asm_routines.computeRc);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -851,7 +851,7 @@
|
|||
if (inst.Rc)
|
||||
{
|
||||
MOV(32, R(EAX), gpr.R(a));
|
||||
CALL((u8*)Asm::computeRc);
|
||||
CALL((u8*)asm_routines.computeRc);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -881,7 +881,7 @@
|
|||
if (inst.Rc)
|
||||
{
|
||||
MOV(32, R(EAX), gpr.R(a));
|
||||
CALL((u8*)Asm::computeRc);
|
||||
CALL((u8*)asm_routines.computeRc);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -929,7 +929,7 @@
|
|||
|
||||
if (inst.Rc) {
|
||||
MOV(32, R(EAX), gpr.R(a));
|
||||
CALL((u8*)Asm::computeRc);
|
||||
CALL((u8*)asm_routines.computeRc);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -975,7 +975,7 @@
|
|||
|
||||
if (inst.Rc) {
|
||||
MOV(32, R(EAX), gpr.R(a));
|
||||
CALL((u8*)Asm::computeRc);
|
||||
CALL((u8*)asm_routines.computeRc);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -1006,7 +1006,7 @@
|
|||
if (inst.Rc)
|
||||
{
|
||||
MOV(32, R(EAX), gpr.R(a));
|
||||
CALL((u8*)Asm::computeRc);
|
||||
CALL((u8*)asm_routines.computeRc);
|
||||
// TODO: Check PPC manual too
|
||||
}
|
||||
}
|
||||
|
|
|
@ -144,7 +144,7 @@
|
|||
fpr.Flush(FLUSH_ALL);
|
||||
ABI_CallFunctionC((void *)&PowerPC::OnIdle, PowerPC::ppcState.gpr[a] + (s32)(s16)inst.SIMM_16);
|
||||
MOV(32, M(&PowerPC::ppcState.pc), Imm32(js.compilerPC + 12));
|
||||
JMP(Asm::testExceptions, true);
|
||||
JMP(asm_routines.testExceptions, true);
|
||||
js.compilerPC += 8;
|
||||
return;
|
||||
}
|
||||
|
@ -287,14 +287,13 @@
|
|||
gpr.SetImmediate32(a, addr);
|
||||
gpr.FlushLockX(ABI_PARAM1);
|
||||
MOV(32, R(ABI_PARAM1), gpr.R(s));
|
||||
// INT3();
|
||||
switch (accessSize)
|
||||
{
|
||||
// No need to protect these, they don't touch any state
|
||||
// question - should we inline them instead? Pro: Lose a CALL Con: Code bloat
|
||||
case 8: CALL((void *)Asm::fifoDirectWrite8); break;
|
||||
case 16: CALL((void *)Asm::fifoDirectWrite16); break;
|
||||
case 32: CALL((void *)Asm::fifoDirectWrite32); break;
|
||||
case 8: CALL((void *)asm_routines.fifoDirectWrite8); break;
|
||||
case 16: CALL((void *)asm_routines.fifoDirectWrite16); break;
|
||||
case 32: CALL((void *)asm_routines.fifoDirectWrite32); break;
|
||||
}
|
||||
js.fifoBytesThisBlock += accessSize >> 3;
|
||||
gpr.UnlockAllX();
|
||||
|
@ -377,9 +376,9 @@
|
|||
SetJumpTarget(unsafe_addr);
|
||||
switch (accessSize)
|
||||
{
|
||||
case 32: ABI_CallFunctionRR(ProtectFunction((void *)&Memory::Write_U32, 2), ABI_PARAM1, ABI_PARAM2); break;
|
||||
case 16: ABI_CallFunctionRR(ProtectFunction((void *)&Memory::Write_U16, 2), ABI_PARAM1, ABI_PARAM2); break;
|
||||
case 8: ABI_CallFunctionRR(ProtectFunction((void *)&Memory::Write_U8, 2), ABI_PARAM1, ABI_PARAM2); break;
|
||||
case 32: ABI_CallFunctionRR(thunks.ProtectFunction((void *)&Memory::Write_U32, 2), ABI_PARAM1, ABI_PARAM2); break;
|
||||
case 16: ABI_CallFunctionRR(thunks.ProtectFunction((void *)&Memory::Write_U16, 2), ABI_PARAM1, ABI_PARAM2); break;
|
||||
case 8: ABI_CallFunctionRR(thunks.ProtectFunction((void *)&Memory::Write_U8, 2), ABI_PARAM1, ABI_PARAM2); break;
|
||||
}
|
||||
SetJumpTarget(skip_call);
|
||||
gpr.UnlockAll();
|
||||
|
@ -402,7 +401,6 @@
|
|||
//return _inst.RA ? (m_GPR[_inst.RA] + _inst.SIMM_16) : _inst.SIMM_16;
|
||||
gpr.FlushLockX(ECX, EDX);
|
||||
gpr.FlushLockX(ESI);
|
||||
//INT3();
|
||||
MOV(32, R(EAX), Imm32((u32)(s32)inst.SIMM_16));
|
||||
if (inst.RA)
|
||||
ADD(32, R(EAX), gpr.R(inst.RA));
|
||||
|
|
|
@ -242,7 +242,7 @@ void Jit64::stfs(UGeckoInstruction inst)
|
|||
{
|
||||
// Float directly to write gather pipe! Fun!
|
||||
CVTSD2SS(XMM0, fpr.R(s));
|
||||
CALL((void*)Asm::fifoDirectWriteFloat);
|
||||
CALL((void*)asm_routines.fifoDirectWriteFloat);
|
||||
// TODO
|
||||
js.fifoBytesThisBlock += 4;
|
||||
return;
|
||||
|
|
|
@ -161,7 +161,7 @@ void Jit64::psq_st(UGeckoInstruction inst)
|
|||
#endif
|
||||
FixupBranch skip_call = J();
|
||||
SetJumpTarget(argh);
|
||||
ABI_CallFunctionRR(ProtectFunction((void *)&Memory::Write_U32, 2), ABI_PARAM1, ABI_PARAM2);
|
||||
ABI_CallFunctionRR(thunks.ProtectFunction((void *)&Memory::Write_U32, 2), ABI_PARAM1, ABI_PARAM2);
|
||||
SetJumpTarget(skip_call);
|
||||
gpr.UnlockAll();
|
||||
gpr.UnlockAllX();
|
||||
|
@ -184,7 +184,7 @@ void Jit64::psq_st(UGeckoInstruction inst)
|
|||
// Writing to FIFO. Let's do fast method.
|
||||
CVTPD2PS(XMM0, fpr.R(s));
|
||||
PSHUFB(XMM0, M((void*)&pbswapShuffle2x4));
|
||||
CALL((void*)Asm::fifoDirectWriteXmm64);
|
||||
CALL((void*)asm_routines.fifoDirectWriteXmm64);
|
||||
js.fifoBytesThisBlock += 8;
|
||||
return;
|
||||
}
|
||||
|
@ -211,7 +211,7 @@ void Jit64::psq_st(UGeckoInstruction inst)
|
|||
MOV(64, MComplex(RBX, ABI_PARAM2, SCALE_1, 0), R(ABI_PARAM1));
|
||||
FixupBranch arg2 = J();
|
||||
SetJumpTarget(argh);
|
||||
CALL(ProtectFunction((void *)&WriteDual32, 0));
|
||||
CALL(thunks.ProtectFunction((void *)&WriteDual32, 0));
|
||||
#else
|
||||
FixupBranch argh = J_CC(CC_NZ);
|
||||
MOV(32, R(ABI_PARAM1), M(((char*)&temp64) + 4));
|
||||
|
@ -224,10 +224,10 @@ void Jit64::psq_st(UGeckoInstruction inst)
|
|||
FixupBranch arg2 = J();
|
||||
SetJumpTarget(argh);
|
||||
MOV(32, R(ABI_PARAM1), M(((char*)&temp64) + 4));
|
||||
ABI_CallFunctionRR(ProtectFunction((void *)&Memory::Write_U32, 2), ABI_PARAM1, ABI_PARAM2);
|
||||
ABI_CallFunctionRR(thunks.ProtectFunction((void *)&Memory::Write_U32, 2), ABI_PARAM1, ABI_PARAM2);
|
||||
MOV(32, R(ABI_PARAM1), M(((char*)&temp64)));
|
||||
ADD(32, R(ABI_PARAM2), Imm32(4));
|
||||
ABI_CallFunctionRR(ProtectFunction((void *)&Memory::Write_U32, 2), ABI_PARAM1, ABI_PARAM2);
|
||||
ABI_CallFunctionRR(thunks.ProtectFunction((void *)&Memory::Write_U32, 2), ABI_PARAM1, ABI_PARAM2);
|
||||
#endif
|
||||
SetJumpTarget(arg2);
|
||||
gpr.UnlockAll();
|
||||
|
@ -424,7 +424,6 @@ void Jit64::psq_l(UGeckoInstruction inst)
|
|||
#endif
|
||||
BSWAP(32, EAX);
|
||||
MOV(32, M(&temp64), R(EAX));
|
||||
//INT3();
|
||||
fpr.LoadToX64(inst.RS, false, true);
|
||||
X64Reg r = fpr.R(inst.RS).GetSimpleReg();
|
||||
MOVD_xmm(XMM0, M(&temp64));
|
||||
|
|
|
@ -163,40 +163,40 @@
|
|||
*/
|
||||
|
||||
//There's still a little bit more optimization that can be squeezed out of this
|
||||
void Jit64::tri_op(int d, int a, int b, bool reversible, void (*op)(X64Reg, OpArg))
|
||||
void Jit64::tri_op(int d, int a, int b, bool reversible, void (XEmitter::*op)(X64Reg, OpArg))
|
||||
{
|
||||
fpr.Lock(d, a, b);
|
||||
|
||||
if (d == a)
|
||||
{
|
||||
fpr.LoadToX64(d, true);
|
||||
op(fpr.RX(d), fpr.R(b));
|
||||
(this->*op)(fpr.RX(d), fpr.R(b));
|
||||
}
|
||||
else if (d == b && reversible)
|
||||
{
|
||||
fpr.LoadToX64(d, true);
|
||||
op(fpr.RX(d), fpr.R(a));
|
||||
(this->*op)(fpr.RX(d), fpr.R(a));
|
||||
}
|
||||
else if (a != d && b != d)
|
||||
{
|
||||
//sources different from d, can use rather quick solution
|
||||
fpr.LoadToX64(d, false);
|
||||
MOVAPD(fpr.RX(d), fpr.R(a));
|
||||
op(fpr.RX(d), fpr.R(b));
|
||||
(this->*op)(fpr.RX(d), fpr.R(b));
|
||||
}
|
||||
else if (b != d)
|
||||
{
|
||||
fpr.LoadToX64(d, false);
|
||||
MOVAPD(XMM0, fpr.R(b));
|
||||
MOVAPD(fpr.RX(d), fpr.R(a));
|
||||
op(fpr.RX(d), Gen::R(XMM0));
|
||||
(this->*op)(fpr.RX(d), Gen::R(XMM0));
|
||||
}
|
||||
else //Other combo, must use two temps :(
|
||||
{
|
||||
MOVAPD(XMM0, fpr.R(a));
|
||||
MOVAPD(XMM1, fpr.R(b));
|
||||
fpr.LoadToX64(d, false);
|
||||
op(XMM0, Gen::R(XMM1));
|
||||
(this->*op)(XMM0, Gen::R(XMM1));
|
||||
MOVAPD(fpr.RX(d), Gen::R(XMM0));
|
||||
}
|
||||
ForceSinglePrecisionP(fpr.RX(d));
|
||||
|
@ -213,16 +213,16 @@
|
|||
}
|
||||
switch (inst.SUBOP5)
|
||||
{
|
||||
case 18: tri_op(inst.FD, inst.FA, inst.FB, false, &DIVPD); break; //div
|
||||
case 20: tri_op(inst.FD, inst.FA, inst.FB, false, &SUBPD); break; //sub
|
||||
case 21: tri_op(inst.FD, inst.FA, inst.FB, true, &ADDPD); break; //add
|
||||
case 18: tri_op(inst.FD, inst.FA, inst.FB, false, &XEmitter::DIVPD); break; //div
|
||||
case 20: tri_op(inst.FD, inst.FA, inst.FB, false, &XEmitter::SUBPD); break; //sub
|
||||
case 21: tri_op(inst.FD, inst.FA, inst.FB, true, &XEmitter::ADDPD); break; //add
|
||||
case 23://sel
|
||||
Default(inst);
|
||||
break;
|
||||
case 24://res
|
||||
Default(inst);
|
||||
break;
|
||||
case 25: tri_op(inst.FD, inst.FA, inst.FC, true, &MULPD); break; //mul
|
||||
case 25: tri_op(inst.FD, inst.FA, inst.FC, true, &XEmitter::MULPD); break; //mul
|
||||
default:
|
||||
_assert_msg_(DYNA_REC, 0, "ps_arith WTF!!!");
|
||||
}
|
||||
|
|
|
@ -76,9 +76,9 @@ void Jit64::SafeLoadRegToEAX(X64Reg reg, int accessSize, s32 offset, bool signEx
|
|||
FixupBranch argh = J_CC(CC_Z);
|
||||
switch (accessSize)
|
||||
{
|
||||
case 32: ABI_CallFunctionR(ProtectFunction((void *)&Memory::Read_U32, 1), reg); break;
|
||||
case 16: ABI_CallFunctionR(ProtectFunction((void *)&Memory::Read_U16, 1), reg); break;
|
||||
case 8: ABI_CallFunctionR(ProtectFunction((void *)&Memory::Read_U8, 1), reg); break;
|
||||
case 32: ABI_CallFunctionR(thunks.ProtectFunction((void *)&Memory::Read_U32, 1), reg); break;
|
||||
case 16: ABI_CallFunctionR(thunks.ProtectFunction((void *)&Memory::Read_U16, 1), reg); break;
|
||||
case 8: ABI_CallFunctionR(thunks.ProtectFunction((void *)&Memory::Read_U8, 1), reg); break;
|
||||
}
|
||||
if (signExtend && accessSize < 32) {
|
||||
// Need to sign extend values coming from the Read_U* functions.
|
||||
|
@ -114,7 +114,7 @@ void Jit64::SafeWriteRegToReg(X64Reg reg_value, X64Reg reg_addr, int accessSize,
|
|||
UnsafeWriteRegToReg(reg_value, reg_addr, accessSize, 0);
|
||||
FixupBranch skip_call = J();
|
||||
SetJumpTarget(unsafe_addr);
|
||||
ABI_CallFunctionRR(ProtectFunction((void *)&Memory::Write_U32, 2), ABI_PARAM1, ABI_PARAM2);
|
||||
ABI_CallFunctionRR(thunks.ProtectFunction((void *)&Memory::Write_U32, 2), ABI_PARAM1, ABI_PARAM2);
|
||||
SetJumpTarget(skip_call);
|
||||
}
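The pattern in these two helpers is the usual JIT fast-path/slow-path split: try a direct access through the RBX memory base, and only fall back to the thunked Memory::Read_/Write_ call when the address check fails. In plain C++ the emitted write path behaves roughly like this (LooksLikeDirectlyMappedRAM is an invented stand-in for whatever test produced unsafe_addr, and byte-order handling is glossed over):

static void SafeWrite32Behaviour(u32 value, u32 address)
{
    if (LooksLikeDirectlyMappedRAM(address))
        *(u32 *)(Memory::base + address) = value;   // fast path: the MOV through RBX
    else
        Memory::Write_U32(value, address);          // slow path: the protected call above
}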
|
||||
|
||||
|
|
|
@ -463,7 +463,7 @@ void BPWritten(int addr, int changes, int newval)
|
|||
{
|
||||
// the number of lines copied is determined by the y scale * source efb height
|
||||
float yScale = bpmem.dispcopyyscale / 256.0f;
|
||||
float xfbLines = bpmem.copyTexSrcWH.y + 1.0 * yScale;
|
||||
float xfbLines = bpmem.copyTexSrcWH.y + 1.0f * yScale;
|
||||
XFB_Write(Memory_GetPtr(bpmem.copyTexDest<<5), multirc, (bpmem.copyMipMapStrideChannels << 4), (int)xfbLines);
|
||||
}
|
||||
else
|
||||
|
|
|
@ -82,68 +82,68 @@ void NativeVertexFormat::Initialize(const PortableVertexDeclaration &_vtx_decl)
|
|||
}
|
||||
|
||||
#ifdef USE_JIT
|
||||
Gen::XEmitter emit(m_compiledCode);
|
||||
// Alright, we have our vertex declaration. Compile some crazy code to set it quickly using GL.
|
||||
u8 *old_code_ptr = GetWritableCodePtr();
|
||||
SetCodePtr(m_compiledCode);
|
||||
ABI_EmitPrologue(6);
|
||||
emit.ABI_EmitPrologue(6);
|
||||
|
||||
CallCdeclFunction4_I(glVertexPointer, 3, GL_FLOAT, _vtx_decl.stride, 0);
|
||||
emit.CallCdeclFunction4_I(glVertexPointer, 3, GL_FLOAT, _vtx_decl.stride, 0);
|
||||
|
||||
if (_vtx_decl.num_normals >= 1) {
|
||||
CallCdeclFunction3_I(glNormalPointer, VarToGL(_vtx_decl.normal_gl_type), _vtx_decl.stride, _vtx_decl.normal_offset[0]);
|
||||
emit.CallCdeclFunction3_I(glNormalPointer, VarToGL(_vtx_decl.normal_gl_type), _vtx_decl.stride, _vtx_decl.normal_offset[0]);
|
||||
if (_vtx_decl.num_normals == 3) {
|
||||
CallCdeclFunction6((void *)glVertexAttribPointer, SHADER_NORM1_ATTRIB, _vtx_decl.normal_gl_size, VarToGL(_vtx_decl.normal_gl_type), GL_TRUE, _vtx_decl.stride, _vtx_decl.normal_offset[1]);
|
||||
CallCdeclFunction6((void *)glVertexAttribPointer, SHADER_NORM2_ATTRIB, _vtx_decl.normal_gl_size, VarToGL(_vtx_decl.normal_gl_type), GL_TRUE, _vtx_decl.stride, _vtx_decl.normal_offset[2]);
|
||||
emit.CallCdeclFunction6((void *)glVertexAttribPointer, SHADER_NORM1_ATTRIB, _vtx_decl.normal_gl_size, VarToGL(_vtx_decl.normal_gl_type), GL_TRUE, _vtx_decl.stride, _vtx_decl.normal_offset[1]);
|
||||
emit.CallCdeclFunction6((void *)glVertexAttribPointer, SHADER_NORM2_ATTRIB, _vtx_decl.normal_gl_size, VarToGL(_vtx_decl.normal_gl_type), GL_TRUE, _vtx_decl.stride, _vtx_decl.normal_offset[2]);
|
||||
}
|
||||
}
|
||||
|
||||
for (int i = 0; i < 2; i++) {
|
||||
if (_vtx_decl.color_offset[i] != -1) {
|
||||
if (i == 0)
|
||||
CallCdeclFunction4_I(glColorPointer, 4, GL_UNSIGNED_BYTE, _vtx_decl.stride, _vtx_decl.color_offset[i]);
|
||||
emit.CallCdeclFunction4_I(glColorPointer, 4, GL_UNSIGNED_BYTE, _vtx_decl.stride, _vtx_decl.color_offset[i]);
|
||||
else
|
||||
CallCdeclFunction4((void *)glSecondaryColorPointer, 4, GL_UNSIGNED_BYTE, _vtx_decl.stride, _vtx_decl.color_offset[i]);
|
||||
emit.CallCdeclFunction4((void *)glSecondaryColorPointer, 4, GL_UNSIGNED_BYTE, _vtx_decl.stride, _vtx_decl.color_offset[i]);
|
||||
}
|
||||
}
|
||||
|
||||
for (int i = 0; i < 8; i++) {
|
||||
if (_vtx_decl.texcoord_offset[i] != -1) {
|
||||
for (int i = 0; i < 8; i++)
|
||||
{
|
||||
if (_vtx_decl.texcoord_offset[i] != -1)
|
||||
{
|
||||
int id = GL_TEXTURE0 + i;
|
||||
#ifdef _M_X64
|
||||
#ifdef _MSC_VER
|
||||
MOV(32, R(RCX), Imm32(id));
|
||||
emit.MOV(32, R(RCX), Imm32(id));
|
||||
#else
|
||||
MOV(32, R(RDI), Imm32(id));
|
||||
emit.MOV(32, R(RDI), Imm32(id));
|
||||
#endif
|
||||
#else
|
||||
ABI_AlignStack(1 * 4);
|
||||
PUSH(32, Imm32(id));
|
||||
emit.ABI_AlignStack(1 * 4);
|
||||
emit.PUSH(32, Imm32(id));
|
||||
#endif
|
||||
CALL((void *)glClientActiveTexture);
|
||||
emit.CALL((void *)glClientActiveTexture);
|
||||
#ifndef _M_X64
|
||||
#ifdef _WIN32
|
||||
// don't inc stack on windows, stdcall
|
||||
#else
|
||||
ABI_RestoreStack(1 * 4);
|
||||
emit.ABI_RestoreStack(1 * 4);
|
||||
#endif
|
||||
#endif
|
||||
CallCdeclFunction4_I(
|
||||
emit.CallCdeclFunction4_I(
|
||||
glTexCoordPointer, _vtx_decl.texcoord_size[i], VarToGL(_vtx_decl.texcoord_gl_type[i]),
|
||||
_vtx_decl.stride, _vtx_decl.texcoord_offset[i]);
|
||||
}
|
||||
}
|
||||
|
||||
if (_vtx_decl.posmtx_offset != -1) {
|
||||
CallCdeclFunction6((void *)glVertexAttribPointer, SHADER_POSMTX_ATTRIB, 4, GL_UNSIGNED_BYTE, GL_FALSE, _vtx_decl.stride, _vtx_decl.posmtx_offset);
|
||||
emit.CallCdeclFunction6((void *)glVertexAttribPointer, SHADER_POSMTX_ATTRIB, 4, GL_UNSIGNED_BYTE, GL_FALSE, _vtx_decl.stride, _vtx_decl.posmtx_offset);
|
||||
}
|
||||
|
||||
ABI_EmitEpilogue(6);
|
||||
if (Gen::GetCodePtr() - (u8*)m_compiledCode > COMPILED_CODE_SIZE)
|
||||
emit.ABI_EmitEpilogue(6);
|
||||
if (emit.GetCodePtr() - (u8*)m_compiledCode > COMPILED_CODE_SIZE)
|
||||
{
|
||||
Crash();
|
||||
}
|
||||
|
||||
SetCodePtr(old_code_ptr);
|
||||
#endif
|
||||
this->vtx_decl = _vtx_decl;
|
||||
}
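The same local-emitter technique is used here to bake the GL pointer setup into m_compiledCode once, so the per-draw cost is a single indirect call. A hedged sketch of the call side, which is not part of this diff:

// Presumably invoked from the vertex pipeline when this format becomes active:
typedef void (*SetupFunc)();
((SetupFunc)(void *)m_compiledCode)();   // replays the glVertexPointer/glNormalPointer/... calls emitted above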
|
||||
|
|
|
@ -44,7 +44,7 @@
|
|||
|
||||
#define USE_JIT
|
||||
|
||||
#define COMPILED_CODE_SIZE 4096*4
|
||||
#define COMPILED_CODE_SIZE 4096
|
||||
|
||||
NativeVertexFormat *g_nativeVertexFmt;
|
||||
|
||||
|
@ -116,6 +116,7 @@ void LOADERDECL TexMtx_Write_Short3()
|
|||
|
||||
VertexLoader::VertexLoader(const TVtxDesc &vtx_desc, const VAT &vtx_attr)
|
||||
{
|
||||
m_compiledCode = NULL;
|
||||
m_numLoadedVertices = 0;
|
||||
m_VertexSize = 0;
|
||||
m_numPipelineStages = 0;
|
||||
|
@ -126,16 +127,14 @@ VertexLoader::VertexLoader(const TVtxDesc &vtx_desc, const VAT &vtx_attr)
|
|||
m_VtxDesc = vtx_desc;
|
||||
SetVAT(vtx_attr.g0.Hex, vtx_attr.g1.Hex, vtx_attr.g2.Hex);
|
||||
|
||||
m_compiledCode = (u8 *)AllocateExecutableMemory(COMPILED_CODE_SIZE, false);
|
||||
if (m_compiledCode) {
|
||||
memset(m_compiledCode, 0, COMPILED_CODE_SIZE);
|
||||
}
|
||||
AllocCodeSpace(COMPILED_CODE_SIZE);
|
||||
CompileVertexTranslator();
|
||||
WriteProtect();
|
||||
}
|
||||
|
||||
VertexLoader::~VertexLoader()
|
||||
{
|
||||
FreeMemoryPages(m_compiledCode, COMPILED_CODE_SIZE);
|
||||
FreeCodeSpace();
|
||||
delete m_NativeFmt;
|
||||
}
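VertexLoader now follows the same ownership pattern as the Jit classes: derive from Gen::XCodeBlock, allocate code space in the constructor, emit once, write-protect, and free in the destructor. The lifecycle reduced to the XCodeBlock calls this diff actually uses (MyGenerator and the buffer size are illustrative):

class MyGenerator : public Gen::XCodeBlock
{
public:
    MyGenerator()
    {
        AllocCodeSpace(4096);            // enough room for whatever gets emitted
        m_entry = GetCodePtr();
        // ... emit the body with the inherited XEmitter methods ...
        RET();
        WriteProtect();                  // done emitting; write-protect the buffer
    }
    ~MyGenerator() { FreeCodeSpace(); }

private:
    const u8 *m_entry;                   // call this like a function pointer later
};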
|
||||
|
||||
|
@ -143,13 +142,14 @@ void VertexLoader::CompileVertexTranslator()
|
|||
{
|
||||
m_VertexSize = 0;
|
||||
const TVtxAttr &vtx_attr = m_VtxAttr;
|
||||
//const TVtxDesc &vtx_desc = m_VtxDesc;
|
||||
|
||||
#ifdef USE_JIT
|
||||
u8 *old_code_ptr = GetWritableCodePtr();
|
||||
SetCodePtr(m_compiledCode);
|
||||
if (m_compiledCode)
|
||||
PanicAlert("trying to recompile a vtx translator");
|
||||
|
||||
m_compiledCode = GetCodePtr();
|
||||
ABI_EmitPrologue(4);
|
||||
// MOV(32, R(EBX), M(&loop_counter));
|
||||
|
||||
// Start loop here
|
||||
const u8 *loop_start = GetCodePtr();
|
||||
|
||||
|
@ -477,7 +477,6 @@ void VertexLoader::CompileVertexTranslator()
|
|||
//SUB(32, R(EBX), Imm8(1));
|
||||
J_CC(CC_NZ, loop_start, true);
|
||||
ABI_EmitEpilogue(4);
|
||||
SetCodePtr(old_code_ptr);
|
||||
#endif
|
||||
m_NativeFmt->Initialize(vtx_decl);
|
||||
}
|
||||
|
|
|
@ -22,9 +22,10 @@
|
|||
|
||||
#include "CPMemory.h"
|
||||
#include "DataReader.h"
|
||||
|
||||
#include "NativeVertexFormat.h"
|
||||
|
||||
#include "x64Emitter.h"
|
||||
|
||||
class VertexLoaderUID
|
||||
{
|
||||
u32 vid[5];
|
||||
|
@ -52,7 +53,7 @@ public:
|
|||
}
|
||||
};
|
||||
|
||||
class VertexLoader
|
||||
class VertexLoader : public Gen::XCodeBlock
|
||||
{
|
||||
public:
|
||||
VertexLoader(const TVtxDesc &vtx_desc, const VAT &vtx_attr);
|
||||
|
@ -86,7 +87,7 @@ private:
|
|||
TPipelineFunction m_PipelineStages[64]; // TODO - figure out real max. it's lower.
|
||||
int m_numPipelineStages;
|
||||
|
||||
u8 *m_compiledCode;
|
||||
const u8 *m_compiledCode;
|
||||
|
||||
int m_numLoadedVertices;
|
||||
|