flycast/core/rec-x64/rec_x64.cpp

#include "build.h"
#if FEAT_SHREC == DYNAREC_JIT && HOST_CPU == CPU_X64
//#define CANONICAL_TEST
#include <xbyak/xbyak.h>
#include <xbyak/xbyak_util.h>
using namespace Xbyak::util;
#include "types.h"
#include "hw/sh4/sh4_opcode_list.h"
#include "hw/sh4/dyna/ngen.h"
#include "hw/sh4/modules/mmu.h"
#include "hw/sh4/sh4_interrupts.h"
#include "hw/sh4/sh4_core.h"
#include "hw/sh4/sh4_mem.h"
#include "x64_regalloc.h"
#include "xbyak_base.h"
#include "oslib/unwind_info.h"
#include "oslib/virtmem.h"
static void (*mainloop)();
static void (*handleException)();
static u64 jmp_rsp;
namespace MemSize {
enum {
S8,
S16,
S32,
S64,
Count
};
}
namespace MemOp {
enum {
R,
W,
Count
};
}
namespace MemType {
enum {
Fast,
StoreQueue,
Slow,
Count
};
}
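// Memory access handlers emitted by genMemHandlers(), indexed by
// [access type][operand size][read/write]. MemHandlerStart/End bracket the
// generated code so rewriteMemAccess() can tell whether a fault came from a handler.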
static const void *MemHandlers[MemType::Count][MemSize::Count][MemOp::Count];
static const u8 *MemHandlerStart, *MemHandlerEnd;
static UnwindInfo unwinder;
#ifndef _WIN32
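// Spill slots for xmm8-xmm11, which are caller-saved on System V
// (see saveXmmRegisters()/restoreXmmRegisters()).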
static float xmmSave[4];
#endif
static void ngen_blockcheckfail(u32 pc) {
//printf("X64 JIT: SMC invalidation at %08X\n", pc);
rdv_BlockCheckFail(pc);
}
static void handle_sh4_exception(SH4ThrownException& ex, u32 pc)
{
if (pc & 1)
{
// Delay slot: the low bit of pc flags an exception raised in a delay slot
AdjustDelaySlotException(ex);
pc--;
}
Do_Exception(pc, ex.expEvn);
p_sh4rcb->cntx.cycle_counter += 4; // probably more is needed
handleException();
}
static void interpreter_fallback(u16 op, OpCallFP *oph, u32 pc)
{
try {
oph(op);
} catch (SH4ThrownException& ex) {
handle_sh4_exception(ex, pc);
}
}
static void do_sqw_mmu_no_ex(u32 addr, u32 pc)
{
try {
do_sqw_mmu(addr);
} catch (SH4ThrownException& ex) {
handle_sh4_exception(ex, pc);
}
}
const std::array<Xbyak::Reg32, 4> call_regs
#ifdef _WIN32
{ ecx, edx, r8d, r9d };
#else
{ edi, esi, edx, ecx };
#endif
const std::array<Xbyak::Reg64, 4> call_regs64
#ifdef _WIN32
{ rcx, rdx, r8, r9 };
#else
{ rdi, rsi, rdx, rcx };
#endif
const std::array<Xbyak::Xmm, 4> call_regsxmm { xmm0, xmm1, xmm2, xmm3 };
#ifdef _WIN32
constexpr u32 STACK_ALIGN = 0x28; // 32-byte shadow space + 8 byte alignment
#else
constexpr u32 STACK_ALIGN = 8; // keep rsp 16-byte aligned at call sites
#endif
class BlockCompiler : public BaseXbyakRec<BlockCompiler, true>
{
public:
using BaseCompiler = BaseXbyakRec<BlockCompiler, true>;
friend class BaseXbyakRec<BlockCompiler, true>;
BlockCompiler(Sh4CodeBuffer& codeBuffer) : BaseCompiler(codeBuffer), regalloc(this) { }
BlockCompiler(Sh4CodeBuffer& codeBuffer, u8 *code_ptr) : BaseCompiler(codeBuffer, code_ptr), regalloc(this) { }
void compile(RuntimeBlockInfo* block, bool force_checks, bool optimise)
{
//printf("X86_64 compiling %08x to %p\n", block->addr, codeBuffer.get());
current_opid = -1;
CheckBlock(force_checks, block);
sub(rsp, STACK_ALIGN);
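// With the MMU enabled, blocks containing FPU ops first check SR.FD and
// raise an FPU-disabled exception when it is set.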
if (mmu_enabled() && block->has_fpu_op)
{
Xbyak::Label fpu_enabled;
mov(rax, (uintptr_t)&sr);
test(dword[rax], 0x8000); // test SR.FD bit
jz(fpu_enabled);
mov(call_regs[0], block->vaddr); // pc
mov(call_regs[1], Sh4Ex_FpuDisabled);// exception code
GenCall((void (*)())Do_Exception);
jmp(exit_block, T_NEAR);
L(fpu_enabled);
}
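// Charge this block's guest cycles against the timeslice counter up front.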
mov(rax, (uintptr_t)&p_sh4rcb->cntx.cycle_counter);
sub(dword[rax], block->guest_cycles);
regalloc.DoAlloc(block);
for (current_opid = 0; current_opid < block->oplist.size(); current_opid++)
{
shil_opcode& op = block->oplist[current_opid];
regalloc.OpBegin(&op, current_opid);
switch (op.op)
{
case shop_ifb:
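// Interpreter fallback: call the opcode's interpreter handler. With the MMU
// on, go through interpreter_fallback() so SH4 exceptions are caught and
// dispatched with the correct pc.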
if (mmu_enabled())
{
mov(call_regs64[1], reinterpret_cast<uintptr_t>(*OpDesc[op.rs3._imm]->oph)); // op handler
mov(call_regs[2], block->vaddr + op.guest_offs - (op.delay_slot ? 1 : 0)); // pc
}
if (op.rs1._imm)
{
mov(rax, (size_t)&next_pc);
mov(dword[rax], op.rs2._imm);
}
mov(call_regs[0], op.rs3._imm);
if (!mmu_enabled())
GenCall(OpDesc[op.rs3._imm]->oph);
else
GenCall(interpreter_fallback);
break;
case shop_mov64:
{
verify(op.rd.is_r64f());
verify(op.rs1.is_r64f());
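// 64-bit move between register pairs. With the f64 allocator each 32-bit
// half is mapped to its own xmm register, so swap through xmm0 when the
// destination low half aliases the source high half.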
#if ALLOC_F64 == false
mov(rax, (uintptr_t)op.rs1.reg_ptr());
mov(rax, qword[rax]);
mov(rcx, (uintptr_t)op.rd.reg_ptr());
mov(qword[rcx], rax);
#else
Xbyak::Xmm rd0 = regalloc.MapXRegister(op.rd, 0);
Xbyak::Xmm rs0 = regalloc.MapXRegister(op.rs1, 0);
Xbyak::Xmm rd1 = regalloc.MapXRegister(op.rd, 1);
Xbyak::Xmm rs1 = regalloc.MapXRegister(op.rs1, 1);
if (rd0 == rs1)
{
movss(xmm0, rd0);
movss(rd0, rs0);
movss(rd1, xmm0);
}
else
{
if (rd0 != rs0)
movss(rd0, rs0);
if (rd1 != rs1)
movss(rd1, rs1);
}
#endif
}
break;
case shop_readm:
if (!GenReadMemImmediate(op, block))
{
// Not an immediate address
shil_param_to_host_reg(op.rs1, call_regs[0]);
if (!op.rs3.is_null())
{
if (op.rs3.is_imm())
add(call_regs[0], op.rs3._imm);
else if (regalloc.IsAllocg(op.rs3))
add(call_regs[0], regalloc.MapRegister(op.rs3));
else
{
mov(rax, (uintptr_t)op.rs3.reg_ptr());
add(call_regs[0], dword[rax]);
}
}
genMmuLookup(block, op, 0);
int size = op.size == 1 ? MemSize::S8 : op.size == 2 ? MemSize::S16 : op.size == 4 ? MemSize::S32 : MemSize::S64;
GenCall((void (*)())MemHandlers[optimise ? MemType::Fast : MemType::Slow][size][MemOp::R], mmu_enabled());
#if ALLOC_F64 == false
if (size == MemSize::S64)
{
mov(rcx, (uintptr_t)op.rd.reg_ptr());
mov(qword[rcx], rax);
}
else
#endif
{
mov(rcx, rax);
host_reg_to_shil_param(op.rd, rcx);
}
}
break;
case shop_writem:
{
if (!GenWriteMemImmediate(op, block))
{
shil_param_to_host_reg(op.rs1, call_regs[0]);
if (!op.rs3.is_null())
{
if (op.rs3.is_imm())
add(call_regs[0], op.rs3._imm);
else if (regalloc.IsAllocg(op.rs3))
add(call_regs[0], regalloc.MapRegister(op.rs3));
else
{
mov(rax, (uintptr_t)op.rs3.reg_ptr());
add(call_regs[0], dword[rax]);
}
}
genMmuLookup(block, op, 1);
#if ALLOC_F64 == false
if (op.size == 8)
{
mov(rax, (uintptr_t)op.rs2.reg_ptr());
mov(call_regs64[1], qword[rax]);
}
else
#endif
shil_param_to_host_reg(op.rs2, call_regs64[1]);
int size = op.size == 1 ? MemSize::S8 : op.size == 2 ? MemSize::S16 : op.size == 4 ? MemSize::S32 : MemSize::S64;
GenCall((void (*)())MemHandlers[optimise ? MemType::Fast : MemType::Slow][size][MemOp::W], mmu_enabled());
}
}
break;
case shop_jcond:
case shop_jdyn:
case shop_mov32:
genBaseOpcode(op);
break;
#ifndef CANONICAL_TEST
case shop_sync_sr:
GenCall(UpdateSR);
break;
case shop_sync_fpscr:
GenCall(UpdateFPSCR);
break;
case shop_negc:
{
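// negc: rd = -rs1 - rs2, computed in 64 bits so that any borrow makes the
// result negative; bit 63 is then extracted into rd2 (the carry output).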
Xbyak::Reg32 rs2;
if (op.rs2.is_reg())
{
rs2 = regalloc.MapRegister(op.rs2);
if (regalloc.mapg(op.rd) == regalloc.mapg(op.rs2))
{
mov(ecx, rs2);
rs2 = ecx;
}
}
Xbyak::Reg32 rd = regalloc.MapRegister(op.rd);
if (op.rs1.is_imm())
mov(rd, op.rs1.imm_value());
else if (regalloc.mapg(op.rd) != regalloc.mapg(op.rs1))
mov(rd, regalloc.MapRegister(op.rs1));
Xbyak::Reg64 rd64 = rd.cvt64();
neg(rd64);
if (op.rs2.is_imm())
sub(rd64, op.rs2.imm_value());
else
sub(rd64, rs2.cvt64());
Xbyak::Reg64 rd2_64 = regalloc.MapRegister(op.rd2).cvt64();
mov(rd2_64, rd64);
shr(rd2_64, 63);
}
break;
case shop_mul_s64:
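// Signed 32x32 -> 64 multiply: sign-extend both operands to 64 bits, so the
// low 64 bits of the (unsigned) mul equal the signed product. rd receives
// the low half and rd2 the high half.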
movsxd(rax, regalloc.MapRegister(op.rs1));
if (op.rs2.is_reg())
movsxd(rcx, regalloc.MapRegister(op.rs2));
else
mov(rcx, (s64)(s32)op.rs2._imm);
mul(rcx);
mov(regalloc.MapRegister(op.rd), eax);
shr(rax, 32);
mov(regalloc.MapRegister(op.rd2), eax);
break;
case shop_pref:
{
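// pref only writes to the store queues when the address is in the
// 0xE0000000-0xE3FFFFFF range, i.e. its top 6 bits equal 0x38.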
Xbyak::Label no_sqw;
if (op.rs1.is_imm())
{
// this test shouldn't be necessary
if ((op.rs1._imm & 0xFC000000) != 0xE0000000)
break;
mov(call_regs[0], op.rs1._imm);
}
else
{
Xbyak::Reg32 rn;
if (regalloc.IsAllocg(op.rs1))
{
rn = regalloc.MapRegister(op.rs1);
}
else
{
mov(rax, (uintptr_t)op.rs1.reg_ptr());
mov(eax, dword[rax]);
rn = eax;
}
mov(ecx, rn);
shr(ecx, 26);
cmp(ecx, 0x38);
jne(no_sqw);
mov(call_regs[0], rn);
}
if (mmu_enabled())
{
mov(call_regs[1], block->vaddr + op.guest_offs - (op.delay_slot ? 1 : 0)); // pc
GenCall(do_sqw_mmu_no_ex);
}
else
{
mov(call_regs64[1], (uintptr_t)sq_both);
mov(rax, (size_t)&do_sqw_nommu);
saveXmmRegisters();
call(qword[rax]);
restoreXmmRegisters();
}
L(no_sqw);
}
break;
case shop_frswap:
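// Swap the two 64-byte FP register banks, using the widest vector moves
// the host CPU supports (AVX-512, AVX, or plain SSE).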
mov(rax, (uintptr_t)op.rs1.reg_ptr());
mov(rcx, (uintptr_t)op.rd.reg_ptr());
if (cpu.has(Cpu::tAVX512F))
{
vmovaps(zmm0, zword[rax]);
vmovaps(zmm1, zword[rcx]);
vmovaps(zword[rax], zmm1);
vmovaps(zword[rcx], zmm0);
}
else if (cpu.has(Cpu::tAVX))
{
vmovaps(ymm0, yword[rax]);
vmovaps(ymm1, yword[rcx]);
vmovaps(yword[rax], ymm1);
vmovaps(yword[rcx], ymm0);
vmovaps(ymm0, yword[rax + 32]);
vmovaps(ymm1, yword[rcx + 32]);
vmovaps(yword[rax + 32], ymm1);
vmovaps(yword[rcx + 32], ymm0);
}
else
{
for (int i = 0; i < 4; i++)
{
movaps(xmm0, xword[rax + (i * 16)]);
movaps(xmm1, xword[rcx + (i * 16)]);
movaps(xword[rax + (i * 16)], xmm1);
movaps(xword[rcx + (i * 16)], xmm0);
}
}
break;
case shop_fmac:
{
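// fmac: rd = rs1 + rs2 * rs3. Fused multiply-add rounds differently from a
// separate mul+add, so FMA is skipped when GGPO netplay is enabled, most
// likely to keep FP results identical across machines.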
Xbyak::Xmm rs1 = regalloc.MapXRegister(op.rs1);
Xbyak::Xmm rs2 = regalloc.MapXRegister(op.rs2);
Xbyak::Xmm rs3 = regalloc.MapXRegister(op.rs3);
Xbyak::Xmm rd = regalloc.MapXRegister(op.rd);
if (rd == rs2)
{
movss(xmm1, rs2);
rs2 = xmm1;
}
if (rd == rs3)
{
movss(xmm2, rs3);
rs3 = xmm2;
}
if (op.rs1.is_imm()) // FIXME MapXRegister(op.rs1) would have failed
{
mov(eax, op.rs1._imm);
movd(rd, eax);
}
else if (rd != rs1)
{
movss(rd, rs1);
}
if (cpu.has(Cpu::tFMA) && !config::GGPOEnable)
vfmadd231ss(rd, rs2, rs3);
else
{
movss(xmm0, rs2);
mulss(xmm0, rs3);
addss(rd, xmm0);
}
}
break;
#endif
default:
#ifndef CANONICAL_TEST
if (!genBaseOpcode(op))
#endif
shil_chf[op.op](&op);
break;
}
regalloc.OpEnd(&op);
}
regalloc.Cleanup();
current_opid = -1;
mov(rax, (size_t)&next_pc);
switch (block->BlockType) {
case BET_StaticJump:
case BET_StaticCall:
//next_pc = block->BranchBlock;
mov(dword[rax], block->BranchBlock);
break;
case BET_Cond_0:
case BET_Cond_1:
{
//next_pc = next_pc_value;
//if (*jdyn == 0)
//next_pc = branch_pc_value;
mov(dword[rax], block->NextBlock);
if (block->has_jcond)
mov(rdx, (size_t)&Sh4cntx.jdyn);
else
mov(rdx, (size_t)&sr.T);
cmp(dword[rdx], block->BlockType & 1);
Xbyak::Label branch_not_taken;
jne(branch_not_taken, T_SHORT);
mov(dword[rax], block->BranchBlock);
L(branch_not_taken);
}
break;
case BET_DynamicJump:
case BET_DynamicCall:
case BET_DynamicRet:
//next_pc = *jdyn;
mov(rdx, (size_t)&Sh4cntx.jdyn);
mov(edx, dword[rdx]);
mov(dword[rax], edx);
break;
case BET_DynamicIntr:
case BET_StaticIntr:
if (block->BlockType == BET_DynamicIntr) {
//next_pc = *jdyn;
mov(rdx, (size_t)&Sh4cntx.jdyn);
mov(edx, dword[rdx]);
mov(dword[rax], edx);
}
else {
//next_pc = next_pc_value;
mov(dword[rax], block->NextBlock);
}
GenCall(UpdateINTC);
break;
default:
die("Invalid block end type");
}
L(exit_block);
add(rsp, STACK_ALIGN);
ret();
ready();
block->code = (DynarecCodeEntryPtr)getCode();
block->host_code_size = getSize();
codeBuffer.advance(getSize());
}
void canonStart(const shil_opcode& op)
{
CC_pars.clear();
}
void canonParam(const shil_opcode& op, const shil_param& prm, CanonicalParamType tp) {
switch (tp)
{
case CPT_u32:
case CPT_ptr:
case CPT_f32:
{
CC_PS t = { tp, &prm };
CC_pars.push_back(t);
}
break;
// store from EAX
case CPT_u64rvL:
case CPT_u32rv:
mov(rcx, rax);
host_reg_to_shil_param(prm, ecx);
break;
case CPT_u64rvH:
// assuming CPT_u64rvL has just been called
shr(rcx, 32);
host_reg_to_shil_param(prm, ecx);
break;
// store from xmm0
case CPT_f32rv:
host_reg_to_shil_param(prm, xmm0);
break;
}
}
void canonCall(const shil_opcode& op, void* function)
{
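// Bind the collected parameters to argument registers right-to-left, then
// call the canonical implementation of the opcode.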
int regused = 0;
int xmmused = 0;
for (int i = CC_pars.size(); i-- > 0;)
{
verify(xmmused < 4 && regused < 4);
const shil_param& prm = *CC_pars[i].prm;
switch (CC_pars[i].type) {
//push the contents
case CPT_u32:
shil_param_to_host_reg(prm, call_regs[regused++]);
break;
case CPT_f32:
shil_param_to_host_reg(prm, call_regsxmm[xmmused++]);
break;
//push the ptr itself
case CPT_ptr:
verify(prm.is_reg());
mov(call_regs64[regused++], (size_t)prm.reg_ptr());
break;
default:
// Other cases handled in canonParam
break;
}
}
GenCall((void (*)())function);
#if ALLOC_F64 == true
for (const CC_PS& ccParam : CC_pars)
{
const shil_param& prm = *ccParam.prm;
if (ccParam.type == CPT_ptr && prm.count() == 2 && regalloc.IsAllocf(prm) && (op.rd._reg == prm._reg || op.rd2._reg == prm._reg)) {
// fsca rd param is a pointer to a 64-bit reg so reload the regs if allocated
mov(rax, (size_t)GetRegPtr(prm._reg));
movss(regalloc.MapXRegister(prm, 0), dword[rax]);
mov(rax, (size_t)GetRegPtr(prm._reg + 1));
movss(regalloc.MapXRegister(prm, 1), dword[rax]);
}
}
#endif
}
void RegPreload(u32 reg, Xbyak::Operand::Code nreg)
{
mov(rax, (size_t)GetRegPtr(reg));
mov(Xbyak::Reg32(nreg), dword[rax]);
}
void RegWriteback(u32 reg, Xbyak::Operand::Code nreg)
{
mov(rax, (size_t)GetRegPtr(reg));
mov(dword[rax], Xbyak::Reg32(nreg));
}
void RegPreload_FPU(u32 reg, s8 nreg)
{
mov(rax, (size_t)GetRegPtr(reg));
movss(Xbyak::Xmm(nreg), dword[rax]);
}
void RegWriteback_FPU(u32 reg, s8 nreg)
{
mov(rax, (size_t)GetRegPtr(reg));
movss(dword[rax], Xbyak::Xmm(nreg));
}
void genMainloop()
{
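// Emit the dynarec entry point: save callee-saved registers, record rsp in
// jmp_rsp for exception recovery, then dispatch compiled blocks until the
// timeslice expires (cycle_counter <= 0) or CpuRunning is cleared.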
unwinder.start((void *)getCurr());
push(rbx);
unwinder.pushReg(getSize(), Xbyak::Operand::RBX);
push(rbp);
unwinder.pushReg(getSize(), Xbyak::Operand::RBP);
#ifdef _WIN32
push(rdi);
unwinder.pushReg(getSize(), Xbyak::Operand::RDI);
push(rsi);
unwinder.pushReg(getSize(), Xbyak::Operand::RSI);
#endif
push(r12);
unwinder.pushReg(getSize(), Xbyak::Operand::R12);
push(r13);
unwinder.pushReg(getSize(), Xbyak::Operand::R13);
push(r14);
unwinder.pushReg(getSize(), Xbyak::Operand::R14);
push(r15);
unwinder.pushReg(getSize(), Xbyak::Operand::R15);
sub(rsp, STACK_ALIGN);
unwinder.allocStack(getSize(), STACK_ALIGN);
unwinder.endProlog(getSize());
mov(qword[rip + &jmp_rsp], rsp);
//run_loop:
Xbyak::Label run_loop;
L(run_loop);
Xbyak::Label end_run_loop;
mov(rax, (size_t)&p_sh4rcb->cntx.CpuRunning);
mov(edx, dword[rax]);
test(edx, edx);
je(end_run_loop);
//slice_loop:
Xbyak::Label slice_loop;
L(slice_loop);
mov(rax, (size_t)&p_sh4rcb->cntx.pc);
mov(call_regs[0], dword[rax]);
call(bm_GetCodeByVAddr);
call(rax);
mov(rax, (uintptr_t)&p_sh4rcb->cntx.cycle_counter);
mov(ecx, dword[rax]);
test(ecx, ecx);
jg(slice_loop);
add(ecx, SH4_TIMESLICE);
mov(dword[rax], ecx);
call(UpdateSystem_INTC);
jmp(run_loop);
//end_run_loop:
L(end_run_loop);
add(rsp, STACK_ALIGN);
pop(r15);
pop(r14);
pop(r13);
pop(r12);
#ifdef _WIN32
pop(rsi);
pop(rdi);
#endif
pop(rbp);
pop(rbx);
ret();
size_t unwindSize = unwinder.end(getSize());
setSize(getSize() + unwindSize);
unwinder.start((void *)getCurr());
size_t startOffset = getSize();
#ifdef _WIN32
// 32-byte shadow space + 8 for stack 16-byte alignment
unwinder.allocStack(0, 40);
#else
// stack 16-byte alignment
unwinder.allocStack(0, 8);
#endif
unwinder.endProlog(0);
//handleException:
Xbyak::Label handleExceptionLabel;
L(handleExceptionLabel);
mov(rsp, qword[rip + &jmp_rsp]);
jmp(run_loop);
genMemHandlers();
size_t savedSize = getSize();
setSize(codeBuffer.getFreeSpace() - 128 - startOffset);
unwindSize = unwinder.end(getSize());
verify(unwindSize <= 128);
setSize(savedSize);
ready();
mainloop = (void (*)())getCode();
handleException = (void(*)())handleExceptionLabel.getAddress();
codeBuffer.advance(getSize());
}
bool rewriteMemAccess(host_context_t &context)
{
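// Called on a host fault inside a fast-path handler: patch the faulting
// call to use the slow (or store-queue) handler instead, then resume
// execution just before the rewritten call.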
if (!addrspace::virtmemEnabled())
return false;
//printf("rewriteMemAccess pc %p\n", context.pc);
if (context.pc < (size_t)MemHandlerStart || context.pc >= (size_t)MemHandlerEnd)
return false;
u8 *retAddr = *(u8 **)context.rsp;
void *ca = *(s32 *)(retAddr - 4) + retAddr; // decode the rel32 target of the faulting call
for (int size = 0; size < MemSize::Count; size++)
{
for (int op = 0; op < MemOp::Count; op++)
{
if ((void *)MemHandlers[MemType::Fast][size][op] != ca)
continue;
// found it!
const u8 *start = getCurr();
u32 memAddress = context.r9;
if (op == MemOp::W && size >= MemSize::S32 && (memAddress >> 26) == 0x38)
call(MemHandlers[MemType::StoreQueue][size][MemOp::W]);
else
call(MemHandlers[MemType::Slow][size][op]);
verify(getCurr() - start == 5);
ready();
context.pc = (uintptr_t)(retAddr - 5);
// remove the call from the stack
context.rsp += 8;
// restore the addr from r9 to arg0 (rcx or rdi) so it's valid again
#ifdef _WIN32
context.rcx = memAddress;
#else
context.rdi = memAddress;
#endif
return true;
}
}
ERROR_LOG(DYNAREC, "rewriteMemAccess code not found: host pc %p", (void *)context.pc);
die("Failed to match the code");
return false;
}
private:
void genMmuLookup(const RuntimeBlockInfo* block, const shil_opcode& op, u32 write)
{
if (mmu_enabled())
{
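// Translate the guest address in arg0. With FAST_MMU, probe the per-4KB-page
// translation LUT first; a zero entry is a miss and falls back to
// mmuDynarecLookup(), which performs the full lookup and may raise an
// SH4 exception.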
#ifdef FAST_MMU
Xbyak::Label inCache;
Xbyak::Label done;
mov(eax, call_regs[0]);
shr(eax, 12);
if ((uintptr_t)mmuAddressLUT >> 32 != 0)
{
mov(r9, (uintptr_t)mmuAddressLUT);
mov(eax, dword[r9 + rax * 4]);
}
else
{
mov(eax, dword[(uintptr_t)mmuAddressLUT + rax * 4]);
}
test(eax, eax);
jne(inCache);
#endif
mov(call_regs[1], write);
mov(call_regs[2], block->vaddr + op.guest_offs - (op.delay_slot ? 2 : 0)); // pc
GenCall(mmuDynarecLookup);
mov(call_regs[0], eax);
#ifdef FAST_MMU
jmp(done);
L(inCache);
and_(call_regs[0], 0xFFF);
or_(call_regs[0], eax);
L(done);
#endif
}
}
bool GenReadMemImmediate(const shil_opcode& op, RuntimeBlockInfo* block)
{
if (!op.rs1.is_imm())
return false;
void *ptr;
bool isram;
u32 addr;
if (!rdv_readMemImmediate(op.rs1._imm, op.size, ptr, isram, addr, block))
return false;
if (isram)
{
// Immediate pointer to RAM: super-duper fast access
mov(rax, reinterpret_cast<uintptr_t>(ptr));
switch (op.size)
{
case 1:
if (regalloc.IsAllocg(op.rd))
movsx(regalloc.MapRegister(op.rd), byte[rax]);
else
{
movsx(eax, byte[rax]);
mov(rcx, (uintptr_t)op.rd.reg_ptr());
mov(dword[rcx], eax);
}
break;
case 2:
if (regalloc.IsAllocg(op.rd))
movsx(regalloc.MapRegister(op.rd), word[rax]);
else
{
movsx(eax, word[rax]);
mov(rcx, (uintptr_t)op.rd.reg_ptr());
mov(dword[rcx], eax);
}
break;
case 4:
if (regalloc.IsAllocg(op.rd))
mov(regalloc.MapRegister(op.rd), dword[rax]);
else if (regalloc.IsAllocf(op.rd))
movd(regalloc.MapXRegister(op.rd), dword[rax]);
else
{
mov(eax, dword[rax]);
mov(rcx, (uintptr_t)op.rd.reg_ptr());
mov(dword[rcx], eax);
}
break;
case 8:
#if ALLOC_F64 == false
mov(rcx, qword[rax]);
mov(rax, (uintptr_t)op.rd.reg_ptr());
mov(qword[rax], rcx);
#else
movd(regalloc.MapXRegister(op.rd, 0), dword[rax]);
movd(regalloc.MapXRegister(op.rd, 1), dword[rax + 4]);
#endif
break;
default:
die("Invalid immediate size");
break;
}
}
else
{
// Not RAM: the returned pointer is a memory handler
if (op.size == 8)
{
// Need to call the handler twice
mov(call_regs[0], addr);
GenCall((void (*)())ptr);
#if ALLOC_F64 == false
mov(rcx, (size_t)op.rd.reg_ptr());
mov(dword[rcx], eax);
#else
movd(regalloc.MapXRegister(op.rd, 0), eax);
#endif
mov(call_regs[0], addr + 4);
GenCall((void (*)())ptr);
#if ALLOC_F64 == false
mov(rcx, (size_t)op.rd.reg_ptr() + 4);
mov(dword[rcx], eax);
#else
movd(regalloc.MapXRegister(op.rd, 1), eax);
#endif
}
else
{
mov(call_regs[0], addr);
switch(op.size)
{
case 1:
GenCall((void (*)())ptr);
movsx(eax, al);
break;
case 2:
GenCall((void (*)())ptr);
movsx(eax, ax);
break;
case 4:
GenCall((void (*)())ptr);
break;
default:
die("Invalid immediate size");
break;
}
mov(ecx, eax);
host_reg_to_shil_param(op.rd, ecx);
}
}
return true;
}
bool GenWriteMemImmediate(const shil_opcode& op, RuntimeBlockInfo* block)
{
if (!op.rs1.is_imm())
return false;
void *ptr;
bool isram;
u32 addr;
if (!rdv_writeMemImmediate(op.rs1._imm, op.size, ptr, isram, addr, block))
return false;
if (isram)
{
// Immediate pointer to RAM: super-duper fast access
mov(rax, reinterpret_cast<uintptr_t>(ptr));
switch (op.size)
{
case 1:
if (regalloc.IsAllocg(op.rs2))
mov(byte[rax], regalloc.MapRegister(op.rs2).cvt8());
else if (op.rs2.is_imm())
mov(byte[rax], (u8)op.rs2._imm);
else
{
mov(rcx, (uintptr_t)op.rs2.reg_ptr());
mov(cl, byte[rcx]);
mov(byte[rax], cl);
}
break;
case 2:
if (regalloc.IsAllocg(op.rs2))
mov(word[rax], regalloc.MapRegister(op.rs2).cvt16());
else if (op.rs2.is_imm())
mov(word[rax], (u16)op.rs2._imm);
else
{
mov(rcx, (uintptr_t)op.rs2.reg_ptr());
mov(cx, word[rcx]);
mov(word[rax], cx);
}
break;
case 4:
if (regalloc.IsAllocg(op.rs2))
mov(dword[rax], regalloc.MapRegister(op.rs2));
else if (regalloc.IsAllocf(op.rs2))
movd(dword[rax], regalloc.MapXRegister(op.rs2));
else if (op.rs2.is_imm())
mov(dword[rax], op.rs2._imm);
else
{
mov(rcx, (uintptr_t)op.rs2.reg_ptr());
mov(ecx, dword[rcx]);
mov(dword[rax], ecx);
}
break;
case 8:
#if ALLOC_F64 == false
mov(rcx, (uintptr_t)op.rs2.reg_ptr());
mov(rcx, qword[rcx]);
mov(qword[rax], rcx);
#else
movd(dword[rax], regalloc.MapXRegister(op.rs2, 0));
movd(dword[rax + 4], regalloc.MapXRegister(op.rs2, 1));
#endif
break;
default:
die("Invalid immediate size");
break;
}
}
else
{
// Not RAM: the returned pointer is a memory handler
mov(call_regs[0], addr);
shil_param_to_host_reg(op.rs2, call_regs[1]);
GenCall((void (*)())ptr);
}
return true;
}
void CheckBlock(bool force_checks, RuntimeBlockInfo* block)
{
if (mmu_enabled() || force_checks)
mov(call_regs[0], block->addr);
// FIXME This test shouldn't be necessary.
// However, the decoder makes various assumptions about the current PC value which are simply not
// true in a virtualized memory model. So this can only work if virtual and physical addresses are
// the same at compile time and run time.
if (mmu_enabled())
{
mov(rax, (uintptr_t)&next_pc);
cmp(dword[rax], block->vaddr);
jne(reinterpret_cast<const void*>(&ngen_blockcheckfail));
}
if (!force_checks)
return;
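// SMC check: compare the SH4 code this block was compiled from against
// current guest memory in 8/4/2-byte chunks, branching to
// ngen_blockcheckfail() on any mismatch.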
s32 sz = block->sh4_code_size;
u32 sa = block->addr;
void* ptr = (void*)GetMemPtr(sa, sz > 8 ? 8 : sz);
if (ptr)
{
while (sz > 0)
{
uintptr_t uintptr = reinterpret_cast<uintptr_t>(ptr);
mov(rax, uintptr);
if (sz >= 8 && !(uintptr & 7)) {
mov(rdx, *(u64*)ptr);
cmp(qword[rax], rdx);
sz -= 8;
sa += 8;
}
else if (sz >= 4 && !(uintptr & 3)) {
mov(edx, *(u32*)ptr);
cmp(dword[rax], edx);
sz -= 4;
sa += 4;
}
else {
mov(edx, *(u16*)ptr);
cmp(word[rax], dx);
sz -= 2;
sa += 2;
}
jne(reinterpret_cast<const void*>(CC_RX2RW(&ngen_blockcheckfail)));
ptr = (void*)GetMemPtr(sa, sz > 8 ? 8 : sz);
}
}
}
void genMemHandlers()
{
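// Emit every MemHandlers entry. Fast handlers access ram_base directly with
// the address masked to the 512MB range; a fault there is patched by
// rewriteMemAccess(). StoreQueue handlers fast-path writes to the
// 0xE0000000 area and tail-call addrspace:: otherwise. Slow handlers just
// call into addrspace::.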
// make sure the memory handlers are set
verify(ReadMem8 != nullptr);
MemHandlerStart = getCurr();
for (int type = 0; type < MemType::Count; type++)
{
for (int size = 0; size < MemSize::Count; size++)
{
for (int op = 0; op < MemOp::Count; op++)
{
MemHandlers[type][size][op] = getCurr();
if (type == MemType::Fast && addrspace::virtmemEnabled())
{
mov(rax, (uintptr_t)addrspace::ram_base);
mov(r9, call_regs64[0]);
and_(call_regs[0], 0x1FFFFFFF);
switch (size)
{
case MemSize::S8:
if (op == MemOp::R)
movsx(eax, byte[rax + call_regs64[0]]);
else
mov(byte[rax + call_regs64[0]], call_regs[1].cvt8());
break;
case MemSize::S16:
if (op == MemOp::R)
movsx(eax, word[rax + call_regs64[0]]);
else
mov(word[rax + call_regs64[0]], call_regs[1].cvt16());
break;
case MemSize::S32:
if (op == MemOp::R)
mov(eax, dword[rax + call_regs64[0]]);
else
mov(dword[rax + call_regs64[0]], call_regs[1]);
break;
case MemSize::S64:
if (op == MemOp::R)
mov(rax, qword[rax + call_regs64[0]]);
else
mov(qword[rax + call_regs64[0]], call_regs64[1]);
break;
}
}
else if (type == MemType::StoreQueue)
{
if (op != MemOp::W || size < MemSize::S32)
continue;
Xbyak::Label no_sqw;
mov(r9d, call_regs[0]);
shr(r9d, 26);
cmp(r9d, 0x38);
jne(no_sqw);
mov(rax, (uintptr_t)p_sh4rcb->sq_buffer);
and_(call_regs[0], 0x3F);
if (size == MemSize::S32)
mov(dword[rax + call_regs64[0]], call_regs[1]);
else
mov(qword[rax + call_regs64[0]], call_regs64[1]);
ret();
L(no_sqw);
if (size == MemSize::S32)
jmp((const void *)addrspace::write32); // tail call
else
jmp((const void *)addrspace::write64); // tail call
continue;
}
else
{
// Slow path
if (op == MemOp::R)
{
switch (size) {
case MemSize::S8:
sub(rsp, STACK_ALIGN);
call((const void *)addrspace::read8);
movsx(eax, al);
add(rsp, STACK_ALIGN);
break;
case MemSize::S16:
sub(rsp, STACK_ALIGN);
call((const void *)addrspace::read16);
movsx(eax, ax);
add(rsp, STACK_ALIGN);
break;
case MemSize::S32:
jmp((const void *)addrspace::read32); // tail call
continue;
case MemSize::S64:
jmp((const void *)addrspace::read64); // tail call
continue;
default:
die("1..8 bytes");
}
}
else
{
switch (size) {
case MemSize::S8:
jmp((const void *)addrspace::write8); // tail call
continue;
case MemSize::S16:
jmp((const void *)addrspace::write16); // tail call
continue;
case MemSize::S32:
jmp((const void *)addrspace::write32); // tail call
continue;
case MemSize::S64:
jmp((const void *)addrspace::write64); // tail call
continue;
default:
die("1..8 bytes");
}
}
}
ret();
}
}
}
MemHandlerEnd = getCurr();
}
void saveXmmRegisters()
{
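// On System V, all xmm registers are caller-saved, so any of xmm8-xmm11
// holding live guest values must be spilled around helper calls. On Win64
// xmm6-xmm15 are callee-saved, so nothing to do.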
#ifndef _WIN32
if (current_opid == (size_t)-1)
return;
if (regalloc.IsMapped(xmm8, current_opid))
movd(ptr[rip + &xmmSave[0]], xmm8);
if (regalloc.IsMapped(xmm9, current_opid))
movd(ptr[rip + &xmmSave[1]], xmm9);
if (regalloc.IsMapped(xmm10, current_opid))
movd(ptr[rip + &xmmSave[2]], xmm10);
if (regalloc.IsMapped(xmm11, current_opid))
movd(ptr[rip + &xmmSave[3]], xmm11);
#endif
}
void restoreXmmRegisters()
{
#ifndef _WIN32
if (current_opid == (size_t)-1)
return;
if (regalloc.IsMapped(xmm8, current_opid))
movd(xmm8, ptr[rip + &xmmSave[0]]);
if (regalloc.IsMapped(xmm9, current_opid))
movd(xmm9, ptr[rip + &xmmSave[1]]);
if (regalloc.IsMapped(xmm10, current_opid))
movd(xmm10, ptr[rip + &xmmSave[2]]);
if (regalloc.IsMapped(xmm11, current_opid))
movd(xmm11, ptr[rip + &xmmSave[3]]);
#endif
}
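// Call a host helper, preserving live caller-saved xmm registers around the
// call unless skip_floats is set.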
template<class Ret, class... Params>
void GenCall(Ret(*function)(Params...), bool skip_floats = false)
{
if (!skip_floats)
saveXmmRegisters();
call(CC_RX2RW(function));
if (!skip_floats)
restoreXmmRegisters();
}
struct CC_PS
{
CanonicalParamType type;
const shil_param* prm;
};
std::vector<CC_PS> CC_pars;
X64RegAlloc regalloc;
Xbyak::util::Cpu cpu;
size_t current_opid;
Xbyak::Label exit_block;
};
void X64RegAlloc::Preload(u32 reg, Xbyak::Operand::Code nreg)
{
compiler->RegPreload(reg, nreg);
}
void X64RegAlloc::Writeback(u32 reg, Xbyak::Operand::Code nreg)
{
compiler->RegWriteback(reg, nreg);
}
void X64RegAlloc::Preload_FPU(u32 reg, s8 nreg)
{
compiler->RegPreload_FPU(reg, nreg);
}
void X64RegAlloc::Writeback_FPU(u32 reg, s8 nreg)
{
compiler->RegWriteback_FPU(reg, nreg);
}
class X64Dynarec : public Sh4Dynarec
{
public:
X64Dynarec() {
sh4Dynarec = this;
}
void compile(RuntimeBlockInfo* block, bool smc_checks, bool optimise) override
{
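// W^X: make the code buffer writable while emitting, executable again after.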
void* protStart = codeBuffer->get();
size_t protSize = codeBuffer->getFreeSpace();
virtmem::jit_set_exec(protStart, protSize, false);
ccCompiler = new BlockCompiler(*codeBuffer);
try {
ccCompiler->compile(block, smc_checks, optimise);
} catch (const Xbyak::Error& e) {
ERROR_LOG(DYNAREC, "Fatal xbyak error: %s", e.what());
}
delete ccCompiler;
ccCompiler = nullptr;
virtmem::jit_set_exec(protStart, protSize, true);
}
void init(Sh4CodeBuffer& codeBuffer) override
{
this->codeBuffer = &codeBuffer;
}
void mainloop(void *) override
{
verify(::mainloop != nullptr);
try {
::mainloop();
} catch (const SH4ThrownException& ex) {
ERROR_LOG(DYNAREC, "SH4ThrownException in mainloop code %x", ex.expEvn);
throw FlycastException("Fatal: Unhandled SH4 exception");
}
}
void canonStart(const shil_opcode* op) override {
ccCompiler->canonStart(*op);
}
void canonParam(const shil_opcode* op, const shil_param* par, CanonicalParamType tp) override {
ccCompiler->canonParam(*op, *par, tp);
}
void canonCall(const shil_opcode* op, void* function) override {
ccCompiler->canonCall(*op, function);
}
void canonFinish(const shil_opcode* op) override {
}
bool rewrite(host_context_t &context, void *faultAddress) override
{
if (codeBuffer == nullptr)
// init() not called yet
return false;
void* protStart = codeBuffer->get();
size_t protSize = codeBuffer->getFreeSpace();
virtmem::jit_set_exec(protStart, protSize, false);
u8 *retAddr = *(u8 **)context.rsp - 5; // start of the 5-byte call instruction that faulted
BlockCompiler compiler(*codeBuffer, retAddr);
bool rc = false;
try {
rc = compiler.rewriteMemAccess(context);
} catch (const Xbyak::Error& e) {
ERROR_LOG(DYNAREC, "Fatal xbyak error: %s", e.what());
}
virtmem::jit_set_exec(protStart, protSize, true);
return rc;
}
void handleException(host_context_t &context) override
{
context.pc = (uintptr_t)::handleException;
}
void reset() override
{
unwinder.clear();
// Avoid generating the main loop more than once
if (::mainloop != nullptr && ::mainloop != codeBuffer->get())
return;
void* protStart = codeBuffer->get();
size_t protSize = codeBuffer->getFreeSpace();
virtmem::jit_set_exec(protStart, protSize, false);
BlockCompiler compiler(*codeBuffer);
try {
compiler.genMainloop();
} catch (const Xbyak::Error& e) {
ERROR_LOG(DYNAREC, "Fatal xbyak error: %s", e.what());
}
virtmem::jit_set_exec(protStart, protSize, true);
}
private:
Sh4CodeBuffer *codeBuffer = nullptr;
BlockCompiler *ccCompiler = nullptr;
};
static X64Dynarec instance;
#endif