From 4beeeb52f54e787dfb5f72fad9d1ffc0ef2d29f6 Mon Sep 17 00:00:00 2001 From: Anthony Pesch Date: Fri, 24 Jul 2015 15:14:16 -0700 Subject: [PATCH] x64 emitter first pass --- CMakeLists.txt | 2 + src/core/math.h | 1 - src/cpu/backend/x64/x64_backend.cc | 82 +- src/cpu/backend/x64/x64_backend.h | 4 +- src/cpu/backend/x64/x64_block.cc | 32 +- src/cpu/backend/x64/x64_block.h | 4 +- src/cpu/backend/x64/x64_emitter.cc | 1251 +++++++++++++++++ src/cpu/backend/x64/x64_emitter.h | 49 + src/cpu/ir/ir_builder.cc | 30 +- src/cpu/ir/ir_builder.h | 25 + src/cpu/ir/passes/register_allocation_pass.cc | 124 +- src/cpu/ir/passes/register_allocation_pass.h | 1 + src/cpu/runtime.cc | 2 +- src/cpu/sh4.cc | 4 +- src/emu/emulator.cc | 4 +- test/test_sh4.cc | 4 +- 16 files changed, 1470 insertions(+), 149 deletions(-) create mode 100644 src/cpu/backend/x64/x64_emitter.cc create mode 100644 src/cpu/backend/x64/x64_emitter.h diff --git a/CMakeLists.txt b/CMakeLists.txt index 830e3527..9a508648 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -52,6 +52,7 @@ set(DREAVM_SOURCES src/cpu/backend/interpreter/interpreter_callbacks.cc src/cpu/backend/x64/x64_backend.cc src/cpu/backend/x64/x64_block.cc + src/cpu/backend/x64/x64_emitter.cc src/cpu/frontend/sh4/sh4_builder.cc src/cpu/frontend/sh4/sh4_emit.cc src/cpu/frontend/sh4/sh4_frontend.cc @@ -268,6 +269,7 @@ set(DREAVM_TEST_SOURCES src/cpu/backend/interpreter/interpreter_callbacks.cc src/cpu/backend/x64/x64_backend.cc src/cpu/backend/x64/x64_block.cc + src/cpu/backend/x64/x64_emitter.cc src/cpu/frontend/sh4/sh4_builder.cc src/cpu/frontend/sh4/sh4_emit.cc src/cpu/frontend/sh4/sh4_frontend.cc diff --git a/src/core/math.h b/src/core/math.h index d44e84ab..920a5137 100644 --- a/src/core/math.h +++ b/src/core/math.h @@ -8,7 +8,6 @@ template T align(T v, T alignment) { return (v + alignment - 1) & -alignment; } - } } diff --git a/src/cpu/backend/x64/x64_backend.cc b/src/cpu/backend/x64/x64_backend.cc index 63648dd4..d09f5b15 100644 --- 
a/src/cpu/backend/x64/x64_backend.cc +++ b/src/cpu/backend/x64/x64_backend.cc @@ -1,46 +1,28 @@ +#include "core/core.h" #include "cpu/backend/x64/x64_backend.h" #include "cpu/backend/x64/x64_block.h" +using namespace dreavm::core; using namespace dreavm::cpu; using namespace dreavm::cpu::backend; using namespace dreavm::cpu::backend::x64; using namespace dreavm::cpu::ir; using namespace dreavm::emu; -static Register x64_registers[] = {{"rax", VALUE_INT_MASK}, - {"rbx", VALUE_INT_MASK}, - {"rcx", VALUE_INT_MASK}, - {"rdx", VALUE_INT_MASK}, - {"rsi", VALUE_INT_MASK}, - {"rdi", VALUE_INT_MASK}, - {"rbp", VALUE_INT_MASK}, - {"rsp", VALUE_INT_MASK}, - {"r8", VALUE_INT_MASK}, - {"r9", VALUE_INT_MASK}, - {"r10", VALUE_INT_MASK}, - {"r11", VALUE_INT_MASK}, - {"r12", VALUE_INT_MASK}, - {"r13", VALUE_INT_MASK}, - {"r14", VALUE_INT_MASK}, - {"r15", VALUE_INT_MASK}, - {"mm0", VALUE_FLOAT_MASK}, - {"mm1", VALUE_FLOAT_MASK}, - {"mm2", VALUE_FLOAT_MASK}, - {"mm3", VALUE_FLOAT_MASK}, - {"mm4", VALUE_FLOAT_MASK}, - {"mm5", VALUE_FLOAT_MASK}, - {"mm6", VALUE_FLOAT_MASK}, - {"mm7", VALUE_FLOAT_MASK}}; +static Register x64_registers[] = { + {"rbx", VALUE_INT_MASK}, {"rbp", VALUE_INT_MASK}, + {"r12", VALUE_INT_MASK}, {"r13", VALUE_INT_MASK}, + {"r14", VALUE_INT_MASK}, {"r15", VALUE_INT_MASK}, + {"xmm2", VALUE_FLOAT_MASK}, {"xmm3", VALUE_FLOAT_MASK}, + {"xmm4", VALUE_FLOAT_MASK}, {"xmm5", VALUE_FLOAT_MASK}, + {"xmm6", VALUE_FLOAT_MASK}, {"xmm7", VALUE_FLOAT_MASK}}; -static const Xbyak::Reg *reg_map[] = { - &Xbyak::util::rax, &Xbyak::util::rbx, &Xbyak::util::rcx, &Xbyak::util::rdx, - &Xbyak::util::rsi, &Xbyak::util::rdi, &Xbyak::util::rbp, &Xbyak::util::rsp, - &Xbyak::util::r8, &Xbyak::util::r9, &Xbyak::util::r10, &Xbyak::util::r11, - &Xbyak::util::r12, &Xbyak::util::r13, &Xbyak::util::r14, &Xbyak::util::r15, - &Xbyak::util::mm0, &Xbyak::util::mm1, &Xbyak::util::mm2, &Xbyak::util::mm3, - &Xbyak::util::mm4, &Xbyak::util::mm5, &Xbyak::util::mm6, &Xbyak::util::mm7}; - 
-X64Backend::X64Backend(emu::Memory &memory) : Backend(memory) {} +X64Backend::X64Backend(emu::Memory &memory) + : Backend(memory), + // TODO allocate a 32mb buffer for code for now, this needs to be managed + // soon. Freed from when blocks are freed, etc. + codegen_(1024 * 1024 * 32), + emitter_(codegen_) {} X64Backend::~X64Backend() {} @@ -53,36 +35,12 @@ int X64Backend::num_registers() const { bool X64Backend::Init() { return true; } std::unique_ptr X64Backend::AssembleBlock(IRBuilder &builder) { - int guest_cycles = 0; + X64Fn fn = emitter_.Emit(builder); - // 0. LOAD_CONTEXT 40 %0 - // 1. LOAD_CONTEXT 36 %1 - // 2. ADD %0 %1 %2 <--- ideally %0 and %2 should re-use the same register - // 3. STORE_CONTEXT 40 %2 - // 4. LOAD_CONTEXT 16 %3 - // 5. BRANCH %3 + // get number of guest cycles for this block of code + const Value *md_guest_cycles = builder.GetMetadata(MD_GUEST_CYCLES); + CHECK(md_guest_cycles); + int guest_cycles = md_guest_cycles->value(); - // RuntimeContext * is at RCX on Windows, RDI on OSX - - for (auto block : builder.blocks()) { - for (auto instr : block->instrs()) { - if (instr->op() == OP_LOAD_CONTEXT) { - if (instr->arg0()->value() == 40) { - gen_.mov(*reg_map[instr->result()->reg()], gen_.dword[gen_.rdi + 40]); - } else if (instr->arg0()->value() == 36) { - gen_.mov(*reg_map[instr->result()->reg()], gen_.dword[gen_.rdi + 36]); - } - } else if (instr->op() == OP_ADD) { - gen_.add(*reg_map[instr->arg0()->reg()], - *reg_map[instr->arg1()->reg()]); - } else if (instr->op() == OP_STORE_CONTEXT) { - gen_.mov(gen_.dword[gen_.rdi + 40], *reg_map[instr->arg1()->reg()]); - } else if (instr->op() == OP_BRANCH) { - gen_.ret(); - } - } - } - - X64Fn fn = gen_.getCode(); return std::unique_ptr(new X64Block(guest_cycles, fn)); } diff --git a/src/cpu/backend/x64/x64_backend.h b/src/cpu/backend/x64/x64_backend.h index 012a08b5..0aaf1ee0 100644 --- a/src/cpu/backend/x64/x64_backend.h +++ b/src/cpu/backend/x64/x64_backend.h @@ -3,6 +3,7 @@ #include #include 
"cpu/backend/backend.h" +#include "cpu/backend/x64/x64_emitter.h" #include "cpu/runtime.h" namespace dreavm { @@ -22,7 +23,8 @@ class X64Backend : public Backend { std::unique_ptr AssembleBlock(ir::IRBuilder &builder); private: - Xbyak::CodeGenerator gen_; + Xbyak::CodeGenerator codegen_; + X64Emitter emitter_; }; } } diff --git a/src/cpu/backend/x64/x64_block.cc b/src/cpu/backend/x64/x64_block.cc index abe639d0..2c0b6fbd 100644 --- a/src/cpu/backend/x64/x64_block.cc +++ b/src/cpu/backend/x64/x64_block.cc @@ -1,3 +1,5 @@ +#include +#include #include "cpu/backend/x64/x64_backend.h" #include "cpu/backend/x64/x64_block.h" #include "emu/profiler.h" @@ -12,6 +14,32 @@ X64Block::X64Block(int guest_cycles, X64Fn fn) X64Block::~X64Block() {} uint32_t X64Block::Call(emu::Memory *memory, void *guest_ctx) { - fn_(guest_ctx); - return 0xdeadbeef; + return fn_(guest_ctx, memory); +} + +void X64Block::Dump() { + DISASM dsm; + dsm.Archi = 64; + dsm.EIP = (uintptr_t)fn_; + dsm.SecurityBlock = 0; + + while (true) { + int len = Disasm(&dsm); + if (len == OUT_OF_BLOCK) { + LOG(INFO) << "Disasm engine is not allowed to read more memory"; + break; + } else if (len == UNKNOWN_OPCODE) { + LOG(INFO) << "Unknown opcode"; + break; + } + + LOG(INFO) << std::setw(2) << std::hex << std::setfill('0') + << (int)dsm.VirtualAddr << " " << dsm.CompleteInstr; + + if (dsm.Instruction.BranchType == RetType) { + break; + } + + dsm.EIP = dsm.EIP + len; + } } diff --git a/src/cpu/backend/x64/x64_block.h b/src/cpu/backend/x64/x64_block.h index c9ad42b9..d788f804 100644 --- a/src/cpu/backend/x64/x64_block.h +++ b/src/cpu/backend/x64/x64_block.h @@ -4,13 +4,14 @@ #include #include "cpu/ir/ir_builder.h" #include "cpu/runtime.h" +#include "emu/memory.h" namespace dreavm { namespace cpu { namespace backend { namespace x64 { -typedef void (*X64Fn)(void *guest_ctx); +typedef uint32_t (*X64Fn)(void *guest_ctx, emu::Memory *memory); class X64Block : public RuntimeBlock { public: @@ -18,6 +19,7 @@ class X64Block : 
public RuntimeBlock { ~X64Block(); uint32_t Call(emu::Memory *memory, void *guest_ctx); + void Dump(); private: X64Fn fn_; diff --git a/src/cpu/backend/x64/x64_emitter.cc b/src/cpu/backend/x64/x64_emitter.cc new file mode 100644 index 00000000..22c24210 --- /dev/null +++ b/src/cpu/backend/x64/x64_emitter.cc @@ -0,0 +1,1251 @@ +#include "cpu/backend/x64/x64_emitter.h" +#include "emu/memory.h" + +using namespace dreavm::core; +using namespace dreavm::cpu::backend::x64; +using namespace dreavm::cpu::ir; +using namespace dreavm::emu; + +static const Xbyak::Reg *reg_map_8[] = {&Xbyak::util::bl, + &Xbyak::util::bpl, + &Xbyak::util::r12b, + &Xbyak::util::r13b, + &Xbyak::util::r14b, + &Xbyak::util::r15b, + nullptr, + nullptr, + nullptr, + nullptr, + nullptr, + nullptr, + nullptr, + nullptr}; + +static const Xbyak::Reg *reg_map_16[] = {&Xbyak::util::bx, + &Xbyak::util::bp, + &Xbyak::util::r12w, + &Xbyak::util::r13w, + &Xbyak::util::r14w, + &Xbyak::util::r15w, + nullptr, + nullptr, + nullptr, + nullptr, + nullptr, + nullptr, + nullptr, + nullptr}; + +static const Xbyak::Reg *reg_map_32[] = { + &Xbyak::util::ebx, &Xbyak::util::ebp, &Xbyak::util::r12d, + &Xbyak::util::r13d, &Xbyak::util::r14d, &Xbyak::util::r15d, + &Xbyak::util::xmm2, &Xbyak::util::xmm3, &Xbyak::util::xmm4, + &Xbyak::util::xmm5, &Xbyak::util::xmm6, &Xbyak::util::xmm7}; + +static const Xbyak::Reg *reg_map_64[] = { + &Xbyak::util::rbx, &Xbyak::util::rbp, &Xbyak::util::r12, + &Xbyak::util::r13, &Xbyak::util::r14, &Xbyak::util::r15, + &Xbyak::util::xmm2, &Xbyak::util::xmm3, &Xbyak::util::xmm4, + &Xbyak::util::xmm5, &Xbyak::util::xmm6, &Xbyak::util::xmm7}; + +// callbacks for emitting each IR op +typedef void (*X64Emit)(X64Emitter &, Xbyak::CodeGenerator &, const Instr *); + +static X64Emit x64_emitters[NUM_OPCODES]; + +#define EMITTER(op) \ + void op(X64Emitter &e, Xbyak::CodeGenerator &c, const Instr *instr); \ + static struct _x64_##op##_init { \ + _x64_##op##_init() { x64_emitters[OP_##op] = &op; } \ + } 
x64_##op##_init; \ + void op(X64Emitter &e, Xbyak::CodeGenerator &c, const Instr *instr) + +X64Emitter::X64Emitter(Xbyak::CodeGenerator &codegen) + : c_(codegen), operand_arena_(1024) {} + +X64Fn X64Emitter::Emit(IRBuilder &builder) { + // getCurr returns the current spot in the codegen buffer which the function + // is about to emitted to + X64Fn fn = c_.getCurr(); + + // reset arena holding temporary operands used during emitting + operand_arena_.Reset(); + + // stack must be 16 byte aligned + // TODO align each local + int stack_size = 16 + builder.locals_size(); + // add 8 for function return value which will be pushed when this is called + stack_size = align(stack_size, 16) + 8; + assert((stack_size + 8) % 16 == 0); + + c_.inLocalLabel(); + + // emit prolog + // FIXME only push registers that're used + c_.push(Xbyak::util::rbx); + c_.push(Xbyak::util::rbp); + c_.push(Xbyak::util::r12); + c_.push(Xbyak::util::r13); + c_.push(Xbyak::util::r14); + c_.push(Xbyak::util::r15); + + // reserve stack space for rdi copy + c_.sub(Xbyak::util::rsp, stack_size); + c_.mov(c_.qword[Xbyak::util::rsp + STACK_OFFSET_GUEST_CONTEXT], + Xbyak::util::rdi); + c_.mov(c_.qword[Xbyak::util::rsp + STACK_OFFSET_MEMORY], Xbyak::util::rsi); + + // assign ordinals for each block + int ordinal = 0; + for (auto block : builder.blocks()) { + block->set_tag(ordinal++); + } + + for (auto block : builder.blocks()) { + // generate label for this ordinal + c_.L("." 
+ std::to_string((int)block->tag())); + + for (auto instr : block->instrs()) { + X64Emit emit = x64_emitters[instr->op()]; + CHECK(emit) << "Failed to find emitter for " << Opnames[instr->op()]; + emit(*this, c_, instr); + } + } + + // emit prolog + c_.L(".epilog"); + + // reset stack + c_.add(Xbyak::util::rsp, stack_size); + + // TODO only pop registers that're used + c_.pop(Xbyak::util::r15); + c_.pop(Xbyak::util::r14); + c_.pop(Xbyak::util::r13); + c_.pop(Xbyak::util::r12); + c_.pop(Xbyak::util::rbp); + c_.pop(Xbyak::util::rbx); + + c_.ret(); + + c_.outLocalLabel(); + c_.align(16); + + // patch up relocations + c_.ready(); + + // return the start of the buffer + return fn; +} + +// Get the register / local allocated for the supplied value. The size argument +// can be overridden to get a truncated version of the value. +const Xbyak::Operand &X64Emitter::GetOperand(const Value *v, int size) { + if (size == -1) { + size = SizeForType(v->type()); + } + + if (v->reg() != NO_REGISTER) { + const Xbyak::Reg *reg = nullptr; + + switch (size) { + case 8: + reg = reg_map_64[v->reg()]; + break; + case 4: + reg = reg_map_32[v->reg()]; + break; + case 2: + reg = reg_map_16[v->reg()]; + break; + case 1: + reg = reg_map_8[v->reg()]; + break; + } + + CHECK_NOTNULL(reg); + + return *reg; + } else if (v->local() != NO_SLOT) { + Xbyak::Address *addr = operand_arena_.Alloc(); + + int offset = STACK_OFFSET_LOCALS + v->local(); + + switch (size) { + case 8: + *addr = c_.qword[Xbyak::util::rsp + offset]; + break; + case 4: + *addr = c_.dword[Xbyak::util::rsp + offset]; + break; + case 2: + *addr = c_.word[Xbyak::util::rsp + offset]; + break; + case 1: + *addr = c_.byte[Xbyak::util::rsp + offset]; + break; + } + + CHECK_NOTNULL(addr); + + return *addr; + } + + LOG(FATAL) << "Value was not allocated a register or local"; +} + +// If the value is a constant, copy it to the temporary operand, else return +// the local or register allocated for it. 
+const Xbyak::Operand &X64Emitter::GetOperand(const Value *v, + const Xbyak::Operand &tmp) { + if (v->reg() == NO_REGISTER && v->local() == NO_SLOT) { + // copy constant to tmp + CopyOperand(v, tmp); + return tmp; + } + + return GetOperand(v); +} + +// If the value is a local or constant, copy it to the tempory register, else +// return the register allocated for it. +const Xbyak::Reg &X64Emitter::GetRegister(const Value *v, + const Xbyak::Reg &tmp) { + if (v->reg() == NO_REGISTER) { + // copy local / constant to mp + CopyOperand(v, tmp); + return tmp; + } + + return reinterpret_cast(GetOperand(v)); +} + +// If the value isn't allocated a XMM register copy it to the temporary XMM, +// register, else return the XMM register allocated for it. +const Xbyak::Xmm &X64Emitter::GetXMMRegister(const Value *v, + const Xbyak::Xmm &tmp) { + const Xbyak::Operand &op = GetOperand(v); + + if (!op.isXMM()) { + CopyOperand(v, tmp); + return tmp; + } + + return reinterpret_cast(op); +} + +// If the prefered operand is an XMM register, copy the value to it and return, +// else do the regular GetXMMRegister lookup. +const Xbyak::Xmm &X64Emitter::GetXMMRegister(const Value *v, + const Xbyak::Operand &prefered, + const Xbyak::Xmm &tmp) { + if (prefered.isXMM()) { + CopyOperand(v, prefered); + return reinterpret_cast(prefered); + } + + return GetXMMRegister(v, tmp); +} + +// Copy the value from src to dst if they're not the same operand. +// TODO when copying XMM registers during SIN / COS a movdqa isn't actually +// necessary (could pass in size info to perform movss / movsd). bummer that +// there isn't xmm0d, etc. 
+void X64Emitter::CopyOperand(const Xbyak::Operand &from, + const Xbyak::Operand &to) { + if (from == to) { + return; + } + + if (to.isXMM()) { + if (from.isXMM()) { + c_.movdqa(reinterpret_cast(to), from); + } else if (from.isBit(32)) { + c_.movss(reinterpret_cast(to), from); + } else if (from.isBit(64)) { + c_.movsd(reinterpret_cast(to), from); + } else { + LOG(FATAL) << "Unsupported copy"; + } + } else if (from.isXMM()) { + CHECK(to.isMEM()) << "Expected destination to be a memory address"; + + if (to.isBit(32)) { + c_.movss(reinterpret_cast(to), + reinterpret_cast(from)); + } else if (to.isBit(64)) { + c_.movsd(reinterpret_cast(to), + reinterpret_cast(from)); + } else { + LOG(FATAL) << "Unsupported copy"; + } + } else { + c_.mov(to, from); + } +} + +// Copy the value to the supplied operand. +void X64Emitter::CopyOperand(const Value *v, const Xbyak::Operand &to) { + if (v->constant()) { + if (to.isXMM()) { + if (v->type() == VALUE_F32) { + float val = v->value(); + c_.mov(c_.r8d, *reinterpret_cast(&val)); + c_.movd(reinterpret_cast(to), c_.r8d); + } else { + double val = v->value(); + c_.mov(c_.r8, *reinterpret_cast(&val)); + c_.movq(reinterpret_cast(to), c_.r8); + } + } else { + c_.mov(to, v->GetZExtValue()); + } + } else { + const Xbyak::Operand &from = GetOperand(v); + + CopyOperand(from, to); + } +} + +bool X64Emitter::CanEncodeAsImmediate(const Value *v) { + if (!v->constant()) { + return false; + } + + return v->type() <= VALUE_I32; +} + +EMITTER(LOAD_CONTEXT) { + int offset = instr->arg0()->value(); + int result_sz = SizeForType(instr->result()->type()); + const Xbyak::Operand &result = e.GetOperand(instr->result()); + + if (result.isXMM()) { + const Xbyak::Xmm &result_xmm = reinterpret_cast(result); + + if (result_sz == 4) { + c.movss(result_xmm, c.dword[c.rdi + offset]); + } else if (result_sz == 8) { + c.movsd(result_xmm, c.qword[c.rdi + offset]); + } + } else { + if (result_sz == 1) { + c.mov(result, c.byte[c.rdi + offset]); + } else if (result_sz == 
2) { + c.mov(result, c.word[c.rdi + offset]); + } else if (result_sz == 4) { + c.mov(result, c.dword[c.rdi + offset]); + } else if (result_sz == 8) { + c.mov(result, c.qword[c.rdi + offset]); + } + } +} + +EMITTER(STORE_CONTEXT) { + int offset = instr->arg0()->value(); + int data_sz = SizeForType(instr->arg1()->type()); + + if (instr->arg1()->constant()) { + if (data_sz == 1) { + c.mov(c.byte[c.rdi + offset], instr->arg1()->value()); + } else if (data_sz == 2) { + c.mov(c.word[c.rdi + offset], instr->arg1()->value()); + } else if (data_sz == 4) { + c.mov(c.dword[c.rdi + offset], instr->arg1()->value()); + } else if (data_sz == 8) { + c.mov(c.qword[c.rdi + offset], instr->arg1()->value()); + } + } else { + const Xbyak::Operand &src = e.GetOperand(instr->arg1()); + + if (src.isXMM()) { + const Xbyak::Xmm &src_xmm = reinterpret_cast(src); + + if (data_sz == 4) { + c.movss(c.dword[c.rdi + offset], src_xmm); + } else if (data_sz == 8) { + c.movsd(c.qword[c.rdi + offset], src_xmm); + } + } else { + if (data_sz == 1) { + c.mov(c.byte[c.rdi + offset], src); + } else if (data_sz == 2) { + c.mov(c.word[c.rdi + offset], src); + } else if (data_sz == 4) { + c.mov(c.dword[c.rdi + offset], src); + } else if (data_sz == 8) { + c.mov(c.qword[c.rdi + offset], src); + } + } + } +} + +uint8_t R8(Memory *memory, uint32_t addr) { return memory->R8(addr); } +uint16_t R16(Memory *memory, uint32_t addr) { return memory->R16(addr); } +uint32_t R32(Memory *memory, uint32_t addr) { return memory->R32(addr); } +uint64_t R64(Memory *memory, uint32_t addr) { return memory->R64(addr); } +EMITTER(LOAD) { + const Xbyak::Operand &result = e.GetOperand(instr->result()); + + void *fn = nullptr; + switch (instr->result()->type()) { + case VALUE_I8: + fn = reinterpret_cast(&R8); + break; + case VALUE_I16: + fn = reinterpret_cast(&R16); + break; + case VALUE_I32: + fn = reinterpret_cast(&R32); + break; + case VALUE_I64: + fn = reinterpret_cast(&R64); + break; + default: + CHECK(false); + break; + } + + 
// setup arguments + c.mov(c.rdi, c.rsi); + e.CopyOperand(instr->arg0(), c.rsi); + + // call func + c.mov(c.rax, (uintptr_t)fn); + c.call(c.rax); + + // copy off result + c.mov(result, c.rax); + + // restore rdi / rsi + c.mov(Xbyak::util::rdi, + c.qword[Xbyak::util::rsp + STACK_OFFSET_GUEST_CONTEXT]); + c.mov(Xbyak::util::rsi, c.qword[Xbyak::util::rsp + STACK_OFFSET_MEMORY]); +} + +void W8(Memory *memory, uint32_t addr, uint8_t v) { memory->W8(addr, v); } +void W16(Memory *memory, uint32_t addr, uint16_t v) { memory->W16(addr, v); } +void W32(Memory *memory, uint32_t addr, uint32_t v) { memory->W32(addr, v); } +void W64(Memory *memory, uint32_t addr, uint64_t v) { memory->W64(addr, v); } +EMITTER(STORE) { + void *fn = nullptr; + switch (instr->arg1()->type()) { + case VALUE_I8: + fn = reinterpret_cast(&W8); + break; + case VALUE_I16: + fn = reinterpret_cast(&W16); + break; + case VALUE_I32: + fn = reinterpret_cast(&W32); + break; + case VALUE_I64: + fn = reinterpret_cast(&W64); + break; + default: + CHECK(false); + break; + } + + // setup arguments + c.mov(c.rdi, c.rsi); + e.CopyOperand(instr->arg0(), c.rsi); + e.CopyOperand(instr->arg1(), c.rdx); + + // call func + c.mov(c.rax, (uintptr_t)fn); + c.call(c.rax); + + // restore rdi / rsi + c.mov(Xbyak::util::rdi, + c.qword[Xbyak::util::rsp + STACK_OFFSET_GUEST_CONTEXT]); + c.mov(Xbyak::util::rsi, c.qword[Xbyak::util::rsp + STACK_OFFSET_MEMORY]); +} + +EMITTER(CAST) { + const Xbyak::Operand &result = e.GetOperand(instr->result()); + const Xbyak::Operand &a = e.GetOperand(instr->arg0(), c.rax); + + switch (instr->result()->type()) { + case VALUE_I32: + CHECK_EQ(instr->arg0()->type(), VALUE_F32); + c.cvttss2si(result, a); + break; + case VALUE_I64: + CHECK_EQ(instr->arg0()->type(), VALUE_F64); + c.cvttsd2si(result, a); + break; + case VALUE_F32: + CHECK_EQ(instr->arg0()->type(), VALUE_I32); + c.cvtsi2ss(result, a); + break; + case VALUE_F64: + CHECK_EQ(instr->arg0()->type(), VALUE_I64); + c.cvtsi2sd(result, a); + break; 
+ default: + CHECK(false); + break; + } +} + +EMITTER(SEXT) { + const Xbyak::Operand &result = e.GetOperand(instr->result()); + const Xbyak::Operand &a = e.GetOperand(instr->arg0()); + + if (a == result) { + // already the correct width + return; + } + + const Xbyak::Reg &tmp = e.GetRegister(instr->result(), c.rax); + + if (result.isBit(64) && a.isBit(32)) { + c.movsxd(tmp.cvt64(), a); + } else { + c.movsx(tmp, a); + } + + if (tmp != result) { + c.mov(result, tmp); + } +} + +EMITTER(ZEXT) { + const Xbyak::Operand &result = e.GetOperand(instr->result()); + const Xbyak::Operand &a = e.GetOperand(instr->arg0()); + + if (a == result) { + // already the correct width + return; + } + + const Xbyak::Reg &tmp = e.GetRegister(instr->result(), c.rax); + + if (result.isBit(64)) { + // mov will automatically zero fill the upper 32-bits + c.mov(tmp.cvt32(), a); + } else { + c.movzx(tmp, a); + } + + if (tmp != result) { + c.mov(result, tmp); + } +} + +EMITTER(TRUNCATE) { + const Xbyak::Operand &result = e.GetOperand(instr->result()); + const Xbyak::Operand &a = e.GetOperand(instr->arg0()); + + if (a == result) { + // already the correct width + return; + } + + const Xbyak::Reg &tmp = e.GetRegister(instr->result(), c.rax); + const Xbyak::Operand &truncated = + e.GetOperand(instr->arg0(), result.getBit() >> 3); + + // TODO fixme tmp should be size appropriate, c.mov is unnecesary, only need + // movzx once tmp is correct + if (truncated.isBit(32)) { + c.mov(result, truncated); + } else { + c.movzx(tmp.cvt32(), truncated); + if (tmp != result) { + c.mov(result, tmp); + } + } +} + +EMITTER(SELECT) { + const Xbyak::Reg &cond = e.GetRegister(instr->arg0(), c.rax); + + c.test(cond, cond); + + const Xbyak::Operand &result = e.GetOperand(instr->result()); + const Xbyak::Operand &a = e.GetOperand(instr->arg1(), c.rax); + const Xbyak::Operand &b = e.GetOperand(instr->arg2(), c.rcx); + const Xbyak::Reg &tmp = e.GetRegister(instr->result(), c.rdx); + + c.cmovnz(tmp.cvt32(), a); + 
c.cmovz(tmp.cvt32(), b); + + if (tmp != result) { + c.mov(result, tmp); + } +} + +EMITTER(EQ) { + const Xbyak::Operand &result = e.GetOperand(instr->result()); + + if (IsFloatType(instr->arg0()->type())) { + const Xbyak::Xmm &a = e.GetXMMRegister(instr->arg0(), c.xmm0); + const Xbyak::Operand &b = e.GetOperand(instr->arg1()); + + if (instr->arg0()->type() == VALUE_F32) { + c.comiss(a, b); + } else { + c.comisd(a, b); + } + } else { + const Xbyak::Operand &a = e.GetOperand(instr->arg0(), result); + + if (e.CanEncodeAsImmediate(instr->arg1())) { + c.cmp(a, (uint32_t)instr->arg1()->GetZExtValue()); + } else { + const Xbyak::Operand &b = e.GetOperand(instr->arg1(), c.rax); + c.cmp(a, b); + } + } + + c.sete(result); +} + +EMITTER(NE) { + const Xbyak::Operand &result = e.GetOperand(instr->result()); + + if (IsFloatType(instr->arg0()->type())) { + const Xbyak::Xmm &a = e.GetXMMRegister(instr->arg0(), c.xmm0); + const Xbyak::Operand &b = e.GetOperand(instr->arg1()); + + if (instr->arg0()->type() == VALUE_F32) { + c.comiss(a, b); + } else { + c.comisd(a, b); + } + } else { + const Xbyak::Operand &a = e.GetOperand(instr->arg0(), result); + + if (e.CanEncodeAsImmediate(instr->arg1())) { + c.cmp(a, (uint32_t)instr->arg1()->GetZExtValue()); + } else { + const Xbyak::Operand &b = e.GetOperand(instr->arg1(), c.rax); + c.cmp(a, b); + } + } + + c.setne(result); +} + +EMITTER(SGE) { + const Xbyak::Operand &result = e.GetOperand(instr->result()); + + if (IsFloatType(instr->arg0()->type())) { + const Xbyak::Xmm &a = e.GetXMMRegister(instr->arg0(), c.xmm0); + const Xbyak::Operand &b = e.GetOperand(instr->arg1(), c.xmm1); + + if (instr->arg0()->type() == VALUE_F32) { + c.comiss(a, b); + } else { + c.comisd(a, b); + } + + c.setae(result); + } else { + const Xbyak::Operand &a = e.GetOperand(instr->arg0(), c.rax); + + if (e.CanEncodeAsImmediate(instr->arg1())) { + c.cmp(a, (uint32_t)instr->arg1()->GetZExtValue()); + } else { + const Xbyak::Operand &b = e.GetOperand(instr->arg1(), c.rcx); + 
c.cmp(a, b); + } + + c.setge(result); + } +} + +EMITTER(SGT) { + const Xbyak::Operand &result = e.GetOperand(instr->result()); + + if (IsFloatType(instr->arg0()->type())) { + const Xbyak::Xmm &a = e.GetXMMRegister(instr->arg0(), c.xmm0); + const Xbyak::Operand &b = e.GetOperand(instr->arg1(), c.xmm1); + + if (instr->arg0()->type() == VALUE_F32) { + c.comiss(a, b); + } else { + c.comisd(a, b); + } + + c.seta(result); + } else { + const Xbyak::Operand &a = e.GetOperand(instr->arg0(), c.rax); + + if (e.CanEncodeAsImmediate(instr->arg1())) { + c.cmp(a, (uint32_t)instr->arg1()->GetZExtValue()); + } else { + const Xbyak::Operand &b = e.GetOperand(instr->arg1(), c.rcx); + c.cmp(a, b); + } + + c.setg(result); + } +} + +EMITTER(UGE) { + const Xbyak::Operand &result = e.GetOperand(instr->result()); + const Xbyak::Operand &a = e.GetOperand(instr->arg0(), c.rax); + + if (e.CanEncodeAsImmediate(instr->arg1())) { + c.cmp(a, (uint32_t)instr->arg1()->GetZExtValue()); + } else { + const Xbyak::Operand &b = e.GetOperand(instr->arg1(), c.rcx); + c.cmp(a, b); + } + + c.setae(result); +} + +EMITTER(UGT) { + const Xbyak::Operand &result = e.GetOperand(instr->result()); + const Xbyak::Operand &a = e.GetOperand(instr->arg0(), c.rax); + + if (e.CanEncodeAsImmediate(instr->arg1())) { + c.cmp(a, (uint32_t)instr->arg1()->GetZExtValue()); + } else { + const Xbyak::Operand &b = e.GetOperand(instr->arg1(), c.rcx); + c.cmp(a, b); + } + + c.seta(result); +} + +EMITTER(SLE) { + const Xbyak::Operand &result = e.GetOperand(instr->result()); + + if (IsFloatType(instr->arg0()->type())) { + const Xbyak::Xmm &a = e.GetXMMRegister(instr->arg0(), c.xmm0); + const Xbyak::Operand &b = e.GetOperand(instr->arg1(), c.xmm1); + + if (instr->arg0()->type() == VALUE_F32) { + c.comiss(a, b); + } else { + c.comisd(a, b); + } + + c.setbe(result); + } else { + const Xbyak::Operand &a = e.GetOperand(instr->arg0(), c.rax); + + if (e.CanEncodeAsImmediate(instr->arg1())) { + c.cmp(a, 
(uint32_t)instr->arg1()->GetZExtValue()); + } else { + const Xbyak::Operand &b = e.GetOperand(instr->arg1(), c.rcx); + c.cmp(a, b); + } + + c.setle(result); + } +} + +EMITTER(SLT) { + const Xbyak::Operand &result = e.GetOperand(instr->result()); + + if (IsFloatType(instr->arg0()->type())) { + const Xbyak::Xmm &a = e.GetXMMRegister(instr->arg0(), c.xmm0); + const Xbyak::Operand &b = e.GetOperand(instr->arg1(), c.xmm1); + + if (instr->arg0()->type() == VALUE_F32) { + c.comiss(a, b); + } else { + c.comisd(a, b); + } + + c.setb(result); + } else { + const Xbyak::Operand &a = e.GetOperand(instr->arg0(), c.rax); + + if (e.CanEncodeAsImmediate(instr->arg1())) { + c.cmp(a, (uint32_t)instr->arg1()->GetZExtValue()); + } else { + const Xbyak::Operand &b = e.GetOperand(instr->arg1(), c.rcx); + c.cmp(a, b); + } + + c.setl(result); + } +} + +EMITTER(ULE) { + const Xbyak::Operand &result = e.GetOperand(instr->result()); + const Xbyak::Operand &a = e.GetOperand(instr->arg0(), c.rax); + + if (e.CanEncodeAsImmediate(instr->arg1())) { + c.cmp(a, (uint32_t)instr->arg1()->GetZExtValue()); + } else { + const Xbyak::Operand &b = e.GetOperand(instr->arg1(), c.rcx); + c.cmp(a, b); + } + + c.setbe(result); +} + +EMITTER(ULT) { + const Xbyak::Operand &result = e.GetOperand(instr->result()); + const Xbyak::Operand &a = e.GetOperand(instr->arg0(), c.rax); + + if (e.CanEncodeAsImmediate(instr->arg1())) { + c.cmp(a, (uint32_t)instr->arg1()->GetZExtValue()); + } else { + const Xbyak::Operand &b = e.GetOperand(instr->arg1(), c.rcx); + c.cmp(a, b); + } + + c.setb(result); +} + +EMITTER(ADD) { + const Xbyak::Operand &result = e.GetOperand(instr->result()); + + if (IsFloatType(instr->result()->type())) { + const Xbyak::Xmm &a = e.GetXMMRegister(instr->arg0(), result, c.xmm0); + const Xbyak::Operand &b = e.GetOperand(instr->arg1(), c.xmm1); + + if (instr->result()->type() == VALUE_F32) { + c.addss(a, b); + } else { + c.addsd(a, b); + } + + e.CopyOperand(a, result); + } else { + 
e.CopyOperand(instr->arg0(), result); + + if (e.CanEncodeAsImmediate(instr->arg1())) { + c.add(result, (uint32_t)instr->arg1()->GetZExtValue()); + } else { + const Xbyak::Operand &b = e.GetOperand(instr->arg1(), c.rax); + c.add(result, b); + } + } +} + +EMITTER(SUB) { + const Xbyak::Operand &result = e.GetOperand(instr->result()); + + if (IsFloatType(instr->result()->type())) { + const Xbyak::Xmm &a = e.GetXMMRegister(instr->arg0(), result, c.xmm0); + const Xbyak::Operand &b = e.GetOperand(instr->arg1(), c.xmm1); + + if (instr->result()->type() == VALUE_F32) { + c.subss(a, b); + } else { + c.subsd(a, b); + } + + e.CopyOperand(a, result); + } else { + e.CopyOperand(instr->arg0(), result); + + if (e.CanEncodeAsImmediate(instr->arg1())) { + c.sub(result, (uint32_t)instr->arg1()->GetZExtValue()); + } else { + const Xbyak::Operand &b = e.GetOperand(instr->arg1(), c.rax); + c.sub(result, b); + } + } +} + +EMITTER(SMUL) { + const Xbyak::Operand &result = e.GetOperand(instr->result()); + + if (IsFloatType(instr->result()->type())) { + const Xbyak::Xmm &a = e.GetXMMRegister(instr->arg0(), result, c.xmm0); + const Xbyak::Operand &b = e.GetOperand(instr->arg1(), c.xmm1); + + if (instr->result()->type() == VALUE_F32) { + c.mulss(a, b); + } else { + c.mulsd(a, b); + } + + e.CopyOperand(a, result); + } else { + const Xbyak::Reg &tmp = e.GetRegister(instr->result(), c.rax); + const Xbyak::Operand &b = e.GetOperand(instr->arg1(), c.rax); + + c.imul(tmp, b); + + if (tmp != result) { + c.mov(result, tmp); + } + } +} + +EMITTER(UMUL) { + const Xbyak::Operand &result = e.GetOperand(instr->result()); + const Xbyak::Reg &tmp = e.GetRegister(instr->result(), c.rax); + const Xbyak::Operand &b = e.GetOperand(instr->arg1(), c.rax); + + c.imul(tmp, b); + + if (tmp != result) { + c.mov(result, tmp); + } +} + +// TODO could optimize by having a sdiv / udiv. no need to sign extend the +// accumulation register for udiv. 
+EMITTER(DIV) { + const Xbyak::Operand &result = e.GetOperand(instr->result()); + + if (IsFloatType(instr->result()->type())) { + const Xbyak::Xmm &a = e.GetXMMRegister(instr->arg0(), result, c.xmm0); + const Xbyak::Operand &b = e.GetOperand(instr->arg1(), c.xmm1); + + if (instr->result()->type() == VALUE_F32) { + c.divss(a, b); + } else { + c.divsd(a, b); + } + + e.CopyOperand(a, result); + } else { + e.CopyOperand(instr->arg0(), result); + + if (e.CanEncodeAsImmediate(instr->arg1())) { + switch (instr->result()->type()) { + case VALUE_I8: + c.mov(c.al, instr->arg1()->value()); + break; + case VALUE_I16: + c.mov(c.ax, instr->arg1()->value()); + c.cwd(); + break; + case VALUE_I32: + c.mov(c.eax, instr->arg1()->value()); + c.cdq(); + break; + case VALUE_I64: + c.mov(c.rax, instr->arg1()->value()); + c.cqo(); + break; + default: + LOG(FATAL) << "Unexpected result type"; + break; + } + } else { + const Xbyak::Operand &b = e.GetOperand(instr->arg1(), c.rax); + + switch (instr->result()->type()) { + case VALUE_I8: + c.mov(c.al, b); + break; + case VALUE_I16: + c.mov(c.ax, b); + c.cwd(); + break; + case VALUE_I32: + c.mov(c.eax, b); + c.cdq(); + break; + case VALUE_I64: + c.mov(c.rax, b); + c.cqo(); + break; + default: + LOG(FATAL) << "Unexpected result type"; + break; + } + } + + c.idiv(result); + } +} + +EMITTER(NEG) { + const Xbyak::Operand &result = e.GetOperand(instr->result()); + + if (IsFloatType(instr->result()->type())) { + const Xbyak::Xmm &a = e.GetXMMRegister(instr->arg0(), result, c.xmm0); + + if (instr->result()->type() == VALUE_F32) { + // TODO use xorps + c.movd(c.eax, a); + c.mov(c.ecx, (uint32_t)0x80000000); + c.xor (c.eax, c.ecx); + if (result.isXMM()) { + c.movd(reinterpret_cast(result), c.eax); + } else { + c.mov(result, c.eax); + } + } else { + // TODO use xorpd + c.movq(c.rax, a); + c.mov(c.rcx, (uint64_t)0x8000000000000000); + c.xor (c.rax, c.rcx); + if (result.isXMM()) { + c.movq(reinterpret_cast(result), c.rax); + } else { + c.mov(result, 
c.rax); + } + } + } else { + e.CopyOperand(instr->arg0(), result); + + c.neg(result); + } +} + +EMITTER(SQRT) { + const Xbyak::Operand &result = e.GetOperand(instr->result()); + const Xbyak::Operand &a = e.GetOperand(instr->arg0(), c.xmm0); + + const Xbyak::Xmm &tmp = + result.isXMM() ? reinterpret_cast(result) : c.xmm1; + + if (instr->result()->type() == VALUE_F32) { + c.sqrtss(tmp, a); + } else { + c.sqrtsd(tmp, a); + } + + e.CopyOperand(tmp, result); +} + +EMITTER(ABS) { + const Xbyak::Operand &result = e.GetOperand(instr->result()); + + if (IsFloatType(instr->result()->type())) { + const Xbyak::Xmm &a = e.GetXMMRegister(instr->arg0(), result, c.xmm0); + + if (instr->result()->type() == VALUE_F32) { + // TODO use andps + c.movd(c.eax, a); + c.mov(c.ecx, (uint32_t)0x7fffffff); + c.and (c.eax, c.ecx); + if (result.isXMM()) { + c.movd(reinterpret_cast(result), c.eax); + } else { + c.mov(result, c.eax); + } + } else { + // TODO use andpd + c.movq(c.rax, a); + c.mov(c.rcx, (uint64_t)0x7fffffffffffffff); + c.and (c.rax, c.rcx); + if (result.isXMM()) { + c.movq(reinterpret_cast(result), c.rax); + } else { + c.mov(result, c.rax); + } + } + } else { + LOG(FATAL) << "Verify this works"; + // c.mov(c.rax, *result); + // c.neg(c.rax); + // c.cmovl(reinterpret_cast(result)->cvt32(), c.rax); + } +} + +EMITTER(SIN) { + const Xbyak::Operand &result = e.GetOperand(instr->result()); + + if (instr->result()->type() == VALUE_F32) { + e.CopyOperand(instr->arg0(), c.xmm0); + c.mov(c.rax, (uint64_t)&sinf); + c.call(c.rax); + if (result.isXMM()) { + c.movss(reinterpret_cast(result), c.xmm0); + } else { + c.movss(reinterpret_cast(result), c.xmm0); + } + } else { + e.CopyOperand(instr->arg0(), c.xmm0); + c.mov(c.rax, (uint64_t)&sin); + c.call(c.rax); + if (result.isXMM()) { + c.movsd(reinterpret_cast(result), c.xmm0); + } else { + c.movsd(reinterpret_cast(result), c.xmm0); + } + } + + // restore rdi / rsi + c.mov(Xbyak::util::rdi, + c.qword[Xbyak::util::rsp + 
STACK_OFFSET_GUEST_CONTEXT]); + c.mov(Xbyak::util::rsi, c.qword[Xbyak::util::rsp + STACK_OFFSET_MEMORY]); +} + +EMITTER(COS) { + const Xbyak::Operand &result = e.GetOperand(instr->result()); + + if (instr->result()->type() == VALUE_F32) { + e.CopyOperand(instr->arg0(), c.xmm0); + c.mov(c.rax, (uint64_t)&cosf); + c.call(c.rax); + if (result.isXMM()) { + c.movss(reinterpret_cast(result), c.xmm0); + } else { + c.movss(reinterpret_cast(result), c.xmm0); + } + } else { + e.CopyOperand(instr->arg0(), c.xmm0); + c.mov(c.rax, (uint64_t)&cos); + c.call(c.rax); + if (result.isXMM()) { + c.movsd(reinterpret_cast(result), c.xmm0); + } else { + c.movsd(reinterpret_cast(result), c.xmm0); + } + } + + // restore rdi / rsi + c.mov(Xbyak::util::rdi, + c.qword[Xbyak::util::rsp + STACK_OFFSET_GUEST_CONTEXT]); + c.mov(Xbyak::util::rsi, c.qword[Xbyak::util::rsp + STACK_OFFSET_MEMORY]); +} + +EMITTER(AND) { + const Xbyak::Operand &result = e.GetOperand(instr->result()); + + e.CopyOperand(instr->arg0(), result); + + if (e.CanEncodeAsImmediate(instr->arg1())) { + c.and (result, (uint32_t)instr->arg1()->GetZExtValue()); + } else { + const Xbyak::Operand &b = e.GetOperand(instr->arg1(), c.rax); + c.and (result, b); + } +} + +EMITTER(OR) { + const Xbyak::Operand &result = e.GetOperand(instr->result()); + + e.CopyOperand(instr->arg0(), result); + + if (e.CanEncodeAsImmediate(instr->arg1())) { + c.or (result, (uint32_t)instr->arg1()->GetZExtValue()); + } else { + const Xbyak::Operand &b = e.GetOperand(instr->arg1(), c.rax); + c.or (result, b); + } +} + +EMITTER(XOR) { + const Xbyak::Operand &result = e.GetOperand(instr->result()); + + e.CopyOperand(instr->arg0(), result); + + if (e.CanEncodeAsImmediate(instr->arg1())) { + c.xor (result, (uint32_t)instr->arg1()->GetZExtValue()); + } else { + const Xbyak::Operand &b = e.GetOperand(instr->arg1(), c.rax); + c.xor (result, b); + } +} + +EMITTER(NOT) { + const Xbyak::Operand &result = e.GetOperand(instr->result()); + + e.CopyOperand(instr->arg0(), 
result); + + c.not(result); +} + +EMITTER(SHL) { + const Xbyak::Operand &result = e.GetOperand(instr->result()); + + e.CopyOperand(instr->arg0(), result); + + if (e.CanEncodeAsImmediate(instr->arg1())) { + c.shl(result, (int)instr->arg1()->GetZExtValue()); + } else { + e.CopyOperand(instr->arg1(), c.cl); + c.shl(result, c.cl); + } +} + +EMITTER(ASHR) { + const Xbyak::Operand &result = e.GetOperand(instr->result()); + + e.CopyOperand(instr->arg0(), result); + + if (e.CanEncodeAsImmediate(instr->arg1())) { + c.sar(result, (int)instr->arg1()->GetZExtValue()); + } else { + e.CopyOperand(instr->arg1(), c.cl); + c.sar(result, c.cl); + } +} + +EMITTER(LSHR) { + const Xbyak::Operand &result = e.GetOperand(instr->result()); + + e.CopyOperand(instr->arg0(), result); + + if (e.CanEncodeAsImmediate(instr->arg1())) { + c.shr(result, (int)instr->arg1()->GetZExtValue()); + } else { + e.CopyOperand(instr->arg1(), c.cl); + c.shr(result, c.cl); + } +} + +EMITTER(BRANCH) { + if (instr->arg0()->type() == VALUE_BLOCK) { + // jump to local block + // TODO T_NEAR necessary? + Block *dst = instr->arg0()->value(); + int ordinal = dst->tag(); + c.jmp("." + std::to_string(ordinal), Xbyak::CodeGenerator::T_NEAR); + } else { + // return if we need to branch to a far block + e.CopyOperand(instr->arg0(), c.rax); + + c.jmp(".epilog"); + } +} + +EMITTER(BRANCH_COND) { + const Xbyak::Reg &cond = e.GetRegister(instr->arg0(), c.rax); + + c.test(cond, cond); + + // TODO in most cases, arg0 or arg1 are going to be "next block", fall through + // instead of having an explicit jump + + // if both blocks are a local block this is easy + if (instr->arg1()->type() == VALUE_BLOCK && + instr->arg2()->type() == VALUE_BLOCK) { + // jump to local block + Block *block_true = instr->arg1()->value(); + Block *block_false = instr->arg2()->value(); + + // TODO T_NEAR? + c.jnz("." + std::to_string((int)block_true->tag()), + Xbyak::CodeGenerator::T_NEAR); + c.je("." 
+ std::to_string((int)block_false->tag()), + Xbyak::CodeGenerator::T_NEAR); + } + // if both blocks are a far block this is easy + else if (instr->arg1()->type() != VALUE_BLOCK && + instr->arg2()->type() != VALUE_BLOCK) { + // return if we need to branch to a far block + const Xbyak::Operand &op_true = e.GetOperand(instr->arg1(), c.rax); + const Xbyak::Operand &op_false = e.GetOperand(instr->arg2(), c.rcx); + + c.cmovnz(c.eax, op_true); + c.cmovz(c.eax, op_false); + c.jmp(".epilog"); + } + // if they are mixed, do local block test first, far block second + else { + LOG(FATAL) << "Unsupported mixed mode conditional branch"; + } +} + +EMITTER(CALL_EXTERNAL) { + // rdi is already pointing to guest_ctx + uint64_t addr = instr->arg0()->GetZExtValue(); + c.mov(c.rax, addr); + c.call(c.rax); + + // restore rdi / rsi + c.mov(Xbyak::util::rdi, + c.qword[Xbyak::util::rsp + STACK_OFFSET_GUEST_CONTEXT]); + c.mov(Xbyak::util::rsi, c.qword[Xbyak::util::rsp + STACK_OFFSET_MEMORY]); +} diff --git a/src/cpu/backend/x64/x64_emitter.h b/src/cpu/backend/x64/x64_emitter.h new file mode 100644 index 00000000..dc00c073 --- /dev/null +++ b/src/cpu/backend/x64/x64_emitter.h @@ -0,0 +1,49 @@ +#ifndef X64_EMITTER_H +#define X64_EMITTER_H + +#include +#include +#include "core/arena.h" +#include "cpu/backend/x64/x64_block.h" +#include "cpu/runtime.h" + +namespace dreavm { +namespace cpu { +namespace backend { +namespace x64 { + +enum { + STACK_OFFSET_GUEST_CONTEXT = 0, + STACK_OFFSET_MEMORY = 8, + STACK_OFFSET_LOCALS = 16 +}; + +class X64Emitter { + public: + X64Emitter(Xbyak::CodeGenerator &codegen); + + X64Fn Emit(ir::IRBuilder &builder); + + // helpers for the emitter callbacks + const Xbyak::Operand &GetOperand(const ir::Value *v, int size = -1); + const Xbyak::Operand &GetOperand(const ir::Value *v, + const Xbyak::Operand &tmp); + const Xbyak::Reg &GetRegister(const ir::Value *v, const Xbyak::Reg &tmp); + const Xbyak::Xmm &GetXMMRegister(const ir::Value *v, const Xbyak::Xmm &tmp); + const 
Xbyak::Xmm &GetXMMRegister(const ir::Value *v, + const Xbyak::Operand &prefered, + const Xbyak::Xmm &tmp); + void CopyOperand(const Xbyak::Operand &from, const Xbyak::Operand &to); + void CopyOperand(const ir::Value *v, const Xbyak::Operand &to); + bool CanEncodeAsImmediate(const ir::Value *v); + + private: + Xbyak::CodeGenerator &c_; + core::Arena operand_arena_; +}; +} +} +} +} + +#endif diff --git a/src/cpu/ir/ir_builder.cc b/src/cpu/ir/ir_builder.cc index 14a88917..3eaa9128 100644 --- a/src/cpu/ir/ir_builder.cc +++ b/src/cpu/ir/ir_builder.cc @@ -11,31 +11,6 @@ const char *dreavm::cpu::ir::Opnames[NUM_OPCODES] = { #include "cpu/ir/ir_ops.inc" }; -static inline bool IsFloatType(ValueTy type) { - return type == VALUE_F32 || type == VALUE_F64; -} - -static inline bool IsIntType(ValueTy type) { return !IsFloatType(type); } - -static inline int SizeForType(ValueTy type) { - switch (type) { - case VALUE_I8: - return 1; - case VALUE_I16: - return 2; - case VALUE_I32: - return 4; - case VALUE_I64: - return 8; - case VALUE_F32: - return 4; - case VALUE_F64: - return 8; - case VALUE_BLOCK: - return 4; - } -} - // // Value // @@ -175,7 +150,7 @@ void IRBuilder::Dump() const { auto res = value_vars.insert(std::make_pair((intptr_t)v, name)); it = res.first; } - ss << it->second; + ss << it->second << " (" << v->reg() << ")"; }; auto DumpValue = [&](std::stringstream &ss, const Value *v) { if (!v) { @@ -324,6 +299,9 @@ void IRBuilder::Store(Value *addr, Value *v) { } Value *IRBuilder::Cast(Value *v, ValueTy dest_type) { + CHECK((IsIntType(v->type()) && IsFloatType(dest_type)) || + (IsFloatType(v->type()) && IsIntType(dest_type))); + Instr *instr = AppendInstr(OP_CAST); Value *result = AllocDynamic(dest_type); instr->set_arg0(v); diff --git a/src/cpu/ir/ir_builder.h b/src/cpu/ir/ir_builder.h index b69a0e68..95afea18 100644 --- a/src/cpu/ir/ir_builder.h +++ b/src/cpu/ir/ir_builder.h @@ -87,6 +87,31 @@ class Block; class Instr; class ValueRef; +static inline bool 
IsFloatType(ValueTy type) { + return type == VALUE_F32 || type == VALUE_F64; +} + +static inline bool IsIntType(ValueTy type) { return !IsFloatType(type); } + +static inline int SizeForType(ValueTy type) { + switch (type) { + case VALUE_I8: + return 1; + case VALUE_I16: + return 2; + case VALUE_I32: + return 4; + case VALUE_I64: + return 8; + case VALUE_F32: + return 4; + case VALUE_F64: + return 8; + case VALUE_BLOCK: + return 4; + } +} + class Value { public: Value(ValueTy ty); diff --git a/src/cpu/ir/passes/register_allocation_pass.cc b/src/cpu/ir/passes/register_allocation_pass.cc index b96fd31a..f9e47482 100644 --- a/src/cpu/ir/passes/register_allocation_pass.cc +++ b/src/cpu/ir/passes/register_allocation_pass.cc @@ -2,6 +2,7 @@ #include "cpu/ir/passes/register_allocation_pass.h" using namespace dreavm; +using namespace dreavm::cpu::backend; using namespace dreavm::cpu::ir; using namespace dreavm::cpu::ir::passes; @@ -33,7 +34,7 @@ void RegisterAllocationPass::Run(IRBuilder &builder) { Value *result = instr->result(); // only allocate registers for results, assume constants can always be - // encoded by immediates or that the backend has registers reserved + // encoded as immediates or that the backend has registers reserved // for storing the constants if (!result) { continue; @@ -47,36 +48,16 @@ void RegisterAllocationPass::Run(IRBuilder &builder) { // expire any old intervals, freeing up the registers they claimed ExpireOldIntervals(start); - // if the last argument isn't used after this instruction, its register - // can be reused to take advantage of many architectures supporting - // operations where the destination is the last source argument - // FIXME could reorder arguments and do this with any source arguments - // meeting the criteria - Value *last_arg = instr->arg2() - ? instr->arg2() - : (instr->arg1() ? 
instr->arg1() : instr->arg0()); - if (last_arg && !last_arg->constant()) { - // get the current interval for this register - int last_reg = last_arg->reg(); - - if (last_reg != NO_REGISTER) { - const std::multiset::iterator &it = live_[last_reg]; - - // if the argument isn't used after this instruction, reuse its - // register for the result - if (GetOrdinal(it->end) <= GetOrdinal(start)) { - UpdateInterval(it, result, start, end); - result->set_reg(last_reg); - continue; - } - } - } - - // else, allocate a new register - int reg = AllocFreeRegister(result, start, end); + // first, try and reuse the register of one of the incoming arguments + int reg = ReuseArgRegister(instr, start, end); if (reg == NO_REGISTER) { - reg = AllocBlockedRegister(builder, result, start, end); - CHECK_NE(reg, NO_REGISTER); + // else, allocate a new register for the result + reg = AllocFreeRegister(result, start, end); + if (reg == NO_REGISTER) { + // if a register couldn't be allocated, spill a register and try again + reg = AllocBlockedRegister(builder, result, start, end); + CHECK_NE(reg, NO_REGISTER); + } } result->set_reg(reg); @@ -143,9 +124,6 @@ void RegisterAllocationPass::UpdateInterval( Instr *end) { int reg = it->reg; - // printf("UpdateRegister %d (%p) -> %d (%p) : (%p)\n", GetOrdinal(start), - // start, GetOrdinal(end), end, value); - // remove the old interval intervals_.erase(it); @@ -160,19 +138,60 @@ void RegisterAllocationPass::UpdateInterval( live_[reg] = intervals_.insert(interval); } -int RegisterAllocationPass::AllocFreeRegister(Value *value, Instr *start, +// If the first argument isn't used after this instruction, its register +// can be reused to take advantage of many architectures supporting +// operations where the destination is the first argument.
+// TODO could reorder arguments for commutative binary ops and do this +// with the second argument as well +int RegisterAllocationPass::ReuseArgRegister(Instr *instr, Instr *start, Instr *end) { - if (!num_free_) { - // LOG(WARNING) << "AllocFreeRegister failed for " << GetOrdinal(start); + if (!instr->arg0() || instr->arg0()->constant()) { return NO_REGISTER; } - // printf("AllocFreeRegister %d (%p) -> %d (%p) : (%p)\n", GetOrdinal(start), - // start, GetOrdinal(end), end, value); + int last_reg = instr->arg0()->reg(); + if (last_reg == NO_REGISTER) { + return NO_REGISTER; + } + + // make sure the register can hold the result type + const Register &r = registers_[last_reg]; + if (!(r.value_types & 1 << (instr->result()->type()))) { + return NO_REGISTER; + } + + // if the argument's register is used after this instruction, it can't be + // reused + const std::multiset::iterator &it = live_[last_reg]; + if (GetOrdinal(it->end) > GetOrdinal(start)) { + return NO_REGISTER; + } + + // the argument's register isn't used afterwards, update its interval and + // reuse + UpdateInterval(it, instr->result(), start, end); + + return last_reg; +} + +int RegisterAllocationPass::AllocFreeRegister(Value *value, Instr *start, + Instr *end) { + // find the first free register that can store this value type + // TODO split up free queue into int / float to avoid this scan + int i; + for (i = 0; i < num_free_; i++) { + const Register &r = registers_[free_[i]]; + if (r.value_types & 1 << (value->type())) { + break; + } + } + if (i == num_free_) { + return NO_REGISTER; + } // remove register from free queue - int reg = free_[0]; - free_[0] = free_[--num_free_]; + int reg = free_[i]; + free_[i] = free_[--num_free_]; // add interval Interval interval; @@ -191,24 +210,31 @@ int RegisterAllocationPass::AllocFreeRegister(Value *value, Instr *start, int RegisterAllocationPass::AllocBlockedRegister(IRBuilder &builder, Value *value, Instr *start, Instr *end) { - CHECK_EQ(num_free_, 0); - 
CHECK_EQ(num_registers_, (int)intervals_.size()); + // TODO no longer valid due to type masks + // CHECK_EQ(num_free_, 0); + // CHECK_EQ(num_registers_, (int)intervals_.size()); + + // spill the register that ends furthest away that can store this type + auto it = intervals_.rbegin(); + auto e = intervals_.rend(); + for (; it != e; ++it) { + const Register &r = registers_[it->reg]; + + if (r.value_types & 1 << (value->type())) { + break; + } + } + CHECK(it != e); - // spill the register that ends furthest away, or possibly this register - // itself - auto it = --intervals_.end(); const Interval &to_spill = *it; // point spilled value to use stack to_spill.value->set_reg(NO_REGISTER); to_spill.value->set_local(builder.AllocLocal(to_spill.value->type())); - // printf("Spilling %d (%p) -> %d (%p) : (%p)\n", GetOrdinal(to_spill.start), - // to_spill.start, GetOrdinal(to_spill.end), to_spill.end, to_spill.value); - // remove interval free_[num_free_++] = to_spill.reg; - intervals_.erase(it); + intervals_.erase(--it.base()); return AllocFreeRegister(value, start, end); } diff --git a/src/cpu/ir/passes/register_allocation_pass.h b/src/cpu/ir/passes/register_allocation_pass.h index 990f9703..372e277e 100644 --- a/src/cpu/ir/passes/register_allocation_pass.h +++ b/src/cpu/ir/passes/register_allocation_pass.h @@ -52,6 +52,7 @@ class RegisterAllocationPass : public Pass { void ExpireOldIntervals(Instr *start); void UpdateInterval(const std::multiset::iterator &it, Value *value, Instr *start, Instr *end); + int ReuseArgRegister(Instr *instr, Instr *start, Instr *end); int AllocFreeRegister(Value *value, Instr *start, Instr *end); int AllocBlockedRegister(IRBuilder &builder, Value *value, Instr *start, Instr *end); diff --git a/src/cpu/runtime.cc b/src/cpu/runtime.cc index 1789ac57..288498f8 100644 --- a/src/cpu/runtime.cc +++ b/src/cpu/runtime.cc @@ -79,7 +79,7 @@ void Runtime::ResetBlocks() { pending_reset_ = true; } RuntimeBlock *Runtime::CompileBlock(uint32_t addr) {
PROFILER_SCOPE_F("runtime"); - // LOG(INFO) << "Compiling block 0x" << std::hex << addr; + LOG(INFO) << "Compiling block 0x" << std::hex << addr; std::unique_ptr builder = frontend_->BuildBlock(addr); if (!builder) { diff --git a/src/cpu/sh4.cc b/src/cpu/sh4.cc index 06f10744..2a38e244 100644 --- a/src/cpu/sh4.cc +++ b/src/cpu/sh4.cc @@ -256,8 +256,8 @@ void SH4::InitMemory() { void SH4::InitContext() { memset(&ctx_, 0, sizeof(ctx_)); ctx_.sh4 = this; - // ctx_.pc = 0xa0000000; - ctx_.pc = 0x0c010000; + ctx_.pc = 0xa0000000; + // ctx_.pc = 0x0c010000; ctx_.pr = 0xdeadbeef; #define SH4_REG(addr, name, flags, default, reset, sleep, standby, type) \ if (default != HELD) { \ diff --git a/src/emu/emulator.cc b/src/emu/emulator.cc index e0468fc8..b714cb69 100644 --- a/src/emu/emulator.cc +++ b/src/emu/emulator.cc @@ -24,8 +24,8 @@ Emulator::Emulator(System &sys) processor_(scheduler_, memory_), holly_(scheduler_, memory_, processor_) { rt_frontend_ = new SH4Frontend(memory_); - rt_backend_ = new InterpreterBackend(memory_); - // rt_backend_ = new X64Backend(*memory_); + // rt_backend_ = new InterpreterBackend(memory_); + rt_backend_ = new X64Backend(memory_); rb_ = new GLBackend(sys); } diff --git a/test/test_sh4.cc b/test/test_sh4.cc index 535ac475..0a890847 100644 --- a/test/test_sh4.cc +++ b/test/test_sh4.cc @@ -24,8 +24,8 @@ void RunSH4Test(const SH4Test &test) { // initialize runtime frontend::sh4::SH4Frontend rt_frontend(memory); - // backend::x64::X64Backend rt_backend(memory); - backend::interpreter::InterpreterBackend rt_backend(memory); + backend::x64::X64Backend rt_backend(memory); + // backend::interpreter::InterpreterBackend rt_backend(memory); Runtime runtime(memory); ASSERT_TRUE(runtime.Init(&rt_frontend, &rt_backend));