/**
 ******************************************************************************
 * Xenia : Xbox 360 Emulator Research Project                                 *
 ******************************************************************************
 * Copyright 2025 Ben Vanik. All rights reserved.                             *
 * Released under the BSD license - see LICENSE in the root for more details. *
 ******************************************************************************
 */

#include "xenia/cpu/backend/x64/x64_backend.h"

#include <algorithm>
#include <cstddef>
#include "third_party/capstone/include/capstone/capstone.h"
#include "third_party/capstone/include/capstone/x86.h"

#include "xenia/base/exception_handler.h"
#include "xenia/base/logging.h"
#include "xenia/cpu/backend/x64/x64_assembler.h"
#include "xenia/cpu/backend/x64/x64_code_cache.h"
#include "xenia/cpu/backend/x64/x64_emitter.h"
#include "xenia/cpu/backend/x64/x64_function.h"
#include "xenia/cpu/backend/x64/x64_sequences.h"
#include "xenia/cpu/backend/x64/x64_stack_layout.h"
#include "xenia/cpu/breakpoint.h"
#include "xenia/cpu/processor.h"
#include "xenia/cpu/stack_walker.h"
#include "xenia/cpu/xex_module.h"

DEFINE_bool(record_mmio_access_exceptions, true,
            "For guest addresses records whether we caught any mmio accesses "
            "for them. This info can then be used on a subsequent run to "
            "instruct the recompiler to emit checks",
            "x64");

DEFINE_int64(max_stackpoints, 65536,
             "Max number of host->guest stack mappings we can record.", "x64");

DEFINE_bool(enable_host_guest_stack_synchronization, true,
            "Records entries for guest/host stack mappings at function starts "
            "and checks for reentry at return sites. Has slight performance "
            "impact, but fixes crashes in games that use setjmp/longjmp.",
            "x64");
#if XE_X64_PROFILER_AVAILABLE == 1
DECLARE_bool(instrument_call_times);
#endif
|
|
|
|
namespace xe {
|
|
namespace cpu {
|
|
namespace backend {
|
|
namespace x64 {
|
|
|
|
class X64HelperEmitter : public X64Emitter {
|
|
public:
|
|
struct _code_offsets {
|
|
size_t prolog;
|
|
size_t prolog_stack_alloc;
|
|
size_t body;
|
|
size_t epilog;
|
|
size_t tail;
|
|
};
|
|
X64HelperEmitter(X64Backend* backend, XbyakAllocator* allocator);
|
|
~X64HelperEmitter() override;
|
|
HostToGuestThunk EmitHostToGuestThunk();
|
|
GuestToHostThunk EmitGuestToHostThunk();
|
|
ResolveFunctionThunk EmitResolveFunctionThunk();
|
|
void* EmitGuestAndHostSynchronizeStackHelper();
|
|
// 1 for loading byte, 2 for halfword and 4 for word.
|
|
// these specialized versions save space in the caller
|
|
void* EmitGuestAndHostSynchronizeStackSizeLoadThunk(
|
|
void* sync_func, unsigned stack_element_size);
|
|
|
|
void* EmitTryAcquireReservationHelper();
|
|
void* EmitReservedStoreHelper(bool bit64 = false);
|
|
|
|
void* EmitScalarVRsqrteHelper();
|
|
void* EmitVectorVRsqrteHelper(void* scalar_helper);
|
|
|
|
void* EmitFrsqrteHelper();
|
|
|
|
private:
|
|
void* EmitCurrentForOffsets(const _code_offsets& offsets,
|
|
size_t stack_size = 0);
|
|
// The following four functions provide save/load functionality for registers.
|
|
// They assume at least StackLayout::THUNK_STACK_SIZE bytes have been
|
|
// allocated on the stack.
|
|
void EmitSaveVolatileRegs();
|
|
void EmitLoadVolatileRegs();
|
|
void EmitSaveNonvolatileRegs();
|
|
void EmitLoadNonvolatileRegs();
|
|
};
|
|
|
|
#if XE_PLATFORM_WIN32
|
|
static constexpr unsigned char guest_trampoline_template[] = {
|
|
0x48, 0xBA, 0x99, 0x88, 0x77, 0x66, 0x55, 0x44, 0x33, 0x00, 0x49,
|
|
0xB8, 0x99, 0x88, 0x77, 0x66, 0x55, 0x44, 0x33, 0x00, 0x48, 0xB9,
|
|
0x99, 0x88, 0x77, 0x66, 0x55, 0x44, 0x33, 0x00, 0x48, 0xB8, 0x99,
|
|
0x88, 0x77, 0x66, 0x55, 0x44, 0x33, 0x00, 0xFF, 0xE0};
|
|
|
|
#else
|
|
// SysV x64 ABI; the immediate patch offsets are exactly the same as above.
|
|
static constexpr unsigned char guest_trampoline_template[] = {
|
|
0x48, 0xBF, 0x99, 0x88, 0x77, 0x66, 0x55, 0x44, 0x33, 0x00, 0x48,
|
|
0xBE, 0x99, 0x88, 0x77, 0x66, 0x55, 0x44, 0x33, 0x00, 0x48, 0xB9,
|
|
0x99, 0x88, 0x77, 0x66, 0x55, 0x44, 0x33, 0x00, 0x48, 0xB8, 0x99,
|
|
0x88, 0x77, 0x66, 0x55, 0x44, 0x33, 0x00, 0xFF, 0xE0};
|
|
#endif
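// Each template is four 10-byte `mov reg, imm64` instructions followed by a
// 2-byte `jmp rax`. The placeholder immediates are patched by
// CreateGuestTrampoline at the offsets below: userdata1 and userdata2 go into
// the first two argument registers, the trampoline proc into rcx, and the
// guest-to-host thunk into rax. The only difference between the Win32 and
// SysV variants is which registers receive the first two immediates
// (rdx/r8 vs. rdi/rsi).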
|
|
static constexpr uint32_t guest_trampoline_template_offset_arg1 = 2,
|
|
guest_trampoline_template_offset_arg2 = 0xC,
|
|
guest_trampoline_template_offset_rcx = 0x16,
|
|
guest_trampoline_template_offset_rax = 0x20;
|
|
X64Backend::X64Backend() : Backend(), code_cache_(nullptr) {
|
|
if (cs_open(CS_ARCH_X86, CS_MODE_64, &capstone_handle_) != CS_ERR_OK) {
|
|
assert_always("Failed to initialize capstone");
|
|
}
|
|
cs_option(capstone_handle_, CS_OPT_SYNTAX, CS_OPT_SYNTAX_INTEL);
|
|
cs_option(capstone_handle_, CS_OPT_DETAIL, CS_OPT_ON);
|
|
cs_option(capstone_handle_, CS_OPT_SKIPDATA, CS_OPT_OFF);
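  // Reserve the guest trampoline block somewhere in the low 2 GiB of the host
  // address space: CreateGuestTrampoline truncates the host pointer to 32 bits
  // when registering the indirection, so the allocation must stay below
  // 0x80000000.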
|
|
uint32_t base_address = 0x10000;
|
|
void* buf_trampoline_code = nullptr;
|
|
while (base_address < 0x80000000) {
|
|
buf_trampoline_code = memory::AllocFixed(
|
|
(void*)(uintptr_t)base_address,
|
|
sizeof(guest_trampoline_template) * MAX_GUEST_TRAMPOLINES,
|
|
xe::memory::AllocationType::kReserveCommit,
|
|
xe::memory::PageAccess::kExecuteReadWrite);
|
|
if (!buf_trampoline_code) {
|
|
base_address += 65536;
|
|
} else {
|
|
break;
|
|
}
|
|
}
|
|
xenia_assert(buf_trampoline_code);
|
|
guest_trampoline_memory_ = (uint8_t*)buf_trampoline_code;
|
|
guest_trampoline_address_bitmap_.Resize(MAX_GUEST_TRAMPOLINES);
|
|
}
|
|
|
|
X64Backend::~X64Backend() {
|
|
if (capstone_handle_) {
|
|
cs_close(&capstone_handle_);
|
|
}
|
|
|
|
X64Emitter::FreeConstData(emitter_data_);
|
|
ExceptionHandler::Uninstall(&ExceptionCallbackThunk, this);
|
|
if (guest_trampoline_memory_) {
|
|
memory::DeallocFixed(
|
|
guest_trampoline_memory_,
|
|
sizeof(guest_trampoline_template) * MAX_GUEST_TRAMPOLINES,
|
|
memory::DeallocationType::kRelease);
|
|
guest_trampoline_memory_ = nullptr;
|
|
}
|
|
}
|
|
|
|
static void ForwardMMIOAccessForRecording(void* context, void* hostaddr) {
|
|
reinterpret_cast<X64Backend*>(context)
|
|
->RecordMMIOExceptionForGuestInstruction(hostaddr);
|
|
}
|
|
#if XE_X64_PROFILER_AVAILABLE == 1
|
|
// todo: better way of passing to atexit. maybe do in destructor instead?
|
|
// nope, destructor is never called
|
|
static GuestProfilerData* backend_profiler_data = nullptr;
|
|
|
|
static uint64_t nanosecond_lifetime_start = 0;
|
|
static void WriteGuestProfilerData() {
|
|
if (cvars::instrument_call_times) {
|
|
uint64_t end = Clock::QueryHostSystemTime();
|
|
|
|
uint64_t total = end - nanosecond_lifetime_start;
|
|
|
|
double totaltime_divisor = static_cast<double>(total);
|
|
|
|
FILE* output_file = nullptr;
|
|
std::vector<std::pair<uint32_t, uint64_t>> unsorted_profile{};
|
|
for (auto&& entry : *backend_profiler_data) {
|
|
if (entry.second) { // skip times of 0
|
|
unsorted_profile.emplace_back(entry.first, entry.second);
|
|
}
|
|
}
|
|
|
|
std::sort(unsorted_profile.begin(), unsorted_profile.end(),
|
|
[](auto& x, auto& y) { return x.second < y.second; });
|
|
|
|
fopen_s(&output_file, "profile_times.txt", "w");
|
|
FILE* idapy_file = nullptr;
|
|
fopen_s(&idapy_file, "profile_print_times.py", "w");
|
|
|
|
for (auto&& sorted_entry : unsorted_profile) {
|
|
// double time_in_seconds =
|
|
// static_cast<double>(sorted_entry.second) / 10000000.0;
|
|
double time_in_milliseconds =
|
|
static_cast<double>(sorted_entry.second) / (10000000.0 / 1000.0);
|
|
|
|
double slice = static_cast<double>(sorted_entry.second) /
|
|
static_cast<double>(totaltime_divisor);
|
|
|
|
fprintf(output_file,
|
|
"%X took %.20f milliseconds, totaltime slice percentage %.20f \n",
|
|
sorted_entry.first, time_in_milliseconds, slice);
|
|
|
|
fprintf(idapy_file,
|
|
"print(get_name(0x%X) + ' took %.20f ms, %.20f percent')\n",
|
|
sorted_entry.first, time_in_milliseconds, slice);
|
|
}
|
|
|
|
fclose(output_file);
|
|
fclose(idapy_file);
|
|
}
|
|
}
|
|
|
|
static void GuestProfilerUpdateThreadProc() {
|
|
nanosecond_lifetime_start = Clock::QueryHostSystemTime();
|
|
|
|
do {
|
|
xe::threading::Sleep(std::chrono::seconds(30));
|
|
WriteGuestProfilerData();
|
|
} while (true);
|
|
}
|
|
static std::unique_ptr<xe::threading::Thread> g_profiler_update_thread{};
|
|
#endif
|
|
|
|
bool X64Backend::Initialize(Processor* processor) {
|
|
if (!Backend::Initialize(processor)) {
|
|
return false;
|
|
}
|
|
|
|
Xbyak::util::Cpu cpu;
|
|
if (!cpu.has(Xbyak::util::Cpu::tAVX)) {
|
|
XELOGE("This CPU does not support AVX. The emulator will now crash.");
|
|
return false;
|
|
}
|
|
|
|
// Need movbe to do advanced LOAD/STORE tricks.
|
|
if (cvars::x64_extension_mask & kX64EmitMovbe) {
|
|
machine_info_.supports_extended_load_store =
|
|
cpu.has(Xbyak::util::Cpu::tMOVBE);
|
|
} else {
|
|
machine_info_.supports_extended_load_store = false;
|
|
}
|
|
|
|
auto& gprs = machine_info_.register_sets[0];
|
|
gprs.id = 0;
|
|
std::strcpy(gprs.name, "gpr");
|
|
gprs.types = MachineInfo::RegisterSet::INT_TYPES;
|
|
gprs.count = X64Emitter::GPR_COUNT;
|
|
|
|
auto& xmms = machine_info_.register_sets[1];
|
|
xmms.id = 1;
|
|
std::strcpy(xmms.name, "xmm");
|
|
xmms.types = MachineInfo::RegisterSet::FLOAT_TYPES |
|
|
MachineInfo::RegisterSet::VEC_TYPES;
|
|
xmms.count = X64Emitter::XMM_COUNT;
|
|
|
|
code_cache_ = X64CodeCache::Create();
|
|
Backend::code_cache_ = code_cache_.get();
|
|
if (!code_cache_->Initialize()) {
|
|
return false;
|
|
}
|
|
// HV range
|
|
code_cache()->CommitExecutableRange(GUEST_TRAMPOLINE_BASE,
|
|
GUEST_TRAMPOLINE_END);
|
|
// Allocate emitter constant data.
|
|
emitter_data_ = X64Emitter::PlaceConstData();
|
|
|
|
// Generate thunks used to transition between jitted code and host code.
|
|
XbyakAllocator allocator;
|
|
X64HelperEmitter thunk_emitter(this, &allocator);
|
|
host_to_guest_thunk_ = thunk_emitter.EmitHostToGuestThunk();
|
|
guest_to_host_thunk_ = thunk_emitter.EmitGuestToHostThunk();
|
|
resolve_function_thunk_ = thunk_emitter.EmitResolveFunctionThunk();
|
|
|
|
if (cvars::enable_host_guest_stack_synchronization) {
|
|
synchronize_guest_and_host_stack_helper_ =
|
|
thunk_emitter.EmitGuestAndHostSynchronizeStackHelper();
|
|
|
|
synchronize_guest_and_host_stack_helper_size8_ =
|
|
thunk_emitter.EmitGuestAndHostSynchronizeStackSizeLoadThunk(
|
|
synchronize_guest_and_host_stack_helper_, 1);
|
|
synchronize_guest_and_host_stack_helper_size16_ =
|
|
thunk_emitter.EmitGuestAndHostSynchronizeStackSizeLoadThunk(
|
|
synchronize_guest_and_host_stack_helper_, 2);
|
|
synchronize_guest_and_host_stack_helper_size32_ =
|
|
thunk_emitter.EmitGuestAndHostSynchronizeStackSizeLoadThunk(
|
|
synchronize_guest_and_host_stack_helper_, 4);
|
|
}
|
|
try_acquire_reservation_helper_ =
|
|
thunk_emitter.EmitTryAcquireReservationHelper();
|
|
reserved_store_32_helper = thunk_emitter.EmitReservedStoreHelper(false);
|
|
reserved_store_64_helper = thunk_emitter.EmitReservedStoreHelper(true);
|
|
vrsqrtefp_scalar_helper = thunk_emitter.EmitScalarVRsqrteHelper();
|
|
vrsqrtefp_vector_helper =
|
|
thunk_emitter.EmitVectorVRsqrteHelper(vrsqrtefp_scalar_helper);
|
|
frsqrtefp_helper = thunk_emitter.EmitFrsqrteHelper();
|
|
// Set the code cache to use the ResolveFunction thunk for default
|
|
// indirections.
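  // The indirection slots hold 32-bit host addresses, hence the assert below
  // that the resolve thunk landed in the low 4 GiB.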
|
|
assert_zero(uint64_t(resolve_function_thunk_) & 0xFFFFFFFF00000000ull);
|
|
code_cache_->set_indirection_default(
|
|
uint32_t(uint64_t(resolve_function_thunk_)));
|
|
|
|
// Allocate some special indirections.
|
|
code_cache_->CommitExecutableRange(0x9FFF0000, 0x9FFFFFFF);
|
|
|
|
// Setup exception callback
|
|
ExceptionHandler::Install(&ExceptionCallbackThunk, this);
|
|
if (cvars::record_mmio_access_exceptions) {
|
|
processor->memory()->SetMMIOExceptionRecordingCallback(
|
|
ForwardMMIOAccessForRecording, (void*)this);
|
|
}
|
|
|
|
#if XE_X64_PROFILER_AVAILABLE == 1
|
|
if (cvars::instrument_call_times) {
|
|
backend_profiler_data = &profiler_data_;
|
|
xe::threading::Thread::CreationParameters slimparams;
|
|
|
|
slimparams.create_suspended = false;
|
|
slimparams.initial_priority = xe::threading::ThreadPriority::kLowest;
|
|
slimparams.stack_size = 65536 * 4;
|
|
|
|
g_profiler_update_thread = std::move(xe::threading::Thread::Create(
|
|
slimparams, GuestProfilerUpdateThreadProc));
|
|
}
|
|
#endif
|
|
|
|
return true;
|
|
}
|
|
|
|
void X64Backend::CommitExecutableRange(uint32_t guest_low,
|
|
uint32_t guest_high) {
|
|
code_cache_->CommitExecutableRange(guest_low, guest_high);
|
|
}
|
|
|
|
std::unique_ptr<Assembler> X64Backend::CreateAssembler() {
|
|
return std::make_unique<X64Assembler>(this);
|
|
}
|
|
|
|
std::unique_ptr<GuestFunction> X64Backend::CreateGuestFunction(
|
|
Module* module, uint32_t address) {
|
|
return std::make_unique<X64Function>(module, address);
|
|
}
|
|
|
|
uint64_t ReadCapstoneReg(HostThreadContext* context, x86_reg reg) {
|
|
switch (reg) {
|
|
case X86_REG_RAX:
|
|
return context->rax;
|
|
case X86_REG_RCX:
|
|
return context->rcx;
|
|
case X86_REG_RDX:
|
|
return context->rdx;
|
|
case X86_REG_RBX:
|
|
return context->rbx;
|
|
case X86_REG_RSP:
|
|
return context->rsp;
|
|
case X86_REG_RBP:
|
|
return context->rbp;
|
|
case X86_REG_RSI:
|
|
return context->rsi;
|
|
case X86_REG_RDI:
|
|
return context->rdi;
|
|
case X86_REG_R8:
|
|
return context->r8;
|
|
case X86_REG_R9:
|
|
return context->r9;
|
|
case X86_REG_R10:
|
|
return context->r10;
|
|
case X86_REG_R11:
|
|
return context->r11;
|
|
case X86_REG_R12:
|
|
return context->r12;
|
|
case X86_REG_R13:
|
|
return context->r13;
|
|
case X86_REG_R14:
|
|
return context->r14;
|
|
case X86_REG_R15:
|
|
return context->r15;
|
|
default:
|
|
assert_unhandled_case(reg);
|
|
return 0;
|
|
}
|
|
}
|
|
|
|
#define X86_EFLAGS_CF 0x00000001  // Carry Flag
#define X86_EFLAGS_PF 0x00000004  // Parity Flag
#define X86_EFLAGS_ZF 0x00000040  // Zero Flag
#define X86_EFLAGS_SF 0x00000080  // Sign Flag
#define X86_EFLAGS_OF 0x00000800  // Overflow Flag
bool TestCapstoneEflags(uint32_t eflags, uint32_t insn) {
  // https://www.felixcloutier.com/x86/Jcc.html
  // SF and OF live in different bit positions, so normalize all flags to
  // booleans before comparing them.
  bool cf = (eflags & X86_EFLAGS_CF) != 0;
  bool pf = (eflags & X86_EFLAGS_PF) != 0;
  bool zf = (eflags & X86_EFLAGS_ZF) != 0;
  bool sf = (eflags & X86_EFLAGS_SF) != 0;
  bool of = (eflags & X86_EFLAGS_OF) != 0;
  switch (insn) {
    case X86_INS_JAE:
      // CF=0
      return !cf;
    case X86_INS_JA:
      // CF=0 && ZF=0
      return !cf && !zf;
    case X86_INS_JBE:
      // CF=1 || ZF=1
      return cf || zf;
    case X86_INS_JB:
      // CF=1
      return cf;
    case X86_INS_JE:
      // ZF=1
      return zf;
    case X86_INS_JGE:
      // SF=OF
      return sf == of;
    case X86_INS_JG:
      // ZF=0 && SF=OF
      return !zf && (sf == of);
    case X86_INS_JLE:
      // ZF=1 || SF!=OF
      return zf || (sf != of);
    case X86_INS_JL:
      // SF!=OF
      return sf != of;
    case X86_INS_JNE:
      // ZF=0
      return !zf;
    case X86_INS_JNO:
      // OF=0
      return !of;
    case X86_INS_JNP:
      // PF=0
      return !pf;
    case X86_INS_JNS:
      // SF=0
      return !sf;
    case X86_INS_JO:
      // OF=1
      return of;
    case X86_INS_JP:
      // PF=1
      return pf;
    case X86_INS_JS:
      // SF=1
      return sf;
    default:
      assert_unhandled_case(insn);
      return false;
  }
}
|
|
|
|
uint64_t X64Backend::CalculateNextHostInstruction(ThreadDebugInfo* thread_info,
|
|
uint64_t current_pc) {
|
|
auto machine_code_ptr = reinterpret_cast<const uint8_t*>(current_pc);
|
|
size_t remaining_machine_code_size = 64;
|
|
uint64_t host_address = current_pc;
|
|
cs_insn insn = {0};
|
|
cs_detail all_detail = {0};
|
|
insn.detail = &all_detail;
|
|
cs_disasm_iter(capstone_handle_, &machine_code_ptr,
|
|
&remaining_machine_code_size, &host_address, &insn);
|
|
auto& detail = all_detail.x86;
|
|
switch (insn.id) {
|
|
default:
|
|
// Not a branching instruction - just move over it.
|
|
return current_pc + insn.size;
|
|
case X86_INS_CALL: {
|
|
assert_true(detail.op_count == 1);
|
|
assert_true(detail.operands[0].type == X86_OP_REG);
|
|
uint64_t target_pc =
|
|
ReadCapstoneReg(&thread_info->host_context, detail.operands[0].reg);
|
|
return target_pc;
|
|
} break;
|
|
case X86_INS_RET: {
|
|
assert_zero(detail.op_count);
|
|
auto stack_ptr =
|
|
reinterpret_cast<uint64_t*>(thread_info->host_context.rsp);
|
|
uint64_t target_pc = stack_ptr[0];
|
|
return target_pc;
|
|
} break;
|
|
case X86_INS_JMP: {
|
|
assert_true(detail.op_count == 1);
|
|
if (detail.operands[0].type == X86_OP_IMM) {
|
|
uint64_t target_pc = static_cast<uint64_t>(detail.operands[0].imm);
|
|
return target_pc;
|
|
} else if (detail.operands[0].type == X86_OP_REG) {
|
|
uint64_t target_pc =
|
|
ReadCapstoneReg(&thread_info->host_context, detail.operands[0].reg);
|
|
return target_pc;
|
|
} else {
|
|
// TODO(benvanik): find some more uses of this.
|
|
assert_always("jmp branch emulation not yet implemented");
|
|
return current_pc + insn.size;
|
|
}
|
|
} break;
|
|
case X86_INS_JCXZ:
|
|
case X86_INS_JECXZ:
|
|
case X86_INS_JRCXZ:
|
|
assert_always("j*cxz branch emulation not yet implemented");
|
|
return current_pc + insn.size;
|
|
case X86_INS_JAE:
|
|
case X86_INS_JA:
|
|
case X86_INS_JBE:
|
|
case X86_INS_JB:
|
|
case X86_INS_JE:
|
|
case X86_INS_JGE:
|
|
case X86_INS_JG:
|
|
case X86_INS_JLE:
|
|
case X86_INS_JL:
|
|
case X86_INS_JNE:
|
|
case X86_INS_JNO:
|
|
case X86_INS_JNP:
|
|
case X86_INS_JNS:
|
|
case X86_INS_JO:
|
|
case X86_INS_JP:
|
|
case X86_INS_JS: {
|
|
assert_true(detail.op_count == 1);
|
|
assert_true(detail.operands[0].type == X86_OP_IMM);
|
|
uint64_t target_pc = static_cast<uint64_t>(detail.operands[0].imm);
|
|
bool test_passed =
|
|
TestCapstoneEflags(thread_info->host_context.eflags, insn.id);
|
|
if (test_passed) {
|
|
return target_pc;
|
|
} else {
|
|
return current_pc + insn.size;
|
|
}
|
|
} break;
|
|
}
|
|
}
|
|
|
|
void X64Backend::InstallBreakpoint(Breakpoint* breakpoint) {
|
|
breakpoint->ForEachHostAddress([breakpoint](uint64_t host_address) {
|
|
auto ptr = reinterpret_cast<void*>(host_address);
|
|
auto original_bytes = xe::load_and_swap<uint16_t>(ptr);
|
|
assert_true(original_bytes != 0x0F0B);
|
|
xe::store_and_swap<uint16_t>(ptr, 0x0F0B);
|
|
breakpoint->backend_data().emplace_back(host_address, original_bytes);
|
|
});
|
|
}
|
|
|
|
void X64Backend::InstallBreakpoint(Breakpoint* breakpoint, Function* fn) {
|
|
assert_true(breakpoint->address_type() == Breakpoint::AddressType::kGuest);
|
|
assert_true(fn->is_guest());
|
|
auto guest_function = reinterpret_cast<cpu::GuestFunction*>(fn);
|
|
auto host_address =
|
|
guest_function->MapGuestAddressToMachineCode(breakpoint->guest_address());
|
|
if (!host_address) {
|
|
assert_always();
|
|
return;
|
|
}
|
|
|
|
// Assume we haven't already installed a breakpoint in this spot.
|
|
auto ptr = reinterpret_cast<void*>(host_address);
|
|
auto original_bytes = xe::load_and_swap<uint16_t>(ptr);
|
|
assert_true(original_bytes != 0x0F0B);
|
|
xe::store_and_swap<uint16_t>(ptr, 0x0F0B);
|
|
breakpoint->backend_data().emplace_back(host_address, original_bytes);
|
|
}
|
|
|
|
void X64Backend::UninstallBreakpoint(Breakpoint* breakpoint) {
|
|
for (auto& pair : breakpoint->backend_data()) {
|
|
auto ptr = reinterpret_cast<uint8_t*>(pair.first);
|
|
auto instruction_bytes = xe::load_and_swap<uint16_t>(ptr);
|
|
assert_true(instruction_bytes == 0x0F0B);
|
|
xe::store_and_swap<uint16_t>(ptr, static_cast<uint16_t>(pair.second));
|
|
}
|
|
breakpoint->backend_data().clear();
|
|
}
|
|
|
|
bool X64Backend::ExceptionCallbackThunk(Exception* ex, void* data) {
|
|
auto backend = reinterpret_cast<X64Backend*>(data);
|
|
return backend->ExceptionCallback(ex);
|
|
}
|
|
void X64Backend::RecordMMIOExceptionForGuestInstruction(void* host_address) {
|
|
uint64_t host_addr_u64 = (uint64_t)host_address;
|
|
|
|
auto fnfor = code_cache()->LookupFunction(host_addr_u64);
|
|
if (fnfor) {
|
|
uint32_t guestaddr = fnfor->MapMachineCodeToGuestAddress(host_addr_u64);
|
|
|
|
Module* guest_module = fnfor->module();
|
|
if (guest_module) {
|
|
XexModule* xex_guest_module = dynamic_cast<XexModule*>(guest_module);
|
|
|
|
if (xex_guest_module) {
|
|
cpu::InfoCacheFlags* icf =
|
|
xex_guest_module->GetInstructionAddressFlags(guestaddr);
|
|
|
|
if (icf) {
|
|
icf->accessed_mmio = true;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
bool X64Backend::ExceptionCallback(Exception* ex) {
|
|
if (ex->code() != Exception::Code::kIllegalInstruction) {
|
|
// We only care about illegal instructions. Other things will be handled by
|
|
// other handlers (probably). If nothing else picks it up we'll be called
|
|
// with OnUnhandledException to do real crash handling.
|
|
return false;
|
|
}
|
|
|
|
// processor_->memory()->LookupVirtualMappedRange()
|
|
|
|
// Verify an expected illegal instruction.
|
|
auto instruction_bytes =
|
|
xe::load_and_swap<uint16_t>(reinterpret_cast<void*>(ex->pc()));
|
|
if (instruction_bytes != 0x0F0B) {
|
|
// Not our ud2 - not us.
|
|
return false;
|
|
}
|
|
|
|
// Let the processor handle things.
|
|
return processor()->OnThreadBreakpointHit(ex);
|
|
}
|
|
|
|
X64HelperEmitter::X64HelperEmitter(X64Backend* backend,
|
|
XbyakAllocator* allocator)
|
|
: X64Emitter(backend, allocator) {}
|
|
|
|
X64HelperEmitter::~X64HelperEmitter() {}
|
|
void* X64HelperEmitter::EmitCurrentForOffsets(const _code_offsets& code_offsets,
|
|
size_t stack_size) {
|
|
EmitFunctionInfo func_info = {};
|
|
func_info.code_size.total = getSize();
|
|
func_info.code_size.prolog = code_offsets.body - code_offsets.prolog;
|
|
func_info.code_size.body = code_offsets.epilog - code_offsets.body;
|
|
func_info.code_size.epilog = code_offsets.tail - code_offsets.epilog;
|
|
func_info.code_size.tail = getSize() - code_offsets.tail;
|
|
func_info.prolog_stack_alloc_offset =
|
|
code_offsets.prolog_stack_alloc - code_offsets.prolog;
|
|
func_info.stack_size = stack_size;
|
|
|
|
void* fn = Emplace(func_info);
|
|
return fn;
|
|
}
|
|
HostToGuestThunk X64HelperEmitter::EmitHostToGuestThunk() {
|
|
#ifdef XE_PLATFORM_WIN32
|
|
// rcx = target
|
|
// rdx = arg0 (context)
|
|
// r8 = arg1 (guest return address)
|
|
|
|
_code_offsets code_offsets = {};
|
|
|
|
constexpr size_t stack_size = StackLayout::THUNK_STACK_SIZE;
|
|
|
|
code_offsets.prolog = getSize();
|
|
|
|
// rsp + 0 = return address
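  // Spill the incoming arguments into the Win64 home space the caller reserved
  // above the return address so they can be restored after the guest call.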
|
|
mov(qword[rsp + 8 * 3], r8);
|
|
mov(qword[rsp + 8 * 2], rdx);
|
|
mov(qword[rsp + 8 * 1], rcx);
|
|
sub(rsp, stack_size);
|
|
|
|
code_offsets.prolog_stack_alloc = getSize();
|
|
code_offsets.body = getSize();
|
|
|
|
// Save nonvolatile registers.
|
|
EmitSaveNonvolatileRegs();
|
|
|
|
mov(rax, rcx);
|
|
mov(rsi, rdx); // context
|
|
mov(rdi, ptr[rdx + offsetof(ppc::PPCContext, virtual_membase)]); // membase
|
|
mov(rcx, r8); // return address
|
|
call(rax);
|
|
vzeroupper();
|
|
EmitLoadNonvolatileRegs();
|
|
|
|
code_offsets.epilog = getSize();
|
|
|
|
add(rsp, stack_size);
|
|
mov(rcx, qword[rsp + 8 * 1]);
|
|
mov(rdx, qword[rsp + 8 * 2]);
|
|
mov(r8, qword[rsp + 8 * 3]);
|
|
ret();
|
|
#elif XE_PLATFORM_LINUX || XE_PLATFORM_MAC
|
|
// System-V ABI args:
|
|
// rdi = target
|
|
// rsi = arg0 (context)
|
|
// rdx = arg1 (guest return address)
|
|
|
|
struct _code_offsets {
|
|
size_t prolog;
|
|
size_t prolog_stack_alloc;
|
|
size_t body;
|
|
size_t epilog;
|
|
size_t tail;
|
|
} code_offsets = {};
|
|
|
|
constexpr size_t stack_size = StackLayout::THUNK_STACK_SIZE;
|
|
|
|
code_offsets.prolog = getSize();
|
|
// rsp + 0 = return address
|
|
sub(rsp, stack_size);
|
|
|
|
code_offsets.prolog_stack_alloc = getSize();
|
|
code_offsets.body = getSize();
|
|
|
|
// Save nonvolatile registers.
|
|
EmitSaveNonvolatileRegs();
|
|
|
|
mov(rax, rdi);
|
|
// mov(rsi, rsi); // context
|
|
mov(rdi, ptr[rsi + offsetof(ppc::PPCContext, virtual_membase)]); // membase
|
|
mov(rcx, rdx); // return address
|
|
call(rax);
|
|
|
|
EmitLoadNonvolatileRegs();
|
|
|
|
code_offsets.epilog = getSize();
|
|
|
|
add(rsp, stack_size);
|
|
ret();
|
|
#else
|
|
assert_always("Unknown platform ABI in host to guest thunk!");
|
|
#endif
|
|
|
|
code_offsets.tail = getSize();
|
|
|
|
assert_zero(code_offsets.prolog);
|
|
EmitFunctionInfo func_info = {};
|
|
func_info.code_size.total = getSize();
|
|
func_info.code_size.prolog = code_offsets.body - code_offsets.prolog;
|
|
func_info.code_size.body = code_offsets.epilog - code_offsets.body;
|
|
func_info.code_size.epilog = code_offsets.tail - code_offsets.epilog;
|
|
func_info.code_size.tail = getSize() - code_offsets.tail;
|
|
func_info.prolog_stack_alloc_offset =
|
|
code_offsets.prolog_stack_alloc - code_offsets.prolog;
|
|
func_info.stack_size = stack_size;
|
|
|
|
void* fn = Emplace(func_info);
|
|
return (HostToGuestThunk)fn;
|
|
}
|
|
|
|
GuestToHostThunk X64HelperEmitter::EmitGuestToHostThunk() {
|
|
#if XE_PLATFORM_WIN32
|
|
// rcx = target function
|
|
// rdx = arg0
|
|
// r8 = arg1
|
|
// r9 = arg2
|
|
|
|
_code_offsets code_offsets = {};
|
|
|
|
constexpr size_t stack_size = StackLayout::THUNK_STACK_SIZE;
|
|
|
|
code_offsets.prolog = getSize();
|
|
|
|
// rsp + 0 = return address
|
|
sub(rsp, stack_size);
|
|
|
|
code_offsets.prolog_stack_alloc = getSize();
|
|
code_offsets.body = getSize();
|
|
// chrispy: added this for proper vmsum impl, avx2 bitshifts
|
|
vzeroupper();
|
|
// Save off volatile registers.
|
|
EmitSaveVolatileRegs();
|
|
|
|
mov(rax, rcx); // function
|
|
mov(rcx, GetContextReg()); // context
|
|
call(rax);
|
|
|
|
EmitLoadVolatileRegs();
|
|
|
|
code_offsets.epilog = getSize();
|
|
|
|
add(rsp, stack_size);
|
|
ret();
|
|
#elif XE_PLATFORM_LINUX || XE_PLATFORM_MAC
|
|
// This function is being called using the Microsoft ABI from CallNative
|
|
// rcx = target function
|
|
// rdx = arg0
|
|
// r8 = arg1
|
|
// r9 = arg2
|
|
|
|
// Must be translated to System-V ABI:
|
|
// rdi = target function
|
|
// rsi = arg0
|
|
// rdx = arg1
|
|
// rcx = arg2
|
|
// r8, r9 - unused argument registers
|
|
|
|
struct _code_offsets {
|
|
size_t prolog;
|
|
size_t prolog_stack_alloc;
|
|
size_t body;
|
|
size_t epilog;
|
|
size_t tail;
|
|
} code_offsets = {};
|
|
|
|
constexpr size_t stack_size = StackLayout::THUNK_STACK_SIZE;
|
|
|
|
code_offsets.prolog = getSize();
|
|
|
|
// rsp + 0 = return address
|
|
sub(rsp, stack_size);
|
|
|
|
code_offsets.prolog_stack_alloc = getSize();
|
|
code_offsets.body = getSize();
|
|
|
|
// Save off volatile registers.
|
|
EmitSaveVolatileRegs();
|
|
|
|
mov(rax, rcx); // function
|
|
mov(rdi, GetContextReg()); // context
|
|
mov(rsi, rdx); // arg0
|
|
mov(rdx, r8); // arg1
|
|
mov(rcx, r9); // arg2
|
|
call(rax);
|
|
|
|
EmitLoadVolatileRegs();
|
|
|
|
code_offsets.epilog = getSize();
|
|
|
|
add(rsp, stack_size);
|
|
ret();
|
|
#else
|
|
  assert_always("Unknown platform ABI in guest to host thunk!");
|
|
#endif
|
|
|
|
code_offsets.tail = getSize();
|
|
|
|
assert_zero(code_offsets.prolog);
|
|
EmitFunctionInfo func_info = {};
|
|
func_info.code_size.total = getSize();
|
|
func_info.code_size.prolog = code_offsets.body - code_offsets.prolog;
|
|
func_info.code_size.body = code_offsets.epilog - code_offsets.body;
|
|
func_info.code_size.epilog = code_offsets.tail - code_offsets.epilog;
|
|
func_info.code_size.tail = getSize() - code_offsets.tail;
|
|
func_info.prolog_stack_alloc_offset =
|
|
code_offsets.prolog_stack_alloc - code_offsets.prolog;
|
|
func_info.stack_size = stack_size;
|
|
|
|
void* fn = Emplace(func_info);
|
|
return (GuestToHostThunk)fn;
|
|
}
|
|
|
|
// X64Emitter handles actually resolving functions.
|
|
uint64_t ResolveFunction(void* raw_context, uint64_t target_address);
|
|
|
|
ResolveFunctionThunk X64HelperEmitter::EmitResolveFunctionThunk() {
|
|
#if XE_PLATFORM_WIN32
|
|
// ebx = target PPC address
|
|
// rcx = context
|
|
|
|
_code_offsets code_offsets = {};
|
|
|
|
constexpr size_t stack_size = StackLayout::THUNK_STACK_SIZE;
|
|
|
|
code_offsets.prolog = getSize();
|
|
|
|
// rsp + 0 = return address
|
|
sub(rsp, stack_size);
|
|
|
|
code_offsets.prolog_stack_alloc = getSize();
|
|
code_offsets.body = getSize();
|
|
|
|
// Save volatile registers
|
|
EmitSaveVolatileRegs();
|
|
|
|
mov(rcx, rsi); // context
|
|
mov(rdx, rbx);
|
|
mov(rax, reinterpret_cast<uint64_t>(&ResolveFunction));
|
|
call(rax);
|
|
|
|
EmitLoadVolatileRegs();
|
|
|
|
code_offsets.epilog = getSize();
|
|
|
|
add(rsp, stack_size);
|
|
jmp(rax);
|
|
#elif XE_PLATFORM_LINUX || XE_PLATFORM_MAC
|
|
// Function is called with the following params:
|
|
// ebx = target PPC address
|
|
// rsi = context
|
|
|
|
// System-V ABI args:
|
|
// rdi = context
|
|
// rsi = target PPC address
|
|
|
|
struct _code_offsets {
|
|
size_t prolog;
|
|
size_t prolog_stack_alloc;
|
|
size_t body;
|
|
size_t epilog;
|
|
size_t tail;
|
|
} code_offsets = {};
|
|
constexpr size_t stack_size = StackLayout::THUNK_STACK_SIZE;
|
|
|
|
code_offsets.prolog = getSize();
|
|
|
|
// rsp + 0 = return address
|
|
sub(rsp, stack_size);
|
|
|
|
code_offsets.prolog_stack_alloc = getSize();
|
|
code_offsets.body = getSize();
|
|
|
|
// Save volatile registers
|
|
EmitSaveVolatileRegs();
|
|
mov(rdi, rsi); // context
|
|
mov(rsi, rbx); // target PPC address
|
|
mov(rax, reinterpret_cast<uint64_t>(&ResolveFunction));
|
|
call(rax);
|
|
|
|
EmitLoadVolatileRegs();
|
|
|
|
code_offsets.epilog = getSize();
|
|
|
|
add(rsp, stack_size);
|
|
jmp(rax);
|
|
#else
|
|
assert_always("Unknown platform ABI in resolve function!");
|
|
#endif
|
|
|
|
code_offsets.tail = getSize();
|
|
|
|
assert_zero(code_offsets.prolog);
|
|
EmitFunctionInfo func_info = {};
|
|
func_info.code_size.total = getSize();
|
|
func_info.code_size.prolog = code_offsets.body - code_offsets.prolog;
|
|
func_info.code_size.body = code_offsets.epilog - code_offsets.body;
|
|
func_info.code_size.epilog = code_offsets.tail - code_offsets.epilog;
|
|
func_info.code_size.tail = getSize() - code_offsets.tail;
|
|
func_info.prolog_stack_alloc_offset =
|
|
code_offsets.prolog_stack_alloc - code_offsets.prolog;
|
|
func_info.stack_size = stack_size;
|
|
|
|
void* fn = Emplace(func_info);
|
|
return (ResolveFunctionThunk)fn;
|
|
}
|
|
// r11 = size of the caller's stack, r8 = return address w/ adjustment
|
|
// i'm not proud of this code, but it shouldn't be executed frequently at all
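// Scans the recorded stackpoints downward until it finds the entry matching
// the current guest r1 (and, when several frames were unwound, the guest
// return address being returned to), restores rsp from that entry's
// host_stack_ minus the caller's stack size in r11, rewinds
// current_stackpoint_depth, and jumps back through r8. This is what repairs
// the host stack after a guest setjmp/longjmp-style non-local return.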
|
|
void* X64HelperEmitter::EmitGuestAndHostSynchronizeStackHelper() {
|
|
_code_offsets code_offsets = {};
|
|
code_offsets.prolog = getSize();
|
|
push(rbx);
|
|
mov(rbx, GetBackendCtxPtr(offsetof(X64BackendContext, stackpoints)));
|
|
mov(eax,
|
|
GetBackendCtxPtr(offsetof(X64BackendContext, current_stackpoint_depth)));
|
|
|
|
lea(ecx, ptr[eax - 1]);
|
|
mov(r9d, ptr[GetContextReg() + offsetof(ppc::PPCContext, r[1])]);
|
|
|
|
Xbyak::Label looper{};
|
|
Xbyak::Label loopout{};
|
|
Xbyak::Label signed_underflow{};
|
|
xor_(r12d, r12d);
|
|
|
|
  // todo: should use the LOOP instruction here if hasFastLoop; xbyak does not
  // currently support it, but it's easy to add.
|
|
L(looper);
|
|
imul(edx, ecx, sizeof(X64BackendStackpoint));
|
|
mov(r10d, ptr[rbx + rdx + offsetof(X64BackendStackpoint, guest_stack_)]);
|
|
|
|
cmp(r10d, r9d);
|
|
|
|
jge(loopout, T_NEAR);
|
|
|
|
inc(r12d);
|
|
|
|
if (IsFeatureEnabled(kX64FlagsIndependentVars)) {
|
|
dec(ecx);
|
|
} else {
|
|
sub(ecx, 1);
|
|
}
|
|
js(signed_underflow, T_NEAR); // should be impossible!!
|
|
|
|
jmp(looper, T_NEAR);
|
|
L(loopout);
|
|
Xbyak::Label skip_adjust{};
|
|
cmp(r12d, 1); // should never happen?
|
|
jle(skip_adjust, T_NEAR);
|
|
Xbyak::Label we_good{};
|
|
|
|
// now we need to make sure that the return address matches
|
|
|
|
// mov(r9d, ptr[GetContextReg() + offsetof(ppc::PPCContext, lr)]);
|
|
pop(r9); // guest retaddr
|
|
// r10d = the guest_stack
|
|
// while guest_stack is equal and return address is not equal, decrement
|
|
|
|
Xbyak::Label search_for_retaddr{};
|
|
Xbyak::Label we_good_but_increment{};
|
|
L(search_for_retaddr);
|
|
|
|
imul(edx, ecx, sizeof(X64BackendStackpoint));
|
|
|
|
cmp(r10d, ptr[rbx + rdx + offsetof(X64BackendStackpoint, guest_stack_)]);
|
|
|
|
jnz(we_good_but_increment, T_NEAR);
|
|
|
|
cmp(r9d,
|
|
ptr[rbx + rdx + offsetof(X64BackendStackpoint, guest_return_address_)]);
|
|
jz(we_good, T_NEAR); // stack is equal, return address is equal, we've got
|
|
// our destination stack
|
|
dec(ecx);
|
|
jmp(search_for_retaddr, T_NEAR);
|
|
Xbyak::Label checkbp{};
|
|
|
|
L(we_good_but_increment);
|
|
add(edx, sizeof(X64BackendStackpoint));
|
|
inc(ecx);
|
|
jmp(checkbp, T_NEAR);
|
|
L(we_good);
|
|
// we're popping this return address, so go down by one
|
|
sub(edx, sizeof(X64BackendStackpoint));
|
|
dec(ecx);
|
|
L(checkbp);
|
|
mov(rsp, ptr[rbx + rdx + offsetof(X64BackendStackpoint, host_stack_)]);
|
|
if (IsFeatureEnabled(kX64FlagsIndependentVars)) {
|
|
inc(ecx);
|
|
} else {
|
|
add(ecx, 1);
|
|
}
|
|
|
|
sub(rsp, r11); // adjust stack
|
|
|
|
mov(GetBackendCtxPtr(offsetof(X64BackendContext, current_stackpoint_depth)),
|
|
ecx); // set next stackpoint index to be after the one we restored to
|
|
jmp(r8);
|
|
L(skip_adjust);
|
|
pop(rbx);
|
|
jmp(r8); // return to caller
|
|
code_offsets.prolog_stack_alloc = getSize();
|
|
code_offsets.body = getSize();
|
|
code_offsets.epilog = getSize();
|
|
code_offsets.tail = getSize();
|
|
|
|
L(signed_underflow);
|
|
// find a good, compact way to signal error here
|
|
// maybe an invalid opcode that we execute, then detect in an exception
|
|
// handler?
|
|
|
|
this->DebugBreak();
|
|
return EmitCurrentForOffsets(code_offsets);
|
|
}
|
|
|
|
void* X64HelperEmitter::EmitGuestAndHostSynchronizeStackSizeLoadThunk(
|
|
void* sync_func, unsigned stack_element_size) {
|
|
_code_offsets code_offsets = {};
|
|
code_offsets.prolog = getSize();
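  // The caller embeds the stack-size operand in its code stream immediately
  // after the call to this thunk: pop the return address, load the operand
  // (byte/halfword/word per stack_element_size), advance past it, and
  // tail-jump to the shared sync helper with r11 = stack size and
  // r8 = adjusted return address.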
|
|
pop(r8); // return address
|
|
|
|
switch (stack_element_size) {
|
|
case 4:
|
|
mov(r11d, ptr[r8]);
|
|
break;
|
|
case 2:
|
|
movzx(r11d, word[r8]);
|
|
break;
|
|
case 1:
|
|
movzx(r11d, byte[r8]);
|
|
break;
|
|
}
|
|
add(r8, stack_element_size);
|
|
jmp(sync_func, T_NEAR);
|
|
code_offsets.prolog_stack_alloc = getSize();
|
|
code_offsets.body = getSize();
|
|
code_offsets.epilog = getSize();
|
|
code_offsets.tail = getSize();
|
|
return EmitCurrentForOffsets(code_offsets);
|
|
}
|
|
|
|
void* X64HelperEmitter::EmitScalarVRsqrteHelper() {
|
|
_code_offsets code_offsets = {};
|
|
|
|
Xbyak::Label L18, L2, L35, L4, L9, L8, L10, L11, L12, L13, L1;
|
|
Xbyak::Label LC1, _LCPI3_1;
|
|
Xbyak::Label handle_denormal_input;
|
|
Xbyak::Label specialcheck_1, convert_to_signed_inf_and_ret,
|
|
handle_oddball_denormal;
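  // Emulates LZCNT with BSR for CPUs without it: BSR sets ZF when the input is
  // zero, in which case the result register keeps the 32 loaded just after;
  // otherwise the highest-set-bit index is XORed with 31 to convert it into a
  // leading-zero count.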
|
|
|
|
auto emulate_lzcnt_helper_unary_reg = [this](auto& reg, auto& scratch_reg) {
|
|
inLocalLabel();
|
|
Xbyak::Label end_lzcnt;
|
|
bsr(scratch_reg, reg);
|
|
mov(reg, 0x20);
|
|
jz(end_lzcnt);
|
|
xor_(scratch_reg, 0x1F);
|
|
mov(reg, scratch_reg);
|
|
L(end_lzcnt);
|
|
outLocalLabel();
|
|
};
|
|
|
|
vmovd(r8d, xmm0);
|
|
vmovaps(xmm1, xmm0);
|
|
mov(ecx, r8d);
|
|
// extract mantissa
|
|
and_(ecx, 0x7fffff);
|
|
mov(edx, ecx);
|
|
cmp(r8d, 0xff800000);
|
|
jz(specialcheck_1, CodeGenerator::T_NEAR);
|
|
// is exponent zero?
|
|
test(r8d, 0x7f800000);
|
|
jne(L18);
|
|
test(ecx, ecx);
|
|
jne(L2);
|
|
|
|
L(L18);
|
|
// extract biased exponent and unbias
|
|
mov(r9d, r8d);
|
|
shr(r9d, 23);
|
|
movzx(r9d, r9b);
|
|
lea(eax, ptr[r9 - 127]);
|
|
cmp(r9d, 255);
|
|
jne(L4);
|
|
jmp(L35);
|
|
|
|
L(L2);
|
|
|
|
bt(GetBackendFlagsPtr(), kX64BackendNJMOn);
|
|
jnc(handle_denormal_input, CodeGenerator::T_NEAR);
|
|
|
|
// handle denormal input with NJM on
|
|
// denorms get converted to zero w/ input sign, jump to our label
|
|
// that handles inputs of 0 for this
|
|
|
|
jmp(convert_to_signed_inf_and_ret);
|
|
L(L35);
|
|
|
|
vxorps(xmm0, xmm0, xmm0);
|
|
mov(eax, 128);
|
|
vcomiss(xmm1, xmm0);
|
|
jb(L4);
|
|
test(ecx, ecx);
|
|
jne(L8);
|
|
ret();
|
|
|
|
L(L4);
|
|
cmp(eax, 128);
|
|
jne(L9);
|
|
vxorps(xmm0, xmm0, xmm0);
|
|
vcomiss(xmm0, xmm1);
|
|
jbe(L9);
|
|
vmovss(xmm2, ptr[rip + LC1]);
|
|
vandps(xmm1, GetXmmConstPtr(XMMSignMaskF32));
|
|
|
|
test(edx, edx);
|
|
jne(L8);
|
|
vorps(xmm0, xmm2, xmm2);
|
|
ret();
|
|
|
|
L(L9);
|
|
test(edx, edx);
|
|
je(L10);
|
|
cmp(eax, 128);
|
|
jne(L11);
|
|
L(L8);
|
|
or_(r8d, 0x400000);
|
|
vmovd(xmm0, r8d);
|
|
ret();
|
|
L(L10);
|
|
test(r9d, r9d);
|
|
jne(L11);
|
|
L(convert_to_signed_inf_and_ret);
|
|
not_(r8d);
|
|
shr(r8d, 31);
|
|
|
|
lea(rdx, ptr[rip + _LCPI3_1]);
|
|
shl(r8d, 2);
|
|
vmovss(xmm0, ptr[r8 + rdx]);
|
|
ret();
|
|
|
|
L(L11);
|
|
vxorps(xmm2, xmm2, xmm2);
|
|
vmovss(xmm0, ptr[rip + LC1]);
|
|
vcomiss(xmm2, xmm1);
|
|
ja(L1, CodeGenerator::T_NEAR);
|
|
mov(ecx, 127);
|
|
sal(eax, 4);
|
|
sub(ecx, r9d);
|
|
mov(r9d, edx);
|
|
and_(eax, 16);
|
|
shr(edx, 9);
|
|
shr(r9d, 19);
|
|
and_(edx, 1023);
|
|
sar(ecx, 1);
|
|
or_(eax, r9d);
|
|
xor_(eax, 16);
|
|
mov(r9d, ptr[backend()->LookupXMMConstantAddress32(XMMVRsqrteTableStart) +
|
|
rax * 4]);
|
|
mov(eax, r9d);
|
|
shr(r9d, 16);
|
|
imul(edx, r9d);
|
|
sal(eax, 10);
|
|
and_(eax, 0x3fffc00);
|
|
sub(eax, edx);
|
|
bt(eax, 25);
|
|
jc(L12);
|
|
mov(edx, eax);
|
|
add(ecx, 6);
|
|
and_(edx, 0x1ffffff);
|
|
|
|
if (IsFeatureEnabled(kX64EmitLZCNT)) {
|
|
lzcnt(edx, edx);
|
|
} else {
|
|
emulate_lzcnt_helper_unary_reg(edx, r9d);
|
|
}
|
|
|
|
lea(r9d, ptr[rdx - 6]);
|
|
sub(ecx, edx);
|
|
if (IsFeatureEnabled(kX64EmitBMI2)) {
|
|
shlx(eax, eax, r9d);
|
|
} else {
|
|
xchg(ecx, r9d);
|
|
shl(eax, cl);
|
|
xchg(ecx, r9d);
|
|
}
|
|
|
|
L(L12);
|
|
test(al, 5);
|
|
je(L13);
|
|
test(al, 2);
|
|
je(L13);
|
|
add(eax, 4);
|
|
|
|
L(L13);
|
|
sal(ecx, 23);
|
|
and_(r8d, 0x80000000);
|
|
shr(eax, 2);
|
|
add(ecx, 0x3f800000);
|
|
and_(eax, 0x7fffff);
|
|
vxorps(xmm1, xmm1);
|
|
or_(ecx, r8d);
|
|
or_(ecx, eax);
|
|
vmovd(xmm0, ecx);
|
|
vaddss(xmm0, xmm1); // apply DAZ behavior to output
|
|
|
|
L(L1);
|
|
ret();
|
|
|
|
L(handle_denormal_input);
|
|
mov(r9d, r8d);
|
|
and_(r9d, 0x7FFFFFFF);
|
|
cmp(r9d, 0x400000);
|
|
jz(handle_oddball_denormal);
|
|
if (IsFeatureEnabled(kX64EmitLZCNT)) {
|
|
lzcnt(ecx, ecx);
|
|
} else {
|
|
emulate_lzcnt_helper_unary_reg(ecx, r9d);
|
|
}
|
|
|
|
mov(r9d, 9);
|
|
mov(eax, -118);
|
|
lea(edx, ptr[rcx - 8]);
|
|
sub(r9d, ecx);
|
|
sub(eax, ecx);
|
|
if (IsFeatureEnabled(kX64EmitBMI2)) {
|
|
shlx(edx, r8d, edx);
|
|
} else {
|
|
xchg(ecx, edx);
|
|
    // xmm0's low dword still holds the original input, so r8d can be restored
    // from it below
|
|
shl(r8d, cl);
|
|
mov(ecx,
|
|
edx); // restore ecx, dont xchg because we're going to spoil edx anyway
|
|
mov(edx, r8d);
|
|
vmovd(r8d, xmm0);
|
|
}
|
|
and_(edx, 0x7ffffe);
|
|
jmp(L4);
|
|
|
|
L(specialcheck_1);
|
|
// should be extremely rare
|
|
vmovss(xmm0, ptr[rip + LC1]);
|
|
ret();
|
|
|
|
L(handle_oddball_denormal);
|
|
not_(r8d);
|
|
lea(r9, ptr[rip + LC1]);
|
|
|
|
shr(r8d, 31);
|
|
movss(xmm0, ptr[r9 + r8 * 4]);
|
|
ret();
|
|
|
|
L(_LCPI3_1);
|
|
dd(0xFF800000);
|
|
dd(0x7F800000);
|
|
L(LC1);
|
|
// the position of 7FC00000 here matters, this address will be indexed in
|
|
// handle_oddball_denormal
|
|
dd(0x7FC00000);
|
|
dd(0x5F34FD00);
|
|
|
|
code_offsets.prolog_stack_alloc = getSize();
|
|
code_offsets.body = getSize();
|
|
code_offsets.prolog = getSize();
|
|
code_offsets.epilog = getSize();
|
|
code_offsets.tail = getSize();
|
|
return EmitCurrentForOffsets(code_offsets);
|
|
}
|
|
|
|
void* X64HelperEmitter::EmitVectorVRsqrteHelper(void* scalar_helper) {
|
|
_code_offsets code_offsets = {};
|
|
Xbyak::Label check_scalar_operation_in_vmx, actual_vector_version;
|
|
auto result_ptr =
|
|
GetBackendCtxPtr(offsetof(X64BackendContext, helper_scratch_xmms[0]));
|
|
auto counter_ptr =
|
|
GetBackendCtxPtr(offsetof(X64BackendContext, helper_scratch_u64s[2]));
|
|
counter_ptr.setBit(64);
|
|
|
|
// shuffle and xor to check whether all lanes are equal
|
|
// sadly has to leave the float pipeline for the vptest, which is moderate
|
|
// yikes
|
|
vmovhlps(xmm2, xmm0, xmm0);
|
|
vmovsldup(xmm1, xmm0);
|
|
vxorps(xmm1, xmm1, xmm0);
|
|
vxorps(xmm2, xmm2, xmm0);
|
|
vorps(xmm2, xmm1, xmm2);
|
|
vptest(xmm2, xmm2);
|
|
jnz(check_scalar_operation_in_vmx);
|
|
// jmp(scalar_helper, CodeGenerator::T_NEAR);
|
|
call(scalar_helper);
|
|
vshufps(xmm0, xmm0, xmm0, 0);
|
|
ret();
|
|
|
|
L(check_scalar_operation_in_vmx);
|
|
|
|
vptest(xmm0, ptr[backend()->LookupXMMConstantAddress(XMMThreeFloatMask)]);
|
|
jnz(actual_vector_version);
|
|
vshufps(xmm0, xmm0, xmm0, _MM_SHUFFLE(3, 3, 3, 3));
|
|
call(scalar_helper);
|
|
// this->DebugBreak();
|
|
vinsertps(xmm0, xmm0, (3 << 4));
|
|
|
|
vblendps(xmm0, xmm0, ptr[backend()->LookupXMMConstantAddress(XMMFloatInf)],
|
|
0b0111);
|
|
|
|
ret();
|
|
|
|
L(actual_vector_version);
|
|
|
|
xor_(ecx, ecx);
|
|
vmovaps(result_ptr, xmm0);
|
|
|
|
mov(counter_ptr, rcx);
|
|
Xbyak::Label loop;
|
|
|
|
L(loop);
|
|
lea(rax, result_ptr);
|
|
vmovss(xmm0, ptr[rax + rcx * 4]);
|
|
call(scalar_helper);
|
|
mov(rcx, counter_ptr);
|
|
lea(rax, result_ptr);
|
|
vmovss(ptr[rax + rcx * 4], xmm0);
|
|
inc(ecx);
|
|
cmp(ecx, 4);
|
|
mov(counter_ptr, rcx);
|
|
jl(loop);
|
|
vmovaps(xmm0, result_ptr);
|
|
ret();
|
|
code_offsets.prolog_stack_alloc = getSize();
|
|
code_offsets.body = getSize();
|
|
code_offsets.epilog = getSize();
|
|
code_offsets.tail = getSize();
|
|
code_offsets.prolog = getSize();
|
|
return EmitCurrentForOffsets(code_offsets);
|
|
}
|
|
|
|
void* X64HelperEmitter::EmitFrsqrteHelper() {
|
|
_code_offsets code_offsets = {};
|
|
code_offsets.prolog_stack_alloc = getSize();
|
|
code_offsets.body = getSize();
|
|
code_offsets.epilog = getSize();
|
|
code_offsets.tail = getSize();
|
|
code_offsets.prolog = getSize();
|
|
|
|
Xbyak::Label L2, L7, L6, L9, L1, L12, L24, L3, L25, frsqrte_table2, LC1;
|
|
bt(GetBackendFlagsPtr(), kX64BackendNonIEEEMode);
|
|
vmovq(rax, xmm0);
|
|
jc(L24, CodeGenerator::T_NEAR);
|
|
L(L2);
|
|
mov(rcx, rax);
|
|
add(rcx, rcx);
|
|
je(L3, CodeGenerator::T_NEAR);
|
|
mov(rdx, 0x7ff0000000000000ULL);
|
|
vxorpd(xmm1, xmm1, xmm1);
|
|
if (IsFeatureEnabled(kX64EmitBMI1)) {
|
|
andn(rcx, rax, rdx);
|
|
} else {
|
|
mov(rcx, rax);
|
|
not_(rcx);
|
|
and_(rcx, rdx);
|
|
}
|
|
|
|
jne(L6);
|
|
cmp(rax, rdx);
|
|
je(L1, CodeGenerator::T_NEAR);
|
|
mov(r8, rax);
|
|
sal(r8, 12);
|
|
jne(L7);
|
|
vcomisd(xmm0, xmm1);
|
|
jb(L12, CodeGenerator::T_NEAR);
|
|
|
|
L(L7);
|
|
mov(rdx, 0x7ff8000000000000ULL);
|
|
or_(rax, rdx);
|
|
vmovq(xmm1, rax);
|
|
vmovapd(xmm0, xmm1);
|
|
ret();
|
|
|
|
L(L6);
|
|
vcomisd(xmm1, xmm0);
|
|
ja(L12, CodeGenerator::T_NEAR);
|
|
mov(rcx, rax);
|
|
mov(rdx, 0xfffffffffffffULL);
|
|
shr(rcx, 52);
|
|
and_(ecx, 2047);
|
|
and_(rax, rdx);
|
|
je(L9);
|
|
test(ecx, ecx);
|
|
je(L25, CodeGenerator::T_NEAR);
|
|
|
|
L(L9);
|
|
lea(edx, ptr[0 + rcx * 8]);
|
|
shr(rax, 49);
|
|
sub(ecx, 1023);
|
|
and_(edx, 8);
|
|
and_(eax, 7);
|
|
shr(ecx, 1);
|
|
or_(eax, edx);
|
|
mov(edx, 1022);
|
|
xor_(eax, 8);
|
|
sub(edx, ecx);
|
|
lea(rcx, ptr[rip + frsqrte_table2]);
|
|
movzx(eax, byte[rax + rcx]);
|
|
sal(rdx, 52);
|
|
sal(rax, 44);
|
|
or_(rax, rdx);
|
|
vmovq(xmm1, rax);
|
|
|
|
L(L1);
|
|
vmovapd(xmm0, xmm1);
|
|
ret();
|
|
|
|
L(L12);
|
|
vmovsd(xmm1, qword[rip + LC1]);
|
|
vmovapd(xmm0, xmm1);
|
|
ret();
|
|
|
|
L(L24);
|
|
mov(r8, rax);
|
|
sal(r8, 12);
|
|
je(L2);
|
|
mov(rdx, 0x7ff0000000000000);
|
|
test(rax, rdx);
|
|
jne(L2);
|
|
mov(rdx, 0x8000000000000000ULL);
|
|
and_(rax, rdx);
|
|
|
|
L(L3);
|
|
mov(rdx, 0x8000000000000000ULL);
|
|
and_(rax, rdx);
|
|
mov(rdx, 0x7ff0000000000000ULL);
|
|
or_(rax, rdx);
|
|
vmovq(xmm1, rax);
|
|
vmovapd(xmm0, xmm1);
|
|
ret();
|
|
|
|
L(L25);
|
|
if (IsFeatureEnabled(kX64EmitLZCNT)) {
|
|
lzcnt(rdx, rax);
|
|
} else {
|
|
Xbyak::Label end_lzcnt;
|
|
bsr(rcx, rax);
|
|
mov(rdx, 0x40);
|
|
jz(end_lzcnt);
|
|
xor_(rcx, 0x3F);
|
|
mov(rdx, rcx);
|
|
L(end_lzcnt);
|
|
}
|
|
lea(ecx, ptr[rdx - 11]);
|
|
if (IsFeatureEnabled(kX64EmitBMI2)) {
|
|
shlx(rax, rax, rcx);
|
|
} else {
|
|
shl(rax, cl);
|
|
}
|
|
mov(ecx, 12);
|
|
sub(ecx, edx);
|
|
jmp(L9, CodeGenerator::T_NEAR);
|
|
|
|
L(frsqrte_table2);
|
|
static constexpr unsigned char table_values[] = {
|
|
241u, 216u, 192u, 168u, 152u, 136u, 128u, 112u,
|
|
96u, 76u, 60u, 48u, 32u, 24u, 16u, 8u};
|
|
db(table_values, sizeof(table_values));
|
|
|
|
L(LC1);
|
|
dd(0);
|
|
dd(0x7ff80000);
|
|
return EmitCurrentForOffsets(code_offsets);
|
|
}
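// ecx = guest address of the reservation target.
// Marks the reservation block containing the address by setting its bit in the
// shared reserve bitmap (lock bts), caches the qword offset and bit index in
// the per-thread backend context, and sets kX64BackendHasReserveBit in the
// backend flags only if the bit was previously clear.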
|
|
|
|
void* X64HelperEmitter::EmitTryAcquireReservationHelper() {
|
|
_code_offsets code_offsets = {};
|
|
code_offsets.prolog = getSize();
|
|
|
|
Xbyak::Label already_has_a_reservation;
|
|
Xbyak::Label acquire_new_reservation;
|
|
|
|
btr(GetBackendFlagsPtr(), kX64BackendHasReserveBit);
|
|
mov(r8, GetBackendCtxPtr(offsetof(X64BackendContext, reserve_helper_)));
|
|
jc(already_has_a_reservation);
|
|
|
|
shr(ecx, RESERVE_BLOCK_SHIFT);
|
|
xor_(r9d, r9d);
|
|
mov(edx, ecx);
|
|
shr(edx, 6); // divide by 64
|
|
lea(rdx, ptr[r8 + rdx * 8]);
|
|
and_(ecx, 64 - 1);
|
|
|
|
lock();
|
|
bts(qword[rdx], rcx);
|
|
// set flag on local backend context for thread to indicate our previous
|
|
// attempt to get the reservation succeeded
|
|
setnc(r9b); // success = bitmap did not have a set bit at the idx
|
|
shl(r9b, kX64BackendHasReserveBit);
|
|
|
|
mov(GetBackendCtxPtr(offsetof(X64BackendContext, cached_reserve_offset)),
|
|
rdx);
|
|
mov(GetBackendCtxPtr(offsetof(X64BackendContext, cached_reserve_bit)), ecx);
|
|
|
|
or_(GetBackendCtxPtr(offsetof(X64BackendContext, flags)), r9d);
|
|
ret();
|
|
L(already_has_a_reservation);
|
|
DebugBreak();
|
|
|
|
code_offsets.prolog_stack_alloc = getSize();
|
|
code_offsets.body = getSize();
|
|
code_offsets.epilog = getSize();
|
|
code_offsets.tail = getSize();
|
|
return EmitCurrentForOffsets(code_offsets);
|
|
}
|
|
// ecx=guest addr
|
|
// r9 = host addr
|
|
// r8 = value
|
|
// if ZF is set and CF is set, we succeeded
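// This is the store half of the emulated load-reserved/store-conditional pair:
// the store only happens (via lock cmpxchg against the value captured when the
// reservation was taken) if this thread still holds the reservation for the
// same bitmap slot, after which the reservation bit is released.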
|
|
void* X64HelperEmitter::EmitReservedStoreHelper(bool bit64) {
|
|
_code_offsets code_offsets = {};
|
|
code_offsets.prolog = getSize();
|
|
Xbyak::Label done;
|
|
Xbyak::Label reservation_isnt_for_our_addr;
|
|
Xbyak::Label somehow_double_cleared;
|
|
// carry must be set + zero flag must be set
|
|
|
|
btr(GetBackendFlagsPtr(), kX64BackendHasReserveBit);
|
|
|
|
jnc(done);
|
|
|
|
mov(rax, GetBackendCtxPtr(offsetof(X64BackendContext, reserve_helper_)));
|
|
|
|
shr(ecx, RESERVE_BLOCK_SHIFT);
|
|
mov(edx, ecx);
|
|
shr(edx, 6); // divide by 64
|
|
lea(rdx, ptr[rax + rdx * 8]);
|
|
// begin acquiring exclusive access to cacheline containing our bit
|
|
prefetchw(ptr[rdx]);
|
|
|
|
cmp(GetBackendCtxPtr(offsetof(X64BackendContext, cached_reserve_offset)),
|
|
rdx);
|
|
jnz(reservation_isnt_for_our_addr);
|
|
|
|
mov(rax,
|
|
GetBackendCtxPtr(offsetof(X64BackendContext, cached_reserve_value_)));
|
|
|
|
  // we need the bit offset modulo the operand bit size; BT's implicit modulo
  // behavior only applies to register operands, not memory operands
  // todo: actually, the above note may not be true, double check it
|
|
and_(ecx, 64 - 1);
|
|
cmp(GetBackendCtxPtr(offsetof(X64BackendContext, cached_reserve_bit)), ecx);
|
|
jnz(reservation_isnt_for_our_addr);
|
|
|
|
// was our memory modified by kernel code or something?
|
|
lock();
|
|
if (bit64) {
|
|
cmpxchg(ptr[r9], r8);
|
|
|
|
} else {
|
|
cmpxchg(ptr[r9], r8d);
|
|
}
|
|
// the ZF flag is unaffected by BTR! we exploit this for the retval
|
|
|
|
// cancel our lock on the 65k block
|
|
lock();
|
|
btr(qword[rdx], rcx);
|
|
|
|
jnc(somehow_double_cleared);
|
|
|
|
L(done);
|
|
  // i don't care that there's a dependency on the prev value of rax atm
  // sadly there's no CF&ZF condition code
|
|
setz(al);
|
|
setc(ah);
|
|
cmp(ax, 0x0101);
|
|
ret();
|
|
|
|
// could be the same label, but otherwise we don't know where we came from
|
|
// when one gets triggered
|
|
L(reservation_isnt_for_our_addr);
|
|
DebugBreak();
|
|
|
|
L(somehow_double_cleared); // somehow, something else cleared our reserve??
|
|
DebugBreak();
|
|
|
|
code_offsets.prolog_stack_alloc = getSize();
|
|
code_offsets.body = getSize();
|
|
code_offsets.epilog = getSize();
|
|
code_offsets.tail = getSize();
|
|
return EmitCurrentForOffsets(code_offsets);
|
|
}
|
|
|
|
void X64HelperEmitter::EmitSaveVolatileRegs() {
|
|
// Save off volatile registers.
|
|
// mov(qword[rsp + offsetof(StackLayout::Thunk, r[0])], rax);
|
|
mov(qword[rsp + offsetof(StackLayout::Thunk, r[1])], rcx);
|
|
mov(qword[rsp + offsetof(StackLayout::Thunk, r[2])], rdx);
|
|
#if XE_PLATFORM_LINUX
|
|
mov(qword[rsp + offsetof(StackLayout::Thunk, r[3])], rsi);
|
|
mov(qword[rsp + offsetof(StackLayout::Thunk, r[4])], rdi);
|
|
#endif
|
|
mov(qword[rsp + offsetof(StackLayout::Thunk, r[5])], r8);
|
|
mov(qword[rsp + offsetof(StackLayout::Thunk, r[6])], r9);
|
|
mov(qword[rsp + offsetof(StackLayout::Thunk, r[7])], r10);
|
|
mov(qword[rsp + offsetof(StackLayout::Thunk, r[8])], r11);
|
|
|
|
// vmovaps(qword[rsp + offsetof(StackLayout::Thunk, xmm[0])], xmm0);
|
|
vmovaps(qword[rsp + offsetof(StackLayout::Thunk, xmm[1])], xmm1);
|
|
vmovaps(qword[rsp + offsetof(StackLayout::Thunk, xmm[2])], xmm2);
|
|
vmovaps(qword[rsp + offsetof(StackLayout::Thunk, xmm[3])], xmm3);
|
|
vmovaps(qword[rsp + offsetof(StackLayout::Thunk, xmm[4])], xmm4);
|
|
vmovaps(qword[rsp + offsetof(StackLayout::Thunk, xmm[5])], xmm5);
|
|
}
|
|
|
|
void X64HelperEmitter::EmitLoadVolatileRegs() {
|
|
// mov(rax, qword[rsp + offsetof(StackLayout::Thunk, r[0])]);
|
|
mov(rcx, qword[rsp + offsetof(StackLayout::Thunk, r[1])]);
|
|
mov(rdx, qword[rsp + offsetof(StackLayout::Thunk, r[2])]);
|
|
#if XE_PLATFORM_LINUX
|
|
mov(rsi, qword[rsp + offsetof(StackLayout::Thunk, r[3])]);
|
|
mov(rdi, qword[rsp + offsetof(StackLayout::Thunk, r[4])]);
|
|
#endif
|
|
mov(r8, qword[rsp + offsetof(StackLayout::Thunk, r[5])]);
|
|
mov(r9, qword[rsp + offsetof(StackLayout::Thunk, r[6])]);
|
|
mov(r10, qword[rsp + offsetof(StackLayout::Thunk, r[7])]);
|
|
mov(r11, qword[rsp + offsetof(StackLayout::Thunk, r[8])]);
|
|
|
|
// vmovaps(xmm0, qword[rsp + offsetof(StackLayout::Thunk, xmm[0])]);
|
|
vmovaps(xmm1, qword[rsp + offsetof(StackLayout::Thunk, xmm[1])]);
|
|
vmovaps(xmm2, qword[rsp + offsetof(StackLayout::Thunk, xmm[2])]);
|
|
vmovaps(xmm3, qword[rsp + offsetof(StackLayout::Thunk, xmm[3])]);
|
|
vmovaps(xmm4, qword[rsp + offsetof(StackLayout::Thunk, xmm[4])]);
|
|
vmovaps(xmm5, qword[rsp + offsetof(StackLayout::Thunk, xmm[5])]);
|
|
}
|
|
|
|
void X64HelperEmitter::EmitSaveNonvolatileRegs() {
|
|
mov(qword[rsp + offsetof(StackLayout::Thunk, r[0])], rbx);
|
|
mov(qword[rsp + offsetof(StackLayout::Thunk, r[1])], rbp);
|
|
#if XE_PLATFORM_WIN32
|
|
mov(qword[rsp + offsetof(StackLayout::Thunk, r[2])], rcx);
|
|
mov(qword[rsp + offsetof(StackLayout::Thunk, r[3])], rsi);
|
|
mov(qword[rsp + offsetof(StackLayout::Thunk, r[4])], rdi);
|
|
#endif
|
|
mov(qword[rsp + offsetof(StackLayout::Thunk, r[5])], r12);
|
|
mov(qword[rsp + offsetof(StackLayout::Thunk, r[6])], r13);
|
|
mov(qword[rsp + offsetof(StackLayout::Thunk, r[7])], r14);
|
|
mov(qword[rsp + offsetof(StackLayout::Thunk, r[8])], r15);
|
|
|
|
// SysV does not have nonvolatile XMM registers.
|
|
#if XE_PLATFORM_WIN32
|
|
vmovaps(qword[rsp + offsetof(StackLayout::Thunk, xmm[0])], xmm6);
|
|
vmovaps(qword[rsp + offsetof(StackLayout::Thunk, xmm[1])], xmm7);
|
|
vmovaps(qword[rsp + offsetof(StackLayout::Thunk, xmm[2])], xmm8);
|
|
vmovaps(qword[rsp + offsetof(StackLayout::Thunk, xmm[3])], xmm9);
|
|
vmovaps(qword[rsp + offsetof(StackLayout::Thunk, xmm[4])], xmm10);
|
|
vmovaps(qword[rsp + offsetof(StackLayout::Thunk, xmm[5])], xmm11);
|
|
vmovaps(qword[rsp + offsetof(StackLayout::Thunk, xmm[6])], xmm12);
|
|
vmovaps(qword[rsp + offsetof(StackLayout::Thunk, xmm[7])], xmm13);
|
|
vmovaps(qword[rsp + offsetof(StackLayout::Thunk, xmm[8])], xmm14);
|
|
vmovaps(qword[rsp + offsetof(StackLayout::Thunk, xmm[9])], xmm15);
|
|
#endif
|
|
}
|
|
|
|
void X64HelperEmitter::EmitLoadNonvolatileRegs() {
|
|
mov(rbx, qword[rsp + offsetof(StackLayout::Thunk, r[0])]);
|
|
mov(rbp, qword[rsp + offsetof(StackLayout::Thunk, r[1])]);
|
|
#if XE_PLATFORM_WIN32
|
|
mov(rcx, qword[rsp + offsetof(StackLayout::Thunk, r[2])]);
|
|
mov(rsi, qword[rsp + offsetof(StackLayout::Thunk, r[3])]);
|
|
mov(rdi, qword[rsp + offsetof(StackLayout::Thunk, r[4])]);
|
|
#endif
|
|
mov(r12, qword[rsp + offsetof(StackLayout::Thunk, r[5])]);
|
|
mov(r13, qword[rsp + offsetof(StackLayout::Thunk, r[6])]);
|
|
mov(r14, qword[rsp + offsetof(StackLayout::Thunk, r[7])]);
|
|
mov(r15, qword[rsp + offsetof(StackLayout::Thunk, r[8])]);
|
|
|
|
#if XE_PLATFORM_WIN32
|
|
vmovaps(xmm6, qword[rsp + offsetof(StackLayout::Thunk, xmm[0])]);
|
|
vmovaps(xmm7, qword[rsp + offsetof(StackLayout::Thunk, xmm[1])]);
|
|
vmovaps(xmm8, qword[rsp + offsetof(StackLayout::Thunk, xmm[2])]);
|
|
vmovaps(xmm9, qword[rsp + offsetof(StackLayout::Thunk, xmm[3])]);
|
|
vmovaps(xmm10, qword[rsp + offsetof(StackLayout::Thunk, xmm[4])]);
|
|
vmovaps(xmm11, qword[rsp + offsetof(StackLayout::Thunk, xmm[5])]);
|
|
vmovaps(xmm12, qword[rsp + offsetof(StackLayout::Thunk, xmm[6])]);
|
|
vmovaps(xmm13, qword[rsp + offsetof(StackLayout::Thunk, xmm[7])]);
|
|
vmovaps(xmm14, qword[rsp + offsetof(StackLayout::Thunk, xmm[8])]);
|
|
vmovaps(xmm15, qword[rsp + offsetof(StackLayout::Thunk, xmm[9])]);
|
|
#endif
|
|
}
|
|
void X64Backend::InitializeBackendContext(void* ctx) {
|
|
X64BackendContext* bctx = BackendContextForGuestContext(ctx);
|
|
bctx->mxcsr_fpu =
|
|
DEFAULT_FPU_MXCSR; // idk if this is right, check on rgh what the
|
|
// rounding on ppc is at startup
|
|
|
|
/*
|
|
todo: stackpoint arrays should be pooled virtual memory at the very
|
|
least there may be some fancy virtual address tricks we can do here
|
|
|
|
*/
|
|
|
|
bctx->stackpoints = cvars::enable_host_guest_stack_synchronization
|
|
? new X64BackendStackpoint[cvars::max_stackpoints]
|
|
: nullptr;
|
|
bctx->current_stackpoint_depth = 0;
|
|
bctx->mxcsr_vmx = DEFAULT_VMX_MXCSR;
|
|
bctx->flags = (1U << kX64BackendNJMOn); // NJM on by default
|
|
// https://media.discordapp.net/attachments/440280035056943104/1000765256643125308/unknown.png
|
|
bctx->Ox1000 = 0x1000;
|
|
bctx->guest_tick_count = Clock::GetGuestTickCountPointer();
|
|
bctx->reserve_helper_ = &reserve_helper_;
|
|
}
|
|
void X64Backend::DeinitializeBackendContext(void* ctx) {
|
|
X64BackendContext* bctx = BackendContextForGuestContext(ctx);
|
|
|
|
if (bctx->stackpoints) {
|
|
delete[] bctx->stackpoints;
|
|
bctx->stackpoints = nullptr;
|
|
}
|
|
}
|
|
|
|
void X64Backend::PrepareForReentry(void* ctx) {
|
|
X64BackendContext* bctx = BackendContextForGuestContext(ctx);
|
|
|
|
bctx->current_stackpoint_depth = 0;
|
|
}
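// Indexed by FPSCR[RN] | (FPSCR[NI] << 2) (see SetGuestRoundingMode below).
// Entries 0-3 keep all exceptions masked and select the x86 rounding-control
// value corresponding to the PPC rounding mode (nearest, toward zero, +inf,
// -inf); entries 4-7 are the same values with flush-to-zero (bit 15) also set
// for the non-IEEE mode.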
|
|
|
|
constexpr uint32_t mxcsr_table[8] = {
|
|
0x1F80, 0x7F80, 0x5F80, 0x3F80, 0x9F80, 0xFF80, 0xDF80, 0xBF80,
|
|
};
|
|
|
|
void X64Backend::SetGuestRoundingMode(void* ctx, unsigned int mode) {
|
|
X64BackendContext* bctx = BackendContextForGuestContext(ctx);
|
|
|
|
uint32_t control = mode & 7;
|
|
_mm_setcsr(mxcsr_table[control]);
|
|
bctx->mxcsr_fpu = mxcsr_table[control];
|
|
auto ppc_context = ((ppc::PPCContext*)ctx);
|
|
ppc_context->fpscr.bits.rn = control;
|
|
ppc_context->fpscr.bits.ni = control >> 2;
|
|
}
|
|
|
|
bool X64Backend::PopulatePseudoStacktrace(GuestPseudoStackTrace* st) {
|
|
if (!cvars::enable_host_guest_stack_synchronization) {
|
|
return false;
|
|
}
|
|
|
|
ThreadState* thrd_state = ThreadState::Get();
|
|
if (!thrd_state) {
|
|
return false; // we're not a guest!
|
|
}
|
|
ppc::PPCContext* ctx = thrd_state->context();
|
|
|
|
X64BackendContext* backend_ctx = BackendContextForGuestContext(ctx);
|
|
|
|
uint32_t depth = backend_ctx->current_stackpoint_depth - 1;
|
|
if (static_cast<int32_t>(depth) < 1) {
|
|
return false;
|
|
}
|
|
uint32_t num_entries_to_populate =
|
|
std::min(MAX_GUEST_PSEUDO_STACKTRACE_ENTRIES, depth);
|
|
|
|
st->count = num_entries_to_populate;
|
|
st->truncated_flag = num_entries_to_populate < depth ? 1 : 0;
|
|
|
|
X64BackendStackpoint* current_stackpoint =
|
|
&backend_ctx->stackpoints[backend_ctx->current_stackpoint_depth - 1];
|
|
|
|
for (uint32_t stp_index = 0; stp_index < num_entries_to_populate;
|
|
++stp_index) {
|
|
st->return_addrs[stp_index] = current_stackpoint->guest_return_address_;
|
|
current_stackpoint--;
|
|
}
|
|
return true;
|
|
}
|
|
|
|
#if XE_X64_PROFILER_AVAILABLE == 1
|
|
uint64_t* X64Backend::GetProfilerRecordForFunction(uint32_t guest_address) {
|
|
// who knows, we might want to compile different versions of a function one
|
|
// day
|
|
auto entry = profiler_data_.find(guest_address);
|
|
|
|
if (entry != profiler_data_.end()) {
|
|
return &entry->second;
|
|
} else {
|
|
profiler_data_[guest_address] = 0;
|
|
|
|
return &profiler_data_[guest_address];
|
|
}
|
|
}
|
|
|
|
#endif
|
|
|
|
// todo:flush cache
|
|
uint32_t X64Backend::CreateGuestTrampoline(GuestTrampolineProc proc,
|
|
void* userdata1, void* userdata2,
|
|
bool longterm) {
|
|
size_t new_index;
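  // Long-term trampolines are allocated from the back of the index bitmap,
  // presumably so that frequently created/freed short-term trampolines can
  // keep recycling the low indices.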
|
|
if (longterm) {
|
|
new_index = guest_trampoline_address_bitmap_.AcquireFromBack();
|
|
} else {
|
|
new_index = guest_trampoline_address_bitmap_.Acquire();
|
|
}
|
|
|
|
xenia_assert(new_index != (size_t)-1);
|
|
|
|
uint8_t* write_pos =
|
|
&guest_trampoline_memory_[sizeof(guest_trampoline_template) * new_index];
|
|
|
|
memcpy(write_pos, guest_trampoline_template,
|
|
sizeof(guest_trampoline_template));
|
|
|
|
*reinterpret_cast<void**>(&write_pos[guest_trampoline_template_offset_arg1]) =
|
|
userdata1;
|
|
*reinterpret_cast<void**>(&write_pos[guest_trampoline_template_offset_arg2]) =
|
|
userdata2;
|
|
*reinterpret_cast<GuestTrampolineProc*>(
|
|
&write_pos[guest_trampoline_template_offset_rcx]) = proc;
|
|
*reinterpret_cast<GuestToHostThunk*>(
|
|
&write_pos[guest_trampoline_template_offset_rax]) = guest_to_host_thunk_;
|
|
|
|
uint32_t indirection_guest_addr =
|
|
GUEST_TRAMPOLINE_BASE +
|
|
(static_cast<uint32_t>(new_index) * GUEST_TRAMPOLINE_MIN_LEN);
|
|
|
|
code_cache()->AddIndirection(
|
|
indirection_guest_addr,
|
|
static_cast<uint32_t>(reinterpret_cast<uintptr_t>(write_pos)));
|
|
|
|
return indirection_guest_addr;
|
|
}
|
|
|
|
void X64Backend::FreeGuestTrampoline(uint32_t trampoline_addr) {
|
|
xenia_assert(trampoline_addr >= GUEST_TRAMPOLINE_BASE &&
|
|
trampoline_addr < GUEST_TRAMPOLINE_END);
|
|
size_t index =
|
|
(trampoline_addr - GUEST_TRAMPOLINE_BASE) / GUEST_TRAMPOLINE_MIN_LEN;
|
|
guest_trampoline_address_bitmap_.Release(index);
|
|
}
|
|
} // namespace x64
|
|
} // namespace backend
|
|
} // namespace cpu
|
|
} // namespace xe
|