diff --git a/.gitmodules b/.gitmodules index c27065533..142b85d70 100644 --- a/.gitmodules +++ b/.gitmodules @@ -85,3 +85,6 @@ [submodule "third_party/VulkanMemoryAllocator"] path = third_party/VulkanMemoryAllocator url = https://github.com/GPUOpen-LibrariesAndSDKs/VulkanMemoryAllocator.git +[submodule "third_party/oaknut"] + path = third_party/oaknut + url = https://github.com/merryhime/oaknut.git diff --git a/premake5.lua b/premake5.lua index 6739d6dba..accb8f91e 100644 --- a/premake5.lua +++ b/premake5.lua @@ -54,7 +54,7 @@ filter("configurations:Checked") defines({ "DEBUG", }) -filter({"configurations:Checked", "platforms:Windows"}) +filter({"configurations:Checked", "platforms:Windows-*"}) buildoptions({ "/RTCsu", -- Full Run-Time Checks. }) @@ -153,7 +153,7 @@ filter("platforms:Android-*") "log", }) -filter("platforms:Windows") +filter("platforms:Windows-*") system("windows") toolset("msc") buildoptions({ @@ -179,8 +179,12 @@ filter("platforms:Windows") "_CRT_SECURE_NO_WARNINGS", "WIN32", "_WIN64=1", - "_AMD64=1", }) + filter("architecture:x86_64") + defines({ + "_AMD64=1", + }) + filter({}) linkoptions({ "/ignore:4006", -- Ignores complaints about empty obj files. "/ignore:4221", @@ -198,7 +202,7 @@ filter("platforms:Windows") }) -- Embed the manifest for things like dependencies and DPI awareness. -filter({"platforms:Windows", "kind:ConsoleApp or WindowedApp"}) +filter({"platforms:Windows-*", "kind:ConsoleApp or WindowedApp"}) files({ "src/xenia/base/app_win32.manifest" }) @@ -228,7 +232,12 @@ workspace("xenia") ["ARCHS"] = "x86_64" }) elseif os.istarget("windows") then - platforms({"Windows"}) + platforms({"Windows-ARM64", "Windows-x86_64"}) + filter("platforms:Windows-ARM64") + architecture("ARM64") + filter("platforms:Windows-x86_64") + architecture("x86_64") + filter({}) -- 10.0.15063.0: ID3D12GraphicsCommandList1::SetSamplePositions. -- 10.0.19041.0: D3D12_HEAP_FLAG_CREATE_NOT_ZEROED. -- 10.0.22000.0: DWMWA_WINDOW_CORNER_PREFERENCE. @@ -284,7 +293,13 @@ workspace("xenia") include("src/xenia/apu/nop") include("src/xenia/base") include("src/xenia/cpu") - include("src/xenia/cpu/backend/x64") + + filter("architecture:x86_64") + include("src/xenia/cpu/backend/x64") + filter("architecture:ARM64") + include("src/xenia/cpu/backend/a64") + filter({}) + include("src/xenia/debug/ui") include("src/xenia/gpu") include("src/xenia/gpu/null") diff --git a/src/xenia/app/premake5.lua b/src/xenia/app/premake5.lua index 86fcef758..eb9ded7da 100644 --- a/src/xenia/app/premake5.lua +++ b/src/xenia/app/premake5.lua @@ -32,6 +32,7 @@ project("xenia-app") "libavcodec", "libavutil", "mspack", + "SDL2", "snappy", "xxhash", }) @@ -72,13 +73,18 @@ project("xenia-app") "xenia-cpu-backend-x64", }) + filter("architecture:ARM64") + links({ + "xenia-cpu-backend-a64", + }) + -- TODO(Triang3l): The emulator itself on Android. filter("platforms:not Android-*") files({ "xenia_main.cc", }) - filter("platforms:Windows") + filter("platforms:Windows-*") files({ "main_resources.rc", }) @@ -104,7 +110,7 @@ project("xenia-app") "SDL2", }) - filter("platforms:Windows") + filter("platforms:Windows-*") links({ "xenia-apu-xaudio2", "xenia-gpu-d3d12", @@ -113,13 +119,13 @@ project("xenia-app") "xenia-ui-d3d12", }) - filter({"platforms:Windows", SINGLE_LIBRARY_FILTER}) + filter({"platforms:Windows-*", SINGLE_LIBRARY_FILTER}) links({ "xenia-gpu-d3d12-trace-viewer", "xenia-ui-window-d3d12-demo", }) - filter("platforms:Windows") + filter("platforms:Windows-*") -- Only create the .user file if it doesn't already exist. 
local user_file = project_root.."/build/xenia-app.vcxproj.user" if not os.isfile(user_file) then diff --git a/src/xenia/base/clock.cc b/src/xenia/base/clock.cc index 058eae43a..dd9972ad6 100644 --- a/src/xenia/base/clock.cc +++ b/src/xenia/base/clock.cc @@ -21,8 +21,9 @@ DEFINE_bool(clock_no_scaling, false, "Guest system time is directly pulled from host.", "CPU"); DEFINE_bool(clock_source_raw, false, - "Use the RDTSC instruction as the time source. " - "Host CPU must support invariant TSC.", + "On x64, Use the RDTSC instruction as the time source. Requires " + "invariant TSC. " + "On a64, Use the CNTVCT_EL0 register as the time source", "CPU"); namespace xe { diff --git a/src/xenia/base/clock.h b/src/xenia/base/clock.h index 67a3ebb67..1b57d8b52 100644 --- a/src/xenia/base/clock.h +++ b/src/xenia/base/clock.h @@ -18,6 +18,8 @@ #if XE_ARCH_AMD64 #define XE_CLOCK_RAW_AVAILABLE 1 +#elif XE_ARCH_ARM64 +#define XE_CLOCK_RAW_AVAILABLE 1 #endif DECLARE_bool(clock_no_scaling); diff --git a/src/xenia/base/clock_a64.cc b/src/xenia/base/clock_a64.cc new file mode 100644 index 000000000..6ca3569fe --- /dev/null +++ b/src/xenia/base/clock_a64.cc @@ -0,0 +1,50 @@ +/** + ****************************************************************************** + * Xenia : Xbox 360 Emulator Research Project * + ****************************************************************************** + * Copyright 2024 Ben Vanik. All rights reserved. * + * Released under the BSD license - see LICENSE in the root for more details. * + ****************************************************************************** + */ + +#include "xenia/base/clock.h" +#include "xenia/base/platform.h" + +#if XE_ARCH_ARM64 && XE_CLOCK_RAW_AVAILABLE + +#include "xenia/base/logging.h" + +#ifdef _MSC_VER +#include +#include +#else +#include +#endif + +// Wrap all these different cpu compiler intrinsics. +#if XE_COMPILER_MSVC +constexpr int32_t CNTFRQ_EL0 = ARM64_SYSREG(3, 3, 14, 0, 0); +constexpr int32_t CNTVCT_EL0 = ARM64_SYSREG(3, 3, 14, 0, 2); +#define xe_cpu_mrs(reg) _ReadStatusReg(reg) +#elif XE_COMPILER_CLANG || XE_COMPILER_GNUC +constexpr int32_t CNTFRQ_EL0 = 0b11'011'1110'0000'000; +constexpr int32_t CNTVCT_EL0 = 0b11'011'1110'0000'010; + +uint64_t xe_cpu_mrs(uint32_t reg) { + uint64_t result; + __asm__ volatile("mrs \t%0," #reg : "=r"(result)); + return result; +} +#else +#error \ + "No cpu instruction wrappers xe_cpu_mrs(CNTVCT_EL0); for current compiler implemented." 
+#endif + +namespace xe { + +uint64_t Clock::host_tick_frequency_raw() { return xe_cpu_mrs(CNTFRQ_EL0); } +uint64_t Clock::host_tick_count_raw() { return xe_cpu_mrs(CNTVCT_EL0); } + +} // namespace xe + +#endif diff --git a/src/xenia/base/exception_handler_win.cc b/src/xenia/base/exception_handler_win.cc index 786a129a5..49e49643f 100644 --- a/src/xenia/base/exception_handler_win.cc +++ b/src/xenia/base/exception_handler_win.cc @@ -36,12 +36,22 @@ LONG CALLBACK ExceptionHandlerCallback(PEXCEPTION_POINTERS ex_info) { } HostThreadContext thread_context; + +#if XE_ARCH_AMD64 thread_context.rip = ex_info->ContextRecord->Rip; thread_context.eflags = ex_info->ContextRecord->EFlags; std::memcpy(thread_context.int_registers, &ex_info->ContextRecord->Rax, sizeof(thread_context.int_registers)); std::memcpy(thread_context.xmm_registers, &ex_info->ContextRecord->Xmm0, sizeof(thread_context.xmm_registers)); +#elif XE_ARCH_ARM64 + thread_context.pc = ex_info->ContextRecord->Pc; + thread_context.cpsr = ex_info->ContextRecord->Cpsr; + std::memcpy(thread_context.x, &ex_info->ContextRecord->X, + sizeof(thread_context.x)); + std::memcpy(thread_context.v, &ex_info->ContextRecord->V, + sizeof(thread_context.v)); +#endif // https://msdn.microsoft.com/en-us/library/ms679331(v=vs.85).aspx // https://msdn.microsoft.com/en-us/library/aa363082(v=vs.85).aspx @@ -78,6 +88,7 @@ LONG CALLBACK ExceptionHandlerCallback(PEXCEPTION_POINTERS ex_info) { for (size_t i = 0; i < xe::countof(handlers_) && handlers_[i].first; ++i) { if (handlers_[i].first(&ex, handlers_[i].second)) { // Exception handled. +#if XE_ARCH_AMD64 ex_info->ContextRecord->Rip = thread_context.rip; ex_info->ContextRecord->EFlags = thread_context.eflags; uint32_t modified_register_index; @@ -98,6 +109,28 @@ LONG CALLBACK ExceptionHandlerCallback(PEXCEPTION_POINTERS ex_info) { &thread_context.xmm_registers[modified_register_index], sizeof(vec128_t)); } +#elif XE_ARCH_ARM64 + ex_info->ContextRecord->Pc = thread_context.pc; + ex_info->ContextRecord->Cpsr = thread_context.cpsr; + uint32_t modified_register_index; + uint16_t modified_int_registers_remaining = ex.modified_x_registers(); + while (xe::bit_scan_forward(modified_int_registers_remaining, + &modified_register_index)) { + modified_int_registers_remaining &= + ~(UINT16_C(1) << modified_register_index); + ex_info->ContextRecord->X[modified_register_index] = + thread_context.x[modified_register_index]; + } + uint16_t modified_xmm_registers_remaining = ex.modified_v_registers(); + while (xe::bit_scan_forward(modified_xmm_registers_remaining, + &modified_register_index)) { + modified_xmm_registers_remaining &= + ~(UINT16_C(1) << modified_register_index); + std::memcpy(&ex_info->ContextRecord->V + modified_register_index, + &thread_context.v[modified_register_index], + sizeof(vec128_t)); + } +#endif return EXCEPTION_CONTINUE_EXECUTION; } } diff --git a/src/xenia/base/host_thread_context.cc b/src/xenia/base/host_thread_context.cc index bf668bdd3..24b2b6e12 100644 --- a/src/xenia/base/host_thread_context.cc +++ b/src/xenia/base/host_thread_context.cc @@ -67,7 +67,7 @@ std::string HostThreadContext::GetStringFromValue(HostRegister reg, case Arm64Register::kPc: return hex ? string_util::to_hex_string(pc) : std::to_string(pc); case Arm64Register::kPstate: - return hex ? string_util::to_hex_string(pstate) : std::to_string(pstate); + return hex ? string_util::to_hex_string(cpsr) : std::to_string(cpsr); case Arm64Register::kFpsr: return hex ? 
string_util::to_hex_string(fpsr) : std::to_string(fpsr); case Arm64Register::kFpcr: diff --git a/src/xenia/base/host_thread_context.h b/src/xenia/base/host_thread_context.h index 554d09f44..6379f62f8 100644 --- a/src/xenia/base/host_thread_context.h +++ b/src/xenia/base/host_thread_context.h @@ -202,7 +202,7 @@ class HostThreadContext { uint64_t x[31]; uint64_t sp; uint64_t pc; - uint64_t pstate; + uint32_t cpsr; uint32_t fpsr; uint32_t fpcr; vec128_t v[32]; diff --git a/src/xenia/base/main_init_win.cc b/src/xenia/base/main_init_win.cc index 6b0a9059a..e67e50b66 100644 --- a/src/xenia/base/main_init_win.cc +++ b/src/xenia/base/main_init_win.cc @@ -11,6 +11,8 @@ #include +#if XE_ARCH_AMD64 + // Includes Windows headers, so it goes after platform_win.h. #include "third_party/xbyak/xbyak/xbyak_util.h" @@ -39,3 +41,5 @@ class StartupAvxCheck { #pragma warning(suppress : 4073) #pragma init_seg(lib) static StartupAvxCheck gStartupAvxCheck; + +#endif \ No newline at end of file diff --git a/src/xenia/base/math.h b/src/xenia/base/math.h index 55dce4b45..14dd8d6b1 100644 --- a/src/xenia/base/math.h +++ b/src/xenia/base/math.h @@ -31,6 +31,8 @@ #if XE_ARCH_AMD64 #include +#elif XE_ARCH_ARM64 +#include #endif namespace xe { @@ -135,10 +137,17 @@ constexpr inline uint32_t bit_count(T v) { } #else #if XE_COMPILER_MSVC || XE_COMPILER_INTEL +#if XE_ARCH_AMD64 inline uint32_t bit_count(uint32_t v) { return __popcnt(v); } inline uint32_t bit_count(uint64_t v) { return static_cast(__popcnt64(v)); } +#elif XE_ARCH_ARM64 +inline uint32_t bit_count(uint32_t v) { return _CountOneBits(v); } +inline uint32_t bit_count(uint64_t v) { + return static_cast(_CountOneBits64(v)); +} +#endif #elif XE_COMPILER_GCC || XE_COMPILER_CLANG static_assert(sizeof(unsigned int) == sizeof(uint32_t)); static_assert(sizeof(unsigned long long) == sizeof(uint64_t)); @@ -372,6 +381,24 @@ template int64_t m128_i64(const __m128& v) { return m128_i64(_mm_castps_pd(v)); } +#elif XE_ARCH_ARM64 +// Utilities for NEON values. +template +float m128_f32(const float32x4_t& v) { + return vgetq_lane_f32(v, N); +} +template +int32_t m128_i32(const int32x4_t& v) { + return vgetq_lane_s32(v, N); +} +template +double m128_f64(const float64x2_t& v) { + return vgetq_lane_f64(v, N); +} +template +int64_t m128_i64(const int64x2_t& v) { + return vgetq_lane_s64(v, N); +} #endif // Similar to the C++ implementation of XMConvertFloatToHalf and diff --git a/src/xenia/base/platform.h b/src/xenia/base/platform.h index 439d0c467..c852ad649 100644 --- a/src/xenia/base/platform.h +++ b/src/xenia/base/platform.h @@ -66,6 +66,14 @@ #define XE_ARCH_PPC 1 #endif +#ifdef XE_ARCH_AMD64 +#define XE_HOST_ARCH_NAME "x64" +#elif XE_ARCH_ARM64 +#define XE_HOST_ARCH_NAME "a64" +#elif XE_ARCH_PPC +#define XE_HOST_ARCH_NAME "ppc" +#endif + #if XE_PLATFORM_WIN32 #define WIN32_LEAN_AND_MEAN #define NOMINMAX // Don't want windows.h including min/max macros. diff --git a/src/xenia/cpu/backend/a64/a64_assembler.cc b/src/xenia/cpu/backend/a64/a64_assembler.cc new file mode 100644 index 000000000..280b82468 --- /dev/null +++ b/src/xenia/cpu/backend/a64/a64_assembler.cc @@ -0,0 +1,146 @@ +/** + ****************************************************************************** + * Xenia : Xbox 360 Emulator Research Project * + ****************************************************************************** + * Copyright 2024 Ben Vanik. All rights reserved. * + * Released under the BSD license - see LICENSE in the root for more details. 
* + ****************************************************************************** + */ + +#include "xenia/cpu/backend/a64/a64_assembler.h" + +#include + +#include "third_party/capstone/include/capstone/arm64.h" +#include "third_party/capstone/include/capstone/capstone.h" +#include "xenia/base/profiling.h" +#include "xenia/base/reset_scope.h" +#include "xenia/base/string.h" +#include "xenia/cpu/backend/a64/a64_backend.h" +#include "xenia/cpu/backend/a64/a64_code_cache.h" +#include "xenia/cpu/backend/a64/a64_emitter.h" +#include "xenia/cpu/backend/a64/a64_function.h" +#include "xenia/cpu/cpu_flags.h" +#include "xenia/cpu/hir/hir_builder.h" +#include "xenia/cpu/hir/label.h" +#include "xenia/cpu/processor.h" + +namespace xe { +namespace cpu { +namespace backend { +namespace a64 { + +using xe::cpu::hir::HIRBuilder; + +A64Assembler::A64Assembler(A64Backend* backend) + : Assembler(backend), a64_backend_(backend), capstone_handle_(0) { + if (cs_open(CS_ARCH_ARM64, CS_MODE_LITTLE_ENDIAN, &capstone_handle_) != + CS_ERR_OK) { + assert_always("Failed to initialize capstone"); + } + cs_option(capstone_handle_, CS_OPT_SYNTAX, CS_OPT_SYNTAX_INTEL); + cs_option(capstone_handle_, CS_OPT_DETAIL, CS_OPT_OFF); +} + +A64Assembler::~A64Assembler() { + // Emitter must be freed before the allocator. + emitter_.reset(); + + if (capstone_handle_) { + cs_close(&capstone_handle_); + } +} + +bool A64Assembler::Initialize() { + if (!Assembler::Initialize()) { + return false; + } + + emitter_.reset(new A64Emitter(a64_backend_)); + + return true; +} + +void A64Assembler::Reset() { + string_buffer_.Reset(); + Assembler::Reset(); +} + +bool A64Assembler::Assemble(GuestFunction* function, HIRBuilder* builder, + uint32_t debug_info_flags, + std::unique_ptr debug_info) { + SCOPE_profile_cpu_f("cpu"); + + // Reset when we leave. + xe::make_reset_scope(this); + + // Lower HIR -> a64. + void* machine_code = nullptr; + size_t code_size = 0; + if (!emitter_->Emit(function, builder, debug_info_flags, debug_info.get(), + &machine_code, &code_size, &function->source_map())) { + return false; + } + + // Stash generated machine code. + if (debug_info_flags & DebugInfoFlags::kDebugInfoDisasmMachineCode) { + DumpMachineCode(machine_code, code_size, function->source_map(), + &string_buffer_); + debug_info->set_machine_code_disasm(xe_strdup(string_buffer_.buffer())); + string_buffer_.Reset(); + } + + function->set_debug_info(std::move(debug_info)); + static_cast(function)->Setup( + reinterpret_cast(machine_code), code_size); + + // Install into indirection table. + const uint64_t host_address = reinterpret_cast(machine_code); + assert_true((host_address >> 32) == 0); + reinterpret_cast(backend_->code_cache()) + ->AddIndirection(function->address(), + static_cast(host_address)); + + return true; +} + +void A64Assembler::DumpMachineCode( + void* machine_code, size_t code_size, + const std::vector& source_map, StringBuffer* str) { + if (source_map.empty()) { + return; + } + auto source_map_index = 0; + uint32_t next_code_offset = source_map[0].code_offset; + + const uint8_t* code_ptr = reinterpret_cast(machine_code); + size_t remaining_code_size = code_size; + uint64_t address = uint64_t(machine_code); + cs_insn insn = {0}; + while (remaining_code_size && + cs_disasm_iter(capstone_handle_, &code_ptr, &remaining_code_size, + &address, &insn)) { + // Look up source offset. 
+ auto code_offset = + uint32_t(code_ptr - reinterpret_cast(machine_code)); + if (code_offset >= next_code_offset && + source_map_index < source_map.size()) { + auto& source_map_entry = source_map[source_map_index]; + str->AppendFormat("{:08X} ", source_map_entry.guest_address); + ++source_map_index; + next_code_offset = source_map_index < source_map.size() + ? source_map[source_map_index].code_offset + : UINT_MAX; + } else { + str->Append(" "); + } + + str->AppendFormat("{:08X} {:<6} {}\n", uint32_t(insn.address), + insn.mnemonic, insn.op_str); + } +} + +} // namespace a64 +} // namespace backend +} // namespace cpu +} // namespace xe diff --git a/src/xenia/cpu/backend/a64/a64_assembler.h b/src/xenia/cpu/backend/a64/a64_assembler.h new file mode 100644 index 000000000..95e0a6f1e --- /dev/null +++ b/src/xenia/cpu/backend/a64/a64_assembler.h @@ -0,0 +1,59 @@ +/** + ****************************************************************************** + * Xenia : Xbox 360 Emulator Research Project * + ****************************************************************************** + * Copyright 2024 Ben Vanik. All rights reserved. * + * Released under the BSD license - see LICENSE in the root for more details. * + ****************************************************************************** + */ + +#ifndef XENIA_CPU_BACKEND_A64_A64_ASSEMBLER_H_ +#define XENIA_CPU_BACKEND_A64_A64_ASSEMBLER_H_ + +#include +#include + +#include "xenia/base/string_buffer.h" +#include "xenia/cpu/backend/assembler.h" +#include "xenia/cpu/function.h" + +namespace xe { +namespace cpu { +namespace backend { +namespace a64 { + +class A64Backend; +class A64Emitter; + +class A64Assembler : public Assembler { + public: + explicit A64Assembler(A64Backend* backend); + ~A64Assembler() override; + + bool Initialize() override; + + void Reset() override; + + bool Assemble(GuestFunction* function, hir::HIRBuilder* builder, + uint32_t debug_info_flags, + std::unique_ptr debug_info) override; + + private: + void DumpMachineCode(void* machine_code, size_t code_size, + const std::vector& source_map, + StringBuffer* str); + + private: + A64Backend* a64_backend_; + std::unique_ptr emitter_; + uintptr_t capstone_handle_; + + StringBuffer string_buffer_; +}; + +} // namespace a64 +} // namespace backend +} // namespace cpu +} // namespace xe + +#endif // XENIA_CPU_BACKEND_A64_A64_ASSEMBLER_H_ diff --git a/src/xenia/cpu/backend/a64/a64_backend.cc b/src/xenia/cpu/backend/a64/a64_backend.cc new file mode 100644 index 000000000..8b3f3a6f7 --- /dev/null +++ b/src/xenia/cpu/backend/a64/a64_backend.cc @@ -0,0 +1,735 @@ +/** + ****************************************************************************** + * Xenia : Xbox 360 Emulator Research Project * + ****************************************************************************** + * Copyright 2024 Ben Vanik. All rights reserved. * + * Released under the BSD license - see LICENSE in the root for more details. 
* + ****************************************************************************** + */ + +#include "xenia/cpu/backend/a64/a64_backend.h" + +#include + +#include "third_party/capstone/include/capstone/arm64.h" +#include "third_party/capstone/include/capstone/capstone.h" + +#include "xenia/base/exception_handler.h" +#include "xenia/base/logging.h" +#include "xenia/cpu/backend/a64/a64_assembler.h" +#include "xenia/cpu/backend/a64/a64_code_cache.h" +#include "xenia/cpu/backend/a64/a64_emitter.h" +#include "xenia/cpu/backend/a64/a64_function.h" +#include "xenia/cpu/backend/a64/a64_sequences.h" +#include "xenia/cpu/backend/a64/a64_stack_layout.h" +#include "xenia/cpu/breakpoint.h" +#include "xenia/cpu/processor.h" +#include "xenia/cpu/stack_walker.h" + +DEFINE_int32(a64_extension_mask, -1, + "Allow the detection and utilization of specific instruction set " + "features.\n" + " 0 = armv8.0\n" + " 1 = LSE\n" + " 2 = F16C\n" + " -1 = Detect and utilize all possible processor features\n", + "a64"); + +namespace xe { +namespace cpu { +namespace backend { +namespace a64 { + +using namespace oaknut::util; + +class A64ThunkEmitter : public A64Emitter { + public: + A64ThunkEmitter(A64Backend* backend); + ~A64ThunkEmitter() override; + HostToGuestThunk EmitHostToGuestThunk(); + GuestToHostThunk EmitGuestToHostThunk(); + ResolveFunctionThunk EmitResolveFunctionThunk(); + + private: + // The following four functions provide save/load functionality for registers. + // They assume at least StackLayout::THUNK_STACK_SIZE bytes have been + // allocated on the stack. + + // Caller saved: + // Dont assume these registers will survive a subroutine call + // x0, v0 is not saved for use as arg0/return + // x1-x15, x30 | v0-v7 and v16-v31 + void EmitSaveVolatileRegs(); + void EmitLoadVolatileRegs(); + + // Callee saved: + // Subroutines must preserve these registers if they intend to use them + // x19-x30 | d8-d15 + void EmitSaveNonvolatileRegs(); + void EmitLoadNonvolatileRegs(); +}; + +A64Backend::A64Backend() : Backend(), code_cache_(nullptr) { + if (cs_open(CS_ARCH_ARM64, CS_MODE_LITTLE_ENDIAN, &capstone_handle_) != + CS_ERR_OK) { + assert_always("Failed to initialize capstone"); + } + cs_option(capstone_handle_, CS_OPT_SYNTAX, CS_OPT_SYNTAX_INTEL); + cs_option(capstone_handle_, CS_OPT_DETAIL, CS_OPT_ON); + cs_option(capstone_handle_, CS_OPT_SKIPDATA, CS_OPT_OFF); +} + +A64Backend::~A64Backend() { + if (capstone_handle_) { + cs_close(&capstone_handle_); + } + + A64Emitter::FreeConstData(emitter_data_); + ExceptionHandler::Uninstall(&ExceptionCallbackThunk, this); +} + +bool A64Backend::Initialize(Processor* processor) { + if (!Backend::Initialize(processor)) { + return false; + } + + auto& gprs = machine_info_.register_sets[0]; + gprs.id = 0; + std::strcpy(gprs.name, "x"); + gprs.types = MachineInfo::RegisterSet::INT_TYPES; + gprs.count = A64Emitter::GPR_COUNT; + + auto& fprs = machine_info_.register_sets[1]; + fprs.id = 1; + std::strcpy(fprs.name, "v"); + fprs.types = MachineInfo::RegisterSet::FLOAT_TYPES | + MachineInfo::RegisterSet::VEC_TYPES; + fprs.count = A64Emitter::FPR_COUNT; + + code_cache_ = A64CodeCache::Create(); + Backend::code_cache_ = code_cache_.get(); + if (!code_cache_->Initialize()) { + return false; + } + + // Generate thunks used to transition between jitted code and host code. 
+ A64ThunkEmitter thunk_emitter(this); + host_to_guest_thunk_ = thunk_emitter.EmitHostToGuestThunk(); + guest_to_host_thunk_ = thunk_emitter.EmitGuestToHostThunk(); + resolve_function_thunk_ = thunk_emitter.EmitResolveFunctionThunk(); + + // Set the code cache to use the ResolveFunction thunk for default + // indirections. + assert_zero(uint64_t(resolve_function_thunk_) & 0xFFFFFFFF00000000ull); + code_cache_->set_indirection_default( + uint32_t(uint64_t(resolve_function_thunk_))); + + // Allocate some special indirections. + code_cache_->CommitExecutableRange(0x9FFF0000, 0x9FFFFFFF); + + // Allocate emitter constant data. + emitter_data_ = A64Emitter::PlaceConstData(); + + // Setup exception callback + ExceptionHandler::Install(&ExceptionCallbackThunk, this); + + return true; +} + +void A64Backend::CommitExecutableRange(uint32_t guest_low, + uint32_t guest_high) { + code_cache_->CommitExecutableRange(guest_low, guest_high); +} + +std::unique_ptr A64Backend::CreateAssembler() { + return std::make_unique(this); +} + +std::unique_ptr A64Backend::CreateGuestFunction( + Module* module, uint32_t address) { + return std::make_unique(module, address); +} + +uint64_t ReadCapstoneReg(HostThreadContext* context, arm64_reg reg) { + switch (reg) { + case ARM64_REG_X0: + return context->x[0]; + case ARM64_REG_X1: + return context->x[1]; + case ARM64_REG_X2: + return context->x[2]; + case ARM64_REG_X3: + return context->x[3]; + case ARM64_REG_X4: + return context->x[4]; + case ARM64_REG_X5: + return context->x[5]; + case ARM64_REG_X6: + return context->x[6]; + case ARM64_REG_X7: + return context->x[7]; + case ARM64_REG_X8: + return context->x[8]; + case ARM64_REG_X9: + return context->x[9]; + case ARM64_REG_X10: + return context->x[10]; + case ARM64_REG_X11: + return context->x[11]; + case ARM64_REG_X12: + return context->x[12]; + case ARM64_REG_X13: + return context->x[13]; + case ARM64_REG_X14: + return context->x[14]; + case ARM64_REG_X15: + return context->x[15]; + case ARM64_REG_X16: + return context->x[16]; + case ARM64_REG_X17: + return context->x[17]; + case ARM64_REG_X18: + return context->x[18]; + case ARM64_REG_X19: + return context->x[19]; + case ARM64_REG_X20: + return context->x[20]; + case ARM64_REG_X21: + return context->x[21]; + case ARM64_REG_X22: + return context->x[22]; + case ARM64_REG_X23: + return context->x[23]; + case ARM64_REG_X24: + return context->x[24]; + case ARM64_REG_X25: + return context->x[25]; + case ARM64_REG_X26: + return context->x[26]; + case ARM64_REG_X27: + return context->x[27]; + case ARM64_REG_X28: + return context->x[28]; + case ARM64_REG_X29: + return context->x[29]; + case ARM64_REG_X30: + return context->x[30]; + case ARM64_REG_W0: + return uint32_t(context->x[0]); + case ARM64_REG_W1: + return uint32_t(context->x[1]); + case ARM64_REG_W2: + return uint32_t(context->x[2]); + case ARM64_REG_W3: + return uint32_t(context->x[3]); + case ARM64_REG_W4: + return uint32_t(context->x[4]); + case ARM64_REG_W5: + return uint32_t(context->x[5]); + case ARM64_REG_W6: + return uint32_t(context->x[6]); + case ARM64_REG_W7: + return uint32_t(context->x[7]); + case ARM64_REG_W8: + return uint32_t(context->x[8]); + case ARM64_REG_W9: + return uint32_t(context->x[9]); + case ARM64_REG_W10: + return uint32_t(context->x[10]); + case ARM64_REG_W11: + return uint32_t(context->x[11]); + case ARM64_REG_W12: + return uint32_t(context->x[12]); + case ARM64_REG_W13: + return uint32_t(context->x[13]); + case ARM64_REG_W14: + return uint32_t(context->x[14]); + case ARM64_REG_W15: + 
return uint32_t(context->x[15]); + case ARM64_REG_W16: + return uint32_t(context->x[16]); + case ARM64_REG_W17: + return uint32_t(context->x[17]); + case ARM64_REG_W18: + return uint32_t(context->x[18]); + case ARM64_REG_W19: + return uint32_t(context->x[19]); + case ARM64_REG_W20: + return uint32_t(context->x[20]); + case ARM64_REG_W21: + return uint32_t(context->x[21]); + case ARM64_REG_W22: + return uint32_t(context->x[22]); + case ARM64_REG_W23: + return uint32_t(context->x[23]); + case ARM64_REG_W24: + return uint32_t(context->x[24]); + case ARM64_REG_W25: + return uint32_t(context->x[25]); + case ARM64_REG_W26: + return uint32_t(context->x[26]); + case ARM64_REG_W27: + return uint32_t(context->x[27]); + case ARM64_REG_W28: + return uint32_t(context->x[28]); + case ARM64_REG_W29: + return uint32_t(context->x[29]); + case ARM64_REG_W30: + return uint32_t(context->x[30]); + default: + assert_unhandled_case(reg); + return 0; + } +} + +bool TestCapstonePstate(arm64_cc cond, uint32_t pstate) { + // https://devblogs.microsoft.com/oldnewthing/20220815-00/?p=106975 + // Upper 4 bits of pstate are NZCV + const bool N = !!(pstate & 0x80000000); + const bool Z = !!(pstate & 0x40000000); + const bool C = !!(pstate & 0x20000000); + const bool V = !!(pstate & 0x10000000); + switch (cond) { + case ARM64_CC_EQ: + return (Z == true); + case ARM64_CC_NE: + return (Z == false); + case ARM64_CC_HS: + return (C == true); + case ARM64_CC_LO: + return (C == false); + case ARM64_CC_MI: + return (N == true); + case ARM64_CC_PL: + return (N == false); + case ARM64_CC_VS: + return (V == true); + case ARM64_CC_VC: + return (V == false); + case ARM64_CC_HI: + return ((C == true) && (Z == false)); + case ARM64_CC_LS: + return ((C == false) || (Z == true)); + case ARM64_CC_GE: + return (N == V); + case ARM64_CC_LT: + return (N != V); + case ARM64_CC_GT: + return ((Z == false) && (N == V)); + case ARM64_CC_LE: + return ((Z == true) || (N != V)); + case ARM64_CC_AL: + return true; + case ARM64_CC_NV: + return false; + default: + assert_unhandled_case(cond); + return false; + } +} + +uint64_t A64Backend::CalculateNextHostInstruction(ThreadDebugInfo* thread_info, + uint64_t current_pc) { + auto machine_code_ptr = reinterpret_cast(current_pc); + size_t remaining_machine_code_size = 64; + uint64_t host_address = current_pc; + cs_insn insn = {0}; + cs_detail all_detail = {0}; + insn.detail = &all_detail; + cs_disasm_iter(capstone_handle_, &machine_code_ptr, + &remaining_machine_code_size, &host_address, &insn); + const auto& detail = all_detail.arm64; + switch (insn.id) { + case ARM64_INS_B: + case ARM64_INS_BL: { + assert_true(detail.operands[0].type == ARM64_OP_IMM); + const int64_t pc_offset = static_cast(detail.operands[0].imm); + const bool test_passed = + TestCapstonePstate(detail.cc, thread_info->host_context.cpsr); + if (test_passed) { + return current_pc + pc_offset; + } else { + return current_pc + insn.size; + } + } break; + case ARM64_INS_BR: + case ARM64_INS_BLR: { + assert_true(detail.operands[0].type == ARM64_OP_REG); + const uint64_t target_pc = + ReadCapstoneReg(&thread_info->host_context, detail.operands[0].reg); + return target_pc; + } break; + case ARM64_INS_RET: { + assert_true(detail.operands[0].type == ARM64_OP_REG); + const uint64_t target_pc = + ReadCapstoneReg(&thread_info->host_context, detail.operands[0].reg); + return target_pc; + } break; + case ARM64_INS_CBNZ: { + assert_true(detail.operands[0].type == ARM64_OP_REG); + assert_true(detail.operands[1].type == ARM64_OP_IMM); + const int64_t 
pc_offset = static_cast(detail.operands[1].imm); + const bool test_passed = (0 != ReadCapstoneReg(&thread_info->host_context, + detail.operands[0].reg)); + if (test_passed) { + return current_pc + pc_offset; + } else { + return current_pc + insn.size; + } + } break; + case ARM64_INS_CBZ: { + assert_true(detail.operands[0].type == ARM64_OP_REG); + assert_true(detail.operands[1].type == ARM64_OP_IMM); + const int64_t pc_offset = static_cast(detail.operands[1].imm); + const bool test_passed = (0 == ReadCapstoneReg(&thread_info->host_context, + detail.operands[0].reg)); + if (test_passed) { + return current_pc + pc_offset; + } else { + return current_pc + insn.size; + } + } break; + default: { + // Not a branching instruction - just move over it. + return current_pc + insn.size; + } break; + } +} + +void A64Backend::InstallBreakpoint(Breakpoint* breakpoint) { + breakpoint->ForEachHostAddress([breakpoint](uint64_t host_address) { + auto ptr = reinterpret_cast(host_address); + auto original_bytes = xe::load_and_swap(ptr); + assert_true(original_bytes != 0x0000'dead); + xe::store_and_swap(ptr, 0x0000'dead); + breakpoint->backend_data().emplace_back(host_address, original_bytes); + }); +} + +void A64Backend::InstallBreakpoint(Breakpoint* breakpoint, Function* fn) { + assert_true(breakpoint->address_type() == Breakpoint::AddressType::kGuest); + assert_true(fn->is_guest()); + auto guest_function = reinterpret_cast(fn); + auto host_address = + guest_function->MapGuestAddressToMachineCode(breakpoint->guest_address()); + if (!host_address) { + assert_always(); + return; + } + + // Assume we haven't already installed a breakpoint in this spot. + auto ptr = reinterpret_cast(host_address); + auto original_bytes = xe::load_and_swap(ptr); + assert_true(original_bytes != 0x0000'dead); + xe::store_and_swap(ptr, 0x0000'dead); + breakpoint->backend_data().emplace_back(host_address, original_bytes); +} + +void A64Backend::UninstallBreakpoint(Breakpoint* breakpoint) { + for (auto& pair : breakpoint->backend_data()) { + auto ptr = reinterpret_cast(pair.first); + auto instruction_bytes = xe::load_and_swap(ptr); + assert_true(instruction_bytes == 0x0000'dead); + xe::store_and_swap(ptr, static_cast(pair.second)); + } + breakpoint->backend_data().clear(); +} + +bool A64Backend::ExceptionCallbackThunk(Exception* ex, void* data) { + auto backend = reinterpret_cast(data); + return backend->ExceptionCallback(ex); +} + +bool A64Backend::ExceptionCallback(Exception* ex) { + if (ex->code() != Exception::Code::kIllegalInstruction) { + // We only care about illegal instructions. Other things will be handled by + // other handlers (probably). If nothing else picks it up we'll be called + // with OnUnhandledException to do real crash handling. + return false; + } + + // Verify an expected illegal instruction. + auto instruction_bytes = + xe::load_and_swap(reinterpret_cast(ex->pc())); + if (instruction_bytes != 0x0000'dead) { + // Not our `udf #0xdead` - not us. + return false; + } + + // Let the processor handle things. 
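+  // The 0x0000dead pattern is only ever written by InstallBreakpoint above,
+  // so this is one of our breakpoints; hand it to the debugger.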
+ return processor()->OnThreadBreakpointHit(ex); +} + +A64ThunkEmitter::A64ThunkEmitter(A64Backend* backend) : A64Emitter(backend) {} + +A64ThunkEmitter::~A64ThunkEmitter() {} + +HostToGuestThunk A64ThunkEmitter::EmitHostToGuestThunk() { + // X0 = target + // X1 = arg0 (context) + // X2 = arg1 (guest return address) + + struct _code_offsets { + size_t prolog; + size_t prolog_stack_alloc; + size_t body; + size_t epilog; + size_t tail; + } code_offsets = {}; + + const size_t stack_size = StackLayout::THUNK_STACK_SIZE; + + code_offsets.prolog = offset(); + + SUB(SP, SP, stack_size); + + code_offsets.prolog_stack_alloc = offset(); + code_offsets.body = offset(); + + EmitSaveNonvolatileRegs(); + + MOV(X16, X0); + MOV(GetContextReg(), X1); // context + MOV(X0, X2); // return address + BLR(X16); + + EmitLoadNonvolatileRegs(); + + code_offsets.epilog = offset(); + + ADD(SP, SP, stack_size); + + RET(); + + code_offsets.tail = offset(); + + assert_zero(code_offsets.prolog); + EmitFunctionInfo func_info = {}; + func_info.code_size.total = offset(); + func_info.code_size.prolog = code_offsets.body - code_offsets.prolog; + func_info.code_size.body = code_offsets.epilog - code_offsets.body; + func_info.code_size.epilog = code_offsets.tail - code_offsets.epilog; + func_info.code_size.tail = offset() - code_offsets.tail; + func_info.prolog_stack_alloc_offset = + code_offsets.prolog_stack_alloc - code_offsets.prolog; + func_info.stack_size = stack_size; + + void* fn = Emplace(func_info); + return (HostToGuestThunk)fn; +} + +GuestToHostThunk A64ThunkEmitter::EmitGuestToHostThunk() { + // X0 = target function + // X1 = arg0 + // X2 = arg1 + // X3 = arg2 + + struct _code_offsets { + size_t prolog; + size_t prolog_stack_alloc; + size_t body; + size_t epilog; + size_t tail; + } code_offsets = {}; + + const size_t stack_size = StackLayout::THUNK_STACK_SIZE; + + code_offsets.prolog = offset(); + + SUB(SP, SP, stack_size); + + code_offsets.prolog_stack_alloc = offset(); + code_offsets.body = offset(); + + EmitSaveVolatileRegs(); + + MOV(X16, X0); // function + MOV(X0, GetContextReg()); // context + BLR(X16); + + EmitLoadVolatileRegs(); + + code_offsets.epilog = offset(); + + ADD(SP, SP, stack_size); + RET(); + + code_offsets.tail = offset(); + + assert_zero(code_offsets.prolog); + EmitFunctionInfo func_info = {}; + func_info.code_size.total = offset(); + func_info.code_size.prolog = code_offsets.body - code_offsets.prolog; + func_info.code_size.body = code_offsets.epilog - code_offsets.body; + func_info.code_size.epilog = code_offsets.tail - code_offsets.epilog; + func_info.code_size.tail = offset() - code_offsets.tail; + func_info.prolog_stack_alloc_offset = + code_offsets.prolog_stack_alloc - code_offsets.prolog; + func_info.stack_size = stack_size; + + void* fn = Emplace(func_info); + return (GuestToHostThunk)fn; +} + +// A64Emitter handles actually resolving functions. 
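+// Takes the guest context and the target guest address, resolves (assembling
+// the function first if needed) and returns the host code address that the
+// thunk below branches to.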
+uint64_t ResolveFunction(void* raw_context, uint64_t target_address); + +ResolveFunctionThunk A64ThunkEmitter::EmitResolveFunctionThunk() { + // Entry: + // W17 = target PPC address + // X0 = context + + struct _code_offsets { + size_t prolog; + size_t prolog_stack_alloc; + size_t body; + size_t epilog; + size_t tail; + } code_offsets = {}; + + const size_t stack_size = StackLayout::THUNK_STACK_SIZE; + + code_offsets.prolog = offset(); + + // Preserve context register + STP(ZR, X0, SP, PRE_INDEXED, -16); + + SUB(SP, SP, stack_size); + + code_offsets.prolog_stack_alloc = offset(); + code_offsets.body = offset(); + + EmitSaveVolatileRegs(); + + // mov(rcx, rsi); // context + // mov(rdx, rbx); + // mov(rax, reinterpret_cast(&ResolveFunction)); + // call(rax) + MOV(X0, GetContextReg()); // context + MOV(W1, W17); + MOV(X16, reinterpret_cast(&ResolveFunction)); + BLR(X16); + MOV(X16, X0); + + EmitLoadVolatileRegs(); + + code_offsets.epilog = offset(); + + // add(rsp, stack_size); + // jmp(rax); + ADD(SP, SP, stack_size); + + // Reload context register + LDP(ZR, X0, SP, POST_INDEXED, 16); + BR(X16); + + code_offsets.tail = offset(); + + assert_zero(code_offsets.prolog); + EmitFunctionInfo func_info = {}; + func_info.code_size.total = offset(); + func_info.code_size.prolog = code_offsets.body - code_offsets.prolog; + func_info.code_size.body = code_offsets.epilog - code_offsets.body; + func_info.code_size.epilog = code_offsets.tail - code_offsets.epilog; + func_info.code_size.tail = offset() - code_offsets.tail; + func_info.prolog_stack_alloc_offset = + code_offsets.prolog_stack_alloc - code_offsets.prolog; + func_info.stack_size = stack_size; + + void* fn = Emplace(func_info); + return (ResolveFunctionThunk)fn; +} + +void A64ThunkEmitter::EmitSaveVolatileRegs() { + // Save off volatile registers. 
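+  // X0 and Q0 are deliberately not saved here; they carry arg0 and the return
+  // value across the host call (see the caller-saved note on the class above).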
+ // Preserve arguments passed to and returned from a subroutine + // STR(X0, SP, offsetof(StackLayout::Thunk, r[0])); + STP(X1, X2, SP, offsetof(StackLayout::Thunk, r[0])); + STP(X3, X4, SP, offsetof(StackLayout::Thunk, r[2])); + STP(X5, X6, SP, offsetof(StackLayout::Thunk, r[4])); + STP(X7, X8, SP, offsetof(StackLayout::Thunk, r[6])); + STP(X9, X10, SP, offsetof(StackLayout::Thunk, r[8])); + STP(X11, X12, SP, offsetof(StackLayout::Thunk, r[10])); + STP(X13, X14, SP, offsetof(StackLayout::Thunk, r[12])); + STP(X15, X30, SP, offsetof(StackLayout::Thunk, r[14])); + + // Preserve arguments passed to and returned from a subroutine + // STR(Q0, SP, offsetof(StackLayout::Thunk, xmm[0])); + STP(Q1, Q2, SP, offsetof(StackLayout::Thunk, xmm[0])); + STP(Q3, Q4, SP, offsetof(StackLayout::Thunk, xmm[2])); + STP(Q5, Q6, SP, offsetof(StackLayout::Thunk, xmm[4])); + STP(Q7, Q16, SP, offsetof(StackLayout::Thunk, xmm[6])); + STP(Q17, Q18, SP, offsetof(StackLayout::Thunk, xmm[8])); + STP(Q19, Q20, SP, offsetof(StackLayout::Thunk, xmm[10])); + STP(Q21, Q22, SP, offsetof(StackLayout::Thunk, xmm[12])); + STP(Q23, Q24, SP, offsetof(StackLayout::Thunk, xmm[14])); + STP(Q25, Q26, SP, offsetof(StackLayout::Thunk, xmm[16])); + STP(Q27, Q28, SP, offsetof(StackLayout::Thunk, xmm[18])); + STP(Q29, Q30, SP, offsetof(StackLayout::Thunk, xmm[20])); + STR(Q31, SP, offsetof(StackLayout::Thunk, xmm[21])); +} + +void A64ThunkEmitter::EmitLoadVolatileRegs() { + // Preserve arguments passed to and returned from a subroutine + // LDR(X0, SP, offsetof(StackLayout::Thunk, r[0])); + LDP(X1, X2, SP, offsetof(StackLayout::Thunk, r[0])); + LDP(X3, X4, SP, offsetof(StackLayout::Thunk, r[2])); + LDP(X5, X6, SP, offsetof(StackLayout::Thunk, r[4])); + LDP(X7, X8, SP, offsetof(StackLayout::Thunk, r[6])); + LDP(X9, X10, SP, offsetof(StackLayout::Thunk, r[8])); + LDP(X11, X12, SP, offsetof(StackLayout::Thunk, r[10])); + LDP(X13, X14, SP, offsetof(StackLayout::Thunk, r[12])); + LDP(X15, X30, SP, offsetof(StackLayout::Thunk, r[14])); + + // Preserve arguments passed to and returned from a subroutine + // LDR(Q0, SP, offsetof(StackLayout::Thunk, xmm[0])); + LDP(Q1, Q2, SP, offsetof(StackLayout::Thunk, xmm[0])); + LDP(Q3, Q4, SP, offsetof(StackLayout::Thunk, xmm[2])); + LDP(Q5, Q6, SP, offsetof(StackLayout::Thunk, xmm[4])); + LDP(Q7, Q16, SP, offsetof(StackLayout::Thunk, xmm[6])); + LDP(Q17, Q18, SP, offsetof(StackLayout::Thunk, xmm[8])); + LDP(Q19, Q20, SP, offsetof(StackLayout::Thunk, xmm[10])); + LDP(Q21, Q22, SP, offsetof(StackLayout::Thunk, xmm[12])); + LDP(Q23, Q24, SP, offsetof(StackLayout::Thunk, xmm[14])); + LDP(Q25, Q26, SP, offsetof(StackLayout::Thunk, xmm[16])); + LDP(Q27, Q28, SP, offsetof(StackLayout::Thunk, xmm[18])); + LDP(Q29, Q30, SP, offsetof(StackLayout::Thunk, xmm[20])); + LDR(Q31, SP, offsetof(StackLayout::Thunk, xmm[21])); +} + +void A64ThunkEmitter::EmitSaveNonvolatileRegs() { + STP(X19, X20, SP, offsetof(StackLayout::Thunk, r[0])); + STP(X21, X22, SP, offsetof(StackLayout::Thunk, r[2])); + STP(X23, X24, SP, offsetof(StackLayout::Thunk, r[4])); + STP(X25, X26, SP, offsetof(StackLayout::Thunk, r[6])); + STP(X27, X28, SP, offsetof(StackLayout::Thunk, r[8])); + STP(X29, X30, SP, offsetof(StackLayout::Thunk, r[10])); + + STR(X17, SP, offsetof(StackLayout::Thunk, r[12])); + + STP(D8, D9, SP, offsetof(StackLayout::Thunk, xmm[0])); + STP(D10, D11, SP, offsetof(StackLayout::Thunk, xmm[1])); + STP(D12, D13, SP, offsetof(StackLayout::Thunk, xmm[2])); + STP(D14, D15, SP, offsetof(StackLayout::Thunk, xmm[3])); +} + +void 
A64ThunkEmitter::EmitLoadNonvolatileRegs() { + LDP(X19, X20, SP, offsetof(StackLayout::Thunk, r[0])); + LDP(X21, X22, SP, offsetof(StackLayout::Thunk, r[2])); + LDP(X23, X24, SP, offsetof(StackLayout::Thunk, r[4])); + LDP(X25, X26, SP, offsetof(StackLayout::Thunk, r[6])); + LDP(X27, X28, SP, offsetof(StackLayout::Thunk, r[8])); + LDP(X29, X30, SP, offsetof(StackLayout::Thunk, r[10])); + + LDR(X17, SP, offsetof(StackLayout::Thunk, r[12])); + + LDP(D8, D9, SP, offsetof(StackLayout::Thunk, xmm[0])); + LDP(D10, D11, SP, offsetof(StackLayout::Thunk, xmm[1])); + LDP(D12, D13, SP, offsetof(StackLayout::Thunk, xmm[2])); + LDP(D14, D15, SP, offsetof(StackLayout::Thunk, xmm[3])); +} + +} // namespace a64 +} // namespace backend +} // namespace cpu +} // namespace xe diff --git a/src/xenia/cpu/backend/a64/a64_backend.h b/src/xenia/cpu/backend/a64/a64_backend.h new file mode 100644 index 000000000..57557414c --- /dev/null +++ b/src/xenia/cpu/backend/a64/a64_backend.h @@ -0,0 +1,88 @@ +/** + ****************************************************************************** + * Xenia : Xbox 360 Emulator Research Project * + ****************************************************************************** + * Copyright 2024 Ben Vanik. All rights reserved. * + * Released under the BSD license - see LICENSE in the root for more details. * + ****************************************************************************** + */ + +#ifndef XENIA_CPU_BACKEND_A64_A64_BACKEND_H_ +#define XENIA_CPU_BACKEND_A64_A64_BACKEND_H_ + +#include + +#include "xenia/base/cvar.h" +#include "xenia/cpu/backend/backend.h" + +DECLARE_int32(a64_extension_mask); + +namespace xe { +class Exception; +} // namespace xe +namespace xe { +namespace cpu { +namespace backend { +namespace a64 { + +class A64CodeCache; + +typedef void* (*HostToGuestThunk)(void* target, void* arg0, void* arg1); +typedef void* (*GuestToHostThunk)(void* target, void* arg0, void* arg1); +typedef void (*ResolveFunctionThunk)(); + +class A64Backend : public Backend { + public: + static const uint32_t kForceReturnAddress = 0x9FFF0000u; + + explicit A64Backend(); + ~A64Backend() override; + + A64CodeCache* code_cache() const { return code_cache_.get(); } + uintptr_t emitter_data() const { return emitter_data_; } + + // Call a generated function, saving all stack parameters. + HostToGuestThunk host_to_guest_thunk() const { return host_to_guest_thunk_; } + // Function that guest code can call to transition into host code. + GuestToHostThunk guest_to_host_thunk() const { return guest_to_host_thunk_; } + // Function that thunks to the ResolveFunction in A64Emitter. 
+ ResolveFunctionThunk resolve_function_thunk() const { + return resolve_function_thunk_; + } + + bool Initialize(Processor* processor) override; + + void CommitExecutableRange(uint32_t guest_low, uint32_t guest_high) override; + + std::unique_ptr CreateAssembler() override; + + std::unique_ptr CreateGuestFunction(Module* module, + uint32_t address) override; + + uint64_t CalculateNextHostInstruction(ThreadDebugInfo* thread_info, + uint64_t current_pc) override; + + void InstallBreakpoint(Breakpoint* breakpoint) override; + void InstallBreakpoint(Breakpoint* breakpoint, Function* fn) override; + void UninstallBreakpoint(Breakpoint* breakpoint) override; + + private: + static bool ExceptionCallbackThunk(Exception* ex, void* data); + bool ExceptionCallback(Exception* ex); + + uintptr_t capstone_handle_ = 0; + + std::unique_ptr code_cache_; + uintptr_t emitter_data_ = 0; + + HostToGuestThunk host_to_guest_thunk_; + GuestToHostThunk guest_to_host_thunk_; + ResolveFunctionThunk resolve_function_thunk_; +}; + +} // namespace a64 +} // namespace backend +} // namespace cpu +} // namespace xe + +#endif // XENIA_CPU_BACKEND_A64_A64_BACKEND_H_ diff --git a/src/xenia/cpu/backend/a64/a64_code_cache.cc b/src/xenia/cpu/backend/a64/a64_code_cache.cc new file mode 100644 index 000000000..f484967ac --- /dev/null +++ b/src/xenia/cpu/backend/a64/a64_code_cache.cc @@ -0,0 +1,342 @@ +/** + ****************************************************************************** + * Xenia : Xbox 360 Emulator Research Project * + ****************************************************************************** + * Copyright 2024 Ben Vanik. All rights reserved. * + * Released under the BSD license - see LICENSE in the root for more details. * + ****************************************************************************** + */ + +#include "xenia/cpu/backend/a64/a64_code_cache.h" + +#include +#include + +#include "third_party/fmt/include/fmt/format.h" +#include "xenia/base/assert.h" +#include "xenia/base/clock.h" +#include "xenia/base/literals.h" +#include "xenia/base/logging.h" +#include "xenia/base/math.h" +#include "xenia/base/memory.h" +#include "xenia/cpu/function.h" +#include "xenia/cpu/module.h" + +namespace xe { +namespace cpu { +namespace backend { +namespace a64 { + +using namespace xe::literals; + +A64CodeCache::A64CodeCache() = default; + +A64CodeCache::~A64CodeCache() { + if (indirection_table_base_) { + xe::memory::DeallocFixed(indirection_table_base_, 0, + xe::memory::DeallocationType::kRelease); + } + + // Unmap all views and close mapping. 
+ if (mapping_ != xe::memory::kFileMappingHandleInvalid) { + if (generated_code_write_base_ && + generated_code_write_base_ != generated_code_execute_base_) { + xe::memory::UnmapFileView(mapping_, generated_code_write_base_, + kGeneratedCodeSize); + } + if (generated_code_execute_base_) { + xe::memory::UnmapFileView(mapping_, generated_code_execute_base_, + kGeneratedCodeSize); + } + xe::memory::CloseFileMappingHandle(mapping_, file_name_); + mapping_ = xe::memory::kFileMappingHandleInvalid; + } +} + +bool A64CodeCache::Initialize() { + indirection_table_base_ = reinterpret_cast(xe::memory::AllocFixed( + reinterpret_cast(kIndirectionTableBase), kIndirectionTableSize, + xe::memory::AllocationType::kReserve, + xe::memory::PageAccess::kReadWrite)); + if (!indirection_table_base_) { + XELOGE("Unable to allocate code cache indirection table"); + XELOGE( + "This is likely because the {:X}-{:X} range is in use by some other " + "system DLL", + static_cast(kIndirectionTableBase), + kIndirectionTableBase + kIndirectionTableSize); + } + + // Create mmap file. This allows us to share the code cache with the debugger. + file_name_ = fmt::format("xenia_code_cache_{}", Clock::QueryHostTickCount()); + mapping_ = xe::memory::CreateFileMappingHandle( + file_name_, kGeneratedCodeSize, xe::memory::PageAccess::kExecuteReadWrite, + false); + if (mapping_ == xe::memory::kFileMappingHandleInvalid) { + XELOGE("Unable to create code cache mmap"); + return false; + } + + // Map generated code region into the file. Pages are committed as required. + if (xe::memory::IsWritableExecutableMemoryPreferred()) { + generated_code_execute_base_ = + reinterpret_cast(xe::memory::MapFileView( + mapping_, reinterpret_cast(kGeneratedCodeExecuteBase), + kGeneratedCodeSize, xe::memory::PageAccess::kExecuteReadWrite, 0)); + generated_code_write_base_ = generated_code_execute_base_; + if (!generated_code_execute_base_ || !generated_code_write_base_) { + XELOGE("Unable to allocate code cache generated code storage"); + XELOGE( + "This is likely because the {:X}-{:X} range is in use by some other " + "system DLL", + uint64_t(kGeneratedCodeExecuteBase), + uint64_t(kGeneratedCodeExecuteBase + kGeneratedCodeSize)); + return false; + } + } else { + generated_code_execute_base_ = + reinterpret_cast(xe::memory::MapFileView( + mapping_, reinterpret_cast(kGeneratedCodeExecuteBase), + kGeneratedCodeSize, xe::memory::PageAccess::kExecuteReadOnly, 0)); + generated_code_write_base_ = + reinterpret_cast(xe::memory::MapFileView( + mapping_, reinterpret_cast(kGeneratedCodeWriteBase), + kGeneratedCodeSize, xe::memory::PageAccess::kReadWrite, 0)); + if (!generated_code_execute_base_ || !generated_code_write_base_) { + XELOGE("Unable to allocate code cache generated code storage"); + XELOGE( + "This is likely because the {:X}-{:X} and {:X}-{:X} ranges are in " + "use by some other system DLL", + uint64_t(kGeneratedCodeExecuteBase), + uint64_t(kGeneratedCodeExecuteBase + kGeneratedCodeSize), + uint64_t(kGeneratedCodeWriteBase), + uint64_t(kGeneratedCodeWriteBase + kGeneratedCodeSize)); + return false; + } + } + + // Preallocate the function map to a large, reasonable size. 
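+  // At 16 bytes per entry this reserves roughly 1.5 MiB, which is cheap next
+  // to the 256 MiB generated-code region itself.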
+ generated_code_map_.reserve(kMaximumFunctionCount); + + return true; +} + +void A64CodeCache::set_indirection_default(uint32_t default_value) { + indirection_default_value_ = default_value; +} + +void A64CodeCache::AddIndirection(uint32_t guest_address, + uint32_t host_address) { + if (!indirection_table_base_) { + return; + } + + uint32_t* indirection_slot = reinterpret_cast( + indirection_table_base_ + (guest_address - kIndirectionTableBase)); + *indirection_slot = host_address; +} + +void A64CodeCache::CommitExecutableRange(uint32_t guest_low, + uint32_t guest_high) { + if (!indirection_table_base_) { + return; + } + + // Commit the memory. + xe::memory::AllocFixed( + indirection_table_base_ + (guest_low - kIndirectionTableBase), + guest_high - guest_low, xe::memory::AllocationType::kCommit, + xe::memory::PageAccess::kReadWrite); + + // Fill memory with the default value. + uint32_t* p = reinterpret_cast(indirection_table_base_); + for (uint32_t address = guest_low; address < guest_high; ++address) { + p[(address - kIndirectionTableBase) / 4] = indirection_default_value_; + } +} + +void A64CodeCache::PlaceHostCode(uint32_t guest_address, void* machine_code, + const EmitFunctionInfo& func_info, + void*& code_execute_address_out, + void*& code_write_address_out) { + // Same for now. We may use different pools or whatnot later on, like when + // we only want to place guest code in a serialized cache on disk. + PlaceGuestCode(guest_address, machine_code, func_info, nullptr, + code_execute_address_out, code_write_address_out); +} + +void A64CodeCache::PlaceGuestCode(uint32_t guest_address, void* machine_code, + const EmitFunctionInfo& func_info, + GuestFunction* function_info, + void*& code_execute_address_out, + void*& code_write_address_out) { + // Hold a lock while we bump the pointers up. This is important as the + // unwind table requires entries AND code to be sorted in order. + size_t low_mark; + size_t high_mark; + uint8_t* code_execute_address; + UnwindReservation unwind_reservation; + { + auto global_lock = global_critical_region_.Acquire(); + + low_mark = generated_code_offset_; + + // Reserve code. + // Always move the code to land on 16b alignment. + code_execute_address = + generated_code_execute_base_ + generated_code_offset_; + code_execute_address_out = code_execute_address; + uint8_t* code_write_address = + generated_code_write_base_ + generated_code_offset_; + code_write_address_out = code_write_address; + generated_code_offset_ += xe::round_up(func_info.code_size.total, 16); + + auto tail_write_address = + generated_code_write_base_ + generated_code_offset_; + + // Reserve unwind info. + // We go on the high size of the unwind info as we don't know how big we + // need it, and a few extra bytes of padding isn't the worst thing. + unwind_reservation = RequestUnwindReservation(generated_code_write_base_ + + generated_code_offset_); + generated_code_offset_ += xe::round_up(unwind_reservation.data_size, 16); + + auto end_write_address = + generated_code_write_base_ + generated_code_offset_; + + high_mark = generated_code_offset_; + + // Store in map. It is maintained in sorted order of host PC dependent on + // us also being append-only. 
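+    // Key layout: [32-bit start offset << 32 | 32-bit end offset], both
+    // relative to the execute base, which is what LookupFunction() bsearches
+    // against using the offset of an arbitrary host PC inside the function.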
+ generated_code_map_.emplace_back( + (uint64_t(code_execute_address - generated_code_execute_base_) << 32) | + generated_code_offset_, + function_info); + + // TODO(DrChat): The following code doesn't really need to be under the + // global lock except for PlaceCode (but it depends on the previous code + // already being ran) + + // If we are going above the high water mark of committed memory, commit + // some more. It's ok if multiple threads do this, as redundant commits + // aren't harmful. + size_t old_commit_mark, new_commit_mark; + do { + old_commit_mark = generated_code_commit_mark_; + if (high_mark <= old_commit_mark) break; + + new_commit_mark = old_commit_mark + 16_MiB; + if (generated_code_execute_base_ == generated_code_write_base_) { + xe::memory::AllocFixed(generated_code_execute_base_, new_commit_mark, + xe::memory::AllocationType::kCommit, + xe::memory::PageAccess::kExecuteReadWrite); + } else { + xe::memory::AllocFixed(generated_code_execute_base_, new_commit_mark, + xe::memory::AllocationType::kCommit, + xe::memory::PageAccess::kExecuteReadOnly); + xe::memory::AllocFixed(generated_code_write_base_, new_commit_mark, + xe::memory::AllocationType::kCommit, + xe::memory::PageAccess::kReadWrite); + } + } while (generated_code_commit_mark_.compare_exchange_weak( + old_commit_mark, new_commit_mark)); + + // Copy code. + std::memcpy(code_write_address, machine_code, func_info.code_size.total); + + // Fill unused slots with 0x00 + std::memset(tail_write_address, 0x00, + static_cast(end_write_address - tail_write_address)); + + // Notify subclasses of placed code. + PlaceCode(guest_address, machine_code, func_info, code_execute_address, + unwind_reservation); + } + + // Now that everything is ready, fix up the indirection table. + // Note that we do support code that doesn't have an indirection fixup, so + // ignore those when we see them. + if (guest_address && indirection_table_base_) { + uint32_t* indirection_slot = reinterpret_cast( + indirection_table_base_ + (guest_address - kIndirectionTableBase)); + *indirection_slot = + uint32_t(reinterpret_cast(code_execute_address)); + } +} + +uint32_t A64CodeCache::PlaceData(const void* data, size_t length) { + // Hold a lock while we bump the pointers up. + size_t high_mark; + uint8_t* data_address = nullptr; + { + auto global_lock = global_critical_region_.Acquire(); + + // Reserve code. + // Always move the code to land on 16b alignment. + data_address = generated_code_write_base_ + generated_code_offset_; + generated_code_offset_ += xe::round_up(length, 16); + + high_mark = generated_code_offset_; + } + + // If we are going above the high water mark of committed memory, commit some + // more. It's ok if multiple threads do this, as redundant commits aren't + // harmful. 
+ size_t old_commit_mark, new_commit_mark; + do { + old_commit_mark = generated_code_commit_mark_; + if (high_mark <= old_commit_mark) break; + + new_commit_mark = old_commit_mark + 16_MiB; + if (generated_code_execute_base_ == generated_code_write_base_) { + xe::memory::AllocFixed(generated_code_execute_base_, new_commit_mark, + xe::memory::AllocationType::kCommit, + xe::memory::PageAccess::kExecuteReadWrite); + } else { + xe::memory::AllocFixed(generated_code_execute_base_, new_commit_mark, + xe::memory::AllocationType::kCommit, + xe::memory::PageAccess::kExecuteReadOnly); + xe::memory::AllocFixed(generated_code_write_base_, new_commit_mark, + xe::memory::AllocationType::kCommit, + xe::memory::PageAccess::kReadWrite); + } + } while (generated_code_commit_mark_.compare_exchange_weak(old_commit_mark, + new_commit_mark)); + + // Copy code. + std::memcpy(data_address, data, length); + + return uint32_t(uintptr_t(data_address)); +} + +GuestFunction* A64CodeCache::LookupFunction(uint64_t host_pc) { + uint32_t key = uint32_t(host_pc - kGeneratedCodeExecuteBase); + void* fn_entry = std::bsearch( + &key, generated_code_map_.data(), generated_code_map_.size() + 1, + sizeof(std::pair), + [](const void* key_ptr, const void* element_ptr) { + auto key = *reinterpret_cast(key_ptr); + auto element = + reinterpret_cast*>( + element_ptr); + if (key < (element->first >> 32)) { + return -1; + } else if (key > uint32_t(element->first)) { + return 1; + } else { + return 0; + } + }); + if (fn_entry) { + return reinterpret_cast*>( + fn_entry) + ->second; + } else { + return nullptr; + } +} + +} // namespace a64 +} // namespace backend +} // namespace cpu +} // namespace xe diff --git a/src/xenia/cpu/backend/a64/a64_code_cache.h b/src/xenia/cpu/backend/a64/a64_code_cache.h new file mode 100644 index 000000000..2bc9ed59f --- /dev/null +++ b/src/xenia/cpu/backend/a64/a64_code_cache.h @@ -0,0 +1,151 @@ +/** + ****************************************************************************** + * Xenia : Xbox 360 Emulator Research Project * + ****************************************************************************** + * Copyright 2024 Ben Vanik. All rights reserved. * + * Released under the BSD license - see LICENSE in the root for more details. 
* + ****************************************************************************** + */ + +#ifndef XENIA_CPU_BACKEND_A64_A64_CODE_CACHE_H_ +#define XENIA_CPU_BACKEND_A64_A64_CODE_CACHE_H_ + +#include +#include +#include +#include +#include +#include +#include + +#include "xenia/base/memory.h" +#include "xenia/base/mutex.h" +#include "xenia/cpu/backend/code_cache.h" + +namespace xe { +namespace cpu { +namespace backend { +namespace a64 { + +struct EmitFunctionInfo { + struct _code_size { + size_t prolog; + size_t body; + size_t epilog; + size_t tail; + size_t total; + } code_size; + size_t prolog_stack_alloc_offset; // offset of instruction after stack alloc + size_t stack_size; +}; + +class A64CodeCache : public CodeCache { + public: + ~A64CodeCache() override; + + static std::unique_ptr Create(); + + virtual bool Initialize(); + + const std::filesystem::path& file_name() const override { return file_name_; } + uintptr_t execute_base_address() const override { + return kGeneratedCodeExecuteBase; + } + size_t total_size() const override { return kGeneratedCodeSize; } + + // TODO(benvanik): ELF serialization/etc + // TODO(benvanik): keep track of code blocks + // TODO(benvanik): padding/guards/etc + + bool has_indirection_table() { return indirection_table_base_ != nullptr; } + void set_indirection_default(uint32_t default_value); + void AddIndirection(uint32_t guest_address, uint32_t host_address); + + void CommitExecutableRange(uint32_t guest_low, uint32_t guest_high); + + void PlaceHostCode(uint32_t guest_address, void* machine_code, + const EmitFunctionInfo& func_info, + void*& code_execute_address_out, + void*& code_write_address_out); + void PlaceGuestCode(uint32_t guest_address, void* machine_code, + const EmitFunctionInfo& func_info, + GuestFunction* function_info, + void*& code_execute_address_out, + void*& code_write_address_out); + uint32_t PlaceData(const void* data, size_t length); + + GuestFunction* LookupFunction(uint64_t host_pc) override; + + protected: + // All executable code falls within 0x80000000 to 0x9FFFFFFF, so we can + // only map enough for lookups within that range. + static const size_t kIndirectionTableSize = 0x1FFFFFFF; + static const uintptr_t kIndirectionTableBase = 0x80000000; + // The code range is 512MB, but we know the total code games will have is + // pretty small (dozens of mb at most) and our expansion is reasonablish + // so 256MB should be more than enough. + static const size_t kGeneratedCodeSize = 0x0FFFFFFF; + static const uintptr_t kGeneratedCodeExecuteBase = 0xA0000000; + // Used for writing when PageAccess::kExecuteReadWrite is not supported. + static const uintptr_t kGeneratedCodeWriteBase = + kGeneratedCodeExecuteBase + kGeneratedCodeSize + 1; + + // This is picked to be high enough to cover whatever we can reasonably + // expect. If we hit issues with this it probably means some corner case + // in analysis triggering. 
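+  // kMaximumFunctionCount also sizes the RUNTIME_FUNCTION table that
+  // Win32A64CodeCache::Initialize() preallocates, and
+  // RequestUnwindReservation asserts against overflowing it, so raising this
+  // grows that allocation as well.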
+ static const size_t kMaximumFunctionCount = 100000; + + struct UnwindReservation { + size_t data_size = 0; + size_t table_slot = 0; + uint8_t* entry_address = 0; + }; + + A64CodeCache(); + + virtual UnwindReservation RequestUnwindReservation(uint8_t* entry_address) { + return UnwindReservation(); + } + virtual void PlaceCode(uint32_t guest_address, void* machine_code, + const EmitFunctionInfo& func_info, + void* code_execute_address, + UnwindReservation unwind_reservation) {} + + std::filesystem::path file_name_; + xe::memory::FileMappingHandle mapping_ = + xe::memory::kFileMappingHandleInvalid; + + // NOTE: the global critical region must be held when manipulating the offsets + // or counts of anything, to keep the tables consistent and ordered. + xe::global_critical_region global_critical_region_; + + // Value that the indirection table will be initialized with upon commit. + uint32_t indirection_default_value_ = 0xFEEDF00D; + + // Fixed at kIndirectionTableBase in host space, holding 4 byte pointers into + // the generated code table that correspond to the PPC functions in guest + // space. + uint8_t* indirection_table_base_ = nullptr; + // Fixed at kGeneratedCodeExecuteBase and holding all generated code, growing + // as needed. + uint8_t* generated_code_execute_base_ = nullptr; + // View of the memory that backs generated_code_execute_base_ when + // PageAccess::kExecuteReadWrite is not supported, for writing the generated + // code. Equals to generated_code_execute_base_ when it's supported. + uint8_t* generated_code_write_base_ = nullptr; + // Current offset to empty space in generated code. + size_t generated_code_offset_ = 0; + // Current high water mark of COMMITTED code. + std::atomic generated_code_commit_mark_ = {0}; + // Sorted map by host PC base offsets to source function info. + // This can be used to bsearch on host PC to find the guest function. + // The key is [start address | end address]. + std::vector> generated_code_map_; +}; + +} // namespace a64 +} // namespace backend +} // namespace cpu +} // namespace xe + +#endif // XENIA_CPU_BACKEND_A64_A64_CODE_CACHE_H_ diff --git a/src/xenia/cpu/backend/a64/a64_code_cache_win.cc b/src/xenia/cpu/backend/a64/a64_code_cache_win.cc new file mode 100644 index 000000000..21a87e9f2 --- /dev/null +++ b/src/xenia/cpu/backend/a64/a64_code_cache_win.cc @@ -0,0 +1,319 @@ +/** + ****************************************************************************** + * Xenia : Xbox 360 Emulator Research Project * + ****************************************************************************** + * Copyright 2024 Ben Vanik. All rights reserved. * + * Released under the BSD license - see LICENSE in the root for more details. 
* + ****************************************************************************** + */ + +#include "xenia/cpu/backend/a64/a64_code_cache.h" + +#include +#include + +#include "xenia/base/assert.h" +#include "xenia/base/clock.h" +#include "xenia/base/logging.h" +#include "xenia/base/math.h" +#include "xenia/base/memory.h" +#include "xenia/base/platform_win.h" +#include "xenia/cpu/function.h" + +// Function pointer definitions +using FnRtlAddGrowableFunctionTable = decltype(&RtlAddGrowableFunctionTable); +using FnRtlGrowFunctionTable = decltype(&RtlGrowFunctionTable); +using FnRtlDeleteGrowableFunctionTable = + decltype(&RtlDeleteGrowableFunctionTable); + +namespace xe { +namespace cpu { +namespace backend { +namespace a64 { + +// ARM64 unwind-op codes +// https://docs.microsoft.com/en-us/cpp/build/arm64-exception-handling#unwind-codes +// https://www.corsix.org/content/windows-arm64-unwind-codes +typedef enum _UNWIND_OP_CODES { + UWOP_NOP = 0xE3, + UWOP_ALLOC_S = 0x00, // sub sp, sp, i*16 + UWOP_ALLOC_L = 0xE0'00'00'00, // sub sp, sp, i*16 + UWOP_SAVE_FPLR = 0x40, // stp fp, lr, [sp+i*8] + UWOP_SAVE_FPLRX = 0x80, // stp fp, lr, [sp-(i+1)*8]! + UWOP_SET_FP = 0xE1, // mov fp, sp + UWOP_END = 0xE4, +} UNWIND_CODE_OPS; + +using UNWIND_CODE = uint32_t; + +static_assert(sizeof(UNWIND_CODE) == sizeof(uint32_t)); + +// UNWIND_INFO defines the static part (first 32-bit) of the .xdata record +typedef struct _UNWIND_INFO { + uint32_t FunctionLength : 18; + uint32_t Version : 2; + uint32_t X : 1; + uint32_t E : 1; + uint32_t EpilogCount : 5; + uint32_t CodeWords : 5; + UNWIND_CODE UnwindCodes[2]; +} UNWIND_INFO, *PUNWIND_INFO; + +static_assert(offsetof(UNWIND_INFO, UnwindCodes[0]) == 4); +static_assert(offsetof(UNWIND_INFO, UnwindCodes[1]) == 8); + +// Size of unwind info per function. +static const uint32_t kUnwindInfoSize = sizeof(UNWIND_INFO); + +class Win32A64CodeCache : public A64CodeCache { + public: + Win32A64CodeCache(); + ~Win32A64CodeCache() override; + + bool Initialize() override; + + void* LookupUnwindInfo(uint64_t host_pc) override; + + private: + UnwindReservation RequestUnwindReservation(uint8_t* entry_address) override; + void PlaceCode(uint32_t guest_address, void* machine_code, + const EmitFunctionInfo& func_info, void* code_execute_address, + UnwindReservation unwind_reservation) override; + + void InitializeUnwindEntry(uint8_t* unwind_entry_address, + size_t unwind_table_slot, + void* code_execute_address, + const EmitFunctionInfo& func_info); + + // Growable function table system handle. + void* unwind_table_handle_ = nullptr; + // Actual unwind table entries. + std::vector unwind_table_; + // Current number of entries in the table. + std::atomic unwind_table_count_ = {0}; + // Does this version of Windows support growable funciton tables? 
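+  // RtlAddGrowableFunctionTable / RtlGrowFunctionTable /
+  // RtlDeleteGrowableFunctionTable are resolved from ntdll at runtime in
+  // Initialize(); if any of them is unavailable we fall back to
+  // RtlInstallFunctionTableCallback instead.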
+ bool supports_growable_table_ = false; + + FnRtlAddGrowableFunctionTable add_growable_table_ = nullptr; + FnRtlDeleteGrowableFunctionTable delete_growable_table_ = nullptr; + FnRtlGrowFunctionTable grow_table_ = nullptr; +}; + +std::unique_ptr A64CodeCache::Create() { + return std::make_unique(); +} + +Win32A64CodeCache::Win32A64CodeCache() = default; + +Win32A64CodeCache::~Win32A64CodeCache() { + if (supports_growable_table_) { + if (unwind_table_handle_) { + delete_growable_table_(unwind_table_handle_); + } + } else { + if (generated_code_execute_base_) { + RtlDeleteFunctionTable(reinterpret_cast( + reinterpret_cast(generated_code_execute_base_) | 0x3)); + } + } +} + +bool Win32A64CodeCache::Initialize() { + if (!A64CodeCache::Initialize()) { + return false; + } + + // Compute total number of unwind entries we should allocate. + // We don't support reallocing right now, so this should be high. + unwind_table_.resize(kMaximumFunctionCount); + + // Check if this version of Windows supports growable function tables. + auto ntdll_handle = GetModuleHandleW(L"ntdll.dll"); + if (!ntdll_handle) { + add_growable_table_ = nullptr; + delete_growable_table_ = nullptr; + grow_table_ = nullptr; + } else { + add_growable_table_ = (FnRtlAddGrowableFunctionTable)GetProcAddress( + ntdll_handle, "RtlAddGrowableFunctionTable"); + delete_growable_table_ = (FnRtlDeleteGrowableFunctionTable)GetProcAddress( + ntdll_handle, "RtlDeleteGrowableFunctionTable"); + grow_table_ = (FnRtlGrowFunctionTable)GetProcAddress( + ntdll_handle, "RtlGrowFunctionTable"); + } + supports_growable_table_ = + add_growable_table_ && delete_growable_table_ && grow_table_; + + // Create table and register with the system. It's empty now, but we'll grow + // it as functions are added. + if (supports_growable_table_) { + if (add_growable_table_( + &unwind_table_handle_, unwind_table_.data(), unwind_table_count_, + DWORD(unwind_table_.size()), + reinterpret_cast(generated_code_execute_base_), + reinterpret_cast(generated_code_execute_base_ + + kGeneratedCodeSize))) { + XELOGE("Unable to create unwind function table"); + return false; + } + } else { + // Install a callback that the debugger will use to lookup unwind info on + // demand. + if (!RtlInstallFunctionTableCallback( + reinterpret_cast(generated_code_execute_base_) | 0x3, + reinterpret_cast(generated_code_execute_base_), + kGeneratedCodeSize, + [](DWORD64 control_pc, PVOID context) { + auto code_cache = reinterpret_cast(context); + return reinterpret_cast( + code_cache->LookupUnwindInfo(control_pc)); + }, + this, nullptr)) { + XELOGE("Unable to install function table callback"); + return false; + } + } + + return true; +} + +Win32A64CodeCache::UnwindReservation +Win32A64CodeCache::RequestUnwindReservation(uint8_t* entry_address) { + assert_false(unwind_table_count_ >= kMaximumFunctionCount); + UnwindReservation unwind_reservation; + unwind_reservation.data_size = xe::round_up(kUnwindInfoSize, 16); + unwind_reservation.table_slot = unwind_table_count_++; + unwind_reservation.entry_address = entry_address; + return unwind_reservation; +} + +void Win32A64CodeCache::PlaceCode(uint32_t guest_address, void* machine_code, + const EmitFunctionInfo& func_info, + void* code_execute_address, + UnwindReservation unwind_reservation) { + // Add unwind info. + InitializeUnwindEntry(unwind_reservation.entry_address, + unwind_reservation.table_slot, code_execute_address, + func_info); + + if (supports_growable_table_) { + // Notify that the unwind table has grown. 
+ // We do this outside of the lock, but with the latest total count. + grow_table_(unwind_table_handle_, unwind_table_count_); + } + + // https://docs.microsoft.com/en-us/uwp/win32-and-com/win32-apis + FlushInstructionCache(GetCurrentProcess(), code_execute_address, + func_info.code_size.total); +} + +constexpr UNWIND_CODE UnwindOpWord(uint8_t code0 = UWOP_NOP, + uint8_t code1 = UWOP_NOP, + uint8_t code2 = UWOP_NOP, + uint8_t code3 = UWOP_NOP) { + return static_cast(code0) | (static_cast(code1) << 8) | + (static_cast(code2) << 16) | + (static_cast(code3) << 24); +} + +// 8-byte unwind code for "stp fp, lr, [sp, #-16]! +// https://docs.microsoft.com/en-us/cpp/build/arm64-exception-handling#unwind-codes +static uint8_t OpSaveFpLrX(int16_t pre_index_offset) { + assert_true(pre_index_offset <= -8); + assert_true(pre_index_offset >= -512); + // 16-byte aligned + constexpr int IndexShift = 3; + constexpr int IndexMask = (1 << IndexShift) - 1; + assert_true((pre_index_offset & IndexMask) == 0); + const uint32_t encoded_value = (-pre_index_offset >> IndexShift) - 1; + return UWOP_SAVE_FPLRX | encoded_value; +} + +// Ensure a 16-byte aligned stack +static constexpr size_t StackAlignShift = 4; // n / 16 +static constexpr size_t StackAlignMask = (1 << StackAlignShift) - 1; // n % 16 + +// 8-byte unwind code for up to +512-byte "sub sp, sp, #stack_space" +// https://docs.microsoft.com/en-us/cpp/build/arm64-exception-handling#unwind-codes +static uint8_t OpAllocS(int16_t stack_space) { + assert_true(stack_space >= 0); + assert_true(stack_space < 512); + assert_true((stack_space & StackAlignMask) == 0); + return UWOP_ALLOC_S | (stack_space >> StackAlignShift); +} + +// 4-byte unwind code for +256MiB "sub sp, sp, #stack_space" +// https://docs.microsoft.com/en-us/cpp/build/arm64-exception-handling#unwind-codes +uint32_t OpAllocL(int32_t stack_space) { + assert_true(stack_space >= 0); + assert_true(stack_space < (0xFFFFFF * 16)); + assert_true((stack_space & StackAlignMask) == 0); + return xe::byte_swap(UWOP_ALLOC_L | + ((stack_space >> StackAlignShift) & 0xFF'FF'FF)); +} + +void Win32A64CodeCache::InitializeUnwindEntry( + uint8_t* unwind_entry_address, size_t unwind_table_slot, + void* code_execute_address, const EmitFunctionInfo& func_info) { + auto unwind_info = reinterpret_cast(unwind_entry_address); + + *unwind_info = {}; + // ARM64 instructions are always multiples of 4 bytes + // Windows ignores the bottom 2 bits + unwind_info->FunctionLength = func_info.code_size.total / 4; + unwind_info->CodeWords = 2; + + // https://learn.microsoft.com/en-us/cpp/build/arm64-exception-handling?view=msvc-170#unwind-codes + // The array of unwind codes is a pool of sequences that describe exactly how + // to undo the effects of the prolog. They're stored in the same order the + // operations need to be undone. The unwind codes can be thought of as a small + // instruction set, encoded as a string of bytes. When execution is complete, + // the return address to the calling function is in the lr register. And, all + // non-volatile registers are restored to their values at the time the + // function was called. + + // Function frames are generally: + // STP(X29, X30, SP, PRE_INDEXED, -16); + // MOV(X29, XSP); + // SUB(XSP, XSP, stack_size); + // ... function body ... 
+ // ADD(XSP, XSP, stack_size); + // MOV(XSP, X29); + // LDP(X29, X30, SP, POST_INDEXED, 16); + + // These opcodes must undo the epilog and put the return address within lr + unwind_info->UnwindCodes[0] = OpAllocL(func_info.stack_size); + unwind_info->UnwindCodes[1] = + UnwindOpWord(UWOP_SET_FP, OpSaveFpLrX(-16), UWOP_END); + + // Add entry. + RUNTIME_FUNCTION& fn_entry = unwind_table_[unwind_table_slot]; + fn_entry.BeginAddress = + DWORD(reinterpret_cast(code_execute_address) - + generated_code_execute_base_); + fn_entry.UnwindData = + DWORD(unwind_entry_address - generated_code_execute_base_); +} + +void* Win32A64CodeCache::LookupUnwindInfo(uint64_t host_pc) { + return std::bsearch( + &host_pc, unwind_table_.data(), unwind_table_count_, + sizeof(RUNTIME_FUNCTION), + [](const void* key_ptr, const void* element_ptr) { + auto key = *reinterpret_cast(key_ptr) - + kGeneratedCodeExecuteBase; + auto element = reinterpret_cast(element_ptr); + if (key < element->BeginAddress) { + return -1; + } else if (key > (element->BeginAddress + element->FunctionLength)) { + return 1; + } else { + return 0; + } + }); +} + +} // namespace a64 +} // namespace backend +} // namespace cpu +} // namespace xe diff --git a/src/xenia/cpu/backend/a64/a64_emitter.cc b/src/xenia/cpu/backend/a64/a64_emitter.cc new file mode 100644 index 000000000..6ae853ff3 --- /dev/null +++ b/src/xenia/cpu/backend/a64/a64_emitter.cc @@ -0,0 +1,995 @@ +/** + ****************************************************************************** + * Xenia : Xbox 360 Emulator Research Project * + ****************************************************************************** + * Copyright 2024 Ben Vanik. All rights reserved. * + * Released under the BSD license - see LICENSE in the root for more details. * + ****************************************************************************** + */ + +#include "xenia/cpu/backend/a64/a64_emitter.h" +#include "xenia/cpu/backend/a64/a64_util.h" + +#include + +#include +#include + +#include "third_party/fmt/include/fmt/format.h" +#include "xenia/base/assert.h" +#include "xenia/base/atomic.h" +#include "xenia/base/debugging.h" +#include "xenia/base/literals.h" +#include "xenia/base/logging.h" +#include "xenia/base/math.h" +#include "xenia/base/memory.h" +#include "xenia/base/profiling.h" +#include "xenia/base/vec128.h" +#include "xenia/cpu/backend/a64/a64_backend.h" +#include "xenia/cpu/backend/a64/a64_code_cache.h" +#include "xenia/cpu/backend/a64/a64_function.h" +#include "xenia/cpu/backend/a64/a64_sequences.h" +#include "xenia/cpu/backend/a64/a64_stack_layout.h" +#include "xenia/cpu/cpu_flags.h" +#include "xenia/cpu/function.h" +#include "xenia/cpu/function_debug_info.h" +#include "xenia/cpu/processor.h" +#include "xenia/cpu/symbol.h" +#include "xenia/cpu/thread_state.h" + +#include "oaknut/feature_detection/cpu_feature.hpp" +#include "oaknut/feature_detection/feature_detection.hpp" +#include "oaknut/feature_detection/feature_detection_idregs.hpp" + +DEFINE_bool(debugprint_trap_log, false, + "Log debugprint traps to the active debugger", "CPU"); +DEFINE_bool(ignore_undefined_externs, true, + "Don't exit when an undefined extern is called.", "CPU"); +DEFINE_bool(emit_source_annotations, false, + "Add extra movs and nops to make disassembly easier to read.", + "CPU"); + +namespace xe { +namespace cpu { +namespace backend { +namespace a64 { + +using xe::cpu::hir::HIRBuilder; +using xe::cpu::hir::Instr; +using namespace xe::literals; +using namespace oaknut::util; + +static const size_t kStashOffset = 32; +// 
static const size_t kStashOffsetHigh = 32 + 32; + +// Register indices that the HIR is allowed to use for operands +const uint8_t A64Emitter::gpr_reg_map_[A64Emitter::GPR_COUNT] = { + 19, 20, 21, 22, 23, 24, 25, 26, +}; + +const uint8_t A64Emitter::fpr_reg_map_[A64Emitter::FPR_COUNT] = { + 8, 9, 10, 11, 12, 13, 14, 15, +}; + +A64Emitter::A64Emitter(A64Backend* backend) + : VectorCodeGenerator(assembly_buffer), + processor_(backend->processor()), + backend_(backend), + code_cache_(backend->code_cache()) { + oaknut::CpuFeatures cpu_ = oaknut::detect_features(); + + // Combine with id register detection +#if OAKNUT_SUPPORTS_READING_ID_REGISTERS > 0 +#if OAKNUT_SUPPORTS_READING_ID_REGISTERS == 1 + const std::optional id_registers = + oaknut::read_id_registers(); +#elif OAKNUT_SUPPORTS_READING_ID_REGISTERS == 2 + const std::optional id_registers = + oaknut::read_id_registers(0); +#endif + if (id_registers.has_value()) { + cpu_ = cpu_ | oaknut::detect_features_via_id_registers(*id_registers); + } +#endif + +#define TEST_EMIT_FEATURE(emit, ext) \ + if ((cvars::a64_extension_mask & emit) == emit) { \ + feature_flags_ |= (cpu_.has(ext) ? emit : 0); \ + } + + TEST_EMIT_FEATURE(kA64EmitLSE, oaknut::CpuFeature::LSE); + TEST_EMIT_FEATURE(kA64EmitF16C, oaknut::CpuFeature::FP16Conv); + +#undef TEST_EMIT_FEATURE +} + +A64Emitter::~A64Emitter() = default; + +bool A64Emitter::Emit(GuestFunction* function, HIRBuilder* builder, + uint32_t debug_info_flags, FunctionDebugInfo* debug_info, + void** out_code_address, size_t* out_code_size, + std::vector* out_source_map) { + SCOPE_profile_cpu_f("cpu"); + + // Reset. + debug_info_ = debug_info; + debug_info_flags_ = debug_info_flags; + trace_data_ = &function->trace_data(); + source_map_arena_.Reset(); + + // Fill the generator with code. + EmitFunctionInfo func_info = {}; + if (!Emit(builder, func_info)) { + return false; + } + + // Copy the final code to the cache and relocate it. + *out_code_size = offset(); + *out_code_address = Emplace(func_info, function); + + // Stash source map. + source_map_arena_.CloneContents(out_source_map); + + return true; +} + +void* A64Emitter::Emplace(const EmitFunctionInfo& func_info, + GuestFunction* function) { + // Copy the current oaknut instruction-buffer into the code-cache + void* new_execute_address; + void* new_write_address; + + assert_true(func_info.code_size.total == offset()); + + if (function) { + code_cache_->PlaceGuestCode(function->address(), assembly_buffer.data(), + func_info, function, new_execute_address, + new_write_address); + } else { + code_cache_->PlaceHostCode(0, assembly_buffer.data(), func_info, + new_execute_address, new_write_address); + } + + // Reset the oaknut instruction-buffer + assembly_buffer.clear(); + label_lookup_.clear(); + + return new_execute_address; +} + +bool A64Emitter::Emit(HIRBuilder* builder, EmitFunctionInfo& func_info) { + oaknut::Label epilog_label; + epilog_label_ = &epilog_label; + + // Calculate stack size. We need to align things to their natural sizes. + // This could be much better (sort by type/etc). + auto locals = builder->locals(); + size_t stack_offset = StackLayout::GUEST_STACK_SIZE; + for (auto it = locals.begin(); it != locals.end(); ++it) { + auto slot = *it; + size_t type_size = GetTypeSize(slot->type); + + // Align to natural size. + stack_offset = xe::align(stack_offset, type_size); + slot->set_constant((uint32_t)stack_offset); + stack_offset += type_size; + } + + // Ensure 16b alignment. 
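+  // AArch64 SP-relative loads and stores can fault if SP is not kept 16-byte
+  // aligned, and the Windows ARM64 unwind codes emitted by the code cache
+  // encode stack adjustments in 16-byte units, so the final frame size must
+  // stay a multiple of 16.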
+ stack_offset -= StackLayout::GUEST_STACK_SIZE; + stack_offset = xe::align(stack_offset, static_cast(16)); + + struct _code_offsets { + size_t prolog; + size_t prolog_stack_alloc; + size_t body; + size_t epilog; + size_t tail; + } code_offsets = {}; + + code_offsets.prolog = offset(); + + // Function prolog. + // Must be 16b aligned. + // Windows is very strict about the form of this and the epilog: + // https://docs.microsoft.com/en-us/cpp/build/prolog-and-epilog?view=vs-2017 + // IMPORTANT: any changes to the prolog must be kept in sync with + // A64CodeCache, which dynamically generates exception information. + // Adding or changing anything here must be matched! + size_t stack_size = StackLayout::GUEST_STACK_SIZE + stack_offset; + + // The SUB instruction can only encode immediates withi 0xFFF or 0xFFF000 + // If the stack size is greater than 0xFFF, then just align it to 0x1000 + if (stack_size > 0xFFF) { + stack_size = xe::align(stack_size, static_cast(0x1000)); + } + + assert_true(stack_size % 16 == 0); + func_info.stack_size = stack_size; + stack_size_ = stack_size; + + STP(X29, X30, SP, PRE_INDEXED, -16); + MOV(X29, SP); + + SUB(SP, SP, (uint32_t)stack_size); + + code_offsets.prolog_stack_alloc = offset(); + code_offsets.body = offset(); + + STR(GetContextReg(), SP, StackLayout::GUEST_CTX_HOME); + STR(X0, SP, StackLayout::GUEST_RET_ADDR); + STR(XZR, SP, StackLayout::GUEST_CALL_RET_ADDR); + + // Safe now to do some tracing. + if (debug_info_flags_ & DebugInfoFlags::kDebugInfoTraceFunctions) { + // We require 32-bit addresses. + assert_true(uint64_t(trace_data_->header()) < UINT_MAX); + auto trace_header = trace_data_->header(); + + // Call count. + MOV(W0, 1); + MOV(X5, reinterpret_cast( + low_address(&trace_header->function_call_count))); + LDADDAL(X0, X0, X5); + + // Get call history slot. + static_assert(FunctionTraceData::kFunctionCallerHistoryCount == 4, + "bitmask depends on count"); + LDR(X0, X5); + AND(W0, W0, 0b00000011); + + // Record call history value into slot (guest addr in W1). + MOV(X5, reinterpret_cast( + low_address(&trace_header->function_caller_history))); + STR(W1, X5, X0, oaknut::IndexExt::LSL, 2); + + // Calling thread. Load X0 with thread ID. + EmitGetCurrentThreadId(); + MOV(W5, 1); + LSL(W0, W5, W0); + + MOV(X5, reinterpret_cast( + low_address(&trace_header->function_thread_use))); + LDSET(W0, WZR, X5); + } + + // Load membase. + LDR(GetMembaseReg(), GetContextReg(), + offsetof(ppc::PPCContext, virtual_membase)); + + // Body. + auto block = builder->first_block(); + while (block) { + // Mark block labels. + auto label = block->label_head; + while (label) { + l(label_lookup_[label->name]); + label = label->next; + } + + // Process instructions. + const Instr* instr = block->instr_head; + while (instr) { + const Instr* new_tail = instr; + if (!SelectSequence(this, instr, &new_tail)) { + // No sequence found! + // NOTE: If you encounter this after adding a new instruction, do a full + // rebuild! + assert_always(); + XELOGE("Unable to process HIR opcode {}", instr->opcode->name); + break; + } + instr = new_tail; + } + + block = block->next; + } + + // Function epilog. 
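+  // The epilog below must mirror the prolog above (as must the tail-call
+  // paths in Call/CallIndirect): the unwind codes emitted by
+  // Win32A64CodeCache::InitializeUnwindEntry describe exactly this frame
+  // shape, so the two have to stay in sync.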
+ l(epilog_label); + epilog_label_ = nullptr; + EmitTraceUserCallReturn(); + LDR(GetContextReg(), SP, StackLayout::GUEST_CTX_HOME); + + code_offsets.epilog = offset(); + + ADD(SP, SP, (uint32_t)stack_size); + + MOV(SP, X29); + LDP(X29, X30, SP, POST_INDEXED, 16); + + RET(); + + code_offsets.tail = offset(); + + if (cvars::emit_source_annotations) { + NOP(); + NOP(); + NOP(); + NOP(); + NOP(); + } + + assert_zero(code_offsets.prolog); + func_info.code_size.total = offset(); + func_info.code_size.prolog = code_offsets.body - code_offsets.prolog; + func_info.code_size.body = code_offsets.epilog - code_offsets.body; + func_info.code_size.epilog = code_offsets.tail - code_offsets.epilog; + func_info.code_size.tail = offset() - code_offsets.tail; + func_info.prolog_stack_alloc_offset = + code_offsets.prolog_stack_alloc - code_offsets.prolog; + + return true; +} + +void A64Emitter::MarkSourceOffset(const Instr* i) { + auto entry = source_map_arena_.Alloc(); + entry->guest_address = static_cast(i->src1.offset); + entry->hir_offset = uint32_t(i->block->ordinal << 16) | i->ordinal; + entry->code_offset = static_cast(offset()); + + if (cvars::emit_source_annotations) { + NOP(); + NOP(); + MOV(X0, entry->guest_address); + NOP(); + NOP(); + } + + if (debug_info_flags_ & DebugInfoFlags::kDebugInfoTraceFunctionCoverage) { + const uint32_t instruction_index = + (entry->guest_address - trace_data_->start_address()) / 4; + MOV(X0, 1); + MOV(X1, reinterpret_cast( + low_address(trace_data_->instruction_execute_counts() + + instruction_index * 8))); + LDADDAL(X0, ZR, X1); + } +} + +void A64Emitter::EmitGetCurrentThreadId() { + // X27 must point to context. We could fetch from the stack if needed. + LDRH(W0, GetContextReg(), offsetof(ppc::PPCContext, thread_id)); +} + +void A64Emitter::EmitTraceUserCallReturn() {} + +void A64Emitter::DebugBreak() { BRK(0xF000); } + +uint64_t TrapDebugPrint(void* raw_context, uint64_t address) { + auto thread_state = *reinterpret_cast(raw_context); + uint32_t str_ptr = uint32_t(thread_state->context()->r[3]); + // uint16_t str_len = uint16_t(thread_state->context()->r[4]); + auto str = thread_state->memory()->TranslateVirtual(str_ptr); + // TODO(benvanik): truncate to length? + XELOGD("(DebugPrint) {}", str); + + if (cvars::debugprint_trap_log) { + debugging::DebugPrint("(DebugPrint) {}", str); + } + + return 0; +} + +uint64_t TrapDebugBreak(void* raw_context, uint64_t address) { + auto thread_state = *reinterpret_cast(raw_context); + XELOGE("tw/td forced trap hit! This should be a crash!"); + if (cvars::break_on_debugbreak) { + xe::debugging::Break(); + } + return 0; +} + +void A64Emitter::Trap(uint16_t trap_type) { + switch (trap_type) { + case 20: + case 26: + // 0x0FE00014 is a 'debug print' where r3 = buffer r4 = length + CallNative(TrapDebugPrint, 0); + break; + case 0: + case 22: + // Always trap? + // TODO(benvanik): post software interrupt to debugger. + CallNative(TrapDebugBreak, 0); + break; + case 25: + // ? + break; + default: + XELOGW("Unknown trap type {}", trap_type); + BRK(0xF000); + break; + } +} + +void A64Emitter::UnimplementedInstr(const hir::Instr* i) { + // TODO(benvanik): notify debugger. + BRK(0xF000); + assert_always(); +} + +// This is used by the A64ThunkEmitter's ResolveFunctionThunk. +uint64_t ResolveFunction(void* raw_context, uint64_t target_address) { + auto thread_state = *reinterpret_cast(raw_context); + + // TODO(benvanik): required? 
+ assert_not_zero(target_address); + + auto fn = thread_state->processor()->ResolveFunction( + static_cast(target_address)); + assert_not_null(fn); + auto a64_fn = static_cast(fn); + uint64_t addr = reinterpret_cast(a64_fn->machine_code()); + + return addr; +} + +void A64Emitter::Call(const hir::Instr* instr, GuestFunction* function) { + assert_not_null(function); + auto fn = static_cast(function); + // Resolve address to the function to call and store in X16. + if (fn->machine_code()) { + // TODO(benvanik): is it worth it to do this? It removes the need for + // a ResolveFunction call, but makes the table less useful. + assert_zero(uint64_t(fn->machine_code()) & 0xFFFFFFFF00000000); + MOV(X16, uint32_t(uint64_t(fn->machine_code()))); + } else if (code_cache_->has_indirection_table()) { + // Load the pointer to the indirection table maintained in A64CodeCache. + // The target dword will either contain the address of the generated code + // or a thunk to ResolveAddress. + MOV(W17, function->address()); + LDR(W16, X17); + } else { + // Old-style resolve. + // Not too important because indirection table is almost always available. + // TODO: Overwrite the call-site with a straight call. + CallNative(&ResolveFunction, function->address()); + MOV(X16, X0); + } + + // Actually jump/call to X16. + if (instr->flags & hir::CALL_TAIL) { + // Since we skip the prolog we need to mark the return here. + EmitTraceUserCallReturn(); + + // Pass the callers return address over. + LDR(X0, SP, StackLayout::GUEST_RET_ADDR); + + ADD(SP, SP, static_cast(stack_size())); + + MOV(SP, X29); + LDP(X29, X30, SP, POST_INDEXED, 16); + + BR(X16); + } else { + // Return address is from the previous SET_RETURN_ADDRESS. + LDR(X0, SP, StackLayout::GUEST_CALL_RET_ADDR); + + BLR(X16); + } +} + +void A64Emitter::CallIndirect(const hir::Instr* instr, + const oaknut::XReg& reg) { + // Check if return. + if (instr->flags & hir::CALL_POSSIBLE_RETURN) { + LDR(W16, SP, StackLayout::GUEST_RET_ADDR); + CMP(reg.toW(), W16); + B(oaknut::Cond::EQ, epilog_label()); + } + + // Load the pointer to the indirection table maintained in A64CodeCache. + // The target dword will either contain the address of the generated code + // or a thunk to ResolveAddress. + if (code_cache_->has_indirection_table()) { + if (reg.toW().index() != W17.index()) { + MOV(W17, reg.toW()); + } + LDR(W16, X17); + } else { + // Old-style resolve. + // Not too important because indirection table is almost always available. + MOV(X0, GetContextReg()); + MOV(W1, reg.toW()); + + MOV(X16, reinterpret_cast(ResolveFunction)); + BLR(X16); + MOV(X16, X0); + } + + // Actually jump/call to X16. + if (instr->flags & hir::CALL_TAIL) { + // Since we skip the prolog we need to mark the return here. + EmitTraceUserCallReturn(); + + // Pass the callers return address over. + LDR(X0, SP, StackLayout::GUEST_RET_ADDR); + + ADD(SP, SP, static_cast(stack_size())); + + MOV(SP, X29); + LDP(X29, X30, SP, POST_INDEXED, 16); + + BR(X16); + } else { + // Return address is from the previous SET_RETURN_ADDRESS. 
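+    // X0 carries the guest return address into the callee; the callee's
+    // prolog stashes it at StackLayout::GUEST_RET_ADDR (see the
+    // STR(X0, ...) in A64Emitter::Emit).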
+ LDR(X0, SP, StackLayout::GUEST_CALL_RET_ADDR); + + BLR(X16); + } +} + +uint64_t UndefinedCallExtern(void* raw_context, uint64_t function_ptr) { + auto function = reinterpret_cast(function_ptr); + if (!cvars::ignore_undefined_externs) { + xe::FatalError(fmt::format("undefined extern call to {:08X} {}", + function->address(), function->name().c_str())); + } else { + XELOGE("undefined extern call to {:08X} {}", function->address(), + function->name()); + } + return 0; +} +void A64Emitter::CallExtern(const hir::Instr* instr, const Function* function) { + bool undefined = true; + if (function->behavior() == Function::Behavior::kBuiltin) { + auto builtin_function = static_cast(function); + if (builtin_function->handler()) { + undefined = false; + // x0 = target function + // x1 = arg0 + // x2 = arg1 + // x3 = arg2 + MOV(X0, reinterpret_cast(builtin_function->handler())); + MOV(X1, reinterpret_cast(builtin_function->arg0())); + MOV(X2, reinterpret_cast(builtin_function->arg1())); + + auto thunk = backend()->guest_to_host_thunk(); + MOV(X16, reinterpret_cast(thunk)); + BLR(X16); + + // x0 = host return + } + } else if (function->behavior() == Function::Behavior::kExtern) { + auto extern_function = static_cast(function); + if (extern_function->extern_handler()) { + undefined = false; + // x0 = target function + // x1 = arg0 + // x2 = arg1 + // x3 = arg2 + MOV(X0, reinterpret_cast(extern_function->extern_handler())); + LDR(X1, GetContextReg(), offsetof(ppc::PPCContext, kernel_state)); + + auto thunk = backend()->guest_to_host_thunk(); + MOV(X16, reinterpret_cast(thunk)); + BLR(X16); + + // x0 = host return + } + } + if (undefined) { + CallNative(UndefinedCallExtern, reinterpret_cast(function)); + } +} + +void A64Emitter::CallNative(void* fn) { CallNativeSafe(fn); } + +void A64Emitter::CallNative(uint64_t (*fn)(void* raw_context)) { + CallNativeSafe(reinterpret_cast(fn)); +} + +void A64Emitter::CallNative(uint64_t (*fn)(void* raw_context, uint64_t arg0)) { + CallNativeSafe(reinterpret_cast(fn)); +} + +void A64Emitter::CallNative(uint64_t (*fn)(void* raw_context, uint64_t arg0), + uint64_t arg0) { + MOV(GetNativeParam(0), arg0); + CallNativeSafe(reinterpret_cast(fn)); +} + +void A64Emitter::CallNativeSafe(void* fn) { + // X0 = target function + // X1 = arg0 + // X2 = arg1 + // X3 = arg2 + auto thunk = backend()->guest_to_host_thunk(); + + MOV(X0, reinterpret_cast(fn)); + + MOV(X16, reinterpret_cast(thunk)); + BLR(X16); + + // X0 = host return +} + +void A64Emitter::SetReturnAddress(uint64_t value) { + MOV(X0, value); + STR(X0, SP, StackLayout::GUEST_CALL_RET_ADDR); +} + +oaknut::XReg A64Emitter::GetNativeParam(uint32_t param) { + if (param == 0) + return X1; + else if (param == 1) + return X2; + else if (param == 2) + return X3; + + assert_always(); + return X3; +} + +// Important: If you change these, you must update the thunks in a64_backend.cc! +oaknut::XReg A64Emitter::GetContextReg() { return X27; } +oaknut::XReg A64Emitter::GetMembaseReg() { return X28; } + +void A64Emitter::ReloadContext() { + LDR(GetContextReg(), SP, StackLayout::GUEST_CTX_HOME); +} + +void A64Emitter::ReloadMembase() { + LDR(GetMembaseReg(), GetContextReg(), + offsetof(ppc::PPCContext, virtual_membase)); +} + +bool A64Emitter::ConstantFitsIn32Reg(uint64_t v) { + if ((v & ~0x7FFFFFFF) == 0) { + // Fits under 31 bits, so just load using normal mov. + return true; + } else if ((v & ~0x7FFFFFFF) == ~0x7FFFFFFF) { + // Negative number that fits in 32bits. 
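+    // e.g. v = 0xFFFFFFFFFFFFFFF0 (-16): the top 33 bits are all ones, so the
+    // value round-trips through a sign-extended 32-bit immediate.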
+ return true; + } + return false; +} + +void A64Emitter::MovMem64(const oaknut::XRegSp& addr, intptr_t offset, + uint64_t v) { + if (v == 0) { + STR(XZR, addr, offset); + } else if (!(v >> 32)) { + // All high bits are zero, 32-bit MOV + MOV(W0, static_cast(v)); + STR(X0, addr, offset); + } else { + // 64bit number that needs double movs. + MOV(X0, v); + STR(X0, addr, offset); + } +} + +static const vec128_t v_consts[] = { + /* VZero */ vec128f(0.0f), + /* VOnePD */ vec128d(1.0), + /* VNegativeOne */ vec128f(-1.0f, -1.0f, -1.0f, -1.0f), + /* VFFFF */ + vec128i(0xFFFFFFFFu, 0xFFFFFFFFu, 0xFFFFFFFFu, 0xFFFFFFFFu), + /* VMaskX16Y16 */ + vec128i(0x0000FFFFu, 0xFFFF0000u, 0x00000000u, 0x00000000u), + /* VFlipX16Y16 */ + vec128i(0x00008000u, 0x00000000u, 0x00000000u, 0x00000000u), + /* VFixX16Y16 */ vec128f(-32768.0f, 0.0f, 0.0f, 0.0f), + /* VNormalizeX16Y16 */ + vec128f(1.0f / 32767.0f, 1.0f / (32767.0f * 65536.0f), 0.0f, 0.0f), + /* V0001 */ vec128f(0.0f, 0.0f, 0.0f, 1.0f), + /* V3301 */ vec128f(3.0f, 3.0f, 0.0f, 1.0f), + /* V3331 */ vec128f(3.0f, 3.0f, 3.0f, 1.0f), + /* V3333 */ vec128f(3.0f, 3.0f, 3.0f, 3.0f), + /* VSignMaskPS */ + vec128i(0x80000000u, 0x80000000u, 0x80000000u, 0x80000000u), + /* VSignMaskPD */ + vec128i(0x00000000u, 0x80000000u, 0x00000000u, 0x80000000u), + /* VAbsMaskPS */ + vec128i(0x7FFFFFFFu, 0x7FFFFFFFu, 0x7FFFFFFFu, 0x7FFFFFFFu), + /* VAbsMaskPD */ + vec128i(0xFFFFFFFFu, 0x7FFFFFFFu, 0xFFFFFFFFu, 0x7FFFFFFFu), + /* VByteSwapMask */ + vec128i(0x00010203u, 0x04050607u, 0x08090A0Bu, 0x0C0D0E0Fu), + /* VByteOrderMask */ + vec128i(0x01000302u, 0x05040706u, 0x09080B0Au, 0x0D0C0F0Eu), + /* VPermuteControl15 */ vec128b(15), + /* VPermuteByteMask */ vec128b(0x1F), + /* VPackD3DCOLORSat */ vec128i(0x404000FFu), + /* VPackD3DCOLOR */ + vec128i(0xFFFFFFFFu, 0xFFFFFFFFu, 0xFFFFFFFFu, 0x0C000408u), + /* VUnpackD3DCOLOR */ + vec128i(0xFFFFFF0Eu, 0xFFFFFF0Du, 0xFFFFFF0Cu, 0xFFFFFF0Fu), + /* VPackFLOAT16_2 */ + vec128i(0xFFFFFFFFu, 0xFFFFFFFFu, 0xFFFFFFFFu, 0x01000302u), + /* VUnpackFLOAT16_2 */ + vec128i(0x0D0C0F0Eu, 0xFFFFFFFFu, 0xFFFFFFFFu, 0xFFFFFFFFu), + /* VPackFLOAT16_4 */ + vec128i(0xFFFFFFFFu, 0xFFFFFFFFu, 0x01000302u, 0x05040706u), + /* VUnpackFLOAT16_4 */ + vec128i(0x09080B0Au, 0x0D0C0F0Eu, 0xFFFFFFFFu, 0xFFFFFFFFu), + /* VPackSHORT_Min */ vec128i(0x403F8001u), + /* VPackSHORT_Max */ vec128i(0x40407FFFu), + /* VPackSHORT_2 */ + vec128i(0xFFFFFFFFu, 0xFFFFFFFFu, 0xFFFFFFFFu, 0x01000504u), + /* VPackSHORT_4 */ + vec128i(0xFFFFFFFFu, 0xFFFFFFFFu, 0x01000504u, 0x09080D0Cu), + /* VUnpackSHORT_2 */ + vec128i(0xFFFF0F0Eu, 0xFFFF0D0Cu, 0xFFFFFFFFu, 0xFFFFFFFFu), + /* VUnpackSHORT_4 */ + vec128i(0xFFFF0B0Au, 0xFFFF0908u, 0xFFFF0F0Eu, 0xFFFF0D0Cu), + /* VUnpackSHORT_Overflow */ vec128i(0x403F8000u), + /* VPackUINT_2101010_MinUnpacked */ + vec128i(0x403FFE01u, 0x403FFE01u, 0x403FFE01u, 0x40400000u), + /* VPackUINT_2101010_MaxUnpacked */ + vec128i(0x404001FFu, 0x404001FFu, 0x404001FFu, 0x40400003u), + /* VPackUINT_2101010_MaskUnpacked */ + vec128i(0x3FFu, 0x3FFu, 0x3FFu, 0x3u), + /* VPackUINT_2101010_MaskPacked */ + vec128i(0x3FFu, 0x3FFu << 10, 0x3FFu << 20, 0x3u << 30), + /* VPackUINT_2101010_Shift */ vec128i(0, 10, 20, 30), + /* VUnpackUINT_2101010_Overflow */ vec128i(0x403FFE00u), + /* VPackULONG_4202020_MinUnpacked */ + vec128i(0x40380001u, 0x40380001u, 0x40380001u, 0x40400000u), + /* VPackULONG_4202020_MaxUnpacked */ + vec128i(0x4047FFFFu, 0x4047FFFFu, 0x4047FFFFu, 0x4040000Fu), + /* VPackULONG_4202020_MaskUnpacked */ + vec128i(0xFFFFFu, 0xFFFFFu, 0xFFFFFu, 0xFu), + /* 
VPackULONG_4202020_PermuteXZ */ + vec128i(0xFFFFFFFFu, 0xFFFFFFFFu, 0x0A0908FFu, 0xFF020100u), + /* VPackULONG_4202020_PermuteYW */ + vec128i(0xFFFFFFFFu, 0xFFFFFFFFu, 0x0CFFFF06u, 0x0504FFFFu), + /* VUnpackULONG_4202020_Permute */ + vec128i(0xFF0E0D0Cu, 0xFF0B0A09u, 0xFF080F0Eu, 0xFFFFFF0Bu), + /* VUnpackULONG_4202020_Overflow */ vec128i(0x40380000u), + /* VOneOver255 */ vec128f(1.0f / 255.0f), + /* VMaskEvenPI16 */ + vec128i(0x0000FFFFu, 0x0000FFFFu, 0x0000FFFFu, 0x0000FFFFu), + /* VShiftMaskEvenPI16 */ + vec128i(0x0000000Fu, 0x0000000Fu, 0x0000000Fu, 0x0000000Fu), + /* VShiftMaskPS */ + vec128i(0x0000001Fu, 0x0000001Fu, 0x0000001Fu, 0x0000001Fu), + /* VShiftByteMask */ + vec128i(0x000000FFu, 0x000000FFu, 0x000000FFu, 0x000000FFu), + /* VSwapWordMask */ + vec128i(0x03030303u, 0x03030303u, 0x03030303u, 0x03030303u), + /* VUnsignedDwordMax */ + vec128i(0xFFFFFFFFu, 0x00000000u, 0xFFFFFFFFu, 0x00000000u), + /* V255 */ vec128f(255.0f), + /* VPI32 */ vec128i(32), + /* VSignMaskI8 */ + vec128i(0x80808080u, 0x80808080u, 0x80808080u, 0x80808080u), + /* VSignMaskI16 */ + vec128i(0x80008000u, 0x80008000u, 0x80008000u, 0x80008000u), + /* VSignMaskI32 */ + vec128i(0x80000000u, 0x80000000u, 0x80000000u, 0x80000000u), + /* VSignMaskF32 */ + vec128i(0x80000000u, 0x80000000u, 0x80000000u, 0x80000000u), + /* VShortMinPS */ vec128f(SHRT_MIN), + /* VShortMaxPS */ vec128f(SHRT_MAX), + /* VIntMin */ vec128i(INT_MIN), + /* VIntMax */ vec128i(INT_MAX), + /* VIntMaxPD */ vec128d(INT_MAX), + /* VPosIntMinPS */ vec128f((float)0x80000000u), + /* VQNaN */ vec128i(0x7FC00000u), + /* VInt127 */ vec128i(0x7Fu), + /* V2To32 */ vec128f(0x1.0p32f), +}; + +// First location to try and place constants. +static const uintptr_t kConstDataLocation = 0x20000000; +static const uintptr_t kConstDataSize = sizeof(v_consts); + +// Increment the location by this amount for every allocation failure. +static const uintptr_t kConstDataIncrement = 0x00001000; + +// This function places constant data that is used by the emitter later on. +// Only called once and used by multiple instances of the emitter. +// +// TODO(DrChat): This should be placed in the code cache with the code, but +// doing so requires RIP-relative addressing, which is difficult to support +// given the current setup. +uintptr_t A64Emitter::PlaceConstData() { + uint8_t* ptr = reinterpret_cast(kConstDataLocation); + void* mem = nullptr; + while (!mem) { + mem = memory::AllocFixed( + ptr, xe::round_up(kConstDataSize, memory::page_size()), + memory::AllocationType::kReserveCommit, memory::PageAccess::kReadWrite); + + ptr += kConstDataIncrement; + } + + // The pointer must not be greater than 31 bits. + assert_zero(reinterpret_cast(mem) & ~0x7FFFFFFF); + std::memcpy(mem, v_consts, sizeof(v_consts)); + memory::Protect(mem, kConstDataSize, memory::PageAccess::kReadOnly, nullptr); + + return reinterpret_cast(mem); +} + +void A64Emitter::FreeConstData(uintptr_t data) { + memory::DeallocFixed(reinterpret_cast(data), 0, + memory::DeallocationType::kRelease); +} + +uintptr_t A64Emitter::GetVConstPtr() const { return backend_->emitter_data(); } + +uintptr_t A64Emitter::GetVConstPtr(VConst id) const { + // Load through fixed constant table setup by PlaceConstData. + // It's important that the pointer is not signed, as it will be sign-extended. + return GetVConstPtr() + GetVConstOffset(id); +} + +// Implies possible StashV(0, ...)! +void A64Emitter::LoadConstantV(oaknut::QReg dest, const vec128_t& v) { + if (!v.low && !v.high) { + // 0000... 
+ // MOVI is implemented as a register-rename while EOR(x, x, x) is not + // https://dougallj.github.io/applecpu/firestorm.html + MOVI(dest.B16(), 0); + } else if (v.low == ~uint64_t(0) && v.high == ~uint64_t(0)) { + // 1111... + MOVI(dest.B16(), 0xFF); + } else { + // Try to figure out some common splat-patterns to utilize MOVI rather than + // stashing to memory. + const bool all_same_u8 = + std::adjacent_find(std::cbegin(v.u8), std::cend(v.u8), + std::not_equal_to<>()) == std::cend(v.u8); + + if (all_same_u8) { + // 0xXX, 0xXX, 0xXX... + MOVI(dest.B16(), v.u8[0]); + return; + } + + const bool all_same_u16 = + std::adjacent_find(std::cbegin(v.u16), std::cend(v.u16), + std::not_equal_to<>()) == std::cend(v.u16); + + if (all_same_u16) { + if ((v.u16[0] & 0xFF00) == 0) { + // 0x00XX, 0x00XX, 0x00XX... + MOVI(dest.H8(), uint8_t(v.u16[0])); + return; + } else if ((v.u16[0] & 0x00FF) == 0) { + // 0xXX00, 0xXX00, 0xXX00... + MOVI(dest.H8(), uint8_t(v.u16[0] >> 8), oaknut::util::LSL, 8); + return; + } + } + + const bool all_same_u32 = + std::adjacent_find(std::cbegin(v.u32), std::cend(v.u32), + std::not_equal_to<>()) == std::cend(v.u32); + + if (all_same_u32) { + if ((v.u32[0] & 0x00FFFFFF) == 0) { + // This is used a lot for certain float-splats and should be checked + // first before the others + // 0xXX000000, 0xXX000000, 0xXX000000... + MOVI(dest.S4(), uint8_t(v.u32[0] >> 24), oaknut::util::LSL, 24); + return; + } else if ((v.u32[0] & 0xFFFFFF00) == 0) { + // 0x000000XX, 0x000000XX, 0x000000XX... + MOVI(dest.S4(), uint8_t(v.u32[0])); + return; + } else if ((v.u32[0] & 0xFFFF00FF) == 0) { + // 0x0000XX00, 0x0000XX00, 0x0000XX00... + MOVI(dest.S4(), uint8_t(v.u32[0] >> 8), oaknut::util::LSL, 8); + return; + } else if ((v.u32[0] & 0xFF00FFFF) == 0) { + // 0x00XX0000, 0x00XX0000, 0x00XX0000... + MOVI(dest.S4(), uint8_t(v.u32[0] >> 16), oaknut::util::LSL, 16); + return; + } + + // Try to utilize FMOV if possible + oaknut::FImm8 fp8(0); + if (f32_to_fimm8(v.u32[0], fp8)) { + FMOV(dest.S4(), fp8); + return; + } + } + + // TODO(benvanik): see what other common values are. + // TODO(benvanik): build constant table - 99% are reused. + MovMem64(SP, kStashOffset, v.low); + MovMem64(SP, kStashOffset + 8, v.high); + LDR(dest, SP, kStashOffset); + } +} + +void A64Emitter::LoadConstantV(oaknut::QReg dest, float v) { + union { + float f; + uint32_t i; + } x = {v}; + if (!x.i) { + // +0.0f (but not -0.0f because it may be used to flip the sign via xor). + MOVI(dest.B16(), 0); + } else if (x.i == ~uint32_t(0)) { + // 1111... + MOVI(dest.B16(), 0xFF); + } else { + // TODO(benvanik): see what other common values are. + // TODO(benvanik): build constant table - 99% are reused. + + // Try to utilize FMOV if possible + oaknut::FImm8 fp8(0); + if (f32_to_fimm8(x.i, fp8)) { + FMOV(dest.toS(), fp8); + return; + } + + MOV(W0, x.i); + FMOV(dest.toS(), W0); + } +} + +void A64Emitter::LoadConstantV(oaknut::QReg dest, double v) { + union { + double d; + uint64_t i; + } x = {v}; + if (!x.i) { + // +0.0 (but not -0.0 because it may be used to flip the sign via xor). + MOVI(dest.toD(), oaknut::RepImm(0)); + } else if (x.i == ~uint64_t(0)) { + // 1111... + MOVI(dest.toD(), oaknut::RepImm(0xFF)); + } else { + // TODO(benvanik): see what other common values are. + // TODO(benvanik): build constant table - 99% are reused. 
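+    // Note: FMOV (immediate) can only materialize values of the form
+    // +/-(16..31)/16 * 2^r with r in [-3, 4]; f64_to_fimm8 is expected to
+    // reject anything else, in which case we fall back to moving the raw bits
+    // through X0 below.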
+ + // Try to utilize FMOV if possible + oaknut::FImm8 fp8(0); + if (f64_to_fimm8(x.i, fp8)) { + FMOV(dest.toD(), fp8); + return; + } + + MOV(X0, x.i); + FMOV(dest.toD(), X0); + } +} + +uintptr_t A64Emitter::StashV(int index, const oaknut::QReg& r) { + // auto addr = ptr[rsp + kStashOffset + (index * 16)]; + // vmovups(addr, r); + const auto addr = kStashOffset + (index * 16); + STR(r, SP, addr); + return addr; +} + +uintptr_t A64Emitter::StashConstantV(int index, float v) { + union { + float f; + uint32_t i; + } x = {v}; + const auto addr = kStashOffset + (index * 16); + MovMem64(SP, addr, x.i); + MovMem64(SP, addr + 8, 0); + return addr; +} + +uintptr_t A64Emitter::StashConstantV(int index, double v) { + union { + double d; + uint64_t i; + } x = {v}; + const auto addr = kStashOffset + (index * 16); + MovMem64(SP, addr, x.i); + MovMem64(SP, addr + 8, 0); + return addr; +} + +uintptr_t A64Emitter::StashConstantV(int index, const vec128_t& v) { + const auto addr = kStashOffset + (index * 16); + MovMem64(SP, addr, v.low); + MovMem64(SP, addr + 8, v.high); + return addr; +} + +} // namespace a64 +} // namespace backend +} // namespace cpu +} // namespace xe diff --git a/src/xenia/cpu/backend/a64/a64_emitter.h b/src/xenia/cpu/backend/a64/a64_emitter.h new file mode 100644 index 000000000..629c67a4b --- /dev/null +++ b/src/xenia/cpu/backend/a64/a64_emitter.h @@ -0,0 +1,267 @@ +/** + ****************************************************************************** + * Xenia : Xbox 360 Emulator Research Project * + ****************************************************************************** + * Copyright 2024 Ben Vanik. All rights reserved. * + * Released under the BSD license - see LICENSE in the root for more details. * + ****************************************************************************** + */ + +#ifndef XENIA_CPU_BACKEND_A64_A64_EMITTER_H_ +#define XENIA_CPU_BACKEND_A64_A64_EMITTER_H_ + +#include +#include + +#include "xenia/base/arena.h" +#include "xenia/cpu/function.h" +#include "xenia/cpu/function_trace_data.h" +#include "xenia/cpu/hir/hir_builder.h" +#include "xenia/cpu/hir/instr.h" +#include "xenia/cpu/hir/value.h" +#include "xenia/memory.h" + +#include "oaknut/code_block.hpp" +#include "oaknut/oaknut.hpp" + +namespace xe { +namespace cpu { +class Processor; +} // namespace cpu +} // namespace xe + +namespace xe { +namespace cpu { +namespace backend { +namespace a64 { + +class A64Backend; +class A64CodeCache; + +struct EmitFunctionInfo; + +enum RegisterFlags { + REG_DEST = (1 << 0), + REG_ABCD = (1 << 1), +}; + +enum VConst { + VZero = 0, + VOnePD, + VNegativeOne, + VFFFF, + VMaskX16Y16, + VFlipX16Y16, + VFixX16Y16, + VNormalizeX16Y16, + V0001, + V3301, + V3331, + V3333, + VSignMaskPS, + VSignMaskPD, + VAbsMaskPS, + VAbsMaskPD, + VByteSwapMask, + VByteOrderMask, + VPermuteControl15, + VPermuteByteMask, + VPackD3DCOLORSat, + VPackD3DCOLOR, + VUnpackD3DCOLOR, + VPackFLOAT16_2, + VUnpackFLOAT16_2, + VPackFLOAT16_4, + VUnpackFLOAT16_4, + VPackSHORT_Min, + VPackSHORT_Max, + VPackSHORT_2, + VPackSHORT_4, + VUnpackSHORT_2, + VUnpackSHORT_4, + VUnpackSHORT_Overflow, + VPackUINT_2101010_MinUnpacked, + VPackUINT_2101010_MaxUnpacked, + VPackUINT_2101010_MaskUnpacked, + VPackUINT_2101010_MaskPacked, + VPackUINT_2101010_Shift, + VUnpackUINT_2101010_Overflow, + VPackULONG_4202020_MinUnpacked, + VPackULONG_4202020_MaxUnpacked, + VPackULONG_4202020_MaskUnpacked, + VPackULONG_4202020_PermuteXZ, + VPackULONG_4202020_PermuteYW, + VUnpackULONG_4202020_Permute, + VUnpackULONG_4202020_Overflow, + 
VOneOver255, + VMaskEvenPI16, + VShiftMaskEvenPI16, + VShiftMaskPS, + VShiftByteMask, + VSwapWordMask, + VUnsignedDwordMax, + V255, + VPI32, + VSignMaskI8, + VSignMaskI16, + VSignMaskI32, + VSignMaskF32, + VShortMinPS, + VShortMaxPS, + VIntMin, + VIntMax, + VIntMaxPD, + VPosIntMinPS, + VQNaN, + VInt127, + V2To32, +}; + +enum A64EmitterFeatureFlags { + kA64EmitLSE = 1 << 0, + kA64EmitF16C = 1 << 1, +}; + +class A64Emitter : public oaknut::VectorCodeGenerator { + public: + A64Emitter(A64Backend* backend); + virtual ~A64Emitter(); + + Processor* processor() const { return processor_; } + A64Backend* backend() const { return backend_; } + + static uintptr_t PlaceConstData(); + static void FreeConstData(uintptr_t data); + + bool Emit(GuestFunction* function, hir::HIRBuilder* builder, + uint32_t debug_info_flags, FunctionDebugInfo* debug_info, + void** out_code_address, size_t* out_code_size, + std::vector* out_source_map); + + public: + // Reserved: XSP, X27, X28 + // Scratch: X1-X15, X30 | V0-v7 and V16-V31 + // V0-2 + // Available: X19-X26 + // V4-V15 (save to get V3) + static const size_t GPR_COUNT = 8; + static const size_t FPR_COUNT = 8; + + static void SetupReg(const hir::Value* v, oaknut::WReg& r) { + const auto idx = gpr_reg_map_[v->reg.index]; + r = oaknut::WReg(idx); + } + static void SetupReg(const hir::Value* v, oaknut::XReg& r) { + const auto idx = gpr_reg_map_[v->reg.index]; + r = oaknut::XReg(idx); + } + static void SetupReg(const hir::Value* v, oaknut::SReg& r) { + const auto idx = fpr_reg_map_[v->reg.index]; + r = oaknut::SReg(idx); + } + static void SetupReg(const hir::Value* v, oaknut::DReg& r) { + const auto idx = fpr_reg_map_[v->reg.index]; + r = oaknut::DReg(idx); + } + static void SetupReg(const hir::Value* v, oaknut::QReg& r) { + const auto idx = fpr_reg_map_[v->reg.index]; + r = oaknut::QReg(idx); + } + + // Gets(and possibly create) an HIR label with the specified name + oaknut::Label* lookup_label(const char* label_name) { + return &label_lookup_[label_name]; + } + + oaknut::Label& epilog_label() { return *epilog_label_; } + + void MarkSourceOffset(const hir::Instr* i); + + void DebugBreak(); + void Trap(uint16_t trap_type = 0); + void UnimplementedInstr(const hir::Instr* i); + + void Call(const hir::Instr* instr, GuestFunction* function); + void CallIndirect(const hir::Instr* instr, const oaknut::XReg& reg); + void CallExtern(const hir::Instr* instr, const Function* function); + void CallNative(void* fn); + void CallNative(uint64_t (*fn)(void* raw_context)); + void CallNative(uint64_t (*fn)(void* raw_context, uint64_t arg0)); + void CallNative(uint64_t (*fn)(void* raw_context, uint64_t arg0), + uint64_t arg0); + void CallNativeSafe(void* fn); + void SetReturnAddress(uint64_t value); + + static oaknut::XReg GetNativeParam(uint32_t param); + + static oaknut::XReg GetContextReg(); + static oaknut::XReg GetMembaseReg(); + void ReloadContext(); + void ReloadMembase(); + + // Moves a 64bit immediate into memory. 
+ static bool ConstantFitsIn32Reg(uint64_t v); + void MovMem64(const oaknut::XRegSp& addr, intptr_t offset, uint64_t v); + + uintptr_t GetVConstPtr() const; + uintptr_t GetVConstPtr(VConst id) const; + static constexpr uintptr_t GetVConstOffset(VConst id) { + return sizeof(vec128_t) * id; + } + void LoadConstantV(oaknut::QReg dest, float v); + void LoadConstantV(oaknut::QReg dest, double v); + void LoadConstantV(oaknut::QReg dest, const vec128_t& v); + + // Returned addresses are relative to XSP + uintptr_t StashV(int index, const oaknut::QReg& r); + uintptr_t StashConstantV(int index, float v); + uintptr_t StashConstantV(int index, double v); + uintptr_t StashConstantV(int index, const vec128_t& v); + + bool IsFeatureEnabled(uint32_t feature_flag) const { + return (feature_flags_ & feature_flag) == feature_flag; + } + + FunctionDebugInfo* debug_info() const { return debug_info_; } + + size_t stack_size() const { return stack_size_; } + + protected: + void* Emplace(const EmitFunctionInfo& func_info, + GuestFunction* function = nullptr); + bool Emit(hir::HIRBuilder* builder, EmitFunctionInfo& func_info); + void EmitGetCurrentThreadId(); + void EmitTraceUserCallReturn(); + + protected: + Processor* processor_ = nullptr; + A64Backend* backend_ = nullptr; + A64CodeCache* code_cache_ = nullptr; + uint32_t feature_flags_ = 0; + + std::vector assembly_buffer; + + oaknut::Label* epilog_label_ = nullptr; + + // Convert from plain-text label-names into oaknut-labels + std::unordered_map label_lookup_; + + hir::Instr* current_instr_ = nullptr; + + FunctionDebugInfo* debug_info_ = nullptr; + uint32_t debug_info_flags_ = 0; + FunctionTraceData* trace_data_ = nullptr; + Arena source_map_arena_; + + size_t stack_size_ = 0; + + static const uint8_t gpr_reg_map_[GPR_COUNT]; + static const uint8_t fpr_reg_map_[FPR_COUNT]; +}; + +} // namespace a64 +} // namespace backend +} // namespace cpu +} // namespace xe + +#endif // XENIA_CPU_BACKEND_A64_A64_EMITTER_H_ diff --git a/src/xenia/cpu/backend/a64/a64_function.cc b/src/xenia/cpu/backend/a64/a64_function.cc new file mode 100644 index 000000000..9167bde7c --- /dev/null +++ b/src/xenia/cpu/backend/a64/a64_function.cc @@ -0,0 +1,45 @@ +/** + ****************************************************************************** + * Xenia : Xbox 360 Emulator Research Project * + ****************************************************************************** + * Copyright 2024 Ben Vanik. All rights reserved. * + * Released under the BSD license - see LICENSE in the root for more details. * + ****************************************************************************** + */ + +#include "xenia/cpu/backend/a64/a64_function.h" + +#include "xenia/cpu/backend/a64/a64_backend.h" +#include "xenia/cpu/processor.h" +#include "xenia/cpu/thread_state.h" + +namespace xe { +namespace cpu { +namespace backend { +namespace a64 { + +A64Function::A64Function(Module* module, uint32_t address) + : GuestFunction(module, address) {} + +A64Function::~A64Function() { + // machine_code_ is freed by code cache. 
+} + +void A64Function::Setup(uint8_t* machine_code, size_t machine_code_length) { + machine_code_ = machine_code; + machine_code_length_ = machine_code_length; +} + +bool A64Function::CallImpl(ThreadState* thread_state, uint32_t return_address) { + auto backend = + reinterpret_cast(thread_state->processor()->backend()); + auto thunk = backend->host_to_guest_thunk(); + thunk(machine_code_, thread_state->context(), + reinterpret_cast(uintptr_t(return_address))); + return true; +} + +} // namespace a64 +} // namespace backend +} // namespace cpu +} // namespace xe diff --git a/src/xenia/cpu/backend/a64/a64_function.h b/src/xenia/cpu/backend/a64/a64_function.h new file mode 100644 index 000000000..d4c568567 --- /dev/null +++ b/src/xenia/cpu/backend/a64/a64_function.h @@ -0,0 +1,44 @@ +/** + ****************************************************************************** + * Xenia : Xbox 360 Emulator Research Project * + ****************************************************************************** + * Copyright 2024 Ben Vanik. All rights reserved. * + * Released under the BSD license - see LICENSE in the root for more details. * + ****************************************************************************** + */ + +#ifndef XENIA_CPU_BACKEND_A64_A64_FUNCTION_H_ +#define XENIA_CPU_BACKEND_A64_A64_FUNCTION_H_ + +#include "xenia/cpu/function.h" +#include "xenia/cpu/thread_state.h" + +namespace xe { +namespace cpu { +namespace backend { +namespace a64 { + +class A64Function : public GuestFunction { + public: + A64Function(Module* module, uint32_t address); + ~A64Function() override; + + uint8_t* machine_code() const override { return machine_code_; } + size_t machine_code_length() const override { return machine_code_length_; } + + void Setup(uint8_t* machine_code, size_t machine_code_length); + + protected: + bool CallImpl(ThreadState* thread_state, uint32_t return_address) override; + + private: + uint8_t* machine_code_ = nullptr; + size_t machine_code_length_ = 0; +}; + +} // namespace a64 +} // namespace backend +} // namespace cpu +} // namespace xe + +#endif // XENIA_CPU_BACKEND_A64_A64_FUNCTION_H_ diff --git a/src/xenia/cpu/backend/a64/a64_op.h b/src/xenia/cpu/backend/a64/a64_op.h new file mode 100644 index 000000000..2b2f58932 --- /dev/null +++ b/src/xenia/cpu/backend/a64/a64_op.h @@ -0,0 +1,618 @@ +/** + ****************************************************************************** + * Xenia : Xbox 360 Emulator Research Project * + ****************************************************************************** + * Copyright 2024 Xenia Developers. All rights reserved. * + * Released under the BSD license - see LICENSE in the root for more details. * + ****************************************************************************** + */ +#ifndef XENIA_CPU_BACKEND_A64_A64_OP_H_ +#define XENIA_CPU_BACKEND_A64_A64_OP_H_ + +#include "xenia/cpu/backend/a64/a64_emitter.h" + +#include "xenia/cpu/hir/instr.h" + +namespace xe { +namespace cpu { +namespace backend { +namespace a64 { + +// TODO(benvanik): direct usings. +using namespace xe::cpu; +using namespace xe::cpu::hir; +using namespace oaknut; +using namespace oaknut::util; + +// Selects the right byte/word/etc from a vector. We need to flip logical +// indices (0,1,2,3,4,5,6,7,...) = (3,2,1,0,7,6,5,4,...) 
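+// For example, logical byte 0 is physical byte 3 and logical byte 5 is
+// physical byte 6 (VEC128_B(n) == n ^ 3); logical halfword 0 is physical
+// halfword 1 (VEC128_W(n) == n ^ 1); dwords and floats need no flipping.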
+#define VEC128_B(n) ((n) ^ 0x3) +#define VEC128_W(n) ((n) ^ 0x1) +#define VEC128_D(n) (n) +#define VEC128_F(n) (n) + +enum KeyType { + KEY_TYPE_X = OPCODE_SIG_TYPE_X, + KEY_TYPE_L = OPCODE_SIG_TYPE_L, + KEY_TYPE_O = OPCODE_SIG_TYPE_O, + KEY_TYPE_S = OPCODE_SIG_TYPE_S, + KEY_TYPE_V_I8 = OPCODE_SIG_TYPE_V + INT8_TYPE, + KEY_TYPE_V_I16 = OPCODE_SIG_TYPE_V + INT16_TYPE, + KEY_TYPE_V_I32 = OPCODE_SIG_TYPE_V + INT32_TYPE, + KEY_TYPE_V_I64 = OPCODE_SIG_TYPE_V + INT64_TYPE, + KEY_TYPE_V_F32 = OPCODE_SIG_TYPE_V + FLOAT32_TYPE, + KEY_TYPE_V_F64 = OPCODE_SIG_TYPE_V + FLOAT64_TYPE, + KEY_TYPE_V_V128 = OPCODE_SIG_TYPE_V + VEC128_TYPE, +}; + +#pragma pack(push, 1) +union InstrKey { + uint32_t value; + struct { + uint32_t opcode : 8; + uint32_t dest : 5; + uint32_t src1 : 5; + uint32_t src2 : 5; + uint32_t src3 : 5; + uint32_t reserved : 4; + }; + + operator uint32_t() const { return value; } + + InstrKey() : value(0) { static_assert_size(*this, sizeof(value)); } + InstrKey(uint32_t v) : value(v) {} + InstrKey(const Instr* i) : value(0) { + opcode = i->opcode->num; + uint32_t sig = i->opcode->signature; + dest = + GET_OPCODE_SIG_TYPE_DEST(sig) ? OPCODE_SIG_TYPE_V + i->dest->type : 0; + src1 = GET_OPCODE_SIG_TYPE_SRC1(sig); + if (src1 == OPCODE_SIG_TYPE_V) { + src1 += i->src1.value->type; + } + src2 = GET_OPCODE_SIG_TYPE_SRC2(sig); + if (src2 == OPCODE_SIG_TYPE_V) { + src2 += i->src2.value->type; + } + src3 = GET_OPCODE_SIG_TYPE_SRC3(sig); + if (src3 == OPCODE_SIG_TYPE_V) { + src3 += i->src3.value->type; + } + } + + template + struct Construct { + static const uint32_t value = + (OPCODE) | (DEST << 8) | (SRC1 << 13) | (SRC2 << 18) | (SRC3 << 23); + }; +}; +#pragma pack(pop) +static_assert(sizeof(InstrKey) <= 4, "Key must be 4 bytes"); + +template +struct CombinedStruct; +template <> +struct CombinedStruct<> {}; +template +struct CombinedStruct : T, CombinedStruct {}; + +struct OpBase {}; + +template +struct Op : OpBase { + static const KeyType key_type = KEY_TYPE; +}; + +struct VoidOp : Op { + protected: + friend struct Op; + template + friend struct I; + void Load(const Instr::Op& op) {} +}; + +struct OffsetOp : Op { + uint64_t value; + + protected: + friend struct Op; + template + friend struct I; + void Load(const Instr::Op& op) { this->value = op.offset; } +}; + +struct SymbolOp : Op { + Function* value; + + protected: + friend struct Op; + template + friend struct I; + bool Load(const Instr::Op& op) { + this->value = op.symbol; + return true; + } +}; + +struct LabelOp : Op { + hir::Label* value; + + protected: + friend struct Op; + template + friend struct I; + void Load(const Instr::Op& op) { this->value = op.label; } +}; + +template +struct ValueOp : Op, KEY_TYPE> { + typedef REG_TYPE reg_type; + const Value* value; + bool is_constant; + virtual bool ConstantFitsIn32Reg() const { return true; } + const REG_TYPE& reg() const { + assert_true(!is_constant); + return reg_; + } + operator const REG_TYPE&() const { return reg(); } + bool IsEqual(const T& b) const { + if (is_constant && b.is_constant) { + return reinterpret_cast(this)->constant() == b.constant(); + } else if (!is_constant && !b.is_constant) { + return reg_.index() == b.reg_.index(); + } else { + return false; + } + } + bool IsEqual(const oaknut::Reg& b) const { + if (is_constant) { + return false; + } else if (!is_constant) { + return reg_.index() == b.index(); + } else { + return false; + } + } + bool operator==(const T& b) const { return IsEqual(b); } + bool operator!=(const T& b) const { return !IsEqual(b); } + bool operator==(const 
oaknut::Reg& b) const { return IsEqual(b); } + bool operator!=(const oaknut::Reg& b) const { return !IsEqual(b); } + void Load(const Instr::Op& op) { + value = op.value; + is_constant = value->IsConstant(); + if (!is_constant) { + A64Emitter::SetupReg(value, reg_); + } + } + + protected: + REG_TYPE reg_ = REG_TYPE(0); +}; + +struct I8Op : ValueOp { + typedef ValueOp BASE; + const int8_t constant() const { + assert_true(BASE::is_constant); + return BASE::value->constant.i8; + } +}; +struct I16Op : ValueOp { + typedef ValueOp BASE; + const int16_t constant() const { + assert_true(BASE::is_constant); + return BASE::value->constant.i16; + } +}; +struct I32Op : ValueOp { + typedef ValueOp BASE; + const int32_t constant() const { + assert_true(BASE::is_constant); + return BASE::value->constant.i32; + } +}; +struct I64Op : ValueOp { + typedef ValueOp BASE; + const int64_t constant() const { + assert_true(BASE::is_constant); + return BASE::value->constant.i64; + } + bool ConstantFitsIn32Reg() const override { + int64_t v = BASE::value->constant.i64; + if ((v & ~0x7FFFFFFF) == 0) { + // Fits under 31 bits, so just load using normal mov. + return true; + } else if ((v & ~0x7FFFFFFF) == ~0x7FFFFFFF) { + // Negative number that fits in 32bits. + return true; + } + return false; + } +}; +struct F32Op : ValueOp { + typedef ValueOp BASE; + const float constant() const { + assert_true(BASE::is_constant); + return BASE::value->constant.f32; + } +}; +struct F64Op : ValueOp { + typedef ValueOp BASE; + const double constant() const { + assert_true(BASE::is_constant); + return BASE::value->constant.f64; + } +}; +struct V128Op : ValueOp { + typedef ValueOp BASE; + const vec128_t& constant() const { + assert_true(BASE::is_constant); + return BASE::value->constant.v128; + } +}; + +template +struct DestField; +template +struct DestField { + DEST dest; + + protected: + bool LoadDest(const Instr* i) { + Instr::Op op; + op.value = i->dest; + dest.Load(op); + return true; + } +}; +template <> +struct DestField { + protected: + bool LoadDest(const Instr* i) { return true; } +}; + +template +struct I; +template +struct I : DestField { + typedef DestField BASE; + static const hir::Opcode opcode = OPCODE; + static const uint32_t key = + InstrKey::Construct::value; + static const KeyType dest_type = DEST::key_type; + const Instr* instr; + + protected: + template + friend struct Sequence; + bool Load(const Instr* i) { + if (InstrKey(i).value == key && BASE::LoadDest(i)) { + instr = i; + return true; + } + return false; + } +}; +template +struct I : DestField { + typedef DestField BASE; + static const hir::Opcode opcode = OPCODE; + static const uint32_t key = + InstrKey::Construct::value; + static const KeyType dest_type = DEST::key_type; + static const KeyType src1_type = SRC1::key_type; + const Instr* instr; + SRC1 src1 = {}; + + protected: + template + friend struct Sequence; + bool Load(const Instr* i) { + if (InstrKey(i).value == key && BASE::LoadDest(i)) { + instr = i; + src1.Load(i->src1); + return true; + } + return false; + } +}; +template +struct I : DestField { + typedef DestField BASE; + static const hir::Opcode opcode = OPCODE; + static const uint32_t key = + InstrKey::Construct::value; + static const KeyType dest_type = DEST::key_type; + static const KeyType src1_type = SRC1::key_type; + static const KeyType src2_type = SRC2::key_type; + const Instr* instr; + SRC1 src1; + SRC2 src2; + + protected: + template + friend struct Sequence; + bool Load(const Instr* i) { + if (InstrKey(i).value == key && 
BASE::LoadDest(i)) { + instr = i; + src1.Load(i->src1); + src2.Load(i->src2); + return true; + } + return false; + } +}; +template +struct I : DestField { + typedef DestField BASE; + static const hir::Opcode opcode = OPCODE; + static const uint32_t key = + InstrKey::Construct::value; + static const KeyType dest_type = DEST::key_type; + static const KeyType src1_type = SRC1::key_type; + static const KeyType src2_type = SRC2::key_type; + static const KeyType src3_type = SRC3::key_type; + const Instr* instr; + SRC1 src1; + SRC2 src2; + SRC3 src3; + + protected: + template + friend struct Sequence; + bool Load(const Instr* i) { + if (InstrKey(i).value == key && BASE::LoadDest(i)) { + instr = i; + src1.Load(i->src1); + src2.Load(i->src2); + src3.Load(i->src3); + return true; + } + return false; + } +}; + +template +static const T GetTempReg(A64Emitter& e); +template <> +const WReg GetTempReg(A64Emitter& e) { + return W0; +} +template <> +const XReg GetTempReg(A64Emitter& e) { + return X0; +} + +template +struct Sequence { + typedef T EmitArgType; + + static constexpr uint32_t head_key() { return T::key; } + + static bool Select(A64Emitter& e, const Instr* i) { + T args; + if (!args.Load(i)) { + return false; + } + SEQ::Emit(e, args); + return true; + } + + template + static void EmitUnaryOp(A64Emitter& e, const EmitArgType& i, + const REG_FN& reg_fn) { + if (i.src1.is_constant) { + e.MOV(i.dest, i.src1.constant()); + reg_fn(e, i.dest); + } else { + if (i.dest != i.src1) { + e.MOV(i.dest, i.src1); + } + reg_fn(e, i.dest); + } + } + + template + static void EmitCommutativeBinaryOp(A64Emitter& e, const EmitArgType& i, + const REG_REG_FN& reg_reg_fn, + const REG_CONST_FN& reg_const_fn) { + if (i.src1.is_constant) { + if (i.src2.is_constant) { + // Both constants. + if (i.src1.ConstantFitsIn32Reg()) { + e.MOV(i.dest, i.src2.constant()); + reg_const_fn(e, i.dest, static_cast(i.src1.constant())); + } else if (i.src2.ConstantFitsIn32Reg()) { + e.MOV(i.dest, i.src1.constant()); + reg_const_fn(e, i.dest, static_cast(i.src2.constant())); + } else { + e.MOV(i.dest, i.src1.constant()); + auto temp = GetTempReg(e); + e.MOV(temp, i.src2.constant()); + reg_reg_fn(e, i.dest, temp); + } + } else { + // src1 constant. 
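        // src2 is in a register. If dest already aliases src2, fold the
        // constant in directly (the op is commutative); otherwise materialize
        // the constant into dest and then combine it with src2.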
+ if (i.dest == i.src2) { + if (i.src1.ConstantFitsIn32Reg()) { + reg_const_fn(e, i.dest, static_cast(i.src1.constant())); + } else { + auto temp = GetTempReg(e); + e.MOV(temp, i.src1.constant()); + reg_reg_fn(e, i.dest, temp); + } + } else { + e.MOV(i.dest, i.src1.constant()); + reg_reg_fn(e, i.dest, i.src2); + } + } + } else if (i.src2.is_constant) { + if (i.dest == i.src1) { + if (i.src2.ConstantFitsIn32Reg()) { + reg_const_fn(e, i.dest, static_cast(i.src2.constant())); + } else { + auto temp = GetTempReg(e); + e.MOV(temp, i.src2.constant()); + reg_reg_fn(e, i.dest, temp); + } + } else { + e.MOV(i.dest, i.src2.constant()); + reg_reg_fn(e, i.dest, i.src1); + } + } else { + if (i.dest == i.src1) { + reg_reg_fn(e, i.dest, i.src2); + } else if (i.dest == i.src2) { + reg_reg_fn(e, i.dest, i.src1); + } else { + e.MOV(i.dest, i.src1); + reg_reg_fn(e, i.dest, i.src2); + } + } + } + template + static void EmitAssociativeBinaryOp(A64Emitter& e, const EmitArgType& i, + const REG_REG_FN& reg_reg_fn, + const REG_CONST_FN& reg_const_fn) { + if (i.src1.is_constant) { + assert_true(!i.src2.is_constant); + if (i.dest == i.src2) { + auto temp = GetTempReg(e); + e.MOV(temp, i.src2); + e.MOV(i.dest, i.src1.constant()); + reg_reg_fn(e, i.dest, temp); + } else { + e.MOV(i.dest, i.src1.constant()); + reg_reg_fn(e, i.dest, i.src2); + } + } else if (i.src2.is_constant) { + if (i.dest == i.src1) { + if (i.src2.ConstantFitsIn32Reg()) { + reg_const_fn(e, i.dest, static_cast(i.src2.constant())); + } else { + auto temp = GetTempReg(e); + e.MOV(temp, i.src2.constant()); + reg_reg_fn(e, i.dest, temp); + } + } else { + e.MOV(i.dest, i.src1); + if (i.src2.ConstantFitsIn32Reg()) { + reg_const_fn(e, i.dest, static_cast(i.src2.constant())); + } else { + auto temp = GetTempReg(e); + e.MOV(temp, i.src2.constant()); + reg_reg_fn(e, i.dest, temp); + } + } + } else { + if (i.dest == i.src1) { + reg_reg_fn(e, i.dest, i.src2); + } else if (i.dest == i.src2) { + auto temp = GetTempReg(e); + e.MOV(temp, i.src2); + e.MOV(i.dest, i.src1); + reg_reg_fn(e, i.dest, temp); + } else { + e.MOV(i.dest, i.src1); + reg_reg_fn(e, i.dest, i.src2); + } + } + } + + template + static void EmitCommutativeBinaryVOp(A64Emitter& e, const EmitArgType& i, + const FN& fn) { + if (i.src1.is_constant) { + assert_true(!i.src2.is_constant); + e.LoadConstantV(Q0, i.src1.constant()); + fn(e, i.dest, REG(0), i.src2); + } else if (i.src2.is_constant) { + assert_true(!i.src1.is_constant); + e.LoadConstantV(Q0, i.src2.constant()); + fn(e, i.dest, i.src1, REG(0)); + } else { + fn(e, i.dest, i.src1, i.src2); + } + } + + template + static void EmitAssociativeBinaryVOp(A64Emitter& e, const EmitArgType& i, + const FN& fn) { + if (i.src1.is_constant) { + assert_true(!i.src2.is_constant); + e.LoadConstantV(Q0, i.src1.constant()); + fn(e, i.dest, REG(0), i.src2); + } else if (i.src2.is_constant) { + assert_true(!i.src1.is_constant); + e.LoadConstantV(Q0, i.src2.constant()); + fn(e, i.dest, i.src1, REG(0)); + } else { + fn(e, i.dest, i.src1, i.src2); + } + } + + template + static void EmitCommutativeCompareOp(A64Emitter& e, const EmitArgType& i, + const REG_REG_FN& reg_reg_fn, + const REG_CONST_FN& reg_const_fn) { + if (i.src1.is_constant) { + assert_true(!i.src2.is_constant); + if (i.src1.ConstantFitsIn32Reg()) { + reg_const_fn(e, i.src2, static_cast(i.src1.constant())); + } else { + auto temp = GetTempReg(e); + e.MOV(temp, i.src1.constant()); + reg_reg_fn(e, i.src2, temp); + } + } else if (i.src2.is_constant) { + assert_true(!i.src1.is_constant); + if 
(i.src2.ConstantFitsIn32Reg()) { + reg_const_fn(e, i.src1, static_cast(i.src2.constant())); + } else { + auto temp = GetTempReg(e); + e.MOV(temp, i.src2.constant()); + reg_reg_fn(e, i.src1, temp); + } + } else { + reg_reg_fn(e, i.src1, i.src2); + } + } + template + static void EmitAssociativeCompareOp(A64Emitter& e, const EmitArgType& i, + const REG_REG_FN& reg_reg_fn, + const REG_CONST_FN& reg_const_fn) { + if (i.src1.is_constant) { + assert_true(!i.src2.is_constant); + if (i.src1.ConstantFitsIn32Reg()) { + reg_const_fn(e, i.dest, i.src2, static_cast(i.src1.constant()), + true); + } else { + auto temp = GetTempReg(e); + e.MOV(temp, i.src1.constant()); + reg_reg_fn(e, i.dest, i.src2, temp, true); + } + } else if (i.src2.is_constant) { + assert_true(!i.src1.is_constant); + if (i.src2.ConstantFitsIn32Reg()) { + reg_const_fn(e, i.dest, i.src1, static_cast(i.src2.constant()), + false); + } else { + auto temp = GetTempReg(e); + e.MOV(temp, i.src2.constant()); + reg_reg_fn(e, i.dest, i.src1, temp, false); + } + } else { + reg_reg_fn(e, i.dest, i.src1, i.src2, false); + } + } +}; + +} // namespace a64 +} // namespace backend +} // namespace cpu +} // namespace xe + +#endif // XENIA_CPU_BACKEND_A64_A64_OP_H_ diff --git a/src/xenia/cpu/backend/a64/a64_seq_control.cc b/src/xenia/cpu/backend/a64/a64_seq_control.cc new file mode 100644 index 000000000..e68d2955b --- /dev/null +++ b/src/xenia/cpu/backend/a64/a64_seq_control.cc @@ -0,0 +1,551 @@ +/** + ****************************************************************************** + * Xenia : Xbox 360 Emulator Research Project * + ****************************************************************************** + * Copyright 2024 Xenia Developers. All rights reserved. * + * Released under the BSD license - see LICENSE in the root for more details. 
* + ****************************************************************************** + */ + +#include "xenia/cpu/backend/a64/a64_sequences.h" + +#include +#include + +#include "xenia/cpu/backend/a64/a64_op.h" + +namespace xe { +namespace cpu { +namespace backend { +namespace a64 { + +volatile int anchor_control = 0; + +// ============================================================================ +// OPCODE_DEBUG_BREAK +// ============================================================================ +struct DEBUG_BREAK : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { e.DebugBreak(); } +}; +EMITTER_OPCODE_TABLE(OPCODE_DEBUG_BREAK, DEBUG_BREAK); + +// ============================================================================ +// OPCODE_DEBUG_BREAK_TRUE +// ============================================================================ +struct DEBUG_BREAK_TRUE_I8 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + oaknut::Label skip; + e.CBZ(i.src1, skip); + e.DebugBreak(); + e.l(skip); + } +}; +struct DEBUG_BREAK_TRUE_I16 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + oaknut::Label skip; + e.CBZ(i.src1, skip); + e.DebugBreak(); + e.l(skip); + } +}; +struct DEBUG_BREAK_TRUE_I32 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + oaknut::Label skip; + e.CBZ(i.src1, skip); + e.DebugBreak(); + e.l(skip); + } +}; +struct DEBUG_BREAK_TRUE_I64 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + oaknut::Label skip; + e.CBZ(i.src1, skip); + e.DebugBreak(); + e.l(skip); + } +}; +struct DEBUG_BREAK_TRUE_F32 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + oaknut::Label skip; + e.FCMP(i.src1, 0); + e.B(Cond::EQ, skip); + e.DebugBreak(); + e.l(skip); + } +}; +struct DEBUG_BREAK_TRUE_F64 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + oaknut::Label skip; + e.FCMP(i.src1, 0); + e.B(Cond::EQ, skip); + e.DebugBreak(); + e.l(skip); + } +}; +EMITTER_OPCODE_TABLE(OPCODE_DEBUG_BREAK_TRUE, DEBUG_BREAK_TRUE_I8, + DEBUG_BREAK_TRUE_I16, DEBUG_BREAK_TRUE_I32, + DEBUG_BREAK_TRUE_I64, DEBUG_BREAK_TRUE_F32, + DEBUG_BREAK_TRUE_F64); + +// ============================================================================ +// OPCODE_TRAP +// ============================================================================ +struct TRAP : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + e.Trap(i.instr->flags); + } +}; +EMITTER_OPCODE_TABLE(OPCODE_TRAP, TRAP); + +// ============================================================================ +// OPCODE_TRAP_TRUE +// ============================================================================ +struct TRAP_TRUE_I8 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + oaknut::Label skip; + e.CBZ(i.src1, skip); + e.Trap(i.instr->flags); + e.l(skip); + } +}; +struct TRAP_TRUE_I16 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + oaknut::Label skip; + e.CBZ(i.src1, skip); + e.Trap(i.instr->flags); + e.l(skip); + } +}; +struct TRAP_TRUE_I32 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + oaknut::Label skip; + e.CBZ(i.src1, skip); + e.Trap(i.instr->flags); + e.l(skip); + } +}; +struct TRAP_TRUE_I64 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + oaknut::Label skip; + e.CBZ(i.src1, skip); + e.Trap(i.instr->flags); + e.l(skip); + } +}; +struct TRAP_TRUE_F32 + : Sequence> { + static void 
Emit(A64Emitter& e, const EmitArgType& i) { + oaknut::Label skip; + e.FCMP(i.src1, 0); + e.B(Cond::EQ, skip); + e.Trap(i.instr->flags); + e.l(skip); + } +}; +struct TRAP_TRUE_F64 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + oaknut::Label skip; + e.FCMP(i.src1, 0); + e.B(Cond::EQ, skip); + e.Trap(i.instr->flags); + e.l(skip); + } +}; +EMITTER_OPCODE_TABLE(OPCODE_TRAP_TRUE, TRAP_TRUE_I8, TRAP_TRUE_I16, + TRAP_TRUE_I32, TRAP_TRUE_I64, TRAP_TRUE_F32, + TRAP_TRUE_F64); + +// ============================================================================ +// OPCODE_CALL +// ============================================================================ +struct CALL : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + assert_true(i.src1.value->is_guest()); + e.Call(i.instr, static_cast(i.src1.value)); + } +}; +EMITTER_OPCODE_TABLE(OPCODE_CALL, CALL); + +// ============================================================================ +// OPCODE_CALL_TRUE +// ============================================================================ +struct CALL_TRUE_I8 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + assert_true(i.src2.value->is_guest()); + oaknut::Label skip; + e.CBZ(i.src1, skip); + e.Call(i.instr, static_cast(i.src2.value)); + e.l(skip); + } +}; +struct CALL_TRUE_I16 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + assert_true(i.src2.value->is_guest()); + oaknut::Label skip; + e.CBZ(i.src1, skip); + e.Call(i.instr, static_cast(i.src2.value)); + e.l(skip); + } +}; +struct CALL_TRUE_I32 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + assert_true(i.src2.value->is_guest()); + oaknut::Label skip; + e.CBZ(i.src1, skip); + e.Call(i.instr, static_cast(i.src2.value)); + e.l(skip); + } +}; +struct CALL_TRUE_I64 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + assert_true(i.src2.value->is_guest()); + oaknut::Label skip; + e.CBZ(i.src1, skip); + e.Call(i.instr, static_cast(i.src2.value)); + e.l(skip); + } +}; +struct CALL_TRUE_F32 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + assert_true(i.src2.value->is_guest()); + oaknut::Label skip; + e.FCMP(i.src1, 0); + e.B(Cond::EQ, skip); + e.Call(i.instr, static_cast(i.src2.value)); + e.l(skip); + } +}; +struct CALL_TRUE_F64 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + assert_true(i.src2.value->is_guest()); + oaknut::Label skip; + e.FCMP(i.src1, 0); + e.B(Cond::EQ, skip); + e.Call(i.instr, static_cast(i.src2.value)); + e.l(skip); + } +}; +EMITTER_OPCODE_TABLE(OPCODE_CALL_TRUE, CALL_TRUE_I8, CALL_TRUE_I16, + CALL_TRUE_I32, CALL_TRUE_I64, CALL_TRUE_F32, + CALL_TRUE_F64); + +// ============================================================================ +// OPCODE_CALL_INDIRECT +// ============================================================================ +struct CALL_INDIRECT + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + e.CallIndirect(i.instr, i.src1); + } +}; +EMITTER_OPCODE_TABLE(OPCODE_CALL_INDIRECT, CALL_INDIRECT); + +// ============================================================================ +// OPCODE_CALL_INDIRECT_TRUE +// ============================================================================ +struct CALL_INDIRECT_TRUE_I8 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + oaknut::Label skip; + e.CBZ(i.src1, skip); + e.CallIndirect(i.instr, i.src2); + e.l(skip); + } +}; +struct 
CALL_INDIRECT_TRUE_I16 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + oaknut::Label skip; + e.CBZ(i.src1, skip); + e.CallIndirect(i.instr, i.src2); + e.l(skip); + } +}; +struct CALL_INDIRECT_TRUE_I32 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + oaknut::Label skip; + e.CBZ(i.src1, skip); + e.CallIndirect(i.instr, i.src2); + e.l(skip); + } +}; +struct CALL_INDIRECT_TRUE_I64 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + oaknut::Label skip; + e.CBZ(i.src1, skip); + e.CallIndirect(i.instr, i.src2); + e.l(skip); + } +}; +struct CALL_INDIRECT_TRUE_F32 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + oaknut::Label skip; + e.FCMP(i.src1, 0); + e.B(Cond::EQ, skip); + e.CallIndirect(i.instr, i.src2); + e.l(skip); + } +}; +struct CALL_INDIRECT_TRUE_F64 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + oaknut::Label skip; + e.FCMP(i.src1, 0); + e.B(Cond::EQ, skip); + e.CallIndirect(i.instr, i.src2); + e.l(skip); + } +}; +EMITTER_OPCODE_TABLE(OPCODE_CALL_INDIRECT_TRUE, CALL_INDIRECT_TRUE_I8, + CALL_INDIRECT_TRUE_I16, CALL_INDIRECT_TRUE_I32, + CALL_INDIRECT_TRUE_I64, CALL_INDIRECT_TRUE_F32, + CALL_INDIRECT_TRUE_F64); + +// ============================================================================ +// OPCODE_CALL_EXTERN +// ============================================================================ +struct CALL_EXTERN + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + e.CallExtern(i.instr, i.src1.value); + } +}; +EMITTER_OPCODE_TABLE(OPCODE_CALL_EXTERN, CALL_EXTERN); + +// ============================================================================ +// OPCODE_RETURN +// ============================================================================ +struct RETURN : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + // If this is the last instruction in the last block, just let us + // fall through. 
+ if (i.instr->next || i.instr->block->next) { + e.B(e.epilog_label()); + } + } +}; +EMITTER_OPCODE_TABLE(OPCODE_RETURN, RETURN); + +// ============================================================================ +// OPCODE_RETURN_TRUE +// ============================================================================ +struct RETURN_TRUE_I8 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + e.CBNZ(i.src1, e.epilog_label()); + } +}; +struct RETURN_TRUE_I16 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + e.CBNZ(i.src1, e.epilog_label()); + } +}; +struct RETURN_TRUE_I32 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + e.CBNZ(i.src1, e.epilog_label()); + } +}; +struct RETURN_TRUE_I64 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + e.CBNZ(i.src1, e.epilog_label()); + } +}; +struct RETURN_TRUE_F32 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + e.FCMP(i.src1, 0); + e.B(Cond::NE, e.epilog_label()); + } +}; +struct RETURN_TRUE_F64 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + e.FCMP(i.src1, 0); + e.B(Cond::NE, e.epilog_label()); + } +}; +EMITTER_OPCODE_TABLE(OPCODE_RETURN_TRUE, RETURN_TRUE_I8, RETURN_TRUE_I16, + RETURN_TRUE_I32, RETURN_TRUE_I64, RETURN_TRUE_F32, + RETURN_TRUE_F64); + +// ============================================================================ +// OPCODE_SET_RETURN_ADDRESS +// ============================================================================ +struct SET_RETURN_ADDRESS + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + e.SetReturnAddress(i.src1.constant()); + } +}; +EMITTER_OPCODE_TABLE(OPCODE_SET_RETURN_ADDRESS, SET_RETURN_ADDRESS); + +// ============================================================================ +// OPCODE_BRANCH +// ============================================================================ +struct BRANCH : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + oaknut::Label* label = e.lookup_label(i.src1.value->name); + assert_not_null(label); + e.B(*label); + } +}; +EMITTER_OPCODE_TABLE(OPCODE_BRANCH, BRANCH); + +// ============================================================================ +// OPCODE_BRANCH_TRUE +// ============================================================================ +struct BRANCH_TRUE_I8 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + oaknut::Label* label = e.lookup_label(i.src2.value->name); + assert_not_null(label); + e.CBNZ(i.src1, *label); + } +}; +struct BRANCH_TRUE_I16 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + oaknut::Label* label = e.lookup_label(i.src2.value->name); + assert_not_null(label); + e.CBNZ(i.src1, *label); + } +}; +struct BRANCH_TRUE_I32 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + oaknut::Label* label = e.lookup_label(i.src2.value->name); + assert_not_null(label); + e.CBNZ(i.src1, *label); + } +}; +struct BRANCH_TRUE_I64 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + oaknut::Label* label = e.lookup_label(i.src2.value->name); + assert_not_null(label); + e.CBNZ(i.src1, *label); + } +}; +struct BRANCH_TRUE_F32 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + oaknut::Label* label = e.lookup_label(i.src2.value->name); + assert_not_null(label); + e.FCMP(i.src1, 0); + e.B(Cond::NE, *label); + } +}; +struct BRANCH_TRUE_F64 + : Sequence> { + static void 
Emit(A64Emitter& e, const EmitArgType& i) { + oaknut::Label* label = e.lookup_label(i.src2.value->name); + assert_not_null(label); + e.FCMP(i.src1, 0); + e.B(Cond::NE, *label); + } +}; +EMITTER_OPCODE_TABLE(OPCODE_BRANCH_TRUE, BRANCH_TRUE_I8, BRANCH_TRUE_I16, + BRANCH_TRUE_I32, BRANCH_TRUE_I64, BRANCH_TRUE_F32, + BRANCH_TRUE_F64); + +// ============================================================================ +// OPCODE_BRANCH_FALSE +// ============================================================================ +struct BRANCH_FALSE_I8 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + oaknut::Label* label = e.lookup_label(i.src2.value->name); + assert_not_null(label); + e.CBZ(i.src1, *label); + } +}; +struct BRANCH_FALSE_I16 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + oaknut::Label* label = e.lookup_label(i.src2.value->name); + assert_not_null(label); + e.CBZ(i.src1, *label); + } +}; +struct BRANCH_FALSE_I32 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + oaknut::Label* label = e.lookup_label(i.src2.value->name); + assert_not_null(label); + e.CBZ(i.src1, *label); + } +}; +struct BRANCH_FALSE_I64 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + oaknut::Label* label = e.lookup_label(i.src2.value->name); + assert_not_null(label); + e.CBZ(i.src1, *label); + } +}; +struct BRANCH_FALSE_F32 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + oaknut::Label* label = e.lookup_label(i.src2.value->name); + assert_not_null(label); + e.FCMP(i.src1, 0); + e.B(Cond::EQ, *label); + } +}; +struct BRANCH_FALSE_F64 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + oaknut::Label* label = e.lookup_label(i.src2.value->name); + assert_not_null(label); + e.FCMP(i.src1, 0); + e.B(Cond::EQ, *label); + } +}; +EMITTER_OPCODE_TABLE(OPCODE_BRANCH_FALSE, BRANCH_FALSE_I8, BRANCH_FALSE_I16, + BRANCH_FALSE_I32, BRANCH_FALSE_I64, BRANCH_FALSE_F32, + BRANCH_FALSE_F64); + +} // namespace a64 +} // namespace backend +} // namespace cpu +} // namespace xe \ No newline at end of file diff --git a/src/xenia/cpu/backend/a64/a64_seq_memory.cc b/src/xenia/cpu/backend/a64/a64_seq_memory.cc new file mode 100644 index 000000000..d7d66a14d --- /dev/null +++ b/src/xenia/cpu/backend/a64/a64_seq_memory.cc @@ -0,0 +1,1207 @@ +/** + ****************************************************************************** + * Xenia : Xbox 360 Emulator Research Project * + ****************************************************************************** + * Copyright 2024 Xenia Developers. All rights reserved. * + * Released under the BSD license - see LICENSE in the root for more details. 
* + ****************************************************************************** + */ + +#include "xenia/cpu/backend/a64/a64_sequences.h" + +#include +#include + +#include "xenia/base/memory.h" +#include "xenia/cpu/backend/a64/a64_op.h" +#include "xenia/cpu/backend/a64/a64_tracers.h" + +namespace xe { +namespace cpu { +namespace backend { +namespace a64 { + +volatile int anchor_memory = 0; + +template +XReg ComputeMemoryAddressOffset(A64Emitter& e, const T& guest, const T& offset, + WReg address_register = W3) { + assert_true(offset.is_constant); + const int32_t offset_const = static_cast(offset.constant()); + + if (guest.is_constant) { + uint32_t address = static_cast(guest.constant()); + address += offset_const; + if (address < 0x80000000) { + e.MOV(address_register.toX(), address); + e.ADD(address_register.toX(), e.GetMembaseReg(), address_register.toX()); + return address_register.toX(); + } else { + if (address >= 0xE0000000 && + xe::memory::allocation_granularity() > 0x1000) { + e.MOV(W0, address + 0x1000); + } else { + e.MOV(W0, address); + } + e.ADD(address_register.toX(), e.GetMembaseReg(), X0); + return address_register.toX(); + } + } else { + if (xe::memory::allocation_granularity() > 0x1000) { + // Emulate the 4 KB physical address offset in 0xE0000000+ when can't do + // it via memory mapping. + e.MOV(W0, 0xE0000000 - offset_const); + e.CMP(guest.reg().toW(), W0); + e.CSET(W0, Cond::HS); + e.ADD(W0, guest.reg().toW(), W0, LSL, 12); + } else { + // Clear the top 32 bits, as they are likely garbage. + // TODO(benvanik): find a way to avoid doing this. + e.MOV(W0, guest.reg().toW()); + } + e.MOV(X1, offset_const); + e.ADD(X0, X0, X1); + + e.ADD(address_register.toX(), e.GetMembaseReg(), X0); + return address_register.toX(); + } +} + +// Note: most *should* be aligned, but needs to be checked! +template +XReg ComputeMemoryAddress(A64Emitter& e, const T& guest, + WReg address_register = W3) { + if (guest.is_constant) { + // TODO(benvanik): figure out how to do this without a temp. + // Since the constant is often 0x8... if we tried to use that as a + // displacement it would be sign extended and mess things up. + const uint32_t address = static_cast(guest.constant()); + if (address < 0x80000000) { + e.MOV(W0, address); + e.ADD(address_register.toX(), e.GetMembaseReg(), X0); + return address_register.toX(); + } else { + if (address >= 0xE0000000 && + xe::memory::allocation_granularity() > 0x1000) { + e.MOV(W0, address + 0x1000u); + } else { + e.MOV(W0, address); + } + e.ADD(address_register.toX(), e.GetMembaseReg(), X0); + return address_register.toX(); + } + } else { + if (xe::memory::allocation_granularity() > 0x1000) { + // Emulate the 4 KB physical address offset in 0xE0000000+ when can't do + // it via memory mapping. + e.MOV(W0, 0xE0000000); + e.CMP(guest.reg().toW(), W0); + e.CSET(W0, Cond::HS); + e.ADD(W0, guest.reg().toW(), W0, LSL, 12); + } else { + // Clear the top 32 bits, as they are likely garbage. + // TODO(benvanik): find a way to avoid doing this. + e.MOV(W0, guest.reg().toW()); + } + e.ADD(address_register.toX(), e.GetMembaseReg(), X0); + return address_register.toX(); + } +} + +// ============================================================================ +// OPCODE_ATOMIC_EXCHANGE +// ============================================================================ +// Note that the address we use here is a real, host address! +// This is weird, and should be fixed. 
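For reference, the guest-to-host translation that the ComputeMemoryAddress helpers above emit reduces to the following plain C++ (a minimal sketch; membase and granularity stand in for e.GetMembaseReg() and xe::memory::allocation_granularity(), and HostAddress is a hypothetical helper, not part of the backend):

#include <cstddef>
#include <cstdint>

inline uint8_t* HostAddress(uint8_t* membase, uint32_t guest_address,
                            size_t granularity) {
  uint32_t offset = guest_address;
  if (granularity > 0x1000 && guest_address >= 0xE0000000u) {
    // Emulate the 4 KB physical-page offset that cannot be expressed via
    // memory mapping when the host allocation granularity exceeds 4 KB.
    offset += 0x1000;
  }
  return membase + offset;
}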
+template +void EmitAtomicExchangeXX(A64Emitter& e, const ARGS& i, const FN& fn) { + if (i.dest == i.src1) { + e.MOV(X0, i.src1); + if (i.dest != i.src2) { + if (i.src2.is_constant) { + e.MOV(i.dest, i.src2.constant()); + } else { + e.MOV(i.dest, i.src2); + } + } + fn(e, i.dest, X0); + } else { + if (i.dest != i.src2) { + if (i.src2.is_constant) { + e.MOV(i.dest, i.src2.constant()); + } else { + e.MOV(i.dest, i.src2); + } + } + fn(e, i.dest, i.src1); + } +} +struct ATOMIC_EXCHANGE_I8 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + EmitAtomicExchangeXX( + e, i, + [](A64Emitter& e, WReg dest, XReg src) { e.SWPALB(dest, dest, src); }); + } +}; +struct ATOMIC_EXCHANGE_I16 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + EmitAtomicExchangeXX( + e, i, + [](A64Emitter& e, WReg dest, XReg src) { e.SWPALH(dest, dest, src); }); + } +}; +struct ATOMIC_EXCHANGE_I32 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + EmitAtomicExchangeXX( + e, i, + [](A64Emitter& e, WReg dest, XReg src) { e.SWPAL(dest, dest, src); }); + } +}; +struct ATOMIC_EXCHANGE_I64 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + EmitAtomicExchangeXX( + e, i, + [](A64Emitter& e, XReg dest, XReg src) { e.SWPAL(dest, dest, src); }); + } +}; +EMITTER_OPCODE_TABLE(OPCODE_ATOMIC_EXCHANGE, ATOMIC_EXCHANGE_I8, + ATOMIC_EXCHANGE_I16, ATOMIC_EXCHANGE_I32, + ATOMIC_EXCHANGE_I64); + +// ============================================================================ +// OPCODE_ATOMIC_COMPARE_EXCHANGE +// ============================================================================ +struct ATOMIC_COMPARE_EXCHANGE_I32 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + if (xe::memory::allocation_granularity() > 0x1000) { + // Emulate the 4 KB physical address offset in 0xE0000000+ when can't do + // it via memory mapping. + e.MOV(W3, 0xE0000000); + e.CMP(i.src1.reg().toW(), W3); + e.CSET(W1, Cond::HS); + e.ADD(W1, i.src1.reg().toW(), W1, LSL, 12); + } else { + e.MOV(W1, i.src1.reg().toW()); + } + e.ADD(X1, e.GetMembaseReg(), X1); + + const XReg address = X1; + const WReg expected = i.src2; + const WReg desired = i.src3; + const WReg status = W0; + + if (e.IsFeatureEnabled(kA64EmitLSE)) { + e.MOV(status, expected); + + // if([C] == A) [C] = B + // else A = [C] + e.CASAL(status, desired, address); + e.CMP(status, expected); + e.CSET(i.dest, Cond::EQ); + return; + } + + oaknut::Label success, fail, retry; + + e.l(retry); + e.LDAXR(W4, address); + e.CMP(W4, expected); + e.B(Cond::NE, fail); + + e.STLXR(status.toW(), desired, address); + e.CBNZ(status, retry); + e.B(success); + + e.l(fail); + e.CLREX(); + + e.l(success); + e.CSET(i.dest, Cond::EQ); + } +}; +struct ATOMIC_COMPARE_EXCHANGE_I64 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + if (xe::memory::allocation_granularity() > 0x1000) { + // Emulate the 4 KB physical address offset in 0xE0000000+ when can't do + // it via memory mapping. 
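      // CSET yields 1 when the guest address is >= 0xE0000000; the LSL #12 on
      // the following ADD turns that into the 0x1000 page offset.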
+ e.MOV(W3, 0xE0000000); + e.CMP(i.src1.reg(), X3); + e.CSET(W1, Cond::HS); + e.ADD(W1, i.src1.reg().toW(), W1, LSL, 12); + } else { + e.MOV(W1, i.src1.reg().toW()); + } + e.ADD(X1, e.GetMembaseReg(), X1); + + const XReg address = X1; + const XReg expected = i.src2; + const XReg desired = i.src3; + const XReg status = X0; + + if (e.IsFeatureEnabled(kA64EmitLSE)) { + e.MOV(status, expected); + + // if([C] == A) [C] = B + // else A = [C] + e.CASAL(status, desired, address); + e.CMP(status, expected); + e.CSET(i.dest, Cond::EQ); + return; + } + + oaknut::Label success, fail, retry; + + e.l(retry); + e.LDAXR(X4, address); + e.CMP(X4, expected); + e.B(Cond::NE, fail); + + e.STLXR(status.toW(), desired, address); + e.CBNZ(status, retry); + e.B(success); + + e.l(fail); + e.CLREX(); + + e.l(success); + e.CSET(i.dest, Cond::EQ); + } +}; +EMITTER_OPCODE_TABLE(OPCODE_ATOMIC_COMPARE_EXCHANGE, + ATOMIC_COMPARE_EXCHANGE_I32, ATOMIC_COMPARE_EXCHANGE_I64); + +// ============================================================================ +// OPCODE_LOAD_LOCAL +// ============================================================================ +// Note: all types are always aligned on the stack. +struct LOAD_LOCAL_I8 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + e.LDRB(i.dest, SP, i.src1.constant()); + // e.TraceLoadI8(DATA_LOCAL, i.src1.constant, i.dest); + } +}; +struct LOAD_LOCAL_I16 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + e.LDRH(i.dest, SP, i.src1.constant()); + // e.TraceLoadI16(DATA_LOCAL, i.src1.constant, i.dest); + } +}; +struct LOAD_LOCAL_I32 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + e.LDR(i.dest, SP, i.src1.constant()); + // e.TraceLoadI32(DATA_LOCAL, i.src1.constant, i.dest); + } +}; +struct LOAD_LOCAL_I64 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + e.LDR(i.dest, SP, i.src1.constant()); + // e.TraceLoadI64(DATA_LOCAL, i.src1.constant, i.dest); + } +}; +struct LOAD_LOCAL_F32 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + e.LDR(i.dest, SP, i.src1.constant()); + // e.TraceLoadF32(DATA_LOCAL, i.src1.constant, i.dest); + } +}; +struct LOAD_LOCAL_F64 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + e.LDR(i.dest, SP, i.src1.constant()); + // e.TraceLoadF64(DATA_LOCAL, i.src1.constant, i.dest); + } +}; +struct LOAD_LOCAL_V128 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + e.LDR(i.dest, SP, i.src1.constant()); + // e.TraceLoadV128(DATA_LOCAL, i.src1.constant, i.dest); + } +}; +EMITTER_OPCODE_TABLE(OPCODE_LOAD_LOCAL, LOAD_LOCAL_I8, LOAD_LOCAL_I16, + LOAD_LOCAL_I32, LOAD_LOCAL_I64, LOAD_LOCAL_F32, + LOAD_LOCAL_F64, LOAD_LOCAL_V128); + +// ============================================================================ +// OPCODE_STORE_LOCAL +// ============================================================================ +// Note: all types are always aligned on the stack. 
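// As with the loads above, the slot offset is the constant first operand and
// every access is SP-relative; e.g. a 32-bit local at offset 0x20 comes out
// as e.STR(Wn, SP, 0x20) on store and e.LDR(Wn, SP, 0x20) on reload.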
+struct STORE_LOCAL_I8 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + // e.TraceStoreI8(DATA_LOCAL, i.src1.constant, i.src2); + e.STRB(i.src2, SP, i.src1.constant()); + } +}; +struct STORE_LOCAL_I16 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + // e.TraceStoreI16(DATA_LOCAL, i.src1.constant, i.src2); + e.STRH(i.src2, SP, i.src1.constant()); + } +}; +struct STORE_LOCAL_I32 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + // e.TraceStoreI32(DATA_LOCAL, i.src1.constant, i.src2); + e.STR(i.src2, SP, i.src1.constant()); + } +}; +struct STORE_LOCAL_I64 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + // e.TraceStoreI64(DATA_LOCAL, i.src1.constant, i.src2); + e.STR(i.src2, SP, i.src1.constant()); + } +}; +struct STORE_LOCAL_F32 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + // e.TraceStoreF32(DATA_LOCAL, i.src1.constant, i.src2); + e.STR(i.src2, SP, i.src1.constant()); + } +}; +struct STORE_LOCAL_F64 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + // e.TraceStoreF64(DATA_LOCAL, i.src1.constant, i.src2); + e.STR(i.src2, SP, i.src1.constant()); + } +}; +struct STORE_LOCAL_V128 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + // e.TraceStoreV128(DATA_LOCAL, i.src1.constant, i.src2); + e.STR(i.src2, SP, i.src1.constant()); + } +}; +EMITTER_OPCODE_TABLE(OPCODE_STORE_LOCAL, STORE_LOCAL_I8, STORE_LOCAL_I16, + STORE_LOCAL_I32, STORE_LOCAL_I64, STORE_LOCAL_F32, + STORE_LOCAL_F64, STORE_LOCAL_V128); + +// ============================================================================ +// OPCODE_LOAD_CONTEXT +// ============================================================================ +struct LOAD_CONTEXT_I8 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + e.LDRB(i.dest, e.GetContextReg(), i.src1.value); + if (IsTracingData()) { + e.MOV(e.GetNativeParam(0), i.src1.value); + e.LDRB(e.GetNativeParam(1).toW(), e.GetContextReg(), i.src1.value); + e.CallNative(reinterpret_cast(TraceContextLoadI8)); + } + } +}; +struct LOAD_CONTEXT_I16 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + e.LDRH(i.dest, e.GetContextReg(), i.src1.value); + if (IsTracingData()) { + e.LDRH(e.GetNativeParam(1).toW(), e.GetContextReg(), i.src1.value); + e.MOV(e.GetNativeParam(0), i.src1.value); + e.CallNative(reinterpret_cast(TraceContextLoadI16)); + } + } +}; +struct LOAD_CONTEXT_I32 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + e.LDR(i.dest, e.GetContextReg(), i.src1.value); + if (IsTracingData()) { + e.LDR(e.GetNativeParam(1).toW(), e.GetContextReg(), i.src1.value); + e.MOV(e.GetNativeParam(0), i.src1.value); + e.CallNative(reinterpret_cast(TraceContextLoadI32)); + } + } +}; +struct LOAD_CONTEXT_I64 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + e.LDR(i.dest, e.GetContextReg(), i.src1.value); + if (IsTracingData()) { + e.LDR(e.GetNativeParam(1), e.GetContextReg(), i.src1.value); + e.MOV(e.GetNativeParam(0), i.src1.value); + e.CallNative(reinterpret_cast(TraceContextLoadI64)); + } + } +}; +struct LOAD_CONTEXT_F32 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + e.LDR(i.dest, e.GetContextReg(), i.src1.value); + if (IsTracingData()) { + e.ADD(e.GetNativeParam(1), e.GetContextReg(), i.src1.value); + e.MOV(e.GetNativeParam(0), i.src1.value); + e.CallNative(reinterpret_cast(TraceContextLoadF32)); + } + } +}; 
+struct LOAD_CONTEXT_F64 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + e.LDR(i.dest, e.GetContextReg(), i.src1.value); + if (IsTracingData()) { + e.ADD(e.GetNativeParam(1), e.GetContextReg(), i.src1.value); + e.MOV(e.GetNativeParam(0), i.src1.value); + e.CallNative(reinterpret_cast(TraceContextLoadF64)); + } + } +}; +struct LOAD_CONTEXT_V128 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + e.LDR(i.dest, e.GetContextReg(), i.src1.value); + if (IsTracingData()) { + e.ADD(e.GetNativeParam(1), e.GetContextReg(), i.src1.value); + e.MOV(e.GetNativeParam(0), i.src1.value); + e.CallNative(reinterpret_cast(TraceContextLoadV128)); + } + } +}; +EMITTER_OPCODE_TABLE(OPCODE_LOAD_CONTEXT, LOAD_CONTEXT_I8, LOAD_CONTEXT_I16, + LOAD_CONTEXT_I32, LOAD_CONTEXT_I64, LOAD_CONTEXT_F32, + LOAD_CONTEXT_F64, LOAD_CONTEXT_V128); + +// ============================================================================ +// OPCODE_STORE_CONTEXT +// ============================================================================ +// Note: all types are always aligned on the stack. +struct STORE_CONTEXT_I8 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + if (i.src2.is_constant) { + e.MOV(W0, i.src2.constant()); + e.STRB(W0, e.GetContextReg(), i.src1.value); + } else { + e.STRB(i.src2.reg(), e.GetContextReg(), i.src1.value); + } + if (IsTracingData()) { + e.LDRB(e.GetNativeParam(1).toW(), e.GetContextReg(), i.src1.value); + e.MOV(e.GetNativeParam(0), i.src1.value); + e.CallNative(reinterpret_cast(TraceContextStoreI8)); + } + } +}; +struct STORE_CONTEXT_I16 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + if (i.src2.is_constant) { + e.MOV(W0, i.src2.constant()); + e.STRH(W0, e.GetContextReg(), i.src1.value); + } else { + e.STRH(i.src2.reg(), e.GetContextReg(), i.src1.value); + } + if (IsTracingData()) { + e.LDRH(e.GetNativeParam(1).toW(), e.GetContextReg(), i.src1.value); + e.MOV(e.GetNativeParam(0), i.src1.value); + e.CallNative(reinterpret_cast(TraceContextStoreI16)); + } + } +}; +struct STORE_CONTEXT_I32 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + if (i.src2.is_constant) { + e.MOV(W0, i.src2.constant()); + e.STR(W0, e.GetContextReg(), i.src1.value); + } else { + e.STR(i.src2.reg(), e.GetContextReg(), i.src1.value); + } + if (IsTracingData()) { + e.LDR(e.GetNativeParam(1).toW(), e.GetContextReg(), i.src1.value); + e.MOV(e.GetNativeParam(0), i.src1.value); + e.CallNative(reinterpret_cast(TraceContextStoreI32)); + } + } +}; +struct STORE_CONTEXT_I64 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + if (i.src2.is_constant) { + e.MOV(X0, i.src2.constant()); + e.STR(X0, e.GetContextReg(), i.src1.value); + } else { + e.STR(i.src2.reg(), e.GetContextReg(), i.src1.value); + } + if (IsTracingData()) { + e.LDR(e.GetNativeParam(1), e.GetContextReg(), i.src1.value); + e.MOV(e.GetNativeParam(0), i.src1.value); + e.CallNative(reinterpret_cast(TraceContextStoreI64)); + } + } +}; +struct STORE_CONTEXT_F32 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + if (i.src2.is_constant) { + e.MOV(W0, i.src2.value->constant.i32); + e.STR(W0, e.GetContextReg(), i.src1.value); + } else { + e.STR(i.src2, e.GetContextReg(), i.src1.value); + } + if (IsTracingData()) { + e.ADD(e.GetNativeParam(1), e.GetContextReg(), i.src1.value); + e.MOV(e.GetNativeParam(0), i.src1.value); + e.CallNative(reinterpret_cast(TraceContextStoreF32)); + } + } +}; +struct STORE_CONTEXT_F64 + : 
Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + if (i.src2.is_constant) { + e.MOV(X0, i.src2.value->constant.i64); + e.STR(X0, e.GetContextReg(), i.src1.value); + } else { + e.STR(i.src2, e.GetContextReg(), i.src1.value); + } + if (IsTracingData()) { + e.ADD(e.GetNativeParam(1), e.GetContextReg(), i.src1.value); + e.MOV(e.GetNativeParam(0), i.src1.value); + e.CallNative(reinterpret_cast(TraceContextStoreF64)); + } + } +}; +struct STORE_CONTEXT_V128 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + if (i.src2.is_constant) { + e.LoadConstantV(Q0, i.src2.constant()); + e.STR(Q0, e.GetContextReg(), i.src1.value); + } else { + e.STR(i.src2, e.GetContextReg(), i.src1.value); + } + if (IsTracingData()) { + e.ADD(e.GetNativeParam(1), e.GetContextReg(), i.src1.value); + e.MOV(e.GetNativeParam(0), i.src1.value); + e.CallNative(reinterpret_cast(TraceContextStoreV128)); + } + } +}; +EMITTER_OPCODE_TABLE(OPCODE_STORE_CONTEXT, STORE_CONTEXT_I8, STORE_CONTEXT_I16, + STORE_CONTEXT_I32, STORE_CONTEXT_I64, STORE_CONTEXT_F32, + STORE_CONTEXT_F64, STORE_CONTEXT_V128); + +// ============================================================================ +// OPCODE_LOAD_MMIO +// ============================================================================ +// Note: all types are always aligned in the context. +struct LOAD_MMIO_I32 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + // uint64_t (context, addr) + const auto mmio_range = reinterpret_cast(i.src1.value); + const auto read_address = uint32_t(i.src2.value); + e.MOV(e.GetNativeParam(0), uint64_t(mmio_range->callback_context)); + e.MOV(e.GetNativeParam(1).toW(), read_address); + e.CallNativeSafe(reinterpret_cast(mmio_range->read)); + e.REV(i.dest, W0); + if (IsTracingData()) { + e.MOV(e.GetNativeParam(0).toW(), i.dest); + e.MOV(X1, read_address); + e.CallNative(reinterpret_cast(TraceContextLoadI32)); + } + } +}; +EMITTER_OPCODE_TABLE(OPCODE_LOAD_MMIO, LOAD_MMIO_I32); + +// ============================================================================ +// OPCODE_STORE_MMIO +// ============================================================================ +// Note: all types are always aligned on the stack. 
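Both the LOAD_MMIO sequence above and the STORE_MMIO sequence below go through the range's read/write callbacks and byte-swap around the call, since the handlers work in host byte order while guest accesses are big-endian. A minimal sketch of a matching handler pair, following the (context, address[, value]) convention noted in the comments above; the FakeGpuRegs type and the GpuBlockRead/GpuBlockWrite names are hypothetical:

#include <cstdint>
#include <unordered_map>

// Hypothetical register file backing one MMIO range.
struct FakeGpuRegs {
  std::unordered_map<uint32_t, uint32_t> regs;
};

uint32_t GpuBlockRead(void* callback_context, uint32_t address) {
  auto* gpu = static_cast<FakeGpuRegs*>(callback_context);
  // Returned in host byte order; LOAD_MMIO REVs the result into guest order.
  return gpu->regs[address];
}

void GpuBlockWrite(void* callback_context, uint32_t address, uint32_t value) {
  auto* gpu = static_cast<FakeGpuRegs*>(callback_context);
  // STORE_MMIO byte-swaps the value before making this call.
  gpu->regs[address] = value;
}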
+struct STORE_MMIO_I32 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + // void (context, addr, value) + const auto mmio_range = reinterpret_cast(i.src1.value); + const auto write_address = uint32_t(i.src2.value); + e.MOV(e.GetNativeParam(0), uint64_t(mmio_range->callback_context)); + e.MOV(e.GetNativeParam(1).toW(), write_address); + if (i.src3.is_constant) { + e.MOV(e.GetNativeParam(2).toW(), xe::byte_swap(i.src3.constant())); + } else { + e.REV(e.GetNativeParam(2).toW(), i.src3); + } + e.CallNativeSafe(reinterpret_cast(mmio_range->write)); + if (IsTracingData()) { + if (i.src3.is_constant) { + e.MOV(e.GetNativeParam(0).toW(), i.src3.constant()); + } else { + e.MOV(e.GetNativeParam(0).toW(), i.src3); + } + e.MOV(X1, write_address); + e.CallNative(reinterpret_cast(TraceContextStoreI32)); + } + } +}; +EMITTER_OPCODE_TABLE(OPCODE_STORE_MMIO, STORE_MMIO_I32); + +// ============================================================================ +// OPCODE_LOAD_OFFSET +// ============================================================================ +struct LOAD_OFFSET_I8 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + auto addr_reg = ComputeMemoryAddressOffset(e, i.src1, i.src2); + e.LDRB(i.dest, addr_reg); + } +}; + +struct LOAD_OFFSET_I16 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + auto addr_reg = ComputeMemoryAddressOffset(e, i.src1, i.src2); + if (i.instr->flags & LoadStoreFlags::LOAD_STORE_BYTE_SWAP) { + e.LDRH(i.dest, addr_reg); + e.REV16(i.dest, i.dest); + } else { + e.LDRH(i.dest, addr_reg); + } + } +}; + +struct LOAD_OFFSET_I32 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + auto addr_reg = ComputeMemoryAddressOffset(e, i.src1, i.src2); + if (i.instr->flags & LoadStoreFlags::LOAD_STORE_BYTE_SWAP) { + e.LDR(i.dest, addr_reg); + e.REV(i.dest, i.dest); + } else { + e.LDR(i.dest, addr_reg); + } + } +}; + +struct LOAD_OFFSET_I64 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + auto addr_reg = ComputeMemoryAddressOffset(e, i.src1, i.src2); + if (i.instr->flags & LoadStoreFlags::LOAD_STORE_BYTE_SWAP) { + e.LDR(i.dest, addr_reg); + e.REV(i.dest, i.dest); + } else { + e.LDR(i.dest, addr_reg); + } + } +}; +EMITTER_OPCODE_TABLE(OPCODE_LOAD_OFFSET, LOAD_OFFSET_I8, LOAD_OFFSET_I16, + LOAD_OFFSET_I32, LOAD_OFFSET_I64); + +// ============================================================================ +// OPCODE_STORE_OFFSET +// ============================================================================ +struct STORE_OFFSET_I8 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + auto addr_reg = ComputeMemoryAddressOffset(e, i.src1, i.src2); + if (i.src3.is_constant) { + e.MOV(W0, i.src3.constant()); + e.STRB(W0, addr_reg); + } else { + e.STRB(i.src3, addr_reg); + } + } +}; + +struct STORE_OFFSET_I16 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + auto addr_reg = ComputeMemoryAddressOffset(e, i.src1, i.src2); + if (i.instr->flags & LoadStoreFlags::LOAD_STORE_BYTE_SWAP) { + assert_false(i.src3.is_constant); + assert_always("not implemented"); + } else { + if (i.src3.is_constant) { + e.MOV(W0, i.src3.constant()); + e.STRH(W0, addr_reg); + } else { + e.STRH(i.src3, addr_reg); + } + } + } +}; + +struct STORE_OFFSET_I32 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + auto addr_reg = ComputeMemoryAddressOffset(e, i.src1, i.src2); + if (i.instr->flags & LoadStoreFlags::LOAD_STORE_BYTE_SWAP) { 
+ assert_false(i.src3.is_constant); + assert_always("not implemented"); + } else { + if (i.src3.is_constant) { + e.MOV(W0, i.src3.constant()); + e.STR(W0, addr_reg); + } else { + e.STR(i.src3, addr_reg); + } + } + } +}; + +struct STORE_OFFSET_I64 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + auto addr_reg = ComputeMemoryAddressOffset(e, i.src1, i.src2); + if (i.instr->flags & LoadStoreFlags::LOAD_STORE_BYTE_SWAP) { + assert_false(i.src3.is_constant); + assert_always("not implemented"); + } else { + if (i.src3.is_constant) { + e.MovMem64(addr_reg, 0, i.src3.constant()); + } else { + e.STR(i.src3, addr_reg); + } + } + } +}; +EMITTER_OPCODE_TABLE(OPCODE_STORE_OFFSET, STORE_OFFSET_I8, STORE_OFFSET_I16, + STORE_OFFSET_I32, STORE_OFFSET_I64); + +// ============================================================================ +// OPCODE_LOAD +// ============================================================================ +struct LOAD_I8 : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + auto addr_reg = ComputeMemoryAddress(e, i.src1); + e.LDRB(i.dest, addr_reg); + if (IsTracingData()) { + e.MOV(e.GetNativeParam(1).toW(), i.dest); + e.MOV(e.GetNativeParam(0), addr_reg); + e.CallNative(reinterpret_cast(TraceMemoryLoadI8)); + } + } +}; +struct LOAD_I16 : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + auto addr_reg = ComputeMemoryAddress(e, i.src1); + if (i.instr->flags & LoadStoreFlags::LOAD_STORE_BYTE_SWAP) { + e.LDRH(i.dest, addr_reg); + e.REV16(i.dest, i.dest); + } else { + e.LDRH(i.dest, addr_reg); + } + if (IsTracingData()) { + e.MOV(e.GetNativeParam(1).toW(), i.dest); + e.MOV(e.GetNativeParam(0), addr_reg); + e.CallNative(reinterpret_cast(TraceMemoryLoadI16)); + } + } +}; +struct LOAD_I32 : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + auto addr_reg = ComputeMemoryAddress(e, i.src1); + if (i.instr->flags & LoadStoreFlags::LOAD_STORE_BYTE_SWAP) { + e.LDR(i.dest, addr_reg); + e.REV(i.dest, i.dest); + } else { + e.LDR(i.dest, addr_reg); + } + if (IsTracingData()) { + e.MOV(e.GetNativeParam(1).toW(), i.dest); + e.MOV(e.GetNativeParam(0), addr_reg); + e.CallNative(reinterpret_cast(TraceMemoryLoadI32)); + } + } +}; +struct LOAD_I64 : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + auto addr_reg = ComputeMemoryAddress(e, i.src1); + if (i.instr->flags & LoadStoreFlags::LOAD_STORE_BYTE_SWAP) { + e.LDR(i.dest, addr_reg); + e.REV64(i.dest, i.dest); + } else { + e.LDR(i.dest, addr_reg); + } + if (IsTracingData()) { + e.MOV(e.GetNativeParam(1), i.dest); + e.MOV(e.GetNativeParam(0), addr_reg); + e.CallNative(reinterpret_cast(TraceMemoryLoadI64)); + } + } +}; +struct LOAD_F32 : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + auto addr_reg = ComputeMemoryAddress(e, i.src1); + e.LDR(i.dest, addr_reg); + if (i.instr->flags & LoadStoreFlags::LOAD_STORE_BYTE_SWAP) { + assert_always("not implemented yet"); + } + if (IsTracingData()) { + e.MOV(e.GetNativeParam(1), addr_reg); + e.MOV(e.GetNativeParam(0), addr_reg); + e.CallNative(reinterpret_cast(TraceMemoryLoadF32)); + } + } +}; +struct LOAD_F64 : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + auto addr_reg = ComputeMemoryAddress(e, i.src1); + e.LDR(i.dest, addr_reg); + if (i.instr->flags & LoadStoreFlags::LOAD_STORE_BYTE_SWAP) { + assert_always("not implemented yet"); + } + if (IsTracingData()) { + e.MOV(e.GetNativeParam(1), addr_reg); + e.MOV(e.GetNativeParam(0), addr_reg); + 
e.CallNative(reinterpret_cast(TraceMemoryLoadF64)); + } + } +}; +struct LOAD_V128 : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + auto addr_reg = ComputeMemoryAddress(e, i.src1); + e.LDR(i.dest, addr_reg); + if (i.instr->flags & LoadStoreFlags::LOAD_STORE_BYTE_SWAP) { + // Reverse upper and lower 64-bit halfs + e.REV64(i.dest.reg().B16(), i.dest.reg().B16()); + // Reverse the 64-bit halfs themselves + e.EXT(i.dest.reg().B16(), i.dest.reg().B16(), i.dest.reg().B16(), 8); + } + if (IsTracingData()) { + e.MOV(e.GetNativeParam(1), addr_reg); + e.MOV(e.GetNativeParam(0), addr_reg); + e.CallNative(reinterpret_cast(TraceMemoryLoadV128)); + } + } +}; +EMITTER_OPCODE_TABLE(OPCODE_LOAD, LOAD_I8, LOAD_I16, LOAD_I32, LOAD_I64, + LOAD_F32, LOAD_F64, LOAD_V128); + +// ============================================================================ +// OPCODE_STORE +// ============================================================================ +// Note: most *should* be aligned, but needs to be checked! +struct STORE_I8 : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + auto addr_reg = ComputeMemoryAddress(e, i.src1); + if (i.src2.is_constant) { + e.MOV(W0, i.src2.constant()); + e.STRB(W0, addr_reg); + } else { + e.STRB(i.src2.reg(), addr_reg); + } + if (IsTracingData()) { + addr_reg = ComputeMemoryAddress(e, i.src1); + e.LDRB(e.GetNativeParam(1).toW(), addr_reg); + e.MOV(e.GetNativeParam(0), addr_reg); + e.CallNative(reinterpret_cast(TraceMemoryStoreI8)); + } + } +}; +struct STORE_I16 : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + auto addr_reg = ComputeMemoryAddress(e, i.src1); + if (i.instr->flags & LoadStoreFlags::LOAD_STORE_BYTE_SWAP) { + assert_false(i.src2.is_constant); + assert_always("not implemented"); + } else { + if (i.src2.is_constant) { + e.MOV(W0, i.src2.constant()); + e.STRH(W0, addr_reg); + } else { + e.STRH(i.src2.reg(), addr_reg); + } + } + if (IsTracingData()) { + addr_reg = ComputeMemoryAddress(e, i.src1); + e.LDRH(e.GetNativeParam(1).toW(), addr_reg); + e.MOV(e.GetNativeParam(0), addr_reg); + e.CallNative(reinterpret_cast(TraceMemoryStoreI16)); + } + } +}; +struct STORE_I32 : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + auto addr_reg = ComputeMemoryAddress(e, i.src1); + if (i.instr->flags & LoadStoreFlags::LOAD_STORE_BYTE_SWAP) { + assert_false(i.src2.is_constant); + assert_always("not implemented"); + } else { + if (i.src2.is_constant) { + e.MOV(W0, i.src2.constant()); + e.STR(W0, addr_reg); + } else { + e.STR(i.src2.reg(), addr_reg); + } + } + if (IsTracingData()) { + addr_reg = ComputeMemoryAddress(e, i.src1); + e.LDR(e.GetNativeParam(1).toW(), addr_reg); + e.MOV(e.GetNativeParam(0), addr_reg); + e.CallNative(reinterpret_cast(TraceMemoryStoreI32)); + } + } +}; +struct STORE_I64 : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + auto addr_reg = ComputeMemoryAddress(e, i.src1); + if (i.instr->flags & LoadStoreFlags::LOAD_STORE_BYTE_SWAP) { + assert_false(i.src2.is_constant); + assert_always("not implemented"); + } else { + if (i.src2.is_constant) { + e.MovMem64(addr_reg, 0, i.src2.constant()); + } else { + e.STR(i.src2.reg(), addr_reg); + } + } + if (IsTracingData()) { + addr_reg = ComputeMemoryAddress(e, i.src1); + e.LDR(e.GetNativeParam(1), addr_reg); + e.MOV(e.GetNativeParam(0), addr_reg); + e.CallNative(reinterpret_cast(TraceMemoryStoreI64)); + } + } +}; +struct STORE_F32 : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + auto 
addr_reg = ComputeMemoryAddress(e, i.src1); + if (i.instr->flags & LoadStoreFlags::LOAD_STORE_BYTE_SWAP) { + assert_false(i.src2.is_constant); + assert_always("not yet implemented"); + } else { + if (i.src2.is_constant) { + e.MOV(W0, i.src2.value->constant.i32); + e.STR(W0, addr_reg); + } else { + e.STR(i.src2, addr_reg); + } + } + if (IsTracingData()) { + addr_reg = ComputeMemoryAddress(e, i.src1); + e.MOV(e.GetNativeParam(1), addr_reg); + e.MOV(e.GetNativeParam(0), addr_reg); + e.CallNative(reinterpret_cast(TraceMemoryStoreF32)); + } + } +}; +struct STORE_F64 : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + auto addr_reg = ComputeMemoryAddress(e, i.src1); + if (i.instr->flags & LoadStoreFlags::LOAD_STORE_BYTE_SWAP) { + assert_false(i.src2.is_constant); + assert_always("not yet implemented"); + } else { + if (i.src2.is_constant) { + e.MOV(X0, i.src2.value->constant.i64); + e.STR(X0, addr_reg); + } else { + e.STR(i.src2, addr_reg); + } + } + if (IsTracingData()) { + addr_reg = ComputeMemoryAddress(e, i.src1); + e.MOV(e.GetNativeParam(1), addr_reg); + e.MOV(e.GetNativeParam(0), addr_reg); + e.CallNative(reinterpret_cast(TraceMemoryStoreF64)); + } + } +}; +struct STORE_V128 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + auto addr_reg = ComputeMemoryAddress(e, i.src1); + if (i.instr->flags & LoadStoreFlags::LOAD_STORE_BYTE_SWAP) { + assert_false(i.src2.is_constant); + // Reverse upper and lower 64-bit halfs + e.REV64(Q0.B16(), i.src2.reg().B16()); + // Reverse the 64-bit halfs themselves + e.EXT(Q0.B16(), Q0.B16(), Q0.B16(), 8); + e.STR(Q0, addr_reg); + } else { + if (i.src2.is_constant) { + e.LoadConstantV(Q0, i.src2.constant()); + e.STR(Q0, addr_reg); + } else { + e.STR(i.src2, addr_reg); + } + } + if (IsTracingData()) { + addr_reg = ComputeMemoryAddress(e, i.src1); + e.MOV(e.GetNativeParam(1), addr_reg); + e.MOV(e.GetNativeParam(0), addr_reg); + e.CallNative(reinterpret_cast(TraceMemoryStoreV128)); + } + } +}; +EMITTER_OPCODE_TABLE(OPCODE_STORE, STORE_I8, STORE_I16, STORE_I32, STORE_I64, + STORE_F32, STORE_F64, STORE_V128); + +// ============================================================================ +// OPCODE_CACHE_CONTROL +// ============================================================================ +struct CACHE_CONTROL + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + bool is_clflush = false, is_prefetch = false; + switch (CacheControlType(i.instr->flags)) { + case CacheControlType::CACHE_CONTROL_TYPE_DATA_TOUCH: + case CacheControlType::CACHE_CONTROL_TYPE_DATA_TOUCH_FOR_STORE: + is_prefetch = true; + break; + case CacheControlType::CACHE_CONTROL_TYPE_DATA_STORE: + case CacheControlType::CACHE_CONTROL_TYPE_DATA_STORE_AND_FLUSH: + is_clflush = true; + break; + default: + assert_unhandled_case(CacheControlType(i.instr->flags)); + return; + } + size_t cache_line_size = i.src2.value; + + XReg addr = X0; + uint32_t address_constant; + if (i.src1.is_constant) { + // TODO(benvanik): figure out how to do this without a temp. + // Since the constant is often 0x8... if we tried to use that as a + // displacement it would be sign extended and mess things up. 
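+      // Guest addresses at or above 0x80000000 are materialized into X1 and
+      // added to the membase register below; the 0xE0000000+ range also gets
+      // the extra 4 KiB bias when the host allocation granularity is larger
+      // than 4 KiB, mirroring the non-constant path further down.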
+ address_constant = static_cast(i.src1.constant()); + if (address_constant < 0x80000000) { + e.ADD(addr, e.GetMembaseReg(), address_constant); + } else { + if (address_constant >= 0xE0000000 && + xe::memory::allocation_granularity() > 0x1000) { + e.MOV(X1, address_constant + 0x1000); + } else { + e.MOV(X1, address_constant); + } + e.ADD(addr, e.GetMembaseReg(), X1); + } + } else { + if (xe::memory::allocation_granularity() > 0x1000) { + // Emulate the 4 KB physical address offset in 0xE0000000+ when can't do + // it via memory mapping. + e.MOV(X1, 0xE0000000); + e.CMP(i.src1.reg(), X1); + e.CSET(X1, Cond::HS); + e.ADD(X1, i.src1.reg(), X1, LSL, 12); + } else { + // Clear the top 32 bits, as they are likely garbage. + e.MOV(W1, i.src1.reg().toW()); + } + e.ADD(addr, e.GetMembaseReg(), X1); + } + + if (is_clflush) { + // TODO(wunkolo): These kind of cache-maintenance instructions cause an + // illegal-instruction on windows, but is trapped to proper EL1 code on + // Linux. Need a way to do cache-maintenance on Windows-Arm + // e.DC(DcOp::CIVAC, addr); + + // Full data sync + e.DSB(BarrierOp::ISH); + } + if (is_prefetch) { + e.PRFM(PrfOp::PLDL1KEEP, addr); + } + + if (cache_line_size >= 128) { + // Prefetch the other 64 bytes of the 128-byte cache line. + if (i.src1.is_constant && address_constant < 0x80000000) { + e.ADD(addr, e.GetMembaseReg(), address_constant ^ 64); + } else { + e.EOR(X1, X1, 64); + } + if (is_clflush) { + // TODO(wunkolo): These kind of cache-maintenance instructions cause an + // illegal-instruction on windows, but is trapped to proper EL1 code on + // Linux. Need a way to do cache-maintenance on Windows-Arm + // e.DC(DcOp::CIVAC, addr); + + // Full data sync + e.DSB(BarrierOp::ISH); + } + if (is_prefetch) { + e.PRFM(PrfOp::PLDL1KEEP, addr); + } + assert_true(cache_line_size == 128); + } + } +}; +EMITTER_OPCODE_TABLE(OPCODE_CACHE_CONTROL, CACHE_CONTROL); + +// ============================================================================ +// OPCODE_MEMORY_BARRIER +// ============================================================================ +struct MEMORY_BARRIER + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + e.DMB(BarrierOp::SY); + } +}; +EMITTER_OPCODE_TABLE(OPCODE_MEMORY_BARRIER, MEMORY_BARRIER); + +// ============================================================================ +// OPCODE_MEMSET +// ============================================================================ +struct MEMSET_I64_I8_I64 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + assert_true(i.src2.is_constant); + assert_true(i.src3.is_constant); + assert_true(i.src2.constant() == 0); + e.MOVI(Q0.B16(), 0); + auto addr_reg = ComputeMemoryAddress(e, i.src1); + switch (i.src3.constant()) { + case 32: + e.STP(Q0, Q0, addr_reg, 0 * 16); + break; + case 128: + e.STP(Q0, Q0, addr_reg, 0 * 16); + e.STP(Q0, Q0, addr_reg, 2 * 16); + e.STP(Q0, Q0, addr_reg, 4 * 16); + e.STP(Q0, Q0, addr_reg, 6 * 16); + break; + default: + assert_unhandled_case(i.src3.constant()); + break; + } + if (IsTracingData()) { + addr_reg = ComputeMemoryAddress(e, i.src1); + e.MOV(e.GetNativeParam(2), i.src3.constant()); + e.MOV(e.GetNativeParam(1), i.src2.constant()); + e.LDR(e.GetNativeParam(0), addr_reg); + e.CallNative(reinterpret_cast(TraceMemset)); + } + } +}; +EMITTER_OPCODE_TABLE(OPCODE_MEMSET, MEMSET_I64_I8_I64); + +} // namespace a64 +} // namespace backend +} // namespace cpu +} // namespace xe diff --git a/src/xenia/cpu/backend/a64/a64_seq_vector.cc 
b/src/xenia/cpu/backend/a64/a64_seq_vector.cc new file mode 100644 index 000000000..abc4688ac --- /dev/null +++ b/src/xenia/cpu/backend/a64/a64_seq_vector.cc @@ -0,0 +1,2170 @@ +/** + ****************************************************************************** + * Xenia : Xbox 360 Emulator Research Project * + ****************************************************************************** + * Copyright 2024 Xenia Developers. All rights reserved. * + * Released under the BSD license - see LICENSE in the root for more details. * + ****************************************************************************** + */ + +#include "xenia/cpu/backend/a64/a64_sequences.h" +#include "xenia/cpu/backend/a64/a64_util.h" + +#include +#include + +#include "xenia/cpu/backend/a64/a64_op.h" + +// For OPCODE_PACK/OPCODE_UNPACK +#include "third_party/half/include/half.hpp" + +namespace xe { +namespace cpu { +namespace backend { +namespace a64 { + +volatile int anchor_vector = 0; + +// ============================================================================ +// OPCODE_VECTOR_CONVERT_I2F +// ============================================================================ +struct VECTOR_CONVERT_I2F + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + if (i.instr->flags & ARITHMETIC_UNSIGNED) { + e.UCVTF(i.dest.reg().S4(), i.src1.reg().S4()); + } else { + e.SCVTF(i.dest.reg().S4(), i.src1.reg().S4()); + } + } +}; +EMITTER_OPCODE_TABLE(OPCODE_VECTOR_CONVERT_I2F, VECTOR_CONVERT_I2F); + +// ============================================================================ +// OPCODE_VECTOR_CONVERT_F2I +// ============================================================================ +struct VECTOR_CONVERT_F2I + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + if (i.instr->flags & ARITHMETIC_UNSIGNED) { + e.FCVTZU(i.dest.reg().S4(), i.src1.reg().S4()); + } else { + e.FCVTZS(i.dest.reg().S4(), i.src1.reg().S4()); + } + } +}; +EMITTER_OPCODE_TABLE(OPCODE_VECTOR_CONVERT_F2I, VECTOR_CONVERT_F2I); + +// ============================================================================ +// OPCODE_LOAD_VECTOR_SHL +// ============================================================================ +static const vec128_t lvsl_table[16] = { + vec128b(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15), + vec128b(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16), + vec128b(2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17), + vec128b(3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18), + vec128b(4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19), + vec128b(5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20), + vec128b(6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21), + vec128b(7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22), + vec128b(8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23), + vec128b(9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24), + vec128b(10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25), + vec128b(11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26), + vec128b(12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27), + vec128b(13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28), + vec128b(14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29), + vec128b(15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30), +}; +struct LOAD_VECTOR_SHL_I8 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + if (i.src1.is_constant) { + 
auto sh = i.src1.constant(); + assert_true(sh < xe::countof(lvsl_table)); + e.MOV(X0, reinterpret_cast(&lvsl_table[sh])); + e.LDR(i.dest, X0); + } else { + e.MOV(X0, reinterpret_cast(lvsl_table)); + e.AND(X1, i.src1.reg().toX(), 0xf); + e.LDR(i.dest, X0, X1, IndexExt::LSL, 4); + } + } +}; +EMITTER_OPCODE_TABLE(OPCODE_LOAD_VECTOR_SHL, LOAD_VECTOR_SHL_I8); + +// ============================================================================ +// OPCODE_LOAD_VECTOR_SHR +// ============================================================================ +static const vec128_t lvsr_table[16] = { + vec128b(16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31), + vec128b(15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30), + vec128b(14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29), + vec128b(13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28), + vec128b(12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27), + vec128b(11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26), + vec128b(10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25), + vec128b(9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24), + vec128b(8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23), + vec128b(7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22), + vec128b(6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21), + vec128b(5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20), + vec128b(4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19), + vec128b(3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18), + vec128b(2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17), + vec128b(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16), +}; +struct LOAD_VECTOR_SHR_I8 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + if (i.src1.is_constant) { + auto sh = i.src1.constant(); + assert_true(sh < xe::countof(lvsr_table)); + e.MOV(X0, reinterpret_cast(&lvsr_table[sh])); + e.LDR(i.dest, X0); + } else { + e.MOV(X0, reinterpret_cast(lvsr_table)); + e.AND(X1, i.src1.reg().toX(), 0xf); + e.LDR(i.dest, X0, X1, IndexExt::LSL, 4); + } + } +}; +EMITTER_OPCODE_TABLE(OPCODE_LOAD_VECTOR_SHR, LOAD_VECTOR_SHR_I8); + +// ============================================================================ +// OPCODE_VECTOR_MAX +// ============================================================================ +struct VECTOR_MAX + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + EmitCommutativeBinaryVOp( + e, i, [&i](A64Emitter& e, QReg dest, QReg src1, QReg src2) { + uint32_t part_type = i.instr->flags >> 8; + if (i.instr->flags & ARITHMETIC_UNSIGNED) { + switch (part_type) { + case INT8_TYPE: + e.UMAX(dest.B16(), src1.B16(), src2.B16()); + break; + case INT16_TYPE: + e.UMAX(dest.H8(), src1.H8(), src2.H8()); + break; + case INT32_TYPE: + e.UMAX(dest.S4(), src1.S4(), src2.S4()); + break; + default: + assert_unhandled_case(part_type); + break; + } + } else { + switch (part_type) { + case INT8_TYPE: + e.SMAX(dest.B16(), src1.B16(), src2.B16()); + break; + case INT16_TYPE: + e.SMAX(dest.H8(), src1.H8(), src2.H8()); + break; + case INT32_TYPE: + e.SMAX(dest.S4(), src1.S4(), src2.S4()); + break; + default: + assert_unhandled_case(part_type); + break; + } + } + }); + } +}; +EMITTER_OPCODE_TABLE(OPCODE_VECTOR_MAX, VECTOR_MAX); + +// ============================================================================ +// OPCODE_VECTOR_MIN +// 
============================================================================ +struct VECTOR_MIN + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + EmitCommutativeBinaryVOp( + e, i, [&i](A64Emitter& e, QReg dest, QReg src1, QReg src2) { + uint32_t part_type = i.instr->flags >> 8; + if (i.instr->flags & ARITHMETIC_UNSIGNED) { + switch (part_type) { + case INT8_TYPE: + e.UMIN(dest.B16(), src1.B16(), src2.B16()); + break; + case INT16_TYPE: + e.UMIN(dest.H8(), src1.H8(), src2.H8()); + break; + case INT32_TYPE: + e.UMIN(dest.S4(), src1.S4(), src2.S4()); + break; + default: + assert_unhandled_case(part_type); + break; + } + } else { + switch (part_type) { + case INT8_TYPE: + e.SMIN(dest.B16(), src1.B16(), src2.B16()); + break; + case INT16_TYPE: + e.SMIN(dest.H8(), src1.H8(), src2.H8()); + break; + case INT32_TYPE: + e.SMIN(dest.S4(), src1.S4(), src2.S4()); + break; + default: + assert_unhandled_case(part_type); + break; + } + } + }); + } +}; +EMITTER_OPCODE_TABLE(OPCODE_VECTOR_MIN, VECTOR_MIN); + +// ============================================================================ +// OPCODE_VECTOR_COMPARE_EQ +// ============================================================================ +struct VECTOR_COMPARE_EQ_V128 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + EmitAssociativeBinaryVOp( + e, i, [&i](A64Emitter& e, QReg dest, QReg src1, QReg src2) { + switch (i.instr->flags) { + case INT8_TYPE: + e.CMEQ(dest.B16(), src1.B16(), src2.B16()); + break; + case INT16_TYPE: + e.CMEQ(dest.H8(), src1.H8(), src2.H8()); + break; + case INT32_TYPE: + e.CMEQ(dest.S4(), src1.S4(), src2.S4()); + break; + case FLOAT32_TYPE: + e.FCMEQ(dest.S4(), src1.S4(), src2.S4()); + break; + } + }); + } +}; +EMITTER_OPCODE_TABLE(OPCODE_VECTOR_COMPARE_EQ, VECTOR_COMPARE_EQ_V128); + +// ============================================================================ +// OPCODE_VECTOR_COMPARE_SGT +// ============================================================================ +struct VECTOR_COMPARE_SGT_V128 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + EmitAssociativeBinaryVOp( + e, i, [&i](A64Emitter& e, QReg dest, QReg src1, QReg src2) { + switch (i.instr->flags) { + case INT8_TYPE: + e.CMGT(dest.B16(), src1.B16(), src2.B16()); + break; + case INT16_TYPE: + e.CMGT(dest.H8(), src1.H8(), src2.H8()); + break; + case INT32_TYPE: + e.CMGT(dest.S4(), src1.S4(), src2.S4()); + break; + case FLOAT32_TYPE: + e.FCMGT(dest.S4(), src1.S4(), src2.S4()); + break; + } + }); + } +}; +EMITTER_OPCODE_TABLE(OPCODE_VECTOR_COMPARE_SGT, VECTOR_COMPARE_SGT_V128); + +// ============================================================================ +// OPCODE_VECTOR_COMPARE_SGE +// ============================================================================ +struct VECTOR_COMPARE_SGE_V128 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + EmitAssociativeBinaryVOp( + e, i, [&i](A64Emitter& e, QReg dest, QReg src1, QReg src2) { + switch (i.instr->flags) { + case INT8_TYPE: + e.CMGE(dest.B16(), src1.B16(), src2.B16()); + break; + case INT16_TYPE: + e.CMGE(dest.H8(), src1.H8(), src2.H8()); + break; + case INT32_TYPE: + e.CMGE(dest.S4(), src1.S4(), src2.S4()); + break; + case FLOAT32_TYPE: + e.FCMGE(dest.S4(), src1.S4(), src2.S4()); + break; + } + }); + } +}; +EMITTER_OPCODE_TABLE(OPCODE_VECTOR_COMPARE_SGE, VECTOR_COMPARE_SGE_V128); + +// ============================================================================ +// OPCODE_VECTOR_COMPARE_UGT +// 
============================================================================ +struct VECTOR_COMPARE_UGT_V128 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + EmitAssociativeBinaryVOp( + e, i, [&i](A64Emitter& e, QReg dest, QReg src1, QReg src2) { + switch (i.instr->flags) { + case INT8_TYPE: + e.CMHI(dest.B16(), src1.B16(), src2.B16()); + break; + case INT16_TYPE: + e.CMHI(dest.H8(), src1.H8(), src2.H8()); + break; + case INT32_TYPE: + e.CMHI(dest.S4(), src1.S4(), src2.S4()); + break; + case FLOAT32_TYPE: + e.FABS(Q0.S4(), src1.S4()); + e.FABS(Q1.S4(), src2.S4()); + e.FCMGT(dest.S4(), Q0.S4(), Q1.S4()); + break; + } + }); + } +}; +EMITTER_OPCODE_TABLE(OPCODE_VECTOR_COMPARE_UGT, VECTOR_COMPARE_UGT_V128); + +// ============================================================================ +// OPCODE_VECTOR_COMPARE_UGE +// ============================================================================ +struct VECTOR_COMPARE_UGE_V128 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + EmitAssociativeBinaryVOp( + e, i, [&i](A64Emitter& e, QReg dest, QReg src1, QReg src2) { + switch (i.instr->flags) { + case INT8_TYPE: + e.CMHS(dest.B16(), src1.B16(), src2.B16()); + break; + case INT16_TYPE: + e.CMHS(dest.H8(), src1.H8(), src2.H8()); + break; + case INT32_TYPE: + e.CMHS(dest.S4(), src1.S4(), src2.S4()); + break; + case FLOAT32_TYPE: + e.FABS(Q0.S4(), src1.S4()); + e.FABS(Q1.S4(), src2.S4()); + e.FCMGE(dest.S4(), Q0.S4(), Q1.S4()); + break; + } + }); + } +}; +EMITTER_OPCODE_TABLE(OPCODE_VECTOR_COMPARE_UGE, VECTOR_COMPARE_UGE_V128); + +// ============================================================================ +// OPCODE_VECTOR_ADD +// ============================================================================ +struct VECTOR_ADD + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + EmitCommutativeBinaryVOp( + e, i, [&i](A64Emitter& e, const QReg& dest, QReg src1, QReg src2) { + const TypeName part_type = + static_cast(i.instr->flags & 0xFF); + const uint32_t arithmetic_flags = i.instr->flags >> 8; + bool is_unsigned = !!(arithmetic_flags & ARITHMETIC_UNSIGNED); + bool saturate = !!(arithmetic_flags & ARITHMETIC_SATURATE); + switch (part_type) { + case INT8_TYPE: + if (saturate) { + if (is_unsigned) { + e.UQADD(dest.B16(), src1.B16(), src2.B16()); + } else { + e.SQADD(dest.B16(), src1.B16(), src2.B16()); + } + } else { + e.ADD(dest.B16(), src1.B16(), src2.B16()); + } + break; + case INT16_TYPE: + if (saturate) { + if (is_unsigned) { + e.UQADD(dest.H8(), src1.H8(), src2.H8()); + } else { + e.SQADD(dest.H8(), src1.H8(), src2.H8()); + } + } else { + e.ADD(dest.H8(), src1.H8(), src2.H8()); + } + break; + case INT32_TYPE: + if (saturate) { + if (is_unsigned) { + e.UQADD(dest.S4(), src1.S4(), src2.S4()); + } else { + e.SQADD(dest.S4(), src1.S4(), src2.S4()); + } + } else { + e.ADD(dest.S4(), src1.S4(), src2.S4()); + } + break; + case FLOAT32_TYPE: + assert_false(is_unsigned); + assert_false(saturate); + e.FADD(dest.S4(), src1.S4(), src2.S4()); + break; + default: + assert_unhandled_case(part_type); + break; + } + }); + } +}; +EMITTER_OPCODE_TABLE(OPCODE_VECTOR_ADD, VECTOR_ADD); + +// ============================================================================ +// OPCODE_VECTOR_SUB +// ============================================================================ +struct VECTOR_SUB + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + EmitCommutativeBinaryVOp( + e, i, [&i](A64Emitter& e, const QReg& dest, QReg 
src1, QReg src2) { + const TypeName part_type = + static_cast(i.instr->flags & 0xFF); + const uint32_t arithmetic_flags = i.instr->flags >> 8; + bool is_unsigned = !!(arithmetic_flags & ARITHMETIC_UNSIGNED); + bool saturate = !!(arithmetic_flags & ARITHMETIC_SATURATE); + switch (part_type) { + case INT8_TYPE: + if (saturate) { + if (is_unsigned) { + e.UQSUB(dest.B16(), src1.B16(), src2.B16()); + } else { + e.SQSUB(dest.B16(), src1.B16(), src2.B16()); + } + } else { + e.SUB(dest.B16(), src1.B16(), src2.B16()); + } + break; + case INT16_TYPE: + if (saturate) { + if (is_unsigned) { + e.UQSUB(dest.H8(), src1.H8(), src2.H8()); + } else { + e.SQSUB(dest.H8(), src1.H8(), src2.H8()); + } + } else { + e.SUB(dest.H8(), src1.H8(), src2.H8()); + } + break; + case INT32_TYPE: + if (saturate) { + if (is_unsigned) { + e.UQSUB(dest.S4(), src1.S4(), src2.S4()); + } else { + e.SQSUB(dest.S4(), src1.S4(), src2.S4()); + } + } else { + e.SUB(dest.S4(), src1.S4(), src2.S4()); + } + break; + case FLOAT32_TYPE: + assert_false(is_unsigned); + assert_false(saturate); + e.FSUB(dest.S4(), src1.S4(), src2.S4()); + break; + default: + assert_unhandled_case(part_type); + break; + } + }); + } +}; +EMITTER_OPCODE_TABLE(OPCODE_VECTOR_SUB, VECTOR_SUB); + +// ============================================================================ +// OPCODE_VECTOR_SHL +// ============================================================================ +template ::value, int> = 0> +static uint8x16_t EmulateVectorShl(void*, std::byte src1[16], + std::byte src2[16]) { + alignas(16) T value[16 / sizeof(T)]; + alignas(16) T shamt[16 / sizeof(T)]; + + // Load NEON registers into a C array. + vst1q_u8(reinterpret_cast(value), vld1q_u8(src1)); + vst1q_u8(reinterpret_cast(shamt), vld1q_u8(src2)); + + for (size_t i = 0; i < (16 / sizeof(T)); ++i) { + value[i] = value[i] << (shamt[i] & ((sizeof(T) * 8) - 1)); + } + + // Store result and return it. 
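+  // Per the AArch64 vector ABI the uint8x16_t result comes back in V0/Q0,
+  // which is where the emitted call sequence below picks it up before
+  // copying it into the destination register.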
+ return vld1q_u8(value); +} +struct VECTOR_SHL_V128 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + switch (i.instr->flags) { + case INT8_TYPE: + EmitInt8(e, i); + break; + case INT16_TYPE: + EmitInt16(e, i); + break; + case INT32_TYPE: + EmitInt32(e, i); + break; + default: + assert_always(); + break; + } + } + + static void EmitInt8(A64Emitter& e, const EmitArgType& i) { + if (i.src2.is_constant) { + const auto& shamt = i.src2.constant(); + bool all_same = true; + for (size_t n = 0; n < 16 - n; ++n) { + if (shamt.u8[n] != shamt.u8[n + 1]) { + all_same = false; + break; + } + } + if (all_same) { + // Every count is the same, so we can use SHL + e.SHL(i.dest.reg().B16(), i.src1.reg().B16(), shamt.u8[0] & 0x7); + return; + } + e.ADD(e.GetNativeParam(1), SP, e.StashConstantV(1, i.src2.constant())); + } else { + e.ADD(e.GetNativeParam(1), SP, e.StashV(1, i.src2)); + } + e.ADD(e.GetNativeParam(0), SP, e.StashV(0, i.src1)); + e.CallNativeSafe(reinterpret_cast(EmulateVectorShl)); + e.MOV(i.dest.reg().B16(), Q0.B16()); + } + + static void EmitInt16(A64Emitter& e, const EmitArgType& i) { + if (i.src2.is_constant) { + const auto& shamt = i.src2.constant(); + bool all_same = true; + for (size_t n = 0; n < 16 - n; ++n) { + if (shamt.u8[n] != shamt.u8[n + 1]) { + all_same = false; + break; + } + } + if (all_same) { + // Every count is the same, so we can use SHL + e.SHL(i.dest.reg().H8(), i.src1.reg().H8(), shamt.u8[0] & 0xF); + return; + } + e.ADD(e.GetNativeParam(1), SP, e.StashConstantV(1, i.src2.constant())); + } else { + e.ADD(e.GetNativeParam(1), SP, e.StashV(1, i.src2)); + } + e.ADD(e.GetNativeParam(0), SP, e.StashV(0, i.src1)); + e.CallNativeSafe(reinterpret_cast(EmulateVectorShl)); + e.MOV(i.dest.reg().B16(), Q0.B16()); + } + + static void EmitInt32(A64Emitter& e, const EmitArgType& i) { + if (i.src2.is_constant) { + const auto& shamt = i.src2.constant(); + bool all_same = true; + for (size_t n = 0; n < 16 - n; ++n) { + if (shamt.u8[n] != shamt.u8[n + 1]) { + all_same = false; + break; + } + } + if (all_same) { + // Every count is the same, so we can use SHL + e.SHL(i.dest.reg().S4(), i.src1.reg().S4(), shamt.u8[0] & 0x1F); + return; + } + e.ADD(e.GetNativeParam(1), SP, e.StashConstantV(1, i.src2.constant())); + } else { + e.ADD(e.GetNativeParam(1), SP, e.StashV(1, i.src2)); + } + e.ADD(e.GetNativeParam(0), SP, e.StashV(0, i.src1)); + e.CallNativeSafe(reinterpret_cast(EmulateVectorShl)); + e.MOV(i.dest.reg().B16(), Q0.B16()); + } +}; +EMITTER_OPCODE_TABLE(OPCODE_VECTOR_SHL, VECTOR_SHL_V128); + +// ============================================================================ +// OPCODE_VECTOR_SHR +// ============================================================================ +template ::value, int> = 0> +static uint8x16_t EmulateVectorShr(void*, std::byte src1[16], + std::byte src2[16]) { + alignas(16) T value[16 / sizeof(T)]; + alignas(16) T shamt[16 / sizeof(T)]; + + // Load NEON registers into a C array. + vst1q_u8(reinterpret_cast(value), vld1q_u8(src1)); + vst1q_u8(reinterpret_cast(shamt), vld1q_u8(src2)); + + for (size_t i = 0; i < (16 / sizeof(T)); ++i) { + value[i] = value[i] >> (shamt[i] & ((sizeof(T) * 8) - 1)); + } + + // Store result and return it. 
+ return vld1q_u8(value); +} +struct VECTOR_SHR_V128 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + switch (i.instr->flags) { + case INT8_TYPE: + EmitInt8(e, i); + break; + case INT16_TYPE: + EmitInt16(e, i); + break; + case INT32_TYPE: + EmitInt32(e, i); + break; + default: + assert_always(); + break; + } + } + + static void EmitInt8(A64Emitter& e, const EmitArgType& i) { + if (i.src2.is_constant) { + const auto& shamt = i.src2.constant(); + bool all_same = true; + for (size_t n = 0; n < 16 - n; ++n) { + if (shamt.u8[n] != shamt.u8[n + 1]) { + all_same = false; + break; + } + } + if (all_same) { + // Every count is the same, so we can use USHR + e.USHR(i.dest.reg().B16(), i.src1.reg().B16(), shamt.u8[0]); + return; + } + e.ADD(e.GetNativeParam(1), SP, e.StashConstantV(1, i.src2.constant())); + } else { + e.ADD(e.GetNativeParam(1), SP, e.StashV(1, i.src2)); + } + e.ADD(e.GetNativeParam(0), SP, e.StashV(0, i.src1)); + e.CallNativeSafe(reinterpret_cast(EmulateVectorShr)); + e.MOV(i.dest.reg().B16(), Q0.B16()); + } + + static void EmitInt16(A64Emitter& e, const EmitArgType& i) { + if (i.src2.is_constant) { + const auto& shamt = i.src2.constant(); + bool all_same = true; + for (size_t n = 0; n < 8 - n; ++n) { + if (shamt.u16[n] != shamt.u16[n + 1]) { + all_same = false; + break; + } + } + if (all_same) { + // Every count is the same, so we can use USHR + e.USHR(i.dest.reg().H8(), i.src1.reg().H8(), shamt.u16[0]); + return; + } + e.ADD(e.GetNativeParam(1), SP, e.StashConstantV(1, i.src2.constant())); + } else { + e.ADD(e.GetNativeParam(1), SP, e.StashV(1, i.src2)); + } + e.ADD(e.GetNativeParam(0), SP, e.StashV(0, i.src1)); + e.CallNativeSafe(reinterpret_cast(EmulateVectorShr)); + e.MOV(i.dest.reg().B16(), Q0.B16()); + } + + static void EmitInt32(A64Emitter& e, const EmitArgType& i) { + if (i.src2.is_constant) { + const auto& shamt = i.src2.constant(); + bool all_same = true; + for (size_t n = 0; n < 4 - n; ++n) { + if (shamt.u32[n] != shamt.u32[n + 1]) { + all_same = false; + break; + } + } + if (all_same) { + // Every count is the same, so we can use USHR + e.USHR(i.dest.reg().S4(), i.src1.reg().S4(), shamt.u32[0]); + return; + } + e.ADD(e.GetNativeParam(1), SP, e.StashConstantV(1, i.src2.constant())); + } else { + e.ADD(e.GetNativeParam(1), SP, e.StashV(1, i.src2)); + } + e.ADD(e.GetNativeParam(0), SP, e.StashV(0, i.src1)); + e.CallNativeSafe(reinterpret_cast(EmulateVectorShr)); + e.MOV(i.dest.reg().B16(), Q0.B16()); + } +}; +EMITTER_OPCODE_TABLE(OPCODE_VECTOR_SHR, VECTOR_SHR_V128); + +// ============================================================================ +// OPCODE_VECTOR_SHA +// ============================================================================ +struct VECTOR_SHA_V128 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + switch (i.instr->flags) { + case INT8_TYPE: + EmitInt8(e, i); + break; + case INT16_TYPE: + EmitInt16(e, i); + break; + case INT32_TYPE: + EmitInt32(e, i); + break; + default: + assert_always(); + break; + } + } + + static void EmitInt8(A64Emitter& e, const EmitArgType& i) { + if (i.src2.is_constant) { + const auto& shamt = i.src2.constant(); + bool all_same = true; + for (size_t n = 0; n < 16 - n; ++n) { + if (shamt.u8[n] != shamt.u8[n + 1]) { + all_same = false; + break; + } + } + if (all_same) { + // Every count is the same, so we can use SSHR + e.SSHR(i.dest.reg().B16(), i.src1.reg().B16(), shamt.u8[0] & 0x7); + return; + } + e.ADD(e.GetNativeParam(1), SP, e.StashConstantV(1, i.src2.constant())); + } 
else { + e.ADD(e.GetNativeParam(1), SP, e.StashV(1, i.src2)); + } + e.ADD(e.GetNativeParam(0), SP, e.StashV(0, i.src1)); + e.CallNativeSafe(reinterpret_cast(EmulateVectorShr)); + e.MOV(i.dest.reg().B16(), Q0.B16()); + } + + static void EmitInt16(A64Emitter& e, const EmitArgType& i) { + if (i.src2.is_constant) { + const auto& shamt = i.src2.constant(); + bool all_same = true; + for (size_t n = 0; n < 8 - n; ++n) { + if (shamt.u16[n] != shamt.u16[n + 1]) { + all_same = false; + break; + } + } + if (all_same) { + // Every count is the same, so we can use SSHR + e.SSHR(i.dest.reg().H8(), i.src1.reg().H8(), shamt.u16[0] & 0xF); + return; + } + e.ADD(e.GetNativeParam(1), SP, e.StashConstantV(1, i.src2.constant())); + } else { + e.ADD(e.GetNativeParam(1), SP, e.StashV(1, i.src2)); + } + e.ADD(e.GetNativeParam(0), SP, e.StashV(0, i.src1)); + e.CallNativeSafe(reinterpret_cast(EmulateVectorShr)); + e.MOV(i.dest.reg().B16(), Q0.B16()); + } + + static void EmitInt32(A64Emitter& e, const EmitArgType& i) { + if (i.src2.is_constant) { + const auto& shamt = i.src2.constant(); + bool all_same = true; + for (size_t n = 0; n < 4 - n; ++n) { + if (shamt.u32[n] != shamt.u32[n + 1]) { + all_same = false; + break; + } + } + if (all_same) { + // Every count is the same, so we can use SSHR + e.SSHR(i.dest.reg().S4(), i.src1.reg().S4(), shamt.u32[0] & 0x1F); + return; + } + e.ADD(e.GetNativeParam(1), SP, e.StashConstantV(1, i.src2.constant())); + } else { + e.ADD(e.GetNativeParam(1), SP, e.StashV(1, i.src2)); + } + e.ADD(e.GetNativeParam(0), SP, e.StashV(0, i.src1)); + e.CallNativeSafe(reinterpret_cast(EmulateVectorShr)); + e.MOV(i.dest.reg().B16(), Q0.B16()); + } +}; +EMITTER_OPCODE_TABLE(OPCODE_VECTOR_SHA, VECTOR_SHA_V128); + +// ============================================================================ +// OPCODE_VECTOR_ROTATE_LEFT +// ============================================================================ +template ::value, int> = 0> +static uint8x16_t EmulateVectorRotateLeft(void*, std::byte src1[16], + std::byte src2[16]) { + alignas(16) T value[16 / sizeof(T)]; + alignas(16) T shamt[16 / sizeof(T)]; + + // Load NEON registers into a C array. + vst1q_u8(reinterpret_cast(value), vld1q_u8(src1)); + vst1q_u8(reinterpret_cast(shamt), vld1q_u8(src2)); + + for (size_t i = 0; i < (16 / sizeof(T)); ++i) { + value[i] = xe::rotate_left(value[i], shamt[i] & ((sizeof(T) * 8) - 1)); + } + + // Store result and return it. 
+ return vld1q_u8(value); +} +struct VECTOR_ROTATE_LEFT_V128 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + if (i.src2.is_constant) { + e.ADD(e.GetNativeParam(1), SP, e.StashConstantV(1, i.src2.constant())); + } else { + e.ADD(e.GetNativeParam(1), SP, e.StashV(1, i.src2)); + } + e.ADD(e.GetNativeParam(0), SP, e.StashV(0, i.src1)); + switch (i.instr->flags) { + case INT8_TYPE: + e.CallNativeSafe( + reinterpret_cast(EmulateVectorRotateLeft)); + break; + case INT16_TYPE: + e.CallNativeSafe( + reinterpret_cast(EmulateVectorRotateLeft)); + break; + case INT32_TYPE: + e.CallNativeSafe( + reinterpret_cast(EmulateVectorRotateLeft)); + break; + default: + assert_always(); + break; + } + e.MOV(i.dest.reg().B16(), Q0.B16()); + } +}; +EMITTER_OPCODE_TABLE(OPCODE_VECTOR_ROTATE_LEFT, VECTOR_ROTATE_LEFT_V128); + +// ============================================================================ +// OPCODE_VECTOR_AVERAGE +// ============================================================================ +struct VECTOR_AVERAGE + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + EmitCommutativeBinaryVOp( + e, i, + [&i](A64Emitter& e, const QReg& dest, const QReg& src1, + const QReg& src2) { + const TypeName part_type = + static_cast(i.instr->flags & 0xFF); + const uint32_t arithmetic_flags = i.instr->flags >> 8; + bool is_unsigned = !!(arithmetic_flags & ARITHMETIC_UNSIGNED); + switch (part_type) { + case INT8_TYPE: + if (is_unsigned) { + e.URHADD(dest.B16(), src1.B16(), src2.B16()); + } else { + e.SRHADD(dest.B16(), src1.B16(), src2.B16()); + assert_always(); + } + break; + case INT16_TYPE: + if (is_unsigned) { + e.URHADD(dest.H8(), src1.H8(), src2.H8()); + } else { + e.SRHADD(dest.H8(), src1.H8(), src2.H8()); + } + break; + case INT32_TYPE: + if (is_unsigned) { + e.URHADD(dest.S4(), src1.S4(), src2.S4()); + } else { + e.SRHADD(dest.S4(), src1.S4(), src2.S4()); + } + break; + default: + assert_unhandled_case(part_type); + break; + } + }); + } +}; +EMITTER_OPCODE_TABLE(OPCODE_VECTOR_AVERAGE, VECTOR_AVERAGE); + +// ============================================================================ +// OPCODE_INSERT +// ============================================================================ +struct INSERT_I8 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + assert_true(i.src2.is_constant); + e.MOV(i.dest.reg().Belem()[i.src2.constant() ^ 0x3], i.src3.reg()); + } +}; +struct INSERT_I16 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + assert_true(i.src2.is_constant); + e.MOV(i.dest.reg().Helem()[i.src2.constant() ^ 0x1], i.src3.reg()); + } +}; +struct INSERT_I32 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + assert_true(i.src2.is_constant); + e.MOV(i.dest.reg().Selem()[i.src2.constant()], i.src3.reg()); + } +}; +EMITTER_OPCODE_TABLE(OPCODE_INSERT, INSERT_I8, INSERT_I16, INSERT_I32); + +// ============================================================================ +// OPCODE_EXTRACT +// ============================================================================ +// TODO(benvanik): sequence extract/splat: +// v0.i32 = extract v0.v128, 0 +// v0.v128 = splat v0.i32 +// This can be a single broadcast. 
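+// As a rough sketch (not emitted anywhere yet), such a fused extract/splat
+// could collapse into a single lane broadcast on A64, e.g.:
+//   e.DUP(dest.reg().S4(), src.reg().toQ().Selem()[0]);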
+struct EXTRACT_I8 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + if (i.src2.is_constant) { + e.UMOV(i.dest, i.src1.reg().Belem()[VEC128_B(i.src2.constant())]); + } else { + // Fixup index + e.EOR(W0, i.src2, 0b11); + e.AND(W0, W0, 0x1F); + e.DUP(Q0.B16(), W0); + // Byte-table lookup + e.TBL(Q0.B16(), List{i.src1.reg().B16()}, Q0.B16()); + // Get lowest element + e.UMOV(i.dest, Q0.Belem()[0]); + } + } +}; +struct EXTRACT_I16 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + if (i.src2.is_constant) { + e.UMOV(i.dest, i.src1.reg().Helem()[VEC128_W(i.src2.constant())]); + } else { + // Fixup index + e.EOR(W0, i.src2, 0b01); + e.LSL(W0, W0, 1); + + // Replicate index as byte + e.MOV(W1, 0x01'01); + e.MUL(W0, W0, W1); + + // Byte indices + e.ADD(W0, W0, 0x01'00); + e.UXTH(W0, W0); + + // Replicate byte indices + e.DUP(Q0.H8(), W0); + // Byte-table lookup + e.TBL(Q0.B16(), List{i.src1.reg().B16()}, Q0.B16()); + // Get lowest element + e.UMOV(i.dest, Q0.Helem()[0]); + } + } +}; +struct EXTRACT_I32 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + static const vec128_t extract_table_32[4] = { + vec128b(3, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), + vec128b(7, 6, 5, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), + vec128b(11, 10, 9, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), + vec128b(15, 14, 13, 12, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), + }; + if (i.src2.is_constant) { + e.UMOV(i.dest, i.src1.reg().Selem()[VEC128_D(i.src2.constant())]); + } else { + QReg src1 = i.src1.reg(); + if (i.src1.is_constant) { + src1 = Q1; + e.LoadConstantV(src1, i.src1.constant()); + } + + e.AND(X0, i.src2.reg().toX(), 0b11); + e.LSL(X0, X0, 4); + + e.MOV(X1, reinterpret_cast(extract_table_32)); + e.LDR(Q0, X1, X0); + + // Byte-table lookup + e.TBL(Q0.B16(), List{src1.B16()}, Q0.B16()); + // Get lowest element + e.UMOV(i.dest, Q0.Selem()[0]); + } + } +}; +EMITTER_OPCODE_TABLE(OPCODE_EXTRACT, EXTRACT_I8, EXTRACT_I16, EXTRACT_I32); + +// ============================================================================ +// OPCODE_SPLAT +// ============================================================================ +// Copy a value into all elements of a vector +struct SPLAT_I8 : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + if (i.src1.is_constant) { + e.MOVI(i.dest.reg().B16(), i.src1.constant()); + } else { + e.DUP(i.dest.reg().B16(), i.src1); + } + } +}; +struct SPLAT_I16 : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + if (i.src1.is_constant) { + if ((i.src1.constant() & 0xFF'00) == 0) { + e.MOVI(i.dest.reg().H8(), i.src1.constant()); + return; + } else if ((i.src1.constant() & 0x00'FF) == 0) { + e.MOVI(i.dest.reg().H8(), i.src1.constant(), oaknut::util::LSL, 8); + return; + } + e.MOV(W0, i.src1.constant()); + e.DUP(i.dest.reg().H8(), W0); + } else { + e.DUP(i.dest.reg().H8(), i.src1); + } + } +}; +struct SPLAT_I32 : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + if (i.src1.is_constant) { + oaknut::FImm8 fp8(0); + if (f32_to_fimm8(i.src1.value->constant.u32, fp8)) { + e.FMOV(i.dest.reg().S4(), fp8); + return; + } else if ((i.src1.constant() & 0xFF'FF'FF'00) == 0) { + e.MOVI(i.dest.reg().S4(), i.src1.constant()); + return; + } else if ((i.src1.constant() & 0xFF'FF'00'FF) == 0) { + e.MOVI(i.dest.reg().S4(), i.src1.constant(), oaknut::util::LSL, 8); + return; + } else if ((i.src1.constant() & 0xFF'00'FF'FF) == 0) { + e.MOVI(i.dest.reg().S4(), i.src1.constant(), oaknut::util::LSL, 
16); + return; + } else if ((i.src1.constant() & 0x00'FF'FF'FF) == 0) { + e.MOVI(i.dest.reg().S4(), i.src1.constant(), oaknut::util::LSL, 24); + return; + } + e.MOV(W0, i.src1.constant()); + e.DUP(i.dest.reg().S4(), W0); + } else { + e.DUP(i.dest.reg().S4(), i.src1); + } + } +}; +struct SPLAT_F32 : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + if (i.src1.is_constant) { + oaknut::FImm8 fp8(0); + if (f32_to_fimm8(i.src1.value->constant.u32, fp8)) { + e.FMOV(i.dest.reg().S4(), fp8); + return; + } else if ((i.src1.value->constant.u32 & 0xFF'FF'FF'00) == 0) { + e.MOVI(i.dest.reg().S4(), i.src1.value->constant.u32); + return; + } else if ((i.src1.value->constant.u32 & 0xFF'FF'00'FF) == 0) { + e.MOVI(i.dest.reg().S4(), i.src1.value->constant.u32, oaknut::util::LSL, + 8); + return; + } else if ((i.src1.value->constant.u32 & 0xFF'00'FF'FF) == 0) { + e.MOVI(i.dest.reg().S4(), i.src1.value->constant.u32, oaknut::util::LSL, + 16); + return; + } else if ((i.src1.value->constant.u32 & 0x00'FF'FF'FF) == 0) { + e.MOVI(i.dest.reg().S4(), i.src1.value->constant.u32, oaknut::util::LSL, + 24); + return; + } + e.MOV(W0, i.src1.value->constant.i32); + e.DUP(i.dest.reg().S4(), W0); + } else { + e.DUP(i.dest.reg().S4(), i.src1.reg().toQ().Selem()[0]); + } + } +}; +EMITTER_OPCODE_TABLE(OPCODE_SPLAT, SPLAT_I8, SPLAT_I16, SPLAT_I32, SPLAT_F32); + +// ============================================================================ +// OPCODE_PERMUTE +// ============================================================================ +struct PERMUTE_I32 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + assert_true(i.instr->flags == INT32_TYPE); + // Permute words between src2 and src3. + if (i.src1.is_constant) { + // Each byte is a word-index + const uint32_t control = i.src1.constant(); + const QReg indices = Q0; + + // Word to byte index + e.MOV(W0, control * 4); + e.MOV(indices.Selem()[0], W0); + + // Widen int8 to int16 + e.ZIP1(indices.B16(), indices.B16(), indices.B16()); + // Widen int16 to int32 + e.ZIP1(indices.B16(), indices.B16(), indices.B16()); + + // Convert to byte-indices + e.MOV(W0, 0x03'02'01'00); + e.DUP(Q1.S4(), W0); + e.ADD(indices.S4(), indices.S4(), Q1.S4()); + + // Table-registers must be sequential indices + const QReg table0 = Q2; + if (i.src2.is_constant) { + e.LoadConstantV(table0, i.src2.constant()); + } else { + e.MOV(table0.B16(), i.src2.reg().B16()); + } + + const QReg table1 = Q3; + if (i.src3.is_constant) { + e.LoadConstantV(table1, i.src3.constant()); + } else { + e.MOV(table1.B16(), i.src3.reg().B16()); + } + + e.TBL(i.dest.reg().B16(), List{table0.B16(), table1.B16()}, + indices.B16()); + } else { + // Permute by non-constant. + assert_always(); + } + } +}; +struct PERMUTE_V128 + : Sequence> { + static void EmitByInt8(A64Emitter& e, const EmitArgType& i) { + // Permute bytes between src2 and src3. + // src1 is an array of indices corresponding to positions within src2 and + // src3. + if (i.src3.value->IsConstantZero()) { + if (i.src2.value->IsConstantZero()) { + // src2 & src3 are zero, so result will always be zero. 
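+        // EOR of the destination with itself is the zeroing idiom here,
+        // equivalent in effect to MOVI dest, #0.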
+ e.EOR(i.dest.reg().B16(), i.dest.reg().B16(), i.dest.reg().B16()); + return; + } + } + + const QReg indices = Q0; + if (i.src1.is_constant) { + e.LoadConstantV(indices, i.src1.constant()); + } else { + e.MOV(indices.B16(), i.src1.reg().B16()); + } + + // Indices must be endian-swapped + e.MOVI(Q1.B16(), 0b11); + e.EOR(indices.B16(), indices.B16(), Q1.B16()); + + // Modulo 32 the indices + e.MOVI(Q1.B16(), 0b0001'1111); + e.AND(indices.B16(), indices.B16(), Q1.B16()); + + // Table-registers must be sequential indices + const QReg table_lo = Q2; + if (i.src2.is_constant) { + e.LoadConstantV(table_lo, i.src2.constant()); + } else { + e.MOV(table_lo.B16(), i.src2.reg().B16()); + } + + const QReg table_hi = Q3; + if (i.src3.is_constant) { + e.LoadConstantV(table_hi, i.src3.constant()); + } else { + e.MOV(table_hi.B16(), i.src3.reg().B16()); + } + + e.TBL(i.dest.reg().B16(), List{table_lo.B16(), table_hi.B16()}, + indices.B16()); + } + + static void EmitByInt16(A64Emitter& e, const EmitArgType& i) { + // Permute bytes between src2 and src3. + // src1 is an array of indices corresponding to positions within src2 and + // src3. + if (i.src3.value->IsConstantZero()) { + if (i.src2.value->IsConstantZero()) { + // src2 & src3 are zero, so result will always be zero. + e.EOR(i.dest.reg().B16(), i.dest.reg().B16(), i.dest.reg().B16()); + return; + } + } + + const QReg indices = Q0; + if (i.src1.is_constant) { + e.LoadConstantV(indices, i.src1.constant()); + } else { + e.MOV(indices.B16(), i.src1.reg().B16()); + } + + // Indices must be endian-swapped + e.MOVI(Q1.H8(), 0b1); + e.EOR(indices.B16(), indices.B16(), Q1.B16()); + + // Modulo-16 the indices + e.MOVI(Q1.H8(), 0b0000'1111); + e.AND(indices.B16(), indices.B16(), Q1.B16()); + + // Convert int16 indices into int8 + e.MOVI(Q1.B16(), 0x02); + e.MUL(indices.H8(), indices.H8(), Q1.H8()); + + e.MOVI(Q1.H8(), 0x01, LSL, 8); + e.ADD(indices.H8(), indices.H8(), Q1.H8()); + + // Table-registers must be sequential indices + const QReg table_lo = Q2; + if (i.src2.is_constant) { + e.LoadConstantV(table_lo, i.src2.constant()); + } else { + e.MOV(table_lo.B16(), i.src2.reg().B16()); + } + + const QReg table_hi = Q3; + if (i.src3.is_constant) { + e.LoadConstantV(table_hi, i.src3.constant()); + } else { + e.MOV(table_hi.B16(), i.src3.reg().B16()); + } + + e.TBL(i.dest.reg().B16(), List{table_lo.B16(), table_hi.B16()}, + indices.B16()); + } + + static void EmitByInt32(A64Emitter& e, const EmitArgType& i) { + assert_always(); + } + + static void Emit(A64Emitter& e, const EmitArgType& i) { + switch (i.instr->flags) { + case INT8_TYPE: + EmitByInt8(e, i); + break; + case INT16_TYPE: + EmitByInt16(e, i); + break; + case INT32_TYPE: + EmitByInt32(e, i); + break; + default: + assert_unhandled_case(i.instr->flags); + return; + } + } +}; +EMITTER_OPCODE_TABLE(OPCODE_PERMUTE, PERMUTE_I32, PERMUTE_V128); + +// ============================================================================ +// OPCODE_SWIZZLE +// ============================================================================ +struct SWIZZLE + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + auto element_type = i.instr->flags; + if (element_type == INT8_TYPE) { + assert_always(); + } else if (element_type == INT16_TYPE) { + assert_always(); + } else if (element_type == INT32_TYPE || element_type == FLOAT32_TYPE) { + // Four 2-bit word-indices packed into one 8-bit value + const uint8_t swizzle_mask = static_cast(i.src2.value); + + // Convert to byte-indices + const vec128_t indice_vec = + 
vec128i(((swizzle_mask >> 0) & 0b11) * 0x04'04'04'04 + 0x03'02'01'00, + ((swizzle_mask >> 2) & 0b11) * 0x04'04'04'04 + 0x03'02'01'00, + ((swizzle_mask >> 4) & 0b11) * 0x04'04'04'04 + 0x03'02'01'00, + ((swizzle_mask >> 6) & 0b11) * 0x04'04'04'04 + 0x03'02'01'00); + + const QReg indices = Q1; + e.LoadConstantV(indices, indice_vec); + + QReg table0 = Q0; + if (i.src1.is_constant) { + e.LoadConstantV(table0, i.src1.constant()); + } else { + table0 = i.src1; + } + + e.TBL(i.dest.reg().B16(), List{table0.B16()}, indices.B16()); + } else if (element_type == INT64_TYPE || element_type == FLOAT64_TYPE) { + assert_always(); + } else { + assert_always(); + } + }; +}; +EMITTER_OPCODE_TABLE(OPCODE_SWIZZLE, SWIZZLE); + +// ============================================================================ +// OPCODE_PACK +// ============================================================================ +struct PACK : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + switch (i.instr->flags & PACK_TYPE_MODE) { + case PACK_TYPE_D3DCOLOR: + EmitD3DCOLOR(e, i); + break; + case PACK_TYPE_FLOAT16_2: + EmitFLOAT16_2(e, i); + break; + case PACK_TYPE_FLOAT16_4: + EmitFLOAT16_4(e, i); + break; + case PACK_TYPE_SHORT_2: + EmitSHORT_2(e, i); + break; + case PACK_TYPE_SHORT_4: + EmitSHORT_4(e, i); + break; + case PACK_TYPE_UINT_2101010: + EmitUINT_2101010(e, i); + break; + case PACK_TYPE_ULONG_4202020: + EmitULONG_4202020(e, i); + break; + case PACK_TYPE_8_IN_16: + Emit8_IN_16(e, i, i.instr->flags); + break; + case PACK_TYPE_16_IN_32: + Emit16_IN_32(e, i, i.instr->flags); + break; + default: + assert_unhandled_case(i.instr->flags); + break; + } + } + static void EmitD3DCOLOR(A64Emitter& e, const EmitArgType& i) { + assert_true(i.src2.value->IsConstantZero()); + QReg src = i.src1; + if (i.src1.is_constant) { + src = i.dest; + e.LoadConstantV(src, i.src1.constant()); + } + + const XReg VConstData = X3; + e.MOV(VConstData, e.GetVConstPtr()); + + // Saturate to [3,3....] so that only values between 3...[00] and 3...[FF] + // are valid - max before min to pack NaN as zero (5454082B is heavily + // affected by the order - packs 0xFFFFFFFF in matrix code to get a 0 + // constant). + e.LDR(Q0, VConstData, e.GetVConstOffset(V3333)); + e.FMAX(i.dest.reg().S4(), i.dest.reg().S4(), Q0.S4()); + + e.LDR(Q0, VConstData, e.GetVConstOffset(VPackD3DCOLORSat)); + e.FMIN(i.dest.reg().S4(), src.S4(), Q0.S4()); + // Extract bytes. + // RGBA (XYZW) -> ARGB (WXYZ) + // w = ((src1.uw & 0xFF) << 24) | ((src1.ux & 0xFF) << 16) | + // ((src1.uy & 0xFF) << 8) | (src1.uz & 0xFF) + e.LDR(Q0, VConstData, e.GetVConstOffset(VPackD3DCOLOR)); + e.TBL(i.dest.reg().B16(), List{i.dest.reg().B16()}, Q0.B16()); + } + static uint8x16_t EmulateFLOAT16_2(void*, std::byte src1[16]) { + alignas(16) float a[4]; + alignas(16) uint16_t b[8]; + vst1q_u8(a, vld1q_u8(src1)); + std::memset(b, 0, sizeof(b)); + + for (int i = 0; i < 2; i++) { + b[7 - i] = half_float::detail::float2half(a[i]); + } + + return vld1q_u8(b); + } + static void EmitFLOAT16_2(A64Emitter& e, const EmitArgType& i) { + assert_true(i.src2.value->IsConstantZero()); + // http://blogs.msdn.com/b/chuckw/archive/2012/09/11/directxmath-f16c-and-fma.aspx + // dest = [(src1.x | src1.y), 0, 0, 0] + + if (e.IsFeatureEnabled(kA64EmitF16C)) { + const QReg src1 = i.src1.is_constant ? 
Q0 : i.src1; + if (i.src1.is_constant) { + e.LoadConstantV(src1, i.src1.constant()); + } + e.FCVTN(i.dest.reg().toD().H4(), src1.S4()); + e.MOVI(Q0.B16(), 0); + e.EXT(i.dest.reg().B16(), Q0.B16(), i.dest.reg().B16(), 4); + e.REV32(i.dest.reg().H8(), i.dest.reg().H8()); + return; + } + + if (i.src1.is_constant) { + e.ADD(e.GetNativeParam(0), SP, e.StashConstantV(0, i.src1.constant())); + } else { + e.ADD(e.GetNativeParam(0), SP, e.StashV(0, i.src1)); + } + e.CallNativeSafe(reinterpret_cast(EmulateFLOAT16_2)); + e.MOV(i.dest.reg().B16(), Q0.B16()); + } + static uint8x16_t EmulateFLOAT16_4(void*, std::byte src1[16]) { + alignas(16) float a[4]; + alignas(16) uint16_t b[8]; + vst1q_u8(a, vld1q_u8(src1)); + std::memset(b, 0, sizeof(b)); + + for (int i = 0; i < 4; i++) { + b[7 - (i ^ 2)] = + half_float::detail::float2half(a[i]); + } + + return vld1q_u8(b); + } + static void EmitFLOAT16_4(A64Emitter& e, const EmitArgType& i) { + assert_true(i.src2.value->IsConstantZero()); + // http://blogs.msdn.com/b/chuckw/archive/2012/09/11/directxmath-f16c-and-fma.aspx + // dest = [(src1.z | src1.w), (src1.x | src1.y), 0, 0] + + if (e.IsFeatureEnabled(kA64EmitF16C)) { + const QReg src1 = i.src1.is_constant ? Q0 : i.src1; + if (i.src1.is_constant) { + e.LoadConstantV(src1, i.src1.constant()); + } + e.FCVTN(i.dest.reg().toD().H4(), src1.S4()); + e.EXT(i.dest.reg().B16(), i.dest.reg().B16(), i.dest.reg().B16(), 8); + e.REV32(i.dest.reg().H8(), i.dest.reg().H8()); + return; + } + + if (i.src1.is_constant) { + e.ADD(e.GetNativeParam(0), SP, e.StashConstantV(0, i.src1.constant())); + } else { + e.ADD(e.GetNativeParam(0), SP, e.StashV(0, i.src1)); + } + e.CallNativeSafe(reinterpret_cast(EmulateFLOAT16_4)); + e.MOV(i.dest.reg().B16(), Q0.B16()); + } + static void EmitSHORT_2(A64Emitter& e, const EmitArgType& i) { + assert_true(i.src2.value->IsConstantZero()); + QReg src = i.src1; + if (i.src1.is_constant) { + src = i.dest; + e.LoadConstantV(src, i.src1.constant()); + } + const XReg VConstData = X3; + e.MOV(VConstData, e.GetVConstPtr()); + + // Saturate + e.LDR(Q1, VConstData, e.GetVConstOffset(VPackSHORT_Min)); + e.FMAX(i.dest.reg().S4(), src.S4(), Q1.S4()); + + e.LDR(Q1, VConstData, e.GetVConstOffset(VPackSHORT_Max)); + e.FMIN(i.dest.reg().S4(), i.dest.reg().S4(), Q1.S4()); + + // Pack + e.LDR(Q1, VConstData, e.GetVConstOffset(VPackSHORT_2)); + e.TBL(i.dest.reg().B16(), oaknut::List{i.dest.reg().B16()}, Q1.B16()); + } + static void EmitSHORT_4(A64Emitter& e, const EmitArgType& i) { + assert_true(i.src2.value->IsConstantZero()); + QReg src = i.src1; + if (i.src1.is_constant) { + src = i.dest; + e.LoadConstantV(src, i.src1.constant()); + } + const XReg VConstData = X3; + e.MOV(VConstData, e.GetVConstPtr()); + + // Saturate + e.LDR(Q1, VConstData, e.GetVConstOffset(VPackSHORT_Min)); + e.FMAX(i.dest.reg().S4(), src.S4(), Q1.S4()); + + e.LDR(Q1, VConstData, e.GetVConstOffset(VPackSHORT_Max)); + e.FMIN(i.dest.reg().S4(), i.dest.reg().S4(), Q1.S4()); + + // Pack + e.LDR(Q1, VConstData, e.GetVConstOffset(VPackSHORT_4)); + e.TBL(i.dest.reg().B16(), oaknut::List{i.dest.reg().B16()}, Q1.B16()); + } + static void EmitUINT_2101010(A64Emitter& e, const EmitArgType& i) { + // https://www.opengl.org/registry/specs/ARB/vertex_type_2_10_10_10_rev.txt + // XYZ are 10 bits, signed and saturated. + // W is 2 bits, unsigned and saturated. + const QReg src = i.dest; + if (i.src1.is_constant) { + e.LoadConstantV(src, i.src1.constant()); + } + const XReg VConstData = X3; + e.MOV(VConstData, e.GetVConstPtr()); + + // Saturate. 
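+    // FMAX against the unpacked minimum and FMIN against the unpacked maximum
+    // clamp X/Y/Z to the signed 10-bit range and W to the unsigned 2-bit
+    // range before the integer bits are masked and shifted into place.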
+ e.LDR(Q1, VConstData, e.GetVConstOffset(VPackUINT_2101010_MinUnpacked)); + e.FMAX(i.dest.reg().S4(), src.S4(), Q1.S4()); + + e.LDR(Q1, VConstData, e.GetVConstOffset(VPackUINT_2101010_MaxUnpacked)); + e.FMIN(i.dest.reg().S4(), i.dest.reg().S4(), Q1.S4()); + + // Remove the unneeded bits of the floats. + e.LDR(Q1, VConstData, e.GetVConstOffset(VPackUINT_2101010_MaskUnpacked)); + e.AND(i.dest.reg().B16(), i.dest.reg().B16(), Q1.B16()); + + // Shift the components up. + e.LDR(Q1, VConstData, e.GetVConstOffset(VPackUINT_2101010_Shift)); + e.USHL(i.dest.reg().S4(), i.dest.reg().S4(), Q1.S4()); + + // Combine the components. + e.LoadConstantV(Q1, vec128i(0x03'02'01'00 + 0x04'04'04'04 * 2, + 0x03'02'01'00 + 0x04'04'04'04 * 3, + 0x03'02'01'00 + 0x04'04'04'04 * 0, + 0x03'02'01'00 + 0x04'04'04'04 * 1)); + e.TBL(Q0.B16(), oaknut::List{i.dest.reg().B16()}, Q1.B16()); + e.EOR(i.dest.reg().B16(), i.dest.reg().B16(), Q0.B16()); + + e.LoadConstantV(Q1, vec128i(0x03'02'01'00 + 0x04'04'04'04 * 1, + 0x03'02'01'00 + 0x04'04'04'04 * 0, + 0x03'02'01'00 + 0x04'04'04'04 * 3, + 0x03'02'01'00 + 0x04'04'04'04 * 2)); + e.TBL(Q0.B16(), oaknut::List{i.dest.reg().B16()}, Q1.B16()); + e.EOR(i.dest.reg().B16(), i.dest.reg().B16(), Q0.B16()); + } + static void EmitULONG_4202020(A64Emitter& e, const EmitArgType& i) { + // XYZ are 20 bits, signed and saturated. + // W is 4 bits, unsigned and saturated. + QReg src = i.src1; + if (i.src1.is_constant) { + src = i.dest; + e.LoadConstantV(src, i.src1.constant()); + } + const XReg VConstData = X3; + e.MOV(VConstData, e.GetVConstPtr()); + + // Saturate. + e.LDR(Q1, VConstData, e.GetVConstOffset(VPackULONG_4202020_MinUnpacked)); + e.FMAX(i.dest.reg().S4(), src.S4(), Q1.S4()); + + e.LDR(Q1, VConstData, e.GetVConstOffset(VPackULONG_4202020_MaxUnpacked)); + e.FMIN(i.dest.reg().S4(), i.dest.reg().S4(), Q1.S4()); + + // Remove the unneeded bits of the floats (so excess nibbles will also be + // cleared). + e.LDR(Q1, VConstData, e.GetVConstOffset(VPackULONG_4202020_MaskUnpacked)); + e.AND(i.dest.reg().B16(), i.dest.reg().B16(), Q1.B16()); + + // Store Y and W shifted left by 4 so vpshufb can be used with them. + e.SHL(Q0.S4(), i.dest.reg().S4(), 4); + + // Place XZ where they're supposed to be. + e.LDR(Q1, VConstData, e.GetVConstOffset(VPackULONG_4202020_PermuteXZ)); + e.TBL(i.dest.reg().B16(), oaknut::List{i.dest.reg().B16()}, Q1.B16()); + // Place YW. + e.LDR(Q1, VConstData, e.GetVConstOffset(VPackULONG_4202020_PermuteYW)); + e.TBL(Q0.B16(), oaknut::List{Q0.B16()}, Q1.B16()); + // Merge XZ and YW. + e.EOR(i.dest.reg().B16(), i.dest.reg().B16(), Q0.B16()); + } + static void Emit8_IN_16(A64Emitter& e, const EmitArgType& i, uint32_t flags) { + if (IsPackInUnsigned(flags)) { + if (IsPackOutUnsigned(flags)) { + if (IsPackOutSaturate(flags)) { + // unsigned -> unsigned + saturate + const QReg src1 = i.src1.is_constant ? Q0 : i.src1; + if (i.src1.is_constant) { + e.LoadConstantV(src1, i.src1.constant()); + } + + const QReg src2 = i.src2.is_constant ? 
Q1 : i.src2; + if (i.src2.is_constant) { + e.LoadConstantV(src2, i.src2.constant()); + } + e.UQXTN(i.dest.reg().toD().B8(), src2.H8()); + e.UQXTN2(i.dest.reg().B16(), src1.H8()); + + e.REV32(i.dest.reg().H8(), i.dest.reg().H8()); + e.EXT(i.dest.reg().B16(), i.dest.reg().B16(), i.dest.reg().B16(), 8); + } else { + // unsigned -> unsigned + e.XTN(i.dest.reg().toD().B8(), i.src2.reg().H8()); + e.XTN2(i.dest.reg().B16(), i.src1.reg().H8()); + + e.REV32(i.dest.reg().H8(), i.dest.reg().H8()); + e.EXT(i.dest.reg().B16(), i.dest.reg().B16(), i.dest.reg().B16(), 8); + } + } else { + if (IsPackOutSaturate(flags)) { + // unsigned -> signed + saturate + assert_always(); + } else { + // unsigned -> signed + assert_always(); + } + } + } else { + if (IsPackOutUnsigned(flags)) { + if (IsPackOutSaturate(flags)) { + // signed -> unsigned + saturate + const QReg src1 = i.src1.is_constant ? Q0 : i.src1; + if (i.src1.is_constant) { + e.LoadConstantV(src1, i.src1.constant()); + } + + const QReg src2 = i.src2.is_constant ? Q1 : i.src2; + if (i.src2.is_constant) { + e.LoadConstantV(src2, i.src2.constant()); + } + + e.SQXTUN(i.dest.reg().toD().B8(), src2.H8()); + e.SQXTUN2(i.dest.reg().B16(), src1.H8()); + + e.REV32(i.dest.reg().H8(), i.dest.reg().H8()); + e.EXT(i.dest.reg().B16(), i.dest.reg().B16(), i.dest.reg().B16(), 8); + } else { + // signed -> unsigned + assert_always(); + } + } else { + if (IsPackOutSaturate(flags)) { + // signed -> signed + saturate + e.SQXTN(i.dest.reg().toD().B8(), i.src2.reg().H8()); + e.SQXTN2(i.dest.reg().B16(), i.src1.reg().H8()); + + e.REV32(i.dest.reg().H8(), i.dest.reg().H8()); + e.EXT(i.dest.reg().B16(), i.dest.reg().B16(), i.dest.reg().B16(), 8); + } else { + // signed -> signed + assert_always(); + } + } + } + } + // Pack 2 32-bit vectors into a 16-bit vector. + static void Emit16_IN_32(A64Emitter& e, const EmitArgType& i, + uint32_t flags) { + // TODO(benvanik): handle src2 (or src1) being constant zero + if (IsPackInUnsigned(flags)) { + if (IsPackOutUnsigned(flags)) { + if (IsPackOutSaturate(flags)) { + // unsigned -> unsigned + saturate + const QReg src1 = i.src1.is_constant ? Q0 : i.src1; + if (i.src1.is_constant) { + e.LoadConstantV(src1, i.src1.constant()); + } + + const QReg src2 = i.src2.is_constant ? Q1 : i.src2; + if (i.src2.is_constant) { + e.LoadConstantV(src2, i.src2.constant()); + } + + e.UQXTN(i.dest.reg().toD().H4(), src2.S4()); + e.UQXTN2(i.dest.reg().H8(), src1.S4()); + + e.REV32(i.dest.reg().H8(), i.dest.reg().H8()); + e.EXT(i.dest.reg().B16(), i.dest.reg().B16(), i.dest.reg().B16(), 8); + } else { + // unsigned -> unsigned + e.XTN(i.dest.reg().toD().H4(), i.src2.reg().S4()); + e.XTN2(i.dest.reg().H8(), i.src1.reg().S4()); + + e.REV32(i.dest.reg().H8(), i.dest.reg().H8()); + e.EXT(i.dest.reg().B16(), i.dest.reg().B16(), i.dest.reg().B16(), 8); + } + } else { + if (IsPackOutSaturate(flags)) { + // unsigned -> signed + saturate + assert_always(); + } else { + // unsigned -> signed + assert_always(); + } + } + } else { + if (IsPackOutUnsigned(flags)) { + if (IsPackOutSaturate(flags)) { + // signed -> unsigned + saturate + e.SQXTUN(i.dest.reg().toD().H4(), i.src2.reg().S4()); + e.SQXTUN2(i.dest.reg().H8(), i.src1.reg().S4()); + + e.REV32(i.dest.reg().H8(), i.dest.reg().H8()); + e.EXT(i.dest.reg().B16(), i.dest.reg().B16(), i.dest.reg().B16(), 8); + } else { + // signed -> unsigned + assert_always(); + } + } else { + if (IsPackOutSaturate(flags)) { + // signed -> signed + saturate + const QReg src1 = i.src1.is_constant ? 
Q0 : i.src1; + if (i.src1.is_constant) { + e.LoadConstantV(src1, i.src1.constant()); + } + + const QReg src2 = i.src2.is_constant ? Q1 : i.src2; + if (i.src2.is_constant) { + e.LoadConstantV(src2, i.src2.constant()); + } + e.SQXTN(i.dest.reg().toD().H4(), src2.S4()); + e.SQXTN2(i.dest.reg().H8(), src1.S4()); + + e.REV32(i.dest.reg().H8(), i.dest.reg().H8()); + e.EXT(i.dest.reg().B16(), i.dest.reg().B16(), i.dest.reg().B16(), 8); + } else { + // signed -> signed + assert_always(); + } + } + } + } +}; +EMITTER_OPCODE_TABLE(OPCODE_PACK, PACK); + +// ============================================================================ +// OPCODE_UNPACK +// ============================================================================ +struct UNPACK : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + switch (i.instr->flags & PACK_TYPE_MODE) { + case PACK_TYPE_D3DCOLOR: + EmitD3DCOLOR(e, i); + break; + case PACK_TYPE_FLOAT16_2: + EmitFLOAT16_2(e, i); + break; + case PACK_TYPE_FLOAT16_4: + EmitFLOAT16_4(e, i); + break; + case PACK_TYPE_SHORT_2: + EmitSHORT_2(e, i); + break; + case PACK_TYPE_SHORT_4: + EmitSHORT_4(e, i); + break; + case PACK_TYPE_UINT_2101010: + EmitUINT_2101010(e, i); + break; + case PACK_TYPE_ULONG_4202020: + EmitULONG_4202020(e, i); + break; + case PACK_TYPE_8_IN_16: + Emit8_IN_16(e, i, i.instr->flags); + break; + case PACK_TYPE_16_IN_32: + Emit16_IN_32(e, i, i.instr->flags); + break; + default: + assert_unhandled_case(i.instr->flags); + break; + } + } + static void EmitD3DCOLOR(A64Emitter& e, const EmitArgType& i) { + // ARGB (WXYZ) -> RGBA (XYZW) + const XReg VConstData = X3; + e.MOV(VConstData, e.GetVConstPtr()); + + QReg src(0); + + if (i.src1.is_constant) { + if (i.src1.value->IsConstantZero()) { + e.FMOV(i.dest.reg().S4(), FImm8(0, 7, 0)); + return; + } + src = i.dest; + e.LoadConstantV(src, i.src1.constant()); + } else { + src = i.src1; + } + // src = ZZYYXXWW + // Unpack to 000000ZZ,000000YY,000000XX,000000WW + e.LDR(Q1, VConstData, e.GetVConstOffset(VUnpackD3DCOLOR)); + e.TBL(i.dest.reg().B16(), oaknut::List{src.B16()}, Q1.B16()); + // Add 1.0f to each. + e.FMOV(Q1.S4(), FImm8(0, 7, 0)); + e.EOR(i.dest.reg().B16(), i.dest.reg().B16(), Q1.B16()); + // To convert to 0 to 1, games multiply by 0x47008081 and add 0xC7008081. + } + static uint8x16_t EmulateFLOAT16_2(void*, std::byte src1[16]) { + alignas(16) uint16_t a[4]; + alignas(16) float b[8]; + vst1q_u8(a, vld1q_u8(src1)); + std::memset(b, 0, sizeof(b)); + + for (int i = 0; i < 2; i++) { + b[i] = half_float::detail::half2float(a[VEC128_W(6 + i)]); + } + + // Constants, or something + b[2] = 0.f; + b[3] = 1.f; + + return vld1q_u8(b); + } + static void EmitFLOAT16_2(A64Emitter& e, const EmitArgType& i) { + // 1 bit sign, 5 bit exponent, 10 bit mantissa + // D3D10 half float format + + if (e.IsFeatureEnabled(kA64EmitF16C)) { + const QReg src1 = i.src1.is_constant ? 
Q0 : i.src1; + if (i.src1.is_constant) { + e.LoadConstantV(src1, i.src1.constant()); + } + + // Move the upper 4 bytes to the lower 4 bytes, zero the rest + e.EOR(Q0.B16(), Q0.B16(), Q0.B16()); + e.EXT(i.dest.reg().B16(), i.dest.reg().B16(), Q0.B16(), 12); + + e.FCVTL(i.dest.reg().S4(), i.dest.reg().toD().H4()); + e.REV64(i.dest.reg().S4(), i.dest.reg().S4()); + + // Write 1.0 to element 3 + e.FMOV(S0, oaknut::FImm8(0, 7, 0)); + e.MOV(i.dest.reg().Selem()[3], Q0.Selem()[0]); + return; + } + + if (i.src1.is_constant) { + e.ADD(e.GetNativeParam(0), SP, e.StashConstantV(0, i.src1.constant())); + } else { + e.ADD(e.GetNativeParam(0), SP, e.StashV(0, i.src1)); + } + e.CallNativeSafe(reinterpret_cast(EmulateFLOAT16_2)); + e.MOV(i.dest.reg().B16(), Q0.B16()); + } + static uint8x16_t EmulateFLOAT16_4(void*, std::byte src1[16]) { + alignas(16) uint16_t a[4]; + alignas(16) float b[8]; + vst1q_u8(a, vld1q_u8(src1)); + + for (int i = 0; i < 4; i++) { + b[i] = half_float::detail::half2float(a[VEC128_W(4 + i)]); + } + + return vld1q_u8(b); + } + static void EmitFLOAT16_4(A64Emitter& e, const EmitArgType& i) { + // src = [(dest.x | dest.y), (dest.z | dest.w), 0, 0] + if (e.IsFeatureEnabled(kA64EmitF16C)) { + const QReg src1 = i.src1.is_constant ? Q0 : i.src1; + if (i.src1.is_constant) { + e.LoadConstantV(src1, i.src1.constant()); + } + e.EXT(i.dest.reg().B16(), i.dest.reg().B16(), i.src1.reg().B16(), 8); + e.REV32(i.dest.reg().H8(), i.dest.reg().H8()); + e.FCVTL(i.dest.reg().S4(), i.dest.reg().toD().H4()); + return; + } + + if (i.src1.is_constant) { + e.ADD(e.GetNativeParam(0), SP, e.StashConstantV(0, i.src1.constant())); + } else { + e.ADD(e.GetNativeParam(0), SP, e.StashV(0, i.src1)); + } + e.CallNativeSafe(reinterpret_cast(EmulateFLOAT16_4)); + e.MOV(i.dest.reg().B16(), Q0.B16()); + } + static void EmitSHORT_2(A64Emitter& e, const EmitArgType& i) { + // (VD.x) = 3.0 + (VB.x>>16)*2^-22 + // (VD.y) = 3.0 + (VB.x)*2^-22 + // (VD.z) = 0.0 + // (VD.w) = 1.0 (games splat W after unpacking to get vectors of 1.0f) + // src is (xx,xx,xx,VALUE) + const XReg VConstData = X3; + e.MOV(VConstData, e.GetVConstPtr()); + + QReg src(0); + if (i.src1.is_constant) { + if (i.src1.value->IsConstantZero()) { + src = i.dest; + e.LDR(i.dest, VConstData, e.GetVConstOffset(V3301)); + return; + } + // TODO(benvanik): check other common constants/perform shuffle/or here. + src = i.src1; + e.LoadConstantV(src, i.src1.constant()); + } else { + src = i.src1; + } + // Shuffle bytes. + e.LDR(Q1, VConstData, e.GetVConstOffset(VUnpackSHORT_2)); + e.TBL(i.dest.reg().B16(), oaknut::List{src.B16()}, Q1.B16()); + + // If negative, make smaller than 3 - sign extend before adding. + e.SHL(i.dest.reg().S4(), i.dest.reg().S4(), 16); + e.SSHR(i.dest.reg().S4(), i.dest.reg().S4(), 16); + + // Add 3,3,0,1. + e.LDR(Q1, VConstData, e.GetVConstOffset(V3301)); + e.ADD(i.dest.reg().S4(), i.dest.reg().S4(), Q1.S4()); + + // Return quiet NaNs in case of negative overflow. 
+ e.LDR(Q1, VConstData, e.GetVConstOffset(VUnpackSHORT_Overflow)); + e.CMEQ(Q0.S4(), i.dest.reg().S4(), Q1.S4()); + + e.LDR(Q1, VConstData, e.GetVConstOffset(VQNaN)); + e.BSL(Q0.B16(), Q1.B16(), i.dest.reg().B16()); + e.MOV(i.dest.reg().B16(), Q0.B16()); + } + static void EmitSHORT_4(A64Emitter& e, const EmitArgType& i) { + // (VD.x) = 3.0 + (VB.x>>16)*2^-22 + // (VD.y) = 3.0 + (VB.x)*2^-22 + // (VD.z) = 3.0 + (VB.y>>16)*2^-22 + // (VD.w) = 3.0 + (VB.y)*2^-22 + // src is (xx,xx,VALUE,VALUE) + + const XReg VConstData = X3; + e.MOV(VConstData, e.GetVConstPtr()); + + QReg src(0); + if (i.src1.is_constant) { + if (i.src1.value->IsConstantZero()) { + e.LDR(i.dest, VConstData, e.GetVConstOffset(V3333)); + return; + } + // TODO(benvanik): check other common constants/perform shuffle/or here. + src = i.dest; + e.LoadConstantV(src, i.src1.constant()); + } else { + src = i.src1; + } + // Shuffle bytes. + e.LDR(Q1, VConstData, e.GetVConstOffset(VUnpackSHORT_4)); + e.TBL(i.dest.reg().B16(), oaknut::List{src.B16()}, Q1.B16()); + + // If negative, make smaller than 3 - sign extend before adding. + e.SHL(i.dest.reg().S4(), i.dest.reg().S4(), 16); + e.SSHR(i.dest.reg().S4(), i.dest.reg().S4(), 16); + + // Add 3,3,3,3. + e.LDR(Q1, VConstData, e.GetVConstOffset(V3333)); + e.ADD(i.dest.reg().S4(), i.dest.reg().S4(), Q1.S4()); + + // Return quiet NaNs in case of negative overflow. + e.LDR(Q1, VConstData, e.GetVConstOffset(VUnpackSHORT_Overflow)); + e.CMEQ(Q0.S4(), i.dest.reg().S4(), Q1.S4()); + + e.LDR(Q1, VConstData, e.GetVConstOffset(VQNaN)); + e.BSL(Q0.B16(), Q1.B16(), i.dest.reg().B16()); + e.MOV(i.dest.reg().B16(), Q0.B16()); + } + static void EmitUINT_2101010(A64Emitter& e, const EmitArgType& i) { + const XReg VConstData = X3; + e.MOV(VConstData, e.GetVConstPtr()); + + QReg src(0); + if (i.src1.is_constant) { + if (i.src1.value->IsConstantZero()) { + e.LDR(i.dest, VConstData, e.GetVConstOffset(V3331)); + return; + } + src = i.dest; + e.LoadConstantV(src, i.src1.constant()); + } else { + src = i.src1; + } + + // Splat W. + e.DUP(i.dest.reg().S4(), src.Selem()[3]); + // Keep only the needed components. + // Red in 0-9 now, green in 10-19, blue in 20-29, alpha in 30-31. + e.LDR(Q1, VConstData, e.GetVConstOffset(VPackUINT_2101010_MaskPacked)); + e.AND(i.dest.reg().B16(), i.dest.reg().B16(), Q1.B16()); + + // Shift the components down. + e.LDR(Q1, VConstData, e.GetVConstOffset(VPackUINT_2101010_Shift)); + e.NEG(Q1.S4(), Q1.S4()); + e.USHL(i.dest.reg().S4(), i.dest.reg().S4(), Q1.S4()); + // If XYZ are negative, make smaller than 3 - sign extend XYZ before adding. + // W is unsigned. + e.SHL(i.dest.reg().S4(), i.dest.reg().S4(), 22); + e.SSHR(i.dest.reg().S4(), i.dest.reg().S4(), 22); + // Add 3,3,3,1. + e.LDR(Q1, VConstData, e.GetVConstOffset(V3331)); + e.ADD(i.dest.reg().S4(), i.dest.reg().S4(), Q1.S4()); + // Return quiet NaNs in case of negative overflow. + e.LDR(Q1, VConstData, e.GetVConstOffset(VUnpackUINT_2101010_Overflow)); + e.CMEQ(Q0.S4(), i.dest.reg().S4(), Q1.S4()); + + e.LDR(Q1, VConstData, e.GetVConstOffset(VQNaN)); + e.BSL(Q0.B16(), Q1.B16(), i.dest.reg().B16()); + e.MOV(i.dest.reg().B16(), Q0.B16()); + // To convert XYZ to -1 to 1, games multiply by 0x46004020 & sub 0x46C06030. 
+ // For W to 0 to 1, they multiply by and subtract 0x4A2AAAAB.} + } + static void EmitULONG_4202020(A64Emitter& e, const EmitArgType& i) { + const XReg VConstData = X3; + e.MOV(VConstData, e.GetVConstPtr()); + + QReg src(0); + if (i.src1.is_constant) { + if (i.src1.value->IsConstantZero()) { + e.LDR(i.dest, VConstData, e.GetVConstOffset(V3331)); + return; + } + src = i.dest; + e.LoadConstantV(src, i.src1.constant()); + } else { + src = i.src1; + } + // Extract pairs of nibbles to XZYW. XZ will have excess 4 upper bits, YW + // will have excess 4 lower bits. + e.LDR(Q1, VConstData, e.GetVConstOffset(VUnpackULONG_4202020_Permute)); + e.TBL(i.dest.reg().B16(), oaknut::List{src.B16()}, Q1.B16()); + + // Drop the excess nibble of YW. + e.USHR(Q0.S4(), i.dest.reg().S4(), 4); + // Merge XZ and YW now both starting at offset 0. + e.LoadConstantV(Q1, vec128i(3 * 0x04'04'04'04 + 0x03'02'01'00, + 2 * 0x04'04'04'04 + 0x03'02'01'00, + 1 * 0x04'04'04'04 + 0x03'02'01'00, + 0 * 0x04'04'04'04 + 0x03'02'01'00)); + e.TBL(i.dest.reg().B16(), oaknut::List{i.dest.reg().B16(), Q0.B16()}, + Q1.B16()); + + // Reorder as XYZW. + e.LoadConstantV(Q1, vec128i(3 * 0x04'04'04'04 + 0x03'02'01'00, + 1 * 0x04'04'04'04 + 0x03'02'01'00, + 2 * 0x04'04'04'04 + 0x03'02'01'00, + 0 * 0x04'04'04'04 + 0x03'02'01'00)); + e.TBL(i.dest.reg().B16(), oaknut::List{i.dest.reg().B16(), Q0.B16()}, + Q1.B16()); + // Drop the excess upper nibble in XZ and sign-extend XYZ. + e.SHL(i.dest.reg().S4(), i.dest.reg().S4(), 12); + e.SSHR(i.dest.reg().S4(), i.dest.reg().S4(), 12); + // Add 3,3,3,1. + e.LDR(Q1, VConstData, e.GetVConstOffset(V3331)); + e.ADD(i.dest.reg().S4(), i.dest.reg().S4(), Q1.S4()); + // Return quiet NaNs in case of negative overflow. + e.LDR(Q1, VConstData, e.GetVConstOffset(VUnpackULONG_4202020_Overflow)); + e.CMEQ(Q0.S4(), i.dest.reg().S4(), Q1.S4()); + + e.LDR(Q1, VConstData, e.GetVConstOffset(VQNaN)); + e.BSL(Q0.B16(), Q1.B16(), i.dest.reg().B16()); + e.MOV(i.dest.reg().B16(), Q0.B16()); + } + static void Emit8_IN_16(A64Emitter& e, const EmitArgType& i, uint32_t flags) { + assert_false(IsPackOutSaturate(flags)); + QReg src(0); + if (i.src1.is_constant) { + src = i.dest; + e.LoadConstantV(src, i.src1.constant()); + } else { + src = i.src1; + } + if (IsPackToLo(flags)) { + // Unpack to LO. + if (IsPackInUnsigned(flags)) { + if (IsPackOutUnsigned(flags)) { + // unsigned -> unsigned + assert_always(); + } else { + // unsigned -> signed + assert_always(); + } + } else { + if (IsPackOutUnsigned(flags)) { + // signed -> unsigned + assert_always(); + } else { + // signed -> signed + e.REV32(i.dest.reg().H8(), i.dest.reg().H8()); + e.SXTL2(i.dest.reg().H8(), i.dest.reg().B16()); + } + } + } else { + // Unpack to HI. + if (IsPackInUnsigned(flags)) { + if (IsPackOutUnsigned(flags)) { + // unsigned -> unsigned + assert_always(); + } else { + // unsigned -> signed + assert_always(); + } + } else { + if (IsPackOutUnsigned(flags)) { + // signed -> unsigned + assert_always(); + } else { + // signed -> signed + e.REV32(i.dest.reg().H8(), i.dest.reg().H8()); + e.SXTL(i.dest.reg().H8(), i.dest.reg().toD().B8()); + } + } + } + } + static void Emit16_IN_32(A64Emitter& e, const EmitArgType& i, + uint32_t flags) { + assert_false(IsPackOutSaturate(flags)); + QReg src(0); + if (i.src1.is_constant) { + src = i.dest; + e.LoadConstantV(src, i.src1.constant()); + } else { + src = i.src1; + } + if (IsPackToLo(flags)) { + // Unpack to LO. 
+ if (IsPackInUnsigned(flags)) { + if (IsPackOutUnsigned(flags)) { + // unsigned -> unsigned + assert_always(); + } else { + // unsigned -> signed + assert_always(); + } + } else { + if (IsPackOutUnsigned(flags)) { + // signed -> unsigned + assert_always(); + } else { + // signed -> signed + e.SXTL2(i.dest.reg().S4(), src.H8()); + } + } + } else { + // Unpack to HI. + if (IsPackInUnsigned(flags)) { + if (IsPackOutUnsigned(flags)) { + // unsigned -> unsigned + assert_always(); + } else { + // unsigned -> signed + assert_always(); + } + } else { + if (IsPackOutUnsigned(flags)) { + // signed -> unsigned + assert_always(); + } else { + // signed -> signed + e.SXTL(i.dest.reg().S4(), src.toD().H4()); + } + } + } + e.REV64(i.dest.reg().S4(), i.dest.reg().S4()); + } +}; +EMITTER_OPCODE_TABLE(OPCODE_UNPACK, UNPACK); + +} // namespace a64 +} // namespace backend +} // namespace cpu +} // namespace xe diff --git a/src/xenia/cpu/backend/a64/a64_sequences.cc b/src/xenia/cpu/backend/a64/a64_sequences.cc new file mode 100644 index 000000000..3eb60510e --- /dev/null +++ b/src/xenia/cpu/backend/a64/a64_sequences.cc @@ -0,0 +1,2788 @@ +/** + ****************************************************************************** + * Xenia : Xbox 360 Emulator Research Project * + ****************************************************************************** + * Copyright 2024 Ben Vanik. All rights reserved. * + * Released under the BSD license - see LICENSE in the root for more details. * + ****************************************************************************** + */ + +// A note about vectors: +// Xenia represents vectors as xyzw pairs, with indices 0123. +// XMM registers are xyzw pairs with indices 3210, making them more like wzyx. +// This makes things somewhat confusing. It'd be nice to just shuffle the +// registers around on load/store, however certain operations require that +// data be in the right offset. +// Basically, this identity must hold: +// shuffle(vec, b00011011) -> {x,y,z,w} => {x,y,z,w} +// All indices and operations must respect that. +// +// Memory (big endian): +// [00 01 02 03] [04 05 06 07] [08 09 0A 0B] [0C 0D 0E 0F] (x, y, z, w) +// load into xmm register: +// [0F 0E 0D 0C] [0B 0A 09 08] [07 06 05 04] [03 02 01 00] (w, z, y, x) + +#include "xenia/cpu/backend/a64/a64_sequences.h" + +#include +#include + +#include "xenia/base/assert.h" +#include "xenia/base/clock.h" +#include "xenia/base/logging.h" +#include "xenia/base/string.h" +#include "xenia/base/threading.h" +#include "xenia/cpu/backend/a64/a64_emitter.h" +#include "xenia/cpu/backend/a64/a64_op.h" +#include "xenia/cpu/backend/a64/a64_tracers.h" +#include "xenia/cpu/backend/a64/a64_util.h" +#include "xenia/cpu/hir/hir_builder.h" +#include "xenia/cpu/processor.h" + +namespace xe { +namespace cpu { +namespace backend { +namespace a64 { + +using namespace oaknut; + +// TODO(benvanik): direct usings. +using namespace xe::cpu; +using namespace xe::cpu::hir; + +using xe::cpu::hir::Instr; + +typedef bool (*SequenceSelectFn)(A64Emitter&, const Instr*); +std::unordered_map sequence_table; + +// ============================================================================ +// OPCODE_COMMENT +// ============================================================================ +struct COMMENT : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + if (IsTracingInstr()) { + auto str = reinterpret_cast(i.src1.value); + // TODO(benvanik): pass through. + // TODO(benvanik): don't just leak this memory. 
+ auto str_copy = xe_strdup(str); + e.MOV(e.GetNativeParam(0), reinterpret_cast(str_copy)); + e.CallNative(reinterpret_cast(TraceString)); + } + } +}; +EMITTER_OPCODE_TABLE(OPCODE_COMMENT, COMMENT); + +// ============================================================================ +// OPCODE_NOP +// ============================================================================ +struct NOP : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { e.NOP(); } +}; +EMITTER_OPCODE_TABLE(OPCODE_NOP, NOP); + +// ============================================================================ +// OPCODE_SOURCE_OFFSET +// ============================================================================ +struct SOURCE_OFFSET + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + e.MarkSourceOffset(i.instr); + } +}; +EMITTER_OPCODE_TABLE(OPCODE_SOURCE_OFFSET, SOURCE_OFFSET); + +// ============================================================================ +// OPCODE_ASSIGN +// ============================================================================ +struct ASSIGN_I8 : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + e.UXTB(i.dest, i.src1); + } +}; +struct ASSIGN_I16 : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + e.UXTH(i.dest, i.src1); + } +}; +struct ASSIGN_I32 : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + e.MOV(i.dest, i.src1); + } +}; +struct ASSIGN_I64 : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + e.MOV(i.dest, i.src1); + } +}; +struct ASSIGN_F32 : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + e.FMOV(i.dest, i.src1); + } +}; +struct ASSIGN_F64 : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + e.FMOV(i.dest, i.src1); + } +}; +struct ASSIGN_V128 : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + e.MOV(i.dest.reg().B16(), i.src1.reg().B16()); + } +}; +EMITTER_OPCODE_TABLE(OPCODE_ASSIGN, ASSIGN_I8, ASSIGN_I16, ASSIGN_I32, + ASSIGN_I64, ASSIGN_F32, ASSIGN_F64, ASSIGN_V128); + +// ============================================================================ +// OPCODE_CAST +// ============================================================================ +struct CAST_I32_F32 : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + e.FMOV(i.dest, i.src1); + } +}; +struct CAST_I64_F64 : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + e.FMOV(i.dest, i.src1); + } +}; +struct CAST_F32_I32 : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + e.FMOV(i.dest, i.src1); + } +}; +struct CAST_F64_I64 : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + e.FMOV(i.dest, i.src1); + } +}; +EMITTER_OPCODE_TABLE(OPCODE_CAST, CAST_I32_F32, CAST_I64_F64, CAST_F32_I32, + CAST_F64_I64); + +// ============================================================================ +// OPCODE_ZERO_EXTEND +// ============================================================================ +struct ZERO_EXTEND_I16_I8 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + e.UXTB(i.dest, i.src1); + } +}; +struct ZERO_EXTEND_I32_I8 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + e.UXTB(i.dest, i.src1); + } +}; +struct ZERO_EXTEND_I64_I8 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + e.UXTB(i.dest.reg().toW(), i.src1); + } +}; +struct ZERO_EXTEND_I32_I16 + : Sequence> { + static void 
Emit(A64Emitter& e, const EmitArgType& i) { + e.UXTH(i.dest, i.src1); + } +}; +struct ZERO_EXTEND_I64_I16 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + e.UXTH(i.dest.reg().toW(), i.src1); + } +}; +struct ZERO_EXTEND_I64_I32 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + e.MOV(i.dest.reg().toW(), i.src1); + } +}; +EMITTER_OPCODE_TABLE(OPCODE_ZERO_EXTEND, ZERO_EXTEND_I16_I8, ZERO_EXTEND_I32_I8, + ZERO_EXTEND_I64_I8, ZERO_EXTEND_I32_I16, + ZERO_EXTEND_I64_I16, ZERO_EXTEND_I64_I32); + +// ============================================================================ +// OPCODE_SIGN_EXTEND +// ============================================================================ +struct SIGN_EXTEND_I16_I8 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + e.SXTB(i.dest, i.src1); + } +}; +struct SIGN_EXTEND_I32_I8 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + e.SXTB(i.dest, i.src1); + } +}; +struct SIGN_EXTEND_I64_I8 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + e.SXTB(i.dest, i.src1); + } +}; +struct SIGN_EXTEND_I32_I16 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + e.SXTH(i.dest, i.src1); + } +}; +struct SIGN_EXTEND_I64_I16 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + e.SXTH(i.dest, i.src1); + } +}; +struct SIGN_EXTEND_I64_I32 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + e.SXTW(i.dest, i.src1.reg()); + } +}; +EMITTER_OPCODE_TABLE(OPCODE_SIGN_EXTEND, SIGN_EXTEND_I16_I8, SIGN_EXTEND_I32_I8, + SIGN_EXTEND_I64_I8, SIGN_EXTEND_I32_I16, + SIGN_EXTEND_I64_I16, SIGN_EXTEND_I64_I32); + +// ============================================================================ +// OPCODE_TRUNCATE +// ============================================================================ +struct TRUNCATE_I8_I16 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + e.UXTB(i.dest, i.src1); + } +}; +struct TRUNCATE_I8_I32 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + e.UXTB(i.dest, i.src1); + } +}; +struct TRUNCATE_I8_I64 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + e.UXTB(i.dest, i.src1.reg().toW()); + } +}; +struct TRUNCATE_I16_I32 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + e.UXTH(i.dest, i.src1); + } +}; +struct TRUNCATE_I16_I64 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + e.UXTH(i.dest, i.src1.reg().toW()); + } +}; +struct TRUNCATE_I32_I64 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + e.MOV(i.dest, i.src1.reg().toW()); + } +}; +EMITTER_OPCODE_TABLE(OPCODE_TRUNCATE, TRUNCATE_I8_I16, TRUNCATE_I8_I32, + TRUNCATE_I8_I64, TRUNCATE_I16_I32, TRUNCATE_I16_I64, + TRUNCATE_I32_I64); + +// ============================================================================ +// OPCODE_CONVERT +// ============================================================================ +struct CONVERT_I32_F32 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + // TODO(benvanik): saturation check? cvtt* (trunc?) 
+ if (i.instr->flags == ROUND_TO_ZERO) { + e.FCVTZS(i.dest, i.src1.reg().toS()); + } else { + e.FCVTNS(i.dest, i.src1.reg().toS()); + } + } +}; +struct CONVERT_I32_F64 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + // Intel returns 0x80000000 if the double value does not fit within an int32 + // ARM64 and PPC saturates the value instead + if (i.instr->flags == ROUND_TO_ZERO) { + e.FCVTZS(i.dest, i.src1.reg().toD()); + } else { + e.FCVTNS(i.dest, i.src1.reg().toD()); + } + } +}; +struct CONVERT_I64_F64 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + if (i.instr->flags == ROUND_TO_ZERO) { + e.FCVTZS(i.dest, i.src1.reg().toD()); + } else { + e.FCVTNS(i.dest, i.src1.reg().toD()); + } + } +}; +struct CONVERT_F32_I32 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + e.SCVTF(i.dest.reg().toS(), i.src1); + } +}; +struct CONVERT_F32_F64 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + e.FCVT(i.dest.reg().toS(), i.src1.reg().toD()); + } +}; +struct CONVERT_F64_I64 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + e.SCVTF(i.dest.reg().toD(), i.src1); + } +}; +struct CONVERT_F64_F32 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + // e.vcvtss2sd(i.dest, i.src1); + e.FCVT(i.dest.reg().toD(), i.src1.reg().toS()); + } +}; +EMITTER_OPCODE_TABLE(OPCODE_CONVERT, CONVERT_I32_F32, CONVERT_I32_F64, + CONVERT_I64_F64, CONVERT_F32_I32, CONVERT_F32_F64, + CONVERT_F64_I64, CONVERT_F64_F32); + +// ============================================================================ +// OPCODE_ROUND +// ============================================================================ +struct ROUND_F32 : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + switch (i.instr->flags) { + case ROUND_TO_ZERO: + e.FRINTZ(i.dest.reg().toS(), i.src1.reg().toS()); + break; + case ROUND_TO_NEAREST: + e.FRINTN(i.dest.reg().toS(), i.src1.reg().toS()); + break; + case ROUND_TO_MINUS_INFINITY: + e.FRINTM(i.dest.reg().toS(), i.src1.reg().toS()); + break; + case ROUND_TO_POSITIVE_INFINITY: + e.FRINTP(i.dest.reg().toS(), i.src1.reg().toS()); + break; + } + } +}; +struct ROUND_F64 : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + switch (i.instr->flags) { + case ROUND_TO_ZERO: + e.FRINTZ(i.dest, i.src1); + break; + case ROUND_TO_NEAREST: + e.FRINTN(i.dest, i.src1); + break; + case ROUND_TO_MINUS_INFINITY: + e.FRINTM(i.dest, i.src1); + break; + case ROUND_TO_POSITIVE_INFINITY: + e.FRINTP(i.dest, i.src1); + break; + } + } +}; +struct ROUND_V128 : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + switch (i.instr->flags) { + case ROUND_TO_ZERO: + e.FRINTZ(i.dest.reg().S4(), i.src1.reg().S4()); + break; + case ROUND_TO_NEAREST: + e.FRINTN(i.dest.reg().S4(), i.src1.reg().S4()); + break; + case ROUND_TO_MINUS_INFINITY: + e.FRINTM(i.dest.reg().S4(), i.src1.reg().S4()); + break; + case ROUND_TO_POSITIVE_INFINITY: + e.FRINTP(i.dest.reg().S4(), i.src1.reg().S4()); + break; + } + } +}; +EMITTER_OPCODE_TABLE(OPCODE_ROUND, ROUND_F32, ROUND_F64, ROUND_V128); + +// ============================================================================ +// OPCODE_LOAD_CLOCK +// ============================================================================ +struct LOAD_CLOCK : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + // When scaling is disabled and the raw clock source is selected, the code + // in the Clock class is actually 
just forwarding tick counts after one + // simple multiply and division. In that case we rather bake the scaling in + // here to cut extra function calls with CPU cache misses and stack frame + // overhead. + if (cvars::clock_no_scaling && cvars::clock_source_raw) { + auto ratio = Clock::guest_tick_ratio(); + // The 360 CPU is an in-order CPU, ARM64 usually isn't. Since it's + // resolution however is much higher than the 360's mftb instruction this + // can safely be ignored. + + // Read clock cycle count + e.MRS(i.dest, SystemReg::CNTVCT_EL0); + // Apply tick frequency scaling. + e.MOV(X0, ratio.first); + e.MUL(i.dest, i.dest, X0); + e.MOV(X0, ratio.second); + e.UDIV(i.dest, i.dest, X0); + } else { + e.CallNative(LoadClock); + e.MOV(i.dest, X0); + } + } + static uint64_t LoadClock(void* raw_context) { + return Clock::QueryGuestTickCount(); + } +}; +EMITTER_OPCODE_TABLE(OPCODE_LOAD_CLOCK, LOAD_CLOCK); + +// ============================================================================ +// OPCODE_CONTEXT_BARRIER +// ============================================================================ +struct CONTEXT_BARRIER + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) {} +}; +EMITTER_OPCODE_TABLE(OPCODE_CONTEXT_BARRIER, CONTEXT_BARRIER); + +// ============================================================================ +// OPCODE_MAX +// ============================================================================ +struct MAX_F32 : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + EmitCommutativeBinaryVOp( + e, i, [](A64Emitter& e, SReg dest, SReg src1, SReg src2) { + e.FMAX(dest, src1, src2); + }); + } +}; +struct MAX_F64 : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + EmitCommutativeBinaryVOp( + e, i, [](A64Emitter& e, DReg dest, DReg src1, DReg src2) { + e.FMAX(dest, src1, src2); + }); + } +}; +struct MAX_V128 : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + EmitCommutativeBinaryVOp( + e, i, [](A64Emitter& e, QReg dest, QReg src1, QReg src2) { + e.FMAX(dest.S4(), src1.S4(), src2.S4()); + }); + } +}; +EMITTER_OPCODE_TABLE(OPCODE_MAX, MAX_F32, MAX_F64, MAX_V128); + +// ============================================================================ +// OPCODE_MIN +// ============================================================================ +struct MIN_I8 : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + EmitCommutativeBinaryOp( + e, i, + [](A64Emitter& e, WReg dest_src, WReg src) { + e.CMP(dest_src, src); + e.CSEL(dest_src, dest_src, src, Cond::LO); + }, + [](A64Emitter& e, WReg dest_src, int32_t constant) { + e.MOV(W0, constant); + e.CMP(dest_src, W0); + e.CSEL(dest_src, dest_src, W0, Cond::LO); + }); + } +}; +struct MIN_I16 : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + EmitCommutativeBinaryOp( + e, i, + [](A64Emitter& e, WReg dest_src, WReg src) { + e.CMP(dest_src, src); + e.CSEL(dest_src, dest_src, src, Cond::LO); + }, + [](A64Emitter& e, WReg dest_src, int32_t constant) { + e.MOV(W0, constant); + e.CMP(dest_src, W0); + e.CSEL(dest_src, dest_src, W0, Cond::LO); + }); + } +}; +struct MIN_I32 : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + EmitCommutativeBinaryOp( + e, i, + [](A64Emitter& e, WReg dest_src, WReg src) { + e.CMP(dest_src, src); + e.CSEL(dest_src, dest_src, src, Cond::LO); + }, + [](A64Emitter& e, WReg dest_src, int32_t constant) { + e.MOV(W0, constant); + e.CMP(dest_src, W0); + e.CSEL(dest_src, dest_src, W0, 
Cond::LO); + }); + } +}; +struct MIN_I64 : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + EmitCommutativeBinaryOp( + e, i, + [](A64Emitter& e, XReg dest_src, XReg src) { + e.CMP(dest_src, src); + e.CSEL(dest_src, dest_src, src, Cond::LO); + }, + [](A64Emitter& e, XReg dest_src, int64_t constant) { + e.MOV(X0, constant); + e.CMP(dest_src, X0); + e.CSEL(dest_src, dest_src, X0, Cond::LO); + }); + } +}; +struct MIN_F32 : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + EmitCommutativeBinaryVOp( + e, i, [](A64Emitter& e, SReg dest, SReg src1, SReg src2) { + e.FMIN(dest, src1, src2); + }); + } +}; +struct MIN_F64 : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + EmitCommutativeBinaryVOp( + e, i, [](A64Emitter& e, DReg dest, DReg src1, DReg src2) { + e.FMIN(dest, src1, src2); + }); + } +}; +struct MIN_V128 : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + EmitCommutativeBinaryVOp( + e, i, [](A64Emitter& e, QReg dest, QReg src1, QReg src2) { + e.FMIN(dest.S4(), src1.S4(), src2.S4()); + }); + } +}; +EMITTER_OPCODE_TABLE(OPCODE_MIN, MIN_I8, MIN_I16, MIN_I32, MIN_I64, MIN_F32, + MIN_F64, MIN_V128); + +// ============================================================================ +// OPCODE_SELECT +// ============================================================================ +// dest = src1 ? src2 : src3 +// TODO(benvanik): match compare + select sequences, as often it's something +// like SELECT(VECTOR_COMPARE_SGE(a, b), a, b) +struct SELECT_I8 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + WReg src2(0); + if (i.src2.is_constant) { + src2 = W0; + e.MOV(src2, i.src2.constant()); + } else { + src2 = i.src2; + } + e.CMP(i.src1.reg().toX(), 0); + e.CSEL(i.dest, src2, i.src3, Cond::NE); + } +}; +struct SELECT_I16 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + WReg src2(0); + if (i.src2.is_constant) { + src2 = W0; + e.MOV(src2, i.src2.constant()); + } else { + src2 = i.src2; + } + e.CMP(i.src1.reg().toX(), 0); + e.CSEL(i.dest, src2, i.src3, Cond::NE); + } +}; +struct SELECT_I32 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + WReg src2(0); + if (i.src2.is_constant) { + src2 = W0; + e.MOV(src2, i.src2.constant()); + } else { + src2 = i.src2; + } + e.CMP(i.src1.reg().toX(), 0); + e.CSEL(i.dest, src2, i.src3, Cond::NE); + } +}; +struct SELECT_I64 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + XReg src2(0); + if (i.src2.is_constant) { + src2 = X0; + e.MOV(src2, i.src2.constant()); + } else { + src2 = i.src2; + } + e.CMP(i.src1.reg().toX(), 0); + e.CSEL(i.dest, src2, i.src3, Cond::NE); + } +}; +struct SELECT_F32 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + // dest = src1 != 0 ? src2 : src3 + + SReg src2 = i.src2.is_constant ? S2 : i.src2; + if (i.src2.is_constant) { + e.LoadConstantV(src2.toQ(), i.src2.constant()); + } + + SReg src3 = i.src3.is_constant ? S3 : i.src3; + if (i.src3.is_constant) { + e.LoadConstantV(src3.toQ(), i.src3.constant()); + } + + e.CMP(i.src1.reg().toX(), 0); + e.FCSEL(i.dest, src2, src3, Cond::NE); + } +}; +struct SELECT_F64 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + // dest = src1 != 0 ? src2 : src3 + + const DReg src2 = i.src2.is_constant ? D2 : i.src2; + if (i.src2.is_constant) { + e.LoadConstantV(src2.toQ(), i.src2.constant()); + } + + const DReg src3 = i.src3.is_constant ? 
D3 : i.src3; + if (i.src3.is_constant) { + e.LoadConstantV(src3.toQ(), i.src3.constant()); + } + + e.CMP(i.src1.reg().toX(), 0); + e.FCSEL(i.dest, src2, src3, Cond::NE); + } +}; +struct SELECT_V128_I8 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + // dest = src1 != 0 ? src2 : src3 + + const QReg src2 = i.src2.is_constant ? Q2 : i.src2; + if (i.src2.is_constant) { + e.LoadConstantV(src2, i.src2.constant()); + } + + const QReg src3 = i.src3.is_constant ? Q3 : i.src3; + if (i.src3.is_constant) { + e.LoadConstantV(src3, i.src3.constant()); + } + + e.CMP(i.src1.reg().toX(), 0); + e.CSETM(W0, Cond::NE); + e.DUP(i.dest.reg().S4(), W0); + e.BSL(i.dest.reg().B16(), src2.B16(), src3.B16()); + } +}; +struct SELECT_V128_V128 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + const QReg src1 = Q0; + if (i.src1.is_constant) { + e.LoadConstantV(src1, i.src1.constant()); + } else { + e.MOV(src1.B16(), i.src1.reg().B16()); + } + + const QReg src2 = i.src2.is_constant ? Q2 : i.src2; + if (i.src2.is_constant) { + e.LoadConstantV(src2, i.src2.constant()); + } + + const QReg src3 = i.src3.is_constant ? Q3 : i.src3; + if (i.src3.is_constant) { + e.LoadConstantV(src3, i.src3.constant()); + } + + // src1 ? src2 : src3; + e.BSL(src1.B16(), src3.B16(), src2.B16()); + e.MOV(i.dest.reg().B16(), src1.B16()); + } +}; +EMITTER_OPCODE_TABLE(OPCODE_SELECT, SELECT_I8, SELECT_I16, SELECT_I32, + SELECT_I64, SELECT_F32, SELECT_F64, SELECT_V128_I8, + SELECT_V128_V128); + +// ============================================================================ +// OPCODE_IS_TRUE +// ============================================================================ +struct IS_TRUE_I8 : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + e.CMP(i.src1.reg(), 0); + e.CSET(i.dest, Cond::NE); + } +}; +struct IS_TRUE_I16 : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + e.CMP(i.src1.reg(), 0); + e.CSET(i.dest, Cond::NE); + } +}; +struct IS_TRUE_I32 : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + e.CMP(i.src1.reg(), 0); + e.CSET(i.dest, Cond::NE); + } +}; +struct IS_TRUE_I64 : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + e.CMP(i.src1.reg(), 0); + e.CSET(i.dest, Cond::NE); + } +}; +struct IS_TRUE_F32 : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + e.FCMP(i.src1.reg(), 0); + e.CSET(i.dest, Cond::NE); + } +}; +struct IS_TRUE_F64 : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + e.FCMP(i.src1.reg(), 0); + e.CSET(i.dest, Cond::NE); + } +}; +struct IS_TRUE_V128 : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + e.UMAXV(Q0.toS(), i.src1.reg().S4()); + e.MOV(W0, Q0.Selem()[0]); + e.CMP(W0, 0); + e.CSET(i.dest, Cond::NE); + } +}; +EMITTER_OPCODE_TABLE(OPCODE_IS_TRUE, IS_TRUE_I8, IS_TRUE_I16, IS_TRUE_I32, + IS_TRUE_I64, IS_TRUE_F32, IS_TRUE_F64, IS_TRUE_V128); + +// ============================================================================ +// OPCODE_IS_FALSE +// ============================================================================ +struct IS_FALSE_I8 : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + e.CMP(i.src1.reg(), 0); + e.CSET(i.dest, Cond::EQ); + } +}; +struct IS_FALSE_I16 : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + e.CMP(i.src1.reg(), 0); + e.CSET(i.dest, Cond::EQ); + } +}; +struct IS_FALSE_I32 : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + 
e.CMP(i.src1.reg(), 0); + e.CSET(i.dest, Cond::EQ); + } +}; +struct IS_FALSE_I64 : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + e.CMP(i.src1.reg(), 0); + e.CSET(i.dest, Cond::EQ); + } +}; +struct IS_FALSE_F32 : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + e.FCMP(i.src1.reg(), 0); + e.CSET(i.dest, Cond::EQ); + } +}; +struct IS_FALSE_F64 : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + e.FCMP(i.src1.reg(), 0); + e.CSET(i.dest, Cond::EQ); + } +}; +struct IS_FALSE_V128 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + e.UMAXV(Q0.toS(), i.src1.reg().S4()); + e.MOV(W0, Q0.Selem()[0]); + e.CMP(W0, 0); + e.CSET(i.dest, Cond::EQ); + } +}; +EMITTER_OPCODE_TABLE(OPCODE_IS_FALSE, IS_FALSE_I8, IS_FALSE_I16, IS_FALSE_I32, + IS_FALSE_I64, IS_FALSE_F32, IS_FALSE_F64, IS_FALSE_V128); + +// ============================================================================ +// OPCODE_IS_NAN +// ============================================================================ +struct IS_NAN_F32 : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + e.FCMP(i.src1, i.src1); + e.CSET(i.dest, Cond::VS); + } +}; + +struct IS_NAN_F64 : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + e.FCMP(i.src1, i.src1); + e.CSET(i.dest, Cond::VS); + } +}; +EMITTER_OPCODE_TABLE(OPCODE_IS_NAN, IS_NAN_F32, IS_NAN_F64); + +// ============================================================================ +// OPCODE_COMPARE_EQ +// ============================================================================ +struct COMPARE_EQ_I8 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + EmitCommutativeCompareOp( + e, i, [](A64Emitter& e, WReg src1, WReg src2) { e.CMP(src1, src2); }, + [](A64Emitter& e, WReg src1, int32_t constant) { + e.MOV(W1, constant); + e.CMP(src1, W1); + }); + e.CSET(i.dest, Cond::EQ); + } +}; +struct COMPARE_EQ_I16 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + EmitCommutativeCompareOp( + e, i, [](A64Emitter& e, WReg src1, WReg src2) { e.CMP(src1, src2); }, + [](A64Emitter& e, WReg src1, int32_t constant) { + e.MOV(W1, constant); + e.CMP(src1, W1); + }); + e.CSET(i.dest, Cond::EQ); + } +}; +struct COMPARE_EQ_I32 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + EmitCommutativeCompareOp( + e, i, [](A64Emitter& e, WReg src1, WReg src2) { e.CMP(src1, src2); }, + [](A64Emitter& e, WReg src1, int32_t constant) { + e.MOV(W1, constant); + e.CMP(src1, W1); + }); + e.CSET(i.dest, Cond::EQ); + } +}; +struct COMPARE_EQ_I64 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + EmitCommutativeCompareOp( + e, i, [](A64Emitter& e, XReg src1, XReg src2) { e.CMP(src1, src2); }, + [](A64Emitter& e, XReg src1, int32_t constant) { + e.MOV(X1, constant); + e.CMP(src1, X1); + }); + e.CSET(i.dest, Cond::EQ); + } +}; +struct COMPARE_EQ_F32 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + EmitCommutativeBinaryVOp( + e, i, + [&i](A64Emitter& e, I8Op dest, const SReg& src1, const SReg& src2) { + e.FCMP(src1, src2); + }); + e.CSET(i.dest, Cond::EQ); + } +}; +struct COMPARE_EQ_F64 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + EmitCommutativeBinaryVOp( + e, i, + [&i](A64Emitter& e, I8Op dest, const DReg& src1, const DReg& src2) { + e.FCMP(src1, src2); + }); + e.CSET(i.dest, Cond::EQ); + } +}; +EMITTER_OPCODE_TABLE(OPCODE_COMPARE_EQ, COMPARE_EQ_I8, 
COMPARE_EQ_I16, + COMPARE_EQ_I32, COMPARE_EQ_I64, COMPARE_EQ_F32, + COMPARE_EQ_F64); + +// ============================================================================ +// OPCODE_COMPARE_NE +// ============================================================================ +struct COMPARE_NE_I8 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + EmitCommutativeCompareOp( + e, i, [](A64Emitter& e, WReg src1, WReg src2) { e.CMP(src1, src2); }, + [](A64Emitter& e, WReg src1, int32_t constant) { + e.MOV(W1, constant); + e.CMP(src1, W1); + }); + e.CSET(i.dest, Cond::NE); + } +}; +struct COMPARE_NE_I16 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + EmitCommutativeCompareOp( + e, i, [](A64Emitter& e, WReg src1, WReg src2) { e.CMP(src1, src2); }, + [](A64Emitter& e, WReg src1, int32_t constant) { + e.MOV(W1, constant); + e.CMP(src1, W1); + }); + e.CSET(i.dest, Cond::NE); + } +}; +struct COMPARE_NE_I32 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + EmitCommutativeCompareOp( + e, i, [](A64Emitter& e, WReg src1, WReg src2) { e.CMP(src1, src2); }, + [](A64Emitter& e, WReg src1, int32_t constant) { + e.MOV(W1, constant); + e.CMP(src1, W1); + }); + e.CSET(i.dest, Cond::NE); + } +}; +struct COMPARE_NE_I64 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + EmitCommutativeCompareOp( + e, i, [](A64Emitter& e, XReg src1, XReg src2) { e.CMP(src1, src2); }, + [](A64Emitter& e, XReg src1, int32_t constant) { + e.MOV(X1, constant); + e.CMP(src1, X1); + }); + e.CSET(i.dest, Cond::NE); + } +}; +struct COMPARE_NE_F32 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + e.FCMP(i.src1, i.src2); + e.CSET(i.dest, Cond::NE); + } +}; +struct COMPARE_NE_F64 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + e.FCMP(i.src1, i.src2); + e.CSET(i.dest, Cond::NE); + } +}; +EMITTER_OPCODE_TABLE(OPCODE_COMPARE_NE, COMPARE_NE_I8, COMPARE_NE_I16, + COMPARE_NE_I32, COMPARE_NE_I64, COMPARE_NE_F32, + COMPARE_NE_F64); + +// ============================================================================ +// OPCODE_COMPARE_* +// ============================================================================ +#define EMITTER_ASSOCIATIVE_COMPARE_INT(op, cond, inverse_cond, type, \ + reg_type) \ + struct COMPARE_##op##_##type \ + : Sequence> { \ + static void Emit(A64Emitter& e, const EmitArgType& i) { \ + EmitAssociativeCompareOp( \ + e, i, \ + [](A64Emitter& e, WReg dest, const reg_type& src1, \ + const reg_type& src2, bool inverse) { \ + e.CMP(src1, src2); \ + if (!inverse) { \ + e.CSET(dest, cond); \ + } else { \ + e.CSET(dest, inverse_cond); \ + } \ + }, \ + [](A64Emitter& e, WReg dest, const reg_type& src1, int32_t constant, \ + bool inverse) { \ + e.MOV(reg_type(1), constant); \ + e.CMP(src1, reg_type(1)); \ + if (!inverse) { \ + e.CSET(dest, cond); \ + } else { \ + e.CSET(dest, inverse_cond); \ + } \ + }); \ + } \ + }; +#define EMITTER_ASSOCIATIVE_COMPARE_XX(op, cond, inverse_cond) \ + EMITTER_ASSOCIATIVE_COMPARE_INT(op, cond, inverse_cond, I8Op, WReg); \ + EMITTER_ASSOCIATIVE_COMPARE_INT(op, cond, inverse_cond, I16Op, WReg); \ + EMITTER_ASSOCIATIVE_COMPARE_INT(op, cond, inverse_cond, I32Op, WReg); \ + EMITTER_ASSOCIATIVE_COMPARE_INT(op, cond, inverse_cond, I64Op, XReg); \ + EMITTER_OPCODE_TABLE(OPCODE_COMPARE_##op, COMPARE_##op##_I8Op, \ + COMPARE_##op##_I16Op, COMPARE_##op##_I32Op, \ + COMPARE_##op##_I64Op); +EMITTER_ASSOCIATIVE_COMPARE_XX(SLT, Cond::LT, Cond::GT); // setl, setg 
+EMITTER_ASSOCIATIVE_COMPARE_XX(SLE, Cond::LE, Cond::GE); // setle, setge +EMITTER_ASSOCIATIVE_COMPARE_XX(SGT, Cond::GT, Cond::LT); // setg, setl +EMITTER_ASSOCIATIVE_COMPARE_XX(SGE, Cond::GE, Cond::LE); // setge, setle +EMITTER_ASSOCIATIVE_COMPARE_XX(ULT, Cond::LO, Cond::HI); // setb, seta +EMITTER_ASSOCIATIVE_COMPARE_XX(ULE, Cond::LS, Cond::HS); // setbe, setae +EMITTER_ASSOCIATIVE_COMPARE_XX(UGE, Cond::HS, Cond::LS); // setae, setbe +EMITTER_ASSOCIATIVE_COMPARE_XX(UGT, Cond::HI, Cond::LO); // seta, setb + +// https://web.archive.org/web/20171129015931/https://x86.renejeschke.de/html/file_module_x86_id_288.html +// Original link: https://x86.renejeschke.de/html/file_module_x86_id_288.html +#define EMITTER_ASSOCIATIVE_COMPARE_FLT_XX(op, cond) \ + struct COMPARE_##op##_F32 \ + : Sequence> { \ + static void Emit(A64Emitter& e, const EmitArgType& i) { \ + e.FCMP(i.src1, i.src2); \ + e.CSET(i.dest, cond); \ + } \ + }; \ + struct COMPARE_##op##_F64 \ + : Sequence> { \ + static void Emit(A64Emitter& e, const EmitArgType& i) { \ + if (i.src1.is_constant) { \ + e.LoadConstantV(Q0, i.src1.constant()); \ + e.FCMP(D0, i.src2); \ + } else if (i.src2.is_constant) { \ + e.LoadConstantV(Q0, i.src2.constant()); \ + e.FCMP(i.src1, D0); \ + } else { \ + e.FCMP(i.src1, i.src2); \ + } \ + e.CSET(i.dest, cond); \ + } \ + }; \ + EMITTER_OPCODE_TABLE(OPCODE_COMPARE_##op##_FLT, COMPARE_##op##_F32, \ + COMPARE_##op##_F64); +EMITTER_ASSOCIATIVE_COMPARE_FLT_XX(SLT, Cond::LT); // setb +EMITTER_ASSOCIATIVE_COMPARE_FLT_XX(SLE, Cond::LE); // setbe +EMITTER_ASSOCIATIVE_COMPARE_FLT_XX(SGT, Cond::GT); // seta +EMITTER_ASSOCIATIVE_COMPARE_FLT_XX(SGE, Cond::GE); // setae +EMITTER_ASSOCIATIVE_COMPARE_FLT_XX(ULT, Cond::LO); // setb +EMITTER_ASSOCIATIVE_COMPARE_FLT_XX(ULE, Cond::LS); // setbe +EMITTER_ASSOCIATIVE_COMPARE_FLT_XX(UGT, Cond::HI); // seta +EMITTER_ASSOCIATIVE_COMPARE_FLT_XX(UGE, Cond::HS); // setae + +// ============================================================================ +// OPCODE_DID_SATURATE +// ============================================================================ +struct DID_SATURATE + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + // Bit 27 in the FPSR is the QC bit + e.MRS(X0, SystemReg::FPSR); + e.UBFX(i.dest, W0, 27, 1); + } +}; +EMITTER_OPCODE_TABLE(OPCODE_DID_SATURATE, DID_SATURATE); + +// ============================================================================ +// OPCODE_ADD +// ============================================================================ +// TODO(benvanik): put dest/src1|2 together. 
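For context on the integer ADD lowerings that follow: the sequences hand EmitCommutativeBinaryOp one callback for the register/register form and one for the register/constant form, and because the operation is commutative a constant on either side can presumably be routed through the same constant path after being materialized into a scratch register. Below is a minimal standalone analogue of that dispatch with made-up types and names (it is not the emitter's real helper), just to make the shape explicit.

// Illustrative sketch only: not part of the patch and not the real
// EmitCommutativeBinaryOp. It only shows the dispatch shape the ADD/MIN/MUL
// sequences rely on: a constant on either side of a commutative op can share
// one "materialize into scratch, then operate" path.
#include <cstdint>
#include <functional>
#include <iostream>
#include <optional>

struct Operand {
  std::optional<int32_t> constant;  // Set when the HIR value is a constant.
  uint32_t reg = 0;                 // Otherwise an allocated register index.
};

void EmitCommutativeBinaryOpSketch(
    const Operand& src1, const Operand& src2,
    const std::function<void(uint32_t, uint32_t)>& reg_reg,
    const std::function<void(uint32_t, int32_t)>& reg_const) {
  if (src1.constant) {
    // Commutativity: swap so the constant always lands in the second slot.
    reg_const(src2.reg, *src1.constant);
  } else if (src2.constant) {
    reg_const(src1.reg, *src2.constant);
  } else {
    reg_reg(src1.reg, src2.reg);
  }
}

int main() {
  Operand a{std::nullopt, 5};
  Operand b{42, 0};
  EmitCommutativeBinaryOpSketch(
      a, b,
      [](uint32_t d, uint32_t s) {
        std::cout << "ADD W" << d << ", W" << d << ", W" << s << "\n";
      },
      [](uint32_t d, int32_t c) {
        // Mirrors the pattern in EmitAddXX: MOV scratch, #c; ADD d, d, scratch.
        std::cout << "MOV W1, #" << c << "\n"
                  << "ADD W" << d << ", W" << d << ", W1\n";
      });
  return 0;
}

The same pattern shows up in the MIN, MUL, and compare sequences above, which is why nearly every constant case starts by moving the immediate into W0/X0 or W1/X1.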
+template +void EmitAddXX(A64Emitter& e, const ARGS& i) { + SEQ::EmitCommutativeBinaryOp( + e, i, + [](A64Emitter& e, REG dest_src, REG src) { + e.ADD(dest_src, dest_src, src); + }, + [](A64Emitter& e, REG dest_src, int32_t constant) { + e.MOV(REG(1), constant); + e.ADD(dest_src, dest_src, REG(1)); + }); +} +struct ADD_I8 : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + EmitAddXX(e, i); + } +}; +struct ADD_I16 : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + EmitAddXX(e, i); + } +}; +struct ADD_I32 : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + EmitAddXX(e, i); + } +}; +struct ADD_I64 : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + EmitAddXX(e, i); + } +}; +struct ADD_F32 : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + EmitCommutativeBinaryVOp( + e, i, [](A64Emitter& e, SReg dest, SReg src1, SReg src2) { + e.FADD(dest, src1, src2); + }); + } +}; +struct ADD_F64 : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + EmitCommutativeBinaryVOp( + e, i, [](A64Emitter& e, DReg dest, DReg src1, DReg src2) { + e.FADD(dest, src1, src2); + }); + } +}; +struct ADD_V128 : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + EmitCommutativeBinaryVOp( + e, i, [](A64Emitter& e, QReg dest, QReg src1, QReg src2) { + e.FADD(dest.S4(), src1.S4(), src2.S4()); + }); + } +}; +EMITTER_OPCODE_TABLE(OPCODE_ADD, ADD_I8, ADD_I16, ADD_I32, ADD_I64, ADD_F32, + ADD_F64, ADD_V128); + +// ============================================================================ +// OPCODE_ADD_CARRY +// ============================================================================ +// TODO(benvanik): put dest/src1|2 together. +template +void EmitAddCarryXX(A64Emitter& e, const ARGS& i) { + // TODO(benvanik): faster setting? we could probably do some fun math tricks + // here to get the carry flag set. + if (i.src3.is_constant) { + e.MOV(W0, WZR); + if (i.src3.constant()) { + // Set carry + // This is implicitly "SUBS 0 - 0" + e.CMP(W0, 0); + } else { + // Clear carry + e.CMN(W0, 0); + } + } else { + // If src3 is non-zero, set the carry flag + e.CMP(i.src3.reg().toW(), 0); + e.CSET(X0, Cond::NE); + + e.MRS(X1, SystemReg::NZCV); + // Assign carry bit + e.BFI(X1, X0, 29, 1); + e.MSR(SystemReg::NZCV, X1); + } + SEQ::EmitCommutativeBinaryOp( + e, i, + [](A64Emitter& e, const REG& dest_src, const REG& src) { + e.ADC(dest_src, dest_src, src); + }, + [](A64Emitter& e, const REG& dest_src, int32_t constant) { + e.MOV(REG(1), constant); + e.ADC(dest_src, dest_src, REG(1)); + }); +} +struct ADD_CARRY_I8 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + EmitAddCarryXX(e, i); + } +}; +struct ADD_CARRY_I16 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + EmitAddCarryXX(e, i); + } +}; +struct ADD_CARRY_I32 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + EmitAddCarryXX(e, i); + } +}; +struct ADD_CARRY_I64 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + EmitAddCarryXX(e, i); + } +}; +EMITTER_OPCODE_TABLE(OPCODE_ADD_CARRY, ADD_CARRY_I8, ADD_CARRY_I16, + ADD_CARRY_I32, ADD_CARRY_I64); + +// ============================================================================ +// OPCODE_SUB +// ============================================================================ +// TODO(benvanik): put dest/src1|2 together. 
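One note on the ADD_CARRY sequences just above: AArch64 keeps the carry flag in bit 29 of the NZCV register, so when src3 is not a constant the emitted code normalizes it to 0/1 with CSET, splices that bit into NZCV with BFI, and writes it back with MSR before ADC consumes it. A small standalone sketch of the bit arithmetic involved (plain C++, not emitter code):

// Illustrative sketch only (not part of the patch): how ADD_CARRY primes the
// AArch64 carry flag. NZCV packs N=bit31, Z=bit30, C=bit29, V=bit28;
// BFI(X1, X0, 29, 1) inserts the low bit of X0 at bit position 29.
#include <cassert>
#include <cstdint>

constexpr uint64_t kNzcvCarryBit = 29;

// Equivalent of: CSET X0, NE  (after CMP src3, #0)
constexpr uint64_t cset_ne(uint64_t src3) { return src3 != 0 ? 1u : 0u; }

// Equivalent of: BFI X1, X0, #29, #1  (insert one bit at position 29)
constexpr uint64_t bfi_carry(uint64_t nzcv, uint64_t bit) {
  return (nzcv & ~(1ull << kNzcvCarryBit)) | ((bit & 1ull) << kNzcvCarryBit);
}

// What ADC then computes once the flag is in place.
constexpr uint64_t adc(uint64_t a, uint64_t b, uint64_t nzcv) {
  return a + b + ((nzcv >> kNzcvCarryBit) & 1ull);
}

int main() {
  uint64_t nzcv = 0;
  nzcv = bfi_carry(nzcv, cset_ne(/*src3=*/7));  // Non-zero src3 sets carry.
  assert(adc(1, 2, nzcv) == 4);
  nzcv = bfi_carry(nzcv, cset_ne(/*src3=*/0));  // Zero src3 clears carry.
  assert(adc(1, 2, nzcv) == 3);
  return 0;
}

The constant cases in the sequence take the cheaper route: CMP against zero (a subtract with no borrow, which sets C) to set carry, and CMN against zero (an add with no overflow of the carry, which clears C) to clear it.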
+template +void EmitSubXX(A64Emitter& e, const ARGS& i) { + SEQ::EmitAssociativeBinaryOp( + e, i, + [](A64Emitter& e, REG dest_src, REG src) { + e.SUB(dest_src, dest_src, src); + }, + [](A64Emitter& e, REG dest_src, int32_t constant) { + e.MOV(REG(1), constant); + e.SUB(dest_src, dest_src, REG(1)); + }); +} +struct SUB_I8 : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + EmitSubXX(e, i); + } +}; +struct SUB_I16 : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + EmitSubXX(e, i); + } +}; +struct SUB_I32 : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + EmitSubXX(e, i); + } +}; +struct SUB_I64 : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + EmitSubXX(e, i); + } +}; +struct SUB_F32 : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + assert_true(!i.instr->flags); + EmitAssociativeBinaryVOp( + e, i, [](A64Emitter& e, SReg dest, SReg src1, SReg src2) { + e.FSUB(dest, src1, src2); + }); + } +}; +struct SUB_F64 : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + assert_true(!i.instr->flags); + EmitAssociativeBinaryVOp( + e, i, [](A64Emitter& e, DReg dest, DReg src1, DReg src2) { + e.FSUB(dest, src1, src2); + }); + } +}; +struct SUB_V128 : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + assert_true(!i.instr->flags); + EmitAssociativeBinaryVOp( + e, i, [](A64Emitter& e, QReg dest, QReg src1, QReg src2) { + e.FSUB(dest.S4(), src1.S4(), src2.S4()); + }); + } +}; +EMITTER_OPCODE_TABLE(OPCODE_SUB, SUB_I8, SUB_I16, SUB_I32, SUB_I64, SUB_F32, + SUB_F64, SUB_V128); + +// ============================================================================ +// OPCODE_MUL +// ============================================================================ +// Sign doesn't matter here, as we don't use the high bits. +// We exploit mulx here to avoid creating too much register pressure. 
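Supporting the remark above that sign does not matter when only the low bits are kept: multiplication wraps modulo 2^N, so the truncated product is bit-identical for the signed and unsigned interpretations of the operands, and a single MUL serves both; only MUL_HI (further below) has to choose between the signed and unsigned widening forms. A quick standalone check:

// Illustrative check (not part of the patch): the low N bits of a product do
// not depend on whether the operands are treated as signed or unsigned,
// because multiplication wraps modulo 2^N. Done in unsigned arithmetic to
// avoid signed-overflow UB.
#include <cassert>
#include <cstdint>

int main() {
  const int8_t a = -3, b = 7;  // Arbitrary test values.
  // Signed interpretation, truncated to 8 bits.
  const uint8_t low_signed =
      static_cast<uint8_t>(static_cast<int32_t>(a) * static_cast<int32_t>(b));
  // Unsigned interpretation of the same bit patterns, truncated to 8 bits.
  const uint8_t low_unsigned = static_cast<uint8_t>(
      static_cast<uint32_t>(static_cast<uint8_t>(a)) *
      static_cast<uint32_t>(static_cast<uint8_t>(b)));
  assert(low_signed == low_unsigned);  // Both are (-21) mod 256 == 235.
  return 0;
}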
+struct MUL_I8 : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + if (i.src1.is_constant) { + assert_true(!i.src2.is_constant); + e.MOV(W0, i.src1.constant()); + e.MUL(i.dest, W0, i.src2); + } else if (i.src2.is_constant) { + assert_true(!i.src1.is_constant); + e.MOV(W0, i.src2.constant()); + e.MUL(i.dest, i.src1, W0); + } else { + e.MUL(i.dest, i.src1, i.src2); + } + } +}; +struct MUL_I16 : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + if (i.src1.is_constant) { + assert_true(!i.src2.is_constant); + e.MOV(W0, i.src1.constant()); + e.MUL(i.dest, W0, i.src2); + } else if (i.src2.is_constant) { + assert_true(!i.src1.is_constant); + e.MOV(W0, i.src2.constant()); + e.MUL(i.dest, i.src1, W0); + } else { + e.MUL(i.dest, i.src1, i.src2); + } + } +}; +struct MUL_I32 : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + if (i.src1.is_constant) { + assert_true(!i.src2.is_constant); + e.MOV(W0, i.src1.constant()); + e.MUL(i.dest, W0, i.src2); + } else if (i.src2.is_constant) { + assert_true(!i.src1.is_constant); + e.MOV(W0, i.src2.constant()); + e.MUL(i.dest, i.src1, W0); + } else { + e.MUL(i.dest, i.src1, i.src2); + } + } +}; +struct MUL_I64 : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + if (i.src1.is_constant) { + assert_true(!i.src2.is_constant); + e.MOV(X0, i.src1.constant()); + e.MUL(i.dest, X0, i.src2); + } else if (i.src2.is_constant) { + assert_true(!i.src1.is_constant); + e.MOV(X0, i.src2.constant()); + e.MUL(i.dest, i.src1, X0); + } else { + e.MUL(i.dest, i.src1, i.src2); + } + } +}; +struct MUL_F32 : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + assert_true(!i.instr->flags); + EmitCommutativeBinaryVOp( + e, i, [](A64Emitter& e, SReg dest, SReg src1, SReg src2) { + e.FMUL(dest, src1, src2); + }); + } +}; +struct MUL_F64 : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + assert_true(!i.instr->flags); + EmitCommutativeBinaryVOp( + e, i, [](A64Emitter& e, DReg dest, DReg src1, DReg src2) { + e.FMUL(dest, src1, src2); + }); + } +}; +struct MUL_V128 : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + assert_true(!i.instr->flags); + EmitCommutativeBinaryVOp( + e, i, [](A64Emitter& e, QReg dest, QReg src1, QReg src2) { + e.FMUL(dest.S4(), src1.S4(), src2.S4()); + }); + } +}; +EMITTER_OPCODE_TABLE(OPCODE_MUL, MUL_I8, MUL_I16, MUL_I32, MUL_I64, MUL_F32, + MUL_F64, MUL_V128); + +// ============================================================================ +// OPCODE_MUL_HI +// ============================================================================ +struct MUL_HI_I8 : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + if (i.instr->flags & ARITHMETIC_UNSIGNED) { + if (i.src1.is_constant) { + assert_true(!i.src2.is_constant); + e.MOV(W0, i.src1.constant()); + e.MUL(i.dest, W0, i.src2); + } else if (i.src2.is_constant) { + assert_true(!i.src1.is_constant); + e.MOV(W0, i.src2.constant()); + e.MUL(i.dest, i.src1, W0); + } else { + e.MUL(i.dest, i.src1, i.src2); + } + e.UBFX(i.dest, i.dest, 8, 8); + } else { + if (i.src1.is_constant) { + assert_true(!i.src2.is_constant); + e.MOV(W0, i.src1.constant()); + e.MUL(i.dest, W0, i.src2); + } else if (i.src2.is_constant) { + assert_true(!i.src1.is_constant); + e.MOV(W0, i.src2.constant()); + e.MUL(i.dest, i.src1, W0); + } else { + e.MUL(i.dest, i.src1, i.src2); + } + e.SBFX(i.dest, i.dest, 8, 8); + } + } +}; +struct MUL_HI_I16 + : Sequence> { + static void Emit(A64Emitter& 
e, const EmitArgType& i) { + if (i.instr->flags & ARITHMETIC_UNSIGNED) { + if (i.src1.is_constant) { + assert_true(!i.src2.is_constant); + e.MOV(W0, i.src1.constant()); + e.MUL(i.dest, W0, i.src2); + } else if (i.src2.is_constant) { + assert_true(!i.src1.is_constant); + e.MOV(W0, i.src2.constant()); + e.MUL(i.dest, i.src1, W0); + } else { + e.MUL(i.dest, i.src1, i.src2); + } + e.UBFX(i.dest, i.dest, 16, 16); + } else { + if (i.src1.is_constant) { + assert_true(!i.src2.is_constant); + e.MOV(W0, i.src1.constant()); + e.MUL(i.dest, W0, i.src2); + } else if (i.src2.is_constant) { + assert_true(!i.src1.is_constant); + e.MOV(W0, i.src2.constant()); + e.MUL(i.dest, i.src1, W0); + } else { + e.MUL(i.dest, i.src1, i.src2); + } + e.SBFX(i.dest, i.dest, 16, 16); + } + } +}; +struct MUL_HI_I32 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + if (i.instr->flags & ARITHMETIC_UNSIGNED) { + if (i.src1.is_constant) { + assert_true(!i.src2.is_constant); + e.MOV(W0, i.src1.constant()); + e.UMULL(X0, W0, i.src2); + e.UBFX(X0, X0, 32, 32); + e.MOV(i.dest, X0.toW()); + } else if (i.src2.is_constant) { + assert_true(!i.src1.is_constant); + e.MOV(W0, i.src2.constant()); + e.UMULL(X0, W0, i.src1); + e.UBFX(X0, X0, 32, 32); + e.MOV(i.dest, X0.toW()); + } else { + e.UMULL(X0, i.src1, i.src2); + e.UBFX(X0, X0, 32, 32); + e.MOV(i.dest, X0.toW()); + } + } else { + if (i.src1.is_constant) { + assert_true(!i.src2.is_constant); + e.MOV(W0, i.src1.constant()); + e.SMULL(X0, W0, i.src2); + e.SBFX(X0, X0, 32, 32); + e.MOV(i.dest, X0.toW()); + } else if (i.src2.is_constant) { + assert_true(!i.src1.is_constant); + e.MOV(W0, i.src2.constant()); + e.SMULL(X0, W0, i.src1); + e.SBFX(X0, X0, 32, 32); + e.MOV(i.dest, X0.toW()); + } else { + e.SMULL(X0, i.src1, i.src2); + e.SBFX(X0, X0, 32, 32); + e.MOV(i.dest, X0.toW()); + } + } + } +}; +struct MUL_HI_I64 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + if (i.instr->flags & ARITHMETIC_UNSIGNED) { + if (i.src1.is_constant) { + assert_true(!i.src2.is_constant); + e.MOV(X0, i.src1.constant()); + e.UMULH(i.dest, X0, i.src2); + } else if (i.src2.is_constant) { + assert_true(!i.src1.is_constant); + e.MOV(X0, i.src2.constant()); + e.UMULH(i.dest, i.src1, X0); + } else { + e.UMULH(i.dest, i.src1, i.src2); + } + } else { + if (i.src1.is_constant) { + assert_true(!i.src2.is_constant); + e.MOV(X0, i.src1.constant()); + e.SMULH(i.dest, X0, i.src2); + } else if (i.src2.is_constant) { + assert_true(!i.src1.is_constant); + e.MOV(X0, i.src2.constant()); + e.SMULH(i.dest, i.src1, X0); + } else { + e.SMULH(i.dest, i.src1, i.src2); + } + } + } +}; +EMITTER_OPCODE_TABLE(OPCODE_MUL_HI, MUL_HI_I8, MUL_HI_I16, MUL_HI_I32, + MUL_HI_I64); + +// ============================================================================ +// OPCODE_DIV +// ============================================================================ +struct DIV_I8 : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + if (i.instr->flags & ARITHMETIC_UNSIGNED) { + if (i.src1.is_constant) { + assert_true(!i.src2.is_constant); + e.MOV(W0, i.src1.constant()); + e.UDIV(i.dest, W0, i.src2); + } else if (i.src2.is_constant) { + assert_true(!i.src1.is_constant); + e.MOV(W0, i.src2.constant()); + e.UDIV(i.dest, i.src1, W0); + } else { + e.UDIV(i.dest, i.src1, i.src2); + } + e.UXTB(i.dest, i.dest); + } else { + if (i.src1.is_constant) { + assert_true(!i.src2.is_constant); + e.MOV(W0, i.src1.constant()); + e.SDIV(i.dest, W0, i.src2); + } else if (i.src2.is_constant) { + 
assert_true(!i.src1.is_constant); + e.MOV(W0, i.src2.constant()); + e.SDIV(i.dest, i.src1, W0); + } else { + e.SDIV(i.dest, i.src1, i.src2); + } + e.SXTB(i.dest, i.dest); + } + } +}; +struct DIV_I16 : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + if (i.instr->flags & ARITHMETIC_UNSIGNED) { + if (i.src1.is_constant) { + assert_true(!i.src2.is_constant); + e.MOV(W0, i.src1.constant()); + e.UDIV(i.dest, W0, i.src2); + } else if (i.src2.is_constant) { + assert_true(!i.src1.is_constant); + e.MOV(W0, i.src2.constant()); + e.UDIV(i.dest, i.src1, W0); + } else { + e.UDIV(i.dest, i.src1, i.src2); + } + e.UXTH(i.dest, i.dest); + } else { + if (i.src1.is_constant) { + assert_true(!i.src2.is_constant); + e.MOV(W0, i.src1.constant()); + e.SDIV(i.dest, W0, i.src2); + } else if (i.src2.is_constant) { + assert_true(!i.src1.is_constant); + e.MOV(W0, i.src2.constant()); + e.SDIV(i.dest, i.src1, W0); + } else { + e.SDIV(i.dest, i.src1, i.src2); + } + e.SXTH(i.dest, i.dest); + } + } +}; +struct DIV_I32 : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + if (i.instr->flags & ARITHMETIC_UNSIGNED) { + if (i.src1.is_constant) { + assert_true(!i.src2.is_constant); + e.MOV(W0, i.src1.constant()); + e.UDIV(i.dest, W0, i.src2); + } else if (i.src2.is_constant) { + assert_true(!i.src1.is_constant); + e.MOV(W0, i.src2.constant()); + e.UDIV(i.dest, i.src1, W0); + } else { + e.UDIV(i.dest, i.src1, i.src2); + } + } else { + if (i.src1.is_constant) { + assert_true(!i.src2.is_constant); + e.MOV(W0, i.src1.constant()); + e.SDIV(i.dest, W0, i.src2); + } else if (i.src2.is_constant) { + assert_true(!i.src1.is_constant); + e.MOV(W0, i.src2.constant()); + e.SDIV(i.dest, i.src1, W0); + } else { + e.SDIV(i.dest, i.src1, i.src2); + } + } + } +}; +struct DIV_I64 : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + if (i.instr->flags & ARITHMETIC_UNSIGNED) { + if (i.src1.is_constant) { + assert_true(!i.src2.is_constant); + e.MOV(X0, i.src1.constant()); + e.UDIV(i.dest, X0, i.src2); + } else if (i.src2.is_constant) { + assert_true(!i.src1.is_constant); + e.MOV(X0, i.src2.constant()); + e.UDIV(i.dest, i.src1, X0); + } else { + e.UDIV(i.dest, i.src1, i.src2); + } + } else { + if (i.src1.is_constant) { + assert_true(!i.src2.is_constant); + e.MOV(X0, i.src1.constant()); + e.SDIV(i.dest, X0, i.src2); + } else if (i.src2.is_constant) { + assert_true(!i.src1.is_constant); + e.MOV(X0, i.src2.constant()); + e.SDIV(i.dest, i.src1, X0); + } else { + e.SDIV(i.dest, i.src1, i.src2); + } + } + } +}; +struct DIV_F32 : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + assert_true(!i.instr->flags); + EmitAssociativeBinaryVOp( + e, i, [](A64Emitter& e, SReg dest, SReg src1, SReg src2) { + e.FDIV(dest, src1, src2); + }); + } +}; +struct DIV_F64 : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + assert_true(!i.instr->flags); + EmitAssociativeBinaryVOp( + e, i, [](A64Emitter& e, DReg dest, DReg src1, DReg src2) { + e.FDIV(dest, src1, src2); + }); + } +}; +struct DIV_V128 : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + assert_true(!i.instr->flags); + EmitAssociativeBinaryVOp( + e, i, [](A64Emitter& e, QReg dest, QReg src1, QReg src2) { + e.FDIV(dest.S4(), src1.S4(), src2.S4()); + }); + } +}; +EMITTER_OPCODE_TABLE(OPCODE_DIV, DIV_I8, DIV_I16, DIV_I32, DIV_I64, DIV_F32, + DIV_F64, DIV_V128); + +// ============================================================================ +// OPCODE_MUL_ADD +// 
============================================================================ +// d = 1 * 2 + 3 +// $0 = $1x$0 + $2 +struct MUL_ADD_F32 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + SReg src3 = S3; + if (i.src3.is_constant) { + e.LoadConstantV(src3.toQ(), i.src3.constant()); + } else { + src3 = i.src3.reg(); + } + + SReg src2 = S2; + if (i.src2.is_constant) { + e.LoadConstantV(src2.toQ(), i.src2.constant()); + } else { + src2 = i.src2.reg(); + } + + SReg src1 = S1; + if (i.src1.is_constant) { + e.LoadConstantV(src1.toQ(), i.src1.constant()); + } else { + src1 = i.src1.reg(); + } + + e.FMADD(i.dest, src1, src2, src3); + } +}; +struct MUL_ADD_F64 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + DReg src3 = D3; + if (i.src3.is_constant) { + e.LoadConstantV(src3.toQ(), i.src3.constant()); + } else { + src3 = i.src3.reg(); + } + + DReg src2 = D2; + if (i.src2.is_constant) { + e.LoadConstantV(src2.toQ(), i.src2.constant()); + } else { + src2 = i.src2.reg(); + } + + DReg src1 = D1; + if (i.src1.is_constant) { + e.LoadConstantV(src1.toQ(), i.src1.constant()); + } else { + src1 = i.src1.reg(); + } + + e.FMADD(i.dest, src1, src2, src3); + } +}; +struct MUL_ADD_V128 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + const QReg dest = i.dest.reg(); + if (i.src3.is_constant) { + e.LoadConstantV(dest.toQ(), i.src3.constant()); + } else { + // If i.dest != i.src3, move the addition-term into dest for FMLA + if (i.dest != i.src3) { + e.MOV(dest.B16(), i.src3.reg().B16()); + } + } + + QReg src2 = Q2; + if (i.src2.is_constant) { + e.LoadConstantV(src2.toQ(), i.src2.constant()); + } else { + src2 = i.src2.reg(); + } + + QReg src1 = Q1; + if (i.src1.is_constant) { + e.LoadConstantV(src1.toQ(), i.src1.constant()); + } else { + src1 = i.src1.reg(); + } + + e.FMLA(dest.S4(), src1.S4(), src2.S4()); + } +}; +EMITTER_OPCODE_TABLE(OPCODE_MUL_ADD, MUL_ADD_F32, MUL_ADD_F64, MUL_ADD_V128); + +// ============================================================================ +// OPCODE_MUL_SUB +// ============================================================================ +// d = 1 * 2 - 3 +// $0 = $2x$0 - $3 +// TODO(benvanik): use other forms (132/213/etc) to avoid register shuffling. +// dest could be src2 or src3 - need to ensure it's not before overwriting dest +// perhaps use other 132/213/etc +// Forms: +// - 132 -> $1 = $1 * $3 - $2 +// - 213 -> $1 = $2 * $1 - $3 +// - 231 -> $1 = $2 * $3 - $1 +struct MUL_SUB_F32 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + SReg src3(1); + if (i.src3.is_constant) { + src3 = S1; + e.LoadConstantV(src3.toQ(), i.src3.constant()); + } else { + // If i.dest == i.src3, back up i.src3 so we don't overwrite it. + src3 = i.src3.reg(); + if (i.dest.reg().index() == i.src3.reg().index()) { + e.FMOV(S1, i.src3); + src3 = S1; + } + } + + // Multiply operation is commutative. + EmitCommutativeBinaryVOp( + e, i, [&i](A64Emitter& e, SReg dest, SReg src1, SReg src2) { + e.FMUL(dest, src1, src2); // $0 = $1 * $2 + }); + + e.FSUB(i.dest, i.dest, src3); // $0 = $1 - $2 + } +}; +struct MUL_SUB_F64 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + DReg src3(1); + if (i.src3.is_constant) { + src3 = D1; + e.LoadConstantV(src3.toQ(), i.src3.constant()); + } else { + // If i.dest == i.src3, back up i.src3 so we don't overwrite it. 
+ src3 = i.src3.reg(); + if (i.dest.reg().index() == i.src3.reg().index()) { + e.FMOV(D1, i.src3); + src3 = D1; + } + } + + // Multiply operation is commutative. + EmitCommutativeBinaryVOp( + e, i, [&i](A64Emitter& e, DReg dest, DReg src1, DReg src2) { + e.FMUL(dest, src1, src2); // $0 = $1 * $2 + }); + + e.FSUB(i.dest, i.dest, src3); // $0 = $1 + $2 + } +}; +struct MUL_SUB_V128 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + QReg src3(1); + if (i.src3.is_constant) { + src3 = Q1; + e.LoadConstantV(src3, i.src3.constant()); + } else { + // If i.dest == i.src3, back up i.src3 so we don't overwrite it. + src3 = i.src3; + if (i.dest == i.src3) { + e.MOV(Q1.B16(), i.src3.reg().B16()); + src3 = Q1; + } + } + + // Multiply operation is commutative. + EmitCommutativeBinaryVOp( + e, i, [&i](A64Emitter& e, QReg dest, QReg src1, QReg src2) { + e.FMUL(dest.S4(), src1.S4(), src2.S4()); // $0 = $1 * $2 + }); + + e.FSUB(i.dest.reg().S4(), i.dest.reg().S4(), src3.S4()); + } +}; +EMITTER_OPCODE_TABLE(OPCODE_MUL_SUB, MUL_SUB_F32, MUL_SUB_F64, MUL_SUB_V128); + +// ============================================================================ +// OPCODE_NEG +// ============================================================================ +// TODO(benvanik): put dest/src1 together. +template +void EmitNegXX(A64Emitter& e, const ARGS& i) { + SEQ::EmitUnaryOp( + e, i, [](A64Emitter& e, REG dest_src) { e.NEG(dest_src, dest_src); }); +} +struct NEG_I8 : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + EmitNegXX(e, i); + } +}; +struct NEG_I16 : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + EmitNegXX(e, i); + } +}; +struct NEG_I32 : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + EmitNegXX(e, i); + } +}; +struct NEG_I64 : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + EmitNegXX(e, i); + } +}; +struct NEG_F32 : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + e.FNEG(i.dest, i.src1); + } +}; +struct NEG_F64 : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + e.FNEG(i.dest, i.src1); + } +}; +struct NEG_V128 : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + assert_true(!i.instr->flags); + e.FNEG(i.dest.reg().S4(), i.src1.reg().S4()); + } +}; +EMITTER_OPCODE_TABLE(OPCODE_NEG, NEG_I8, NEG_I16, NEG_I32, NEG_I64, NEG_F32, + NEG_F64, NEG_V128); + +// ============================================================================ +// OPCODE_ABS +// ============================================================================ +struct ABS_F32 : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + e.FABS(i.dest, i.src1); + } +}; +struct ABS_F64 : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + e.FABS(i.dest, i.src1); + } +}; +struct ABS_V128 : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + e.FABS(i.dest.reg().S4(), i.src1.reg().S4()); + } +}; +EMITTER_OPCODE_TABLE(OPCODE_ABS, ABS_F32, ABS_F64, ABS_V128); + +// ============================================================================ +// OPCODE_SQRT +// ============================================================================ +struct SQRT_F32 : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + e.FSQRT(i.dest, i.src1); + } +}; +struct SQRT_F64 : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + e.FSQRT(i.dest, i.src1); + } +}; +struct SQRT_V128 : Sequence> { + static void 
Emit(A64Emitter& e, const EmitArgType& i) { + e.FSQRT(i.dest.reg().S4(), i.src1.reg().S4()); + } +}; +EMITTER_OPCODE_TABLE(OPCODE_SQRT, SQRT_F32, SQRT_F64, SQRT_V128); + +// ============================================================================ +// OPCODE_RSQRT +// ============================================================================ +// Altivec guarantees an error of < 1/4096 for vrsqrtefp +struct RSQRT_F32 : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + e.FRSQRTE(i.dest, i.src1); + } +}; +struct RSQRT_F64 : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + e.FRSQRTE(i.dest, i.src1); + } +}; +struct RSQRT_V128 : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + e.FRSQRTE(i.dest.reg().S4(), i.src1.reg().S4()); + } +}; +EMITTER_OPCODE_TABLE(OPCODE_RSQRT, RSQRT_F32, RSQRT_F64, RSQRT_V128); + +// ============================================================================ +// OPCODE_RECIP +// ============================================================================ +// Altivec guarantees an error of < 1/4096 for vrefp +struct RECIP_F32 : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + e.FRECPE(i.dest, i.src1); + } +}; +struct RECIP_F64 : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + e.FRECPE(i.dest, i.src1); + } +}; +struct RECIP_V128 : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + e.FRECPE(i.dest.reg().S4(), i.src1.reg().S4()); + } +}; +EMITTER_OPCODE_TABLE(OPCODE_RECIP, RECIP_F32, RECIP_F64, RECIP_V128); + +// ============================================================================ +// OPCODE_POW2 +// ============================================================================ +// TODO(benvanik): use approx here: +// https://jrfonseca.blogspot.com/2008/09/fast-sse2-pow-tables-or-polynomials.html +struct POW2_F32 : Sequence> { + static float32x4_t EmulatePow2(void*, std::byte src[16]) { + float src_value; + vst1q_lane_f32(&src_value, vld1q_u8(src), 0); + const float result = std::exp2(src_value); + return vld1q_lane_f32(&result, vld1q_u8(src), 0); + } + static void Emit(A64Emitter& e, const EmitArgType& i) { + assert_always(); + e.ADD(e.GetNativeParam(0), SP, e.StashV(0, i.src1.reg().toQ())); + e.CallNativeSafe(reinterpret_cast(EmulatePow2)); + e.FMOV(i.dest, S0); + } +}; +struct POW2_F64 : Sequence> { + static float64x2_t EmulatePow2(void*, std::byte src[16]) { + double src_value; + vst1q_lane_f64(&src_value, vld1q_u8(src), 0); + const double result = std::exp2(src_value); + return vld1q_lane_f64(&result, vld1q_u8(src), 0); + } + static void Emit(A64Emitter& e, const EmitArgType& i) { + assert_always(); + e.ADD(e.GetNativeParam(0), SP, e.StashV(0, i.src1.reg().toQ())); + e.CallNativeSafe(reinterpret_cast(EmulatePow2)); + e.FMOV(i.dest, D0); + } +}; +struct POW2_V128 : Sequence> { + static float32x4_t EmulatePow2(void*, std::byte src[16]) { + alignas(16) float values[4]; + vst1q_f32(values, vld1q_u8(src)); + for (size_t i = 0; i < 4; ++i) { + values[i] = std::exp2(values[i]); + } + return vld1q_f32(values); + } + static void Emit(A64Emitter& e, const EmitArgType& i) { + e.ADD(e.GetNativeParam(0), SP, e.StashV(0, i.src1.reg().toQ())); + e.CallNativeSafe(reinterpret_cast(EmulatePow2)); + e.MOV(i.dest.reg().B16(), Q0.B16()); + } +}; +EMITTER_OPCODE_TABLE(OPCODE_POW2, POW2_F32, POW2_F64, POW2_V128); + +// ============================================================================ +// OPCODE_LOG2 +// 
============================================================================ +// TODO(benvanik): use approx here: +// https://jrfonseca.blogspot.com/2008/09/fast-sse2-pow-tables-or-polynomials.html +// TODO(benvanik): this emulated fn destroys all xmm registers! don't do it! +struct LOG2_F32 : Sequence> { + static float32x4_t EmulateLog2(void*, std::byte src[16]) { + float src_value; + vst1q_lane_f32(&src_value, vld1q_u8(src), 0); + float result = std::log2(src_value); + return vld1q_lane_f32(&result, vld1q_u8(src), 0); + } + static void Emit(A64Emitter& e, const EmitArgType& i) { + assert_always(); + if (i.src1.is_constant) { + e.ADD(e.GetNativeParam(0), SP, e.StashConstantV(0, i.src1.constant())); + } else { + e.ADD(e.GetNativeParam(0), SP, e.StashV(0, i.src1.reg().toQ())); + } + e.CallNativeSafe(reinterpret_cast(EmulateLog2)); + e.FMOV(i.dest, S0); + } +}; +struct LOG2_F64 : Sequence> { + static float64x2_t EmulateLog2(void*, std::byte src[16]) { + double src_value; + vst1q_lane_f64(&src_value, vld1q_u8(src), 0); + double result = std::log2(src_value); + return vld1q_lane_f64(&result, vld1q_u8(src), 0); + } + static void Emit(A64Emitter& e, const EmitArgType& i) { + assert_always(); + if (i.src1.is_constant) { + e.ADD(e.GetNativeParam(0), SP, e.StashConstantV(0, i.src1.constant())); + } else { + e.ADD(e.GetNativeParam(0), SP, e.StashV(0, i.src1.reg().toQ())); + } + e.CallNativeSafe(reinterpret_cast(EmulateLog2)); + e.FMOV(i.dest, D0); + } +}; +struct LOG2_V128 : Sequence> { + static float32x4_t EmulateLog2(void*, std::byte src[16]) { + alignas(16) float values[4]; + vst1q_f32(values, vld1q_u8(src)); + for (size_t i = 0; i < 4; ++i) { + values[i] = std::log2(values[i]); + } + return vld1q_f32(values); + } + static void Emit(A64Emitter& e, const EmitArgType& i) { + if (i.src1.is_constant) { + e.ADD(e.GetNativeParam(0), SP, e.StashConstantV(0, i.src1.constant())); + } else { + e.ADD(e.GetNativeParam(0), SP, e.StashV(0, i.src1.reg().toQ())); + } + e.CallNativeSafe(reinterpret_cast(EmulateLog2)); + e.MOV(i.dest.reg().B16(), Q0.B16()); + } +}; +EMITTER_OPCODE_TABLE(OPCODE_LOG2, LOG2_F32, LOG2_F64, LOG2_V128); + +// ============================================================================ +// OPCODE_DOT_PRODUCT_3 +// ============================================================================ +struct DOT_PRODUCT_3_V128 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + // https://msdn.microsoft.com/en-us/library/bb514054(v=vs.90).aspx + EmitCommutativeBinaryVOp( + e, i, [](A64Emitter& e, SReg dest, QReg src1, QReg src2) { + e.FMUL(dest.toQ().S4(), src1.S4(), src2.S4()); + e.MOV(dest.toQ().Selem()[3], WZR); + e.FADDP(dest.toQ().S4(), dest.toQ().S4(), dest.toQ().S4()); + e.FADDP(dest.toS(), dest.toD().S2()); + }); + } +}; +EMITTER_OPCODE_TABLE(OPCODE_DOT_PRODUCT_3, DOT_PRODUCT_3_V128); + +// ============================================================================ +// OPCODE_DOT_PRODUCT_4 +// ============================================================================ +struct DOT_PRODUCT_4_V128 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + // https://msdn.microsoft.com/en-us/library/bb514054(v=vs.90).aspx + EmitCommutativeBinaryVOp( + e, i, [](A64Emitter& e, SReg dest, QReg src1, QReg src2) { + e.FMUL(dest.toQ().S4(), src1.S4(), src2.S4()); + e.FADDP(dest.toQ().S4(), dest.toQ().S4(), dest.toQ().S4()); + e.FADDP(dest.toS(), dest.toD().S2()); + }); + } +}; +EMITTER_OPCODE_TABLE(OPCODE_DOT_PRODUCT_4, DOT_PRODUCT_4_V128); + +// 
============================================================================ +// OPCODE_AND +// ============================================================================ +// TODO(benvanik): put dest/src1|2 together. +template +void EmitAndXX(A64Emitter& e, const ARGS& i) { + SEQ::EmitCommutativeBinaryOp( + e, i, + [](A64Emitter& e, REG dest_src, REG src) { + e.AND(dest_src, dest_src, src); + }, + [](A64Emitter& e, REG dest_src, int32_t constant) { + e.MOV(REG(1), constant); + e.AND(dest_src, dest_src, REG(1)); + }); +} +struct AND_I8 : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + EmitAndXX(e, i); + } +}; +struct AND_I16 : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + EmitAndXX(e, i); + } +}; +struct AND_I32 : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + EmitAndXX(e, i); + } +}; +struct AND_I64 : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + EmitAndXX(e, i); + } +}; +struct AND_V128 : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + EmitCommutativeBinaryVOp( + e, i, [](A64Emitter& e, QReg dest, QReg src1, QReg src2) { + e.AND(dest.B16(), src1.B16(), src2.B16()); + }); + } +}; +EMITTER_OPCODE_TABLE(OPCODE_AND, AND_I8, AND_I16, AND_I32, AND_I64, AND_V128); + +// ============================================================================ +// OPCODE_AND_NOT +// ============================================================================ +template +void EmitAndNotXX(A64Emitter& e, const ARGS& i) { + if (i.src1.is_constant) { + // src1 constant. + auto temp = GetTempReg(e); + e.MOV(temp, i.src1.constant()); + e.BIC(i.dest, temp, i.src2); + } else if (i.src2.is_constant) { + // src2 constant. + if (i.dest.reg().index() == i.src1.reg().index()) { + auto temp = GetTempReg(e); + e.MOV(temp, ~i.src2.constant()); + e.AND(i.dest, i.dest, temp); + } else { + e.MOV(i.dest, i.src1); + auto temp = GetTempReg(e); + e.MOV(temp, ~i.src2.constant()); + e.AND(i.dest, i.dest, temp); + } + } else { + // neither are constant + e.BIC(i.dest, i.src1, i.src2); + } +} +struct AND_NOT_I8 : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + EmitAndNotXX(e, i); + } +}; +struct AND_NOT_I16 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + EmitAndNotXX(e, i); + } +}; +struct AND_NOT_I32 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + EmitAndNotXX(e, i); + } +}; +struct AND_NOT_I64 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + EmitAndNotXX(e, i); + } +}; +struct AND_NOT_V128 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + EmitCommutativeBinaryVOp( + e, i, [](A64Emitter& e, QReg dest, QReg src1, QReg src2) { + e.BIC(dest.B16(), src1.B16(), src2.B16()); + }); + } +}; +EMITTER_OPCODE_TABLE(OPCODE_AND_NOT, AND_NOT_I8, AND_NOT_I16, AND_NOT_I32, + AND_NOT_I64, AND_NOT_V128); + +// ============================================================================ +// OPCODE_OR +// ============================================================================ +// TODO(benvanik): put dest/src1|2 together. 
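+// Note: the immediate path below materializes the constant into scratch
+// register 1 (W1/X1) with a MOV; ORR's bitmask-immediate encoding is not used
+// by this helper.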
+template +void EmitOrXX(A64Emitter& e, const ARGS& i) { + SEQ::EmitCommutativeBinaryOp( + e, i, + [](A64Emitter& e, REG dest_src, REG src) { + e.ORR(dest_src, dest_src, src); + }, + [](A64Emitter& e, REG dest_src, int32_t constant) { + e.MOV(REG(1), constant); + e.ORR(dest_src, dest_src, REG(1)); + }); +} +struct OR_I8 : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + EmitOrXX(e, i); + } +}; +struct OR_I16 : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + EmitOrXX(e, i); + } +}; +struct OR_I32 : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + EmitOrXX(e, i); + } +}; +struct OR_I64 : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + EmitOrXX(e, i); + } +}; +struct OR_V128 : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + EmitCommutativeBinaryVOp( + e, i, [](A64Emitter& e, QReg dest, QReg src1, QReg src2) { + e.ORR(dest.B16(), src1.B16(), src2.B16()); + }); + } +}; +EMITTER_OPCODE_TABLE(OPCODE_OR, OR_I8, OR_I16, OR_I32, OR_I64, OR_V128); + +// ============================================================================ +// OPCODE_XOR +// ============================================================================ +// TODO(benvanik): put dest/src1|2 together. +template +void EmitXorXX(A64Emitter& e, const ARGS& i) { + SEQ::EmitCommutativeBinaryOp( + e, i, + [](A64Emitter& e, REG dest_src, REG src) { + e.EOR(dest_src, dest_src, src); + }, + [](A64Emitter& e, REG dest_src, int32_t constant) { + e.MOV(REG(1), constant); + e.EOR(dest_src, dest_src, REG(1)); + }); +} +struct XOR_I8 : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + EmitXorXX(e, i); + } +}; +struct XOR_I16 : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + EmitXorXX(e, i); + } +}; +struct XOR_I32 : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + EmitXorXX(e, i); + } +}; +struct XOR_I64 : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + EmitXorXX(e, i); + } +}; +struct XOR_V128 : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + EmitCommutativeBinaryVOp( + e, i, [](A64Emitter& e, QReg dest, QReg src1, QReg src2) { + e.EOR(dest.B16(), src1.B16(), src2.B16()); + }); + } +}; +EMITTER_OPCODE_TABLE(OPCODE_XOR, XOR_I8, XOR_I16, XOR_I32, XOR_I64, XOR_V128); + +// ============================================================================ +// OPCODE_NOT +// ============================================================================ +// TODO(benvanik): put dest/src1 together. 
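+// MVN is the AArch64 alias of ORN with the zero register as the first source;
+// it inverts the full 32- or 64-bit register.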
+template +void EmitNotXX(A64Emitter& e, const ARGS& i) { + SEQ::EmitUnaryOp( + e, i, [](A64Emitter& e, REG dest_src) { e.MVN(dest_src, dest_src); }); +} +struct NOT_I8 : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + EmitNotXX(e, i); + } +}; +struct NOT_I16 : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + EmitNotXX(e, i); + } +}; +struct NOT_I32 : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + EmitNotXX(e, i); + } +}; +struct NOT_I64 : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + EmitNotXX(e, i); + } +}; +struct NOT_V128 : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + e.NOT(i.dest.reg().B16(), i.src1.reg().B16()); + } +}; +EMITTER_OPCODE_TABLE(OPCODE_NOT, NOT_I8, NOT_I16, NOT_I32, NOT_I64, NOT_V128); + +// ============================================================================ +// OPCODE_SHL +// ============================================================================ +// TODO(benvanik): optimize common shifts. +template +void EmitShlXX(A64Emitter& e, const ARGS& i) { + SEQ::EmitAssociativeBinaryOp( + e, i, + [](A64Emitter& e, REG dest_src, WReg src) { + e.LSL(dest_src, dest_src, REG(src.index())); + }, + [](A64Emitter& e, REG dest_src, int8_t constant) { + e.LSL(dest_src, dest_src, constant); + }); +} +struct SHL_I8 : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + EmitShlXX(e, i); + } +}; +struct SHL_I16 : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + EmitShlXX(e, i); + } +}; +struct SHL_I32 : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + EmitShlXX(e, i); + } +}; +struct SHL_I64 : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + EmitShlXX(e, i); + } +}; +struct SHL_V128 : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + // TODO(benvanik): native version (with shift magic). + if (i.src2.is_constant) { + e.MOV(e.GetNativeParam(1), i.src2.constant()); + } else { + e.MOV(e.GetNativeParam(1), i.src2.reg().toX()); + } + e.ADD(e.GetNativeParam(0), SP, e.StashV(0, i.src1)); + e.CallNativeSafe(reinterpret_cast(EmulateShlV128)); + e.MOV(i.dest.reg().B16(), Q0.B16()); + } + static float32x4_t EmulateShlV128(void*, std::byte src1[16], uint8_t src2) { + // Almost all instances are shamt = 1, but non-constant. 
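+    // The i ^ 0x3 indexing addresses bytes in guest (big-endian) word order,
+    // as vec128_t keeps each 32-bit word in host little-endian layout;
+    // iterating upward means u8[(i + 1) ^ 0x3] is still unmodified when read.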
+ // shamt is [0,7] + uint8_t shamt = src2 & 0x7; + alignas(16) vec128_t value; + vst1q_f32(reinterpret_cast(&value), vld1q_u8(src1)); + for (int i = 0; i < 15; ++i) { + value.u8[i ^ 0x3] = (value.u8[i ^ 0x3] << shamt) | + (value.u8[(i + 1) ^ 0x3] >> (8 - shamt)); + } + value.u8[15 ^ 0x3] = value.u8[15 ^ 0x3] << shamt; + return vld1q_f32(reinterpret_cast(&value)); + } +}; +EMITTER_OPCODE_TABLE(OPCODE_SHL, SHL_I8, SHL_I16, SHL_I32, SHL_I64, SHL_V128); + +// ============================================================================ +// OPCODE_SHR +// ============================================================================ +struct SHR_I8 : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + Sequence::EmitAssociativeBinaryOp( + e, i, + [](A64Emitter& e, WReg dest_src, WReg src) { + e.LSR(dest_src, dest_src, src); + }, + [](A64Emitter& e, WReg dest_src, int8_t constant) { + e.LSR(dest_src, dest_src, constant); + }); + } +}; +struct SHR_I16 : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + Sequence::EmitAssociativeBinaryOp( + e, i, + [](A64Emitter& e, WReg dest_src, WReg src) { + e.LSR(dest_src, dest_src, src); + }, + [](A64Emitter& e, WReg dest_src, int8_t constant) { + e.LSR(dest_src, dest_src, constant); + }); + } +}; +struct SHR_I32 : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + Sequence::EmitAssociativeBinaryOp( + e, i, + [](A64Emitter& e, WReg dest_src, WReg src) { + e.LSR(dest_src, dest_src, src); + }, + [](A64Emitter& e, WReg dest_src, int8_t constant) { + e.LSR(dest_src, dest_src, constant); + }); + } +}; +struct SHR_I64 : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + Sequence::EmitAssociativeBinaryOp( + e, i, + [](A64Emitter& e, XReg dest_src, WReg src) { + e.LSR(dest_src, dest_src, src.toX()); + }, + [](A64Emitter& e, XReg dest_src, int8_t constant) { + e.LSR(dest_src, dest_src, constant); + }); + } +}; +struct SHR_V128 : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + // TODO(benvanik): native version (with shift magic). + if (i.src2.is_constant) { + e.MOV(e.GetNativeParam(1), i.src2.constant()); + } else { + e.MOV(e.GetNativeParam(1), i.src2.reg().toX()); + } + e.ADD(e.GetNativeParam(0), SP, e.StashV(0, i.src1)); + e.CallNativeSafe(reinterpret_cast(EmulateShrV128)); + e.MOV(i.dest.reg().B16(), Q0.B16()); + } + static float32x4_t EmulateShrV128(void*, std::byte src1[16], uint8_t src2) { + // Almost all instances are shamt = 1, but non-constant. 
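+    // Same byte-order trick as EmulateShlV128; walking from the high byte down
+    // means each source byte u8[(i - 1) ^ 0x3] is still unmodified when read.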
+ // shamt is [0,7] + uint8_t shamt = src2 & 0x7; + alignas(16) vec128_t value; + vst1q_f32(reinterpret_cast(&value), vld1q_u8(src1)); + for (int i = 15; i > 0; --i) { + value.u8[i ^ 0x3] = (value.u8[i ^ 0x3] >> shamt) | + (value.u8[(i - 1) ^ 0x3] << (8 - shamt)); + } + value.u8[0 ^ 0x3] = value.u8[0 ^ 0x3] >> shamt; + return vld1q_f32(reinterpret_cast(&value)); + } +}; +EMITTER_OPCODE_TABLE(OPCODE_SHR, SHR_I8, SHR_I16, SHR_I32, SHR_I64, SHR_V128); + +// ============================================================================ +// OPCODE_SHA +// ============================================================================ +struct SHA_I8 : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + Sequence::EmitAssociativeBinaryOp( + e, i, + [](A64Emitter& e, WReg dest_src, WReg src) { + e.SXTB(dest_src, dest_src); + e.ASR(dest_src, dest_src, src); + }, + [](A64Emitter& e, WReg dest_src, int8_t constant) { + e.SXTB(dest_src, dest_src); + e.ASR(dest_src, dest_src, constant); + }); + } +}; +struct SHA_I16 : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + Sequence::EmitAssociativeBinaryOp( + e, i, + [](A64Emitter& e, WReg dest_src, WReg src) { + e.SXTH(dest_src, dest_src); + e.ASR(dest_src, dest_src, src); + }, + [](A64Emitter& e, WReg dest_src, int8_t constant) { + e.ASR(dest_src, dest_src, constant); + }); + } +}; +struct SHA_I32 : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + Sequence::EmitAssociativeBinaryOp( + e, i, + [](A64Emitter& e, WReg dest_src, WReg src) { + e.ASR(dest_src, dest_src, src); + }, + [](A64Emitter& e, WReg dest_src, int8_t constant) { + e.ASR(dest_src, dest_src, constant); + }); + } +}; +struct SHA_I64 : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + Sequence::EmitAssociativeBinaryOp( + e, i, + [](A64Emitter& e, XReg dest_src, WReg src) { + e.ASR(dest_src, dest_src, src.toX()); + }, + [](A64Emitter& e, XReg dest_src, int8_t constant) { + e.ASR(dest_src, dest_src, constant); + }); + } +}; +EMITTER_OPCODE_TABLE(OPCODE_SHA, SHA_I8, SHA_I16, SHA_I32, SHA_I64); + +// ============================================================================ +// OPCODE_ROTATE_LEFT +// ============================================================================ +// TODO(benvanik): put dest/src1 together, src2 in cl. 
+template +void EmitRotateLeftXX(A64Emitter& e, const ARGS& i) { + // ; rotate r1 left by r2, producing r0 + // ; (destroys r2) + // ; r1 = ABCDEFGH + // lslv r0, r1, r2 ; r0 = EFGH0000 + // mvn r2, r2 ; r2 = leftover bits + // lsrv r2, r1, r2 ; r2 = 0000ABCD + // orr r0, r0, r2 ; r0 = EFGHABCD + if (i.src1.is_constant) { + e.MOV(REG(0), i.src1.constant()); + } else { + e.MOV(REG(0), i.src1.reg()); + } + + if (i.src2.is_constant) { + e.MOV(REG(1), i.src2.constant()); + } else { + e.MOV(W1, i.src2.reg().toW()); + } + + e.LSLV(i.dest, REG(0), REG(1)); + e.MVN(REG(1), REG(1)); + e.LSRV(REG(1), REG(0), REG(1)); + e.ORR(i.dest, i.dest, REG(1)); +} +struct ROTATE_LEFT_I8 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + EmitRotateLeftXX(e, i); + } +}; +struct ROTATE_LEFT_I16 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + EmitRotateLeftXX(e, i); + } +}; +struct ROTATE_LEFT_I32 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + if (i.src1.is_constant) { + e.MOV(W0, i.src1.constant()); + } else { + e.MOV(W0, i.src1.reg()); + } + + if (i.src2.is_constant) { + e.MOV(W1, i.src2.constant()); + } else { + e.SXTB(W1, i.src2.reg()); + } + e.NEG(W1, W1); + + e.ROR(i.dest, W0, W1); + } +}; +struct ROTATE_LEFT_I64 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + if (i.src1.is_constant) { + e.MOV(X0, i.src1.constant()); + } else { + e.MOV(X0, i.src1.reg()); + } + + if (i.src2.is_constant) { + e.MOV(X1, i.src2.constant()); + } else { + e.SXTB(X1, i.src2.reg().toW()); + } + e.NEG(X1, X1); + + e.ROR(i.dest, X0, X1); + } +}; +EMITTER_OPCODE_TABLE(OPCODE_ROTATE_LEFT, ROTATE_LEFT_I8, ROTATE_LEFT_I16, + ROTATE_LEFT_I32, ROTATE_LEFT_I64); + +// ============================================================================ +// OPCODE_BYTE_SWAP +// ============================================================================ +// TODO(benvanik): put dest/src1 together. +struct BYTE_SWAP_I16 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + EmitUnaryOp(e, i, [](A64Emitter& e, WReg dest_src) { + e.REV16(dest_src, dest_src); + }); + } +}; +struct BYTE_SWAP_I32 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + EmitUnaryOp( + e, i, [](A64Emitter& e, WReg dest_src) { e.REV(dest_src, dest_src); }); + } +}; +struct BYTE_SWAP_I64 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + EmitUnaryOp( + e, i, [](A64Emitter& e, XReg dest_src) { e.REV(dest_src, dest_src); }); + } +}; +struct BYTE_SWAP_V128 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + // Reverse upper and lower 64-bit halfs + e.REV32(i.dest.reg().B16(), i.src1.reg().B16()); + } +}; +EMITTER_OPCODE_TABLE(OPCODE_BYTE_SWAP, BYTE_SWAP_I16, BYTE_SWAP_I32, + BYTE_SWAP_I64, BYTE_SWAP_V128); + +// ============================================================================ +// OPCODE_CNTLZ +// Count leading zeroes +// ============================================================================ +struct CNTLZ_I8 : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + // No 8bit lzcnt, so do 32 and sub 24. + e.UXTB(i.dest, i.src1); + e.CLZ(i.dest, i.dest); + e.SUB(i.dest.reg(), i.dest.reg(), 24); + } +}; +struct CNTLZ_I16 : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + // No 16bit lzcnt, so do 32 and sub 16. 
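+    // UXTH zero-extends the 16-bit value so CLZ counts over the full 32-bit
+    // register, adding exactly 16 extra leading zeros to subtract back out.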
+ e.UXTH(i.dest, i.src1); + e.CLZ(i.dest, i.dest); + e.SUB(i.dest.reg(), i.dest.reg(), 16); + } +}; +struct CNTLZ_I32 : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + e.CLZ(i.dest, i.src1); + } +}; +struct CNTLZ_I64 : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + e.CLZ(i.dest.reg().toX(), i.src1); + } +}; +EMITTER_OPCODE_TABLE(OPCODE_CNTLZ, CNTLZ_I8, CNTLZ_I16, CNTLZ_I32, CNTLZ_I64); + +// ============================================================================ +// OPCODE_SET_ROUNDING_MODE +// ============================================================================ +// Input: FPSCR (PPC format) +// Convert from PPC rounding mode to ARM +// PPC | ARM | +// 00 | 00 | nearest +// 01 | 11 | toward zero +// 10 | 01 | toward +infinity +// 11 | 10 | toward -infinity +static const uint8_t fpcr_table[] = { + 0b0'00, // |--|nearest + 0b0'11, // |--|toward zero + 0b0'01, // |--|toward +infinity + 0b0'10, // |--|toward -infinity + 0b1'00, // |FZ|nearest + 0b1'11, // |FZ|toward zero + 0b1'01, // |FZ|toward +infinity + 0b1'10, // |FZ|toward -infinity +}; +struct SET_ROUNDING_MODE_I32 + : Sequence> { + static void Emit(A64Emitter& e, const EmitArgType& i) { + // Low 3 bits are |Non-IEEE:1|RoundingMode:2| + // Non-IEEE bit is flush-to-zero + e.AND(W1, i.src1, 0b111); + + // Use the low 3 bits as an index into a LUT + e.MOV(X0, reinterpret_cast(fpcr_table)); + e.LDRB(W0, X0, X1); + + // Replace FPCR bits with new value + e.MRS(X1, SystemReg::FPCR); + e.BFI(X1, X0, 23, 3); + e.MSR(SystemReg::FPCR, X1); + } +}; +EMITTER_OPCODE_TABLE(OPCODE_SET_ROUNDING_MODE, SET_ROUNDING_MODE_I32); + +// Include anchors to other sequence sources so they get included in the build. +extern volatile int anchor_control; +static int anchor_control_dest = anchor_control; + +extern volatile int anchor_memory; +static int anchor_memory_dest = anchor_memory; + +extern volatile int anchor_vector; +static int anchor_vector_dest = anchor_vector; + +bool SelectSequence(A64Emitter* e, const Instr* i, const Instr** new_tail) { + const InstrKey key(i); + auto it = sequence_table.find(key); + if (it != sequence_table.end()) { + if (it->second(*e, i)) { + *new_tail = i->next; + return true; + } + } + XELOGE("No sequence match for variant {}", i->opcode->name); + return false; +} + +} // namespace a64 +} // namespace backend +} // namespace cpu +} // namespace xe diff --git a/src/xenia/cpu/backend/a64/a64_sequences.h b/src/xenia/cpu/backend/a64/a64_sequences.h new file mode 100644 index 000000000..b47382633 --- /dev/null +++ b/src/xenia/cpu/backend/a64/a64_sequences.h @@ -0,0 +1,51 @@ +/** + ****************************************************************************** + * Xenia : Xbox 360 Emulator Research Project * + ****************************************************************************** + * Copyright 2024 Ben Vanik. All rights reserved. * + * Released under the BSD license - see LICENSE in the root for more details. 
* + ****************************************************************************** + */ + +#ifndef XENIA_CPU_BACKEND_A64_A64_SEQUENCES_H_ +#define XENIA_CPU_BACKEND_A64_A64_SEQUENCES_H_ + +#include "xenia/cpu/hir/instr.h" + +#include + +namespace xe { +namespace cpu { +namespace backend { +namespace a64 { + +class A64Emitter; + +typedef bool (*SequenceSelectFn)(A64Emitter&, const hir::Instr*); +extern std::unordered_map sequence_table; + +template +bool Register() { + sequence_table.insert({T::head_key(), T::Select}); + return true; +} + +template +static bool Register() { + bool b = true; + b = b && Register(); // Call the above function + b = b && Register(); // Call ourself again (recursively) + return b; +} +#define EMITTER_OPCODE_TABLE(name, ...) \ + const auto A64_INSTR_##name = Register<__VA_ARGS__>(); + +bool SelectSequence(A64Emitter* e, const hir::Instr* i, + const hir::Instr** new_tail); + +} // namespace a64 +} // namespace backend +} // namespace cpu +} // namespace xe + +#endif // XENIA_CPU_BACKEND_A64_A64_SEQUENCES_H_ diff --git a/src/xenia/cpu/backend/a64/a64_stack_layout.h b/src/xenia/cpu/backend/a64/a64_stack_layout.h new file mode 100644 index 000000000..ee8cbcfac --- /dev/null +++ b/src/xenia/cpu/backend/a64/a64_stack_layout.h @@ -0,0 +1,129 @@ +/** + ****************************************************************************** + * Xenia : Xbox 360 Emulator Research Project * + ****************************************************************************** + * Copyright 2024 Ben Vanik. All rights reserved. * + * Released under the BSD license - see LICENSE in the root for more details. * + ****************************************************************************** + */ + +#ifndef XENIA_CPU_BACKEND_A64_A64_STACK_LAYOUT_H_ +#define XENIA_CPU_BACKEND_A64_A64_STACK_LAYOUT_H_ + +#include "xenia/base/vec128.h" +#include "xenia/cpu/backend/a64/a64_backend.h" +#include "xenia/cpu/backend/a64/a64_emitter.h" + +namespace xe { +namespace cpu { +namespace backend { +namespace a64 { + +class StackLayout { + public: + /** + * Stack Layout + * ---------------------------- + * NOTE: stack must always be 16b aligned. 
+ * + * Thunk stack: + * Non-Volatile Volatile + * +------------------+------------------+ + * | arg temp, 3 * 8 | arg temp, 3 * 8 | sp + 0x000 + * | | | + * | | | + * +------------------+------------------+ + * | rbx | (unused) | sp + 0x018 + * +------------------+------------------+ + * | rbp | X1 | sp + 0x020 + * +------------------+------------------+ + * | rcx (Win32) | X2 | sp + 0x028 + * +------------------+------------------+ + * | rsi (Win32) | X3 | sp + 0x030 + * +------------------+------------------+ + * | rdi (Win32) | X4 | sp + 0x038 + * +------------------+------------------+ + * | r12 | X5 | sp + 0x040 + * +------------------+------------------+ + * | r13 | X6 | sp + 0x048 + * +------------------+------------------+ + * | r14 | X7 | sp + 0x050 + * +------------------+------------------+ + * | r15 | X8 | sp + 0x058 + * +------------------+------------------+ + * | xmm6 (Win32) | X9 | sp + 0x060 + * | | | + * +------------------+------------------+ + * | xmm7 (Win32) | X10 | sp + 0x070 + * | | | + * +------------------+------------------+ + * | xmm8 (Win32) | X11 | sp + 0x080 + * | | | + * +------------------+------------------+ + * | xmm9 (Win32) | X12 | sp + 0x090 + * | | | + * +------------------+------------------+ + * | xmm10 (Win32) | X13 | sp + 0x0A0 + * | | | + * +------------------+------------------+ + * | xmm11 (Win32) | X14 | sp + 0x0B0 + * | | | + * +------------------+------------------+ + * | xmm12 (Win32) | X15 | sp + 0x0C0 + * | | | + * +------------------+------------------+ + * | xmm13 (Win32) | X16 | sp + 0x0D0 + * | | | + * +------------------+------------------+ + * | xmm14 (Win32) | X17 | sp + 0x0E0 + * | | | + * +------------------+------------------+ + * | xmm15 (Win32) | X18 | sp + 0x0F0 + * | | | + * +------------------+------------------+ + */ + XEPACKEDSTRUCT(Thunk, { + uint64_t arg_temp[3]; + uint64_t r[17]; + vec128_t xmm[22]; + }); + static_assert(sizeof(Thunk) % 16 == 0, + "sizeof(Thunk) must be a multiple of 16!"); + static const size_t THUNK_STACK_SIZE = sizeof(Thunk); + + /** + * + * + * Guest stack: + * +------------------+ + * | arg temp, 3 * 8 | sp + 0 + * | | + * | | + * +------------------+ + * | scratch, 48b | sp + 32(kStashOffset) + * | | + * +------------------+ + * | X0 / context | sp + 80 + * +------------------+ + * | guest ret addr | sp + 88 + * +------------------+ + * | call ret addr | sp + 96 + * +------------------+ + * ... locals ... + * +------------------+ + * | (return address) | + * +------------------+ + * + */ + static const size_t GUEST_STACK_SIZE = 96 + 16; + static const size_t GUEST_CTX_HOME = 80; + static const size_t GUEST_RET_ADDR = 88; + static const size_t GUEST_CALL_RET_ADDR = 96; +}; + +} // namespace a64 +} // namespace backend +} // namespace cpu +} // namespace xe + +#endif // XENIA_CPU_BACKEND_A64_A64_STACK_LAYOUT_H_ diff --git a/src/xenia/cpu/backend/a64/a64_tracers.cc b/src/xenia/cpu/backend/a64/a64_tracers.cc new file mode 100644 index 000000000..146f50982 --- /dev/null +++ b/src/xenia/cpu/backend/a64/a64_tracers.cc @@ -0,0 +1,225 @@ +/** + ****************************************************************************** + * Xenia : Xbox 360 Emulator Research Project * + ****************************************************************************** + * Copyright 2024 Ben Vanik. All rights reserved. * + * Released under the BSD license - see LICENSE in the root for more details. 
* + ****************************************************************************** + */ + +#include "xenia/cpu/backend/a64/a64_tracers.h" + +#include + +#include "xenia/base/logging.h" +#include "xenia/base/vec128.h" +#include "xenia/cpu/backend/a64/a64_emitter.h" +#include "xenia/cpu/processor.h" +#include "xenia/cpu/thread_state.h" + +namespace xe { +namespace cpu { +namespace backend { +namespace a64 { + +#define ITRACE 0 +#define DTRACE 0 + +#define TARGET_THREAD 0 + +bool trace_enabled = true; + +#define THREAD_MATCH \ + (!TARGET_THREAD || thread_state->thread_id() == TARGET_THREAD) +#define IFLUSH() +#define IPRINT(s) \ + if (trace_enabled && THREAD_MATCH) \ + xe::logging::AppendLogLine(xe::LogLevel::Debug, 't', s) +#define DFLUSH() +#define DPRINT(...) \ + if (trace_enabled && THREAD_MATCH) \ + xe::logging::AppendLogLineFormat(xe::LogLevel::Debug, 't', __VA_ARGS__) + +uint32_t GetTracingMode() { + uint32_t mode = 0; +#if ITRACE + mode |= TRACING_INSTR; +#endif // ITRACE +#if DTRACE + mode |= TRACING_DATA; +#endif // DTRACE + return mode; +} + +void TraceString(void* raw_context, const char* str) { + auto thread_state = *reinterpret_cast(raw_context); + IPRINT(str); + IFLUSH(); +} + +void TraceContextLoadI8(void* raw_context, uint64_t offset, uint8_t value) { + auto thread_state = *reinterpret_cast(raw_context); + DPRINT("{} ({:X}) = ctx i8 +{}\n", (int8_t)value, value, offset); +} +void TraceContextLoadI16(void* raw_context, uint64_t offset, uint16_t value) { + auto thread_state = *reinterpret_cast(raw_context); + DPRINT("{} ({:X}) = ctx i16 +{}\n", (int16_t)value, value, offset); +} +void TraceContextLoadI32(void* raw_context, uint64_t offset, uint32_t value) { + auto thread_state = *reinterpret_cast(raw_context); + DPRINT("{} ({:X}) = ctx i32 +{}\n", (int32_t)value, value, offset); +} +void TraceContextLoadI64(void* raw_context, uint64_t offset, uint64_t value) { + auto thread_state = *reinterpret_cast(raw_context); + DPRINT("{} ({:X}) = ctx i64 +{}\n", (int64_t)value, value, offset); +} +void TraceContextLoadF32(void* raw_context, uint64_t offset, + float32x4_t value) { + auto thread_state = *reinterpret_cast(raw_context); + DPRINT("{} ({:X}) = ctx f32 +{}\n", xe::m128_f32<0>(value), + xe::m128_i32<0>(value), offset); +} +void TraceContextLoadF64(void* raw_context, uint64_t offset, + const double* value) { + auto thread_state = *reinterpret_cast(raw_context); + // auto v = _mm_loadu_pd(value); + auto v = vld1q_f64(value); + DPRINT("{} ({:X}) = ctx f64 +{}\n", xe::m128_f64<0>(v), xe::m128_i64<0>(v), + offset); +} +void TraceContextLoadV128(void* raw_context, uint64_t offset, + float32x4_t value) { + auto thread_state = *reinterpret_cast(raw_context); + DPRINT("[{}, {}, {}, {}] [{:08X}, {:08X}, {:08X}, {:08X}] = ctx v128 +{}\n", + xe::m128_f32<0>(value), xe::m128_f32<1>(value), xe::m128_f32<2>(value), + xe::m128_f32<3>(value), xe::m128_i32<0>(value), xe::m128_i32<1>(value), + xe::m128_i32<2>(value), xe::m128_i32<3>(value), offset); +} + +void TraceContextStoreI8(void* raw_context, uint64_t offset, uint8_t value) { + auto thread_state = *reinterpret_cast(raw_context); + DPRINT("ctx i8 +{} = {} ({:X})\n", offset, (int8_t)value, value); +} +void TraceContextStoreI16(void* raw_context, uint64_t offset, uint16_t value) { + auto thread_state = *reinterpret_cast(raw_context); + DPRINT("ctx i16 +{} = {} ({:X})\n", offset, (int16_t)value, value); +} +void TraceContextStoreI32(void* raw_context, uint64_t offset, uint32_t value) { + auto thread_state = *reinterpret_cast(raw_context); + 
DPRINT("ctx i32 +{} = {} ({:X})\n", offset, (int32_t)value, value); +} +void TraceContextStoreI64(void* raw_context, uint64_t offset, uint64_t value) { + auto thread_state = *reinterpret_cast(raw_context); + DPRINT("ctx i64 +{} = {} ({:X})\n", offset, (int64_t)value, value); +} +void TraceContextStoreF32(void* raw_context, uint64_t offset, + float32x4_t value) { + auto thread_state = *reinterpret_cast(raw_context); + DPRINT("ctx f32 +{} = {} ({:X})\n", offset, xe::m128_f32<0>(value), + xe::m128_i32<0>(value)); +} +void TraceContextStoreF64(void* raw_context, uint64_t offset, + const double* value) { + auto thread_state = *reinterpret_cast(raw_context); + // auto v = _mm_loadu_pd(value); + auto v = vld1q_f64(value); + DPRINT("ctx f64 +{} = {} ({:X})\n", offset, xe::m128_f64<0>(v), + xe::m128_i64<0>(v)); +} +void TraceContextStoreV128(void* raw_context, uint64_t offset, + float32x4_t value) { + auto thread_state = *reinterpret_cast(raw_context); + DPRINT("ctx v128 +{} = [{}, {}, {}, {}] [{:08X}, {:08X}, {:08X}, {:08X}]\n", + offset, xe::m128_f32<0>(value), xe::m128_f32<1>(value), + xe::m128_f32<2>(value), xe::m128_f32<3>(value), xe::m128_i32<0>(value), + xe::m128_i32<1>(value), xe::m128_i32<2>(value), + xe::m128_i32<3>(value)); +} + +void TraceMemoryLoadI8(void* raw_context, uint32_t address, uint8_t value) { + auto thread_state = *reinterpret_cast(raw_context); + DPRINT("{} ({:X}) = load.i8 {:08X}\n", (int8_t)value, value, address); +} +void TraceMemoryLoadI16(void* raw_context, uint32_t address, uint16_t value) { + auto thread_state = *reinterpret_cast(raw_context); + DPRINT("{} ({:X}) = load.i16 {:08X}\n", (int16_t)value, value, address); +} +void TraceMemoryLoadI32(void* raw_context, uint32_t address, uint32_t value) { + auto thread_state = *reinterpret_cast(raw_context); + DPRINT("{} ({:X}) = load.i32 {:08X}\n", (int32_t)value, value, address); +} +void TraceMemoryLoadI64(void* raw_context, uint32_t address, uint64_t value) { + auto thread_state = *reinterpret_cast(raw_context); + DPRINT("{} ({:X}) = load.i64 {:08X}\n", (int64_t)value, value, address); +} +void TraceMemoryLoadF32(void* raw_context, uint32_t address, + float32x4_t value) { + auto thread_state = *reinterpret_cast(raw_context); + DPRINT("{} ({:X}) = load.f32 {:08X}\n", xe::m128_f32<0>(value), + xe::m128_i32<0>(value), address); +} +void TraceMemoryLoadF64(void* raw_context, uint32_t address, + float64x2_t value) { + auto thread_state = *reinterpret_cast(raw_context); + DPRINT("{} ({:X}) = load.f64 {:08X}\n", xe::m128_f64<0>(value), + xe::m128_i64<0>(value), address); +} +void TraceMemoryLoadV128(void* raw_context, uint32_t address, + float32x4_t value) { + auto thread_state = *reinterpret_cast(raw_context); + DPRINT( + "[{}, {}, {}, {}] [{:08X}, {:08X}, {:08X}, {:08X}] = load.v128 {:08X}\n", + xe::m128_f32<0>(value), xe::m128_f32<1>(value), xe::m128_f32<2>(value), + xe::m128_f32<3>(value), xe::m128_i32<0>(value), xe::m128_i32<1>(value), + xe::m128_i32<2>(value), xe::m128_i32<3>(value), address); +} + +void TraceMemoryStoreI8(void* raw_context, uint32_t address, uint8_t value) { + auto thread_state = *reinterpret_cast(raw_context); + DPRINT("store.i8 {:08X} = {} ({:X})\n", address, (int8_t)value, value); +} +void TraceMemoryStoreI16(void* raw_context, uint32_t address, uint16_t value) { + auto thread_state = *reinterpret_cast(raw_context); + DPRINT("store.i16 {:08X} = {} ({:X})\n", address, (int16_t)value, value); +} +void TraceMemoryStoreI32(void* raw_context, uint32_t address, uint32_t value) { + auto thread_state = 
*reinterpret_cast(raw_context); + DPRINT("store.i32 {:08X} = {} ({:X})\n", address, (int32_t)value, value); +} +void TraceMemoryStoreI64(void* raw_context, uint32_t address, uint64_t value) { + auto thread_state = *reinterpret_cast(raw_context); + DPRINT("store.i64 {:08X} = {} ({:X})\n", address, (int64_t)value, value); +} +void TraceMemoryStoreF32(void* raw_context, uint32_t address, + float32x4_t value) { + auto thread_state = *reinterpret_cast(raw_context); + DPRINT("store.f32 {:08X} = {} ({:X})\n", address, xe::m128_f32<0>(value), + xe::m128_i32<0>(value)); +} +void TraceMemoryStoreF64(void* raw_context, uint32_t address, + float64x2_t value) { + auto thread_state = *reinterpret_cast(raw_context); + DPRINT("store.f64 {:08X} = {} ({:X})\n", address, xe::m128_f64<0>(value), + xe::m128_i64<0>(value)); +} +void TraceMemoryStoreV128(void* raw_context, uint32_t address, + float32x4_t value) { + auto thread_state = *reinterpret_cast(raw_context); + DPRINT( + "store.v128 {:08X} = [{}, {}, {}, {}] [{:08X}, {:08X}, {:08X}, {:08X}]\n", + address, xe::m128_f32<0>(value), xe::m128_f32<1>(value), + xe::m128_f32<2>(value), xe::m128_f32<3>(value), xe::m128_i32<0>(value), + xe::m128_i32<1>(value), xe::m128_i32<2>(value), xe::m128_i32<3>(value)); +} + +void TraceMemset(void* raw_context, uint32_t address, uint8_t value, + uint32_t length) { + auto thread_state = *reinterpret_cast(raw_context); + DPRINT("memset {:08X}-{:08X} ({}) = {:02X}", address, address + length, + length, value); +} + +} // namespace a64 +} // namespace backend +} // namespace cpu +} // namespace xe diff --git a/src/xenia/cpu/backend/a64/a64_tracers.h b/src/xenia/cpu/backend/a64/a64_tracers.h new file mode 100644 index 000000000..62b740356 --- /dev/null +++ b/src/xenia/cpu/backend/a64/a64_tracers.h @@ -0,0 +1,82 @@ +/** + ****************************************************************************** + * Xenia : Xbox 360 Emulator Research Project * + ****************************************************************************** + * Copyright 2024 Ben Vanik. All rights reserved. * + * Released under the BSD license - see LICENSE in the root for more details. 
* + ****************************************************************************** + */ + +#ifndef XENIA_CPU_BACKEND_A64_A64_TRACERS_H_ +#define XENIA_CPU_BACKEND_A64_A64_TRACERS_H_ + +#include +#include + +namespace xe { +namespace cpu { +namespace backend { +namespace a64 { +class A64Emitter; + +enum TracingMode { + TRACING_INSTR = (1 << 1), + TRACING_DATA = (1 << 2), +}; + +uint32_t GetTracingMode(); +inline bool IsTracingInstr() { return (GetTracingMode() & TRACING_INSTR) != 0; } +inline bool IsTracingData() { return (GetTracingMode() & TRACING_DATA) != 0; } + +void TraceString(void* raw_context, const char* str); + +void TraceContextLoadI8(void* raw_context, uint64_t offset, uint8_t value); +void TraceContextLoadI16(void* raw_context, uint64_t offset, uint16_t value); +void TraceContextLoadI32(void* raw_context, uint64_t offset, uint32_t value); +void TraceContextLoadI64(void* raw_context, uint64_t offset, uint64_t value); +void TraceContextLoadF32(void* raw_context, uint64_t offset, float32x4_t value); +void TraceContextLoadF64(void* raw_context, uint64_t offset, + const double* value); +void TraceContextLoadV128(void* raw_context, uint64_t offset, + float32x4_t value); + +void TraceContextStoreI8(void* raw_context, uint64_t offset, uint8_t value); +void TraceContextStoreI16(void* raw_context, uint64_t offset, uint16_t value); +void TraceContextStoreI32(void* raw_context, uint64_t offset, uint32_t value); +void TraceContextStoreI64(void* raw_context, uint64_t offset, uint64_t value); +void TraceContextStoreF32(void* raw_context, uint64_t offset, + float32x4_t value); +void TraceContextStoreF64(void* raw_context, uint64_t offset, + const double* value); +void TraceContextStoreV128(void* raw_context, uint64_t offset, + float32x4_t value); + +void TraceMemoryLoadI8(void* raw_context, uint32_t address, uint8_t value); +void TraceMemoryLoadI16(void* raw_context, uint32_t address, uint16_t value); +void TraceMemoryLoadI32(void* raw_context, uint32_t address, uint32_t value); +void TraceMemoryLoadI64(void* raw_context, uint32_t address, uint64_t value); +void TraceMemoryLoadF32(void* raw_context, uint32_t address, float32x4_t value); +void TraceMemoryLoadF64(void* raw_context, uint32_t address, float64x2_t value); +void TraceMemoryLoadV128(void* raw_context, uint32_t address, + float32x4_t value); + +void TraceMemoryStoreI8(void* raw_context, uint32_t address, uint8_t value); +void TraceMemoryStoreI16(void* raw_context, uint32_t address, uint16_t value); +void TraceMemoryStoreI32(void* raw_context, uint32_t address, uint32_t value); +void TraceMemoryStoreI64(void* raw_context, uint32_t address, uint64_t value); +void TraceMemoryStoreF32(void* raw_context, uint32_t address, + float32x4_t value); +void TraceMemoryStoreF64(void* raw_context, uint32_t address, + float64x2_t value); +void TraceMemoryStoreV128(void* raw_context, uint32_t address, + float32x4_t value); + +void TraceMemset(void* raw_context, uint32_t address, uint8_t value, + uint32_t length); + +} // namespace a64 +} // namespace backend +} // namespace cpu +} // namespace xe + +#endif // XENIA_CPU_BACKEND_A64_A64_TRACERS_H_ diff --git a/src/xenia/cpu/backend/a64/a64_util.h b/src/xenia/cpu/backend/a64/a64_util.h new file mode 100644 index 000000000..0b950b8ae --- /dev/null +++ b/src/xenia/cpu/backend/a64/a64_util.h @@ -0,0 +1,95 @@ +/** + ****************************************************************************** + * Xenia : Xbox 360 Emulator Research Project * + 
****************************************************************************** + * Copyright 2024 Ben Vanik. All rights reserved. * + * Released under the BSD license - see LICENSE in the root for more details. * + ****************************************************************************** + */ + +#ifndef XENIA_CPU_BACKEND_A64_A64_UTIL_H_ +#define XENIA_CPU_BACKEND_A64_A64_UTIL_H_ + +#include "xenia/base/vec128.h" +#include "xenia/cpu/backend/a64/a64_backend.h" +#include "xenia/cpu/backend/a64/a64_emitter.h" + +namespace xe { +namespace cpu { +namespace backend { +namespace a64 { + +// Attempts to convert an fp32 bit-value into an fp8-immediate value for FMOV +// returns false if the value cannot be represented +// C2.2.3 Modified immediate constants in A64 ing-point instructions +// abcdefgh +// V +// aBbbbbbc defgh000 00000000 00000000 +// B = NOT(b) +constexpr bool f32_to_fimm8(uint32_t u32, oaknut::FImm8& fp8) { + const uint32_t sign = (u32 >> 31) & 1; + int32_t exp = ((u32 >> 23) & 0xff) - 127; + int64_t mantissa = u32 & 0x7fffff; + + // Too many mantissa bits + if (mantissa & 0x7ffff) { + return false; + } + // Too many exp bits + if (exp < -3 || exp > 4) { + return false; + } + + // mantissa = (16 + e:f:g:h) / 16. + mantissa >>= 19; + if ((mantissa & 0b1111) != mantissa) { + return false; + } + + // exp = (NOT(b):c:d) - 3 + exp = ((exp + 3) & 0b111) ^ 0b100; + + fp8 = oaknut::FImm8(sign, exp, uint8_t(mantissa)); + return true; +} + +// Attempts to convert an fp64 bit-value into an fp8-immediate value for FMOV +// returns false if the value cannot be represented +// C2.2.3 Modified immediate constants in A64 floating-point instructions +// abcdefgh +// V +// aBbbbbbb bbcdefgh 00000000 00000000 00000000 00000000 00000000 00000000 +// B = NOT(b) +constexpr bool f64_to_fimm8(uint64_t u64, oaknut::FImm8& fp8) { + const uint32_t sign = (u64 >> 63) & 1; + int32_t exp = ((u64 >> 52) & 0x7ff) - 1023; + int64_t mantissa = u64 & 0xfffffffffffffULL; + + // Too many mantissa bits + if (mantissa & 0xffffffffffffULL) { + return false; + } + // Too many exp bits + if (exp < -3 || exp > 4) { + return false; + } + + // mantissa = (16 + e:f:g:h) / 16. + mantissa >>= 48; + if ((mantissa & 0b1111) != mantissa) { + return false; + } + + // exp = (NOT(b):c:d) - 3 + exp = ((exp + 3) & 0b111) ^ 0b100; + + fp8 = oaknut::FImm8(sign, exp, uint8_t(mantissa)); + return true; +} + +} // namespace a64 +} // namespace backend +} // namespace cpu +} // namespace xe + +#endif // XENIA_CPU_BACKEND_A64_A64_UTIL_H_ diff --git a/src/xenia/cpu/backend/a64/premake5.lua b/src/xenia/cpu/backend/a64/premake5.lua new file mode 100644 index 000000000..32b2d51a0 --- /dev/null +++ b/src/xenia/cpu/backend/a64/premake5.lua @@ -0,0 +1,31 @@ +project_root = "../../../../.." 
+include(project_root.."/tools/build") + +group("src") +project("xenia-cpu-backend-a64") + uuid("495f3f3e-f5e8-489a-bd0f-289d0495bc08") + filter("architecture:ARM64") + kind("StaticLib") + filter("architecture:not ARM64") + kind("None") + filter({}) + language("C++") + cppdialect("C++20") + links({ + "fmt", + "xenia-base", + "xenia-cpu", + }) + defines({ + }) + + disablewarnings({ + -- Silence errors in oaknut + "4146", -- unary minus operator applied to unsigned type, result still unsigned + "4267" -- 'initializing': conversion from 'size_t' to 'uint32_t', possible loss of data + }) + + includedirs({ + project_root.."/third_party/oaknut/include", + }) + local_platform_files() diff --git a/src/xenia/cpu/backend/x64/premake5.lua b/src/xenia/cpu/backend/x64/premake5.lua index f2a990f29..90e5288cf 100644 --- a/src/xenia/cpu/backend/x64/premake5.lua +++ b/src/xenia/cpu/backend/x64/premake5.lua @@ -4,7 +4,11 @@ include(project_root.."/tools/build") group("src") project("xenia-cpu-backend-x64") uuid("7d8d5dce-4696-4197-952a-09506f725afe") - kind("StaticLib") + filter("architecture:x86_64") + kind("StaticLib") + filter("architecture:not x86_64") + kind("None") + filter({}) language("C++") links({ "capstone", diff --git a/src/xenia/cpu/breakpoint.cc b/src/xenia/cpu/breakpoint.cc index 9572d4760..ebcd84efb 100644 --- a/src/xenia/cpu/breakpoint.cc +++ b/src/xenia/cpu/breakpoint.cc @@ -48,7 +48,8 @@ std::string Breakpoint::to_string() const { str += " " + functions[0]->name(); return str; } else { - return std::string("x64 ") + xe::string_util::to_hex_string(host_address()); + return std::string(XE_HOST_ARCH_NAME " ") + + xe::string_util::to_hex_string(host_address()); } } diff --git a/src/xenia/cpu/cpu_flags.cc b/src/xenia/cpu/cpu_flags.cc index 614dabae8..de7fb78e8 100644 --- a/src/xenia/cpu/cpu_flags.cc +++ b/src/xenia/cpu/cpu_flags.cc @@ -9,7 +9,7 @@ #include "xenia/cpu/cpu_flags.h" -DEFINE_string(cpu, "any", "CPU backend [any, x64].", "CPU"); +DEFINE_string(cpu, "any", "CPU backend [any, x64, a64].", "CPU"); DEFINE_string( load_module_map, "", diff --git a/src/xenia/cpu/ppc/testing/ppc_testing_main.cc b/src/xenia/cpu/ppc/testing/ppc_testing_main.cc index 5faa4998e..639b14ba3 100644 --- a/src/xenia/cpu/ppc/testing/ppc_testing_main.cc +++ b/src/xenia/cpu/ppc/testing/ppc_testing_main.cc @@ -23,6 +23,8 @@ #if XE_ARCH_AMD64 #include "xenia/cpu/backend/x64/x64_backend.h" +#elif XE_ARCH_ARM64 +#include "xenia/cpu/backend/a64/a64_backend.h" #endif // XE_ARCH #if XE_COMPILER_MSVC @@ -203,11 +205,17 @@ class TestRunner { if (cvars::cpu == "x64") { backend.reset(new xe::cpu::backend::x64::X64Backend()); } +#elif XE_ARCH_ARM64 + if (cvars::cpu == "a64") { + backend.reset(new xe::cpu::backend::a64::A64Backend()); + } #endif // XE_ARCH if (cvars::cpu == "any") { if (!backend) { #if XE_ARCH_AMD64 backend.reset(new xe::cpu::backend::x64::X64Backend()); +#elif XE_ARCH_ARM64 + backend.reset(new xe::cpu::backend::a64::A64Backend()); #endif // XE_ARCH } } diff --git a/src/xenia/cpu/ppc/testing/premake5.lua b/src/xenia/cpu/ppc/testing/premake5.lua index bca2bb81e..96afb593e 100644 --- a/src/xenia/cpu/ppc/testing/premake5.lua +++ b/src/xenia/cpu/ppc/testing/premake5.lua @@ -27,7 +27,11 @@ project("xenia-cpu-ppc-tests") links({ "xenia-cpu-backend-x64", }) - filter("platforms:Windows") + filter("architecture:ARM64") + links({ + "xenia-cpu-backend-a64", + }) + filter("platforms:Windows-*") debugdir(project_root) debugargs({ "2>&1", diff --git a/src/xenia/cpu/processor.cc b/src/xenia/cpu/processor.cc index 
eb63a1abf..cf8c028f0 100644 --- a/src/xenia/cpu/processor.cc +++ b/src/xenia/cpu/processor.cc @@ -34,7 +34,11 @@ #include "xenia/cpu/xex_module.h" // TODO(benvanik): based on compiler support +#ifdef XE_ARCH_AMD64 #include "xenia/cpu/backend/x64/x64_backend.h" +#elif XE_ARCH_ARM64 +#include "xenia/cpu/backend/a64/a64_backend.h" +#endif // XE_ARCH #if 0 && DEBUG #define DEFAULT_DEBUG_FLAG true diff --git a/src/xenia/cpu/processor.h b/src/xenia/cpu/processor.h index 0aa06a26d..5e13ab818 100644 --- a/src/xenia/cpu/processor.h +++ b/src/xenia/cpu/processor.h @@ -162,7 +162,7 @@ class Processor { // This will cancel any active step operations and resume all threads. void Continue(); - // Steps the given thread a single x64 host instruction. + // Steps the given thread a single host instruction. // If the step is over a branch the branch will be followed. void StepHostInstruction(uint32_t thread_id); diff --git a/src/xenia/cpu/stack_walker_win.cc b/src/xenia/cpu/stack_walker_win.cc index aaaab140a..7444e725a 100644 --- a/src/xenia/cpu/stack_walker_win.cc +++ b/src/xenia/cpu/stack_walker_win.cc @@ -58,6 +58,12 @@ LPSYMFUNCTIONTABLEACCESS64 sym_function_table_access_64_ = nullptr; LPSYMGETMODULEBASE64 sym_get_module_base_64_ = nullptr; LPSYMGETSYMFROMADDR64 sym_get_sym_from_addr_64_ = nullptr; +#if XE_ARCH_AMD64 +static const DWORD kMachineType = IMAGE_FILE_MACHINE_AMD64; +#elif XE_ARCH_ARM64 +static const DWORD kMachineType = IMAGE_FILE_MACHINE_ARM64; +#endif + namespace xe { namespace cpu { @@ -173,40 +179,70 @@ class Win32StackWalker : public StackWalker { } else { // Copy thread context local. We will be modifying it during stack // walking, so we don't want to mess with the incoming copy. +#if XE_ARCH_AMD64 thread_context.Rip = in_host_context->rip; thread_context.EFlags = in_host_context->eflags; std::memcpy(&thread_context.Rax, in_host_context->int_registers, sizeof(in_host_context->int_registers)); std::memcpy(&thread_context.Xmm0, in_host_context->xmm_registers, sizeof(in_host_context->xmm_registers)); +#elif XE_ARCH_ARM64 + thread_context.Pc = in_host_context->pc; + thread_context.Cpsr = in_host_context->cpsr; + std::memcpy(thread_context.X, in_host_context->x, + sizeof(in_host_context->x)); + std::memcpy(&thread_context.V, in_host_context->v, + sizeof(in_host_context->v)); +#endif } if (out_host_context) { // Write out the captured thread context if the caller asked for it. +#if XE_ARCH_AMD64 out_host_context->rip = thread_context.Rip; out_host_context->eflags = thread_context.EFlags; std::memcpy(out_host_context->int_registers, &thread_context.Rax, sizeof(out_host_context->int_registers)); std::memcpy(out_host_context->xmm_registers, &thread_context.Xmm0, sizeof(out_host_context->xmm_registers)); +#elif XE_ARCH_ARM64 + out_host_context->pc = thread_context.Pc; + out_host_context->cpsr = thread_context.Cpsr; + std::memcpy(out_host_context->x, &thread_context.X, + sizeof(out_host_context->x)); + std::memcpy(out_host_context->v, &thread_context.V, + sizeof(out_host_context->v)); +#endif } // Setup the frame for walking. 
STACKFRAME64 stack_frame = {0}; stack_frame.AddrPC.Mode = AddrModeFlat; +#if XE_ARCH_AMD64 stack_frame.AddrPC.Offset = thread_context.Rip; +#elif XE_ARCH_ARM64 + stack_frame.AddrPC.Offset = thread_context.Pc; +#endif stack_frame.AddrFrame.Mode = AddrModeFlat; +#if XE_ARCH_AMD64 stack_frame.AddrFrame.Offset = thread_context.Rbp; +#elif XE_ARCH_ARM64 + stack_frame.AddrFrame.Offset = thread_context.Fp; +#endif stack_frame.AddrStack.Mode = AddrModeFlat; +#if XE_ARCH_AMD64 stack_frame.AddrStack.Offset = thread_context.Rsp; +#elif XE_ARCH_ARM64 + stack_frame.AddrStack.Offset = thread_context.Sp; +#endif // Walk the stack. // Note that StackWalk64 is thread safe, though other dbghelp functions are // not. size_t frame_index = 0; while (frame_index < frame_count && - stack_walk_64_(IMAGE_FILE_MACHINE_AMD64, GetCurrentProcess(), - thread_handle, &stack_frame, &thread_context, nullptr, + stack_walk_64_(kMachineType, GetCurrentProcess(), thread_handle, + &stack_frame, &thread_context, nullptr, XSymFunctionTableAccess64, XSymGetModuleBase64, nullptr) == TRUE) { if (frame_index >= frame_offset) { @@ -237,7 +273,7 @@ class Win32StackWalker : public StackWalker { if (function) { frame.guest_symbol.function = function; // Figure out where in guest code we are by looking up the - // displacement in x64 from the JIT'ed code start to the PC. + // displacement in bytes from the JIT'ed code start to the PC. if (function->is_guest()) { auto guest_function = static_cast(function); // Adjust the host PC by -1 so that we will go back into whatever diff --git a/src/xenia/cpu/testing/premake5.lua b/src/xenia/cpu/testing/premake5.lua index 5e70fb3f8..afc1540e7 100644 --- a/src/xenia/cpu/testing/premake5.lua +++ b/src/xenia/cpu/testing/premake5.lua @@ -19,6 +19,12 @@ test_suite("xenia-cpu-tests", project_root, ".", { links = { "xenia-cpu-backend-x64", }, - } + }, + { + filter = 'architecture:ARM64', + links = { + "xenia-cpu-backend-a64", + }, + }, }, }) diff --git a/src/xenia/cpu/testing/util.h b/src/xenia/cpu/testing/util.h index 8f6df2d57..f77c5a11a 100644 --- a/src/xenia/cpu/testing/util.h +++ b/src/xenia/cpu/testing/util.h @@ -13,7 +13,12 @@ #include #include "xenia/base/platform.h" +#if XE_ARCH_AMD64 #include "xenia/cpu/backend/x64/x64_backend.h" +#elif XE_ARCH_ARM64 +#include "xenia/cpu/backend/a64/a64_backend.h" +#endif // XE_ARCH + #include "xenia/cpu/hir/hir_builder.h" #include "xenia/cpu/ppc/ppc_context.h" #include "xenia/cpu/ppc/ppc_frontend.h" @@ -39,6 +44,8 @@ class TestFunction { std::unique_ptr backend; #if XE_ARCH_AMD64 backend.reset(new xe::cpu::backend::x64::X64Backend()); +#elif XE_ARCH_ARM64 + backend.reset(new xe::cpu::backend::a64::A64Backend()); #endif // XE_ARCH if (backend) { auto processor = std::make_unique(memory.get(), nullptr); @@ -74,7 +81,7 @@ class TestFunction { uint32_t stack_address = memory_size - stack_size; uint32_t thread_state_address = stack_address - 0x1000; auto thread_state = std::make_unique(processor.get(), 0x100); - assert_always(); // TODO: Allocate a thread stack!!! + // assert_always(); // TODO: Allocate a thread stack!!! 
auto ctx = thread_state->context(); ctx->lr = 0xBCBCBCBC; diff --git a/src/xenia/debug/ui/debug_window.cc b/src/xenia/debug/ui/debug_window.cc index 07d4404db..06dba452a 100644 --- a/src/xenia/debug/ui/debug_window.cc +++ b/src/xenia/debug/ui/debug_window.cc @@ -63,7 +63,13 @@ DebugWindow::DebugWindow(Emulator* emulator, processor_(emulator->processor()), app_context_(app_context), window_(xe::ui::Window::Create(app_context_, kBaseTitle, 1500, 1000)) { - if (cs_open(CS_ARCH_X86, CS_MODE_64, &capstone_handle_) != CS_ERR_OK) { + if ( +#ifdef XE_ARCH_AMD64 + cs_open(CS_ARCH_X86, CS_MODE_64, &capstone_handle_) +#elif XE_ARCH_ARM64 + cs_open(CS_ARCH_ARM64, CS_MODE_LITTLE_ENDIAN, &capstone_handle_) +#endif + != CS_ERR_OK) { assert_always("Failed to initialize capstone"); } cs_option(capstone_handle_, CS_OPT_SYNTAX, CS_OPT_SYNTAX_INTEL); @@ -338,7 +344,7 @@ void DebugWindow::DrawSourcePane() { // copy button // address start - end // name text box (editable) - // combo for interleaved + [ppc, hir, opt hir, x64 + byte with sizes] + // combo for interleaved + [ppc, hir, opt hir, asm + byte with sizes] ImGui::AlignTextToFramePadding(); ImGui::Text("%s", function->module()->name().c_str()); ImGui::SameLine(); @@ -383,11 +389,11 @@ void DebugWindow::DrawSourcePane() { } ImGui::SameLine(); if (state_.source_display_mode > 0) { - // Only show x64 step button if we have x64 visible. + // Only show asm step button if we have asm visible. ImGui::Dummy(ImVec2(4, 0)); ImGui::SameLine(); ImGui::PushButtonRepeat(true); - if (ImGui::ButtonEx("Step x64", ImVec2(0, 0), + if (ImGui::ButtonEx("Step " XE_HOST_ARCH_NAME, ImVec2(0, 0), can_step ? 0 : ImGuiItemFlags_Disabled)) { // By enabling the button when stepping we allow repeat behavior. if (processor_->execution_state() != cpu::ExecutionState::kStepping) { @@ -396,8 +402,8 @@ void DebugWindow::DrawSourcePane() { } ImGui::PopButtonRepeat(); if (ImGui::IsItemHovered()) { - ImGui::SetTooltip( - "Step one x64 instruction on the current thread (hold for many)."); + ImGui::SetTooltip("Step one " XE_HOST_ARCH_NAME + " instruction on the current thread (hold for many)."); } ImGui::SameLine(); } @@ -412,9 +418,9 @@ void DebugWindow::DrawSourcePane() { if (function->is_guest()) { const char* kSourceDisplayModes[] = { "PPC", - "PPC+HIR+x64", - "PPC+HIR (opt)+x64", - "PPC+x64", + "PPC+HIR+" XE_HOST_ARCH_NAME, + "PPC+HIR (opt)+" XE_HOST_ARCH_NAME, + "PPC+" XE_HOST_ARCH_NAME, }; ImGui::PushItemWidth(90); ImGui::Combo("##display_mode", &state_.source_display_mode, @@ -459,7 +465,7 @@ void DebugWindow::DrawGuestFunctionSource() { // labels get their own line with duped addresses // show xrefs to labels? // hir greyed and offset (background color change?) - // x64 greyed and offset with native address + // asm greyed and offset with native address // hover on registers/etc for tooltip/highlight others // click register to go to location of last write // click code address to jump to code @@ -472,18 +478,18 @@ void DebugWindow::DrawGuestFunctionSource() { bool draw_hir = false; bool draw_hir_opt = false; - bool draw_x64 = false; + bool draw_asm = false; switch (state_.source_display_mode) { case 1: draw_hir = true; - draw_x64 = true; + draw_asm = true; break; case 2: draw_hir_opt = true; - draw_x64 = true; + draw_asm = true; break; case 3: - draw_x64 = true; + draw_asm = true; break; } @@ -498,8 +504,8 @@ void DebugWindow::DrawGuestFunctionSource() { if (draw_hir_opt) { // TODO(benvanik): get HIR and draw preamble. } - if (draw_x64) { - // x64 preamble. 
+ if (draw_asm) { + // asm preamble. DrawMachineCodeSource(function->machine_code(), source_map[0].code_offset); } @@ -512,7 +518,7 @@ bool is_current_instr = address == guest_pc; if (is_current_instr) { ImGui::PushStyleColor(ImGuiCol_Text, ImVec4(0.0f, 1.0f, 0.0f, 1.0f)); - if (!draw_x64) { + if (!draw_asm) { ScrollToSourceIfPcChanged(); } } @@ -548,7 +554,7 @@ if (draw_hir_opt) { // TODO(benvanik): get HIR and draw for this PPC function. } - if (draw_x64) { + if (draw_asm) { const uint8_t* machine_code_start = function->machine_code() + source_map[source_map_index].code_offset; const size_t machine_code_length = @@ -851,10 +857,10 @@ void DebugWindow::DrawRegistersPane() { if (state_.register_group == RegisterGroup::kHostGeneral) { ImGui::PushStyleColor(ImGuiCol_Button, ImGui::GetStyle().Colors[ImGuiCol_ButtonActive]); - ImGui::Button("x64"); + ImGui::Button(XE_HOST_ARCH_NAME); ImGui::PopStyleColor(); } else { - if (ImGui::Button("x64")) { + if (ImGui::Button(XE_HOST_ARCH_NAME)) { state_.register_group = RegisterGroup::kHostGeneral; } } @@ -862,10 +868,10 @@ if (state_.register_group == RegisterGroup::kHostVector) { ImGui::PushStyleColor(ImGuiCol_Button, ImGui::GetStyle().Colors[ImGuiCol_ButtonActive]); - ImGui::Button("XMM"); + ImGui::Button(XE_HOST_ARCH_NAME "-vec"); ImGui::PopStyleColor(); } else { - if (ImGui::Button("XMM")) { + if (ImGui::Button(XE_HOST_ARCH_NAME "-vec")) { state_.register_group = RegisterGroup::kHostVector; } } @@ -958,6 +964,7 @@ } break; case RegisterGroup::kHostGeneral: { ImGui::BeginChild("##host_general"); +#if XE_ARCH_AMD64 for (int i = 0; i < 18; ++i) { auto reg = static_cast<X64Register>(i); ImGui::BeginGroup(); @@ -995,6 +1002,46 @@ i, thread_info->host_context.xmm_registers[i].f32); ImGui::EndGroup(); } +#elif XE_ARCH_ARM64 + // TODO(wunkolo): print ARM64 registers + for (int i = 0; i < 34; ++i) { + auto reg = static_cast<Arm64Register>(i); + ImGui::BeginGroup(); + ImGui::AlignTextToFramePadding(); + ImGui::Text("%3s", HostThreadContext::GetRegisterName(reg)); + ImGui::SameLine(); + ImGui::Dummy(ImVec2(4, 0)); + ImGui::SameLine(); + if (reg == Arm64Register::kPc) { + dirty_guest_context |= + DrawRegisterTextBox(i, &thread_info->host_context.pc); + } else if (reg == Arm64Register::kPstate) { + dirty_guest_context |= + DrawRegisterTextBox(i, &thread_info->host_context.cpsr); + } else { + dirty_guest_context |= + DrawRegisterTextBox(i, &thread_info->host_context.x[i]); + } + ImGui::EndGroup(); + } + ImGui::EndChild(); + } break; + case RegisterGroup::kHostVector: { + ImGui::BeginChild("##host_vector"); + for (int i = 0; i < 32; ++i) { + auto reg = static_cast<Arm64Register>( + static_cast<int>(Arm64Register::kV0) + i); + ImGui::BeginGroup(); + ImGui::AlignTextToFramePadding(); + ImGui::Text("%5s", HostThreadContext::GetRegisterName(reg)); + ImGui::SameLine(); + ImGui::Dummy(ImVec2(4, 0)); + ImGui::SameLine(); + dirty_host_context |= + DrawRegisterTextBoxes(i, thread_info->host_context.v[i].f32); + ImGui::EndGroup(); + } +#endif ImGui::EndChild(); } } @@ -1144,7 +1191,8 @@ void DebugWindow::DrawBreakpointsPane() { ImGui::OpenPopup("##add_code_breakpoint"); } if (ImGui::IsItemHovered()) { - ImGui::SetTooltip("Add a code breakpoint for either PPC or x64."); + ImGui::SetTooltip( + "Add a code breakpoint for either PPC or " XE_HOST_ARCH_NAME "."); } // TODO(benvanik): remove this set focus workaround
when imgui is fixed: // https://github.com/ocornut/imgui/issues/343 @@ -1178,15 +1226,15 @@ ImGui::Dummy(ImVec2(0, 2)); ImGui::AlignTextToFramePadding(); - ImGui::Text("x64"); + ImGui::Text(XE_HOST_ARCH_NAME); ImGui::SameLine(); ImGui::Dummy(ImVec2(2, 0)); ImGui::SameLine(); - static char x64_buffer[64] = {0}; + static char asm_buffer[64] = {0}; ImGui::PushItemWidth(100); - if (ImGui::InputText("##host_address", x64_buffer, 17, input_flags)) { - uint64_t address = string_util::from_string<uint64_t>(x64_buffer, true); - x64_buffer[0] = 0; + if (ImGui::InputText("##host_address", asm_buffer, 17, input_flags)) { + uint64_t address = string_util::from_string<uint64_t>(asm_buffer, true); + asm_buffer[0] = 0; CreateCodeBreakpoint(Breakpoint::AddressType::kHost, address); ImGui::CloseCurrentPopup(); } diff --git a/src/xenia/emulator.cc b/src/xenia/emulator.cc index cca28982f..836ba3420 100644 --- a/src/xenia/emulator.cc +++ b/src/xenia/emulator.cc @@ -53,6 +53,8 @@ #if XE_ARCH_AMD64 #include "xenia/cpu/backend/x64/x64_backend.h" +#elif XE_ARCH_ARM64 +#include "xenia/cpu/backend/a64/a64_backend.h" #endif // XE_ARCH DECLARE_int32(user_language); @@ -172,11 +174,18 @@ X_STATUS Emulator::Setup( if (cvars::cpu == "x64") { backend.reset(new xe::cpu::backend::x64::X64Backend()); } +#elif XE_ARCH_ARM64 + if (cvars::cpu == "a64") { + backend.reset(new xe::cpu::backend::a64::A64Backend()); + } #endif // XE_ARCH if (cvars::cpu == "any") { if (!backend) { #if XE_ARCH_AMD64 backend.reset(new xe::cpu::backend::x64::X64Backend()); +#elif XE_ARCH_ARM64 + // TODO(wunkolo): Arm64 backend + backend.reset(new xe::cpu::backend::a64::A64Backend()); #endif // XE_ARCH } } diff --git a/src/xenia/gpu/d3d12/premake5.lua b/src/xenia/gpu/d3d12/premake5.lua index f0ee8cc02..92633f74c 100644 --- a/src/xenia/gpu/d3d12/premake5.lua +++ b/src/xenia/gpu/d3d12/premake5.lua @@ -70,6 +70,11 @@ project("xenia-gpu-d3d12-trace-viewer") "xenia-cpu-backend-x64", }) + + filter("architecture:ARM64") + links({ + "xenia-cpu-backend-a64", + }) + group("src") project("xenia-gpu-d3d12-trace-dump") uuid("686b859c-0046-44c4-a02c-41fc3fb75698") @@ -120,3 +125,8 @@ project("xenia-gpu-d3d12-trace-dump") links({ "xenia-cpu-backend-x64", }) + + filter("architecture:ARM64") + links({ + "xenia-cpu-backend-a64", + }) diff --git a/src/xenia/gpu/premake5.lua b/src/xenia/gpu/premake5.lua index 971d6ef70..850580ca2 100644 --- a/src/xenia/gpu/premake5.lua +++ b/src/xenia/gpu/premake5.lua @@ -43,7 +43,7 @@ project("xenia-gpu-shader-compiler") "../base/console_app_main_"..platform_suffix..".cc", }) - filter("platforms:Windows") + filter("platforms:Windows-*") -- Only create the .user file if it doesn't already exist. local user_file = project_root.."/build/xenia-gpu-shader-compiler.vcxproj.user" if not os.isfile(user_file) then diff --git a/src/xenia/gpu/vulkan/premake5.lua b/src/xenia/gpu/vulkan/premake5.lua index 90ae7c46e..41f862aeb 100644 --- a/src/xenia/gpu/vulkan/premake5.lua +++ b/src/xenia/gpu/vulkan/premake5.lua @@ -68,6 +68,11 @@ project("xenia-gpu-vulkan-trace-viewer") "xenia-cpu-backend-x64", }) + + filter("architecture:ARM64") + links({ + "xenia-cpu-backend-a64", + }) + filter("platforms:Linux") links({ "X11", @@ -75,7 +80,7 @@ project("xenia-gpu-vulkan-trace-viewer") "X11-xcb", }) - filter("platforms:Windows") + filter("platforms:Windows-*") -- Only create the .user file if it doesn't already exist.
local user_file = project_root.."/build/xenia-gpu-vulkan-trace-viewer.vcxproj.user" if not os.isfile(user_file) then @@ -131,6 +136,11 @@ project("xenia-gpu-vulkan-trace-dump") "xenia-cpu-backend-x64", }) + filter("architecture:ARM64") + links({ + "xenia-cpu-backend-a64", + }) + filter("platforms:Linux") links({ "X11", @@ -138,7 +148,7 @@ project("xenia-gpu-vulkan-trace-dump") "X11-xcb", }) - filter("platforms:Windows") + filter("platforms:Windows-*") -- Only create the .user file if it doesn't already exist. local user_file = project_root.."/build/xenia-gpu-vulkan-trace-dump.vcxproj.user" if not os.isfile(user_file) then diff --git a/src/xenia/hid/premake5.lua b/src/xenia/hid/premake5.lua index 4e961f623..844a313f4 100644 --- a/src/xenia/hid/premake5.lua +++ b/src/xenia/hid/premake5.lua @@ -53,7 +53,7 @@ project("xenia-hid-demo") "X11-xcb", }) - filter("platforms:Windows") + filter("platforms:Windows-*") links({ "xenia-hid-winkey", "xenia-hid-xinput", diff --git a/src/xenia/ui/premake5.lua b/src/xenia/ui/premake5.lua index 6aff82bec..8f50fd515 100644 --- a/src/xenia/ui/premake5.lua +++ b/src/xenia/ui/premake5.lua @@ -19,7 +19,7 @@ project("xenia-ui") -- Exports JNI functions. wholelib("On") - filter("platforms:Windows") + filter("platforms:Windows-*") links({ "dwmapi", "dxgi", diff --git a/third_party/SDL2.lua b/third_party/SDL2.lua index 972aa1aa7..2186de6b7 100644 --- a/third_party/SDL2.lua +++ b/third_party/SDL2.lua @@ -26,7 +26,7 @@ end -- Call this function in project scope to include the SDL2 headers. -- function sdl2_include() - filter("platforms:Windows") + filter("platforms:Windows-*") includedirs({ path.getrelative(".", third_party_path) .. "/SDL2/include", }) diff --git a/third_party/capstone.lua b/third_party/capstone.lua index 6dc415974..8dfb328f7 100644 --- a/third_party/capstone.lua +++ b/third_party/capstone.lua @@ -4,13 +4,37 @@ project("capstone") kind("StaticLib") language("C") defines({ - "CAPSTONE_X86_ATT_DISABLE", "CAPSTONE_DIET_NO", - "CAPSTONE_X86_REDUCE_NO", - "CAPSTONE_HAS_X86", "CAPSTONE_USE_SYS_DYN_MEM", "_LIB", }) + filter("architecture:x86_64") + defines({ + "CAPSTONE_HAS_X86", + "CAPSTONE_X86_ATT_DISABLE", + "CAPSTONE_X86_REDUCE_NO", + }) + files({ + "capstone/arch/X86/*.c", + "capstone/arch/X86/*.h", + "capstone/arch/X86/*.inc", + }) + force_compile_as_c({ + "capstone/arch/X86/**.c", + }) + filter("architecture:ARM64") + defines({ + "CAPSTONE_HAS_ARM64", + }) + files({ + "capstone/arch/AArch64/*.c", + "capstone/arch/AArch64/*.h", + "capstone/arch/AArch64/*.inc", + }) + force_compile_as_c({ + "capstone/arch/AArch64/**.c", + }) + filter({}) includedirs({ "capstone", "capstone/include", @@ -32,12 +56,7 @@ project("capstone") "capstone/SStream.h", "capstone/utils.c", "capstone/utils.h", - - "capstone/arch/X86/*.c", - "capstone/arch/X86/*.h", - "capstone/arch/X86/*.inc", }) force_compile_as_c({ - "capstone/**.c", - "capstone/arch/X86/**.c", - }) + "capstone/**.c", + }) \ No newline at end of file diff --git a/third_party/discord-rpc.lua b/third_party/discord-rpc.lua index 1f6e795f8..ca7d0370e 100644 --- a/third_party/discord-rpc.lua +++ b/third_party/discord-rpc.lua @@ -30,7 +30,7 @@ project("discord-rpc") files({ "discord-rpc/src/discord_register_osx.m" }) - filter("platforms:Windows") + filter("platforms:Windows-*") files({ "discord-rpc/src/connection_win.cpp", "discord-rpc/src/discord_register_win.cpp" diff --git a/third_party/microprofile/microprofileui.h b/third_party/microprofile/microprofileui.h index d422445dd..8f47a619d 100644 --- 
a/third_party/microprofile/microprofileui.h +++ b/third_party/microprofile/microprofileui.h @@ -3252,7 +3252,7 @@ void MicroProfileDraw(uint32_t nWidth, uint32_t nHeight) #if MICROPROFILE_CONTEXT_SWITCH_TRACE MicroProfileStringArrayAddLiteral(&Debug, "Context Switch"); - MicroProfileStringArrayFormat(&Debug, "%9d [%7d]", S.nContextSwitchUsage, MICROPROFILE_CONTEXT_SWITCH_BUFFER_SIZE / S.nContextSwitchUsage ); + MicroProfileStringArrayFormat(&Debug, "%9d [%7d]", S.nContextSwitchUsage, S.nContextSwitchUsage ? MICROPROFILE_CONTEXT_SWITCH_BUFFER_SIZE / S.nContextSwitchUsage : 0 ); #endif for(int i = 0; i < MICROPROFILE_MAX_THREADS; ++i) diff --git a/third_party/mspack.lua b/third_party/mspack.lua index c1d1b44a5..94d6a6c81 100644 --- a/third_party/mspack.lua +++ b/third_party/mspack.lua @@ -28,7 +28,7 @@ project("mspack") "mspack/system.h", }) - filter("platforms:Windows") + filter("platforms:Windows-*") defines({ }) filter("platforms:Linux") diff --git a/third_party/oaknut b/third_party/oaknut new file mode 160000 index 000000000..94c726ce0 --- /dev/null +++ b/third_party/oaknut @@ -0,0 +1 @@ +Subproject commit 94c726ce0338b054eb8cb5ea91de8fe6c19f4392 diff --git a/third_party/snappy.lua b/third_party/snappy.lua index bf13b762e..3e6b1009d 100644 --- a/third_party/snappy.lua +++ b/third_party/snappy.lua @@ -18,5 +18,5 @@ project("snappy") "snappy/snappy.h", }) - filter("platforms:Windows") + filter("platforms:Windows-*") warnings("Off") -- Too many warnings. diff --git a/tools/build/scripts/platform_files.lua b/tools/build/scripts/platform_files.lua index ec1579cf0..332436dad 100644 --- a/tools/build/scripts/platform_files.lua +++ b/tools/build/scripts/platform_files.lua @@ -20,7 +20,7 @@ local function match_platform_files(base_path, base_match) removefiles({base_path.."/".."**_android.h", base_path.."/".."**_android.cc"}) removefiles({base_path.."/".."**_mac.h", base_path.."/".."**_mac.cc"}) removefiles({base_path.."/".."**_win.h", base_path.."/".."**_win.cc"}) - filter("platforms:Windows") + filter("platforms:Windows-*") files({ base_path.."/"..base_match.."_win.h", base_path.."/"..base_match.."_win.cc", diff --git a/xenia-build b/xenia-build index 130032323..cfd134143 100755 --- a/xenia-build +++ b/xenia-build @@ -781,6 +781,8 @@ class BaseBuildCommand(Command): self.parser.add_argument( '--target', action='append', default=[], help='Builds only the given target(s).') + self.parser.add_argument( + '--arch', default='x86_64', help='Builds only the given architecture') self.parser.add_argument( '--force', action='store_true', help='Forces a full rebuild.') @@ -823,6 +825,7 @@ class BaseBuildCommand(Command): '/m', '/v:m', '/p:Configuration=' + args['config'], + '/p:Platform=' + "Windows-" + args['arch'], ] + ([targets] if targets is not None else []) + pass_args, shell=False) elif sys.platform == 'darwin':
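
A quick illustration of the FMOV-immediate helpers added in a64_util.h above. This harness is a sketch only and is not part of the patch; the check_fimm8 name and the sample constants are assumptions. f32_to_fimm8 succeeds only when the value fits the abcdefgh form described in the comments, meaning at most the top four mantissa bits set and an unbiased exponent in [-3, 4]; anything else has to be materialized another way by the emitter.

#include <cstdint>
#include <cstdio>

#include "xenia/cpu/backend/a64/a64_util.h"

// Illustrative harness (not part of the patch): prints whether each fp32 bit
// pattern can be encoded as a single FMOV (immediate).
void check_fimm8(uint32_t bits, const char* label) {
  oaknut::FImm8 imm(0, 0, 0);
  const bool ok = xe::cpu::backend::a64::f32_to_fimm8(bits, imm);
  std::printf("%-8s -> %s\n", label, ok ? "FMOV immediate" : "not encodable");
}

int main() {
  check_fimm8(0x40000000u, "2.0f");    // exp +1, mantissa 0: encodable
  check_fimm8(0x3F000000u, "0.5f");    // exp -1, mantissa 0: encodable
  check_fimm8(0x40490FDBu, "pi");      // needs more than 4 mantissa bits: rejected
  check_fimm8(0x42C80000u, "100.0f");  // exp +6 is outside [-3, 4]: rejected
  return 0;
}

On the build side, the new --arch option in xenia-build composes the MSBuild platform as "Windows-" plus the argument, so an invocation along the lines of ./xenia-build build --config=Release --arch=ARM64 should select the Windows-ARM64 premake platform defined above (the exact command line is an assumption; the default remains x86_64).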