From 326e718035c531a43fa72da45e6e9fe371739a61 Mon Sep 17 00:00:00 2001
From: Triang3l
Date: Wed, 6 Jul 2022 21:02:59 +0300
Subject: [PATCH] [CPU] MMIO: Arm64, load register writes + exception cleanup

---
 src/xenia/base/exception_handler.cc       |   9 +-
 src/xenia/base/exception_handler.h        |  73 ++++-
 src/xenia/base/exception_handler_posix.cc |  57 +++-
 src/xenia/base/exception_handler_win.cc   |  20 +-
 src/xenia/base/host_thread_context.cc     |  24 +-
 src/xenia/base/host_thread_context.h      |  14 +-
 src/xenia/cpu/mmio_handler.cc             | 341 +++++++++++++++++-----
 src/xenia/cpu/mmio_handler.h              |  56 ++++
 src/xenia/cpu/processor.cc                |   7 +
 9 files changed, 496 insertions(+), 105 deletions(-)

diff --git a/src/xenia/base/exception_handler.cc b/src/xenia/base/exception_handler.cc
index 6198797fe..1b6cbd4b3 100644
--- a/src/xenia/base/exception_handler.cc
+++ b/src/xenia/base/exception_handler.cc
@@ -40,11 +40,14 @@ namespace xe {
 // OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 bool IsArm64LoadPrefetchStore(uint32_t instruction, bool& is_store_out) {
-  if ((instruction & kArm64LoadStoreAnyMask) != kArm64LoadStoreAnyValue) {
+  if ((instruction & kArm64LoadLiteralFMask) == kArm64LoadLiteralFixed) {
+    // Literal (PC-relative) accesses are always loads or prefetches.
+    is_store_out = false;
+    return true;
+  }
+  if ((instruction & kArm64LoadStoreAnyFMask) != kArm64LoadStoreAnyFixed) {
     return false;
   }
-  if ((instruction & kArm64LoadStorePairAnyMask) ==
-      kArm64LoadStorePairAnyValue) {
+  if ((instruction & kArm64LoadStorePairAnyFMask) ==
+      kArm64LoadStorePairAnyFixed) {
     is_store_out = !(instruction & kArm64LoadStorePairLoadBit);
     return true;
   }
diff --git a/src/xenia/base/exception_handler.h b/src/xenia/base/exception_handler.h
index 27f97f8f1..218a2e4bc 100644
--- a/src/xenia/base/exception_handler.h
+++ b/src/xenia/base/exception_handler.h
@@ -48,13 +48,19 @@ namespace xe {
 // OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-constexpr uint32_t kArm64LoadStoreAnyMask = UINT32_C(0x0A000000);
-constexpr uint32_t kArm64LoadStoreAnyValue = UINT32_C(0x08000000);
-constexpr uint32_t kArm64LoadStorePairAnyMask = UINT32_C(0x3A000000);
-constexpr uint32_t kArm64LoadStorePairAnyValue = UINT32_C(0x28000000);
-constexpr uint32_t kArm64LoadStorePairLoadBit = UINT32_C(1) << 22;
-constexpr uint32_t kArm64LoadStoreMask = UINT32_C(0xC4C00000);
+// `Instruction address + literal offset` loads.
+// This includes PRFM_lit.
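+// An `FMask` selects the bits that are fixed for an instruction group, and an
+// instruction belongs to the group when `(instruction & FMask) == Fixed`. As
+// an illustration, 0x18000041 is `LDR W1, <pc + 8>` (imm19 = 2, Rt = 1), and
+// 0x18000041 & 0x3B000000 == 0x18000000, so it is classified as a literal
+// load.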
+constexpr uint32_t kArm64LoadLiteralFMask = UINT32_C(0x3B000000); +constexpr uint32_t kArm64LoadLiteralFixed = UINT32_C(0x18000000); +constexpr uint32_t kArm64LoadStoreAnyFMask = UINT32_C(0x0A000000); +constexpr uint32_t kArm64LoadStoreAnyFixed = UINT32_C(0x08000000); + +constexpr uint32_t kArm64LoadStorePairAnyFMask = UINT32_C(0x3A000000); +constexpr uint32_t kArm64LoadStorePairAnyFixed = UINT32_C(0x28000000); +constexpr uint32_t kArm64LoadStorePairLoadBit = UINT32_C(1) << 22; + +constexpr uint32_t kArm64LoadStoreMask = UINT32_C(0xC4C00000); enum class Arm64LoadStoreOp : uint32_t { kSTRB_w = UINT32_C(0x00000000), kSTRH_w = UINT32_C(0x40000000), @@ -82,6 +88,17 @@ enum class Arm64LoadStoreOp : uint32_t { kPRFM = UINT32_C(0xC0800000), }; +constexpr uint32_t kArm64LoadStoreOffsetFMask = UINT32_C(0x3B200C00); +enum class Arm64LoadStoreOffsetFixed : uint32_t { + kUnscaledOffset = UINT32_C(0x38000000), + kPostIndex = UINT32_C(0x38000400), + kPreIndex = UINT32_C(0x38000C00), + kRegisterOffset = UINT32_C(0x38200800), +}; + +constexpr uint32_t kArm64LoadStoreUnsignedOffsetFMask = UINT32_C(0x3B000000); +constexpr uint32_t kArm64LoadStoreUnsignedOffsetFixed = UINT32_C(0x39000000); + bool IsArm64LoadPrefetchStore(uint32_t instruction, bool& is_store_out); class Exception { @@ -114,6 +131,14 @@ class Exception { Code code() const { return code_; } // Returns the platform-specific thread context info. + // Note that certain registers must be modified through Modify* proxy + // functions rather than directly: + // x86-64: + // - General-purpose registers (r##, r8-r15). + // - XMM registers. + // AArch64: + // - General-purpose registers (Xn), including FP and LR. + // - SIMD and floating-point registers (Vn). HostThreadContext* thread_context() const { return thread_context_; } // Returns the program counter where the exception occurred. @@ -139,6 +164,35 @@ class Exception { #endif // XE_ARCH } +#if XE_ARCH_AMD64 + // The index is relative to X64Register::kIntRegisterFirst. + uint64_t& ModifyIntRegister(uint32_t index) { + assert_true(index <= 15); + modified_int_registers_ |= UINT16_C(1) << index; + return thread_context_->int_registers[index]; + } + uint16_t modified_int_registers() const { return modified_int_registers_; } + vec128_t& ModifyXmmRegister(uint32_t index) { + assert_true(index <= 15); + modified_xmm_registers_ |= UINT16_C(1) << index; + return thread_context_->xmm_registers[index]; + } + uint16_t modified_xmm_registers() const { return modified_xmm_registers_; } +#elif XE_ARCH_ARM64 + uint64_t& ModifyXRegister(uint32_t index) { + assert_true(index <= 30); + modified_x_registers_ |= UINT32_C(1) << index; + return thread_context_->x[index]; + } + uint32_t modified_x_registers() const { return modified_x_registers_; } + vec128_t& ModifyVRegister(uint32_t index) { + assert_true(index <= 31); + modified_v_registers_ |= UINT32_C(1) << index; + return thread_context_->v[index]; + } + uint32_t modified_v_registers() const { return modified_v_registers_; } +#endif // XE_ARCH + // In case of AV, address that was read from/written to. 
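+  // For MMIO emulation this is a host address; MMIOHandler::ExceptionCallback
+  // converts it back to the guest virtual address through
+  // host_to_guest_virtual_ before matching MMIO ranges.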
uint64_t fault_address() const { return fault_address_; } @@ -150,6 +204,13 @@ class Exception { private: Code code_ = Code::kInvalidException; HostThreadContext* thread_context_ = nullptr; +#if XE_ARCH_AMD64 + uint16_t modified_int_registers_ = 0; + uint16_t modified_xmm_registers_ = 0; +#elif XE_ARCH_ARM64 + uint32_t modified_x_registers_ = 0; + uint32_t modified_v_registers_ = 0; +#endif // XE_ARCH uint64_t fault_address_ = 0; AccessViolationOperation access_violation_operation_ = AccessViolationOperation::kUnknown; diff --git a/src/xenia/base/exception_handler_posix.cc b/src/xenia/base/exception_handler_posix.cc index 41a391e53..0b11003ff 100644 --- a/src/xenia/base/exception_handler_posix.cc +++ b/src/xenia/base/exception_handler_posix.cc @@ -16,6 +16,7 @@ #include "xenia/base/assert.h" #include "xenia/base/host_thread_context.h" #include "xenia/base/logging.h" +#include "xenia/base/math.h" #include "xenia/base/platform.h" namespace xe { @@ -43,6 +44,8 @@ static void ExceptionHandlerCallback(int signal_number, siginfo_t* signal_info, #if XE_ARCH_AMD64 thread_context.rip = uint64_t(mcontext.gregs[REG_RIP]); thread_context.eflags = uint32_t(mcontext.gregs[REG_EFL]); + // The REG_ order may be different than the register indices in the + // instruction encoding. thread_context.rax = uint64_t(mcontext.gregs[REG_RAX]); thread_context.rcx = uint64_t(mcontext.gregs[REG_RCX]); thread_context.rdx = uint64_t(mcontext.gregs[REG_RDX]); @@ -160,11 +163,61 @@ static void ExceptionHandlerCallback(int signal_number, siginfo_t* signal_info, for (size_t i = 0; i < xe::countof(handlers_) && handlers_[i].first; ++i) { if (handlers_[i].first(&ex, handlers_[i].second)) { // Exception handled. - // TODO(benvanik): Update all thread state? Dirty flags? #if XE_ARCH_AMD64 - mcontext.gregs[REG_RIP] = thread_context.rip; + mcontext.gregs[REG_RIP] = greg_t(thread_context.rip); + mcontext.gregs[REG_EFL] = greg_t(thread_context.eflags); + uint32_t modified_register_index; + // The order must match the order in X64Register. 
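+      // (With glibc, for instance, REG_R8 is 0 and REG_RAX is 13, while
+      // X64Register and the hardware encoding place RAX at 0 and R8 at 8 -
+      // hence this remapping table.)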
+      static const size_t kIntRegisterMap[] = {
+          REG_RAX, REG_RCX, REG_RDX, REG_RBX, REG_RSP, REG_RBP,
+          REG_RSI, REG_RDI, REG_R8,  REG_R9,  REG_R10, REG_R11,
+          REG_R12, REG_R13, REG_R14, REG_R15,
+      };
+      uint16_t modified_int_registers_remaining = ex.modified_int_registers();
+      while (xe::bit_scan_forward(modified_int_registers_remaining,
+                                  &modified_register_index)) {
+        modified_int_registers_remaining &=
+            ~(UINT16_C(1) << modified_register_index);
+        mcontext.gregs[kIntRegisterMap[modified_register_index]] =
+            thread_context.int_registers[modified_register_index];
+      }
+      uint16_t modified_xmm_registers_remaining = ex.modified_xmm_registers();
+      while (xe::bit_scan_forward(modified_xmm_registers_remaining,
+                                  &modified_register_index)) {
+        modified_xmm_registers_remaining &=
+            ~(UINT16_C(1) << modified_register_index);
+        std::memcpy(&mcontext.fpregs->_xmm[modified_register_index],
+                    &thread_context.xmm_registers[modified_register_index],
+                    sizeof(vec128_t));
+      }
 #elif XE_ARCH_ARM64
+      uint32_t modified_register_index;
+      uint32_t modified_x_registers_remaining = ex.modified_x_registers();
+      while (xe::bit_scan_forward(modified_x_registers_remaining,
+                                  &modified_register_index)) {
+        modified_x_registers_remaining &=
+            ~(UINT32_C(1) << modified_register_index);
+        mcontext.regs[modified_register_index] =
+            thread_context.x[modified_register_index];
+      }
+      mcontext.sp = thread_context.sp;
       mcontext.pc = thread_context.pc;
+      mcontext.pstate = thread_context.pstate;
+      if (mcontext_fpsimd) {
+        mcontext_fpsimd->fpsr = thread_context.fpsr;
+        mcontext_fpsimd->fpcr = thread_context.fpcr;
+        uint32_t modified_v_registers_remaining = ex.modified_v_registers();
+        while (xe::bit_scan_forward(modified_v_registers_remaining,
+                                    &modified_register_index)) {
+          modified_v_registers_remaining &=
+              ~(UINT32_C(1) << modified_register_index);
+          std::memcpy(&mcontext_fpsimd->vregs[modified_register_index],
+                      &thread_context.v[modified_register_index],
+                      sizeof(vec128_t));
+        }
+      }
 #endif  // XE_ARCH
       return;
     }
diff --git a/src/xenia/base/exception_handler_win.cc b/src/xenia/base/exception_handler_win.cc
index a2cfa8b44..786a129a5 100644
--- a/src/xenia/base/exception_handler_win.cc
+++ b/src/xenia/base/exception_handler_win.cc
@@ -78,8 +78,26 @@ LONG CALLBACK ExceptionHandlerCallback(PEXCEPTION_POINTERS ex_info) {
   for (size_t i = 0; i < xe::countof(handlers_) && handlers_[i].first; ++i) {
     if (handlers_[i].first(&ex, handlers_[i].second)) {
       // Exception handled.
-      // TODO(benvanik): update all thread state? Dirty flags?
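+      // Write back only the registers that the handler flagged through the
+      // Exception::Modify* proxies. CONTEXT keeps Rax through R15 and Xmm0
+      // through Xmm15 contiguous and in encoding order, which is what allows
+      // the direct indexing off &Rax and &Xmm0 below.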
ex_info->ContextRecord->Rip = thread_context.rip; + ex_info->ContextRecord->EFlags = thread_context.eflags; + uint32_t modified_register_index; + uint16_t modified_int_registers_remaining = ex.modified_int_registers(); + while (xe::bit_scan_forward(modified_int_registers_remaining, + &modified_register_index)) { + modified_int_registers_remaining &= + ~(UINT16_C(1) << modified_register_index); + (&ex_info->ContextRecord->Rax)[modified_register_index] = + thread_context.int_registers[modified_register_index]; + } + uint16_t modified_xmm_registers_remaining = ex.modified_xmm_registers(); + while (xe::bit_scan_forward(modified_xmm_registers_remaining, + &modified_register_index)) { + modified_xmm_registers_remaining &= + ~(UINT16_C(1) << modified_register_index); + std::memcpy(&ex_info->ContextRecord->Xmm0 + modified_register_index, + &thread_context.xmm_registers[modified_register_index], + sizeof(vec128_t)); + } return EXCEPTION_CONTINUE_EXECUTION; } } diff --git a/src/xenia/base/host_thread_context.cc b/src/xenia/base/host_thread_context.cc index 435b68a85..bf668bdd3 100644 --- a/src/xenia/base/host_thread_context.cc +++ b/src/xenia/base/host_thread_context.cc @@ -27,10 +27,10 @@ static const char* kRegisterNames[] = { "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x18", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "x29", - "x30", "sp", "pc", "pstate", "fpsr", "fpcr", "q0", "q1", "q2", "q3", - "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", - "q14", "q15", "q16", "q17", "q18", "q19", "q20", "q21", "q22", "q23", - "q24", "q25", "q26", "q27", "q28", "q29", "q30", "q31", + "x30", "sp", "pc", "pstate", "fpsr", "fpcr", "v0", "v1", "v2", "v3", + "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", + "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", + "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", #endif // XE_ARCH }; @@ -47,12 +47,12 @@ std::string HostThreadContext::GetStringFromValue(HostRegister reg, case X64Register::kEflags: return hex ? string_util::to_hex_string(eflags) : std::to_string(eflags); default: - if (int(reg) >= int(X64Register::kRax) && - int(reg) <= int(X64Register::kR15)) { - auto value = int_registers[int(reg) - int(X64Register::kRax)]; + if (reg >= X64Register::kIntRegisterFirst && + reg <= X64Register::kIntRegisterLast) { + auto value = + int_registers[int(reg) - int(X64Register::kIntRegisterFirst)]; return hex ? string_util::to_hex_string(value) : std::to_string(value); - } else if (int(reg) >= int(X64Register::kXmm0) && - int(reg) <= int(X64Register::kXmm15)) { + } else if (reg >= X64Register::kXmm0 && reg <= X64Register::kXmm15) { auto value = xmm_registers[int(reg) - int(X64Register::kXmm0)]; return hex ? string_util::to_hex_string(value) : xe::to_string(value); } else { @@ -73,12 +73,10 @@ std::string HostThreadContext::GetStringFromValue(HostRegister reg, case Arm64Register::kFpcr: return hex ? string_util::to_hex_string(fpcr) : std::to_string(fpcr); default: - if (int(reg) >= int(Arm64Register::kX0) && - int(reg) <= int(Arm64Register::kX30)) { + if (reg >= Arm64Register::kX0 && reg <= Arm64Register::kX30) { auto value = x[int(reg) - int(Arm64Register::kX0)]; return hex ? 
string_util::to_hex_string(value) : std::to_string(value); - } else if (int(reg) >= int(Arm64Register::kV0) && - int(reg) <= int(Arm64Register::kV31)) { + } else if (reg >= Arm64Register::kV0 && reg <= Arm64Register::kV31) { auto value = v[int(reg) - int(Arm64Register::kV0)]; return hex ? string_util::to_hex_string(value) : xe::to_string(value); } else { diff --git a/src/xenia/base/host_thread_context.h b/src/xenia/base/host_thread_context.h index 8947cc1ec..554d09f44 100644 --- a/src/xenia/base/host_thread_context.h +++ b/src/xenia/base/host_thread_context.h @@ -23,12 +23,17 @@ namespace xe { // NOTE: The order of the registers in the enumerations must match the order in -// the string table in host_thread_context.cc. +// the string table in host_thread_context.cc, as well as remapping tables in +// exception handler implementations. enum class X64Register { kRip, kEflags, - kRax, + + kIntRegisterFirst, + // The order matches the indices in the instruction encoding, as well as the + // Windows CONTEXT structure. + kRax = kIntRegisterFirst, kRcx, kRdx, kRbx, @@ -44,6 +49,8 @@ enum class X64Register { kR13, kR14, kR15, + kIntRegisterLast = kR15, + kXmm0, kXmm1, kXmm2, @@ -101,8 +108,7 @@ enum class Arm64Register { kPstate, kFpsr, kFpcr, - // In assembly, the whole 128 bits of the Neon vector registers are accessible - // as Q# (quadword registers). VFP also uses these registers. + // The whole 128 bits of a Vn register are also known as Qn (quadword). kV0, kV1, kV2, diff --git a/src/xenia/cpu/mmio_handler.cc b/src/xenia/cpu/mmio_handler.cc index 3bcefb6e3..eb28703d1 100644 --- a/src/xenia/cpu/mmio_handler.cc +++ b/src/xenia/cpu/mmio_handler.cc @@ -18,6 +18,7 @@ #include "xenia/base/exception_handler.h" #include "xenia/base/logging.h" #include "xenia/base/memory.h" +#include "xenia/base/platform.h" namespace xe { namespace cpu { @@ -114,28 +115,10 @@ bool MMIOHandler::CheckStore(uint32_t virtual_address, uint32_t value) { return false; } -struct DecodedMov { - size_t length; - // Inidicates this is a load (or conversely a store). - bool is_load; - // Indicates the memory must be swapped. - bool byte_swap; - // Source (for store) or target (for load) register. - // AX CX DX BX SP BP SI DI // REX.R=0 - // R8 R9 R10 R11 R12 R13 R14 R15 // REX.R=1 - uint32_t value_reg; - // [base + (index * scale) + displacement] - bool mem_has_base; - uint8_t mem_base_reg; - bool mem_has_index; - uint8_t mem_index_reg; - uint8_t mem_scale; - int32_t mem_displacement; - bool is_constant; - int32_t constant; -}; - -bool TryDecodeMov(const uint8_t* p, DecodedMov* mov) { +bool MMIOHandler::TryDecodeLoadStore(const uint8_t* p, + DecodedLoadStore& decoded_out) { + std::memset(&decoded_out, 0, sizeof(decoded_out)); +#if XE_ARCH_AMD64 uint8_t i = 0; // Current byte decode index. 
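+  // A REX prefix, when present, has the form 0b0100WRXB: W selects 64-bit
+  // operand size, while R, X and B extend the ModRM.reg, SIB.index and
+  // ModRM.rm/SIB.base fields respectively to reach r8-r15 (rex_r below is
+  // bit 2 of the prefix byte, and so on).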
uint8_t rex = 0; if ((p[i] & 0xF0) == 0x40) { @@ -148,8 +131,8 @@ bool TryDecodeMov(const uint8_t* p, DecodedMov* mov) { // 44 0f 38 f1 a4 02 00 movbe DWORD PTR [rdx+rax*1+0x0],r12d // 42 0f 38 f1 8c 22 00 movbe DWORD PTR [rdx+r12*1+0x0],ecx // 0f 38 f1 8c 02 00 00 movbe DWORD PTR [rdx + rax * 1 + 0x0], ecx - mov->is_load = false; - mov->byte_swap = true; + decoded_out.is_load = false; + decoded_out.byte_swap = true; i += 3; } else if (p[i] == 0x0F && p[i + 1] == 0x38 && p[i + 2] == 0xF0) { // MOVBE r32, m32 (load) @@ -159,8 +142,8 @@ bool TryDecodeMov(const uint8_t* p, DecodedMov* mov) { // 46 0f 38 f0 a4 22 00 movbe r12d,DWORD PTR [rdx+r12*1+0x0] // 0f 38 f0 8c 02 00 00 movbe ecx,DWORD PTR [rdx+rax*1+0x0] // 0F 38 F0 1C 02 movbe ebx,dword ptr [rdx+rax] - mov->is_load = true; - mov->byte_swap = true; + decoded_out.is_load = true; + decoded_out.byte_swap = true; i += 3; } else if (p[i] == 0x89) { // MOV m32, r32 (store) @@ -168,8 +151,8 @@ bool TryDecodeMov(const uint8_t* p, DecodedMov* mov) { // 44 89 24 02 mov DWORD PTR[rdx + rax * 1], r12d // 42 89 0c 22 mov DWORD PTR[rdx + r12 * 1], ecx // 89 0c 02 mov DWORD PTR[rdx + rax * 1], ecx - mov->is_load = false; - mov->byte_swap = false; + decoded_out.is_load = false; + decoded_out.byte_swap = false; ++i; } else if (p[i] == 0x8B) { // MOV r32, m32 (load) @@ -178,16 +161,16 @@ bool TryDecodeMov(const uint8_t* p, DecodedMov* mov) { // 42 8b 0c 22 mov ecx, DWORD PTR[rdx + r12 * 1] // 46 8b 24 22 mov r12d, DWORD PTR[rdx + r12 * 1] // 8b 0c 02 mov ecx, DWORD PTR[rdx + rax * 1] - mov->is_load = true; - mov->byte_swap = false; + decoded_out.is_load = true; + decoded_out.byte_swap = false; ++i; } else if (p[i] == 0xC7) { // MOV m32, simm32 // https://web.archive.org/web/20161017042413/https://www.asmpedia.org/index.php?title=MOV // C7 04 02 02 00 00 00 mov dword ptr [rdx+rax],2 - mov->is_load = false; - mov->byte_swap = false; - mov->is_constant = true; + decoded_out.is_load = false; + decoded_out.byte_swap = false; + decoded_out.is_constant = true; ++i; } else { return false; @@ -204,13 +187,13 @@ bool TryDecodeMov(const uint8_t* p, DecodedMov* mov) { uint8_t mod = (modrm & 0b11000000) >> 6; uint8_t reg = (modrm & 0b00111000) >> 3; uint8_t rm = (modrm & 0b00000111); - mov->value_reg = reg + (rex_r ? 8 : 0); - mov->mem_has_base = false; - mov->mem_base_reg = 0; - mov->mem_has_index = false; - mov->mem_index_reg = 0; - mov->mem_scale = 1; - mov->mem_displacement = 0; + decoded_out.value_reg = reg + (rex_r ? 8 : 0); + decoded_out.mem_has_base = false; + decoded_out.mem_base_reg = 0; + decoded_out.mem_has_index = false; + decoded_out.mem_index_reg = 0; + decoded_out.mem_scale = 1; + decoded_out.mem_displacement = 0; bool has_sib = false; switch (rm) { case 0b100: // SIB @@ -221,17 +204,17 @@ bool TryDecodeMov(const uint8_t* p, DecodedMov* mov) { // RIP-relative not supported. return false; } - mov->mem_has_base = true; - mov->mem_base_reg = rm + (rex_b ? 8 : 0); + decoded_out.mem_has_base = true; + decoded_out.mem_base_reg = rm + (rex_b ? 8 : 0); break; default: - mov->mem_has_base = true; - mov->mem_base_reg = rm + (rex_b ? 8 : 0); + decoded_out.mem_has_base = true; + decoded_out.mem_base_reg = rm + (rex_b ? 
8 : 0);
       break;
   }
   if (has_sib) {
     uint8_t sib = p[i++];
-    mov->mem_scale = 1 << ((sib & 0b11000000) >> 8);
+    // Scale is encoded in SIB bits [7:6].
+    decoded_out.mem_scale = 1 << ((sib & 0b11000000) >> 6);
     uint8_t sib_index = (sib & 0b00111000) >> 3;
     uint8_t sib_base = (sib & 0b00000111);
     switch (sib_index) {
@@ -239,8 +222,9 @@ bool TryDecodeMov(const uint8_t* p, DecodedMov* mov) {
         // No index.
         break;
       default:
-        mov->mem_has_index = true;
-        mov->mem_index_reg = sib_index + (rex_x ? 8 : 0);
+        decoded_out.mem_has_index = true;
+        decoded_out.mem_index_reg = sib_index + (rex_x ? 8 : 0);
+        decoded_out.mem_index_size = sizeof(uint64_t);
         break;
     }
     switch (sib_base) {
@@ -249,29 +233,162 @@ bool TryDecodeMov(const uint8_t* p, DecodedMov* mov) {
         assert_zero(mod);
         return false;
       default:
-        mov->mem_has_base = true;
-        mov->mem_base_reg = sib_base + (rex_b ? 8 : 0);
+        decoded_out.mem_has_base = true;
+        decoded_out.mem_base_reg = sib_base + (rex_b ? 8 : 0);
         break;
     }
   }
   switch (mod) {
     case 0b00: {
-      mov->mem_displacement += 0;
+      decoded_out.mem_displacement += 0;
     } break;
     case 0b01: {
-      mov->mem_displacement += int8_t(p[i++]);
+      decoded_out.mem_displacement += int8_t(p[i++]);
     } break;
     case 0b10: {
-      mov->mem_displacement += xe::load<int32_t>(p + i);
+      decoded_out.mem_displacement += xe::load<int32_t>(p + i);
       i += 4;
     } break;
   }
-  if (mov->is_constant) {
-    mov->constant = xe::load<int32_t>(p + i);
+  if (decoded_out.is_constant) {
+    decoded_out.constant = xe::load<int32_t>(p + i);
     i += 4;
   }
-  mov->length = i;
+  decoded_out.length = i;
   return true;
+
+#elif XE_ARCH_ARM64
+  decoded_out.length = sizeof(uint32_t);
+  uint32_t instruction = *reinterpret_cast<const uint32_t*>(p);
+
+  // Literal loading (PC-relative) is not handled.
+
+  if ((instruction & kArm64LoadStoreAnyFMask) != kArm64LoadStoreAnyFixed) {
+    // Not a load or a store instruction.
+    return false;
+  }
+
+  if ((instruction & kArm64LoadStorePairAnyFMask) ==
+      kArm64LoadStorePairAnyFixed) {
+    // Handling MMIO only for single 32-bit values, not for pairs.
+    return false;
+  }
+
+  uint8_t value_reg_base;
+  switch (Arm64LoadStoreOp(instruction & kArm64LoadStoreMask)) {
+    case Arm64LoadStoreOp::kSTR_w:
+      decoded_out.is_load = false;
+      value_reg_base = DecodedLoadStore::kArm64ValueRegX0;
+      break;
+    case Arm64LoadStoreOp::kLDR_w:
+      decoded_out.is_load = true;
+      value_reg_base = DecodedLoadStore::kArm64ValueRegX0;
+      break;
+    case Arm64LoadStoreOp::kSTR_s:
+      decoded_out.is_load = false;
+      value_reg_base = DecodedLoadStore::kArm64ValueRegV0;
+      break;
+    case Arm64LoadStoreOp::kLDR_s:
+      decoded_out.is_load = true;
+      value_reg_base = DecodedLoadStore::kArm64ValueRegV0;
+      break;
+    default:
+      return false;
+  }
+
+  // `Rt` field (load / store register).
+  decoded_out.value_reg = value_reg_base + (instruction & 31);
+  if (decoded_out.is_load &&
+      decoded_out.value_reg == DecodedLoadStore::kArm64ValueRegZero) {
+    // Zero constant rather than a register read.
+    decoded_out.is_constant = true;
+    decoded_out.constant = 0;
+  }
+
+  decoded_out.mem_has_base = true;
+  // The base is Xn (for 0...30) or SP (for 31).
+  // `Rn` field (first source register).
+  decoded_out.mem_base_reg = (instruction >> 5) & 31;
+
+  bool is_unsigned_offset =
+      (instruction & kArm64LoadStoreUnsignedOffsetFMask) ==
+      kArm64LoadStoreUnsignedOffsetFixed;
+  if (is_unsigned_offset) {
+    // LDR|STR Wt|St, [Xn|SP{, #pimm}]
+    // pimm (positive immediate) is scaled by the size of the data (4 for
+    // words).
+    // `ImmLSUnsigned` field.
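+    // Worked example (illustrative): `LDR W1, [X0, #8]` encodes imm12 = 2,
+    // so the displacement computed here is 4 * 2 = 8 bytes.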
+ uint32_t unsigned_offset = (instruction >> 10) & 4095; + decoded_out.mem_displacement = + ptrdiff_t(sizeof(uint32_t) * unsigned_offset); + } else { + Arm64LoadStoreOffsetFixed offset = + Arm64LoadStoreOffsetFixed(instruction & kArm64LoadStoreOffsetFMask); + // simm (signed immediate) is not scaled. + // Only applicable to kUnscaledOffset, kPostIndex and kPreIndex. + // `ImmLS` field. + int32_t signed_offset = int32_t(instruction << (32 - (9 + 12))) >> (32 - 9); + // For both post- and pre-indexing, the new address is written to the + // register after the data register write, thus if Xt and Xn are the same, + // the final value in the register will be the new address. + // https://developer.arm.com/documentation/ddi0596/2020-12/Base-Instructions/LDR--immediate---Load-Register--immediate-- + switch (offset) { + case Arm64LoadStoreOffsetFixed::kUnscaledOffset: { + // LDUR|STUR Wt|St, [Xn|SP{, #simm}] + decoded_out.mem_displacement = signed_offset; + } break; + case Arm64LoadStoreOffsetFixed::kPostIndex: { + // LDR|STR Wt|St, [Xn|SP], #simm + decoded_out.mem_base_writeback = true; + decoded_out.mem_base_writeback_offset = signed_offset; + } break; + case Arm64LoadStoreOffsetFixed::kPreIndex: { + // LDR|STR Wt|St, [Xn|SP, #simm]! + decoded_out.mem_base_writeback = true; + decoded_out.mem_base_writeback_offset = signed_offset; + decoded_out.mem_displacement = signed_offset; + } break; + case Arm64LoadStoreOffsetFixed::kRegisterOffset: { + // LDR|STR Wt|St, [Xn|SP, (Wm|Xm){, extend {amount}}] + // `Rm` field. + decoded_out.mem_index_reg = (instruction >> 16) & 31; + if (decoded_out.mem_index_reg != DecodedLoadStore::kArm64RegZero) { + decoded_out.mem_has_index = true; + // Allowed extend types in the `option` field are UXTW (0b010), LSL + // (0b011 - identical to UXTX), SXTW (0b110), SXTX (0b111). + // The shift (0 or 2 for 32-bit LDR/STR) can be applied regardless of + // the extend type ("LSL" is just a term for assembly readability, + // internally it's treated simply as UXTX). + // If bit 0 of the `option` field is 0 (UXTW, SXTW), the index + // register is treated as 32-bit (Wm) extended to 64-bit. If it's 1 + // (LSL aka UXTX, SXTX), the index register is treated as 64-bit (Xm). + // `ExtendMode` (`option`) field. + uint32_t extend_mode = (instruction >> 13) & 0b111; + if (!(extend_mode & 0b010)) { + // Sub-word index - undefined. + return false; + } + decoded_out.mem_index_size = + (extend_mode & 0b001) ? sizeof(uint64_t) : sizeof(uint32_t); + decoded_out.mem_index_sign_extend = (extend_mode & 0b100) != 0; + // Shift is either 0 or log2(sizeof(load or store size)). + // Supporting MMIO only for 4-byte words. + // `ImmShiftLS` field. + decoded_out.mem_scale = + (instruction & (UINT32_C(1) << 12)) ? sizeof(uint32_t) : 1; + } + } break; + default: + return false; + } + } + + return true; + +#else +#error TryDecodeLoadStore not implemented for the target CPU architecture. + return false; +#endif // XE_ARCH } bool MMIOHandler::ExceptionCallbackThunk(Exception* ex, void* data) { @@ -300,11 +417,13 @@ bool MMIOHandler::ExceptionCallback(Exception* ex) { // Access violations are pretty rare, so we can do a linear search here. // Only check if in the virtual range, as we only support virtual ranges. 
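+  // A range claims the access when (guest_address & range.mask) ==
+  // range.address; e.g. a hypothetical range registered with address
+  // 0x7FC80000 and mask 0xFFFF0000 claims a fault at 0x7FC81234.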
 const MMIORange* range = nullptr;
+  uint32_t fault_guest_virtual_address = 0;
   if (ex->fault_address() < uint64_t(physical_membase_)) {
-    uint32_t fault_virtual_address = host_to_guest_virtual_(
+    fault_guest_virtual_address = host_to_guest_virtual_(
         host_to_guest_virtual_context_, fault_host_address);
     for (const auto& test_range : mapped_ranges_) {
-      if ((fault_virtual_address & test_range.mask) == test_range.address) {
+      if ((fault_guest_virtual_address & test_range.mask) ==
+          test_range.address) {
         // Address is within the range of this mapping.
         range = &test_range;
         break;
       }
     }
@@ -336,44 +455,114 @@ bool MMIOHandler::ExceptionCallback(Exception* ex) {
   auto rip = ex->pc();
   auto p = reinterpret_cast<const uint8_t*>(rip);
-  DecodedMov mov = {0};
-  bool decoded = TryDecodeMov(p, &mov);
-  if (!decoded) {
-    XELOGE("Unable to decode MMIO mov at {}", p);
+  DecodedLoadStore decoded_load_store;
+  if (!TryDecodeLoadStore(p, decoded_load_store)) {
+    XELOGE("Unable to decode MMIO load or store instruction at {}", p);
     assert_always("Unknown MMIO instruction type");
     return false;
   }
 
+  HostThreadContext& thread_context = *ex->thread_context();
+
+#if XE_ARCH_ARM64
+  // Preserve the base address with the pre- or the post-index offset to write
+  // it after writing the result (since the base address register and the
+  // register to load to may be the same, in which case it should receive the
+  // original base address with the offset).
+  uintptr_t mem_base_writeback_address = 0;
+  if (decoded_load_store.mem_has_base &&
+      decoded_load_store.mem_base_writeback) {
+    if (decoded_load_store.mem_base_reg ==
+        DecodedLoadStore::kArm64MemBaseRegSp) {
+      mem_base_writeback_address = thread_context.sp;
+    } else {
+      assert_true(decoded_load_store.mem_base_reg <= 30);
+      mem_base_writeback_address =
+          thread_context.x[decoded_load_store.mem_base_reg];
+    }
+    mem_base_writeback_address += decoded_load_store.mem_base_writeback_offset;
+  }
+#endif  // XE_ARCH_ARM64
+
+  uint8_t value_reg = decoded_load_store.value_reg;
+  if (decoded_load_store.is_load) {
     // Load of a memory value - read from range, swap, and store in the
     // register.
     uint32_t value = range->read(nullptr, range->callback_context,
-                                 static_cast<uint32_t>(ex->fault_address()));
-    uint64_t* reg_ptr = &ex->thread_context()->int_registers[mov.value_reg];
-    if (!mov.byte_swap) {
+                                 fault_guest_virtual_address);
+    if (!decoded_load_store.byte_swap) {
       // We swap only if it's not a movbe, as otherwise we are swapping twice.
       value = xe::byte_swap(value);
     }
-    *reg_ptr = value;
+#if XE_ARCH_AMD64
+    ex->ModifyIntRegister(value_reg) = value;
+#elif XE_ARCH_ARM64
+    if (value_reg >= DecodedLoadStore::kArm64ValueRegX0 &&
+        value_reg <= (DecodedLoadStore::kArm64ValueRegX0 + 30)) {
+      ex->ModifyXRegister(value_reg - DecodedLoadStore::kArm64ValueRegX0) =
+          value;
+    } else if (value_reg >= DecodedLoadStore::kArm64ValueRegV0 &&
+               value_reg <= (DecodedLoadStore::kArm64ValueRegV0 + 31)) {
+      ex->ModifyVRegister(value_reg - DecodedLoadStore::kArm64ValueRegV0)
+          .u32[0] = value;
+    } else {
+      assert_true(value_reg == DecodedLoadStore::kArm64ValueRegZero);
+      // Register write is ignored for XZR (register 31).
+    }
+#else
+#error Register value writing not implemented for the target CPU architecture.
+#endif  // XE_ARCH
   } else {
     // Store of a register value - read register, swap, write to range.
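+    // MMIO write callbacks take the value in host byte order. Before a plain
+    // MOV store, the JITed code has already swapped the register into guest
+    // (big-endian) order, so it's swapped back here; MOVBE performs its swap
+    // during the store itself, so the register still holds the host-order
+    // value and swapping it again would undo the conversion.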
-    int32_t value;
-    if (mov.is_constant) {
-      value = uint32_t(mov.constant);
+    uint32_t value;
+    if (decoded_load_store.is_constant) {
+      value = uint32_t(decoded_load_store.constant);
     } else {
-      uint64_t* reg_ptr = &ex->thread_context()->int_registers[mov.value_reg];
-      value = static_cast<uint32_t>(*reg_ptr);
-      if (!mov.byte_swap) {
+#if XE_ARCH_AMD64
+      value = uint32_t(thread_context.int_registers[value_reg]);
+#elif XE_ARCH_ARM64
+      if (value_reg >= DecodedLoadStore::kArm64ValueRegX0 &&
+          value_reg <= (DecodedLoadStore::kArm64ValueRegX0 + 30)) {
+        value = uint32_t(
+            thread_context.x[value_reg - DecodedLoadStore::kArm64ValueRegX0]);
+      } else if (value_reg >= DecodedLoadStore::kArm64ValueRegV0 &&
+                 value_reg <= (DecodedLoadStore::kArm64ValueRegV0 + 31)) {
+        value = thread_context.v[value_reg - DecodedLoadStore::kArm64ValueRegV0]
+                    .u32[0];
+      } else {
+        assert_true(value_reg == DecodedLoadStore::kArm64ValueRegZero);
+        value = 0;
+      }
+#else
+#error Register value reading not implemented for the target CPU architecture.
+#endif  // XE_ARCH
+      if (!decoded_load_store.byte_swap) {
         // We swap only if it's not a movbe, as otherwise we are swapping twice.
-        value = xe::byte_swap(static_cast<uint32_t>(value));
+        value = xe::byte_swap(value);
       }
     }
-    range->write(nullptr, range->callback_context,
-                 static_cast<uint32_t>(ex->fault_address()), value);
+    range->write(nullptr, range->callback_context, fault_guest_virtual_address,
+                 value);
   }
 
+#if XE_ARCH_ARM64
+  // Write the base address with the pre- or the post-index offset, overwriting
+  // the register to load to if it's the same.
+  if (decoded_load_store.mem_has_base &&
+      decoded_load_store.mem_base_writeback) {
+    if (decoded_load_store.mem_base_reg ==
+        DecodedLoadStore::kArm64MemBaseRegSp) {
+      thread_context.sp = mem_base_writeback_address;
+    } else {
+      assert_true(decoded_load_store.mem_base_reg <= 30);
+      ex->ModifyXRegister(decoded_load_store.mem_base_reg) =
+          mem_base_writeback_address;
+    }
+  }
+#endif  // XE_ARCH_ARM64
+
   // Advance RIP to the next instruction so that we resume properly.
-  ex->set_resume_pc(rip + mov.length);
+  ex->set_resume_pc(rip + decoded_load_store.length);
   return true;
 }
diff --git a/src/xenia/cpu/mmio_handler.h b/src/xenia/cpu/mmio_handler.h
index 711427c41..6240544e0 100644
--- a/src/xenia/cpu/mmio_handler.h
+++ b/src/xenia/cpu/mmio_handler.h
@@ -15,6 +15,7 @@
 #include
 
 #include "xenia/base/mutex.h"
+#include "xenia/base/platform.h"
 
 namespace xe {
 class Exception;
@@ -93,6 +94,61 @@ class MMIOHandler {
   static MMIOHandler* global_handler_;
 
   xe::global_critical_region global_critical_region_;
+
+ private:
+  struct DecodedLoadStore {
+    // Matches the Xn/Wn register number for 0 reads and ignored writes in many
+    // usage cases.
+    static constexpr uint8_t kArm64RegZero = 31;
+
+    // Matches the actual register number encoding for an SP base in AArch64
+    // load and store instructions.
+    static constexpr uint8_t kArm64MemBaseRegSp = kArm64RegZero;
+
+    static constexpr uint8_t kArm64ValueRegX0 = 0;
+    static constexpr uint8_t kArm64ValueRegZero =
+        kArm64ValueRegX0 + kArm64RegZero;
+    static constexpr uint8_t kArm64ValueRegV0 = 32;
+
+    size_t length;
+    // Indicates this is a load (or conversely a store).
+    bool is_load;
+    // Indicates the memory must be swapped.
+    bool byte_swap;
+    // Source (for store) or target (for load) register.
+    // For x86-64:
+    //   AX  CX  DX  BX  SP  BP  SI  DI   // REX.R=0
+    //   R8  R9  R10 R11 R12 R13 R14 R15  // REX.R=1
+    // For AArch64:
+    //   - kArm64ValueRegX0 + [0...30]: Xn (Wn for 32 bits - upper 32 bits of Xn
+    //     are zeroed on Wn write).
+    //   - kArm64ValueRegZero: Zero constant for register read, ignored
+    //     register write (though memory must still be accessed - an MMIO load
+    //     may have side effects even if the result is discarded).
+    //   - kArm64ValueRegV0 + [0...31]: Vn (Sn for 32 bits).
+    uint8_t value_reg;
+    // [base + (index * scale) + displacement]
+    bool mem_has_base;
+    // On AArch64, if mem_base_reg is kArm64MemBaseRegSp, the base register is
+    // SP, not Xn.
+    uint8_t mem_base_reg;
+    // For AArch64 pre- and post-indexing. In case of a load, the base register
+    // is written back after the loaded data is written to the register,
+    // overwriting the value register if it's the same.
+    bool mem_base_writeback;
+    int32_t mem_base_writeback_offset;
+    bool mem_has_index;
+    uint8_t mem_index_reg;
+    uint8_t mem_index_size;
+    bool mem_index_sign_extend;
+    uint8_t mem_scale;
+    ptrdiff_t mem_displacement;
+    bool is_constant;
+    int32_t constant;
+  };
+
+  static bool TryDecodeLoadStore(const uint8_t* p,
+                                 DecodedLoadStore& decoded_out);
 };
 
 }  // namespace cpu
diff --git a/src/xenia/cpu/processor.cc b/src/xenia/cpu/processor.cc
index 8d1fae81a..f22a4e12b 100644
--- a/src/xenia/cpu/processor.cc
+++ b/src/xenia/cpu/processor.cc
@@ -19,6 +19,7 @@
 #include "xenia/base/literals.h"
 #include "xenia/base/logging.h"
 #include "xenia/base/memory.h"
+#include "xenia/base/platform.h"
 #include "xenia/base/profiling.h"
 #include "xenia/base/threading.h"
 #include "xenia/cpu/breakpoint.h"
@@ -675,7 +676,13 @@ bool Processor::OnThreadBreakpointHit(Exception* ex) {
 
   // Apply thread context changes.
   // TODO(benvanik): apply to all threads?
+#if XE_ARCH_AMD64
   ex->set_resume_pc(thread_info->host_context.rip);
+#elif XE_ARCH_ARM64
+  ex->set_resume_pc(thread_info->host_context.pc);
+#else
+#error Instruction pointer not specified for the target CPU architecture.
+#endif  // XE_ARCH
 
   // Resume execution.
   return true;
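As a reading aid, the AArch64 `value_reg` encoding introduced above can be
mapped back to a register-file read roughly as follows. This is a minimal
sketch, not part of the patch: `DecodedLoadStore` and its constants are
private to `MMIOHandler`, so the constants are mirrored locally, and it
assumes an AArch64 host build where `HostThreadContext` (from
"xenia/base/host_thread_context.h") exposes the `x` and `v` arrays.

  // Illustrative sketch only - mirrors the private DecodedLoadStore
  // value_reg constants.
  uint32_t ReadValueRegister(const xe::HostThreadContext& context,
                             uint8_t value_reg) {
    constexpr uint8_t kValueRegX0 = 0;     // kArm64ValueRegX0
    constexpr uint8_t kValueRegZero = 31;  // kArm64ValueRegZero
    constexpr uint8_t kValueRegV0 = 32;    // kArm64ValueRegV0
    if (value_reg == kValueRegZero) {
      return 0;  // WZR/XZR reads as zero; writes to it are discarded.
    }
    if (value_reg >= kValueRegV0) {
      // Sn is the low 32 bits of Vn.
      return context.v[value_reg - kValueRegV0].u32[0];
    }
    // Wn is the low 32 bits of Xn.
    return uint32_t(context.x[value_reg - kValueRegX0]);
  }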