From fc4727c339a5d1f2342e3c43a25c268df69ee492 Mon Sep 17 00:00:00 2001
From: Ben Vanik
Date: Thu, 21 May 2015 00:12:28 -0700
Subject: [PATCH] Changing calls/jmps to use the indirection table. Most games seem faster.

---
 src/xenia/cpu/backend/x64/x64_backend.cc      |  10 +
 src/xenia/cpu/backend/x64/x64_backend.h       |   8 +
 src/xenia/cpu/backend/x64/x64_code_cache.cc   |  21 +-
 src/xenia/cpu/backend/x64/x64_code_cache.h    |   7 +
 src/xenia/cpu/backend/x64/x64_emitter.cc      | 227 ++++--------------
 .../cpu/backend/x64/x64_thunk_emitter.cc      |  46 ++++
 src/xenia/cpu/backend/x64/x64_thunk_emitter.h |   3 +
 7 files changed, 144 insertions(+), 178 deletions(-)

diff --git a/src/xenia/cpu/backend/x64/x64_backend.cc b/src/xenia/cpu/backend/x64/x64_backend.cc
index b24ae174f..d6755449c 100644
--- a/src/xenia/cpu/backend/x64/x64_backend.cc
+++ b/src/xenia/cpu/backend/x64/x64_backend.cc
@@ -50,6 +50,16 @@ bool X64Backend::Initialize() {
   auto thunk_emitter = std::make_unique<X64ThunkEmitter>(this, allocator.get());
   host_to_guest_thunk_ = thunk_emitter->EmitHostToGuestThunk();
   guest_to_host_thunk_ = thunk_emitter->EmitGuestToHostThunk();
+  resolve_function_thunk_ = thunk_emitter->EmitResolveFunctionThunk();
+
+  // Set the code cache to use the ResolveFunction thunk for default
+  // indirections.
+  assert_zero(uint64_t(resolve_function_thunk_) & 0xFFFFFFFF00000000ull);
+  code_cache_->set_indirection_default(
+      uint32_t(uint64_t(resolve_function_thunk_)));
+
+  // Allocate some special indirections.
+  code_cache_->CommitExecutableRange(0x9FFF0000, 0x9FFFFFFF);
 
   return true;
 }
diff --git a/src/xenia/cpu/backend/x64/x64_backend.h b/src/xenia/cpu/backend/x64/x64_backend.h
index b83e2a783..db58260fb 100644
--- a/src/xenia/cpu/backend/x64/x64_backend.h
+++ b/src/xenia/cpu/backend/x64/x64_backend.h
@@ -23,15 +23,21 @@ class X64CodeCache;
 
 typedef void* (*HostToGuestThunk)(void* target, void* arg0, void* arg1);
 typedef void* (*GuestToHostThunk)(void* target, void* arg0, void* arg1);
+typedef void (*ResolveFunctionThunk)();
 
 class X64Backend : public Backend {
  public:
+  const static uint32_t kForceReturnAddress = 0x9FFF0000u;
+
   X64Backend(Processor* processor);
   ~X64Backend() override;
 
   X64CodeCache* code_cache() const { return code_cache_; }
   HostToGuestThunk host_to_guest_thunk() const { return host_to_guest_thunk_; }
   GuestToHostThunk guest_to_host_thunk() const { return guest_to_host_thunk_; }
+  ResolveFunctionThunk resolve_function_thunk() const {
+    return resolve_function_thunk_;
+  }
 
   bool Initialize() override;
 
@@ -41,8 +47,10 @@ class X64Backend : public Backend {
 
  private:
   X64CodeCache* code_cache_;
+
   HostToGuestThunk host_to_guest_thunk_;
   GuestToHostThunk guest_to_host_thunk_;
+  ResolveFunctionThunk resolve_function_thunk_;
 };
 
 }  // namespace x64
diff --git a/src/xenia/cpu/backend/x64/x64_code_cache.cc b/src/xenia/cpu/backend/x64/x64_code_cache.cc
index 55d0edbff..41dac0ac9 100644
--- a/src/xenia/cpu/backend/x64/x64_code_cache.cc
+++ b/src/xenia/cpu/backend/x64/x64_code_cache.cc
@@ -24,7 +24,8 @@ namespace x64 {
 const static uint32_t kUnwindInfoSize = 4 + (2 * 1 + 2 + 2);
 
 X64CodeCache::X64CodeCache()
-    : indirection_table_base_(nullptr),
+    : indirection_default_value_(0xFEEDF00D),
+      indirection_table_base_(nullptr),
       generated_code_base_(nullptr),
       generated_code_offset_(0),
       generated_code_commit_mark_(0),
@@ -87,10 +88,28 @@ bool X64CodeCache::Initialize() {
   return true;
 }
 
+void X64CodeCache::set_indirection_default(uint32_t default_value) {
+  indirection_default_value_ = default_value;
+}
+
+void X64CodeCache::AddIndirection(uint32_t guest_address,
+                                  uint32_t host_address) {
+  uint32_t* indirection_slot = reinterpret_cast<uint32_t*>(
+      indirection_table_base_ + (guest_address - kIndirectionTableBase));
+  *indirection_slot = host_address;
+}
+
 void X64CodeCache::CommitExecutableRange(uint32_t guest_low,
                                          uint32_t guest_high) {
+  // Commit the memory.
   VirtualAlloc(indirection_table_base_ + (guest_low - kIndirectionTableBase),
                guest_high - guest_low, MEM_COMMIT, PAGE_READWRITE);
+
+  // Fill memory with the default value.
+  uint32_t* p = reinterpret_cast<uint32_t*>(indirection_table_base_);
+  for (uint32_t address = guest_low; address < guest_high; ++address) {
+    p[(address - kIndirectionTableBase) / 4] = indirection_default_value_;
+  }
 }
 
 void* X64CodeCache::PlaceCode(uint32_t guest_address, void* machine_code,
diff --git a/src/xenia/cpu/backend/x64/x64_code_cache.h b/src/xenia/cpu/backend/x64/x64_code_cache.h
index edf85375a..13234abd6 100644
--- a/src/xenia/cpu/backend/x64/x64_code_cache.h
+++ b/src/xenia/cpu/backend/x64/x64_code_cache.h
@@ -33,6 +33,10 @@ class X64CodeCache {
   // TODO(benvanik): keep track of code blocks
   // TODO(benvanik): padding/guards/etc
 
+  void set_indirection_default(uint32_t default_value);
+
+  void AddIndirection(uint32_t guest_address, uint32_t host_address);
+
   void CommitExecutableRange(uint32_t guest_low, uint32_t guest_high);
 
   void* PlaceCode(uint32_t guest_address, void* machine_code, size_t code_size,
@@ -52,6 +56,9 @@ class X64CodeCache {
   // the tables consistent and ordered.
   std::mutex allocation_mutex_;
 
+  // Value that the indirection table will be initialized with upon commit.
+  uint32_t indirection_default_value_;
+
   // Fixed at kIndirectionTableBase in host space, holding 4 byte pointers into
   // the generated code table that correspond to the PPC functions in guest
   // space.
diff --git a/src/xenia/cpu/backend/x64/x64_emitter.cc b/src/xenia/cpu/backend/x64/x64_emitter.cc
index 338a635f8..15905c5a9 100644
--- a/src/xenia/cpu/backend/x64/x64_emitter.cc
+++ b/src/xenia/cpu/backend/x64/x64_emitter.cc
@@ -325,94 +325,9 @@ void X64Emitter::UnimplementedInstr(const hir::Instr* i) {
   assert_always();
 }
 
-// Total size of ResolveFunctionSymbol call site in bytes.
-// Used to overwrite it with nops as needed.
-const size_t TOTAL_RESOLVE_SIZE = 27;
-const size_t ASM_OFFSET = 2 + 2 + 8 + 2 + 8;
-
-uint64_t ResolveFunctionSymbol(void* raw_context, uint64_t symbol_info_ptr) {
-  // TODO(benvanik): generate this thunk at runtime? or a shim?
-  auto thread_state = *reinterpret_cast<ThreadState**>(raw_context);
-  auto symbol_info = reinterpret_cast<FunctionInfo*>(symbol_info_ptr);
-
-  // Resolve function. This will demand compile as required.
-  Function* fn = NULL;
-  thread_state->processor()->ResolveFunction(symbol_info->address(), &fn);
-  assert_not_null(fn);
-  auto x64_fn = static_cast<X64Function*>(fn);
-  uint64_t addr = reinterpret_cast<uint64_t>(x64_fn->machine_code());
-
-// Overwrite the call site.
-// The return address points to ReloadRCX work after the call.
-#if XE_PLATFORM_WIN32
-  uint64_t return_address = reinterpret_cast<uint64_t>(_ReturnAddress());
-#else
-  uint64_t return_address =
-      reinterpret_cast<uint64_t>(__builtin_return_address(0));
-#endif  // XE_PLATFORM_WIN32
-#pragma pack(push, 1)
-  struct Asm {
-    uint16_t mov_rax;
-    uint64_t rax_constant;
-    uint16_t mov_rdx;
-    uint64_t rdx_constant;
-    uint16_t call_rax;
-    uint8_t mov_rcx[5];
-  };
-#pragma pack(pop)
-  static_assert_size(Asm, TOTAL_RESOLVE_SIZE);
-  Asm* code = reinterpret_cast<Asm*>(return_address - ASM_OFFSET);
-  code->rax_constant = addr;
-  code->call_rax = 0x9066;
-
-  // We need to return the target in rax so that it gets called.
-  return addr;
-}
-
-void X64Emitter::Call(const hir::Instr* instr, FunctionInfo* symbol_info) {
-  auto fn = reinterpret_cast<X64Function*>(symbol_info->function());
-  // Resolve address to the function to call and store in rax.
-  if (fn) {
-    mov(rax, reinterpret_cast<uint64_t>(fn->machine_code()));
-  } else {
-    size_t start = getSize();
-    // 2b + 8b constant
-    mov(rax, reinterpret_cast<uint64_t>(ResolveFunctionSymbol));
-    // 2b + 8b constant
-    mov(rdx, reinterpret_cast<uint64_t>(symbol_info));
-    // 2b
-    call(rax);
-    // 5b
-    ReloadECX();
-    size_t total_size = getSize() - start;
-    assert_true(total_size == TOTAL_RESOLVE_SIZE);
-    // EDX overwritten, don't bother reloading.
-  }
-
-  // Actually jump/call to rax.
-  if (instr->flags & CALL_TAIL) {
-    // Since we skip the prolog we need to mark the return here.
-    EmitTraceUserCallReturn();
-
-    // Pass the callers return address over.
-    mov(rdx, qword[rsp + StackLayout::GUEST_RET_ADDR]);
-
-    add(rsp, static_cast<uint32_t>(stack_size()));
-    jmp(rax);
-  } else {
-    // Return address is from the previous SET_RETURN_ADDRESS.
-    mov(rdx, qword[rsp + StackLayout::GUEST_CALL_RET_ADDR]);
-    call(rax);
-  }
-}
-
-// NOTE: slot count limited by short jump size.
-const int kICSlotCount = 4;
-const int kICSlotSize = 23;
-const uint64_t kICSlotInvalidTargetAddress = 0x0F0F0F0F0F0F0F0F;
-
-uint64_t ResolveFunctionAddress(void* raw_context, uint32_t target_address) {
-  // TODO(benvanik): generate this thunk at runtime? or a shim?
+// This is used by the X64ThunkEmitter's ResolveFunctionThunk.
+extern "C" uint64_t ResolveFunction(void* raw_context,
+                                    uint32_t target_address) {
   auto thread_state = *reinterpret_cast<ThreadState**>(raw_context);
 
   // TODO(benvanik): required?
@@ -424,100 +339,26 @@ uint64_t ResolveFunctionAddress(void* raw_context, uint32_t target_address) {
   auto x64_fn = static_cast<X64Function*>(fn);
   uint64_t addr = reinterpret_cast<uint64_t>(x64_fn->machine_code());
 
-// Add an IC slot, if there is room.
-#if XE_PLATFORM_WIN32
-  uint64_t return_address = reinterpret_cast<uint64_t>(_ReturnAddress());
-#else
-  uint64_t return_address =
-      reinterpret_cast<uint64_t>(__builtin_return_address(0));
-#endif  // XE_PLATFORM_WIN32
-#pragma pack(push, 1)
-  struct Asm {
-    uint16_t cmp_rdx;
-    uint32_t address_constant;
-    uint16_t jmp_next_slot;
-    uint16_t mov_rax;
-    uint64_t target_constant;
-    uint8_t jmp_skip_resolve[5];
-  };
-#pragma pack(pop)
-  static_assert_size(Asm, kICSlotSize);
-  // TODO(benvanik): quick check table is full (so we don't have to enum slots)
-  // The return address points to ReloadRCX work after the call.
-  // To get the top of the table, look back a ways.
-  uint64_t table_start = return_address - 12 - kICSlotSize * kICSlotCount;
-  // NOTE: order matters here - we update the address BEFORE we switch the code
-  // over to passing the compare.
-  Asm* table_slot = reinterpret_cast<Asm*>(table_start);
-  bool wrote_ic = false;
-  for (int i = 0; i < kICSlotCount; ++i) {
-    if (xe::atomic_cas(kICSlotInvalidTargetAddress, addr,
-                       &table_slot->target_constant)) {
-      // Got slot! Just write the compare and we're done.
-      table_slot->address_constant = static_cast<uint32_t>(target_address);
-      wrote_ic = true;
-      break;
-    }
-    ++table_slot;
-  }
-  if (!wrote_ic) {
-    // TODO(benvanik): log that IC table is full.
-  }
-
-  // We need to return the target in rax so that it gets called.
   return addr;
 }
 
-void X64Emitter::CallIndirect(const hir::Instr* instr, const Reg64& reg) {
-  // Check if return.
-  if (instr->flags & CALL_POSSIBLE_RETURN) {
-    cmp(reg.cvt32(), dword[rsp + StackLayout::GUEST_RET_ADDR]);
-    je("epilog", CodeGenerator::T_NEAR);
-  }
-
-  if (reg.getIdx() != rdx.getIdx()) {
-    mov(rdx, reg);
-  }
-
-  inLocalLabel();
-  Xbyak::Label skip_resolve;
-
-  // TODO(benvanik): make empty tables skippable (cmp, jump right to resolve).
-
-  // IC table, initially empty.
-  // This will get filled in as functions are resolved.
-  // Note that we only have a limited cache, and once it's full all calls
-  // will fall through.
-  // TODO(benvanik): check miss rate when full and add a 2nd-level table?
-  // 0000000264BD4DC3 81 FA 0F0F0F0F cmp edx,0F0F0F0Fh
-  // 0000000264BD4DC9 75 0C jne 0000000264BD4DD7
-  // 0000000264BD4DCB 48 B8 0F0F0F0F0F0F0F0F mov rax,0F0F0F0F0F0F0F0Fh
-  // 0000000264BD4DD5 EB XXXXXXXX jmp 0000000264BD4E00
-  size_t table_start = getSize();
-  for (int i = 0; i < kICSlotCount; ++i) {
-    // Compare target address with constant, if matches jump there.
-    // Otherwise, fall through.
-    // 6b
-    cmp(edx, 0x0F0F0F0F);
-    Xbyak::Label next_slot;
-    // 2b
-    jne(next_slot, T_SHORT);
-    // Match! Load up rax and skip down to the jmp code.
-    // 10b
-    mov(rax, kICSlotInvalidTargetAddress);
-    // 5b
-    jmp(skip_resolve, T_NEAR);
-    L(next_slot);
-  }
-  size_t table_size = getSize() - table_start;
-  assert_true(table_size == kICSlotSize * kICSlotCount);
-
+void X64Emitter::Call(const hir::Instr* instr, FunctionInfo* symbol_info) {
+  auto fn = reinterpret_cast<X64Function*>(symbol_info->function());
   // Resolve address to the function to call and store in rax.
-  // We fall through to this when there are no hits in the IC table.
-  CallNative(ResolveFunctionAddress);
+  if (fn) {
+    // TODO(benvanik): is it worth it to do this? It removes the need for
+    // a ResolveFunction call, but makes the table less useful.
+    assert_zero(uint64_t(fn->machine_code()) & 0xFFFFFFFF00000000);
+    mov(eax, uint32_t(uint64_t(fn->machine_code())));
+  } else {
+    // Load the pointer to the indirection table maintained in X64CodeCache.
+    // The target dword will either contain the address of the generated code
+    // or a thunk to ResolveAddress.
+    mov(ebx, symbol_info->address());
+    mov(eax, dword[ebx]);
+  }
 
   // Actually jump/call to rax.
-  L(skip_resolve);
   if (instr->flags & CALL_TAIL) {
     // Since we skip the prolog we need to mark the return here.
     EmitTraceUserCallReturn();
@@ -530,10 +371,42 @@ void X64Emitter::CallIndirect(const hir::Instr* instr, const Reg64& reg) {
   } else {
     // Return address is from the previous SET_RETURN_ADDRESS.
     mov(rdx, qword[rsp + StackLayout::GUEST_CALL_RET_ADDR]);
+
     call(rax);
   }
+}
 
-  outLocalLabel();
+void X64Emitter::CallIndirect(const hir::Instr* instr, const Reg64& reg) {
+  // Check if return.
+  if (instr->flags & CALL_POSSIBLE_RETURN) {
+    cmp(reg.cvt32(), dword[rsp + StackLayout::GUEST_RET_ADDR]);
+    je("epilog", CodeGenerator::T_NEAR);
+  }
+
+  // Load the pointer to the indirection table maintained in X64CodeCache.
+  // The target dword will either contain the address of the generated code
+  // or a thunk to ResolveAddress.
+  if (reg.cvt32() != ebx) {
+    mov(ebx, reg.cvt32());
+  }
+  mov(eax, dword[ebx]);
+
+  // Actually jump/call to rax.
+  if (instr->flags & CALL_TAIL) {
+    // Since we skip the prolog we need to mark the return here.
+    EmitTraceUserCallReturn();
+
+    // Pass the callers return address over.
+    mov(rdx, qword[rsp + StackLayout::GUEST_RET_ADDR]);
+
+    add(rsp, static_cast<uint32_t>(stack_size()));
+    jmp(rax);
+  } else {
+    // Return address is from the previous SET_RETURN_ADDRESS.
+    mov(rdx, qword[rsp + StackLayout::GUEST_CALL_RET_ADDR]);
+
+    call(rax);
+  }
 }
 
 uint64_t UndefinedCallExtern(void* raw_context, uint64_t symbol_info_ptr) {
diff --git a/src/xenia/cpu/backend/x64/x64_thunk_emitter.cc b/src/xenia/cpu/backend/x64/x64_thunk_emitter.cc
index 3f8e8ce9c..b3e38e59a 100644
--- a/src/xenia/cpu/backend/x64/x64_thunk_emitter.cc
+++ b/src/xenia/cpu/backend/x64/x64_thunk_emitter.cc
@@ -141,6 +141,52 @@ GuestToHostThunk X64ThunkEmitter::EmitGuestToHostThunk() {
   return (HostToGuestThunk)fn;
 }
 
+// X64Emitter handles actually resolving functions.
+extern "C" uint64_t ResolveFunction(void* raw_context, uint32_t target_address);
+
+ResolveFunctionThunk X64ThunkEmitter::EmitResolveFunctionThunk() {
+  // ebx = target PPC address
+  // rcx = context
+
+  const size_t stack_size = StackLayout::THUNK_STACK_SIZE;
+  // rsp + 0 = return address
+  mov(qword[rsp + 8 * 2], rdx);
+  mov(qword[rsp + 8 * 1], rcx);
+  sub(rsp, stack_size);
+
+  mov(qword[rsp + 48], rbx);
+  mov(qword[rsp + 56], rcx);
+  mov(qword[rsp + 64], rbp);
+  mov(qword[rsp + 72], rsi);
+  mov(qword[rsp + 80], rdi);
+  mov(qword[rsp + 88], r12);
+  mov(qword[rsp + 96], r13);
+  mov(qword[rsp + 104], r14);
+  mov(qword[rsp + 112], r15);
+
+  mov(rdx, rbx);
+  mov(rax, uint64_t(&ResolveFunction));
+  call(rax);
+
+  mov(rbx, qword[rsp + 48]);
+  mov(rcx, qword[rsp + 56]);
+  mov(rbp, qword[rsp + 64]);
+  mov(rsi, qword[rsp + 72]);
+  mov(rdi, qword[rsp + 80]);
+  mov(r12, qword[rsp + 88]);
+  mov(r13, qword[rsp + 96]);
+  mov(r14, qword[rsp + 104]);
+  mov(r15, qword[rsp + 112]);
+
+  add(rsp, stack_size);
+  mov(rcx, qword[rsp + 8 * 1]);
+  mov(rdx, qword[rsp + 8 * 2]);
+  jmp(rax);
+
+  void* fn = Emplace(0, stack_size);
+  return (ResolveFunctionThunk)fn;
+}
+
 }  // namespace x64
 }  // namespace backend
 }  // namespace cpu
diff --git a/src/xenia/cpu/backend/x64/x64_thunk_emitter.h b/src/xenia/cpu/backend/x64/x64_thunk_emitter.h
index 9b792575c..792fca66f 100644
--- a/src/xenia/cpu/backend/x64/x64_thunk_emitter.h
+++ b/src/xenia/cpu/backend/x64/x64_thunk_emitter.h
@@ -133,6 +133,9 @@ class X64ThunkEmitter : public X64Emitter {
 
   // Function that guest code can call to transition into host code.
   GuestToHostThunk EmitGuestToHostThunk();
+
+  // Function that thunks to the ResolveFunction in X64Emitter.
+  ResolveFunctionThunk EmitResolveFunctionThunk();
 };
 
 }  // namespace x64
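
Note: the sketch below is not part of the patch. It is a minimal model of the indirection-table dispatch the diff introduces, following AddIndirection()/CommitExecutableRange() above; the value of kIndirectionTableBase and the free-standing globals here are illustrative assumptions, since the real state and committing logic live inside X64CodeCache.

    // Sketch only. Each guest code address owns a 4-byte slot at the same byte
    // offset inside the indirection table. CommitExecutableRange() seeds every
    // slot with the ResolveFunction thunk's address; AddIndirection() later
    // patches the slot with the JITed code's address, so an emitted call site
    // (mov ebx, guest_address; mov eax, dword[ebx]; call rax) only pays the
    // resolve cost the first time a given target is reached.
    #include <cstdint>

    constexpr uint32_t kIndirectionTableBase = 0x80000000u;  // assumed value
    uint8_t* indirection_table_base = nullptr;  // committed via VirtualAlloc

    inline uint32_t LookupIndirection(uint32_t guest_address) {
      auto slot = reinterpret_cast<uint32_t*>(
          indirection_table_base + (guest_address - kIndirectionTableBase));
      return *slot;  // 32-bit host address: generated code or the resolve thunk
    }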