Changing calls/jmps to use the indirection table. Most games seem faster.
parent ad72c193a6
commit fc4727c339
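The change routes guest calls and jumps through a per-address indirection table instead of patching each call site. Below is a minimal host-side sketch of that scheme using names that appear in the diff (kIndirectionTableBase, the ResolveFunction thunk); the base-address constant and the helper functions here are illustrative assumptions, not code from this commit.

#include <cstdint>

// Sketch only: the value of kIndirectionTableBase and these helpers are
// assumed for illustration; the real layout lives in X64CodeCache below.
constexpr uint32_t kIndirectionTableBase = 0x80000000u;  // assumed base
static uint8_t* indirection_table_base = nullptr;        // reserved/committed elsewhere

// Each guest (PPC) address owns a 4-byte slot. A slot initially holds the
// address of the ResolveFunction thunk (the "indirection default") and is
// patched to the generated code address once the function is compiled.
static uint32_t* IndirectionSlot(uint32_t guest_address) {
  return reinterpret_cast<uint32_t*>(
      indirection_table_base + (guest_address - kIndirectionTableBase));
}

// What an emitted call site now does, conceptually: load the slot for the
// target guest address and call whatever it points at, either resolved code
// or the resolve thunk, which compiles the target and backpatches the slot.
static void CallGuest(uint32_t guest_address) {
  auto target = reinterpret_cast<void (*)()>(
      static_cast<uintptr_t>(*IndirectionSlot(guest_address)));
  target();
}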
@@ -50,6 +50,16 @@ bool X64Backend::Initialize() {
  auto thunk_emitter = std::make_unique<X64ThunkEmitter>(this, allocator.get());
  host_to_guest_thunk_ = thunk_emitter->EmitHostToGuestThunk();
  guest_to_host_thunk_ = thunk_emitter->EmitGuestToHostThunk();
  resolve_function_thunk_ = thunk_emitter->EmitResolveFunctionThunk();

  // Set the code cache to use the ResolveFunction thunk for default
  // indirections.
  assert_zero(uint64_t(resolve_function_thunk_) & 0xFFFFFFFF00000000ull);
  code_cache_->set_indirection_default(
      uint32_t(uint64_t(resolve_function_thunk_)));

  // Allocate some special indirections.
  code_cache_->CommitExecutableRange(0x9FFF0000, 0x9FFFFFFF);

  return true;
}
@@ -23,15 +23,21 @@ class X64CodeCache;

typedef void* (*HostToGuestThunk)(void* target, void* arg0, void* arg1);
typedef void* (*GuestToHostThunk)(void* target, void* arg0, void* arg1);
typedef void (*ResolveFunctionThunk)();

class X64Backend : public Backend {
 public:
  const static uint32_t kForceReturnAddress = 0x9FFF0000u;

  X64Backend(Processor* processor);
  ~X64Backend() override;

  X64CodeCache* code_cache() const { return code_cache_; }
  HostToGuestThunk host_to_guest_thunk() const { return host_to_guest_thunk_; }
  GuestToHostThunk guest_to_host_thunk() const { return guest_to_host_thunk_; }
  ResolveFunctionThunk resolve_function_thunk() const {
    return resolve_function_thunk_;
  }

  bool Initialize() override;

@@ -41,8 +47,10 @@ class X64Backend : public Backend {

 private:
  X64CodeCache* code_cache_;

  HostToGuestThunk host_to_guest_thunk_;
  GuestToHostThunk guest_to_host_thunk_;
  ResolveFunctionThunk resolve_function_thunk_;
};

} // namespace x64
@@ -24,7 +24,8 @@ namespace x64 {
const static uint32_t kUnwindInfoSize = 4 + (2 * 1 + 2 + 2);

X64CodeCache::X64CodeCache()
    : indirection_table_base_(nullptr),
    : indirection_default_value_(0xFEEDF00D),
      indirection_table_base_(nullptr),
      generated_code_base_(nullptr),
      generated_code_offset_(0),
      generated_code_commit_mark_(0),
@@ -87,10 +88,28 @@ bool X64CodeCache::Initialize() {
  return true;
}

void X64CodeCache::set_indirection_default(uint32_t default_value) {
  indirection_default_value_ = default_value;
}

void X64CodeCache::AddIndirection(uint32_t guest_address,
                                  uint32_t host_address) {
  uint32_t* indirection_slot = reinterpret_cast<uint32_t*>(
      indirection_table_base_ + (guest_address - kIndirectionTableBase));
  *indirection_slot = host_address;
}

void X64CodeCache::CommitExecutableRange(uint32_t guest_low,
                                         uint32_t guest_high) {
  // Commit the memory.
  VirtualAlloc(indirection_table_base_ + (guest_low - kIndirectionTableBase),
               guest_high - guest_low, MEM_COMMIT, PAGE_READWRITE);

  // Fill memory with the default value.
  uint32_t* p = reinterpret_cast<uint32_t*>(indirection_table_base_);
  for (uint32_t address = guest_low; address < guest_high; ++address) {
    p[(address - kIndirectionTableBase) / 4] = indirection_default_value_;
  }
}

void* X64CodeCache::PlaceCode(uint32_t guest_address, void* machine_code,
@@ -33,6 +33,10 @@ class X64CodeCache {
  // TODO(benvanik): keep track of code blocks
  // TODO(benvanik): padding/guards/etc

  void set_indirection_default(uint32_t default_value);

  void AddIndirection(uint32_t guest_address, uint32_t host_address);

  void CommitExecutableRange(uint32_t guest_low, uint32_t guest_high);

  void* PlaceCode(uint32_t guest_address, void* machine_code, size_t code_size,
@@ -52,6 +56,9 @@ class X64CodeCache {
  // the tables consistent and ordered.
  std::mutex allocation_mutex_;

  // Value that the indirection table will be initialized with upon commit.
  uint32_t indirection_default_value_;

  // Fixed at kIndirectionTableBase in host space, holding 4 byte pointers into
  // the generated code table that correspond to the PPC functions in guest
  // space.
@@ -325,94 +325,9 @@ void X64Emitter::UnimplementedInstr(const hir::Instr* i) {
  assert_always();
}

// Total size of ResolveFunctionSymbol call site in bytes.
// Used to overwrite it with nops as needed.
const size_t TOTAL_RESOLVE_SIZE = 27;
const size_t ASM_OFFSET = 2 + 2 + 8 + 2 + 8;

uint64_t ResolveFunctionSymbol(void* raw_context, uint64_t symbol_info_ptr) {
  // TODO(benvanik): generate this thunk at runtime? or a shim?
  auto thread_state = *reinterpret_cast<ThreadState**>(raw_context);
  auto symbol_info = reinterpret_cast<FunctionInfo*>(symbol_info_ptr);

  // Resolve function. This will demand compile as required.
  Function* fn = NULL;
  thread_state->processor()->ResolveFunction(symbol_info->address(), &fn);
  assert_not_null(fn);
  auto x64_fn = static_cast<X64Function*>(fn);
  uint64_t addr = reinterpret_cast<uint64_t>(x64_fn->machine_code());

  // Overwrite the call site.
  // The return address points to ReloadRCX work after the call.
#if XE_PLATFORM_WIN32
  uint64_t return_address = reinterpret_cast<uint64_t>(_ReturnAddress());
#else
  uint64_t return_address =
      reinterpret_cast<uint64_t>(__builtin_return_address(0));
#endif // XE_PLATFORM_WIN32
#pragma pack(push, 1)
  struct Asm {
    uint16_t mov_rax;
    uint64_t rax_constant;
    uint16_t mov_rdx;
    uint64_t rdx_constant;
    uint16_t call_rax;
    uint8_t mov_rcx[5];
  };
#pragma pack(pop)
  static_assert_size(Asm, TOTAL_RESOLVE_SIZE);
  Asm* code = reinterpret_cast<Asm*>(return_address - ASM_OFFSET);
  code->rax_constant = addr;
  code->call_rax = 0x9066;

  // We need to return the target in rax so that it gets called.
  return addr;
}

void X64Emitter::Call(const hir::Instr* instr, FunctionInfo* symbol_info) {
  auto fn = reinterpret_cast<X64Function*>(symbol_info->function());
  // Resolve address to the function to call and store in rax.
  if (fn) {
    mov(rax, reinterpret_cast<uint64_t>(fn->machine_code()));
  } else {
    size_t start = getSize();
    // 2b + 8b constant
    mov(rax, reinterpret_cast<uint64_t>(ResolveFunctionSymbol));
    // 2b + 8b constant
    mov(rdx, reinterpret_cast<uint64_t>(symbol_info));
    // 2b
    call(rax);
    // 5b
    ReloadECX();
    size_t total_size = getSize() - start;
    assert_true(total_size == TOTAL_RESOLVE_SIZE);
    // EDX overwritten, don't bother reloading.
  }

  // Actually jump/call to rax.
  if (instr->flags & CALL_TAIL) {
    // Since we skip the prolog we need to mark the return here.
    EmitTraceUserCallReturn();

    // Pass the callers return address over.
    mov(rdx, qword[rsp + StackLayout::GUEST_RET_ADDR]);

    add(rsp, static_cast<uint32_t>(stack_size()));
    jmp(rax);
  } else {
    // Return address is from the previous SET_RETURN_ADDRESS.
    mov(rdx, qword[rsp + StackLayout::GUEST_CALL_RET_ADDR]);
    call(rax);
  }
}

// NOTE: slot count limited by short jump size.
const int kICSlotCount = 4;
const int kICSlotSize = 23;
const uint64_t kICSlotInvalidTargetAddress = 0x0F0F0F0F0F0F0F0F;

uint64_t ResolveFunctionAddress(void* raw_context, uint32_t target_address) {
  // TODO(benvanik): generate this thunk at runtime? or a shim?
  // This is used by the X64ThunkEmitter's ResolveFunctionThunk.
extern "C" uint64_t ResolveFunction(void* raw_context,
                                    uint32_t target_address) {
  auto thread_state = *reinterpret_cast<ThreadState**>(raw_context);

  // TODO(benvanik): required?
@@ -424,100 +339,26 @@ uint64_t ResolveFunctionAddress(void* raw_context, uint32_t target_address) {
  auto x64_fn = static_cast<X64Function*>(fn);
  uint64_t addr = reinterpret_cast<uint64_t>(x64_fn->machine_code());

  // Add an IC slot, if there is room.
#if XE_PLATFORM_WIN32
  uint64_t return_address = reinterpret_cast<uint64_t>(_ReturnAddress());
#else
  uint64_t return_address =
      reinterpret_cast<uint64_t>(__builtin_return_address(0));
#endif // XE_PLATFORM_WIN32
#pragma pack(push, 1)
  struct Asm {
    uint16_t cmp_rdx;
    uint32_t address_constant;
    uint16_t jmp_next_slot;
    uint16_t mov_rax;
    uint64_t target_constant;
    uint8_t jmp_skip_resolve[5];
  };
#pragma pack(pop)
  static_assert_size(Asm, kICSlotSize);
  // TODO(benvanik): quick check table is full (so we don't have to enum slots)
  // The return address points to ReloadRCX work after the call.
  // To get the top of the table, look back a ways.
  uint64_t table_start = return_address - 12 - kICSlotSize * kICSlotCount;
  // NOTE: order matters here - we update the address BEFORE we switch the code
  // over to passing the compare.
  Asm* table_slot = reinterpret_cast<Asm*>(table_start);
  bool wrote_ic = false;
  for (int i = 0; i < kICSlotCount; ++i) {
    if (xe::atomic_cas(kICSlotInvalidTargetAddress, addr,
                       &table_slot->target_constant)) {
      // Got slot! Just write the compare and we're done.
      table_slot->address_constant = static_cast<uint32_t>(target_address);
      wrote_ic = true;
      break;
    }
    ++table_slot;
  }
  if (!wrote_ic) {
    // TODO(benvanik): log that IC table is full.
  }

  // We need to return the target in rax so that it gets called.
  return addr;
}

void X64Emitter::CallIndirect(const hir::Instr* instr, const Reg64& reg) {
  // Check if return.
  if (instr->flags & CALL_POSSIBLE_RETURN) {
    cmp(reg.cvt32(), dword[rsp + StackLayout::GUEST_RET_ADDR]);
    je("epilog", CodeGenerator::T_NEAR);
  }

  if (reg.getIdx() != rdx.getIdx()) {
    mov(rdx, reg);
  }

  inLocalLabel();
  Xbyak::Label skip_resolve;

  // TODO(benvanik): make empty tables skippable (cmp, jump right to resolve).

  // IC table, initially empty.
  // This will get filled in as functions are resolved.
  // Note that we only have a limited cache, and once it's full all calls
  // will fall through.
  // TODO(benvanik): check miss rate when full and add a 2nd-level table?
  // 0000000264BD4DC3 81 FA 0F0F0F0F cmp edx,0F0F0F0Fh
  // 0000000264BD4DC9 75 0C jne 0000000264BD4DD7
  // 0000000264BD4DCB 48 B8 0F0F0F0F0F0F0F0F mov rax,0F0F0F0F0F0F0F0Fh
  // 0000000264BD4DD5 EB XXXXXXXX jmp 0000000264BD4E00
  size_t table_start = getSize();
  for (int i = 0; i < kICSlotCount; ++i) {
    // Compare target address with constant, if matches jump there.
    // Otherwise, fall through.
    // 6b
    cmp(edx, 0x0F0F0F0F);
    Xbyak::Label next_slot;
    // 2b
    jne(next_slot, T_SHORT);
    // Match! Load up rax and skip down to the jmp code.
    // 10b
    mov(rax, kICSlotInvalidTargetAddress);
    // 5b
    jmp(skip_resolve, T_NEAR);
    L(next_slot);
  }
  size_t table_size = getSize() - table_start;
  assert_true(table_size == kICSlotSize * kICSlotCount);

void X64Emitter::Call(const hir::Instr* instr, FunctionInfo* symbol_info) {
  auto fn = reinterpret_cast<X64Function*>(symbol_info->function());
  // Resolve address to the function to call and store in rax.
  // We fall through to this when there are no hits in the IC table.
  CallNative(ResolveFunctionAddress);
  if (fn) {
    // TODO(benvanik): is it worth it to do this? It removes the need for
    // a ResolveFunction call, but makes the table less useful.
    assert_zero(uint64_t(fn->machine_code()) & 0xFFFFFFFF00000000);
    mov(eax, uint32_t(uint64_t(fn->machine_code())));
  } else {
    // Load the pointer to the indirection table maintained in X64CodeCache.
    // The target dword will either contain the address of the generated code
    // or a thunk to ResolveAddress.
    mov(ebx, symbol_info->address());
    mov(eax, dword[ebx]);
  }

  // Actually jump/call to rax.
  L(skip_resolve);
  if (instr->flags & CALL_TAIL) {
    // Since we skip the prolog we need to mark the return here.
    EmitTraceUserCallReturn();
@ -530,10 +371,42 @@ void X64Emitter::CallIndirect(const hir::Instr* instr, const Reg64& reg) {
|
|||
} else {
|
||||
// Return address is from the previous SET_RETURN_ADDRESS.
|
||||
mov(rdx, qword[rsp + StackLayout::GUEST_CALL_RET_ADDR]);
|
||||
|
||||
call(rax);
|
||||
}
|
||||
}
|
||||
|
||||
outLocalLabel();
|
||||
void X64Emitter::CallIndirect(const hir::Instr* instr, const Reg64& reg) {
|
||||
// Check if return.
|
||||
if (instr->flags & CALL_POSSIBLE_RETURN) {
|
||||
cmp(reg.cvt32(), dword[rsp + StackLayout::GUEST_RET_ADDR]);
|
||||
je("epilog", CodeGenerator::T_NEAR);
|
||||
}
|
||||
|
||||
// Load the pointer to the indirection table maintained in X64CodeCache.
|
||||
// The target dword will either contain the address of the generated code
|
||||
// or a thunk to ResolveAddress.
|
||||
if (reg.cvt32() != ebx) {
|
||||
mov(ebx, reg.cvt32());
|
||||
}
|
||||
mov(eax, dword[ebx]);
|
||||
|
||||
// Actually jump/call to rax.
|
||||
if (instr->flags & CALL_TAIL) {
|
||||
// Since we skip the prolog we need to mark the return here.
|
||||
EmitTraceUserCallReturn();
|
||||
|
||||
// Pass the callers return address over.
|
||||
mov(rdx, qword[rsp + StackLayout::GUEST_RET_ADDR]);
|
||||
|
||||
add(rsp, static_cast<uint32_t>(stack_size()));
|
||||
jmp(rax);
|
||||
} else {
|
||||
// Return address is from the previous SET_RETURN_ADDRESS.
|
||||
mov(rdx, qword[rsp + StackLayout::GUEST_CALL_RET_ADDR]);
|
||||
|
||||
call(rax);
|
||||
}
|
||||
}
|
||||
|
||||
uint64_t UndefinedCallExtern(void* raw_context, uint64_t symbol_info_ptr) {
|
||||
|
|
|
@@ -141,6 +141,52 @@ GuestToHostThunk X64ThunkEmitter::EmitGuestToHostThunk() {
  return (HostToGuestThunk)fn;
}

// X64Emitter handles actually resolving functions.
extern "C" uint64_t ResolveFunction(void* raw_context, uint32_t target_address);

ResolveFunctionThunk X64ThunkEmitter::EmitResolveFunctionThunk() {
  // ebx = target PPC address
  // rcx = context

  const size_t stack_size = StackLayout::THUNK_STACK_SIZE;
  // rsp + 0 = return address
  mov(qword[rsp + 8 * 2], rdx);
  mov(qword[rsp + 8 * 1], rcx);
  sub(rsp, stack_size);

  mov(qword[rsp + 48], rbx);
  mov(qword[rsp + 56], rcx);
  mov(qword[rsp + 64], rbp);
  mov(qword[rsp + 72], rsi);
  mov(qword[rsp + 80], rdi);
  mov(qword[rsp + 88], r12);
  mov(qword[rsp + 96], r13);
  mov(qword[rsp + 104], r14);
  mov(qword[rsp + 112], r15);

  mov(rdx, rbx);
  mov(rax, uint64_t(&ResolveFunction));
  call(rax);

  mov(rbx, qword[rsp + 48]);
  mov(rcx, qword[rsp + 56]);
  mov(rbp, qword[rsp + 64]);
  mov(rsi, qword[rsp + 72]);
  mov(rdi, qword[rsp + 80]);
  mov(r12, qword[rsp + 88]);
  mov(r13, qword[rsp + 96]);
  mov(r14, qword[rsp + 104]);
  mov(r15, qword[rsp + 112]);

  add(rsp, stack_size);
  mov(rcx, qword[rsp + 8 * 1]);
  mov(rdx, qword[rsp + 8 * 2]);
  jmp(rax);

  void* fn = Emplace(0, stack_size);
  return (ResolveFunctionThunk)fn;
}

} // namespace x64
} // namespace backend
} // namespace cpu
@@ -133,6 +133,9 @@ class X64ThunkEmitter : public X64Emitter {

  // Function that guest code can call to transition into host code.
  GuestToHostThunk EmitGuestToHostThunk();

  // Function that thunks to the ResolveFunction in X64Emitter.
  ResolveFunctionThunk EmitResolveFunctionThunk();
};

} // namespace x64