Changing calls/jmps to use the indirection table. Most games seem faster.

This commit is contained in:
Ben Vanik 2015-05-21 00:12:28 -07:00
parent ad72c193a6
commit fc4727c339
7 changed files with 144 additions and 178 deletions
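In short: instead of patching each call site after the fact (the ResolveFunctionSymbol and inline-cache paths removed below), every guest call and jump now loads its target from a single indirection table, a flat array of 4-byte host code pointers indexed by guest address, with committed slots initialized to the ResolveFunction thunk until the target is compiled. A minimal sketch of the lookup, with the table pointer and base as illustrative placeholders (the real ones live in X64CodeCache):

#include <cstdint>

static uint8_t* indirection_table_base = nullptr;           // host pointer to the table (placeholder)
static const uint32_t kIndirectionTableBase = 0x80000000u;  // assumed guest base; value not shown in this diff

// Conceptually what an emitted guest call does: one 32-bit load yields either
// the compiled function's address or the ResolveFunction thunk.
inline uint32_t LookupHostAddress(uint32_t guest_address) {
  auto slot = reinterpret_cast<const uint32_t*>(
      indirection_table_base + (guest_address - kIndirectionTableBase));
  return *slot;
}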

View File

@@ -50,6 +50,16 @@ bool X64Backend::Initialize() {
auto thunk_emitter = std::make_unique<X64ThunkEmitter>(this, allocator.get());
host_to_guest_thunk_ = thunk_emitter->EmitHostToGuestThunk();
guest_to_host_thunk_ = thunk_emitter->EmitGuestToHostThunk();
resolve_function_thunk_ = thunk_emitter->EmitResolveFunctionThunk();
// Set the code cache to use the ResolveFunction thunk for default
// indirections.
assert_zero(uint64_t(resolve_function_thunk_) & 0xFFFFFFFF00000000ull);
code_cache_->set_indirection_default(
uint32_t(uint64_t(resolve_function_thunk_)));
// Allocate some special indirections.
code_cache_->CommitExecutableRange(0x9FFF0000, 0x9FFFFFFF);
return true;
}
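The assert_zero above encodes the table's main constraint: slots are only 4 bytes wide, so the resolve thunk (and, as asserted later in the emitter, any generated code a slot points to) must live in the low 4 GB of host address space. A sketch of that check as a standalone helper (hypothetical name):

#include <cassert>
#include <cstdint>

// Narrow a host code pointer to a 32-bit indirection slot value; mirrors the
// assert_zero in Initialize() above.
inline uint32_t ToIndirectionValue(const void* host_code) {
  uint64_t bits = reinterpret_cast<uint64_t>(host_code);
  assert((bits & 0xFFFFFFFF00000000ull) == 0 && "target must live below 4 GB");
  return static_cast<uint32_t>(bits);
}

The CommitExecutableRange(0x9FFF0000, 0x9FFFFFFF) call pre-commits the slots backing the special kForceReturnAddress range declared in the header below.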

View File

@@ -23,15 +23,21 @@ class X64CodeCache;
typedef void* (*HostToGuestThunk)(void* target, void* arg0, void* arg1);
typedef void* (*GuestToHostThunk)(void* target, void* arg0, void* arg1);
typedef void (*ResolveFunctionThunk)();
class X64Backend : public Backend {
public:
const static uint32_t kForceReturnAddress = 0x9FFF0000u;
X64Backend(Processor* processor);
~X64Backend() override;
X64CodeCache* code_cache() const { return code_cache_; }
HostToGuestThunk host_to_guest_thunk() const { return host_to_guest_thunk_; }
GuestToHostThunk guest_to_host_thunk() const { return guest_to_host_thunk_; }
ResolveFunctionThunk resolve_function_thunk() const {
return resolve_function_thunk_;
}
bool Initialize() override;
@@ -41,8 +47,10 @@ class X64Backend : public Backend {
private:
X64CodeCache* code_cache_;
HostToGuestThunk host_to_guest_thunk_;
GuestToHostThunk guest_to_host_thunk_;
ResolveFunctionThunk resolve_function_thunk_;
};
} // namespace x64

View File

@@ -24,7 +24,8 @@ namespace x64 {
const static uint32_t kUnwindInfoSize = 4 + (2 * 1 + 2 + 2);
X64CodeCache::X64CodeCache()
: indirection_table_base_(nullptr),
: indirection_default_value_(0xFEEDF00D),
indirection_table_base_(nullptr),
generated_code_base_(nullptr),
generated_code_offset_(0),
generated_code_commit_mark_(0),
@@ -87,10 +88,28 @@ bool X64CodeCache::Initialize() {
return true;
}
void X64CodeCache::set_indirection_default(uint32_t default_value) {
indirection_default_value_ = default_value;
}
void X64CodeCache::AddIndirection(uint32_t guest_address,
uint32_t host_address) {
uint32_t* indirection_slot = reinterpret_cast<uint32_t*>(
indirection_table_base_ + (guest_address - kIndirectionTableBase));
*indirection_slot = host_address;
}
void X64CodeCache::CommitExecutableRange(uint32_t guest_low,
uint32_t guest_high) {
// Commit the memory.
VirtualAlloc(indirection_table_base_ + (guest_low - kIndirectionTableBase),
guest_high - guest_low, MEM_COMMIT, PAGE_READWRITE);
// Fill memory with the default value.
uint32_t* p = reinterpret_cast<uint32_t*>(indirection_table_base_);
for (uint32_t address = guest_low; address < guest_high; ++address) {
p[(address - kIndirectionTableBase) / 4] = indirection_default_value_;
}
}
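Note the fill loop above advances one byte per iteration while indexing whole dwords, so each 4-byte slot is simply written four times; a slot-stepped version (a sketch, not the committed code) produces the same table contents:

#include <cstdint>

// Equivalent default fill, advancing one slot (4 bytes) per iteration.
inline void FillIndirectionDefaults(uint32_t* table, uint32_t table_base,
                                    uint32_t guest_low, uint32_t guest_high,
                                    uint32_t default_value) {
  for (uint32_t address = guest_low; address < guest_high; address += 4) {
    table[(address - table_base) / 4] = default_value;
  }
}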
void* X64CodeCache::PlaceCode(uint32_t guest_address, void* machine_code,

View File

@@ -33,6 +33,10 @@ class X64CodeCache {
// TODO(benvanik): keep track of code blocks
// TODO(benvanik): padding/guards/etc
void set_indirection_default(uint32_t default_value);
void AddIndirection(uint32_t guest_address, uint32_t host_address);
void CommitExecutableRange(uint32_t guest_low, uint32_t guest_high);
void* PlaceCode(uint32_t guest_address, void* machine_code, size_t code_size,
@@ -52,6 +56,9 @@ class X64CodeCache {
// the tables consistent and ordered.
std::mutex allocation_mutex_;
// Value that the indirection table will be initialized with upon commit.
uint32_t indirection_default_value_;
// Fixed at kIndirectionTableBase in host space, holding 4-byte pointers into
// the generated code table that correspond to the PPC functions in guest
// space.
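Once a guest function is actually compiled, its table slot is switched from the resolve-thunk default to the generated code's address via AddIndirection, so subsequent calls through the table reach the code directly. Roughly (a usage sketch; the surrounding variables are placeholders):

// guest_address: PPC address of the now-compiled function.
// machine_code:  host pointer to its generated x64 code (must fit in 32 bits).
code_cache->AddIndirection(
    guest_address,
    static_cast<uint32_t>(reinterpret_cast<uint64_t>(machine_code)));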

View File

@@ -325,94 +325,9 @@ void X64Emitter::UnimplementedInstr(const hir::Instr* i) {
assert_always();
}
// Total size of ResolveFunctionSymbol call site in bytes.
// Used to overwrite it with nops as needed.
const size_t TOTAL_RESOLVE_SIZE = 27;
const size_t ASM_OFFSET = 2 + 2 + 8 + 2 + 8;
uint64_t ResolveFunctionSymbol(void* raw_context, uint64_t symbol_info_ptr) {
// TODO(benvanik): generate this thunk at runtime? or a shim?
auto thread_state = *reinterpret_cast<ThreadState**>(raw_context);
auto symbol_info = reinterpret_cast<FunctionInfo*>(symbol_info_ptr);
// Resolve function. This will demand compile as required.
Function* fn = NULL;
thread_state->processor()->ResolveFunction(symbol_info->address(), &fn);
assert_not_null(fn);
auto x64_fn = static_cast<X64Function*>(fn);
uint64_t addr = reinterpret_cast<uint64_t>(x64_fn->machine_code());
// Overwrite the call site.
// The return address points to ReloadRCX work after the call.
#if XE_PLATFORM_WIN32
uint64_t return_address = reinterpret_cast<uint64_t>(_ReturnAddress());
#else
uint64_t return_address =
reinterpret_cast<uint64_t>(__builtin_return_address(0));
#endif // XE_PLATFORM_WIN32
#pragma pack(push, 1)
struct Asm {
uint16_t mov_rax;
uint64_t rax_constant;
uint16_t mov_rdx;
uint64_t rdx_constant;
uint16_t call_rax;
uint8_t mov_rcx[5];
};
#pragma pack(pop)
static_assert_size(Asm, TOTAL_RESOLVE_SIZE);
Asm* code = reinterpret_cast<Asm*>(return_address - ASM_OFFSET);
code->rax_constant = addr;
code->call_rax = 0x9066;
// We need to return the target in rax so that it gets called.
return addr;
}
void X64Emitter::Call(const hir::Instr* instr, FunctionInfo* symbol_info) {
auto fn = reinterpret_cast<X64Function*>(symbol_info->function());
// Resolve address to the function to call and store in rax.
if (fn) {
mov(rax, reinterpret_cast<uint64_t>(fn->machine_code()));
} else {
size_t start = getSize();
// 2b + 8b constant
mov(rax, reinterpret_cast<uint64_t>(ResolveFunctionSymbol));
// 2b + 8b constant
mov(rdx, reinterpret_cast<uint64_t>(symbol_info));
// 2b
call(rax);
// 5b
ReloadECX();
size_t total_size = getSize() - start;
assert_true(total_size == TOTAL_RESOLVE_SIZE);
// EDX overwritten, don't bother reloading.
}
// Actually jump/call to rax.
if (instr->flags & CALL_TAIL) {
// Since we skip the prolog we need to mark the return here.
EmitTraceUserCallReturn();
// Pass the caller's return address over.
mov(rdx, qword[rsp + StackLayout::GUEST_RET_ADDR]);
add(rsp, static_cast<uint32_t>(stack_size()));
jmp(rax);
} else {
// Return address is from the previous SET_RETURN_ADDRESS.
mov(rdx, qword[rsp + StackLayout::GUEST_CALL_RET_ADDR]);
call(rax);
}
}
// NOTE: slot count limited by short jump size.
const int kICSlotCount = 4;
const int kICSlotSize = 23;
const uint64_t kICSlotInvalidTargetAddress = 0x0F0F0F0F0F0F0F0F;
uint64_t ResolveFunctionAddress(void* raw_context, uint32_t target_address) {
// TODO(benvanik): generate this thunk at runtime? or a shim?
// This is used by the X64ThunkEmitter's ResolveFunctionThunk.
extern "C" uint64_t ResolveFunction(void* raw_context,
uint32_t target_address) {
auto thread_state = *reinterpret_cast<ThreadState**>(raw_context);
// TODO(benvanik): required?
@@ -424,100 +339,26 @@ uint64_t ResolveFunctionAddress(void* raw_context, uint32_t target_address) {
auto x64_fn = static_cast<X64Function*>(fn);
uint64_t addr = reinterpret_cast<uint64_t>(x64_fn->machine_code());
// Add an IC slot, if there is room.
#if XE_PLATFORM_WIN32
uint64_t return_address = reinterpret_cast<uint64_t>(_ReturnAddress());
#else
uint64_t return_address =
reinterpret_cast<uint64_t>(__builtin_return_address(0));
#endif // XE_PLATFORM_WIN32
#pragma pack(push, 1)
struct Asm {
uint16_t cmp_rdx;
uint32_t address_constant;
uint16_t jmp_next_slot;
uint16_t mov_rax;
uint64_t target_constant;
uint8_t jmp_skip_resolve[5];
};
#pragma pack(pop)
static_assert_size(Asm, kICSlotSize);
// TODO(benvanik): quick check table is full (so we don't have to enum slots)
// The return address points to ReloadRCX work after the call.
// To get the top of the table, look back a ways.
uint64_t table_start = return_address - 12 - kICSlotSize * kICSlotCount;
// NOTE: order matters here - we update the address BEFORE we switch the code
// over to passing the compare.
Asm* table_slot = reinterpret_cast<Asm*>(table_start);
bool wrote_ic = false;
for (int i = 0; i < kICSlotCount; ++i) {
if (xe::atomic_cas(kICSlotInvalidTargetAddress, addr,
&table_slot->target_constant)) {
// Got slot! Just write the compare and we're done.
table_slot->address_constant = static_cast<uint32_t>(target_address);
wrote_ic = true;
break;
}
++table_slot;
}
if (!wrote_ic) {
// TODO(benvanik): log that IC table is full.
}
// We need to return the target in rax so that it gets called.
return addr;
}
void X64Emitter::CallIndirect(const hir::Instr* instr, const Reg64& reg) {
// Check if return.
if (instr->flags & CALL_POSSIBLE_RETURN) {
cmp(reg.cvt32(), dword[rsp + StackLayout::GUEST_RET_ADDR]);
je("epilog", CodeGenerator::T_NEAR);
}
if (reg.getIdx() != rdx.getIdx()) {
mov(rdx, reg);
}
inLocalLabel();
Xbyak::Label skip_resolve;
// TODO(benvanik): make empty tables skippable (cmp, jump right to resolve).
// IC table, initially empty.
// This will get filled in as functions are resolved.
// Note that we only have a limited cache, and once it's full all calls
// will fall through.
// TODO(benvanik): check miss rate when full and add a 2nd-level table?
// 0000000264BD4DC3 81 FA 0F0F0F0F cmp edx,0F0F0F0Fh
// 0000000264BD4DC9 75 0C jne 0000000264BD4DD7
// 0000000264BD4DCB 48 B8 0F0F0F0F0F0F0F0F mov rax,0F0F0F0F0F0F0F0Fh
// 0000000264BD4DD5 EB XXXXXXXX jmp 0000000264BD4E00
size_t table_start = getSize();
for (int i = 0; i < kICSlotCount; ++i) {
// Compare target address with constant, if matches jump there.
// Otherwise, fall through.
// 6b
cmp(edx, 0x0F0F0F0F);
Xbyak::Label next_slot;
// 2b
jne(next_slot, T_SHORT);
// Match! Load up rax and skip down to the jmp code.
// 10b
mov(rax, kICSlotInvalidTargetAddress);
// 5b
jmp(skip_resolve, T_NEAR);
L(next_slot);
}
size_t table_size = getSize() - table_start;
assert_true(table_size == kICSlotSize * kICSlotCount);
void X64Emitter::Call(const hir::Instr* instr, FunctionInfo* symbol_info) {
auto fn = reinterpret_cast<X64Function*>(symbol_info->function());
// Resolve address to the function to call and store in rax.
// We fall through to this when there are no hits in the IC table.
CallNative(ResolveFunctionAddress);
if (fn) {
// TODO(benvanik): is it worth it to do this? It removes the need for
// a ResolveFunction call, but makes the table less useful.
assert_zero(uint64_t(fn->machine_code()) & 0xFFFFFFFF00000000);
mov(eax, uint32_t(uint64_t(fn->machine_code())));
} else {
// Load the pointer to the indirection table maintained in X64CodeCache.
// The target dword will either contain the address of the generated code
// or a thunk to ResolveFunction.
mov(ebx, symbol_info->address());
mov(eax, dword[ebx]);
}
// Actually jump/call to rax.
L(skip_resolve);
if (instr->flags & CALL_TAIL) {
// Since we skip the prolog we need to mark the return here.
EmitTraceUserCallReturn();
@@ -530,10 +371,42 @@ void X64Emitter::CallIndirect(const hir::Instr* instr, const Reg64& reg) {
} else {
// Return address is from the previous SET_RETURN_ADDRESS.
mov(rdx, qword[rsp + StackLayout::GUEST_CALL_RET_ADDR]);
call(rax);
}
}
outLocalLabel();
void X64Emitter::CallIndirect(const hir::Instr* instr, const Reg64& reg) {
// Check if return.
if (instr->flags & CALL_POSSIBLE_RETURN) {
cmp(reg.cvt32(), dword[rsp + StackLayout::GUEST_RET_ADDR]);
je("epilog", CodeGenerator::T_NEAR);
}
// Load the pointer to the indirection table maintained in X64CodeCache.
// The target dword will either contain the address of the generated code
// or a thunk to ResolveFunction.
if (reg.cvt32() != ebx) {
mov(ebx, reg.cvt32());
}
mov(eax, dword[ebx]);
// Actually jump/call to rax.
if (instr->flags & CALL_TAIL) {
// Since we skip the prolog we need to mark the return here.
EmitTraceUserCallReturn();
// Pass the caller's return address over.
mov(rdx, qword[rsp + StackLayout::GUEST_RET_ADDR]);
add(rsp, static_cast<uint32_t>(stack_size()));
jmp(rax);
} else {
// Return address is from the previous SET_RETURN_ADDRESS.
mov(rdx, qword[rsp + StackLayout::GUEST_CALL_RET_ADDR]);
call(rax);
}
}
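The rewritten CallIndirect is now just: copy the guest target into ebx, do one 32-bit load from the indirection table (which sits at kIndirectionTableBase in host space, so the raw guest address doubles as the slot's host address), then call or tail-jump through rax. End to end, a cold versus warm call behaves roughly like this sketch (placeholder globals; in the real code the slot update happens on the code-cache side):

#include <cstdint>

extern "C" uint64_t ResolveFunction(void* raw_context, uint32_t target_address);

static uint32_t* indirection_table;    // one slot per guest code dword (placeholder)
static uint32_t kTableBase;            // guest address of slot 0 (placeholder)
static uint32_t resolve_thunk_value;   // default slot contents (placeholder)

// Conceptual dispatch for one guest call.
uint64_t DispatchGuestCall(void* context, uint32_t guest_address) {
  uint32_t& slot = indirection_table[(guest_address - kTableBase) / 4];
  if (slot == resolve_thunk_value) {
    // Cold call: land in the thunk, demand-compile, then warm the slot so the
    // next call skips resolution entirely.
    uint64_t host_address = ResolveFunction(context, guest_address);
    slot = static_cast<uint32_t>(host_address);
    return host_address;
  }
  // Warm call: a single load and we're at the compiled code.
  return slot;
}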
uint64_t UndefinedCallExtern(void* raw_context, uint64_t symbol_info_ptr) {

View File

@@ -141,6 +141,52 @@ GuestToHostThunk X64ThunkEmitter::EmitGuestToHostThunk() {
return (GuestToHostThunk)fn;
}
// X64Emitter handles actually resolving functions.
extern "C" uint64_t ResolveFunction(void* raw_context, uint32_t target_address);
ResolveFunctionThunk X64ThunkEmitter::EmitResolveFunctionThunk() {
// ebx = target PPC address
// rcx = context
const size_t stack_size = StackLayout::THUNK_STACK_SIZE;
// rsp + 0 = return address
mov(qword[rsp + 8 * 2], rdx);
mov(qword[rsp + 8 * 1], rcx);
sub(rsp, stack_size);
mov(qword[rsp + 48], rbx);
mov(qword[rsp + 56], rcx);
mov(qword[rsp + 64], rbp);
mov(qword[rsp + 72], rsi);
mov(qword[rsp + 80], rdi);
mov(qword[rsp + 88], r12);
mov(qword[rsp + 96], r13);
mov(qword[rsp + 104], r14);
mov(qword[rsp + 112], r15);
mov(rdx, rbx);
mov(rax, uint64_t(&ResolveFunction));
call(rax);
mov(rbx, qword[rsp + 48]);
mov(rcx, qword[rsp + 56]);
mov(rbp, qword[rsp + 64]);
mov(rsi, qword[rsp + 72]);
mov(rdi, qword[rsp + 80]);
mov(r12, qword[rsp + 88]);
mov(r13, qword[rsp + 96]);
mov(r14, qword[rsp + 104]);
mov(r15, qword[rsp + 112]);
add(rsp, stack_size);
mov(rcx, qword[rsp + 8 * 1]);
mov(rdx, qword[rsp + 8 * 2]);
jmp(rax);
void* fn = Emplace(0, stack_size);
return (ResolveFunctionThunk)fn;
}
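For reference, the save/restore offsets above lay out like this within THUNK_STACK_SIZE (a sketch reconstructed from the mov()s; the actual StackLayout constants are not shown in this diff):

#include <cstdint>

struct ResolveThunkSaveArea {
  uint8_t reserved[48];                                   // rsp+0..47
  uint64_t rbx, rcx, rbp, rsi, rdi, r12, r13, r14, r15;   // rsp+48..119
};
static_assert(sizeof(ResolveThunkSaveArea) == 120,
              "last saved register (r15) ends at rsp+120");

The thunk takes the guest target in ebx and the context in rcx, spills the registers the resolver might clobber, calls ResolveFunction(context, target), restores state, and tail-jumps to the returned host address in rax.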
} // namespace x64
} // namespace backend
} // namespace cpu

View File

@@ -133,6 +133,9 @@ class X64ThunkEmitter : public X64Emitter {
// Function that guest code can call to transition into host code.
GuestToHostThunk EmitGuestToHostThunk();
// Function that thunks to the ResolveFunction in X64Emitter.
ResolveFunctionThunk EmitResolveFunctionThunk();
};
} // namespace x64