Simple IC for indirect calls.

This commit is contained in:
Ben Vanik 2014-08-06 16:31:38 -07:00
parent 4ce81fcda8
commit 6b581bcc75
3 changed files with 114 additions and 22 deletions

View File

@ -239,18 +239,6 @@ void X64Emitter::UnimplementedInstr(const hir::Instr* i) {
const size_t TOTAL_RESOLVE_SIZE = 27; const size_t TOTAL_RESOLVE_SIZE = 27;
const size_t ASM_OFFSET = 2 + 2 + 8 + 2 + 8; const size_t ASM_OFFSET = 2 + 2 + 8 + 2 + 8;
// Length Assembly Byte Sequence
// =================================================================================
// 2 bytes 66 NOP 66 90H
// 3 bytes NOP DWORD ptr [EAX] 0F 1F 00H
// 4 bytes NOP DWORD ptr [EAX + 00H] 0F 1F 40 00H
// 5 bytes NOP DWORD ptr [EAX + EAX*1 + 00H] 0F 1F 44 00 00H
// 6 bytes 66 NOP DWORD ptr [EAX + EAX*1 + 00H] 66 0F 1F 44 00 00H
// 7 bytes NOP DWORD ptr [EAX + 00000000H] 0F 1F 80 00 00 00 00H
// 8 bytes NOP DWORD ptr [EAX + EAX*1 + 00000000H] 0F 1F 84 00 00 00 00 00H
// 9 bytes 66 NOP DWORD ptr [EAX + EAX*1 + 00000000H] 66 0F 1F 84 00 00 00 00
// 00H
uint64_t ResolveFunctionSymbol(void* raw_context, uint64_t symbol_info_ptr) { uint64_t ResolveFunctionSymbol(void* raw_context, uint64_t symbol_info_ptr) {
// TODO(benvanik): generate this thunk at runtime? or a shim? // TODO(benvanik): generate this thunk at runtime? or a shim?
auto thread_state = *reinterpret_cast<ThreadState**>(raw_context); auto thread_state = *reinterpret_cast<ThreadState**>(raw_context);
@ -281,6 +269,7 @@ uint64_t ResolveFunctionSymbol(void* raw_context, uint64_t symbol_info_ptr) {
uint8_t mov_rcx[5]; uint8_t mov_rcx[5];
}; };
#pragma pack(pop) #pragma pack(pop)
static_assert_size(Asm, TOTAL_RESOLVE_SIZE);
Asm* code = reinterpret_cast<Asm*>(return_address - ASM_OFFSET); Asm* code = reinterpret_cast<Asm*>(return_address - ASM_OFFSET);
code->rax_constant = addr; code->rax_constant = addr;
code->call_rax = 0x9066; code->call_rax = 0x9066;
@ -293,7 +282,6 @@ void X64Emitter::Call(const hir::Instr* instr,
runtime::FunctionInfo* symbol_info) { runtime::FunctionInfo* symbol_info) {
auto fn = reinterpret_cast<X64Function*>(symbol_info->function()); auto fn = reinterpret_cast<X64Function*>(symbol_info->function());
// Resolve address to the function to call and store in rax. // Resolve address to the function to call and store in rax.
// TODO(benvanik): caching/etc. For now this makes debugging easier.
if (fn) { if (fn) {
mov(rax, reinterpret_cast<uint64_t>(fn->machine_code())); mov(rax, reinterpret_cast<uint64_t>(fn->machine_code()));
} else { } else {
@ -325,18 +313,66 @@ void X64Emitter::Call(const hir::Instr* instr,
} }
} }
// NOTE: slot count limited by short jump size.
const int kICSlotCount = 4;
const int kICSlotSize = 23;
const uint64_t kICSlotInvalidTargetAddress = 0x0F0F0F0F0F0F0F0F;
uint64_t ResolveFunctionAddress(void* raw_context, uint64_t target_address) { uint64_t ResolveFunctionAddress(void* raw_context, uint64_t target_address) {
// TODO(benvanik): generate this thunk at runtime? or a shim? // TODO(benvanik): generate this thunk at runtime? or a shim?
auto thread_state = *reinterpret_cast<ThreadState**>(raw_context); auto thread_state = *reinterpret_cast<ThreadState**>(raw_context);
// TODO(benvanik): required? // TODO(benvanik): required?
target_address &= 0xFFFFFFFF; target_address &= 0xFFFFFFFF;
assert_not_zero(target_address);
Function* fn = NULL; Function* fn = NULL;
thread_state->runtime()->ResolveFunction(target_address, &fn); thread_state->runtime()->ResolveFunction(target_address, &fn);
assert_not_null(fn); assert_not_null(fn);
auto x64_fn = static_cast<X64Function*>(fn); auto x64_fn = static_cast<X64Function*>(fn);
return reinterpret_cast<uint64_t>(x64_fn->machine_code()); uint64_t addr = reinterpret_cast<uint64_t>(x64_fn->machine_code());
// Add an IC slot, if there is room.
#if XE_LIKE_WIN32
uint64_t return_address = reinterpret_cast<uint64_t>(_ReturnAddress());
#else
uint64_t return_address =
reinterpret_cast<uint64_t>(__builtin_return_address(0));
#endif // XE_WIN32_LIKE
#pragma pack(push, 1)
struct Asm {
uint16_t cmp_rdx;
uint32_t address_constant;
uint16_t jmp_next_slot;
uint16_t mov_rax;
uint64_t target_constant;
uint8_t jmp_skip_resolve[5];
};
#pragma pack(pop)
static_assert_size(Asm, kICSlotSize);
// The return address points to ReloadRCX work after the call.
// To get the top of the table, look back a ways.
uint64_t table_start = return_address - 12 - kICSlotSize * kICSlotCount;
// NOTE: order matters here - we update the address BEFORE we switch the code
// over to passing the compare.
Asm* table_slot = reinterpret_cast<Asm*>(table_start);
bool wrote_ic = false;
for (int i = 0; i < kICSlotCount; ++i) {
if (poly::atomic_cas(kICSlotInvalidTargetAddress, addr,
&table_slot->target_constant)) {
// Got slot! Just write the compare and we're done.
table_slot->address_constant = static_cast<uint32_t>(target_address);
wrote_ic = true;
break;
}
++table_slot;
}
if (!wrote_ic) {
// TODO(benvanik): log that IC table is full.
}
// We need to return the target in rax so that it gets called.
return addr;
} }
void X64Emitter::CallIndirect(const hir::Instr* instr, const Reg64& reg) { void X64Emitter::CallIndirect(const hir::Instr* instr, const Reg64& reg) {
@ -346,14 +382,49 @@ void X64Emitter::CallIndirect(const hir::Instr* instr, const Reg64& reg) {
je("epilog", CodeGenerator::T_NEAR); je("epilog", CodeGenerator::T_NEAR);
} }
// Resolve address to the function to call and store in rax.
// TODO(benvanik): caching/etc. For now this makes debugging easier.
if (reg.getIdx() != rdx.getIdx()) { if (reg.getIdx() != rdx.getIdx()) {
mov(rdx, reg); mov(rdx, reg);
} }
inLocalLabel();
Xbyak::Label skip_resolve;
// TODO(benvanik): make empty tables skippable (cmp, jump right to resolve).
// IC table, initially empty.
// This will get filled in as functions are resolved.
// Note that we only have a limited cache, and once it's full all calls
// will fall through.
// TODO(benvanik): check miss rate when full and add a 2nd-level table?
// 0000000264BD4DC3 81 FA 0F0F0F0F cmp edx,0F0F0F0Fh
// 0000000264BD4DC9 75 0C jne 0000000264BD4DD7
// 0000000264BD4DCB 48 B8 0F0F0F0F0F0F0F0F mov rax,0F0F0F0F0F0F0F0Fh
// 0000000264BD4DD5 EB XXXXXXXX jmp 0000000264BD4E00
size_t table_start = getSize();
for (int i = 0; i < kICSlotCount; ++i) {
// Compare target address with constant, if matches jump there.
// Otherwise, fall through.
// 6b
cmp(edx, 0x0F0F0F0F);
Xbyak::Label next_slot;
// 2b
jne(next_slot, T_SHORT);
// Match! Load up rax and skip down to the jmp code.
// 10b
mov(rax, kICSlotInvalidTargetAddress);
// 5b
jmp(skip_resolve, T_NEAR);
L(next_slot);
}
size_t table_size = getSize() - table_start;
assert_true(table_size == kICSlotSize * kICSlotCount);
// Resolve address to the function to call and store in rax.
// We fall through to this when there are no hits in the IC table.
CallNative(ResolveFunctionAddress); CallNative(ResolveFunctionAddress);
// Actually jump/call to rax. // Actually jump/call to rax.
L(skip_resolve);
if (instr->flags & CALL_TAIL) { if (instr->flags & CALL_TAIL) {
// Pass the callers return address over. // Pass the callers return address over.
mov(rdx, qword[rsp + StackLayout::GUEST_RET_ADDR]); mov(rdx, qword[rsp + StackLayout::GUEST_RET_ADDR]);
@ -365,6 +436,8 @@ void X64Emitter::CallIndirect(const hir::Instr* instr, const Reg64& reg) {
mov(rdx, qword[rsp + StackLayout::GUEST_CALL_RET_ADDR]); mov(rdx, qword[rsp + StackLayout::GUEST_CALL_RET_ADDR]);
call(rax); call(rax);
} }
outLocalLabel();
} }
uint64_t UndefinedCallExtern(void* raw_context, uint64_t symbol_info_ptr) { uint64_t UndefinedCallExtern(void* raw_context, uint64_t symbol_info_ptr) {
@ -451,6 +524,23 @@ void X64Emitter::ReloadEDX() {
mov(rdx, qword[rcx + 8]); // membase mov(rdx, qword[rcx + 8]); // membase
} }
// Len Assembly Byte Sequence
// ============================================================================
// 2b 66 NOP 66 90H
// 3b NOP DWORD ptr [EAX] 0F 1F 00H
// 4b NOP DWORD ptr [EAX + 00H] 0F 1F 40 00H
// 5b NOP DWORD ptr [EAX + EAX*1 + 00H] 0F 1F 44 00 00H
// 6b 66 NOP DWORD ptr [EAX + EAX*1 + 00H] 66 0F 1F 44 00 00H
// 7b NOP DWORD ptr [EAX + 00000000H] 0F 1F 80 00 00 00 00H
// 8b NOP DWORD ptr [EAX + EAX*1 + 00000000H] 0F 1F 84 00 00 00 00 00H
// 9b 66 NOP DWORD ptr [EAX + EAX*1 + 00000000H] 66 0F 1F 84 00 00 00 00 00H
void X64Emitter::nop(size_t length) {
// TODO(benvanik): fat nop
for (size_t i = 0; i < length; ++i) {
db(0x90);
}
}
void X64Emitter::LoadEflags() { void X64Emitter::LoadEflags() {
#if STORE_EFLAGS #if STORE_EFLAGS
mov(eax, dword[rsp + STASH_OFFSET]); mov(eax, dword[rsp + STASH_OFFSET]);

View File

@ -138,6 +138,8 @@ class X64Emitter : public Xbyak::CodeGenerator {
void ReloadECX(); void ReloadECX();
void ReloadEDX(); void ReloadEDX();
void nop(size_t length = 1);
// TODO(benvanik): Label for epilog (don't use strings). // TODO(benvanik): Label for epilog (don't use strings).
void LoadEflags(); void LoadEflags();

View File

@ -366,7 +366,7 @@ EMITTER(CALL_INDIRECT_TRUE_I8, MATCH(I<OPCODE_CALL_INDIRECT_TRUE, VoidOp, I8<>,
static void Emit(X64Emitter& e, const EmitArgType& i) { static void Emit(X64Emitter& e, const EmitArgType& i) {
e.test(i.src1, i.src1); e.test(i.src1, i.src1);
Xbyak::Label skip; Xbyak::Label skip;
e.jz(skip); e.jz(skip, CodeGenerator::T_NEAR);
e.CallIndirect(i.instr, i.src2); e.CallIndirect(i.instr, i.src2);
e.L(skip); e.L(skip);
} }
@ -375,7 +375,7 @@ EMITTER(CALL_INDIRECT_TRUE_I16, MATCH(I<OPCODE_CALL_INDIRECT_TRUE, VoidOp, I16<>
static void Emit(X64Emitter& e, const EmitArgType& i) { static void Emit(X64Emitter& e, const EmitArgType& i) {
e.test(i.src1, i.src1); e.test(i.src1, i.src1);
Xbyak::Label skip; Xbyak::Label skip;
e.jz(skip); e.jz(skip, CodeGenerator::T_NEAR);
e.CallIndirect(i.instr, i.src2); e.CallIndirect(i.instr, i.src2);
e.L(skip); e.L(skip);
} }
@ -384,7 +384,7 @@ EMITTER(CALL_INDIRECT_TRUE_I32, MATCH(I<OPCODE_CALL_INDIRECT_TRUE, VoidOp, I32<>
static void Emit(X64Emitter& e, const EmitArgType& i) { static void Emit(X64Emitter& e, const EmitArgType& i) {
e.test(i.src1, i.src1); e.test(i.src1, i.src1);
Xbyak::Label skip; Xbyak::Label skip;
e.jz(skip); e.jz(skip, CodeGenerator::T_NEAR);
e.CallIndirect(i.instr, i.src2); e.CallIndirect(i.instr, i.src2);
e.L(skip); e.L(skip);
} }
@ -393,7 +393,7 @@ EMITTER(CALL_INDIRECT_TRUE_I64, MATCH(I<OPCODE_CALL_INDIRECT_TRUE, VoidOp, I64<>
static void Emit(X64Emitter& e, const EmitArgType& i) { static void Emit(X64Emitter& e, const EmitArgType& i) {
e.test(i.src1, i.src1); e.test(i.src1, i.src1);
Xbyak::Label skip; Xbyak::Label skip;
e.jz(skip); e.jz(skip, CodeGenerator::T_NEAR);
e.CallIndirect(i.instr, i.src2); e.CallIndirect(i.instr, i.src2);
e.L(skip); e.L(skip);
} }
@ -402,7 +402,7 @@ EMITTER(CALL_INDIRECT_TRUE_F32, MATCH(I<OPCODE_CALL_INDIRECT_TRUE, VoidOp, F32<>
static void Emit(X64Emitter& e, const EmitArgType& i) { static void Emit(X64Emitter& e, const EmitArgType& i) {
e.vptest(i.src1, i.src1); e.vptest(i.src1, i.src1);
Xbyak::Label skip; Xbyak::Label skip;
e.jz(skip); e.jz(skip, CodeGenerator::T_NEAR);
e.CallIndirect(i.instr, i.src2); e.CallIndirect(i.instr, i.src2);
e.L(skip); e.L(skip);
} }
@ -411,7 +411,7 @@ EMITTER(CALL_INDIRECT_TRUE_F64, MATCH(I<OPCODE_CALL_INDIRECT_TRUE, VoidOp, F64<>
static void Emit(X64Emitter& e, const EmitArgType& i) { static void Emit(X64Emitter& e, const EmitArgType& i) {
e.vptest(i.src1, i.src1); e.vptest(i.src1, i.src1);
Xbyak::Label skip; Xbyak::Label skip;
e.jz(skip); e.jz(skip, CodeGenerator::T_NEAR);
e.CallIndirect(i.instr, i.src2); e.CallIndirect(i.instr, i.src2);
e.L(skip); e.L(skip);
} }