Simple IC for indirect calls.

This commit is contained in:
Ben Vanik 2014-08-06 16:31:38 -07:00
parent 4ce81fcda8
commit 6b581bcc75
3 changed files with 114 additions and 22 deletions

View File

@ -239,18 +239,6 @@ void X64Emitter::UnimplementedInstr(const hir::Instr* i) {
// Size in bytes of the packed Asm patch layout that ResolveFunctionSymbol
// overwrites; enforced there with static_assert_size.
const size_t TOTAL_RESOLVE_SIZE = 27;
// Distance in bytes from return_address back to the start of that Asm layout
// (ResolveFunctionSymbol subtracts it from return_address to find the code).
const size_t ASM_OFFSET = 2 + 2 + 8 + 2 + 8;
// Length Assembly Byte Sequence
// =================================================================================
// 2 bytes 66 NOP 66 90H
// 3 bytes NOP DWORD ptr [EAX] 0F 1F 00H
// 4 bytes NOP DWORD ptr [EAX + 00H] 0F 1F 40 00H
// 5 bytes NOP DWORD ptr [EAX + EAX*1 + 00H] 0F 1F 44 00 00H
// 6 bytes 66 NOP DWORD ptr [EAX + EAX*1 + 00H] 66 0F 1F 44 00 00H
// 7 bytes NOP DWORD ptr [EAX + 00000000H] 0F 1F 80 00 00 00 00H
// 8 bytes NOP DWORD ptr [EAX + EAX*1 + 00000000H] 0F 1F 84 00 00 00 00 00H
// 9 bytes 66 NOP DWORD ptr [EAX + EAX*1 + 00000000H] 66 0F 1F 84 00 00 00 00
// 00H
uint64_t ResolveFunctionSymbol(void* raw_context, uint64_t symbol_info_ptr) {
// TODO(benvanik): generate this thunk at runtime? or a shim?
auto thread_state = *reinterpret_cast<ThreadState**>(raw_context);
@ -281,6 +269,7 @@ uint64_t ResolveFunctionSymbol(void* raw_context, uint64_t symbol_info_ptr) {
uint8_t mov_rcx[5];
};
#pragma pack(pop)
static_assert_size(Asm, TOTAL_RESOLVE_SIZE);
Asm* code = reinterpret_cast<Asm*>(return_address - ASM_OFFSET);
code->rax_constant = addr;
code->call_rax = 0x9066;
@ -293,7 +282,6 @@ void X64Emitter::Call(const hir::Instr* instr,
runtime::FunctionInfo* symbol_info) {
auto fn = reinterpret_cast<X64Function*>(symbol_info->function());
// Resolve address to the function to call and store in rax.
// TODO(benvanik): caching/etc. For now this makes debugging easier.
if (fn) {
mov(rax, reinterpret_cast<uint64_t>(fn->machine_code()));
} else {
@ -325,18 +313,66 @@ void X64Emitter::Call(const hir::Instr* instr,
}
}
// NOTE: slot count limited by short jump size.
// NOTE(review): CallIndirect emits each slot's exit jump as T_NEAR and the
// short jne only spans a single slot, so this limit may be stale - confirm.
// Number of compare/jump slots emitted in each indirect call's IC table.
const int kICSlotCount = 4;
// Size in bytes of one emitted slot; checked against the packed Asm layout in
// ResolveFunctionAddress and against the emitted table size in CallIndirect.
const int kICSlotSize = 23;
// Placeholder emitted as a slot's 64-bit target; ResolveFunctionAddress claims
// a slot by atomically swapping this value for a real target address.
const uint64_t kICSlotInvalidTargetAddress = 0x0F0F0F0F0F0F0F0F;
// Slow path for indirect calls: resolves the guest address in |target_address|
// to its generated machine code and tries to record the result in the inline
// cache (IC) table that X64Emitter::CallIndirect emits directly before the
// call to this function.
//
// raw_context: pointer to the context block; its first pointer-sized field is
//     read as the owning ThreadState*.
// target_address: guest address of the callee; only the low 32 bits are used.
//
// Returns the host address of the resolved function's machine code. The
// emitted code calls/jumps through the returned value (rax).
uint64_t ResolveFunctionAddress(void* raw_context, uint64_t target_address) {
  // TODO(benvanik): generate this thunk at runtime? or a shim?
  auto thread_state = *reinterpret_cast<ThreadState**>(raw_context);

  // TODO(benvanik): required?
  target_address &= 0xFFFFFFFF;
  assert_not_zero(target_address);

  Function* fn = nullptr;
  thread_state->runtime()->ResolveFunction(target_address, &fn);
  assert_not_null(fn);
  auto x64_fn = static_cast<X64Function*>(fn);
  uint64_t addr = reinterpret_cast<uint64_t>(x64_fn->machine_code());

  // Add an IC slot, if there is room.
#if XE_LIKE_WIN32
  uint64_t return_address = reinterpret_cast<uint64_t>(_ReturnAddress());
#else
  uint64_t return_address =
      reinterpret_cast<uint64_t>(__builtin_return_address(0));
#endif  // XE_LIKE_WIN32

  // Byte layout of a single IC slot as emitted by CallIndirect.
#pragma pack(push, 1)
  struct Asm {
    uint16_t cmp_rdx;             // 81 FA: cmp edx, imm32
    uint32_t address_constant;    // imm32: guest address this slot matches
    uint16_t jmp_next_slot;       // jne rel8 to the following slot
    uint16_t mov_rax;             // 48 B8: mov rax, imm64
    uint64_t target_constant;     // imm64: host address to call on a match
    uint8_t jmp_skip_resolve[5];  // near jmp past the resolve call
  };
#pragma pack(pop)
  static_assert_size(Asm, kICSlotSize);

  // The return address points to ReloadRCX work after the call.
  // To get the top of the table, look back a ways: first over the native call
  // sequence, then over the whole table.
  // NOTE(review): 12 is presumably the size of what CallNative emits
  // (mov rax, imm64 + call rax) - confirm against CallNative.
  const uint64_t kNativeCallSize = 12;
  uint64_t table_start =
      return_address - kNativeCallSize - kICSlotSize * kICSlotCount;

  // NOTE: order matters here - we update the address BEFORE we switch the code
  //       over to passing the compare.
  Asm* table_slot = reinterpret_cast<Asm*>(table_start);
  bool wrote_ic = false;
  for (int i = 0; i < kICSlotCount; ++i) {
    // A slot is free while its target still holds the placeholder value; the
    // CAS claims it so that no other resolver can take the same slot.
    if (poly::atomic_cas(kICSlotInvalidTargetAddress, addr,
                         &table_slot->target_constant)) {
      // Got slot! Just write the compare and we're done.
      table_slot->address_constant = static_cast<uint32_t>(target_address);
      wrote_ic = true;
      break;
    }
    ++table_slot;
  }
  if (!wrote_ic) {
    // TODO(benvanik): log that IC table is full.
  }

  // We need to return the target in rax so that it gets called.
  return addr;
}
void X64Emitter::CallIndirect(const hir::Instr* instr, const Reg64& reg) {
@ -346,14 +382,49 @@ void X64Emitter::CallIndirect(const hir::Instr* instr, const Reg64& reg) {
je("epilog", CodeGenerator::T_NEAR);
}
// Resolve address to the function to call and store in rax.
// TODO(benvanik): caching/etc. For now this makes debugging easier.
if (reg.getIdx() != rdx.getIdx()) {
mov(rdx, reg);
}
inLocalLabel();
Xbyak::Label skip_resolve;
// TODO(benvanik): make empty tables skippable (cmp, jump right to resolve).
// IC table, initially empty.
// This will get filled in as functions are resolved.
// Note that we only have a limited cache, and once it's full all calls
// will fall through.
// TODO(benvanik): check miss rate when full and add a 2nd-level table?
// 0000000264BD4DC3 81 FA 0F0F0F0F cmp edx,0F0F0F0Fh
// 0000000264BD4DC9 75 0F jne 0000000264BD4DDA
// 0000000264BD4DCB 48 B8 0F0F0F0F0F0F0F0F mov rax,0F0F0F0F0F0F0F0Fh
// 0000000264BD4DD5 E9 XXXXXXXX jmp 0000000264BD4E00
size_t table_start = getSize();
for (int i = 0; i < kICSlotCount; ++i) {
// Compare target address with constant, if matches jump there.
// Otherwise, fall through.
// 6b
cmp(edx, 0x0F0F0F0F);
Xbyak::Label next_slot;
// 2b
jne(next_slot, T_SHORT);
// Match! Load up rax and skip down to the jmp code.
// 10b
mov(rax, kICSlotInvalidTargetAddress);
// 5b
jmp(skip_resolve, T_NEAR);
L(next_slot);
}
size_t table_size = getSize() - table_start;
assert_true(table_size == kICSlotSize * kICSlotCount);
// Resolve address to the function to call and store in rax.
// We fall through to this when there are no hits in the IC table.
CallNative(ResolveFunctionAddress);
// Actually jump/call to rax.
L(skip_resolve);
if (instr->flags & CALL_TAIL) {
// Pass the callers return address over.
mov(rdx, qword[rsp + StackLayout::GUEST_RET_ADDR]);
@ -365,6 +436,8 @@ void X64Emitter::CallIndirect(const hir::Instr* instr, const Reg64& reg) {
mov(rdx, qword[rsp + StackLayout::GUEST_CALL_RET_ADDR]);
call(rax);
}
outLocalLabel();
}
uint64_t UndefinedCallExtern(void* raw_context, uint64_t symbol_info_ptr) {
@ -451,6 +524,23 @@ void X64Emitter::ReloadEDX() {
mov(rdx, qword[rcx + 8]); // membase
}
// Len Assembly Byte Sequence
// ============================================================================
// 2b 66 NOP 66 90H
// 3b NOP DWORD ptr [EAX] 0F 1F 00H
// 4b NOP DWORD ptr [EAX + 00H] 0F 1F 40 00H
// 5b NOP DWORD ptr [EAX + EAX*1 + 00H] 0F 1F 44 00 00H
// 6b 66 NOP DWORD ptr [EAX + EAX*1 + 00H] 66 0F 1F 44 00 00H
// 7b NOP DWORD ptr [EAX + 00000000H] 0F 1F 80 00 00 00 00H
// 8b NOP DWORD ptr [EAX + EAX*1 + 00000000H] 0F 1F 84 00 00 00 00 00H
// 9b 66 NOP DWORD ptr [EAX + EAX*1 + 00000000H] 66 0F 1F 84 00 00 00 00 00H
// Emits |length| bytes of padding, each one a single-byte NOP (0x90).
void X64Emitter::nop(size_t length) {
  // TODO(benvanik): fat nop
  size_t remaining = length;
  while (remaining > 0) {
    db(0x90);
    --remaining;
  }
}
void X64Emitter::LoadEflags() {
#if STORE_EFLAGS
mov(eax, dword[rsp + STASH_OFFSET]);

View File

@ -138,6 +138,8 @@ class X64Emitter : public Xbyak::CodeGenerator {
void ReloadECX();
void ReloadEDX();
// Emits |length| bytes of NOP padding (a single byte when called with no
// argument).
void nop(size_t length = 1);
// TODO(benvanik): Label for epilog (don't use strings).
void LoadEflags();

View File

@ -366,7 +366,7 @@ EMITTER(CALL_INDIRECT_TRUE_I8, MATCH(I<OPCODE_CALL_INDIRECT_TRUE, VoidOp, I8<>,
// Emits a conditional indirect call: the call through src2 is performed only
// when src1 is non-zero.
static void Emit(X64Emitter& e, const EmitArgType& i) {
  e.test(i.src1, i.src1);
  Xbyak::Label skip;
  // The skip label sits past everything CallIndirect emits (IC table, resolve
  // call, call sequence), which can exceed short-jump range, so a single near
  // jump is requested explicitly.
  e.jz(skip, CodeGenerator::T_NEAR);
  e.CallIndirect(i.instr, i.src2);
  e.L(skip);
}
@ -375,7 +375,7 @@ EMITTER(CALL_INDIRECT_TRUE_I16, MATCH(I<OPCODE_CALL_INDIRECT_TRUE, VoidOp, I16<>
// Emits a conditional indirect call: the call through src2 is performed only
// when src1 is non-zero.
static void Emit(X64Emitter& e, const EmitArgType& i) {
  e.test(i.src1, i.src1);
  Xbyak::Label skip;
  // The skip label sits past everything CallIndirect emits (IC table, resolve
  // call, call sequence), which can exceed short-jump range, so a single near
  // jump is requested explicitly.
  e.jz(skip, CodeGenerator::T_NEAR);
  e.CallIndirect(i.instr, i.src2);
  e.L(skip);
}
@ -384,7 +384,7 @@ EMITTER(CALL_INDIRECT_TRUE_I32, MATCH(I<OPCODE_CALL_INDIRECT_TRUE, VoidOp, I32<>
// Emits a conditional indirect call: the call through src2 is performed only
// when src1 is non-zero.
static void Emit(X64Emitter& e, const EmitArgType& i) {
  e.test(i.src1, i.src1);
  Xbyak::Label skip;
  // The skip label sits past everything CallIndirect emits (IC table, resolve
  // call, call sequence), which can exceed short-jump range, so a single near
  // jump is requested explicitly.
  e.jz(skip, CodeGenerator::T_NEAR);
  e.CallIndirect(i.instr, i.src2);
  e.L(skip);
}
@ -393,7 +393,7 @@ EMITTER(CALL_INDIRECT_TRUE_I64, MATCH(I<OPCODE_CALL_INDIRECT_TRUE, VoidOp, I64<>
// Emits a conditional indirect call: the call through src2 is performed only
// when src1 is non-zero.
static void Emit(X64Emitter& e, const EmitArgType& i) {
  e.test(i.src1, i.src1);
  Xbyak::Label skip;
  // The skip label sits past everything CallIndirect emits (IC table, resolve
  // call, call sequence), which can exceed short-jump range, so a single near
  // jump is requested explicitly.
  e.jz(skip, CodeGenerator::T_NEAR);
  e.CallIndirect(i.instr, i.src2);
  e.L(skip);
}
@ -402,7 +402,7 @@ EMITTER(CALL_INDIRECT_TRUE_F32, MATCH(I<OPCODE_CALL_INDIRECT_TRUE, VoidOp, F32<>
// Emits a conditional indirect call: the call through src2 is skipped when
// vptest sets ZF for src1 (i.e. src1 AND src1 is all-zero bits).
static void Emit(X64Emitter& e, const EmitArgType& i) {
  e.vptest(i.src1, i.src1);
  Xbyak::Label skip;
  // The skip label sits past everything CallIndirect emits (IC table, resolve
  // call, call sequence), which can exceed short-jump range, so a single near
  // jump is requested explicitly.
  e.jz(skip, CodeGenerator::T_NEAR);
  e.CallIndirect(i.instr, i.src2);
  e.L(skip);
}
@ -411,7 +411,7 @@ EMITTER(CALL_INDIRECT_TRUE_F64, MATCH(I<OPCODE_CALL_INDIRECT_TRUE, VoidOp, F64<>
// Emits a conditional indirect call: the call through src2 is skipped when
// vptest sets ZF for src1 (i.e. src1 AND src1 is all-zero bits).
static void Emit(X64Emitter& e, const EmitArgType& i) {
  e.vptest(i.src1, i.src1);
  Xbyak::Label skip;
  // The skip label sits past everything CallIndirect emits (IC table, resolve
  // call, call sequence), which can exceed short-jump range, so a single near
  // jump is requested explicitly.
  e.jz(skip, CodeGenerator::T_NEAR);
  e.CallIndirect(i.instr, i.src2);
  e.L(skip);
}