[a64] Update guest calling conventions

Guest-function calls now use W17 to carry the indirect-call target address.
Wunkolo 2024-05-08 11:34:26 -07:00
parent fd32c0e959
commit dc6666d4d2
5 changed files with 81 additions and 133 deletions
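In short, indirect guest calls now keep the target PPC address in W17 and the resolved host entry point in X16, so both values stay in the AArch64 intra-procedure-call scratch registers (IP0/IP1) that ordinary code may clobber. A minimal sketch of the resulting call shape, reusing the oaknut-style emitter helpers that appear throughout the diff below; the wrapper function and its name are illustrative, not part of the commit:

// Sketch only, assuming the indirection table is mapped so that the guest
// address itself can be dereferenced (as the existing x64 backend does).
void EmitIndirectGuestCall(A64Emitter& e, uint32_t guest_address) {
  e.MOV(W17, guest_address);  // W17 = target PPC address
  e.LDR(W16, X17);            // table entry -> host code, or the resolver thunk
  e.BLR(X16);                 // tail calls restore the frame and use BR(X16) instead
}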

View File

@ -52,8 +52,8 @@ class A64ThunkEmitter : public A64Emitter {
// Caller saved:
// Don't assume these registers will survive a subroutine call
// x0, v0 is not saved/preserved since this is used to return values from
// subroutines x1-x15, x30 | d0-d7 and d16-v31
// x0, v0 is not saved for use as arg0/return
// x1-x15, x30 | v0-v7 and v16-v31
void EmitSaveVolatileRegs();
void EmitLoadVolatileRegs();
@ -223,47 +223,23 @@ HostToGuestThunk A64ThunkEmitter::EmitHostToGuestThunk() {
code_offsets.prolog = offset();
// mov(qword[rsp + 8 * 3], r8);
// mov(qword[rsp + 8 * 2], rdx);
// mov(qword[rsp + 8 * 1], rcx);
// sub(rsp, stack_size);
STR(X2, SP, 8 * 3);
STR(X1, SP, 8 * 2);
STR(X0, SP, 8 * 1);
SUB(SP, SP, stack_size);
code_offsets.prolog_stack_alloc = offset();
code_offsets.body = offset();
// Save nonvolatile registers.
EmitSaveNonvolatileRegs();
// mov(rax, rcx);
// mov(rsi, rdx); // context
// mov(rcx, r8); // return address
// call(rax);
MOV(X16, X0);
MOV(A64Emitter::GetContextReg(), X1); // context
MOV(X0, X2); // return address
MOV(GetContextReg(), X1); // context
MOV(X0, X2); // return address
BLR(X16);
EmitLoadNonvolatileRegs();
code_offsets.epilog = offset();
// add(rsp, stack_size);
// mov(rcx, qword[rsp + 8 * 1]);
// mov(rdx, qword[rsp + 8 * 2]);
// mov(r8, qword[rsp + 8 * 3]);
// ret();
ADD(SP, SP, stack_size);
LDR(X0, SP, 8 * 1);
LDR(X1, SP, 8 * 2);
LDR(X2, SP, 8 * 3);
RET();
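For reference, the host-to-guest thunk above receives the generated function in X0, the PPC context in X1, and the return address in X2, then shuffles them into the guest convention before branching. A rough host-side view, assuming the a64 backend keeps the x64 backend's three-argument thunk shape; the wrapper, its name, and the exact typedef are assumptions rather than part of this commit:

// Assumed shape: using HostToGuestThunk = uint64_t (*)(void* target, void* arg0, void* arg1);
uint64_t EnterGuest(HostToGuestThunk thunk, void* guest_entry,
                    ppc::PPCContext* context, void* return_address) {
  // Inside the thunk: X16 <- X0 (entry), context reg <- X1, X0 <- X2, BLR X16.
  return thunk(guest_entry, context, return_address);
}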
@ -302,19 +278,13 @@ GuestToHostThunk A64ThunkEmitter::EmitGuestToHostThunk() {
code_offsets.prolog = offset();
// rsp + 0 = return address
// sub(rsp, stack_size);
SUB(SP, SP, stack_size);
code_offsets.prolog_stack_alloc = offset();
code_offsets.body = offset();
// Save off volatile registers.
EmitSaveVolatileRegs();
// mov(rax, rcx); // function
// mov(rcx, GetContextReg()); // context
// call(rax);
MOV(X16, X0); // function
MOV(X0, GetContextReg()); // context
BLR(X16);
@ -323,8 +293,6 @@ GuestToHostThunk A64ThunkEmitter::EmitGuestToHostThunk() {
code_offsets.epilog = offset();
// add(rsp, stack_size);
// ret();
ADD(SP, SP, stack_size);
RET();
@ -350,11 +318,8 @@ uint64_t ResolveFunction(void* raw_context, uint64_t target_address);
ResolveFunctionThunk A64ThunkEmitter::EmitResolveFunctionThunk() {
// Entry:
// X0 = target PPC address
// Resolve Function:
// W17 = target PPC address
// X0 = context
// X1 = target PPC address
struct _code_offsets {
size_t prolog;
@ -369,22 +334,20 @@ ResolveFunctionThunk A64ThunkEmitter::EmitResolveFunctionThunk() {
code_offsets.prolog = offset();
// rsp + 0 = return address
// sub(rsp, stack_size);
SUB(SP, SP, stack_size);
code_offsets.prolog_stack_alloc = offset();
code_offsets.body = offset();
// Save volatile registers
EmitSaveVolatileRegs();
// mov(rcx, rsi); // context
// mov(rdx, rbx);
// mov(rax, reinterpret_cast<uint64_t>(&ResolveFunction));
// call(rax)
MOV(X1, X0);
MOV(X0, GetContextReg()); // context
MOVP2R(X16, &ResolveFunction);
MOV(W1, W17);
MOV(X16, reinterpret_cast<uint64_t>(&ResolveFunction));
BLR(X16);
EmitLoadVolatileRegs();
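The resolve thunk's entry convention changes to match: callers now pass the target PPC address in W17 instead of X0, and the thunk rearranges it into the C calling convention of ResolveFunction declared above (X0 = context, X1 = target address). A hedged restatement of that mapping as plain C++; the wrapper is illustrative only:

// Mirrors the emitted MOV(W1, W17) / MOV(X0, GetContextReg()) / BLR(X16) sequence.
uint64_t ResolveViaThunk(void* raw_context, uint32_t ppc_address /* arrives in W17 */) {
  return ResolveFunction(raw_context, ppc_address);  // X0 = context, X1 = address
}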
@ -432,7 +395,6 @@ void A64ThunkEmitter::EmitSaveVolatileRegs() {
STP(Q3, Q4, SP, offsetof(StackLayout::Thunk, xmm[2]));
STP(Q5, Q6, SP, offsetof(StackLayout::Thunk, xmm[4]));
STP(Q7, Q16, SP, offsetof(StackLayout::Thunk, xmm[6]));
STP(Q7, Q16, SP, offsetof(StackLayout::Thunk, xmm[6]));
STP(Q17, Q18, SP, offsetof(StackLayout::Thunk, xmm[8]));
STP(Q19, Q20, SP, offsetof(StackLayout::Thunk, xmm[10]));
STP(Q21, Q22, SP, offsetof(StackLayout::Thunk, xmm[12]));
@ -461,7 +423,6 @@ void A64ThunkEmitter::EmitLoadVolatileRegs() {
LDP(Q3, Q4, SP, offsetof(StackLayout::Thunk, xmm[2]));
LDP(Q5, Q6, SP, offsetof(StackLayout::Thunk, xmm[4]));
LDP(Q7, Q16, SP, offsetof(StackLayout::Thunk, xmm[6]));
LDP(Q7, Q16, SP, offsetof(StackLayout::Thunk, xmm[6]));
LDP(Q17, Q18, SP, offsetof(StackLayout::Thunk, xmm[8]));
LDP(Q19, Q20, SP, offsetof(StackLayout::Thunk, xmm[10]));
LDP(Q21, Q22, SP, offsetof(StackLayout::Thunk, xmm[12]));
@ -480,10 +441,12 @@ void A64ThunkEmitter::EmitSaveNonvolatileRegs() {
STP(X27, X28, SP, offsetof(StackLayout::Thunk, r[8]));
STP(X29, X30, SP, offsetof(StackLayout::Thunk, r[10]));
STP(Q8, Q9, SP, offsetof(StackLayout::Thunk, xmm[0]));
STP(Q10, Q11, SP, offsetof(StackLayout::Thunk, xmm[2]));
STP(Q12, Q13, SP, offsetof(StackLayout::Thunk, xmm[4]));
STP(Q14, Q15, SP, offsetof(StackLayout::Thunk, xmm[6]));
STR(X17, SP, offsetof(StackLayout::Thunk, r[12]));
STP(D8, D9, SP, offsetof(StackLayout::Thunk, xmm[0]));
STP(D10, D11, SP, offsetof(StackLayout::Thunk, xmm[1]));
STP(D12, D13, SP, offsetof(StackLayout::Thunk, xmm[2]));
STP(D14, D15, SP, offsetof(StackLayout::Thunk, xmm[3]));
}
void A64ThunkEmitter::EmitLoadNonvolatileRegs() {
@ -494,10 +457,12 @@ void A64ThunkEmitter::EmitLoadNonvolatileRegs() {
LDP(X27, X28, SP, offsetof(StackLayout::Thunk, r[8]));
LDP(X29, X30, SP, offsetof(StackLayout::Thunk, r[10]));
LDP(Q8, Q9, SP, offsetof(StackLayout::Thunk, xmm[0]));
LDP(Q10, Q11, SP, offsetof(StackLayout::Thunk, xmm[2]));
LDP(Q12, Q13, SP, offsetof(StackLayout::Thunk, xmm[4]));
LDP(Q14, Q15, SP, offsetof(StackLayout::Thunk, xmm[6]));
LDR(X17, SP, offsetof(StackLayout::Thunk, r[12]));
LDP(D8, D9, SP, offsetof(StackLayout::Thunk, xmm[0]));
LDP(D10, D11, SP, offsetof(StackLayout::Thunk, xmm[1]));
LDP(D12, D13, SP, offsetof(StackLayout::Thunk, xmm[2]));
LDP(D14, D15, SP, offsetof(StackLayout::Thunk, xmm[3]));
}
} // namespace a64
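The non-volatile spill area shrinks as well: AAPCS64 only requires the low 64 bits of v8-v15 (that is, D8-D15) to survive a call, and a pair of D registers fills one 16-byte slot, which is why the xmm[] indices now step by one instead of two. A minimal illustration of the matched save/restore pair, assuming the same StackLayout::Thunk slots; the helper names are not part of the commit:

void SaveCalleeSavedFp(A64ThunkEmitter& e) {
  e.STP(D8, D9, SP, offsetof(StackLayout::Thunk, xmm[0]));    // low halves only
  e.STP(D10, D11, SP, offsetof(StackLayout::Thunk, xmm[1]));
}
void LoadCalleeSavedFp(A64ThunkEmitter& e) {
  e.LDP(D8, D9, SP, offsetof(StackLayout::Thunk, xmm[0]));
  e.LDP(D10, D11, SP, offsetof(StackLayout::Thunk, xmm[1]));
}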

View File

@ -83,20 +83,6 @@ A64Emitter::A64Emitter(A64Backend* backend)
feature_flags_ |= (cpu_.has(ext) ? emit : 0); \
}
// TEST_EMIT_FEATURE(kA64EmitAVX2, oaknut::util::Cpu::tAVX2);
// TEST_EMIT_FEATURE(kA64EmitFMA, oaknut::util::Cpu::tFMA);
// TEST_EMIT_FEATURE(kA64EmitLZCNT, oaknut::util::Cpu::tLZCNT);
// TEST_EMIT_FEATURE(kA64EmitBMI1, oaknut::util::Cpu::tBMI1);
// TEST_EMIT_FEATURE(kA64EmitBMI2, oaknut::util::Cpu::tBMI2);
// TEST_EMIT_FEATURE(kA64EmitF16C, oaknut::util::Cpu::tF16C);
// TEST_EMIT_FEATURE(kA64EmitMovbe, oaknut::util::Cpu::tMOVBE);
// TEST_EMIT_FEATURE(kA64EmitGFNI, oaknut::util::Cpu::tGFNI);
// TEST_EMIT_FEATURE(kA64EmitAVX512F, oaknut::util::Cpu::tAVX512F);
// TEST_EMIT_FEATURE(kA64EmitAVX512VL, oaknut::util::Cpu::tAVX512VL);
// TEST_EMIT_FEATURE(kA64EmitAVX512BW, oaknut::util::Cpu::tAVX512BW);
// TEST_EMIT_FEATURE(kA64EmitAVX512DQ, oaknut::util::Cpu::tAVX512DQ);
// TEST_EMIT_FEATURE(kA64EmitAVX512VBMI, oaknut::util::Cpu::tAVX512_VBMI);
#undef TEST_EMIT_FEATURE
}
@ -218,15 +204,11 @@ bool A64Emitter::Emit(HIRBuilder* builder, EmitFunctionInfo& func_info) {
STP(X29, X30, SP, PRE_INDEXED, -32);
MOV(X29, SP);
// sub(rsp, (uint32_t)stack_size);
SUB(SP, SP, (uint32_t)stack_size);
code_offsets.prolog_stack_alloc = offset();
code_offsets.body = offset();
// mov(qword[rsp + StackLayout::GUEST_CTX_HOME], GetContextReg());
// mov(qword[rsp + StackLayout::GUEST_RET_ADDR], rcx);
// mov(qword[rsp + StackLayout::GUEST_CALL_RET_ADDR], 0);
STR(GetContextReg(), SP, StackLayout::GUEST_CTX_HOME);
STR(X0, SP, StackLayout::GUEST_RET_ADDR);
STR(XZR, SP, StackLayout::GUEST_CALL_RET_ADDR);
@ -260,8 +242,6 @@ bool A64Emitter::Emit(HIRBuilder* builder, EmitFunctionInfo& func_info) {
}
// Load membase.
// mov(GetMembaseReg(),
// qword[GetContextReg() + offsetof(ppc::PPCContext, virtual_membase)]);
LDR(GetMembaseReg(), GetContextReg(),
offsetof(ppc::PPCContext, virtual_membase));
@ -297,13 +277,10 @@ bool A64Emitter::Emit(HIRBuilder* builder, EmitFunctionInfo& func_info) {
l(epilog_label);
epilog_label_ = nullptr;
EmitTraceUserCallReturn();
// mov(GetContextReg(), qword[rsp + StackLayout::GUEST_CTX_HOME]);
LDR(GetContextReg(), SP, StackLayout::GUEST_CTX_HOME);
code_offsets.epilog = offset();
// add(rsp, (uint32_t)stack_size);
// ret();
ADD(SP, SP, (uint32_t)stack_size);
MOV(SP, X29);
@ -342,7 +319,6 @@ void A64Emitter::MarkSourceOffset(const Instr* i) {
if (cvars::emit_source_annotations) {
NOP();
NOP();
// mov(eax, entry->guest_address);
MOV(X0, entry->guest_address);
NOP();
NOP();
@ -451,8 +427,8 @@ void A64Emitter::Call(const hir::Instr* instr, GuestFunction* function) {
// or a thunk to ResolveAddress.
// mov(ebx, function->address());
// mov(eax, dword[ebx]);
MOV(W1, function->address());
LDR(W16, X1);
MOV(W17, function->address());
LDR(W16, X17);
} else {
// Old-style resolve.
// Not too important because indirection table is almost always available.
@ -472,7 +448,11 @@ void A64Emitter::Call(const hir::Instr* instr, GuestFunction* function) {
// add(rsp, static_cast<uint32_t>(stack_size()));
// jmp(rax);
ADD(SP, SP, stack_size());
ADD(SP, SP, static_cast<uint32_t>(stack_size()));
MOV(SP, X29);
LDP(X29, X30, SP, POST_INDEXED, 32);
BR(X16);
} else {
// Return address is from the previous SET_RETURN_ADDRESS.
@ -499,10 +479,11 @@ void A64Emitter::CallIndirect(const hir::Instr* instr,
// The target dword will either contain the address of the generated code
// or a thunk to ResolveAddress.
if (code_cache_->has_indirection_table()) {
if (reg.toW().index() != W1.index()) {
if (reg.toW().index() != W17.index()) {
// mov(ebx, reg.cvt32());
MOV(W1, reg.toW());
MOV(W17, reg.toW());
}
LDR(W16, X17);
// mov(eax, dword[ebx]);
} else {
// Old-style resolve.
@ -515,7 +496,7 @@ void A64Emitter::CallIndirect(const hir::Instr* instr,
MOV(X0, GetContextReg());
MOV(W1, reg.toW());
ADRP(X16, ResolveFunction);
MOV(X16, reinterpret_cast<uint64_t>(ResolveFunction));
BLR(X16);
MOV(X16, X0);
}
@ -526,18 +507,16 @@ void A64Emitter::CallIndirect(const hir::Instr* instr,
EmitTraceUserCallReturn();
// Pass the callers return address over.
// mov(rcx, qword[rsp + StackLayout::GUEST_RET_ADDR]);
LDR(X0, SP, StackLayout::GUEST_RET_ADDR);
// add(rsp, static_cast<uint32_t>(stack_size()));
ADD(SP, SP, static_cast<uint32_t>(stack_size()));
// jmp(rax);
MOV(SP, X29);
LDP(X29, X30, SP, POST_INDEXED, 32);
BR(X16);
} else {
// Return address is from the previous SET_RETURN_ADDRESS.
// mov(rcx, qword[rsp + StackLayout::GUEST_CALL_RET_ADDR]);
// call(rax);
LDR(X0, SP, StackLayout::GUEST_CALL_RET_ADDR);
BLR(X16);
@ -571,7 +550,6 @@ void A64Emitter::CallExtern(const hir::Instr* instr, const Function* function) {
auto thunk = backend()->guest_to_host_thunk();
MOV(X16, reinterpret_cast<uint64_t>(thunk));
BLR(X16);
// x0 = host return
@ -589,7 +567,6 @@ void A64Emitter::CallExtern(const hir::Instr* instr, const Function* function) {
auto thunk = backend()->guest_to_host_thunk();
MOV(X16, reinterpret_cast<uint64_t>(thunk));
BLR(X16);
// x0 = host return
@ -612,7 +589,6 @@ void A64Emitter::CallNative(uint64_t (*fn)(void* raw_context, uint64_t arg0)) {
void A64Emitter::CallNative(uint64_t (*fn)(void* raw_context, uint64_t arg0),
uint64_t arg0) {
// mov(GetNativeParam(0), arg0);
MOV(GetNativeParam(0), arg0);
CallNativeSafe(reinterpret_cast<void*>(fn));
}
@ -698,7 +674,7 @@ void A64Emitter::MovMem64(const oaknut::XRegSp& addr, intptr_t offset,
}
}
static const vec128_t xmm_consts[] = {
static const vec128_t v_consts[] = {
/* VZero */ vec128f(0.0f),
/* VOne */ vec128f(1.0f),
/* VOnePD */ vec128d(1.0),
@ -813,7 +789,7 @@ static const vec128_t xmm_consts[] = {
// First location to try and place constants.
static const uintptr_t kConstDataLocation = 0x20000000;
static const uintptr_t kConstDataSize = sizeof(xmm_consts);
static const uintptr_t kConstDataSize = sizeof(v_consts);
// Increment the location by this amount for every allocation failure.
static const uintptr_t kConstDataIncrement = 0x00001000;
@ -837,7 +813,7 @@ uintptr_t A64Emitter::PlaceConstData() {
// The pointer must not be greater than 31 bits.
assert_zero(reinterpret_cast<uintptr_t>(mem) & ~0x7FFFFFFF);
std::memcpy(mem, xmm_consts, sizeof(xmm_consts));
std::memcpy(mem, v_consts, sizeof(v_consts));
memory::Protect(mem, kConstDataSize, memory::PageAccess::kReadOnly, nullptr);
return reinterpret_cast<uintptr_t>(mem);

View File

@ -33,7 +33,8 @@ XReg ComputeMemoryAddressOffset(A64Emitter& e, const T& guest, const T& offset,
uint32_t address = static_cast<uint32_t>(guest.constant());
address += offset_const;
if (address < 0x80000000) {
e.ADD(address_register.toX(), e.GetMembaseReg(), address);
e.MOV(address_register.toX(), address);
e.ADD(address_register.toX(), e.GetMembaseReg(), address_register.toX());
return address_register.toX();
} else {
if (address >= 0xE0000000 &&

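The constant-address path in ComputeMemoryAddressOffset above is also made encodable: an AArch64 ADD (immediate) only accepts a 12-bit immediate, optionally shifted left by 12, so most guest addresses below 0x80000000 cannot be folded into the ADD directly and have to be materialized into a register first. A small self-contained illustration of that limit; the helper is hypothetical:

// Hypothetical helper: true if 'value' fits an AArch64 ADD (immediate) encoding.
inline bool FitsInAddImmediate(uint32_t value) {
  return (value & ~0xFFFu) == 0 ||         // imm12
         (value & ~(0xFFFu << 12)) == 0;   // imm12, LSL #12
}
// FitsInAddImmediate(0xFFF)      -> true
// FitsInAddImmediate(0x12345678) -> false: MOV scratch, addr; ADD dest, membase, scratch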
View File

@ -1199,7 +1199,15 @@ void EmitAddCarryXX(A64Emitter& e, const ARGS& i) {
e.BFI(X1, X0, 61, 1);
e.MSR(SystemReg::NZCV, X1);
}
e.ADC(i.dest, i.src1, i.src2);
SEQ::EmitCommutativeBinaryOp(
e, i,
[](A64Emitter& e, const REG& dest_src, const REG& src) {
e.ADC(dest_src, dest_src, src);
},
[](A64Emitter& e, const REG& dest_src, int32_t constant) {
e.MOV(REG(1), constant);
e.ADC(dest_src, dest_src, REG(1));
});
}
struct ADD_CARRY_I8
: Sequence<ADD_CARRY_I8, I<OPCODE_ADD_CARRY, I8Op, I8Op, I8Op, I8Op>> {
@ -1240,7 +1248,8 @@ void EmitSubXX(A64Emitter& e, const ARGS& i) {
e.SUB(dest_src, dest_src, src);
},
[](A64Emitter& e, REG dest_src, int32_t constant) {
e.SUB(dest_src, dest_src, constant);
e.MOV(REG(1), constant);
e.SUB(dest_src, dest_src, REG(1));
});
}
struct SUB_I8 : Sequence<SUB_I8, I<OPCODE_SUB, I8Op, I8Op, I8Op>> {
@ -2157,7 +2166,8 @@ void EmitAndXX(A64Emitter& e, const ARGS& i) {
e.AND(dest_src, dest_src, src);
},
[](A64Emitter& e, REG dest_src, int32_t constant) {
e.AND(dest_src, dest_src, constant);
e.MOV(REG(1), constant);
e.AND(dest_src, dest_src, REG(1));
});
}
struct AND_I8 : Sequence<AND_I8, I<OPCODE_AND, I8Op, I8Op, I8Op>> {
@ -2264,7 +2274,8 @@ void EmitOrXX(A64Emitter& e, const ARGS& i) {
e.ORR(dest_src, dest_src, src);
},
[](A64Emitter& e, REG dest_src, int32_t constant) {
e.ORR(dest_src, dest_src, constant);
e.MOV(REG(1), constant);
e.ORR(dest_src, dest_src, REG(1));
});
}
struct OR_I8 : Sequence<OR_I8, I<OPCODE_OR, I8Op, I8Op, I8Op>> {
@ -2309,7 +2320,8 @@ void EmitXorXX(A64Emitter& e, const ARGS& i) {
e.EOR(dest_src, dest_src, src);
},
[](A64Emitter& e, REG dest_src, int32_t constant) {
e.EOR(dest_src, dest_src, constant);
e.MOV(REG(1), constant);
e.EOR(dest_src, dest_src, REG(1));
});
}
struct XOR_I8 : Sequence<XOR_I8, I<OPCODE_XOR, I8Op, I8Op, I8Op>> {

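The integer sequences above get the same treatment for constant operands: ADC has no immediate form, SUB only takes a 12-bit (optionally shifted) immediate, and AND/ORR/EOR immediates must be encodable bitmask patterns, so an arbitrary 32-bit constant is first moved into a scratch register (REG(1) in the diff) and then used in the register-register form. A minimal sketch of that shape with W registers, assuming W1 is free as scratch at this point; the helper name is illustrative:

void EmitOrWithConstant(A64Emitter& e, const WReg& dest_src, int32_t constant) {
  e.MOV(W1, constant);            // materialize the constant, mirroring e.MOV(REG(1), constant)
  e.ORR(dest_src, dest_src, W1);  // register form accepts any value
}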
View File

@ -29,64 +29,58 @@ class StackLayout {
* Thunk stack:
* Non-Volatile Volatile
* +------------------+------------------+
* | arg temp, 3 * 8 | arg temp, 3 * 8 | xsp + 0x000
* | arg temp, 3 * 8 | arg temp, 3 * 8 | sp + 0x000
* | | |
* | | |
* +------------------+------------------+
* | rbx | (unused) | xsp + 0x018
* | rbx | (unused) | sp + 0x018
* +------------------+------------------+
* | rbp | X1 | xsp + 0x020
* | rbp | X1 | sp + 0x020
* +------------------+------------------+
* | rcx (Win32) | X2 | xsp + 0x028
* | rcx (Win32) | X2 | sp + 0x028
* +------------------+------------------+
* | rsi (Win32) | X3 | xsp + 0x030
* | rsi (Win32) | X3 | sp + 0x030
* +------------------+------------------+
* | rdi (Win32) | X4 | xsp + 0x038
* | rdi (Win32) | X4 | sp + 0x038
* +------------------+------------------+
* | r12 | X5 | xsp + 0x040
* | r12 | X5 | sp + 0x040
* +------------------+------------------+
* | r13 | X6 | xsp + 0x048
* | r13 | X6 | sp + 0x048
* +------------------+------------------+
* | r14 | X7 | xsp + 0x050
* | r14 | X7 | sp + 0x050
* +------------------+------------------+
* | r15 | X8 | xsp + 0x058
* | r15 | X8 | sp + 0x058
* +------------------+------------------+
* | xmm6 (Win32) | X9 | xsp + 0x060
* | xmm6 (Win32) | X9 | sp + 0x060
* | | |
* +------------------+------------------+
* | xmm7 (Win32) | X10 | xsp + 0x070
* | xmm7 (Win32) | X10 | sp + 0x070
* | | |
* +------------------+------------------+
* | xmm8 (Win32) | X11 | xsp + 0x080
* | xmm8 (Win32) | X11 | sp + 0x080
* | | |
* +------------------+------------------+
* | xmm9 (Win32) | X12 | xsp + 0x090
* | xmm9 (Win32) | X12 | sp + 0x090
* | | |
* +------------------+------------------+
* | xmm10 (Win32) | X13 | xsp + 0x0A0
* | xmm10 (Win32) | X13 | sp + 0x0A0
* | | |
* +------------------+------------------+
* | xmm11 (Win32) | X14 | xsp + 0x0B0
* | xmm11 (Win32) | X14 | sp + 0x0B0
* | | |
* +------------------+------------------+
* | xmm12 (Win32) | X15 | xsp + 0x0C0
* | xmm12 (Win32) | X15 | sp + 0x0C0
* | | |
* +------------------+------------------+
* | xmm13 (Win32) | X16 | xsp + 0x0D0
* | xmm13 (Win32) | X16 | sp + 0x0D0
* | | |
* +------------------+------------------+
* | xmm14 (Win32) | X17 | xsp + 0x0E0
* | xmm14 (Win32) | X17 | sp + 0x0E0
* | | |
* +------------------+------------------+
* | xmm15 (Win32) | X18 | xsp + 0x0F0
* | xmm15 (Win32) | X18 | sp + 0x0F0
* | | |
* +------------------+------------------+
* | (return address) | (return address) | xsp + 0x100
* +------------------+------------------+
* | (rcx home) | (rcx home) | xsp + 0x108
* +------------------+------------------+
* | (rdx home) | (rdx home) | xsp + 0x110
* +------------------+------------------+
*/
XEPACKEDSTRUCT(Thunk, {
uint64_t arg_temp[3];
@ -95,25 +89,25 @@ class StackLayout {
});
static_assert(sizeof(Thunk) % 16 == 0,
"sizeof(Thunk) must be a multiple of 16!");
static const size_t THUNK_STACK_SIZE = sizeof(Thunk) + 16;
static const size_t THUNK_STACK_SIZE = sizeof(Thunk);
/**
*
*
* Guest stack:
* +------------------+
* | arg temp, 3 * 8 | xsp + 0
* | arg temp, 3 * 8 | sp + 0
* | |
* | |
* +------------------+
* | scratch, 48b | xsp + 32
* | scratch, 48b | sp + 32(kStashOffset)
* | |
* +------------------+
* | X0 / context | xsp + 80
* | X0 / context | sp + 80
* +------------------+
* | guest ret addr | xsp + 88
* | guest ret addr | sp + 88
* +------------------+
* | call ret addr | xsp + 96
* | call ret addr | sp + 96
* +------------------+
* ... locals ...
* +------------------+