x64 backend: Save nonvolatile XMM registers on host -> guest transitions

Define the context and membase registers in the x64 emitter.
Author: Dr. Chat, 2016-08-22 14:55:16 -05:00
parent e3fdb08ad7
commit 5f4416ee2f
5 changed files with 139 additions and 139 deletions
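
Background, for readers of the diff below: under the Windows x64 calling convention xmm6-xmm15 (like rbx, rbp, rsi, rdi, and r12-r15) are nonvolatile, so a thunk that transfers control to JIT-compiled guest code must spill and reload them around the call; before this commit the XMM saves were commented out. The following is a minimal illustrative sketch of that pattern in Xbyak (assumed available on the include path), reusing the rsp+112..rsp+256 slots and 280-byte frame the commit introduces. It is not the emulator's actual thunk, which also preserves the integer registers and home slots.

#include <xbyak/xbyak.h>

// Illustrative only: preserve the Win64-nonvolatile XMM registers across a call.
struct XmmPreservingThunk : Xbyak::CodeGenerator {
  XmmPreservingThunk() {
    const int stack_size = 280;  // Matches the new StackLayout::THUNK_STACK_SIZE.
    // The caller's call pushed 8 bytes, and 280 % 16 == 8, so rsp is
    // 16-byte aligned again after this sub (required for movaps).
    sub(rsp, stack_size);
    for (int i = 0; i < 10; ++i) {
      // xmm6..xmm15 land in 16-byte slots at rsp+112, rsp+128, ..., rsp+256.
      movaps(ptr[rsp + 112 + i * 16], Xbyak::Xmm(6 + i));
    }
    mov(rax, rcx);  // rcx holds the target function, as in EmitHostToGuestThunk.
    call(rax);
    for (int i = 0; i < 10; ++i) {
      movaps(Xbyak::Xmm(6 + i), ptr[rsp + 112 + i * 16]);
    }
    add(rsp, stack_size);
    ret();
  }
};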


@@ -398,52 +398,53 @@ HostToGuestThunk X64ThunkEmitter::EmitHostToGuestThunk() {
   mov(qword[rsp + 8 * 1], rcx);
   sub(rsp, stack_size);
-  mov(qword[rsp + 48], rbx);
-  mov(qword[rsp + 56], rcx);
-  mov(qword[rsp + 64], rbp);
-  mov(qword[rsp + 72], rsi);
-  mov(qword[rsp + 80], rdi);
-  mov(qword[rsp + 88], r12);
-  mov(qword[rsp + 96], r13);
-  mov(qword[rsp + 104], r14);
-  mov(qword[rsp + 112], r15);
-  /*movaps(ptr[rsp + 128], xmm6);
-  movaps(ptr[rsp + 144], xmm7);
-  movaps(ptr[rsp + 160], xmm8);
-  movaps(ptr[rsp + 176], xmm9);
-  movaps(ptr[rsp + 192], xmm10);
-  movaps(ptr[rsp + 208], xmm11);
-  movaps(ptr[rsp + 224], xmm12);
-  movaps(ptr[rsp + 240], xmm13);
-  movaps(ptr[rsp + 256], xmm14);
-  movaps(ptr[rsp + 272], xmm15);*/
+  // Preserve nonvolatile registers.
+  mov(qword[rsp + 40], rbx);
+  mov(qword[rsp + 48], rcx);
+  mov(qword[rsp + 56], rbp);
+  mov(qword[rsp + 64], rsi);
+  mov(qword[rsp + 72], rdi);
+  mov(qword[rsp + 80], r12);
+  mov(qword[rsp + 88], r13);
+  mov(qword[rsp + 96], r14);
+  mov(qword[rsp + 104], r15);
+  movaps(ptr[rsp + 112], xmm6);
+  movaps(ptr[rsp + 128], xmm7);
+  movaps(ptr[rsp + 144], xmm8);
+  movaps(ptr[rsp + 160], xmm9);
+  movaps(ptr[rsp + 176], xmm10);
+  movaps(ptr[rsp + 192], xmm11);
+  movaps(ptr[rsp + 208], xmm12);
+  movaps(ptr[rsp + 224], xmm13);
+  movaps(ptr[rsp + 240], xmm14);
+  movaps(ptr[rsp + 256], xmm15);
   mov(rax, rcx);
   mov(rcx, rdx);
   mov(rdx, r8);
   call(rax);
-  /*movaps(xmm6, ptr[rsp + 128]);
-  movaps(xmm7, ptr[rsp + 144]);
-  movaps(xmm8, ptr[rsp + 160]);
-  movaps(xmm9, ptr[rsp + 176]);
-  movaps(xmm10, ptr[rsp + 192]);
-  movaps(xmm11, ptr[rsp + 208]);
-  movaps(xmm12, ptr[rsp + 224]);
-  movaps(xmm13, ptr[rsp + 240]);
-  movaps(xmm14, ptr[rsp + 256]);
-  movaps(xmm15, ptr[rsp + 272]);*/
-  mov(rbx, qword[rsp + 48]);
-  mov(rcx, qword[rsp + 56]);
-  mov(rbp, qword[rsp + 64]);
-  mov(rsi, qword[rsp + 72]);
-  mov(rdi, qword[rsp + 80]);
-  mov(r12, qword[rsp + 88]);
-  mov(r13, qword[rsp + 96]);
-  mov(r14, qword[rsp + 104]);
-  mov(r15, qword[rsp + 112]);
+  movaps(xmm6, ptr[rsp + 112]);
+  movaps(xmm7, ptr[rsp + 128]);
+  movaps(xmm8, ptr[rsp + 144]);
+  movaps(xmm9, ptr[rsp + 160]);
+  movaps(xmm10, ptr[rsp + 176]);
+  movaps(xmm11, ptr[rsp + 192]);
+  movaps(xmm12, ptr[rsp + 208]);
+  movaps(xmm13, ptr[rsp + 224]);
+  movaps(xmm14, ptr[rsp + 240]);
+  movaps(xmm15, ptr[rsp + 256]);
+  mov(rbx, qword[rsp + 40]);
+  mov(rcx, qword[rsp + 48]);
+  mov(rbp, qword[rsp + 56]);
+  mov(rsi, qword[rsp + 64]);
+  mov(rdi, qword[rsp + 72]);
+  mov(r12, qword[rsp + 80]);
+  mov(r13, qword[rsp + 88]);
+  mov(r14, qword[rsp + 96]);
+  mov(r15, qword[rsp + 104]);
   add(rsp, stack_size);
   mov(rcx, qword[rsp + 8 * 1]);
@@ -468,17 +469,18 @@ GuestToHostThunk X64ThunkEmitter::EmitGuestToHostThunk() {
   mov(qword[rsp + 8 * 1], rcx);
   sub(rsp, stack_size);
-  mov(qword[rsp + 48], rbx);
-  mov(qword[rsp + 56], rcx);
-  mov(qword[rsp + 64], rbp);
-  mov(qword[rsp + 72], rsi);
-  mov(qword[rsp + 80], rdi);
-  mov(qword[rsp + 88], r12);
-  mov(qword[rsp + 96], r13);
-  mov(qword[rsp + 104], r14);
-  mov(qword[rsp + 112], r15);
+  mov(qword[rsp + 40], rbx);
+  mov(qword[rsp + 48], rcx);
+  mov(qword[rsp + 56], rbp);
+  mov(qword[rsp + 64], rsi);
+  mov(qword[rsp + 72], rdi);
+  mov(qword[rsp + 80], r12);
+  mov(qword[rsp + 88], r13);
+  mov(qword[rsp + 96], r14);
+  mov(qword[rsp + 104], r15);
   // TODO(benvanik): save things? XMM0-5?
+  // HACK: Some emulated vector instructions require that we don't touch xmm0.
   mov(rax, rdx);
   mov(rdx, r8);
@@ -486,15 +488,15 @@ GuestToHostThunk X64ThunkEmitter::EmitGuestToHostThunk() {
   mov(r9, r10);
   call(rax);
-  mov(rbx, qword[rsp + 48]);
-  mov(rcx, qword[rsp + 56]);
-  mov(rbp, qword[rsp + 64]);
-  mov(rsi, qword[rsp + 72]);
-  mov(rdi, qword[rsp + 80]);
-  mov(r12, qword[rsp + 88]);
-  mov(r13, qword[rsp + 96]);
-  mov(r14, qword[rsp + 104]);
-  mov(r15, qword[rsp + 112]);
+  mov(rbx, qword[rsp + 40]);
+  mov(rcx, qword[rsp + 48]);
+  mov(rbp, qword[rsp + 56]);
+  mov(rsi, qword[rsp + 64]);
+  mov(rdi, qword[rsp + 72]);
+  mov(r12, qword[rsp + 80]);
+  mov(r13, qword[rsp + 88]);
+  mov(r14, qword[rsp + 96]);
+  mov(r15, qword[rsp + 104]);
   add(rsp, stack_size);
   mov(rcx, qword[rsp + 8 * 1]);
@@ -502,7 +504,7 @@ GuestToHostThunk X64ThunkEmitter::EmitGuestToHostThunk() {
   ret();
   void* fn = Emplace(stack_size);
-  return (HostToGuestThunk)fn;
+  return (GuestToHostThunk)fn;
 }
 // X64Emitter handles actually resolving functions.
@@ -518,29 +520,29 @@ ResolveFunctionThunk X64ThunkEmitter::EmitResolveFunctionThunk() {
   mov(qword[rsp + 8 * 1], rcx);
   sub(rsp, stack_size);
-  mov(qword[rsp + 48], rbx);
-  mov(qword[rsp + 56], rcx);
-  mov(qword[rsp + 64], rbp);
-  mov(qword[rsp + 72], rsi);
-  mov(qword[rsp + 80], rdi);
-  mov(qword[rsp + 88], r12);
-  mov(qword[rsp + 96], r13);
-  mov(qword[rsp + 104], r14);
-  mov(qword[rsp + 112], r15);
+  mov(qword[rsp + 40], rbx);
+  mov(qword[rsp + 48], rcx);
+  mov(qword[rsp + 56], rbp);
+  mov(qword[rsp + 64], rsi);
+  mov(qword[rsp + 72], rdi);
+  mov(qword[rsp + 80], r12);
+  mov(qword[rsp + 88], r13);
+  mov(qword[rsp + 96], r14);
+  mov(qword[rsp + 104], r15);
   mov(rdx, rbx);
   mov(rax, uint64_t(&ResolveFunction));
   call(rax);
-  mov(rbx, qword[rsp + 48]);
-  mov(rcx, qword[rsp + 56]);
-  mov(rbp, qword[rsp + 64]);
-  mov(rsi, qword[rsp + 72]);
-  mov(rdi, qword[rsp + 80]);
-  mov(r12, qword[rsp + 88]);
-  mov(r13, qword[rsp + 96]);
-  mov(r14, qword[rsp + 104]);
-  mov(r15, qword[rsp + 112]);
+  mov(rbx, qword[rsp + 40]);
+  mov(rcx, qword[rsp + 48]);
+  mov(rbp, qword[rsp + 56]);
+  mov(rsi, qword[rsp + 64]);
+  mov(rdi, qword[rsp + 72]);
+  mov(r12, qword[rsp + 80]);
+  mov(r13, qword[rsp + 88]);
+  mov(r14, qword[rsp + 96]);
+  mov(r15, qword[rsp + 104]);
   add(rsp, stack_size);
   mov(rcx, qword[rsp + 8 * 1]);


@@ -160,10 +160,6 @@ bool X64Emitter::Emit(HIRBuilder* builder, size_t* out_stack_size) {
   // Must be 16b aligned.
   // Windows is very strict about the form of this and the epilog:
   // http://msdn.microsoft.com/en-us/library/tawsa7cb.aspx
-  // TODO(benvanik): save off non-volatile registers so we can use them:
-  //     RBX, RBP, RDI, RSI, RSP, R12, R13, R14, R15
-  //     Only want to do this if we actually use them, though, otherwise
-  //     it just adds overhead.
   // IMPORTANT: any changes to the prolog must be kept in sync with
   //     X64CodeCache, which dynamically generates exception information.
   //     Adding or changing anything here must be matched!

@@ -172,7 +168,7 @@ bool X64Emitter::Emit(HIRBuilder* builder, size_t* out_stack_size) {
   *out_stack_size = stack_size;
   stack_size_ = stack_size;
   sub(rsp, (uint32_t)stack_size);
-  mov(qword[rsp + StackLayout::GUEST_RCX_HOME], rcx);
+  mov(qword[rsp + StackLayout::GUEST_CTX_HOME], rcx);
   mov(qword[rsp + StackLayout::GUEST_RET_ADDR], rdx);
   mov(qword[rsp + StackLayout::GUEST_CALL_RET_ADDR], 0);

@@ -205,7 +201,7 @@ bool X64Emitter::Emit(HIRBuilder* builder, size_t* out_stack_size) {
   }
   // Load membase.
-  mov(rdx, qword[rcx + 8]);
+  mov(rdx, qword[rcx + offsetof(ppc::PPCContext, virtual_membase)]);
   // Body.
   auto block = builder->first_block();

@@ -237,7 +233,7 @@ bool X64Emitter::Emit(HIRBuilder* builder, size_t* out_stack_size) {
   L(epilog_label);
   epilog_label_ = nullptr;
   EmitTraceUserCallReturn();
-  mov(rcx, qword[rsp + StackLayout::GUEST_RCX_HOME]);
+  mov(rcx, qword[rsp + StackLayout::GUEST_CTX_HOME]);
   add(rsp, (uint32_t)stack_size);
   ret();

@@ -546,8 +542,11 @@ void X64Emitter::SetReturnAddress(uint64_t value) {
   mov(qword[rsp + StackLayout::GUEST_CALL_RET_ADDR], rax);
 }
+Xbyak::Reg64 X64Emitter::GetContextReg() { return rcx; }
+Xbyak::Reg64 X64Emitter::GetMembaseReg() { return rdx; }
 void X64Emitter::ReloadECX() {
-  mov(rcx, qword[rsp + StackLayout::GUEST_RCX_HOME]);
+  mov(rcx, qword[rsp + StackLayout::GUEST_CTX_HOME]);
 }
 void X64Emitter::ReloadEDX() {
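
The membase load above also stops hard-coding the displacement: instead of "qword[rcx + 8]" it now uses offsetof(ppc::PPCContext, virtual_membase). A hedged illustration of what that buys follows; the struct body is a stand-in invented for this sketch, and only the field name comes from the diff.

#include <cstddef>
#include <cstdint>

// Stand-in context layout for illustration; the real ppc::PPCContext has many
// more fields. The old emitter code assumed virtual_membase lived at +8.
struct PPCContext {
  void* thread_state;        // hypothetical first field, 8 bytes on x64
  uint64_t virtual_membase;  // field name taken from the diff
};

// With offsetof(), the emitted displacement tracks the struct automatically;
// a hard-coded "+ 8" would silently break if fields were ever reordered.
static_assert(offsetof(PPCContext, virtual_membase) == 8,
              "matches the hard-coded +8 the old code relied on");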


@@ -125,7 +125,7 @@ class X64Emitter : public Xbyak::CodeGenerator {
  public:
   // Reserved: rsp
   // Scratch:  rax/rcx/rdx
-  //           xmm0-2 (could be only xmm0 with some trickery)
+  //           xmm0-2
   // Available: rbx, r12-r15 (save to get r8-r11, rbp, rsi, rdi?)
   //            xmm6-xmm15 (save to get xmm3-xmm5)
   static const int GPR_COUNT = 5;

@@ -170,6 +170,9 @@ class X64Emitter : public Xbyak::CodeGenerator {
                   uint64_t arg0);
   void CallNativeSafe(void* fn);
   void SetReturnAddress(uint64_t value);
+  Xbyak::Reg64 GetContextReg();
+  Xbyak::Reg64 GetMembaseReg();
   void ReloadECX();
   void ReloadEDX();


@@ -1793,7 +1793,7 @@ EMITTER_OPCODE_TABLE(OPCODE_STORE_LOCAL, STORE_LOCAL_I8, STORE_LOCAL_I16,
 // ============================================================================
 // Note: all types are always aligned in the context.
 RegExp ComputeContextAddress(X64Emitter& e, const OffsetOp& offset) {
-  return e.rcx + offset.value;
+  return e.GetContextReg() + offset.value;
 }
 struct LOAD_CONTEXT_I8
     : Sequence<LOAD_CONTEXT_I8, I<OPCODE_LOAD_CONTEXT, I8Op, OffsetOp>> {

@@ -2088,12 +2088,12 @@ RegExp ComputeMemoryAddress(X64Emitter& e, const T& guest) {
     // Since the constant is often 0x8... if we tried to use that as a
     // displacement it would be sign extended and mess things up.
     e.mov(e.eax, static_cast<uint32_t>(guest.constant()));
-    return e.rdx + e.rax;
+    return e.GetMembaseReg() + e.rax;
   } else {
     // Clear the top 32 bits, as they are likely garbage.
     // TODO(benvanik): find a way to avoid doing this.
     e.mov(e.eax, guest.reg().cvt32());
-    return e.rdx + e.rax;
+    return e.GetMembaseReg() + e.rax;
   }
 }
 struct LOAD_I8 : Sequence<LOAD_I8, I<OPCODE_LOAD, I8Op, I64Op>> {

@@ -3959,7 +3959,6 @@ struct MUL_HI_I8 : Sequence<MUL_HI_I8, I<OPCODE_MUL_HI, I8Op, I8Op, I8Op>> {
     if (i.instr->flags & ARITHMETIC_UNSIGNED) {
       // mulx: $1:$2 = EDX * $3
-      // TODO(justin): Find a way to shorten this has call
       if (e.IsFeatureEnabled(kX64EmitBMI2)) {
         // TODO(benvanik): place src1 in eax? still need to sign extend
         e.movzx(e.edx, i.src1);

@@ -4004,7 +4003,6 @@ struct MUL_HI_I16
     : Sequence<MUL_HI_I16, I<OPCODE_MUL_HI, I16Op, I16Op, I16Op>> {
   static void Emit(X64Emitter& e, const EmitArgType& i) {
     if (i.instr->flags & ARITHMETIC_UNSIGNED) {
-      // TODO(justin): Find a way to shorten this has call
       if (e.IsFeatureEnabled(kX64EmitBMI2)) {
         // TODO(benvanik): place src1 in eax? still need to sign extend
         e.movzx(e.edx, i.src1);

@@ -4049,7 +4047,6 @@ struct MUL_HI_I32
     : Sequence<MUL_HI_I32, I<OPCODE_MUL_HI, I32Op, I32Op, I32Op>> {
   static void Emit(X64Emitter& e, const EmitArgType& i) {
     if (i.instr->flags & ARITHMETIC_UNSIGNED) {
-      // TODO(justin): Find a way to shorten this has call
      if (e.IsFeatureEnabled(kX64EmitBMI2)) {
         // TODO(benvanik): place src1 in eax? still need to sign extend
         e.mov(e.edx, i.src1);

@@ -4099,7 +4096,6 @@ struct MUL_HI_I64
     : Sequence<MUL_HI_I64, I<OPCODE_MUL_HI, I64Op, I64Op, I64Op>> {
   static void Emit(X64Emitter& e, const EmitArgType& i) {
     if (i.instr->flags & ARITHMETIC_UNSIGNED) {
-      // TODO(justin): Find a way to shorten this has call
       if (e.IsFeatureEnabled(kX64EmitBMI2)) {
         // TODO(benvanik): place src1 in eax? still need to sign extend
         e.mov(e.rdx, i.src1);


@@ -29,68 +29,68 @@ namespace x64 {
  * |                  |
  * |                  |
  * +------------------+
- * | scratch, 16b     | rsp + 32
+ * | scratch, 16b     | rsp + 24
  * |                  |
  * +------------------+
- * | rbx              | rsp + 48
+ * | rbx              | rsp + 40
  * +------------------+
- * | rcx / context    | rsp + 56
+ * | rcx / context    | rsp + 48
  * +------------------+
- * | rbp              | rsp + 64
+ * | rbp              | rsp + 56
  * +------------------+
- * | rsi              | rsp + 72
+ * | rsi              | rsp + 64
  * +------------------+
- * | rdi              | rsp + 80
+ * | rdi              | rsp + 72
  * +------------------+
- * | r12              | rsp + 88
+ * | r12              | rsp + 80
  * +------------------+
- * | r13              | rsp + 96
+ * | r13              | rsp + 88
  * +------------------+
- * | r14              | rsp + 104
+ * | r14              | rsp + 96
  * +------------------+
- * | r15              | rsp + 112
+ * | r15              | rsp + 104
  * +------------------+
- * | (return address) | rsp + 120
- * +------------------+
- * | (rcx home)       | rsp + 128
- * +------------------+
- * | (rdx home)       | rsp + 136
- * +------------------+
- *
- *
- * TODO:
- * +------------------+
- * | xmm6             | rsp + 128
- * |                  |
- * +------------------+
- * | xmm7             | rsp + 144
- * |                  |
- * +------------------+
- * | xmm8             | rsp + 160
- * |                  |
- * +------------------+
- * | xmm9             | rsp + 176
- * |                  |
- * +------------------+
- * | xmm10            | rsp + 192
- * |                  |
- * +------------------+
- * | xmm11            | rsp + 208
- * |                  |
- * +------------------+
- * | xmm12            | rsp + 224
- * |                  |
- * +------------------+
- * | xmm13            | rsp + 240
- * |                  |
- * +------------------+
- * | xmm14            | rsp + 256
- * |                  |
- * +------------------+
- * | xmm15            | rsp + 272
- * |                  |
- * +------------------+
- *
+ * | xmm6/0           | rsp + 112
+ * |                  |
+ * +------------------+
+ * | xmm7/1           | rsp + 128
+ * |                  |
+ * +------------------+
+ * | xmm8/2           | rsp + 144
+ * |                  |
+ * +------------------+
+ * | xmm9/3           | rsp + 160
+ * |                  |
+ * +------------------+
+ * | xmm10/4          | rsp + 176
+ * |                  |
+ * +------------------+
+ * | xmm11/5          | rsp + 192
+ * |                  |
+ * +------------------+
+ * | xmm12            | rsp + 208
+ * |                  |
+ * +------------------+
+ * | xmm13            | rsp + 224
+ * |                  |
+ * +------------------+
+ * | xmm14            | rsp + 240
+ * |                  |
+ * +------------------+
+ * | xmm15            | rsp + 256
+ * |                  |
+ * +------------------+
+ * | scratch, 8b      | rsp + 272
+ * |                  |
+ * +------------------+
+ * | (return address) | rsp + 280
+ * +------------------+
+ * | (rcx home)       | rsp + 288
+ * +------------------+
+ * | (rdx home)       | rsp + 296
+ * +------------------+
 *
 *
 * Guest stack:
 * +------------------+
 * | arg temp, 3 * 8  | rsp + 0

@@ -115,10 +115,10 @@ namespace x64 {
 class StackLayout {
  public:
-  static const size_t THUNK_STACK_SIZE = 120;
+  static const size_t THUNK_STACK_SIZE = 280;
   static const size_t GUEST_STACK_SIZE = 104;
-  static const size_t GUEST_RCX_HOME = 80;
+  static const size_t GUEST_CTX_HOME = 80;
   static const size_t GUEST_RET_ADDR = 88;
   static const size_t GUEST_CALL_RET_ADDR = 96;
 };
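
A quick arithmetic check of the new thunk layout, illustrative only and not part of the commit; the numbers are taken from the diagram and constants above.

#include <cstddef>

// Thunk stack arithmetic from the new layout:
//   xmm6..xmm15 occupy ten 16-byte slots starting at rsp + 112,
//   followed by the 8-byte scratch slot at rsp + 272.
constexpr size_t kXmmSaveOffset = 112;
constexpr size_t kXmmSlotSize = 16;   // movaps requires 16-byte-aligned slots
constexpr size_t kXmmSaveCount = 10;  // xmm6..xmm15
constexpr size_t kThunkStackSize = 280;

static_assert(kXmmSaveOffset % 16 == 0, "first xmm slot must be 16-byte aligned");
static_assert(kXmmSaveOffset + kXmmSaveCount * kXmmSlotSize == 272,
              "xmm save area ends exactly at the 8-byte scratch slot");
// The caller's call pushed an 8-byte return address, so subtracting a value
// that is 8 mod 16 (280) brings rsp back to 16-byte alignment inside the thunk.
static_assert((kThunkStackSize + 8) % 16 == 0,
              "sub(rsp, THUNK_STACK_SIZE) re-establishes 16-byte alignment");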