x64 backend: Save nonvolatile XMM registers on host -> guest transitions

Define the context and membase registers in the x64 emitter.
This commit is contained in:
Dr. Chat 2016-08-22 14:55:16 -05:00
parent e3fdb08ad7
commit 5f4416ee2f
5 changed files with 139 additions and 139 deletions

View File

@ -398,52 +398,53 @@ HostToGuestThunk X64ThunkEmitter::EmitHostToGuestThunk() {
mov(qword[rsp + 8 * 1], rcx);
sub(rsp, stack_size);
mov(qword[rsp + 48], rbx);
mov(qword[rsp + 56], rcx);
mov(qword[rsp + 64], rbp);
mov(qword[rsp + 72], rsi);
mov(qword[rsp + 80], rdi);
mov(qword[rsp + 88], r12);
mov(qword[rsp + 96], r13);
mov(qword[rsp + 104], r14);
mov(qword[rsp + 112], r15);
// Preserve nonvolatile registers.
mov(qword[rsp + 40], rbx);
mov(qword[rsp + 48], rcx);
mov(qword[rsp + 56], rbp);
mov(qword[rsp + 64], rsi);
mov(qword[rsp + 72], rdi);
mov(qword[rsp + 80], r12);
mov(qword[rsp + 88], r13);
mov(qword[rsp + 96], r14);
mov(qword[rsp + 104], r15);
/*movaps(ptr[rsp + 128], xmm6);
movaps(ptr[rsp + 144], xmm7);
movaps(ptr[rsp + 160], xmm8);
movaps(ptr[rsp + 176], xmm9);
movaps(ptr[rsp + 192], xmm10);
movaps(ptr[rsp + 208], xmm11);
movaps(ptr[rsp + 224], xmm12);
movaps(ptr[rsp + 240], xmm13);
movaps(ptr[rsp + 256], xmm14);
movaps(ptr[rsp + 272], xmm15);*/
movaps(ptr[rsp + 112], xmm6);
movaps(ptr[rsp + 128], xmm7);
movaps(ptr[rsp + 144], xmm8);
movaps(ptr[rsp + 160], xmm9);
movaps(ptr[rsp + 176], xmm10);
movaps(ptr[rsp + 192], xmm11);
movaps(ptr[rsp + 208], xmm12);
movaps(ptr[rsp + 224], xmm13);
movaps(ptr[rsp + 240], xmm14);
movaps(ptr[rsp + 256], xmm15);
mov(rax, rcx);
mov(rcx, rdx);
mov(rdx, r8);
call(rax);
/*movaps(xmm6, ptr[rsp + 128]);
movaps(xmm7, ptr[rsp + 144]);
movaps(xmm8, ptr[rsp + 160]);
movaps(xmm9, ptr[rsp + 176]);
movaps(xmm10, ptr[rsp + 192]);
movaps(xmm11, ptr[rsp + 208]);
movaps(xmm12, ptr[rsp + 224]);
movaps(xmm13, ptr[rsp + 240]);
movaps(xmm14, ptr[rsp + 256]);
movaps(xmm15, ptr[rsp + 272]);*/
movaps(xmm6, ptr[rsp + 112]);
movaps(xmm7, ptr[rsp + 128]);
movaps(xmm8, ptr[rsp + 144]);
movaps(xmm9, ptr[rsp + 160]);
movaps(xmm10, ptr[rsp + 176]);
movaps(xmm11, ptr[rsp + 192]);
movaps(xmm12, ptr[rsp + 208]);
movaps(xmm13, ptr[rsp + 224]);
movaps(xmm14, ptr[rsp + 240]);
movaps(xmm15, ptr[rsp + 256]);
mov(rbx, qword[rsp + 48]);
mov(rcx, qword[rsp + 56]);
mov(rbp, qword[rsp + 64]);
mov(rsi, qword[rsp + 72]);
mov(rdi, qword[rsp + 80]);
mov(r12, qword[rsp + 88]);
mov(r13, qword[rsp + 96]);
mov(r14, qword[rsp + 104]);
mov(r15, qword[rsp + 112]);
mov(rbx, qword[rsp + 40]);
mov(rcx, qword[rsp + 48]);
mov(rbp, qword[rsp + 56]);
mov(rsi, qword[rsp + 64]);
mov(rdi, qword[rsp + 72]);
mov(r12, qword[rsp + 80]);
mov(r13, qword[rsp + 88]);
mov(r14, qword[rsp + 96]);
mov(r15, qword[rsp + 104]);
add(rsp, stack_size);
mov(rcx, qword[rsp + 8 * 1]);
@ -468,17 +469,18 @@ GuestToHostThunk X64ThunkEmitter::EmitGuestToHostThunk() {
mov(qword[rsp + 8 * 1], rcx);
sub(rsp, stack_size);
mov(qword[rsp + 48], rbx);
mov(qword[rsp + 56], rcx);
mov(qword[rsp + 64], rbp);
mov(qword[rsp + 72], rsi);
mov(qword[rsp + 80], rdi);
mov(qword[rsp + 88], r12);
mov(qword[rsp + 96], r13);
mov(qword[rsp + 104], r14);
mov(qword[rsp + 112], r15);
mov(qword[rsp + 40], rbx);
mov(qword[rsp + 48], rcx);
mov(qword[rsp + 56], rbp);
mov(qword[rsp + 64], rsi);
mov(qword[rsp + 72], rdi);
mov(qword[rsp + 80], r12);
mov(qword[rsp + 88], r13);
mov(qword[rsp + 96], r14);
mov(qword[rsp + 104], r15);
// TODO(benvanik): save things? XMM0-5?
// HACK: Some emulated vector instructions require that we don't touch xmm0.
mov(rax, rdx);
mov(rdx, r8);
@ -486,15 +488,15 @@ GuestToHostThunk X64ThunkEmitter::EmitGuestToHostThunk() {
mov(r9, r10);
call(rax);
mov(rbx, qword[rsp + 48]);
mov(rcx, qword[rsp + 56]);
mov(rbp, qword[rsp + 64]);
mov(rsi, qword[rsp + 72]);
mov(rdi, qword[rsp + 80]);
mov(r12, qword[rsp + 88]);
mov(r13, qword[rsp + 96]);
mov(r14, qword[rsp + 104]);
mov(r15, qword[rsp + 112]);
mov(rbx, qword[rsp + 40]);
mov(rcx, qword[rsp + 48]);
mov(rbp, qword[rsp + 56]);
mov(rsi, qword[rsp + 64]);
mov(rdi, qword[rsp + 72]);
mov(r12, qword[rsp + 80]);
mov(r13, qword[rsp + 88]);
mov(r14, qword[rsp + 96]);
mov(r15, qword[rsp + 104]);
add(rsp, stack_size);
mov(rcx, qword[rsp + 8 * 1]);
@ -502,7 +504,7 @@ GuestToHostThunk X64ThunkEmitter::EmitGuestToHostThunk() {
ret();
void* fn = Emplace(stack_size);
return (HostToGuestThunk)fn;
return (GuestToHostThunk)fn;
}
// X64Emitter handles actually resolving functions.
@ -518,29 +520,29 @@ ResolveFunctionThunk X64ThunkEmitter::EmitResolveFunctionThunk() {
mov(qword[rsp + 8 * 1], rcx);
sub(rsp, stack_size);
mov(qword[rsp + 48], rbx);
mov(qword[rsp + 56], rcx);
mov(qword[rsp + 64], rbp);
mov(qword[rsp + 72], rsi);
mov(qword[rsp + 80], rdi);
mov(qword[rsp + 88], r12);
mov(qword[rsp + 96], r13);
mov(qword[rsp + 104], r14);
mov(qword[rsp + 112], r15);
mov(qword[rsp + 40], rbx);
mov(qword[rsp + 48], rcx);
mov(qword[rsp + 56], rbp);
mov(qword[rsp + 64], rsi);
mov(qword[rsp + 72], rdi);
mov(qword[rsp + 80], r12);
mov(qword[rsp + 88], r13);
mov(qword[rsp + 96], r14);
mov(qword[rsp + 104], r15);
mov(rdx, rbx);
mov(rax, uint64_t(&ResolveFunction));
call(rax);
mov(rbx, qword[rsp + 48]);
mov(rcx, qword[rsp + 56]);
mov(rbp, qword[rsp + 64]);
mov(rsi, qword[rsp + 72]);
mov(rdi, qword[rsp + 80]);
mov(r12, qword[rsp + 88]);
mov(r13, qword[rsp + 96]);
mov(r14, qword[rsp + 104]);
mov(r15, qword[rsp + 112]);
mov(rbx, qword[rsp + 40]);
mov(rcx, qword[rsp + 48]);
mov(rbp, qword[rsp + 56]);
mov(rsi, qword[rsp + 64]);
mov(rdi, qword[rsp + 72]);
mov(r12, qword[rsp + 80]);
mov(r13, qword[rsp + 88]);
mov(r14, qword[rsp + 96]);
mov(r15, qword[rsp + 104]);
add(rsp, stack_size);
mov(rcx, qword[rsp + 8 * 1]);

View File

@ -160,10 +160,6 @@ bool X64Emitter::Emit(HIRBuilder* builder, size_t* out_stack_size) {
// Must be 16b aligned.
// Windows is very strict about the form of this and the epilog:
// http://msdn.microsoft.com/en-us/library/tawsa7cb.aspx
// TODO(benvanik): save off non-volatile registers so we can use them:
// RBX, RBP, RDI, RSI, RSP, R12, R13, R14, R15
// Only want to do this if we actually use them, though, otherwise
// it just adds overhead.
// IMPORTANT: any changes to the prolog must be kept in sync with
// X64CodeCache, which dynamically generates exception information.
// Adding or changing anything here must be matched!
@ -172,7 +168,7 @@ bool X64Emitter::Emit(HIRBuilder* builder, size_t* out_stack_size) {
*out_stack_size = stack_size;
stack_size_ = stack_size;
sub(rsp, (uint32_t)stack_size);
mov(qword[rsp + StackLayout::GUEST_RCX_HOME], rcx);
mov(qword[rsp + StackLayout::GUEST_CTX_HOME], rcx);
mov(qword[rsp + StackLayout::GUEST_RET_ADDR], rdx);
mov(qword[rsp + StackLayout::GUEST_CALL_RET_ADDR], 0);
@ -205,7 +201,7 @@ bool X64Emitter::Emit(HIRBuilder* builder, size_t* out_stack_size) {
}
// Load membase.
mov(rdx, qword[rcx + 8]);
mov(rdx, qword[rcx + offsetof(ppc::PPCContext, virtual_membase)]);
// Body.
auto block = builder->first_block();
@ -237,7 +233,7 @@ bool X64Emitter::Emit(HIRBuilder* builder, size_t* out_stack_size) {
L(epilog_label);
epilog_label_ = nullptr;
EmitTraceUserCallReturn();
mov(rcx, qword[rsp + StackLayout::GUEST_RCX_HOME]);
mov(rcx, qword[rsp + StackLayout::GUEST_CTX_HOME]);
add(rsp, (uint32_t)stack_size);
ret();
@ -546,8 +542,11 @@ void X64Emitter::SetReturnAddress(uint64_t value) {
mov(qword[rsp + StackLayout::GUEST_CALL_RET_ADDR], rax);
}
// Returns the host register that carries the guest context pointer (rcx).
Xbyak::Reg64 X64Emitter::GetContextReg() {
  return rcx;
}
// Returns the host register that carries the guest memory base (rdx).
Xbyak::Reg64 X64Emitter::GetMembaseReg() {
  return rdx;
}
// Emits a load that restores rcx from its home slot in the guest stack frame.
// NOTE(review): two loads of rcx appear below, one per name of the same slot
// (GUEST_RCX_HOME and GUEST_CTX_HOME); this looks like the before/after pair
// of a rename - confirm that only the GUEST_CTX_HOME form should remain.
void X64Emitter::ReloadECX() {
mov(rcx, qword[rsp + StackLayout::GUEST_RCX_HOME]);
mov(rcx, qword[rsp + StackLayout::GUEST_CTX_HOME]);
}
void X64Emitter::ReloadEDX() {

View File

@ -125,7 +125,7 @@ class X64Emitter : public Xbyak::CodeGenerator {
public:
// Reserved: rsp
// Scratch: rax/rcx/rdx
// xmm0-2 (could be only xmm0 with some trickery)
// xmm0-2
// Available: rbx, r12-r15 (save to get r8-r11, rbp, rsi, rdi?)
// xmm6-xmm15 (save to get xmm3-xmm5)
static const int GPR_COUNT = 5;
@ -170,6 +170,9 @@ class X64Emitter : public Xbyak::CodeGenerator {
uint64_t arg0);
void CallNativeSafe(void* fn);
void SetReturnAddress(uint64_t value);
Xbyak::Reg64 GetContextReg();
Xbyak::Reg64 GetMembaseReg();
void ReloadECX();
void ReloadEDX();

View File

@ -1793,7 +1793,7 @@ EMITTER_OPCODE_TABLE(OPCODE_STORE_LOCAL, STORE_LOCAL_I8, STORE_LOCAL_I16,
// ============================================================================
// Note: all types are always aligned in the context.
// Builds the address expression for a context access: the context register
// plus the offset value carried by the OffsetOp.
// NOTE(review): two return statements appear below (direct e.rcx and
// e.GetContextReg()); the second is unreachable as written. This looks like
// the before/after pair of a change - confirm which one should remain.
RegExp ComputeContextAddress(X64Emitter& e, const OffsetOp& offset) {
return e.rcx + offset.value;
return e.GetContextReg() + offset.value;
}
struct LOAD_CONTEXT_I8
: Sequence<LOAD_CONTEXT_I8, I<OPCODE_LOAD_CONTEXT, I8Op, OffsetOp>> {
@ -2088,12 +2088,12 @@ RegExp ComputeMemoryAddress(X64Emitter& e, const T& guest) {
// Since the constant is often 0x8... if we tried to use that as a
// displacement it would be sign extended and mess things up.
e.mov(e.eax, static_cast<uint32_t>(guest.constant()));
return e.rdx + e.rax;
return e.GetMembaseReg() + e.rax;
} else {
// Clear the top 32 bits, as they are likely garbage.
// TODO(benvanik): find a way to avoid doing this.
e.mov(e.eax, guest.reg().cvt32());
return e.rdx + e.rax;
return e.GetMembaseReg() + e.rax;
}
}
struct LOAD_I8 : Sequence<LOAD_I8, I<OPCODE_LOAD, I8Op, I64Op>> {
@ -3959,7 +3959,6 @@ struct MUL_HI_I8 : Sequence<MUL_HI_I8, I<OPCODE_MUL_HI, I8Op, I8Op, I8Op>> {
if (i.instr->flags & ARITHMETIC_UNSIGNED) {
// mulx: $1:$2 = EDX * $3
// TODO(justin): Find a way to shorten this has call
if (e.IsFeatureEnabled(kX64EmitBMI2)) {
// TODO(benvanik): place src1 in eax? still need to sign extend
e.movzx(e.edx, i.src1);
@ -4004,7 +4003,6 @@ struct MUL_HI_I16
: Sequence<MUL_HI_I16, I<OPCODE_MUL_HI, I16Op, I16Op, I16Op>> {
static void Emit(X64Emitter& e, const EmitArgType& i) {
if (i.instr->flags & ARITHMETIC_UNSIGNED) {
// TODO(justin): Find a way to shorten this has call
if (e.IsFeatureEnabled(kX64EmitBMI2)) {
// TODO(benvanik): place src1 in eax? still need to sign extend
e.movzx(e.edx, i.src1);
@ -4049,7 +4047,6 @@ struct MUL_HI_I32
: Sequence<MUL_HI_I32, I<OPCODE_MUL_HI, I32Op, I32Op, I32Op>> {
static void Emit(X64Emitter& e, const EmitArgType& i) {
if (i.instr->flags & ARITHMETIC_UNSIGNED) {
// TODO(justin): Find a way to shorten this has call
if (e.IsFeatureEnabled(kX64EmitBMI2)) {
// TODO(benvanik): place src1 in eax? still need to sign extend
e.mov(e.edx, i.src1);
@ -4099,7 +4096,6 @@ struct MUL_HI_I64
: Sequence<MUL_HI_I64, I<OPCODE_MUL_HI, I64Op, I64Op, I64Op>> {
static void Emit(X64Emitter& e, const EmitArgType& i) {
if (i.instr->flags & ARITHMETIC_UNSIGNED) {
// TODO(justin): Find a way to shorten this has call
if (e.IsFeatureEnabled(kX64EmitBMI2)) {
// TODO(benvanik): place src1 in eax? still need to sign extend
e.mov(e.rdx, i.src1);

View File

@ -29,68 +29,68 @@ namespace x64 {
* | |
* | |
* +------------------+
* | scratch, 16b | rsp + 32
* | scratch, 16b | rsp + 24
* | |
* +------------------+
* | rbx | rsp + 48
* | rbx | rsp + 40
* +------------------+
* | rcx / context | rsp + 56
* | rcx / context | rsp + 48
* +------------------+
* | rbp | rsp + 64
* | rbp | rsp + 56
* +------------------+
* | rsi | rsp + 72
* | rsi | rsp + 64
* +------------------+
* | rdi | rsp + 80
* | rdi | rsp + 72
* +------------------+
* | r12 | rsp + 88
* | r12 | rsp + 80
* +------------------+
* | r13 | rsp + 96
* | r13 | rsp + 88
* +------------------+
* | r14 | rsp + 104
* | r14 | rsp + 96
* +------------------+
* | r15 | rsp + 112
* | r15 | rsp + 104
* +------------------+
* | (return address) | rsp + 120
* | xmm6/0 | rsp + 112
* | |
* +------------------+
* | (rcx home) | rsp + 128
* | xmm7/1 | rsp + 128
* | |
* +------------------+
* | (rdx home) | rsp + 136
* | xmm8/2 | rsp + 144
* | |
* +------------------+
* | xmm9/3 | rsp + 160
* | |
* +------------------+
* | xmm10/4 | rsp + 176
* | |
* +------------------+
* | xmm11/5 | rsp + 192
* | |
* +------------------+
* | xmm12 | rsp + 208
* | |
* +------------------+
* | xmm13 | rsp + 224
* | |
* +------------------+
* | xmm14 | rsp + 240
* | |
* +------------------+
* | xmm15 | rsp + 256
* | |
* +------------------+
* | scratch, 8b | rsp + 272
* | |
* +------------------+
* | (return address) | rsp + 280
* +------------------+
* | (rcx home) | rsp + 288
* +------------------+
* | (rdx home) | rsp + 296
* +------------------+
*
*
* TODO:
* +------------------+
* | xmm6 | rsp + 128
* | |
* +------------------+
* | xmm7 | rsp + 144
* | |
* +------------------+
* | xmm8 | rsp + 160
* | |
* +------------------+
* | xmm9 | rsp + 176
* | |
* +------------------+
* | xmm10 | rsp + 192
* | |
* +------------------+
* | xmm11 | rsp + 208
* | |
* +------------------+
* | xmm12 | rsp + 224
* | |
* +------------------+
* | xmm13 | rsp + 240
* | |
* +------------------+
* | xmm14 | rsp + 256
* | |
* +------------------+
* | xmm15 | rsp + 272
* | |
* +------------------+
*
* Guest stack:
* +------------------+
* | arg temp, 3 * 8 | rsp + 0
@ -115,10 +115,10 @@ namespace x64 {
// Byte sizes and rsp-relative slot offsets for the thunk and guest stack
// frames.
class StackLayout {
public:
// Size in bytes of the thunk stack frame.
// NOTE(review): THUNK_STACK_SIZE is declared twice below. 120 matches the
// older thunk layout diagram (return address at rsp + 120) and 280 matches
// the diagram that adds the xmm6-xmm15 save area (return address at
// rsp + 280) - confirm only the 280 declaration should remain.
static const size_t THUNK_STACK_SIZE = 120;
static const size_t THUNK_STACK_SIZE = 280;
// Size in bytes of the guest function stack frame.
static const size_t GUEST_STACK_SIZE = 104;
// Slot where the guest prolog stores rcx (the context pointer) and from which
// it is reloaded.
// NOTE(review): GUEST_RCX_HOME and GUEST_CTX_HOME name the same offset; this
// looks like the old and new name of one constant - confirm which remains.
static const size_t GUEST_RCX_HOME = 80;
static const size_t GUEST_CTX_HOME = 80;
// Slot where the guest prolog stores rdx on entry.
static const size_t GUEST_RET_ADDR = 88;
// Slot zeroed by the guest prolog and written by SetReturnAddress.
static const size_t GUEST_CALL_RET_ADDR = 96;
};