diff --git a/src/xenia/cpu/backend/x64/x64_backend.cc b/src/xenia/cpu/backend/x64/x64_backend.cc index 6c8415790..19527a3a1 100644 --- a/src/xenia/cpu/backend/x64/x64_backend.cc +++ b/src/xenia/cpu/backend/x64/x64_backend.cc @@ -398,52 +398,53 @@ HostToGuestThunk X64ThunkEmitter::EmitHostToGuestThunk() { mov(qword[rsp + 8 * 1], rcx); sub(rsp, stack_size); - mov(qword[rsp + 48], rbx); - mov(qword[rsp + 56], rcx); - mov(qword[rsp + 64], rbp); - mov(qword[rsp + 72], rsi); - mov(qword[rsp + 80], rdi); - mov(qword[rsp + 88], r12); - mov(qword[rsp + 96], r13); - mov(qword[rsp + 104], r14); - mov(qword[rsp + 112], r15); + // Preserve nonvolatile registers. + mov(qword[rsp + 40], rbx); + mov(qword[rsp + 48], rcx); + mov(qword[rsp + 56], rbp); + mov(qword[rsp + 64], rsi); + mov(qword[rsp + 72], rdi); + mov(qword[rsp + 80], r12); + mov(qword[rsp + 88], r13); + mov(qword[rsp + 96], r14); + mov(qword[rsp + 104], r15); - /*movaps(ptr[rsp + 128], xmm6); - movaps(ptr[rsp + 144], xmm7); - movaps(ptr[rsp + 160], xmm8); - movaps(ptr[rsp + 176], xmm9); - movaps(ptr[rsp + 192], xmm10); - movaps(ptr[rsp + 208], xmm11); - movaps(ptr[rsp + 224], xmm12); - movaps(ptr[rsp + 240], xmm13); - movaps(ptr[rsp + 256], xmm14); - movaps(ptr[rsp + 272], xmm15);*/ + movaps(ptr[rsp + 112], xmm6); + movaps(ptr[rsp + 128], xmm7); + movaps(ptr[rsp + 144], xmm8); + movaps(ptr[rsp + 160], xmm9); + movaps(ptr[rsp + 176], xmm10); + movaps(ptr[rsp + 192], xmm11); + movaps(ptr[rsp + 208], xmm12); + movaps(ptr[rsp + 224], xmm13); + movaps(ptr[rsp + 240], xmm14); + movaps(ptr[rsp + 256], xmm15); mov(rax, rcx); mov(rcx, rdx); mov(rdx, r8); call(rax); - /*movaps(xmm6, ptr[rsp + 128]); - movaps(xmm7, ptr[rsp + 144]); - movaps(xmm8, ptr[rsp + 160]); - movaps(xmm9, ptr[rsp + 176]); - movaps(xmm10, ptr[rsp + 192]); - movaps(xmm11, ptr[rsp + 208]); - movaps(xmm12, ptr[rsp + 224]); - movaps(xmm13, ptr[rsp + 240]); - movaps(xmm14, ptr[rsp + 256]); - movaps(xmm15, ptr[rsp + 272]);*/ + movaps(xmm6, ptr[rsp + 112]); + movaps(xmm7, ptr[rsp + 128]); + movaps(xmm8, ptr[rsp + 144]); + movaps(xmm9, ptr[rsp + 160]); + movaps(xmm10, ptr[rsp + 176]); + movaps(xmm11, ptr[rsp + 192]); + movaps(xmm12, ptr[rsp + 208]); + movaps(xmm13, ptr[rsp + 224]); + movaps(xmm14, ptr[rsp + 240]); + movaps(xmm15, ptr[rsp + 256]); - mov(rbx, qword[rsp + 48]); - mov(rcx, qword[rsp + 56]); - mov(rbp, qword[rsp + 64]); - mov(rsi, qword[rsp + 72]); - mov(rdi, qword[rsp + 80]); - mov(r12, qword[rsp + 88]); - mov(r13, qword[rsp + 96]); - mov(r14, qword[rsp + 104]); - mov(r15, qword[rsp + 112]); + mov(rbx, qword[rsp + 40]); + mov(rcx, qword[rsp + 48]); + mov(rbp, qword[rsp + 56]); + mov(rsi, qword[rsp + 64]); + mov(rdi, qword[rsp + 72]); + mov(r12, qword[rsp + 80]); + mov(r13, qword[rsp + 88]); + mov(r14, qword[rsp + 96]); + mov(r15, qword[rsp + 104]); add(rsp, stack_size); mov(rcx, qword[rsp + 8 * 1]); @@ -468,17 +469,18 @@ GuestToHostThunk X64ThunkEmitter::EmitGuestToHostThunk() { mov(qword[rsp + 8 * 1], rcx); sub(rsp, stack_size); - mov(qword[rsp + 48], rbx); - mov(qword[rsp + 56], rcx); - mov(qword[rsp + 64], rbp); - mov(qword[rsp + 72], rsi); - mov(qword[rsp + 80], rdi); - mov(qword[rsp + 88], r12); - mov(qword[rsp + 96], r13); - mov(qword[rsp + 104], r14); - mov(qword[rsp + 112], r15); + mov(qword[rsp + 40], rbx); + mov(qword[rsp + 48], rcx); + mov(qword[rsp + 56], rbp); + mov(qword[rsp + 64], rsi); + mov(qword[rsp + 72], rdi); + mov(qword[rsp + 80], r12); + mov(qword[rsp + 88], r13); + mov(qword[rsp + 96], r14); + mov(qword[rsp + 104], r15); // TODO(benvanik): save things? XMM0-5? + // HACK: Some emulated vector instructions require that we don't touch xmm0. mov(rax, rdx); mov(rdx, r8); @@ -486,15 +488,15 @@ GuestToHostThunk X64ThunkEmitter::EmitGuestToHostThunk() { mov(r9, r10); call(rax); - mov(rbx, qword[rsp + 48]); - mov(rcx, qword[rsp + 56]); - mov(rbp, qword[rsp + 64]); - mov(rsi, qword[rsp + 72]); - mov(rdi, qword[rsp + 80]); - mov(r12, qword[rsp + 88]); - mov(r13, qword[rsp + 96]); - mov(r14, qword[rsp + 104]); - mov(r15, qword[rsp + 112]); + mov(rbx, qword[rsp + 40]); + mov(rcx, qword[rsp + 48]); + mov(rbp, qword[rsp + 56]); + mov(rsi, qword[rsp + 64]); + mov(rdi, qword[rsp + 72]); + mov(r12, qword[rsp + 80]); + mov(r13, qword[rsp + 88]); + mov(r14, qword[rsp + 96]); + mov(r15, qword[rsp + 104]); add(rsp, stack_size); mov(rcx, qword[rsp + 8 * 1]); @@ -502,7 +504,7 @@ GuestToHostThunk X64ThunkEmitter::EmitGuestToHostThunk() { ret(); void* fn = Emplace(stack_size); - return (HostToGuestThunk)fn; + return (GuestToHostThunk)fn; } // X64Emitter handles actually resolving functions. @@ -518,29 +520,29 @@ ResolveFunctionThunk X64ThunkEmitter::EmitResolveFunctionThunk() { mov(qword[rsp + 8 * 1], rcx); sub(rsp, stack_size); - mov(qword[rsp + 48], rbx); - mov(qword[rsp + 56], rcx); - mov(qword[rsp + 64], rbp); - mov(qword[rsp + 72], rsi); - mov(qword[rsp + 80], rdi); - mov(qword[rsp + 88], r12); - mov(qword[rsp + 96], r13); - mov(qword[rsp + 104], r14); - mov(qword[rsp + 112], r15); + mov(qword[rsp + 40], rbx); + mov(qword[rsp + 48], rcx); + mov(qword[rsp + 56], rbp); + mov(qword[rsp + 64], rsi); + mov(qword[rsp + 72], rdi); + mov(qword[rsp + 80], r12); + mov(qword[rsp + 88], r13); + mov(qword[rsp + 96], r14); + mov(qword[rsp + 104], r15); mov(rdx, rbx); mov(rax, uint64_t(&ResolveFunction)); call(rax); - mov(rbx, qword[rsp + 48]); - mov(rcx, qword[rsp + 56]); - mov(rbp, qword[rsp + 64]); - mov(rsi, qword[rsp + 72]); - mov(rdi, qword[rsp + 80]); - mov(r12, qword[rsp + 88]); - mov(r13, qword[rsp + 96]); - mov(r14, qword[rsp + 104]); - mov(r15, qword[rsp + 112]); + mov(rbx, qword[rsp + 40]); + mov(rcx, qword[rsp + 48]); + mov(rbp, qword[rsp + 56]); + mov(rsi, qword[rsp + 64]); + mov(rdi, qword[rsp + 72]); + mov(r12, qword[rsp + 80]); + mov(r13, qword[rsp + 88]); + mov(r14, qword[rsp + 96]); + mov(r15, qword[rsp + 104]); add(rsp, stack_size); mov(rcx, qword[rsp + 8 * 1]); diff --git a/src/xenia/cpu/backend/x64/x64_emitter.cc b/src/xenia/cpu/backend/x64/x64_emitter.cc index be9b53229..dc087c3ca 100644 --- a/src/xenia/cpu/backend/x64/x64_emitter.cc +++ b/src/xenia/cpu/backend/x64/x64_emitter.cc @@ -160,10 +160,6 @@ bool X64Emitter::Emit(HIRBuilder* builder, size_t* out_stack_size) { // Must be 16b aligned. // Windows is very strict about the form of this and the epilog: // http://msdn.microsoft.com/en-us/library/tawsa7cb.aspx - // TODO(benvanik): save off non-volatile registers so we can use them: - // RBX, RBP, RDI, RSI, RSP, R12, R13, R14, R15 - // Only want to do this if we actually use them, though, otherwise - // it just adds overhead. // IMPORTANT: any changes to the prolog must be kept in sync with // X64CodeCache, which dynamically generates exception information. // Adding or changing anything here must be matched! @@ -172,7 +168,7 @@ bool X64Emitter::Emit(HIRBuilder* builder, size_t* out_stack_size) { *out_stack_size = stack_size; stack_size_ = stack_size; sub(rsp, (uint32_t)stack_size); - mov(qword[rsp + StackLayout::GUEST_RCX_HOME], rcx); + mov(qword[rsp + StackLayout::GUEST_CTX_HOME], rcx); mov(qword[rsp + StackLayout::GUEST_RET_ADDR], rdx); mov(qword[rsp + StackLayout::GUEST_CALL_RET_ADDR], 0); @@ -205,7 +201,7 @@ bool X64Emitter::Emit(HIRBuilder* builder, size_t* out_stack_size) { } // Load membase. - mov(rdx, qword[rcx + 8]); + mov(rdx, qword[rcx + offsetof(ppc::PPCContext, virtual_membase)]); // Body. auto block = builder->first_block(); @@ -237,7 +233,7 @@ bool X64Emitter::Emit(HIRBuilder* builder, size_t* out_stack_size) { L(epilog_label); epilog_label_ = nullptr; EmitTraceUserCallReturn(); - mov(rcx, qword[rsp + StackLayout::GUEST_RCX_HOME]); + mov(rcx, qword[rsp + StackLayout::GUEST_CTX_HOME]); add(rsp, (uint32_t)stack_size); ret(); @@ -546,8 +542,11 @@ void X64Emitter::SetReturnAddress(uint64_t value) { mov(qword[rsp + StackLayout::GUEST_CALL_RET_ADDR], rax); } +Xbyak::Reg64 X64Emitter::GetContextReg() { return rcx; } +Xbyak::Reg64 X64Emitter::GetMembaseReg() { return rdx; } + void X64Emitter::ReloadECX() { - mov(rcx, qword[rsp + StackLayout::GUEST_RCX_HOME]); + mov(rcx, qword[rsp + StackLayout::GUEST_CTX_HOME]); } void X64Emitter::ReloadEDX() { diff --git a/src/xenia/cpu/backend/x64/x64_emitter.h b/src/xenia/cpu/backend/x64/x64_emitter.h index 50401ab83..5de5985f2 100644 --- a/src/xenia/cpu/backend/x64/x64_emitter.h +++ b/src/xenia/cpu/backend/x64/x64_emitter.h @@ -125,7 +125,7 @@ class X64Emitter : public Xbyak::CodeGenerator { public: // Reserved: rsp // Scratch: rax/rcx/rdx - // xmm0-2 (could be only xmm0 with some trickery) + // xmm0-2 // Available: rbx, r12-r15 (save to get r8-r11, rbp, rsi, rdi?) // xmm6-xmm15 (save to get xmm3-xmm5) static const int GPR_COUNT = 5; @@ -170,6 +170,9 @@ class X64Emitter : public Xbyak::CodeGenerator { uint64_t arg0); void CallNativeSafe(void* fn); void SetReturnAddress(uint64_t value); + + Xbyak::Reg64 GetContextReg(); + Xbyak::Reg64 GetMembaseReg(); void ReloadECX(); void ReloadEDX(); diff --git a/src/xenia/cpu/backend/x64/x64_sequences.cc b/src/xenia/cpu/backend/x64/x64_sequences.cc index 83bfcd7d6..ac8237e7c 100644 --- a/src/xenia/cpu/backend/x64/x64_sequences.cc +++ b/src/xenia/cpu/backend/x64/x64_sequences.cc @@ -1793,7 +1793,7 @@ EMITTER_OPCODE_TABLE(OPCODE_STORE_LOCAL, STORE_LOCAL_I8, STORE_LOCAL_I16, // ============================================================================ // Note: all types are always aligned in the context. RegExp ComputeContextAddress(X64Emitter& e, const OffsetOp& offset) { - return e.rcx + offset.value; + return e.GetContextReg() + offset.value; } struct LOAD_CONTEXT_I8 : Sequence> { @@ -2088,12 +2088,12 @@ RegExp ComputeMemoryAddress(X64Emitter& e, const T& guest) { // Since the constant is often 0x8... if we tried to use that as a // displacement it would be sign extended and mess things up. e.mov(e.eax, static_cast(guest.constant())); - return e.rdx + e.rax; + return e.GetMembaseReg() + e.rax; } else { // Clear the top 32 bits, as they are likely garbage. // TODO(benvanik): find a way to avoid doing this. e.mov(e.eax, guest.reg().cvt32()); - return e.rdx + e.rax; + return e.GetMembaseReg() + e.rax; } } struct LOAD_I8 : Sequence> { @@ -3959,7 +3959,6 @@ struct MUL_HI_I8 : Sequence> { if (i.instr->flags & ARITHMETIC_UNSIGNED) { // mulx: $1:$2 = EDX * $3 - // TODO(justin): Find a way to shorten this has call if (e.IsFeatureEnabled(kX64EmitBMI2)) { // TODO(benvanik): place src1 in eax? still need to sign extend e.movzx(e.edx, i.src1); @@ -4004,7 +4003,6 @@ struct MUL_HI_I16 : Sequence> { static void Emit(X64Emitter& e, const EmitArgType& i) { if (i.instr->flags & ARITHMETIC_UNSIGNED) { - // TODO(justin): Find a way to shorten this has call if (e.IsFeatureEnabled(kX64EmitBMI2)) { // TODO(benvanik): place src1 in eax? still need to sign extend e.movzx(e.edx, i.src1); @@ -4049,7 +4047,6 @@ struct MUL_HI_I32 : Sequence> { static void Emit(X64Emitter& e, const EmitArgType& i) { if (i.instr->flags & ARITHMETIC_UNSIGNED) { - // TODO(justin): Find a way to shorten this has call if (e.IsFeatureEnabled(kX64EmitBMI2)) { // TODO(benvanik): place src1 in eax? still need to sign extend e.mov(e.edx, i.src1); @@ -4099,7 +4096,6 @@ struct MUL_HI_I64 : Sequence> { static void Emit(X64Emitter& e, const EmitArgType& i) { if (i.instr->flags & ARITHMETIC_UNSIGNED) { - // TODO(justin): Find a way to shorten this has call if (e.IsFeatureEnabled(kX64EmitBMI2)) { // TODO(benvanik): place src1 in eax? still need to sign extend e.mov(e.rdx, i.src1); diff --git a/src/xenia/cpu/backend/x64/x64_stack_layout.h b/src/xenia/cpu/backend/x64/x64_stack_layout.h index 679cbc1b8..439e1d708 100644 --- a/src/xenia/cpu/backend/x64/x64_stack_layout.h +++ b/src/xenia/cpu/backend/x64/x64_stack_layout.h @@ -29,68 +29,68 @@ namespace x64 { * | | * | | * +------------------+ - * | scratch, 16b | rsp + 32 + * | scratch, 16b | rsp + 24 * | | * +------------------+ - * | rbx | rsp + 48 + * | rbx | rsp + 40 * +------------------+ - * | rcx / context | rsp + 56 + * | rcx / context | rsp + 48 * +------------------+ - * | rbp | rsp + 64 + * | rbp | rsp + 56 * +------------------+ - * | rsi | rsp + 72 + * | rsi | rsp + 64 * +------------------+ - * | rdi | rsp + 80 + * | rdi | rsp + 72 * +------------------+ - * | r12 | rsp + 88 + * | r12 | rsp + 80 * +------------------+ - * | r13 | rsp + 96 + * | r13 | rsp + 88 * +------------------+ - * | r14 | rsp + 104 + * | r14 | rsp + 96 * +------------------+ - * | r15 | rsp + 112 + * | r15 | rsp + 104 * +------------------+ - * | (return address) | rsp + 120 + * | xmm6/0 | rsp + 112 + * | | * +------------------+ - * | (rcx home) | rsp + 128 + * | xmm7/1 | rsp + 128 + * | | * +------------------+ - * | (rdx home) | rsp + 136 + * | xmm8/2 | rsp + 144 + * | | + * +------------------+ + * | xmm9/3 | rsp + 160 + * | | + * +------------------+ + * | xmm10/4 | rsp + 176 + * | | + * +------------------+ + * | xmm11/5 | rsp + 192 + * | | + * +------------------+ + * | xmm12 | rsp + 208 + * | | + * +------------------+ + * | xmm13 | rsp + 224 + * | | + * +------------------+ + * | xmm14 | rsp + 240 + * | | + * +------------------+ + * | xmm15 | rsp + 256 + * | | + * +------------------+ + * | scratch, 8b | rsp + 272 + * | | + * +------------------+ + * | (return address) | rsp + 280 + * +------------------+ + * | (rcx home) | rsp + 288 + * +------------------+ + * | (rdx home) | rsp + 296 * +------------------+ * * - * TODO: - * +------------------+ - * | xmm6 | rsp + 128 - * | | - * +------------------+ - * | xmm7 | rsp + 144 - * | | - * +------------------+ - * | xmm8 | rsp + 160 - * | | - * +------------------+ - * | xmm9 | rsp + 176 - * | | - * +------------------+ - * | xmm10 | rsp + 192 - * | | - * +------------------+ - * | xmm11 | rsp + 208 - * | | - * +------------------+ - * | xmm12 | rsp + 224 - * | | - * +------------------+ - * | xmm13 | rsp + 240 - * | | - * +------------------+ - * | xmm14 | rsp + 256 - * | | - * +------------------+ - * | xmm15 | rsp + 272 - * | | - * +------------------+ - * * Guest stack: * +------------------+ * | arg temp, 3 * 8 | rsp + 0 @@ -115,10 +115,10 @@ namespace x64 { class StackLayout { public: - static const size_t THUNK_STACK_SIZE = 120; + static const size_t THUNK_STACK_SIZE = 280; static const size_t GUEST_STACK_SIZE = 104; - static const size_t GUEST_RCX_HOME = 80; + static const size_t GUEST_CTX_HOME = 80; static const size_t GUEST_RET_ADDR = 88; static const size_t GUEST_CALL_RET_ADDR = 96; };