// Patch summary (unified git diff, C++ payload): reworks the x64 emitter's
// register save/restore helpers so that, per the new header comment, the stack
// is "aligned as required by the ABI", and updates every call site.
//
// Source/Core/Common/x64ABI.cpp and x64Emitter.h
//   - ABI_PushRegistersAndAdjustStack / ABI_PopRegistersAndAdjustStack replace
//     the `bool noProlog` parameter with `size_t rsp_alignment` plus an optional
//     `size_t needed_frame_size` (default 0). Push now returns the shadow-space
//     size (0x20 when _WIN32 is defined, otherwise 0).
//   - New private helper ABI_CalculateFrameSize is called by both push and pop,
//     so both derive the same shadow size, RSP subtraction and XMM offset.
//     Bits 0-15 of `mask` are counted as 8-byte GPR pushes; each set bit in
//     16-31 reserves 16 bytes for an XMM register. If any XMM bit is set,
//     `rsp_alignment & 0xf` bytes of padding are subtracted first; a second
//     `& 0xf` pad is added after needed_frame_size and the shadow space
//     (the "Final alignment" step).
//   - GPRs are pushed in ascending register order and popped in descending
//     order. XMM registers are now stored/loaded with MOVAPD instead of MOVUPD,
//     starting at RSP + xmm_offset and advancing in 16-byte steps.
//   - rsp_alignment: call sites pass 0 or 8, with 8 commented as "due to the
//     call" -- presumably RSP modulo 16 at the call site; TODO confirm.
//
// Source/Core/Common/x64ABI.h
//   - Adds ABI_ALL_CALLEE_SAVED, defined as ((u32) ~ABI_ALL_CALLER_SAVED).
//     NOTE(review): ABI_ALL_CALLER_SAVED is not visible in this diff; confirm
//     the complement selects only registers that are safe to PUSH/POP here.
//
// Call-site updates
//   - Former `false` arguments become 0 and former `true` arguments become 8
//     (Jit.cpp, Jit_LoadStore.cpp, JitBackpatch.cpp, Jit_Util.cpp, and the hunk
//     labelled GenQuantizedStores in JitAsmCommon.cpp).
//   - Exception: the hunks labelled GenFrsqrte and GenFres in JitAsmCommon.cpp
//     go from `false` to 8, i.e. the value is changed rather than translated.
//   - The hunk labelled SafeWriteRegToReg passes 8 when SAFE_LOADSTORE_NO_PROLOG
//     is set in `flags`, otherwise 0.
//   - The hunks labelled VertexLoader::CompileVertexTranslator replace
//     ABI_PushAllCalleeSavedRegsAndAdjustStack() and
//     ABI_PopAllCalleeSavedRegsAndAdjustStack() with
//     ABI_PushRegistersAndAdjustStack / ABI_PopRegistersAndAdjustStack called
//     with (ABI_ALL_CALLEE_SAVED, 8).
diff --git a/Source/Core/Common/x64ABI.cpp b/Source/Core/Common/x64ABI.cpp index 45465619bd..046d90e509 100644 --- a/Source/Core/Common/x64ABI.cpp +++ b/Source/Core/Common/x64ABI.cpp @@ -36,67 +36,86 @@ void XEmitter::ABI_RestoreStack(unsigned int frameSize, bool noProlog) } } -void XEmitter::ABI_PushRegistersAndAdjustStack(u32 mask, bool noProlog) +void XEmitter::ABI_CalculateFrameSize(u32 mask, size_t rsp_alignment, size_t needed_frame_size, size_t* shadowp, size_t* subtractionp, size_t* xmm_offsetp) { - int regSize = 8; - int shadow = 0; + size_t shadow = 0; #if defined(_WIN32) shadow = 0x20; #endif + int count = 0; for (int r = 0; r < 16; r++) { if (mask & (1 << r)) - { - PUSH((X64Reg) r); count++; - } } - int size = ((noProlog ? -regSize : 0) - (count * regSize)) & 0xf; + rsp_alignment -= count * 8; + size_t subtraction = 0; + if (mask & 0xffff0000) + { + // If we have any XMMs to save, we must align the stack here. + subtraction = rsp_alignment & 0xf; + } for (int x = 0; x < 16; x++) { if (mask & (1 << (16 + x))) - size += 16; - } - size += shadow; - if (size) - SUB(regSize * 8, R(RSP), size >= 0x80 ? Imm32(size) : Imm8(size)); - int offset = shadow; - for (int x = 0; x < 16; x++) - { - if (mask & (1 << (16 + x))) - { - MOVUPD(MDisp(RSP, offset), (X64Reg) x); - offset += 16; - } + subtraction += 16; } + size_t xmm_base_subtraction = subtraction; + subtraction += needed_frame_size; + subtraction += shadow; + // Final alignment. 
+ rsp_alignment -= subtraction; + subtraction += rsp_alignment & 0xf; + + *shadowp = shadow; + *subtractionp = subtraction; + *xmm_offsetp = subtraction - xmm_base_subtraction; } -void XEmitter::ABI_PopRegistersAndAdjustStack(u32 mask, bool noProlog) +size_t XEmitter::ABI_PushRegistersAndAdjustStack(u32 mask, size_t rsp_alignment, size_t needed_frame_size) { - int regSize = 8; - int size = 0; -#if defined(_WIN32) - size += 0x20; -#endif + size_t shadow, subtraction, xmm_offset; + ABI_CalculateFrameSize(mask, rsp_alignment, needed_frame_size, &shadow, &subtraction, &xmm_offset); + + for (int r = 0; r < 16; r++) + { + if (mask & (1 << r)) + PUSH((X64Reg) r); + } + + if (subtraction) + SUB(64, R(RSP), subtraction >= 0x80 ? Imm32((u32)subtraction) : Imm8((u8)subtraction)); + for (int x = 0; x < 16; x++) { if (mask & (1 << (16 + x))) { - MOVUPD((X64Reg) x, MDisp(RSP, size)); - size += 16; + MOVAPD(MDisp(RSP, (int)xmm_offset), (X64Reg) x); + xmm_offset += 16; } } - int count = 0; - for (int r = 0; r < 16; r++) - { - if (mask & (1 << r)) - count++; - } - size += ((noProlog ? -regSize : 0) - (count * regSize)) & 0xf; - if (size) - ADD(regSize * 8, R(RSP), size >= 0x80 ? Imm32(size) : Imm8(size)); + return shadow; +} + +void XEmitter::ABI_PopRegistersAndAdjustStack(u32 mask, size_t rsp_alignment, size_t needed_frame_size) +{ + size_t shadow, subtraction, xmm_offset; + ABI_CalculateFrameSize(mask, rsp_alignment, needed_frame_size, &shadow, &subtraction, &xmm_offset); + + for (int x = 0; x < 16; x++) + { + if (mask & (1 << (16 + x))) + { + MOVAPD((X64Reg) x, MDisp(RSP, (int)xmm_offset)); + xmm_offset += 16; + } + } + + if (subtraction) + ADD(64, R(RSP), subtraction >= 0x80 ? 
Imm32((u32)subtraction) : Imm8((u8)subtraction)); + for (int r = 15; r >= 0; r--) { if (mask & (1 << r)) diff --git a/Source/Core/Common/x64ABI.h b/Source/Core/Common/x64ABI.h index abc9236ef7..bf058bc04a 100644 --- a/Source/Core/Common/x64ABI.h +++ b/Source/Core/Common/x64ABI.h @@ -53,5 +53,7 @@ #endif // WIN32 +#define ABI_ALL_CALLEE_SAVED ((u32) ~ABI_ALL_CALLER_SAVED) + #define ABI_RETURN RAX diff --git a/Source/Core/Common/x64Emitter.h b/Source/Core/Common/x64Emitter.h index d6f0699e84..22c6857da0 100644 --- a/Source/Core/Common/x64Emitter.h +++ b/Source/Core/Common/x64Emitter.h @@ -281,6 +281,8 @@ private: void WriteFloatLoadStore(int bits, FloatOp op, FloatOp op_80b, OpArg arg); void WriteNormalOp(XEmitter *emit, int bits, NormalOp op, const OpArg &a1, const OpArg &a2); + void ABI_CalculateFrameSize(u32 mask, size_t rsp_alignment, size_t needed_frame_size, size_t* shadowp, size_t* subtractionp, size_t* xmm_offsetp); + protected: inline void Write8(u8 value) {*code++ = value;} inline void Write16(u16 value) {*(u16*)code = (value); code += 2;} @@ -761,9 +763,11 @@ public: void ABI_PushAllCalleeSavedRegsAndAdjustStack(); void ABI_PopAllCalleeSavedRegsAndAdjustStack(); - // A more flexible version of the above. - void ABI_PushRegistersAndAdjustStack(u32 mask, bool noProlog); - void ABI_PopRegistersAndAdjustStack(u32 mask, bool noProlog); + // Saves/restores the registers and adjusts the stack to be aligned as + // required by the ABI, where the previous alignment was as specified. + // Push returns the size of the shadow space, i.e. the offset of the frame. 
+ size_t ABI_PushRegistersAndAdjustStack(u32 mask, size_t rsp_alignment, size_t needed_frame_size = 0); + void ABI_PopRegistersAndAdjustStack(u32 mask, size_t rsp_alignment, size_t needed_frame_size = 0); unsigned int ABI_GetAlignedFrameSize(unsigned int frameSize, bool noProlog = false); void ABI_AlignStack(unsigned int frameSize, bool noProlog = false); diff --git a/Source/Core/Core/PowerPC/Jit64/Jit.cpp b/Source/Core/Core/PowerPC/Jit64/Jit.cpp index a3707dbbe1..a69d8e0f82 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit.cpp +++ b/Source/Core/Core/PowerPC/Jit64/Jit.cpp @@ -495,9 +495,9 @@ const u8* Jit64::DoJit(u32 em_address, PPCAnalyst::CodeBuffer *code_buf, JitBloc js.fifoBytesThisBlock -= 32; MOV(32, PPCSTATE(pc), Imm32(jit->js.compilerPC)); // Helps external systems know which instruction triggered the write u32 registersInUse = CallerSavedRegistersInUse(); - ABI_PushRegistersAndAdjustStack(registersInUse, false); + ABI_PushRegistersAndAdjustStack(registersInUse, 0); ABI_CallFunction((void *)&GPFifo::CheckGatherPipe); - ABI_PopRegistersAndAdjustStack(registersInUse, false); + ABI_PopRegistersAndAdjustStack(registersInUse, 0); } u32 function = HLE::GetFunctionIndex(ops[i].address); diff --git a/Source/Core/Core/PowerPC/Jit64/Jit_LoadStore.cpp b/Source/Core/Core/PowerPC/Jit64/Jit_LoadStore.cpp index ba9cf8b293..0c25191736 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit_LoadStore.cpp +++ b/Source/Core/Core/PowerPC/Jit64/Jit_LoadStore.cpp @@ -116,11 +116,11 @@ void Jit64::lXXx(UGeckoInstruction inst) FixupBranch noIdle = J_CC(CC_NZ); u32 registersInUse = CallerSavedRegistersInUse(); - ABI_PushRegistersAndAdjustStack(registersInUse, false); + ABI_PushRegistersAndAdjustStack(registersInUse, 0); ABI_CallFunctionC((void *)&PowerPC::OnIdle, PowerPC::ppcState.gpr[a] + (s32)(s16)inst.SIMM_16); - ABI_PopRegistersAndAdjustStack(registersInUse, false); + ABI_PopRegistersAndAdjustStack(registersInUse, 0); // ! 
we must continue executing of the loop after exception handling, maybe there is still 0 in r0 //MOV(32, PPCSTATE(pc), Imm32(js.compilerPC)); @@ -285,9 +285,9 @@ void Jit64::dcbz(UGeckoInstruction inst) // supposedly there are, at least for some MMU titles. Let's be careful and support it to be sure. MOV(32, M(&PC), Imm32(jit->js.compilerPC)); u32 registersInUse = CallerSavedRegistersInUse(); - ABI_PushRegistersAndAdjustStack(registersInUse, false); + ABI_PushRegistersAndAdjustStack(registersInUse, 0); ABI_CallFunctionR((void *)&Memory::ClearCacheLine, RSCRATCH); - ABI_PopRegistersAndAdjustStack(registersInUse, false); + ABI_PopRegistersAndAdjustStack(registersInUse, 0); FixupBranch exit = J(); SetJumpTarget(fast); @@ -374,7 +374,7 @@ void Jit64::stX(UGeckoInstruction inst) MOV(32, PPCSTATE(pc), Imm32(jit->js.compilerPC)); u32 registersInUse = CallerSavedRegistersInUse(); - ABI_PushRegistersAndAdjustStack(registersInUse, false); + ABI_PushRegistersAndAdjustStack(registersInUse, 0); switch (accessSize) { case 32: @@ -387,7 +387,7 @@ void Jit64::stX(UGeckoInstruction inst) ABI_CallFunctionAC((void *)&Memory::Write_U8, gpr.R(s), addr); break; } - ABI_PopRegistersAndAdjustStack(registersInUse, false); + ABI_PopRegistersAndAdjustStack(registersInUse, 0); if (update) gpr.SetImmediate32(a, addr); return; diff --git a/Source/Core/Core/PowerPC/JitCommon/JitAsmCommon.cpp b/Source/Core/Core/PowerPC/JitCommon/JitAsmCommon.cpp index d5cce9882e..dbce5dfb85 100644 --- a/Source/Core/Core/PowerPC/JitCommon/JitAsmCommon.cpp +++ b/Source/Core/Core/PowerPC/JitCommon/JitAsmCommon.cpp @@ -110,9 +110,9 @@ void CommonAsmRoutines::GenFrsqrte() SetJumpTarget(complex1); SetJumpTarget(complex2); SetJumpTarget(complex3); - ABI_PushRegistersAndAdjustStack(QUANTIZED_REGS_TO_SAVE, false); + ABI_PushRegistersAndAdjustStack(QUANTIZED_REGS_TO_SAVE, 8); ABI_CallFunction((void *)&MathUtil::ApproximateReciprocalSquareRoot); - ABI_PopRegistersAndAdjustStack(QUANTIZED_REGS_TO_SAVE, false); + 
ABI_PopRegistersAndAdjustStack(QUANTIZED_REGS_TO_SAVE, 8); RET(); } @@ -169,9 +169,9 @@ void CommonAsmRoutines::GenFres() SetJumpTarget(complex1); SetJumpTarget(complex2); - ABI_PushRegistersAndAdjustStack(QUANTIZED_REGS_TO_SAVE, false); + ABI_PushRegistersAndAdjustStack(QUANTIZED_REGS_TO_SAVE, 8); ABI_CallFunction((void *)&MathUtil::ApproximateReciprocal); - ABI_PopRegistersAndAdjustStack(QUANTIZED_REGS_TO_SAVE, false); + ABI_PopRegistersAndAdjustStack(QUANTIZED_REGS_TO_SAVE, 8); RET(); } @@ -258,9 +258,10 @@ void CommonAsmRoutines::GenQuantizedStores() SwapAndStore(64, MComplex(RMEM, RSCRATCH_EXTRA, SCALE_1, 0), RSCRATCH); FixupBranch skip_complex = J(true); SetJumpTarget(too_complex); - ABI_PushRegistersAndAdjustStack(QUANTIZED_REGS_TO_SAVE, true); + // RSP alignment here is 8 due to the call. + ABI_PushRegistersAndAdjustStack(QUANTIZED_REGS_TO_SAVE, 8); ABI_CallFunctionR((void *)&WriteDual32, RSCRATCH_EXTRA); - ABI_PopRegistersAndAdjustStack(QUANTIZED_REGS_TO_SAVE, true); + ABI_PopRegistersAndAdjustStack(QUANTIZED_REGS_TO_SAVE, 8); SetJumpTarget(skip_complex); RET(); diff --git a/Source/Core/Core/PowerPC/JitCommon/JitBackpatch.cpp b/Source/Core/Core/PowerPC/JitCommon/JitBackpatch.cpp index c1a6436e62..26b8b50d60 100644 --- a/Source/Core/Core/PowerPC/JitCommon/JitBackpatch.cpp +++ b/Source/Core/Core/PowerPC/JitCommon/JitBackpatch.cpp @@ -56,10 +56,8 @@ const u8 *TrampolineCache::GetReadTrampoline(const InstructionInfo &info, u32 re X64Reg dataReg = (X64Reg)info.regOperandReg; // It's a read. Easy. - // It ought to be necessary to align the stack here. Since it seems to not - // affect anybody, I'm not going to add it just to be completely safe about - // performance. - ABI_PushRegistersAndAdjustStack(registersInUse, true); + // RSP alignment here is 8 due to the call. 
+ ABI_PushRegistersAndAdjustStack(registersInUse, 8); if (addrReg != ABI_PARAM1) MOV(32, R(ABI_PARAM1), R((X64Reg)addrReg)); @@ -91,7 +89,7 @@ const u8 *TrampolineCache::GetReadTrampoline(const InstructionInfo &info, u32 re MOV(32, R(dataReg), R(ABI_RETURN)); } - ABI_PopRegistersAndAdjustStack(registersInUse, true); + ABI_PopRegistersAndAdjustStack(registersInUse, 8); RET(); return trampoline; } @@ -115,7 +113,7 @@ const u8 *TrampolineCache::GetWriteTrampoline(const InstructionInfo &info, u32 r // PC is used by memory watchpoints (if enabled) or to print accurate PC locations in debug logs MOV(32, PPCSTATE(pc), Imm32(pc)); - ABI_PushRegistersAndAdjustStack(registersInUse, true); + ABI_PushRegistersAndAdjustStack(registersInUse, 8); MOVTwo(64, ABI_PARAM1, dataReg, ABI_PARAM2, addrReg, ABI_PARAM3); @@ -140,7 +138,7 @@ const u8 *TrampolineCache::GetWriteTrampoline(const InstructionInfo &info, u32 r break; } - ABI_PopRegistersAndAdjustStack(registersInUse, true); + ABI_PopRegistersAndAdjustStack(registersInUse, 8); RET(); return trampoline; diff --git a/Source/Core/Core/PowerPC/JitCommon/Jit_Util.cpp b/Source/Core/Core/PowerPC/JitCommon/Jit_Util.cpp index be43680e88..0eb5f63d98 100644 --- a/Source/Core/Core/PowerPC/JitCommon/Jit_Util.cpp +++ b/Source/Core/Core/PowerPC/JitCommon/Jit_Util.cpp @@ -204,9 +204,9 @@ private: void CallLambda(int sbits, const std::function* lambda) { - m_code->ABI_PushRegistersAndAdjustStack(m_registers_in_use, false); + m_code->ABI_PushRegistersAndAdjustStack(m_registers_in_use, 0); m_code->ABI_CallLambdaC(lambda, m_address); - m_code->ABI_PopRegistersAndAdjustStack(m_registers_in_use, false); + m_code->ABI_PopRegistersAndAdjustStack(m_registers_in_use, 0); MoveOpArgToReg(sbits, R(ABI_RETURN)); } @@ -305,7 +305,7 @@ void EmuCodeBlock::SafeLoadToReg(X64Reg reg_value, const Gen::OpArg & opAddress, } else { - ABI_PushRegistersAndAdjustStack(registersInUse, false); + ABI_PushRegistersAndAdjustStack(registersInUse, 0); switch (accessSize) { case 
64: ABI_CallFunctionC((void *)&Memory::Read_U64, address); break; @@ -313,7 +313,7 @@ void EmuCodeBlock::SafeLoadToReg(X64Reg reg_value, const Gen::OpArg & opAddress, case 16: ABI_CallFunctionC((void *)&Memory::Read_U16_ZX, address); break; case 8: ABI_CallFunctionC((void *)&Memory::Read_U8_ZX, address); break; } - ABI_PopRegistersAndAdjustStack(registersInUse, false); + ABI_PopRegistersAndAdjustStack(registersInUse, 0); MEMCHECK_START @@ -350,7 +350,7 @@ void EmuCodeBlock::SafeLoadToReg(X64Reg reg_value, const Gen::OpArg & opAddress, FixupBranch fast = J_CC(CC_Z, true); - ABI_PushRegistersAndAdjustStack(registersInUse, false); + ABI_PushRegistersAndAdjustStack(registersInUse, 0); switch (accessSize) { case 64: @@ -366,7 +366,7 @@ void EmuCodeBlock::SafeLoadToReg(X64Reg reg_value, const Gen::OpArg & opAddress, ABI_CallFunctionA((void *)&Memory::Read_U8_ZX, addr_loc); break; } - ABI_PopRegistersAndAdjustStack(registersInUse, false); + ABI_PopRegistersAndAdjustStack(registersInUse, 0); MEMCHECK_START @@ -470,9 +470,9 @@ void EmuCodeBlock::SafeWriteRegToReg(X64Reg reg_value, X64Reg reg_addr, int acce FixupBranch fast = J_CC(CC_Z, true); // PC is used by memory watchpoints (if enabled) or to print accurate PC locations in debug logs MOV(32, PPCSTATE(pc), Imm32(jit->js.compilerPC)); - bool noProlog = (0 != (flags & SAFE_LOADSTORE_NO_PROLOG)); + size_t rsp_alignment = (flags & SAFE_LOADSTORE_NO_PROLOG) ? 
8 : 0; bool swap = !(flags & SAFE_LOADSTORE_NO_SWAP); - ABI_PushRegistersAndAdjustStack(registersInUse, noProlog); + ABI_PushRegistersAndAdjustStack(registersInUse, rsp_alignment); switch (accessSize) { case 64: @@ -488,7 +488,7 @@ void EmuCodeBlock::SafeWriteRegToReg(X64Reg reg_value, X64Reg reg_addr, int acce ABI_CallFunctionRR((void *)&Memory::Write_U8, reg_value, reg_addr, false); break; } - ABI_PopRegistersAndAdjustStack(registersInUse, noProlog); + ABI_PopRegistersAndAdjustStack(registersInUse, rsp_alignment); FixupBranch exit = J(); SetJumpTarget(fast); UnsafeWriteRegToReg(reg_value, reg_addr, accessSize, 0, swap); diff --git a/Source/Core/VideoCommon/VertexLoader.cpp b/Source/Core/VideoCommon/VertexLoader.cpp index a227b6da52..e57dc134c6 100644 --- a/Source/Core/VideoCommon/VertexLoader.cpp +++ b/Source/Core/VideoCommon/VertexLoader.cpp @@ -584,7 +584,7 @@ void VertexLoader::CompileVertexTranslator() PanicAlert("Trying to recompile a vertex translator"); m_compiledCode = GetCodePtr(); - ABI_PushAllCalleeSavedRegsAndAdjustStack(); + ABI_PushRegistersAndAdjustStack(ABI_ALL_CALLEE_SAVED, 8); // Start loop here const u8 *loop_start = GetCodePtr(); @@ -845,7 +845,7 @@ void VertexLoader::CompileVertexTranslator() SUB(32, MatR(RAX), Imm8(1)); J_CC(CC_NZ, loop_start); - ABI_PopAllCalleeSavedRegsAndAdjustStack(); + ABI_PopRegistersAndAdjustStack(ABI_ALL_CALLEE_SAVED, 8); RET(); #endif }