Improve code and clarify parameters to ABI_Push/PopRegistersAndAdjustStack.

- Factor common work into a helper function.
- Replace confusingly named "noProlog" with "rsp_alignment".  Now that
32-bit x86 is no longer supported, callers can just pass the value
explicitly (8 where there is no prolog, 0 otherwise) for clarity.
- Add an option to reserve extra frame space (needed_frame_size), which
I'll need later.
- Revert a change by magumagu in March which replaced MOVAPD with MOVUPD
on account of 32-bit Windows, since that platform is no longer
supported.  True, recent processors apparently don't execute MOVAPD any
faster than MOVUPD when the pointer is, in fact, aligned, but there's no
point using MOVUPD for something that's guaranteed to be aligned...

(I discovered that GenFrsqrte and GenFres were incorrectly passing false
for noProlog - they are, in fact, functions without prologs, which is
the original meaning of the parameter - and this caused the previous
change to break.  This is now fixed.)
This commit is contained in:
comex 2014-09-07 14:06:48 -04:00
parent 6ea82790ba
commit 2dafbfb3ef
9 changed files with 96 additions and 72 deletions

View File

@ -36,67 +36,86 @@ void XEmitter::ABI_RestoreStack(unsigned int frameSize, bool noProlog)
} }
} }
void XEmitter::ABI_PushRegistersAndAdjustStack(u32 mask, bool noProlog) void XEmitter::ABI_CalculateFrameSize(u32 mask, size_t rsp_alignment, size_t needed_frame_size, size_t* shadowp, size_t* subtractionp, size_t* xmm_offsetp)
{ {
int regSize = 8; size_t shadow = 0;
int shadow = 0;
#if defined(_WIN32) #if defined(_WIN32)
shadow = 0x20; shadow = 0x20;
#endif #endif
int count = 0; int count = 0;
for (int r = 0; r < 16; r++) for (int r = 0; r < 16; r++)
{ {
if (mask & (1 << r)) if (mask & (1 << r))
{
PUSH((X64Reg) r);
count++; count++;
} }
rsp_alignment -= count * 8;
size_t subtraction = 0;
if (mask & 0xffff0000)
{
// If we have any XMMs to save, we must align the stack here.
subtraction = rsp_alignment & 0xf;
} }
int size = ((noProlog ? -regSize : 0) - (count * regSize)) & 0xf;
for (int x = 0; x < 16; x++) for (int x = 0; x < 16; x++)
{ {
if (mask & (1 << (16 + x))) if (mask & (1 << (16 + x)))
size += 16; subtraction += 16;
}
size += shadow;
if (size)
SUB(regSize * 8, R(RSP), size >= 0x80 ? Imm32(size) : Imm8(size));
int offset = shadow;
for (int x = 0; x < 16; x++)
{
if (mask & (1 << (16 + x)))
{
MOVUPD(MDisp(RSP, offset), (X64Reg) x);
offset += 16;
}
} }
size_t xmm_base_subtraction = subtraction;
subtraction += needed_frame_size;
subtraction += shadow;
// Final alignment.
rsp_alignment -= subtraction;
subtraction += rsp_alignment & 0xf;
*shadowp = shadow;
*subtractionp = subtraction;
*xmm_offsetp = subtraction - xmm_base_subtraction;
} }
void XEmitter::ABI_PopRegistersAndAdjustStack(u32 mask, bool noProlog) size_t XEmitter::ABI_PushRegistersAndAdjustStack(u32 mask, size_t rsp_alignment, size_t needed_frame_size)
{ {
int regSize = 8; size_t shadow, subtraction, xmm_offset;
int size = 0; ABI_CalculateFrameSize(mask, rsp_alignment, needed_frame_size, &shadow, &subtraction, &xmm_offset);
#if defined(_WIN32)
size += 0x20; for (int r = 0; r < 16; r++)
#endif {
if (mask & (1 << r))
PUSH((X64Reg) r);
}
if (subtraction)
SUB(64, R(RSP), subtraction >= 0x80 ? Imm32((u32)subtraction) : Imm8((u8)subtraction));
for (int x = 0; x < 16; x++) for (int x = 0; x < 16; x++)
{ {
if (mask & (1 << (16 + x))) if (mask & (1 << (16 + x)))
{ {
MOVUPD((X64Reg) x, MDisp(RSP, size)); MOVAPD(MDisp(RSP, (int)xmm_offset), (X64Reg) x);
size += 16; xmm_offset += 16;
} }
} }
int count = 0;
for (int r = 0; r < 16; r++)
{
if (mask & (1 << r))
count++;
}
size += ((noProlog ? -regSize : 0) - (count * regSize)) & 0xf;
if (size) return shadow;
ADD(regSize * 8, R(RSP), size >= 0x80 ? Imm32(size) : Imm8(size)); }
void XEmitter::ABI_PopRegistersAndAdjustStack(u32 mask, size_t rsp_alignment, size_t needed_frame_size)
{
size_t shadow, subtraction, xmm_offset;
ABI_CalculateFrameSize(mask, rsp_alignment, needed_frame_size, &shadow, &subtraction, &xmm_offset);
for (int x = 0; x < 16; x++)
{
if (mask & (1 << (16 + x)))
{
MOVAPD((X64Reg) x, MDisp(RSP, (int)xmm_offset));
xmm_offset += 16;
}
}
if (subtraction)
ADD(64, R(RSP), subtraction >= 0x80 ? Imm32((u32)subtraction) : Imm8((u8)subtraction));
for (int r = 15; r >= 0; r--) for (int r = 15; r >= 0; r--)
{ {
if (mask & (1 << r)) if (mask & (1 << r))

View File

@ -53,5 +53,7 @@
#endif // WIN32 #endif // WIN32
#define ABI_ALL_CALLEE_SAVED ((u32) ~ABI_ALL_CALLER_SAVED)
#define ABI_RETURN RAX #define ABI_RETURN RAX

View File

@ -281,6 +281,8 @@ private:
void WriteFloatLoadStore(int bits, FloatOp op, FloatOp op_80b, OpArg arg); void WriteFloatLoadStore(int bits, FloatOp op, FloatOp op_80b, OpArg arg);
void WriteNormalOp(XEmitter *emit, int bits, NormalOp op, const OpArg &a1, const OpArg &a2); void WriteNormalOp(XEmitter *emit, int bits, NormalOp op, const OpArg &a1, const OpArg &a2);
void ABI_CalculateFrameSize(u32 mask, size_t rsp_alignment, size_t needed_frame_size, size_t* shadowp, size_t* subtractionp, size_t* xmm_offsetp);
protected: protected:
inline void Write8(u8 value) {*code++ = value;} inline void Write8(u8 value) {*code++ = value;}
inline void Write16(u16 value) {*(u16*)code = (value); code += 2;} inline void Write16(u16 value) {*(u16*)code = (value); code += 2;}
@ -761,9 +763,11 @@ public:
void ABI_PushAllCalleeSavedRegsAndAdjustStack(); void ABI_PushAllCalleeSavedRegsAndAdjustStack();
void ABI_PopAllCalleeSavedRegsAndAdjustStack(); void ABI_PopAllCalleeSavedRegsAndAdjustStack();
// A more flexible version of the above. // Saves/restores the registers and adjusts the stack to be aligned as
void ABI_PushRegistersAndAdjustStack(u32 mask, bool noProlog); // required by the ABI, where the previous alignment was as specified.
void ABI_PopRegistersAndAdjustStack(u32 mask, bool noProlog); // Push returns the size of the shadow space, i.e. the offset of the frame.
size_t ABI_PushRegistersAndAdjustStack(u32 mask, size_t rsp_alignment, size_t needed_frame_size = 0);
void ABI_PopRegistersAndAdjustStack(u32 mask, size_t rsp_alignment, size_t needed_frame_size = 0);
unsigned int ABI_GetAlignedFrameSize(unsigned int frameSize, bool noProlog = false); unsigned int ABI_GetAlignedFrameSize(unsigned int frameSize, bool noProlog = false);
void ABI_AlignStack(unsigned int frameSize, bool noProlog = false); void ABI_AlignStack(unsigned int frameSize, bool noProlog = false);

View File

@ -495,9 +495,9 @@ const u8* Jit64::DoJit(u32 em_address, PPCAnalyst::CodeBuffer *code_buf, JitBloc
js.fifoBytesThisBlock -= 32; js.fifoBytesThisBlock -= 32;
MOV(32, PPCSTATE(pc), Imm32(jit->js.compilerPC)); // Helps external systems know which instruction triggered the write MOV(32, PPCSTATE(pc), Imm32(jit->js.compilerPC)); // Helps external systems know which instruction triggered the write
u32 registersInUse = CallerSavedRegistersInUse(); u32 registersInUse = CallerSavedRegistersInUse();
ABI_PushRegistersAndAdjustStack(registersInUse, false); ABI_PushRegistersAndAdjustStack(registersInUse, 0);
ABI_CallFunction((void *)&GPFifo::CheckGatherPipe); ABI_CallFunction((void *)&GPFifo::CheckGatherPipe);
ABI_PopRegistersAndAdjustStack(registersInUse, false); ABI_PopRegistersAndAdjustStack(registersInUse, 0);
} }
u32 function = HLE::GetFunctionIndex(ops[i].address); u32 function = HLE::GetFunctionIndex(ops[i].address);

View File

@ -116,11 +116,11 @@ void Jit64::lXXx(UGeckoInstruction inst)
FixupBranch noIdle = J_CC(CC_NZ); FixupBranch noIdle = J_CC(CC_NZ);
u32 registersInUse = CallerSavedRegistersInUse(); u32 registersInUse = CallerSavedRegistersInUse();
ABI_PushRegistersAndAdjustStack(registersInUse, false); ABI_PushRegistersAndAdjustStack(registersInUse, 0);
ABI_CallFunctionC((void *)&PowerPC::OnIdle, PowerPC::ppcState.gpr[a] + (s32)(s16)inst.SIMM_16); ABI_CallFunctionC((void *)&PowerPC::OnIdle, PowerPC::ppcState.gpr[a] + (s32)(s16)inst.SIMM_16);
ABI_PopRegistersAndAdjustStack(registersInUse, false); ABI_PopRegistersAndAdjustStack(registersInUse, 0);
// ! we must continue executing of the loop after exception handling, maybe there is still 0 in r0 // ! we must continue executing of the loop after exception handling, maybe there is still 0 in r0
//MOV(32, PPCSTATE(pc), Imm32(js.compilerPC)); //MOV(32, PPCSTATE(pc), Imm32(js.compilerPC));
@ -285,9 +285,9 @@ void Jit64::dcbz(UGeckoInstruction inst)
// supposedly there are, at least for some MMU titles. Let's be careful and support it to be sure. // supposedly there are, at least for some MMU titles. Let's be careful and support it to be sure.
MOV(32, M(&PC), Imm32(jit->js.compilerPC)); MOV(32, M(&PC), Imm32(jit->js.compilerPC));
u32 registersInUse = CallerSavedRegistersInUse(); u32 registersInUse = CallerSavedRegistersInUse();
ABI_PushRegistersAndAdjustStack(registersInUse, false); ABI_PushRegistersAndAdjustStack(registersInUse, 0);
ABI_CallFunctionR((void *)&Memory::ClearCacheLine, RSCRATCH); ABI_CallFunctionR((void *)&Memory::ClearCacheLine, RSCRATCH);
ABI_PopRegistersAndAdjustStack(registersInUse, false); ABI_PopRegistersAndAdjustStack(registersInUse, 0);
FixupBranch exit = J(); FixupBranch exit = J();
SetJumpTarget(fast); SetJumpTarget(fast);
@ -374,7 +374,7 @@ void Jit64::stX(UGeckoInstruction inst)
MOV(32, PPCSTATE(pc), Imm32(jit->js.compilerPC)); MOV(32, PPCSTATE(pc), Imm32(jit->js.compilerPC));
u32 registersInUse = CallerSavedRegistersInUse(); u32 registersInUse = CallerSavedRegistersInUse();
ABI_PushRegistersAndAdjustStack(registersInUse, false); ABI_PushRegistersAndAdjustStack(registersInUse, 0);
switch (accessSize) switch (accessSize)
{ {
case 32: case 32:
@ -387,7 +387,7 @@ void Jit64::stX(UGeckoInstruction inst)
ABI_CallFunctionAC((void *)&Memory::Write_U8, gpr.R(s), addr); ABI_CallFunctionAC((void *)&Memory::Write_U8, gpr.R(s), addr);
break; break;
} }
ABI_PopRegistersAndAdjustStack(registersInUse, false); ABI_PopRegistersAndAdjustStack(registersInUse, 0);
if (update) if (update)
gpr.SetImmediate32(a, addr); gpr.SetImmediate32(a, addr);
return; return;

View File

@ -110,9 +110,9 @@ void CommonAsmRoutines::GenFrsqrte()
SetJumpTarget(complex1); SetJumpTarget(complex1);
SetJumpTarget(complex2); SetJumpTarget(complex2);
SetJumpTarget(complex3); SetJumpTarget(complex3);
ABI_PushRegistersAndAdjustStack(QUANTIZED_REGS_TO_SAVE, false); ABI_PushRegistersAndAdjustStack(QUANTIZED_REGS_TO_SAVE, 8);
ABI_CallFunction((void *)&MathUtil::ApproximateReciprocalSquareRoot); ABI_CallFunction((void *)&MathUtil::ApproximateReciprocalSquareRoot);
ABI_PopRegistersAndAdjustStack(QUANTIZED_REGS_TO_SAVE, false); ABI_PopRegistersAndAdjustStack(QUANTIZED_REGS_TO_SAVE, 8);
RET(); RET();
} }
@ -169,9 +169,9 @@ void CommonAsmRoutines::GenFres()
SetJumpTarget(complex1); SetJumpTarget(complex1);
SetJumpTarget(complex2); SetJumpTarget(complex2);
ABI_PushRegistersAndAdjustStack(QUANTIZED_REGS_TO_SAVE, false); ABI_PushRegistersAndAdjustStack(QUANTIZED_REGS_TO_SAVE, 8);
ABI_CallFunction((void *)&MathUtil::ApproximateReciprocal); ABI_CallFunction((void *)&MathUtil::ApproximateReciprocal);
ABI_PopRegistersAndAdjustStack(QUANTIZED_REGS_TO_SAVE, false); ABI_PopRegistersAndAdjustStack(QUANTIZED_REGS_TO_SAVE, 8);
RET(); RET();
} }
@ -258,9 +258,10 @@ void CommonAsmRoutines::GenQuantizedStores()
SwapAndStore(64, MComplex(RMEM, RSCRATCH_EXTRA, SCALE_1, 0), RSCRATCH); SwapAndStore(64, MComplex(RMEM, RSCRATCH_EXTRA, SCALE_1, 0), RSCRATCH);
FixupBranch skip_complex = J(true); FixupBranch skip_complex = J(true);
SetJumpTarget(too_complex); SetJumpTarget(too_complex);
ABI_PushRegistersAndAdjustStack(QUANTIZED_REGS_TO_SAVE, true); // RSP alignment here is 8 due to the call.
ABI_PushRegistersAndAdjustStack(QUANTIZED_REGS_TO_SAVE, 8);
ABI_CallFunctionR((void *)&WriteDual32, RSCRATCH_EXTRA); ABI_CallFunctionR((void *)&WriteDual32, RSCRATCH_EXTRA);
ABI_PopRegistersAndAdjustStack(QUANTIZED_REGS_TO_SAVE, true); ABI_PopRegistersAndAdjustStack(QUANTIZED_REGS_TO_SAVE, 8);
SetJumpTarget(skip_complex); SetJumpTarget(skip_complex);
RET(); RET();

View File

@ -56,10 +56,8 @@ const u8 *TrampolineCache::GetReadTrampoline(const InstructionInfo &info, u32 re
X64Reg dataReg = (X64Reg)info.regOperandReg; X64Reg dataReg = (X64Reg)info.regOperandReg;
// It's a read. Easy. // It's a read. Easy.
// It ought to be necessary to align the stack here. Since it seems to not // RSP alignment here is 8 due to the call.
// affect anybody, I'm not going to add it just to be completely safe about ABI_PushRegistersAndAdjustStack(registersInUse, 8);
// performance.
ABI_PushRegistersAndAdjustStack(registersInUse, true);
if (addrReg != ABI_PARAM1) if (addrReg != ABI_PARAM1)
MOV(32, R(ABI_PARAM1), R((X64Reg)addrReg)); MOV(32, R(ABI_PARAM1), R((X64Reg)addrReg));
@ -91,7 +89,7 @@ const u8 *TrampolineCache::GetReadTrampoline(const InstructionInfo &info, u32 re
MOV(32, R(dataReg), R(ABI_RETURN)); MOV(32, R(dataReg), R(ABI_RETURN));
} }
ABI_PopRegistersAndAdjustStack(registersInUse, true); ABI_PopRegistersAndAdjustStack(registersInUse, 8);
RET(); RET();
return trampoline; return trampoline;
} }
@ -115,7 +113,7 @@ const u8 *TrampolineCache::GetWriteTrampoline(const InstructionInfo &info, u32 r
// PC is used by memory watchpoints (if enabled) or to print accurate PC locations in debug logs // PC is used by memory watchpoints (if enabled) or to print accurate PC locations in debug logs
MOV(32, PPCSTATE(pc), Imm32(pc)); MOV(32, PPCSTATE(pc), Imm32(pc));
ABI_PushRegistersAndAdjustStack(registersInUse, true); ABI_PushRegistersAndAdjustStack(registersInUse, 8);
MOVTwo(64, ABI_PARAM1, dataReg, ABI_PARAM2, addrReg, ABI_PARAM3); MOVTwo(64, ABI_PARAM1, dataReg, ABI_PARAM2, addrReg, ABI_PARAM3);
@ -140,7 +138,7 @@ const u8 *TrampolineCache::GetWriteTrampoline(const InstructionInfo &info, u32 r
break; break;
} }
ABI_PopRegistersAndAdjustStack(registersInUse, true); ABI_PopRegistersAndAdjustStack(registersInUse, 8);
RET(); RET();
return trampoline; return trampoline;

View File

@ -204,9 +204,9 @@ private:
void CallLambda(int sbits, const std::function<T(u32)>* lambda) void CallLambda(int sbits, const std::function<T(u32)>* lambda)
{ {
m_code->ABI_PushRegistersAndAdjustStack(m_registers_in_use, false); m_code->ABI_PushRegistersAndAdjustStack(m_registers_in_use, 0);
m_code->ABI_CallLambdaC(lambda, m_address); m_code->ABI_CallLambdaC(lambda, m_address);
m_code->ABI_PopRegistersAndAdjustStack(m_registers_in_use, false); m_code->ABI_PopRegistersAndAdjustStack(m_registers_in_use, 0);
MoveOpArgToReg(sbits, R(ABI_RETURN)); MoveOpArgToReg(sbits, R(ABI_RETURN));
} }
@ -305,7 +305,7 @@ void EmuCodeBlock::SafeLoadToReg(X64Reg reg_value, const Gen::OpArg & opAddress,
} }
else else
{ {
ABI_PushRegistersAndAdjustStack(registersInUse, false); ABI_PushRegistersAndAdjustStack(registersInUse, 0);
switch (accessSize) switch (accessSize)
{ {
case 64: ABI_CallFunctionC((void *)&Memory::Read_U64, address); break; case 64: ABI_CallFunctionC((void *)&Memory::Read_U64, address); break;
@ -313,7 +313,7 @@ void EmuCodeBlock::SafeLoadToReg(X64Reg reg_value, const Gen::OpArg & opAddress,
case 16: ABI_CallFunctionC((void *)&Memory::Read_U16_ZX, address); break; case 16: ABI_CallFunctionC((void *)&Memory::Read_U16_ZX, address); break;
case 8: ABI_CallFunctionC((void *)&Memory::Read_U8_ZX, address); break; case 8: ABI_CallFunctionC((void *)&Memory::Read_U8_ZX, address); break;
} }
ABI_PopRegistersAndAdjustStack(registersInUse, false); ABI_PopRegistersAndAdjustStack(registersInUse, 0);
MEMCHECK_START MEMCHECK_START
@ -350,7 +350,7 @@ void EmuCodeBlock::SafeLoadToReg(X64Reg reg_value, const Gen::OpArg & opAddress,
FixupBranch fast = J_CC(CC_Z, true); FixupBranch fast = J_CC(CC_Z, true);
ABI_PushRegistersAndAdjustStack(registersInUse, false); ABI_PushRegistersAndAdjustStack(registersInUse, 0);
switch (accessSize) switch (accessSize)
{ {
case 64: case 64:
@ -366,7 +366,7 @@ void EmuCodeBlock::SafeLoadToReg(X64Reg reg_value, const Gen::OpArg & opAddress,
ABI_CallFunctionA((void *)&Memory::Read_U8_ZX, addr_loc); ABI_CallFunctionA((void *)&Memory::Read_U8_ZX, addr_loc);
break; break;
} }
ABI_PopRegistersAndAdjustStack(registersInUse, false); ABI_PopRegistersAndAdjustStack(registersInUse, 0);
MEMCHECK_START MEMCHECK_START
@ -470,9 +470,9 @@ void EmuCodeBlock::SafeWriteRegToReg(X64Reg reg_value, X64Reg reg_addr, int acce
FixupBranch fast = J_CC(CC_Z, true); FixupBranch fast = J_CC(CC_Z, true);
// PC is used by memory watchpoints (if enabled) or to print accurate PC locations in debug logs // PC is used by memory watchpoints (if enabled) or to print accurate PC locations in debug logs
MOV(32, PPCSTATE(pc), Imm32(jit->js.compilerPC)); MOV(32, PPCSTATE(pc), Imm32(jit->js.compilerPC));
bool noProlog = (0 != (flags & SAFE_LOADSTORE_NO_PROLOG)); size_t rsp_alignment = (flags & SAFE_LOADSTORE_NO_PROLOG) ? 8 : 0;
bool swap = !(flags & SAFE_LOADSTORE_NO_SWAP); bool swap = !(flags & SAFE_LOADSTORE_NO_SWAP);
ABI_PushRegistersAndAdjustStack(registersInUse, noProlog); ABI_PushRegistersAndAdjustStack(registersInUse, rsp_alignment);
switch (accessSize) switch (accessSize)
{ {
case 64: case 64:
@ -488,7 +488,7 @@ void EmuCodeBlock::SafeWriteRegToReg(X64Reg reg_value, X64Reg reg_addr, int acce
ABI_CallFunctionRR((void *)&Memory::Write_U8, reg_value, reg_addr, false); ABI_CallFunctionRR((void *)&Memory::Write_U8, reg_value, reg_addr, false);
break; break;
} }
ABI_PopRegistersAndAdjustStack(registersInUse, noProlog); ABI_PopRegistersAndAdjustStack(registersInUse, rsp_alignment);
FixupBranch exit = J(); FixupBranch exit = J();
SetJumpTarget(fast); SetJumpTarget(fast);
UnsafeWriteRegToReg(reg_value, reg_addr, accessSize, 0, swap); UnsafeWriteRegToReg(reg_value, reg_addr, accessSize, 0, swap);

View File

@ -584,7 +584,7 @@ void VertexLoader::CompileVertexTranslator()
PanicAlert("Trying to recompile a vertex translator"); PanicAlert("Trying to recompile a vertex translator");
m_compiledCode = GetCodePtr(); m_compiledCode = GetCodePtr();
ABI_PushAllCalleeSavedRegsAndAdjustStack(); ABI_PushRegistersAndAdjustStack(ABI_ALL_CALLEE_SAVED, 8);
// Start loop here // Start loop here
const u8 *loop_start = GetCodePtr(); const u8 *loop_start = GetCodePtr();
@ -845,7 +845,7 @@ void VertexLoader::CompileVertexTranslator()
SUB(32, MatR(RAX), Imm8(1)); SUB(32, MatR(RAX), Imm8(1));
J_CC(CC_NZ, loop_start); J_CC(CC_NZ, loop_start);
ABI_PopAllCalleeSavedRegsAndAdjustStack(); ABI_PopRegistersAndAdjustStack(ABI_ALL_CALLEE_SAVED, 8);
RET(); RET();
#endif #endif
} }