JitArm64: Use STP for (parts of) ppcState.ps
The savestate incompatibility problem mentioned in a comment
was solved by d266be5.
This commit is contained in:
parent
2d9ea42df2
commit
91b55824f9
|
@ -234,6 +234,8 @@ void JitArm64::Cleanup()
|
||||||
{
|
{
|
||||||
if (jo.optimizeGatherPipe && js.fifoBytesSinceCheck > 0)
|
if (jo.optimizeGatherPipe && js.fifoBytesSinceCheck > 0)
|
||||||
{
|
{
|
||||||
|
static_assert(PPCSTATE_OFF(gather_pipe_ptr) <= 504);
|
||||||
|
static_assert(PPCSTATE_OFF(gather_pipe_ptr) + 8 == PPCSTATE_OFF(gather_pipe_base_ptr));
|
||||||
LDP(IndexType::Signed, X0, X1, PPC_REG, PPCSTATE_OFF(gather_pipe_ptr));
|
LDP(IndexType::Signed, X0, X1, PPC_REG, PPCSTATE_OFF(gather_pipe_ptr));
|
||||||
SUB(X0, X0, X1);
|
SUB(X0, X0, X1);
|
||||||
CMP(X0, GPFifo::GATHER_PIPE_SIZE);
|
CMP(X0, GPFifo::GATHER_PIPE_SIZE);
|
||||||
|
|
|
@ -220,19 +220,22 @@ void Arm64GPRCache::FlushRegisters(BitSet32 regs, bool maintain_state)
|
||||||
if (reg1.IsDirty() && reg2.IsDirty() && reg1.GetType() == RegType::Register &&
|
if (reg1.IsDirty() && reg2.IsDirty() && reg1.GetType() == RegType::Register &&
|
||||||
reg2.GetType() == RegType::Register)
|
reg2.GetType() == RegType::Register)
|
||||||
{
|
{
|
||||||
size_t ppc_offset = GetGuestByIndex(i).ppc_offset;
|
const size_t ppc_offset = GetGuestByIndex(i).ppc_offset;
|
||||||
ARM64Reg RX1 = R(GetGuestByIndex(i));
|
if (ppc_offset <= 252)
|
||||||
ARM64Reg RX2 = R(GetGuestByIndex(i + 1));
|
|
||||||
m_emit->STP(IndexType::Signed, RX1, RX2, PPC_REG, u32(ppc_offset));
|
|
||||||
if (!maintain_state)
|
|
||||||
{
|
{
|
||||||
UnlockRegister(DecodeReg(RX1));
|
ARM64Reg RX1 = R(GetGuestByIndex(i));
|
||||||
UnlockRegister(DecodeReg(RX2));
|
ARM64Reg RX2 = R(GetGuestByIndex(i + 1));
|
||||||
reg1.Flush();
|
m_emit->STP(IndexType::Signed, RX1, RX2, PPC_REG, u32(ppc_offset));
|
||||||
reg2.Flush();
|
if (!maintain_state)
|
||||||
|
{
|
||||||
|
UnlockRegister(DecodeReg(RX1));
|
||||||
|
UnlockRegister(DecodeReg(RX2));
|
||||||
|
reg1.Flush();
|
||||||
|
reg2.Flush();
|
||||||
|
}
|
||||||
|
++i;
|
||||||
|
continue;
|
||||||
}
|
}
|
||||||
++i;
|
|
||||||
continue;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -707,14 +710,18 @@ void Arm64FPRCache::FlushRegister(size_t preg, bool maintain_state)
|
||||||
{
|
{
|
||||||
if (dirty)
|
if (dirty)
|
||||||
{
|
{
|
||||||
// If the paired registers were at the start of ppcState we could do an STP here.
|
if (PPCSTATE_OFF(ps[preg].ps0) <= 504)
|
||||||
// Too bad moving them would break savestate compatibility between x86_64 and AArch64
|
{
|
||||||
// m_float_emit->STP(64, IndexType::Signed, host_reg, host_reg, PPC_REG,
|
m_float_emit->STP(64, IndexType::Signed, host_reg, host_reg, PPC_REG,
|
||||||
// PPCSTATE_OFF(ps[preg].ps0));
|
PPCSTATE_OFF(ps[preg].ps0));
|
||||||
m_float_emit->STR(64, IndexType::Unsigned, host_reg, PPC_REG,
|
}
|
||||||
u32(PPCSTATE_OFF(ps[preg].ps0)));
|
else
|
||||||
m_float_emit->STR(64, IndexType::Unsigned, host_reg, PPC_REG,
|
{
|
||||||
u32(PPCSTATE_OFF(ps[preg].ps1)));
|
m_float_emit->STR(64, IndexType::Unsigned, host_reg, PPC_REG,
|
||||||
|
u32(PPCSTATE_OFF(ps[preg].ps0)));
|
||||||
|
m_float_emit->STR(64, IndexType::Unsigned, host_reg, PPC_REG,
|
||||||
|
u32(PPCSTATE_OFF(ps[preg].ps1)));
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!maintain_state)
|
if (!maintain_state)
|
||||||
|
|
|
@ -96,10 +96,31 @@ struct PairedSingle
|
||||||
static_assert(std::is_standard_layout<PairedSingle>(), "PairedSingle must be standard layout");
|
static_assert(std::is_standard_layout<PairedSingle>(), "PairedSingle must be standard layout");
|
||||||
|
|
||||||
// This contains the entire state of the emulated PowerPC "Gekko" CPU.
|
// This contains the entire state of the emulated PowerPC "Gekko" CPU.
|
||||||
|
//
|
||||||
|
// To minimize code size on x86, we want as much useful stuff in the first 256 bytes as possible.
|
||||||
|
// ps needs to be relatively late in the struct due to it being larger than 256 bytes in itself.
|
||||||
|
//
|
||||||
|
// On AArch64, most load/store instructions support fairly large immediate offsets,
|
||||||
|
// but not LDP/STP, which we want to use for accessing certain things.
|
||||||
|
// These must be in the first 520 bytes: gather_pipe_ptr, gather_pipe_base_ptr
|
||||||
|
// Better code is generated if these are in the first 260 bytes: gpr
|
||||||
|
// Better code is generated if these are in the first 520 bytes: ps
|
||||||
|
// Unfortunately not all of those fit in 520 bytes, but we can fit most of ps and all of the rest.
|
||||||
struct PowerPCState
|
struct PowerPCState
|
||||||
{
|
{
|
||||||
|
// gather pipe pointer for JIT access
|
||||||
|
u8* gather_pipe_ptr;
|
||||||
|
u8* gather_pipe_base_ptr;
|
||||||
|
|
||||||
u32 gpr[32]; // General purpose registers. r1 = stack pointer.
|
u32 gpr[32]; // General purpose registers. r1 = stack pointer.
|
||||||
|
|
||||||
|
#ifndef _M_X86_64
|
||||||
|
// The paired singles are strange : PS0 is stored in the full 64 bits of each FPR
|
||||||
|
// but ps calculations are only done in 32-bit precision, and PS1 is only 32 bits.
|
||||||
|
// Since we want to use SIMD, SSE2 is the only viable alternative - 2x double.
|
||||||
|
alignas(16) PairedSingle ps[32];
|
||||||
|
#endif
|
||||||
|
|
||||||
u32 pc; // program counter
|
u32 pc; // program counter
|
||||||
u32 npc;
|
u32 npc;
|
||||||
|
|
||||||
|
@ -123,23 +144,12 @@ struct PowerPCState
|
||||||
// lscbx
|
// lscbx
|
||||||
u16 xer_stringctrl;
|
u16 xer_stringctrl;
|
||||||
|
|
||||||
// gather pipe pointer for JIT access
|
|
||||||
u8* gather_pipe_ptr;
|
|
||||||
u8* gather_pipe_base_ptr;
|
|
||||||
|
|
||||||
#if _M_X86_64
|
#if _M_X86_64
|
||||||
// This member exists for the purpose of an assertion in x86 JitBase.cpp
|
// This member exists only for the purpose of an assertion that its offset <= 0x100.
|
||||||
// that its offset <= 0x100. To minimize code size on x86, we want as much
|
|
||||||
// useful stuff in the one-byte offset range as possible - which is why ps
|
|
||||||
// is sitting down here. It currently doesn't make a difference on other
|
|
||||||
// supported architectures.
|
|
||||||
std::tuple<> above_fits_in_first_0x100;
|
std::tuple<> above_fits_in_first_0x100;
|
||||||
#endif
|
|
||||||
|
|
||||||
// The paired singles are strange : PS0 is stored in the full 64 bits of each FPR
|
|
||||||
// but ps calculations are only done in 32-bit precision, and PS1 is only 32 bits.
|
|
||||||
// Since we want to use SIMD, SSE2 is the only viable alternative - 2x double.
|
|
||||||
alignas(16) PairedSingle ps[32];
|
alignas(16) PairedSingle ps[32];
|
||||||
|
#endif
|
||||||
|
|
||||||
u32 sr[16]; // Segment registers.
|
u32 sr[16]; // Segment registers.
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue