diff --git a/Source/Core/Core/PowerPC/JitArm64/Jit.cpp b/Source/Core/Core/PowerPC/JitArm64/Jit.cpp index de82dd4503..0dfb75f441 100644 --- a/Source/Core/Core/PowerPC/JitArm64/Jit.cpp +++ b/Source/Core/Core/PowerPC/JitArm64/Jit.cpp @@ -234,6 +234,8 @@ void JitArm64::Cleanup() { if (jo.optimizeGatherPipe && js.fifoBytesSinceCheck > 0) { + static_assert(PPCSTATE_OFF(gather_pipe_ptr) <= 504); + static_assert(PPCSTATE_OFF(gather_pipe_ptr) + 8 == PPCSTATE_OFF(gather_pipe_base_ptr)); LDP(IndexType::Signed, X0, X1, PPC_REG, PPCSTATE_OFF(gather_pipe_ptr)); SUB(X0, X0, X1); CMP(X0, GPFifo::GATHER_PIPE_SIZE); diff --git a/Source/Core/Core/PowerPC/JitArm64/JitArm64_RegCache.cpp b/Source/Core/Core/PowerPC/JitArm64/JitArm64_RegCache.cpp index d0d861d992..2c4ca41928 100644 --- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_RegCache.cpp +++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_RegCache.cpp @@ -220,19 +220,22 @@ void Arm64GPRCache::FlushRegisters(BitSet32 regs, bool maintain_state) if (reg1.IsDirty() && reg2.IsDirty() && reg1.GetType() == RegType::Register && reg2.GetType() == RegType::Register) { - size_t ppc_offset = GetGuestByIndex(i).ppc_offset; - ARM64Reg RX1 = R(GetGuestByIndex(i)); - ARM64Reg RX2 = R(GetGuestByIndex(i + 1)); - m_emit->STP(IndexType::Signed, RX1, RX2, PPC_REG, u32(ppc_offset)); - if (!maintain_state) + const size_t ppc_offset = GetGuestByIndex(i).ppc_offset; + if (ppc_offset <= 252) { - UnlockRegister(DecodeReg(RX1)); - UnlockRegister(DecodeReg(RX2)); - reg1.Flush(); - reg2.Flush(); + ARM64Reg RX1 = R(GetGuestByIndex(i)); + ARM64Reg RX2 = R(GetGuestByIndex(i + 1)); + m_emit->STP(IndexType::Signed, RX1, RX2, PPC_REG, u32(ppc_offset)); + if (!maintain_state) + { + UnlockRegister(DecodeReg(RX1)); + UnlockRegister(DecodeReg(RX2)); + reg1.Flush(); + reg2.Flush(); + } + ++i; + continue; } - ++i; - continue; } } @@ -707,14 +710,18 @@ void Arm64FPRCache::FlushRegister(size_t preg, bool maintain_state) { if (dirty) { - // If the paired registers were at the 
start of ppcState we could do an STP here. - // Too bad moving them would break savestate compatibility between x86_64 and AArch64 - // m_float_emit->STP(64, IndexType::Signed, host_reg, host_reg, PPC_REG, - // PPCSTATE_OFF(ps[preg].ps0)); - m_float_emit->STR(64, IndexType::Unsigned, host_reg, PPC_REG, - u32(PPCSTATE_OFF(ps[preg].ps0))); - m_float_emit->STR(64, IndexType::Unsigned, host_reg, PPC_REG, - u32(PPCSTATE_OFF(ps[preg].ps1))); + if (PPCSTATE_OFF(ps[preg].ps0) <= 504) + { + m_float_emit->STP(64, IndexType::Signed, host_reg, host_reg, PPC_REG, + PPCSTATE_OFF(ps[preg].ps0)); + } + else + { + m_float_emit->STR(64, IndexType::Unsigned, host_reg, PPC_REG, + u32(PPCSTATE_OFF(ps[preg].ps0))); + m_float_emit->STR(64, IndexType::Unsigned, host_reg, PPC_REG, + u32(PPCSTATE_OFF(ps[preg].ps1))); + } } if (!maintain_state) diff --git a/Source/Core/Core/PowerPC/PowerPC.h b/Source/Core/Core/PowerPC/PowerPC.h index 9d551de056..3579d7a19e 100644 --- a/Source/Core/Core/PowerPC/PowerPC.h +++ b/Source/Core/Core/PowerPC/PowerPC.h @@ -96,10 +96,31 @@ struct PairedSingle static_assert(std::is_standard_layout<PairedSingle>(), "PairedSingle must be standard layout"); // This contains the entire state of the emulated PowerPC "Gekko" CPU. +// +// To minimize code size on x86, we want as much useful stuff in the first 256 bytes as possible. +// ps needs to be relatively late in the struct due to it being larger than 256 bytes in itself. +// +// On AArch64, most load/store instructions support fairly large immediate offsets, +// but not LDP/STP, which we want to use for accessing certain things. +// These must be in the first 520 bytes: gather_pipe_ptr, gather_pipe_base_ptr +// Better code is generated if these are in the first 260 bytes: gpr +// Better code is generated if these are in the first 520 bytes: ps +// Unfortunately not all of those fit in 520 bytes, but we can fit most of ps and all of the rest. 
struct PowerPCState { + // gather pipe pointer for JIT access + u8* gather_pipe_ptr; + u8* gather_pipe_base_ptr; + u32 gpr[32]; // General purpose registers. r1 = stack pointer. +#ifndef _M_X86_64 + // The paired singles are strange : PS0 is stored in the full 64 bits of each FPR + // but ps calculations are only done in 32-bit precision, and PS1 is only 32 bits. + // Since we want to use SIMD, SSE2 is the only viable alternative - 2x double. + alignas(16) PairedSingle ps[32]; +#endif + u32 pc; // program counter u32 npc; @@ -123,23 +144,12 @@ struct PowerPCState // lscbx u16 xer_stringctrl; - // gather pipe pointer for JIT access - u8* gather_pipe_ptr; - u8* gather_pipe_base_ptr; - #if _M_X86_64 - // This member exists for the purpose of an assertion in x86 JitBase.cpp - // that its offset <= 0x100. To minimize code size on x86, we want as much - // useful stuff in the one-byte offset range as possible - which is why ps - // is sitting down here. It currently doesn't make a difference on other - // supported architectures. + // This member exists only for the purpose of an assertion that its offset <= 0x100. std::tuple<> above_fits_in_first_0x100; -#endif - // The paired singles are strange : PS0 is stored in the full 64 bits of each FPR - // but ps calculations are only done in 32-bit precision, and PS1 is only 32 bits. - // Since we want to use SIMD, SSE2 is the only viable alternative - 2x double. alignas(16) PairedSingle ps[32]; +#endif u32 sr[16]; // Segment registers.