diff --git a/Source/Core/Core/PowerPC/JitArm64/Jit.cpp b/Source/Core/Core/PowerPC/JitArm64/Jit.cpp
index de82dd4503..0dfb75f441 100644
--- a/Source/Core/Core/PowerPC/JitArm64/Jit.cpp
+++ b/Source/Core/Core/PowerPC/JitArm64/Jit.cpp
@@ -234,6 +234,8 @@ void JitArm64::Cleanup()
 {
   if (jo.optimizeGatherPipe && js.fifoBytesSinceCheck > 0)
   {
+    static_assert(PPCSTATE_OFF(gather_pipe_ptr) <= 504);
+    static_assert(PPCSTATE_OFF(gather_pipe_ptr) + 8 == PPCSTATE_OFF(gather_pipe_base_ptr));
     LDP(IndexType::Signed, X0, X1, PPC_REG, PPCSTATE_OFF(gather_pipe_ptr));
     SUB(X0, X0, X1);
     CMP(X0, GPFifo::GATHER_PIPE_SIZE);
diff --git a/Source/Core/Core/PowerPC/JitArm64/JitArm64_RegCache.cpp b/Source/Core/Core/PowerPC/JitArm64/JitArm64_RegCache.cpp
index d0d861d992..2c4ca41928 100644
--- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_RegCache.cpp
+++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_RegCache.cpp
@@ -220,19 +220,22 @@ void Arm64GPRCache::FlushRegisters(BitSet32 regs, bool maintain_state)
         if (reg1.IsDirty() && reg2.IsDirty() && reg1.GetType() == RegType::Register &&
             reg2.GetType() == RegType::Register)
         {
-          size_t ppc_offset = GetGuestByIndex(i).ppc_offset;
-          ARM64Reg RX1 = R(GetGuestByIndex(i));
-          ARM64Reg RX2 = R(GetGuestByIndex(i + 1));
-          m_emit->STP(IndexType::Signed, RX1, RX2, PPC_REG, u32(ppc_offset));
-          if (!maintain_state)
+          const size_t ppc_offset = GetGuestByIndex(i).ppc_offset;
+          if (ppc_offset <= 252)
           {
-            UnlockRegister(DecodeReg(RX1));
-            UnlockRegister(DecodeReg(RX2));
-            reg1.Flush();
-            reg2.Flush();
+            ARM64Reg RX1 = R(GetGuestByIndex(i));
+            ARM64Reg RX2 = R(GetGuestByIndex(i + 1));
+            m_emit->STP(IndexType::Signed, RX1, RX2, PPC_REG, u32(ppc_offset));
+            if (!maintain_state)
+            {
+              UnlockRegister(DecodeReg(RX1));
+              UnlockRegister(DecodeReg(RX2));
+              reg1.Flush();
+              reg2.Flush();
+            }
+            ++i;
+            continue;
           }
-          ++i;
-          continue;
         }
       }
 
@@ -707,14 +710,18 @@ void Arm64FPRCache::FlushRegister(size_t preg, bool maintain_state)
   {
     if (dirty)
     {
-      // If the paired registers were at the start of ppcState we could do an STP here.
-      // Too bad moving them would break savestate compatibility between x86_64 and AArch64
-      // m_float_emit->STP(64, IndexType::Signed, host_reg, host_reg, PPC_REG,
-      // PPCSTATE_OFF(ps[preg].ps0));
-      m_float_emit->STR(64, IndexType::Unsigned, host_reg, PPC_REG,
-                        u32(PPCSTATE_OFF(ps[preg].ps0)));
-      m_float_emit->STR(64, IndexType::Unsigned, host_reg, PPC_REG,
-                        u32(PPCSTATE_OFF(ps[preg].ps1)));
+      if (PPCSTATE_OFF(ps[preg].ps0) <= 504)
+      {
+        m_float_emit->STP(64, IndexType::Signed, host_reg, host_reg, PPC_REG,
+                          PPCSTATE_OFF(ps[preg].ps0));
+      }
+      else
+      {
+        m_float_emit->STR(64, IndexType::Unsigned, host_reg, PPC_REG,
+                          u32(PPCSTATE_OFF(ps[preg].ps0)));
+        m_float_emit->STR(64, IndexType::Unsigned, host_reg, PPC_REG,
+                          u32(PPCSTATE_OFF(ps[preg].ps1)));
+      }
     }
 
     if (!maintain_state)
diff --git a/Source/Core/Core/PowerPC/PowerPC.h b/Source/Core/Core/PowerPC/PowerPC.h
index 9d551de056..3579d7a19e 100644
--- a/Source/Core/Core/PowerPC/PowerPC.h
+++ b/Source/Core/Core/PowerPC/PowerPC.h
@@ -96,10 +96,31 @@ struct PairedSingle
 static_assert(std::is_standard_layout<PairedSingle>(), "PairedSingle must be standard layout");
 
 // This contains the entire state of the emulated PowerPC "Gekko" CPU.
+//
+// To minimize code size on x86, we want as much useful stuff in the first 256 bytes as possible.
+// ps needs to be relatively late in the struct due to it being larger than 256 bytes in itself.
+//
+// On AArch64, most load/store instructions support fairly large immediate offsets,
+// but not LDP/STP, which we want to use for accessing certain things.
+// These must be in the first 520 bytes: gather_pipe_ptr, gather_pipe_base_ptr
+// Better code is generated if these are in the first 260 bytes: gpr
+// Better code is generated if these are in the first 520 bytes: ps
+// Unfortunately not all of those fit in 520 bytes, but we can fit most of ps and all of the rest.
 struct PowerPCState
 {
+  // gather pipe pointer for JIT access
+  u8* gather_pipe_ptr;
+  u8* gather_pipe_base_ptr;
+
   u32 gpr[32];  // General purpose registers. r1 = stack pointer.
 
+#ifndef _M_X86_64
+  // The paired singles are strange: PS0 is stored in the full 64 bits of each FPR
+  // but ps calculations are only done in 32-bit precision, and PS1 is only 32 bits.
+  // Since we want to use SIMD, SSE2 is the only viable alternative - 2x double.
+  alignas(16) PairedSingle ps[32];
+#endif
+
   u32 pc;  // program counter
   u32 npc;
 
@@ -123,23 +144,12 @@ struct PowerPCState
   // lscbx
   u16 xer_stringctrl;
 
-  // gather pipe pointer for JIT access
-  u8* gather_pipe_ptr;
-  u8* gather_pipe_base_ptr;
-
 #if _M_X86_64
-  // This member exists for the purpose of an assertion in x86 JitBase.cpp
-  // that its offset <= 0x100.  To minimize code size on x86, we want as much
-  // useful stuff in the one-byte offset range as possible - which is why ps
-  // is sitting down here.  It currently doesn't make a difference on other
-  // supported architectures.
+  // This member exists only for the purpose of an assertion that its offset <= 0x100.
   std::tuple<> above_fits_in_first_0x100;
-#endif
 
-  // The paired singles are strange : PS0 is stored in the full 64 bits of each FPR
-  // but ps calculations are only done in 32-bit precision, and PS1 is only 32 bits.
-  // Since we want to use SIMD, SSE2 is the only viable alternative - 2x double.
   alignas(16) PairedSingle ps[32];
+#endif
 
   u32 sr[16];  // Segment registers.