From 88f3fec04e3e3b277d071aa0db19cbe77db2696e Mon Sep 17 00:00:00 2001
From: JosJuice <josjuice@gmail.com>
Date: Sun, 24 Jan 2021 14:10:04 +0100
Subject: [PATCH 1/3] JitArm64: Add asserts for unexpected single to float
 conversions

If the register pressure is high when allocating registers,
Arm64FPRCache may spill a guest register which we are going to
allocate later during the current instruction, which has the
side effect of turning it into double precision. This will have
bad consequences if we are assuming that it is single precision,
so let's add some asserts to detect if that ever happens.
---
 .../JitArm64/JitArm64_FloatingPoint.cpp       | 37 ++++++++++++++++---
 .../Core/PowerPC/JitArm64/JitArm64_Paired.cpp | 16 ++++++++
 2 files changed, 47 insertions(+), 6 deletions(-)

diff --git a/Source/Core/Core/PowerPC/JitArm64/JitArm64_FloatingPoint.cpp b/Source/Core/Core/PowerPC/JitArm64/JitArm64_FloatingPoint.cpp
index 8a4eddaf5e..942342707d 100644
--- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_FloatingPoint.cpp
+++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_FloatingPoint.cpp
@@ -32,8 +32,11 @@ void JitArm64::fp_arith(UGeckoInstruction inst)
   bool use_c = op5 >= 25;  // fmul and all kind of fmaddXX
   bool use_b = op5 != 25;  // fmul uses no B
 
-  bool inputs_are_singles = fpr.IsSingle(a, !packed) && (!use_b || fpr.IsSingle(b, !packed)) &&
-                            (!use_c || fpr.IsSingle(c, !packed));
+  const auto inputs_are_singles_func = [&] {
+    return fpr.IsSingle(a, !packed) && (!use_b || fpr.IsSingle(b, !packed)) &&
+           (!use_c || fpr.IsSingle(c, !packed));
+  };
+  const bool inputs_are_singles = inputs_are_singles_func();
 
   ARM64Reg VA{}, VB{}, VC{}, VD{};
 
@@ -117,6 +120,9 @@ void JitArm64::fp_arith(UGeckoInstruction inst)
     }
   }
 
+  ASSERT_MSG(DYNA_REC, inputs_are_singles == inputs_are_singles_func(),
+             "Register allocation turned singles into doubles in the middle of fp_arith");
+
   if (single || packed)
     fpr.FixSinglePrecision(d);
 }
@@ -196,6 +202,9 @@ void JitArm64::fp_logic(UGeckoInstruction inst)
       break;
     }
   }
+
+  ASSERT_MSG(DYNA_REC, single == fpr.IsSingle(b, !packed),
+             "Register allocation turned singles into doubles in the middle of fp_logic");
 }
 
 void JitArm64::fselx(UGeckoInstruction inst)
@@ -209,6 +218,7 @@ void JitArm64::fselx(UGeckoInstruction inst)
   const u32 c = inst.FC;
   const u32 d = inst.FD;
 
+  const bool a_single = fpr.IsSingle(a, true);
   if (fpr.IsSingle(a, true))
   {
     const ARM64Reg VA = fpr.R(a, RegType::LowerPairSingle);
@@ -220,15 +230,20 @@ void JitArm64::fselx(UGeckoInstruction inst)
     m_float_emit.FCMPE(EncodeRegToDouble(VA));
   }
 
-  const bool single = fpr.IsSingle(b, true) && fpr.IsSingle(c, true);
-  const RegType type = single ? RegType::LowerPairSingle : RegType::LowerPair;
-  const auto reg_encoder = single ? EncodeRegToSingle : EncodeRegToDouble;
+  const bool b_and_c_singles = fpr.IsSingle(b, true) && fpr.IsSingle(c, true);
+  const RegType type = b_and_c_singles ? RegType::LowerPairSingle : RegType::LowerPair;
+  const auto reg_encoder = b_and_c_singles ? EncodeRegToSingle : EncodeRegToDouble;
 
   const ARM64Reg VB = fpr.R(b, type);
   const ARM64Reg VC = fpr.R(c, type);
   const ARM64Reg VD = fpr.RW(d, type);
 
   m_float_emit.FCSEL(reg_encoder(VD), reg_encoder(VC), reg_encoder(VB), CC_GE);
+
+  ASSERT_MSG(DYNA_REC,
+             a_single == fpr.IsSingle(a, true) &&
+                 b_and_c_singles == (fpr.IsSingle(b, true) && fpr.IsSingle(c, true)),
+             "Register allocation turned singles into doubles in the middle of fselx");
 }
 
 void JitArm64::frspx(UGeckoInstruction inst)
@@ -241,7 +256,8 @@ void JitArm64::frspx(UGeckoInstruction inst)
   const u32 b = inst.FB;
   const u32 d = inst.FD;
 
-  if (fpr.IsSingle(b, true))
+  const bool single = fpr.IsSingle(b, true);
+  if (single)
   {
     // Source is already in single precision, so no need to do anything but to copy to PSR1.
     const ARM64Reg VB = fpr.R(b, RegType::LowerPairSingle);
@@ -257,6 +273,9 @@ void JitArm64::frspx(UGeckoInstruction inst)
 
     m_float_emit.FCVT(32, 64, EncodeRegToDouble(VD), EncodeRegToDouble(VB));
   }
+
+  ASSERT_MSG(DYNA_REC, b == d || single == fpr.IsSingle(b, true),
+             "Register allocation turned singles into doubles in the middle of frspx");
 }
 
 void JitArm64::fcmpX(UGeckoInstruction inst)
@@ -320,6 +339,9 @@ void JitArm64::fcmpX(UGeckoInstruction inst)
     SetJumpTarget(continue3);
   }
   SetJumpTarget(continue1);
+
+  ASSERT_MSG(DYNA_REC, singles == (fpr.IsSingle(a, true) && fpr.IsSingle(b, true)),
+             "Register allocation turned singles into doubles in the middle of fcmpX");
 }
 
 void JitArm64::fctiwzx(UGeckoInstruction inst)
@@ -357,4 +379,7 @@ void JitArm64::fctiwzx(UGeckoInstruction inst)
   }
   m_float_emit.ORR(EncodeRegToDouble(VD), EncodeRegToDouble(VD), EncodeRegToDouble(V0));
   fpr.Unlock(V0);
+
+  ASSERT_MSG(DYNA_REC, b == d || single == fpr.IsSingle(b, true),
+             "Register allocation turned singles into doubles in the middle of fctiwzx");
 }
diff --git a/Source/Core/Core/PowerPC/JitArm64/JitArm64_Paired.cpp b/Source/Core/Core/PowerPC/JitArm64/JitArm64_Paired.cpp
index faa58e6ad0..00cd92c39b 100644
--- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_Paired.cpp
+++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_Paired.cpp
@@ -66,6 +66,9 @@ void JitArm64::ps_mergeXX(UGeckoInstruction inst)
     ASSERT_MSG(DYNA_REC, 0, "ps_merge - invalid op");
     break;
   }
+
+  ASSERT_MSG(DYNA_REC, singles == (fpr.IsSingle(a) && fpr.IsSingle(b)),
+             "Register allocation turned singles into doubles in the middle of ps_mergeXX");
 }
 
 void JitArm64::ps_mulsX(UGeckoInstruction inst)
@@ -92,6 +95,9 @@ void JitArm64::ps_mulsX(UGeckoInstruction inst)
 
   m_float_emit.FMUL(size, reg_encoder(VD), reg_encoder(VA), reg_encoder(VC), upper ? 1 : 0);
 
+  ASSERT_MSG(DYNA_REC, singles == (fpr.IsSingle(a) && fpr.IsSingle(c)),
+             "Register allocation turned singles into doubles in the middle of ps_mulsX");
+
   fpr.FixSinglePrecision(d);
 }
 
@@ -250,6 +256,10 @@ void JitArm64::ps_maddXX(UGeckoInstruction inst)
     ASSERT_MSG(DYNA_REC, 0, "ps_madd - invalid op");
     break;
   }
+
+  ASSERT_MSG(DYNA_REC, singles == (fpr.IsSingle(a) && fpr.IsSingle(b) && fpr.IsSingle(c)),
+             "Register allocation turned singles into doubles in the middle of ps_maddXX");
+
   fpr.FixSinglePrecision(d);
 
   if (V0Q != INVALID_REG)
@@ -291,6 +301,9 @@ void JitArm64::ps_sel(UGeckoInstruction inst)
     m_float_emit.MOV(VD, V0);
     fpr.Unlock(V0Q);
   }
+
+  ASSERT_MSG(DYNA_REC, singles == (fpr.IsSingle(a) && fpr.IsSingle(b) && fpr.IsSingle(c)),
+             "Register allocation turned singles into doubles in the middle of ps_sel");
 }
 
 void JitArm64::ps_sumX(UGeckoInstruction inst)
@@ -330,6 +343,9 @@ void JitArm64::ps_sumX(UGeckoInstruction inst)
     m_float_emit.INS(size, VD, upper ? 1 : 0, V0, upper ? 1 : 0);
   }
 
+  ASSERT_MSG(DYNA_REC, singles == (fpr.IsSingle(a) && fpr.IsSingle(b) && fpr.IsSingle(c)),
+             "Register allocation turned singles into doubles in the middle of ps_sumX");
+
   fpr.FixSinglePrecision(d);
 
   fpr.Unlock(V0);

From f17cd3750a5785033d9f2f9e80e97ee3d0a66ba8 Mon Sep 17 00:00:00 2001
From: JosJuice <josjuice@gmail.com>
Date: Sun, 24 Jan 2021 14:27:07 +0100
Subject: [PATCH 2/3] JitArm64: Remove default parameters from
 Arm64FPRCache::R/RW

It obscures more than it helps in my opinion.
---
 Source/Core/Core/PowerPC/JitArm64/JitArm64_FloatingPoint.cpp | 2 +-
 Source/Core/Core/PowerPC/JitArm64/JitArm64_RegCache.h        | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/Source/Core/Core/PowerPC/JitArm64/JitArm64_FloatingPoint.cpp b/Source/Core/Core/PowerPC/JitArm64/JitArm64_FloatingPoint.cpp
index 942342707d..08e3d2ab9b 100644
--- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_FloatingPoint.cpp
+++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_FloatingPoint.cpp
@@ -356,7 +356,7 @@ void JitArm64::fctiwzx(UGeckoInstruction inst)
   const bool single = fpr.IsSingle(b, true);
 
   const ARM64Reg VB = fpr.R(b, single ? RegType::LowerPairSingle : RegType::LowerPair);
-  const ARM64Reg VD = fpr.RW(d);
+  const ARM64Reg VD = fpr.RW(d, RegType::LowerPair);
 
   const ARM64Reg V0 = fpr.GetReg();
 
diff --git a/Source/Core/Core/PowerPC/JitArm64/JitArm64_RegCache.h b/Source/Core/Core/PowerPC/JitArm64/JitArm64_RegCache.h
index b074001830..4e37df9528 100644
--- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_RegCache.h
+++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_RegCache.h
@@ -281,9 +281,9 @@ public:
 
   // Returns a guest register inside of a host register
   // Will dump an immediate to the host register as well
-  Arm64Gen::ARM64Reg R(size_t preg, RegType type = RegType::LowerPair);
+  Arm64Gen::ARM64Reg R(size_t preg, RegType type);
 
-  Arm64Gen::ARM64Reg RW(size_t preg, RegType type = RegType::LowerPair);
+  Arm64Gen::ARM64Reg RW(size_t preg, RegType type);
 
   BitSet32 GetCallerSavedUsed() const override;
 

From d00430470b73888896ee30caed5c1a1a72867c30 Mon Sep 17 00:00:00 2001
From: JosJuice <josjuice@gmail.com>
Date: Sun, 24 Jan 2021 14:56:38 +0100
Subject: [PATCH 3/3] JitArm64: Update registers last used before start of
 instruction

Let's reset m_last_used for each register that will be used
in an instruction before we start allocating any of them,
so that one of the earlier allocations doesn't spill a
register that we want in a later allocation. (We must still
also increment/reset m_last_used in R and RW, otherwise we
end up in trouble when emulating lmw/stmw since those access
more guest registers than there are available host registers.)

This should ensure that the asserts added earlier in this
pull request are never triggered.
---
 Source/Core/Core/PowerPC/JitArm64/Jit.cpp          |  9 +++++++++
 .../Core/PowerPC/JitArm64/JitArm64_RegCache.cpp    | 14 +++++++++++++-
 .../Core/Core/PowerPC/JitArm64/JitArm64_RegCache.h |  2 ++
 3 files changed, 24 insertions(+), 1 deletion(-)

diff --git a/Source/Core/Core/PowerPC/JitArm64/Jit.cpp b/Source/Core/Core/PowerPC/JitArm64/Jit.cpp
index 4a22d1c964..8555f2b5f2 100644
--- a/Source/Core/Core/PowerPC/JitArm64/Jit.cpp
+++ b/Source/Core/Core/PowerPC/JitArm64/Jit.cpp
@@ -694,6 +694,15 @@ void JitArm64::DoJit(u32 em_address, JitBlock* b, u32 nextPC)
     if (!SConfig::GetInstance().bEnableDebugging)
       js.downcountAmount += PatchEngine::GetSpeedhackCycles(js.compilerPC);
 
+    // Skip calling UpdateLastUsed for lmw/stmw - it usually hurts more than it helps
+    if (op.inst.OPCD != 46 && op.inst.OPCD != 47)
+      gpr.UpdateLastUsed(op.regsIn | op.regsOut);
+
+    BitSet32 fpr_used = op.fregsIn;
+    if (op.fregOut >= 0)
+      fpr_used[op.fregOut] = true;
+    fpr.UpdateLastUsed(fpr_used);
+
     // Gather pipe writes using a non-immediate address are discovered by profiling.
     bool gatherPipeIntCheck = js.fifoWriteAddresses.find(op.address) != js.fifoWriteAddresses.end();
 
diff --git a/Source/Core/Core/PowerPC/JitArm64/JitArm64_RegCache.cpp b/Source/Core/Core/PowerPC/JitArm64/JitArm64_RegCache.cpp
index 4a3f255c71..d0d861d992 100644
--- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_RegCache.cpp
+++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_RegCache.cpp
@@ -41,10 +41,22 @@ ARM64Reg Arm64RegCache::GetReg()
   // Holy cow, how did you run out of registers?
   // We can't return anything reasonable in this case. Return INVALID_REG and watch the failure
   // happen
-  WARN_LOG_FMT(DYNA_REC, "All available registers are locked dumb dumb");
+  ASSERT_MSG(DYNA_REC, 0, "All available registers are locked!");
   return INVALID_REG;
 }
 
+void Arm64RegCache::UpdateLastUsed(BitSet32 regs_used)
+{
+  for (size_t i = 0; i < m_guest_registers.size(); ++i)
+  {
+    OpArg& reg = m_guest_registers[i];
+    if (i < 32 && regs_used[i])
+      reg.ResetLastUsed();
+    else
+      reg.IncrementLastUsed();
+  }
+}
+
 u32 Arm64RegCache::GetUnlockedRegisterCount() const
 {
   u32 unlocked_registers = 0;
diff --git a/Source/Core/Core/PowerPC/JitArm64/JitArm64_RegCache.h b/Source/Core/Core/PowerPC/JitArm64/JitArm64_RegCache.h
index 4e37df9528..55c86bd4ab 100644
--- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_RegCache.h
+++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_RegCache.h
@@ -140,6 +140,8 @@ public:
   // Requires unlocking after done
   Arm64Gen::ARM64Reg GetReg();
 
+  void UpdateLastUsed(BitSet32 regs_used);
+
   // Locks a register so a cache cannot use it
   // Useful for function calls
   template <typename T = Arm64Gen::ARM64Reg, typename... Args>