Merge pull request #9486 from JosJuice/storesafe

PPCAnalyst: Rework the store-safe logic
2021-03-24 12:39:00 +01:00 · 2021-03-24 12:39:00 +01:00 · 75f8ce10f9
parent e2b5026652 1845c5948d
commit 75f8ce10f9
4 changed files with 64 additions and 32 deletions
--- a/Source/Core/Core/PowerPC/Interpreter/Interpreter_Tables.cpp
+++ b/Source/Core/Core/PowerPC/Interpreter/Interpreter_Tables.cpp
@ -97,16 +97,16 @@ static std::array<GekkoOPTemplate, 13> table4 =
 {{    //SUBOP10
 	{0,    Interpreter::ps_cmpu0,   {"ps_cmpu0",   OpType::PS, FL_IN_FLOAT_AB | FL_SET_CRn | FL_USE_FPU | FL_READ_FPRF | FL_SET_FPRF, 1, 0, 0, 0}},
 	{32,   Interpreter::ps_cmpo0,   {"ps_cmpo0",   OpType::PS, FL_IN_FLOAT_AB | FL_SET_CRn | FL_USE_FPU | FL_READ_FPRF | FL_SET_FPRF, 1, 0, 0, 0}},
-	{40,   Interpreter::ps_neg,     {"ps_neg",     OpType::PS, FL_OUT_FLOAT_D | FL_IN_FLOAT_B | FL_RC_BIT_F | FL_USE_FPU, 1, 0, 0, 0}},
-	{136,  Interpreter::ps_nabs,    {"ps_nabs",    OpType::PS, FL_OUT_FLOAT_D | FL_IN_FLOAT_B | FL_RC_BIT_F | FL_USE_FPU, 1, 0, 0, 0}},
-	{264,  Interpreter::ps_abs,     {"ps_abs",     OpType::PS, FL_OUT_FLOAT_D | FL_IN_FLOAT_B | FL_RC_BIT_F | FL_USE_FPU, 1, 0, 0, 0}},
+	{40,   Interpreter::ps_neg,     {"ps_neg",     OpType::PS, FL_OUT_FLOAT_D | FL_IN_FLOAT_B | FL_IN_FLOAT_B_BITEXACT | FL_RC_BIT_F | FL_USE_FPU, 1, 0, 0, 0}},
+	{136,  Interpreter::ps_nabs,    {"ps_nabs",    OpType::PS, FL_OUT_FLOAT_D | FL_IN_FLOAT_B | FL_IN_FLOAT_B_BITEXACT | FL_RC_BIT_F | FL_USE_FPU, 1, 0, 0, 0}},
+	{264,  Interpreter::ps_abs,     {"ps_abs",     OpType::PS, FL_OUT_FLOAT_D | FL_IN_FLOAT_B | FL_IN_FLOAT_B_BITEXACT | FL_RC_BIT_F | FL_USE_FPU, 1, 0, 0, 0}},
 	{64,   Interpreter::ps_cmpu1,   {"ps_cmpu1",   OpType::PS, FL_IN_FLOAT_AB | FL_SET_CRn | FL_USE_FPU | FL_READ_FPRF | FL_SET_FPRF, 1, 0, 0, 0}},
-	{72,   Interpreter::ps_mr,      {"ps_mr",      OpType::PS, FL_OUT_FLOAT_D | FL_IN_FLOAT_B | FL_RC_BIT_F | FL_USE_FPU, 1, 0, 0, 0}},
+	{72,   Interpreter::ps_mr,      {"ps_mr",      OpType::PS, FL_OUT_FLOAT_D | FL_IN_FLOAT_B | FL_IN_FLOAT_B_BITEXACT | FL_RC_BIT_F | FL_USE_FPU, 1, 0, 0, 0}},
 	{96,   Interpreter::ps_cmpo1,   {"ps_cmpo1",   OpType::PS, FL_IN_FLOAT_AB | FL_SET_CRn | FL_USE_FPU | FL_READ_FPRF | FL_SET_FPRF, 1, 0, 0, 0}},
-	{528,  Interpreter::ps_merge00, {"ps_merge00", OpType::PS, FL_OUT_FLOAT_D | FL_IN_FLOAT_AB | FL_RC_BIT_F | FL_USE_FPU, 1, 0, 0, 0}},
-	{560,  Interpreter::ps_merge01, {"ps_merge01", OpType::PS, FL_OUT_FLOAT_D | FL_IN_FLOAT_AB | FL_RC_BIT_F | FL_USE_FPU, 1, 0, 0, 0}},
-	{592,  Interpreter::ps_merge10, {"ps_merge10", OpType::PS, FL_OUT_FLOAT_D | FL_IN_FLOAT_AB | FL_RC_BIT_F | FL_USE_FPU, 1, 0, 0, 0}},
-	{624,  Interpreter::ps_merge11, {"ps_merge11", OpType::PS, FL_OUT_FLOAT_D | FL_IN_FLOAT_AB | FL_RC_BIT_F | FL_USE_FPU, 1, 0, 0, 0}},
+	{528,  Interpreter::ps_merge00, {"ps_merge00", OpType::PS, FL_OUT_FLOAT_D | FL_IN_FLOAT_AB | FL_IN_FLOAT_AB_BITEXACT | FL_RC_BIT_F | FL_USE_FPU, 1, 0, 0, 0}},
+	{560,  Interpreter::ps_merge01, {"ps_merge01", OpType::PS, FL_OUT_FLOAT_D | FL_IN_FLOAT_AB | FL_IN_FLOAT_AB_BITEXACT | FL_RC_BIT_F | FL_USE_FPU, 1, 0, 0, 0}},
+	{592,  Interpreter::ps_merge10, {"ps_merge10", OpType::PS, FL_OUT_FLOAT_D | FL_IN_FLOAT_AB | FL_IN_FLOAT_AB_BITEXACT | FL_RC_BIT_F | FL_USE_FPU, 1, 0, 0, 0}},
+	{624,  Interpreter::ps_merge11, {"ps_merge11", OpType::PS, FL_OUT_FLOAT_D | FL_IN_FLOAT_AB | FL_IN_FLOAT_AB_BITEXACT | FL_RC_BIT_F | FL_USE_FPU, 1, 0, 0, 0}},

 	{1014, Interpreter::dcbz_l,     {"dcbz_l",     OpType::System, FL_IN_A0B | FL_LOADSTORE, 1, 0, 0, 0}},
 }};
@ -122,7 +122,7 @@ static std::array<GekkoOPTemplate, 17> table4_2 =
 	{18, Interpreter::ps_div,       {"ps_div",    OpType::PS, FL_OUT_FLOAT_D | FL_IN_FLOAT_AB | FL_RC_BIT_F | FL_USE_FPU | FL_SET_FPRF, 17, 0, 0, 0}},
 	{20, Interpreter::ps_sub,       {"ps_sub",    OpType::PS, FL_OUT_FLOAT_D | FL_IN_FLOAT_AB | FL_RC_BIT_F | FL_USE_FPU | FL_SET_FPRF, 1, 0, 0, 0}},
 	{21, Interpreter::ps_add,       {"ps_add",    OpType::PS, FL_OUT_FLOAT_D | FL_IN_FLOAT_AB | FL_RC_BIT_F | FL_USE_FPU | FL_SET_FPRF, 1, 0, 0, 0}},
-	{23, Interpreter::ps_sel,       {"ps_sel",    OpType::PS, FL_OUT_FLOAT_D | FL_IN_FLOAT_ABC | FL_RC_BIT_F | FL_USE_FPU, 1, 0, 0, 0}},
+	{23, Interpreter::ps_sel,       {"ps_sel",    OpType::PS, FL_OUT_FLOAT_D | FL_IN_FLOAT_ABC | FL_IN_FLOAT_BC_BITEXACT | FL_RC_BIT_F | FL_USE_FPU, 1, 0, 0, 0}},
 	{24, Interpreter::ps_res,       {"ps_res",    OpType::PS, FL_OUT_FLOAT_D | FL_IN_FLOAT_B | FL_RC_BIT_F | FL_USE_FPU | FL_SET_FPRF, 1, 0, 0, 0}},
 	{25, Interpreter::ps_mul,       {"ps_mul",    OpType::PS, FL_OUT_FLOAT_D | FL_IN_FLOAT_AC | FL_RC_BIT_F | FL_USE_FPU | FL_SET_FPRF, 1, 0, 0, 0}},
 	{26, Interpreter::ps_rsqrte,    {"ps_rsqrte", OpType::PS, FL_OUT_FLOAT_D | FL_IN_FLOAT_B | FL_RC_BIT_F | FL_USE_FPU | FL_SET_FPRF, 2, 0, 0, 0}},
@ -318,7 +318,7 @@ static std::array<GekkoOPTemplate, 9> table59 =

 static std::array<GekkoOPTemplate, 15> table63 =
 {{
-	{264, Interpreter::fabsx,       {"fabsx",   OpType::DoubleFP, FL_INOUT_FLOAT_D | FL_IN_FLOAT_B | FL_RC_BIT_F | FL_USE_FPU, 1, 0, 0, 0}},
+	{264, Interpreter::fabsx,       {"fabsx",   OpType::DoubleFP, FL_INOUT_FLOAT_D | FL_IN_FLOAT_B | FL_IN_FLOAT_B_BITEXACT | FL_RC_BIT_F | FL_USE_FPU, 1, 0, 0, 0}},

 	// FIXME: fcmp modifies the FPRF flags, but if the flags are clobbered later,
 	// we don't actually need to calculate or store them here. So FL_READ_FPRF and FL_SET_FPRF is not
@ -329,9 +329,9 @@ static std::array<GekkoOPTemplate, 15> table63 =

 	{14,  Interpreter::fctiwx,      {"fctiwx",  OpType::DoubleFP, FL_INOUT_FLOAT_D | FL_IN_FLOAT_B | FL_RC_BIT_F | FL_USE_FPU, 1, 0, 0, 0}},
 	{15,  Interpreter::fctiwzx,     {"fctiwzx", OpType::DoubleFP, FL_INOUT_FLOAT_D | FL_IN_FLOAT_B | FL_RC_BIT_F | FL_USE_FPU, 1, 0, 0, 0}},
-	{72,  Interpreter::fmrx,        {"fmrx",    OpType::DoubleFP, FL_INOUT_FLOAT_D | FL_IN_FLOAT_B | FL_RC_BIT_F | FL_USE_FPU, 1, 0, 0, 0}},
-	{136, Interpreter::fnabsx,      {"fnabsx",  OpType::DoubleFP, FL_INOUT_FLOAT_D | FL_IN_FLOAT_B | FL_RC_BIT_F | FL_USE_FPU, 1, 0, 0, 0}},
-	{40,  Interpreter::fnegx,       {"fnegx",   OpType::DoubleFP, FL_INOUT_FLOAT_D | FL_IN_FLOAT_B | FL_RC_BIT_F | FL_USE_FPU, 1, 0, 0, 0}},
+	{72,  Interpreter::fmrx,        {"fmrx",    OpType::DoubleFP, FL_INOUT_FLOAT_D | FL_IN_FLOAT_B | FL_IN_FLOAT_B_BITEXACT | FL_RC_BIT_F | FL_USE_FPU, 1, 0, 0, 0}},
+	{136, Interpreter::fnabsx,      {"fnabsx",  OpType::DoubleFP, FL_INOUT_FLOAT_D | FL_IN_FLOAT_B | FL_RC_BIT_F | FL_IN_FLOAT_B_BITEXACT | FL_USE_FPU, 1, 0, 0, 0}},
+	{40,  Interpreter::fnegx,       {"fnegx",   OpType::DoubleFP, FL_INOUT_FLOAT_D | FL_IN_FLOAT_B | FL_RC_BIT_F | FL_IN_FLOAT_B_BITEXACT | FL_USE_FPU, 1, 0, 0, 0}},
 	{12,  Interpreter::frspx,       {"frspx",   OpType::DoubleFP, FL_OUT_FLOAT_D | FL_IN_FLOAT_B | FL_RC_BIT_F | FL_USE_FPU | FL_SET_FPRF, 1, 0, 0, 0}},

 	{64,  Interpreter::mcrfs,       {"mcrfs",   OpType::SystemFP, FL_SET_CRn | FL_USE_FPU | FL_READ_FPRF, 1, 0, 0, 0}},
@ -347,7 +347,7 @@ static std::array<GekkoOPTemplate, 10> table63_2 =
 	{18, Interpreter::fdivx,        {"fdivx",    OpType::DoubleFP, FL_INOUT_FLOAT_D | FL_IN_FLOAT_AB | FL_RC_BIT_F | FL_USE_FPU | FL_SET_FPRF, 31, 0, 0, 0}},
 	{20, Interpreter::fsubx,        {"fsubx",    OpType::DoubleFP, FL_INOUT_FLOAT_D | FL_IN_FLOAT_AB | FL_RC_BIT_F | FL_USE_FPU | FL_SET_FPRF, 1, 0, 0, 0}},
 	{21, Interpreter::faddx,        {"faddx",    OpType::DoubleFP, FL_INOUT_FLOAT_D | FL_IN_FLOAT_AB | FL_RC_BIT_F | FL_USE_FPU | FL_SET_FPRF, 1, 0, 0, 0}},
-	{23, Interpreter::fselx,        {"fselx",    OpType::DoubleFP, FL_INOUT_FLOAT_D | FL_IN_FLOAT_ABC | FL_RC_BIT_F | FL_USE_FPU, 1, 0, 0, 0}},
+	{23, Interpreter::fselx,        {"fselx",    OpType::DoubleFP, FL_INOUT_FLOAT_D | FL_IN_FLOAT_ABC | FL_IN_FLOAT_BC_BITEXACT | FL_RC_BIT_F | FL_USE_FPU, 1, 0, 0, 0}},
 	{25, Interpreter::fmulx,        {"fmulx",    OpType::DoubleFP, FL_INOUT_FLOAT_D | FL_IN_FLOAT_AC | FL_RC_BIT_F | FL_USE_FPU | FL_SET_FPRF, 1, 0, 0, 0}},
 	{26, Interpreter::frsqrtex,     {"frsqrtex", OpType::DoubleFP, FL_INOUT_FLOAT_D | FL_IN_FLOAT_B | FL_RC_BIT_F | FL_USE_FPU | FL_SET_FPRF, 1, 0, 0, 0}},
 	{28, Interpreter::fmsubx,       {"fmsubx",   OpType::DoubleFP, FL_INOUT_FLOAT_D | FL_IN_FLOAT_ABC | FL_RC_BIT_F | FL_USE_FPU | FL_SET_FPRF, 1, 0, 0, 0}},
--- a/Source/Core/Core/PowerPC/PPCAnalyst.cpp
+++ b/Source/Core/Core/PowerPC/PPCAnalyst.cpp
@ -971,37 +971,63 @@ u32 PPCAnalyzer::Analyze(u32 address, CodeBlock* block, CodeBuffer* buffer, std:
    op.fprIsStoreSafe = fprIsStoreSafe;
    if (op.fregOut >= 0)
    {
-      fprIsSingle[op.fregOut] = false;
-      fprIsDuplicated[op.fregOut] = false;
-      fprIsStoreSafe[op.fregOut] = false;
-      // Single, duplicated, and doesn't need PPC_FP.
      if (op.opinfo->type == OpType::SingleFP)
      {
        fprIsSingle[op.fregOut] = true;
        fprIsDuplicated[op.fregOut] = true;
-        fprIsStoreSafe[op.fregOut] = true;
      }
-      // Single and duplicated, but might be a denormal (not safe to skip PPC_FP).
-      // TODO: if we go directly from a load to store, skip conversion entirely?
-      // TODO: if we go directly from a load to a float instruction, and the value isn't used
-      // for anything else, we can skip PPC_FP on a load too.
-      if (!strncmp(op.opinfo->opname, "lfs", 3))
+      else if (!strncmp(op.opinfo->opname, "lfs", 3))
      {
        fprIsSingle[op.fregOut] = true;
        fprIsDuplicated[op.fregOut] = true;
      }
-      // Paired are still floats, but the top/bottom halves may differ.
-      if (op.opinfo->type == OpType::PS || op.opinfo->type == OpType::LoadPS)
+      else if (op.opinfo->type == OpType::PS || op.opinfo->type == OpType::LoadPS)
      {
        fprIsSingle[op.fregOut] = true;
-        fprIsStoreSafe[op.fregOut] = true;
+        fprIsDuplicated[op.fregOut] = false;
      }
-      // Careful: changing the float mode in a block breaks this optimization, since
-      // a previous float op might have had had FTZ off while the later store has FTZ
-      // on. So, discard all information we have.
+      else
+      {
+        fprIsSingle[op.fregOut] = false;
+        fprIsDuplicated[op.fregOut] = false;
+      }
+
      if (!strncmp(op.opinfo->opname, "mtfs", 4))
+      {
+        // Careful: changing the float mode in a block breaks the store-safe optimization,
+        // since a previous float op might have had FTZ off while the later store has FTZ on.
+        // So, discard all information we have.
        fprIsStoreSafe = BitSet32(0);
      }
+      else if (op.opinfo->flags &
+               (FL_IN_FLOAT_A_BITEXACT | FL_IN_FLOAT_B_BITEXACT | FL_IN_FLOAT_C_BITEXACT))
+      {
+        // If the instruction copies bits between registers (without flushing denormals to zero
+        // or turning SNaN into QNaN), the output is store-safe if the inputs are.
+
+        BitSet32 bitexact_inputs;
+        if (op.opinfo->flags & FL_IN_FLOAT_A_BITEXACT)
+          bitexact_inputs[code->inst.FA] = true;
+        if (op.opinfo->flags & FL_IN_FLOAT_B_BITEXACT)
+          bitexact_inputs[code->inst.FB] = true;
+        if (op.opinfo->flags & FL_IN_FLOAT_C_BITEXACT)
+          bitexact_inputs[code->inst.FC] = true;
+
+        fprIsStoreSafe[op.fregOut] = (fprIsStoreSafe & bitexact_inputs) == bitexact_inputs;
+      }
+      else
+      {
+        // Other FPU instructions are store-safe if they perform a single-precision
+        // arithmetic operation.
+
+        // TODO: if we go directly from a load to store, skip conversion entirely?
+        // TODO: if we go directly from a load to a float instruction, and the value isn't used
+        // for anything else, we can use fast single -> double conversion after the load.
+
+        fprIsStoreSafe[op.fregOut] =
+            (op.opinfo->type == OpType::SingleFP || op.opinfo->type == OpType::PS);
+      }
+    }

    if (op.opinfo->type == OpType::StorePS || op.opinfo->type == OpType::LoadPS)
    {
--- a/Source/Core/Core/PowerPC/PPCAnalyst.h
+++ b/Source/Core/Core/PowerPC/PPCAnalyst.h
@ -61,8 +61,9 @@ struct CodeOp  // 16B
  // instruction)
  BitSet32 fprIsDuplicated;
  // whether an fpr is the output of a single-precision arithmetic instruction, i.e. whether we can
-  // safely
-  // skip PPC_FP.
+  // convert between single and double formats by just using the host machine's instruction for it.
+  // (The reason why we can't always do this is because some games rely on the exact bits of
+  // denormals and SNaNs being preserved as long as no arithmetic operation is performed on them.)
  BitSet32 fprIsStoreSafe;
 };

--- a/Source/Core/Core/PowerPC/PPCTables.h
+++ b/Source/Core/Core/PowerPC/PPCTables.h
@ -59,6 +59,11 @@ enum
  FL_OUT_FLOAT_D = (1 << 28),  // frD is used as a destination.
  // Used in the case of double ops (they don't modify the top half of the output)
  FL_INOUT_FLOAT_D = FL_IN_FLOAT_D | FL_OUT_FLOAT_D,
+  FL_IN_FLOAT_A_BITEXACT = (1 << 29),  // The output is based on the exact bits in frA.
+  FL_IN_FLOAT_B_BITEXACT = (1 << 30),  // The output is based on the exact bits in frB.
+  FL_IN_FLOAT_C_BITEXACT = (1 << 31),  // The output is based on the exact bits in frC.
+  FL_IN_FLOAT_AB_BITEXACT = FL_IN_FLOAT_A_BITEXACT | FL_IN_FLOAT_B_BITEXACT,
+  FL_IN_FLOAT_BC_BITEXACT = FL_IN_FLOAT_B_BITEXACT | FL_IN_FLOAT_C_BITEXACT,
 };

 enum class OpType