PPCAnalyst: Rework the store-safe logic

The output of instructions like fabsx and ps_sel is store-safe
if and only if the relevant inputs are. The old code was always
marking the output as store-safe if the output was a single,
and never otherwise.

Also, the old code was treating the output of psq_l/psq_lu as
store-safe, which seems incorrect (if dequantization is disabled).
This commit is contained in:
JosJuice 2021-02-02 20:50:29 +01:00
parent e2b5026652
commit 1845c5948d
4 changed files with 64 additions and 32 deletions

View File

@ -97,16 +97,16 @@ static std::array<GekkoOPTemplate, 13> table4 =
{{ //SUBOP10
{0, Interpreter::ps_cmpu0, {"ps_cmpu0", OpType::PS, FL_IN_FLOAT_AB | FL_SET_CRn | FL_USE_FPU | FL_READ_FPRF | FL_SET_FPRF, 1, 0, 0, 0}},
{32, Interpreter::ps_cmpo0, {"ps_cmpo0", OpType::PS, FL_IN_FLOAT_AB | FL_SET_CRn | FL_USE_FPU | FL_READ_FPRF | FL_SET_FPRF, 1, 0, 0, 0}},
{40, Interpreter::ps_neg, {"ps_neg", OpType::PS, FL_OUT_FLOAT_D | FL_IN_FLOAT_B | FL_RC_BIT_F | FL_USE_FPU, 1, 0, 0, 0}},
{136, Interpreter::ps_nabs, {"ps_nabs", OpType::PS, FL_OUT_FLOAT_D | FL_IN_FLOAT_B | FL_RC_BIT_F | FL_USE_FPU, 1, 0, 0, 0}},
{264, Interpreter::ps_abs, {"ps_abs", OpType::PS, FL_OUT_FLOAT_D | FL_IN_FLOAT_B | FL_RC_BIT_F | FL_USE_FPU, 1, 0, 0, 0}},
{40, Interpreter::ps_neg, {"ps_neg", OpType::PS, FL_OUT_FLOAT_D | FL_IN_FLOAT_B | FL_IN_FLOAT_B_BITEXACT | FL_RC_BIT_F | FL_USE_FPU, 1, 0, 0, 0}},
{136, Interpreter::ps_nabs, {"ps_nabs", OpType::PS, FL_OUT_FLOAT_D | FL_IN_FLOAT_B | FL_IN_FLOAT_B_BITEXACT | FL_RC_BIT_F | FL_USE_FPU, 1, 0, 0, 0}},
{264, Interpreter::ps_abs, {"ps_abs", OpType::PS, FL_OUT_FLOAT_D | FL_IN_FLOAT_B | FL_IN_FLOAT_B_BITEXACT | FL_RC_BIT_F | FL_USE_FPU, 1, 0, 0, 0}},
{64, Interpreter::ps_cmpu1, {"ps_cmpu1", OpType::PS, FL_IN_FLOAT_AB | FL_SET_CRn | FL_USE_FPU | FL_READ_FPRF | FL_SET_FPRF, 1, 0, 0, 0}},
{72, Interpreter::ps_mr, {"ps_mr", OpType::PS, FL_OUT_FLOAT_D | FL_IN_FLOAT_B | FL_RC_BIT_F | FL_USE_FPU, 1, 0, 0, 0}},
{72, Interpreter::ps_mr, {"ps_mr", OpType::PS, FL_OUT_FLOAT_D | FL_IN_FLOAT_B | FL_IN_FLOAT_B_BITEXACT | FL_RC_BIT_F | FL_USE_FPU, 1, 0, 0, 0}},
{96, Interpreter::ps_cmpo1, {"ps_cmpo1", OpType::PS, FL_IN_FLOAT_AB | FL_SET_CRn | FL_USE_FPU | FL_READ_FPRF | FL_SET_FPRF, 1, 0, 0, 0}},
{528, Interpreter::ps_merge00, {"ps_merge00", OpType::PS, FL_OUT_FLOAT_D | FL_IN_FLOAT_AB | FL_RC_BIT_F | FL_USE_FPU, 1, 0, 0, 0}},
{560, Interpreter::ps_merge01, {"ps_merge01", OpType::PS, FL_OUT_FLOAT_D | FL_IN_FLOAT_AB | FL_RC_BIT_F | FL_USE_FPU, 1, 0, 0, 0}},
{592, Interpreter::ps_merge10, {"ps_merge10", OpType::PS, FL_OUT_FLOAT_D | FL_IN_FLOAT_AB | FL_RC_BIT_F | FL_USE_FPU, 1, 0, 0, 0}},
{624, Interpreter::ps_merge11, {"ps_merge11", OpType::PS, FL_OUT_FLOAT_D | FL_IN_FLOAT_AB | FL_RC_BIT_F | FL_USE_FPU, 1, 0, 0, 0}},
{528, Interpreter::ps_merge00, {"ps_merge00", OpType::PS, FL_OUT_FLOAT_D | FL_IN_FLOAT_AB | FL_IN_FLOAT_AB_BITEXACT | FL_RC_BIT_F | FL_USE_FPU, 1, 0, 0, 0}},
{560, Interpreter::ps_merge01, {"ps_merge01", OpType::PS, FL_OUT_FLOAT_D | FL_IN_FLOAT_AB | FL_IN_FLOAT_AB_BITEXACT | FL_RC_BIT_F | FL_USE_FPU, 1, 0, 0, 0}},
{592, Interpreter::ps_merge10, {"ps_merge10", OpType::PS, FL_OUT_FLOAT_D | FL_IN_FLOAT_AB | FL_IN_FLOAT_AB_BITEXACT | FL_RC_BIT_F | FL_USE_FPU, 1, 0, 0, 0}},
{624, Interpreter::ps_merge11, {"ps_merge11", OpType::PS, FL_OUT_FLOAT_D | FL_IN_FLOAT_AB | FL_IN_FLOAT_AB_BITEXACT | FL_RC_BIT_F | FL_USE_FPU, 1, 0, 0, 0}},
{1014, Interpreter::dcbz_l, {"dcbz_l", OpType::System, FL_IN_A0B | FL_LOADSTORE, 1, 0, 0, 0}},
}};
@ -122,7 +122,7 @@ static std::array<GekkoOPTemplate, 17> table4_2 =
{18, Interpreter::ps_div, {"ps_div", OpType::PS, FL_OUT_FLOAT_D | FL_IN_FLOAT_AB | FL_RC_BIT_F | FL_USE_FPU | FL_SET_FPRF, 17, 0, 0, 0}},
{20, Interpreter::ps_sub, {"ps_sub", OpType::PS, FL_OUT_FLOAT_D | FL_IN_FLOAT_AB | FL_RC_BIT_F | FL_USE_FPU | FL_SET_FPRF, 1, 0, 0, 0}},
{21, Interpreter::ps_add, {"ps_add", OpType::PS, FL_OUT_FLOAT_D | FL_IN_FLOAT_AB | FL_RC_BIT_F | FL_USE_FPU | FL_SET_FPRF, 1, 0, 0, 0}},
{23, Interpreter::ps_sel, {"ps_sel", OpType::PS, FL_OUT_FLOAT_D | FL_IN_FLOAT_ABC | FL_RC_BIT_F | FL_USE_FPU, 1, 0, 0, 0}},
{23, Interpreter::ps_sel, {"ps_sel", OpType::PS, FL_OUT_FLOAT_D | FL_IN_FLOAT_ABC | FL_IN_FLOAT_BC_BITEXACT | FL_RC_BIT_F | FL_USE_FPU, 1, 0, 0, 0}},
{24, Interpreter::ps_res, {"ps_res", OpType::PS, FL_OUT_FLOAT_D | FL_IN_FLOAT_B | FL_RC_BIT_F | FL_USE_FPU | FL_SET_FPRF, 1, 0, 0, 0}},
{25, Interpreter::ps_mul, {"ps_mul", OpType::PS, FL_OUT_FLOAT_D | FL_IN_FLOAT_AC | FL_RC_BIT_F | FL_USE_FPU | FL_SET_FPRF, 1, 0, 0, 0}},
{26, Interpreter::ps_rsqrte, {"ps_rsqrte", OpType::PS, FL_OUT_FLOAT_D | FL_IN_FLOAT_B | FL_RC_BIT_F | FL_USE_FPU | FL_SET_FPRF, 2, 0, 0, 0}},
@ -318,7 +318,7 @@ static std::array<GekkoOPTemplate, 9> table59 =
static std::array<GekkoOPTemplate, 15> table63 =
{{
{264, Interpreter::fabsx, {"fabsx", OpType::DoubleFP, FL_INOUT_FLOAT_D | FL_IN_FLOAT_B | FL_RC_BIT_F | FL_USE_FPU, 1, 0, 0, 0}},
{264, Interpreter::fabsx, {"fabsx", OpType::DoubleFP, FL_INOUT_FLOAT_D | FL_IN_FLOAT_B | FL_IN_FLOAT_B_BITEXACT | FL_RC_BIT_F | FL_USE_FPU, 1, 0, 0, 0}},
// FIXME: fcmp modifies the FPRF flags, but if the flags are clobbered later,
// we don't actually need to calculate or store them here. So FL_READ_FPRF and FL_SET_FPRF is not
@ -329,9 +329,9 @@ static std::array<GekkoOPTemplate, 15> table63 =
{14, Interpreter::fctiwx, {"fctiwx", OpType::DoubleFP, FL_INOUT_FLOAT_D | FL_IN_FLOAT_B | FL_RC_BIT_F | FL_USE_FPU, 1, 0, 0, 0}},
{15, Interpreter::fctiwzx, {"fctiwzx", OpType::DoubleFP, FL_INOUT_FLOAT_D | FL_IN_FLOAT_B | FL_RC_BIT_F | FL_USE_FPU, 1, 0, 0, 0}},
{72, Interpreter::fmrx, {"fmrx", OpType::DoubleFP, FL_INOUT_FLOAT_D | FL_IN_FLOAT_B | FL_RC_BIT_F | FL_USE_FPU, 1, 0, 0, 0}},
{136, Interpreter::fnabsx, {"fnabsx", OpType::DoubleFP, FL_INOUT_FLOAT_D | FL_IN_FLOAT_B | FL_RC_BIT_F | FL_USE_FPU, 1, 0, 0, 0}},
{40, Interpreter::fnegx, {"fnegx", OpType::DoubleFP, FL_INOUT_FLOAT_D | FL_IN_FLOAT_B | FL_RC_BIT_F | FL_USE_FPU, 1, 0, 0, 0}},
{72, Interpreter::fmrx, {"fmrx", OpType::DoubleFP, FL_INOUT_FLOAT_D | FL_IN_FLOAT_B | FL_IN_FLOAT_B_BITEXACT | FL_RC_BIT_F | FL_USE_FPU, 1, 0, 0, 0}},
{136, Interpreter::fnabsx, {"fnabsx", OpType::DoubleFP, FL_INOUT_FLOAT_D | FL_IN_FLOAT_B | FL_RC_BIT_F | FL_IN_FLOAT_B_BITEXACT | FL_USE_FPU, 1, 0, 0, 0}},
{40, Interpreter::fnegx, {"fnegx", OpType::DoubleFP, FL_INOUT_FLOAT_D | FL_IN_FLOAT_B | FL_RC_BIT_F | FL_IN_FLOAT_B_BITEXACT | FL_USE_FPU, 1, 0, 0, 0}},
{12, Interpreter::frspx, {"frspx", OpType::DoubleFP, FL_OUT_FLOAT_D | FL_IN_FLOAT_B | FL_RC_BIT_F | FL_USE_FPU | FL_SET_FPRF, 1, 0, 0, 0}},
{64, Interpreter::mcrfs, {"mcrfs", OpType::SystemFP, FL_SET_CRn | FL_USE_FPU | FL_READ_FPRF, 1, 0, 0, 0}},
@ -347,7 +347,7 @@ static std::array<GekkoOPTemplate, 10> table63_2 =
{18, Interpreter::fdivx, {"fdivx", OpType::DoubleFP, FL_INOUT_FLOAT_D | FL_IN_FLOAT_AB | FL_RC_BIT_F | FL_USE_FPU | FL_SET_FPRF, 31, 0, 0, 0}},
{20, Interpreter::fsubx, {"fsubx", OpType::DoubleFP, FL_INOUT_FLOAT_D | FL_IN_FLOAT_AB | FL_RC_BIT_F | FL_USE_FPU | FL_SET_FPRF, 1, 0, 0, 0}},
{21, Interpreter::faddx, {"faddx", OpType::DoubleFP, FL_INOUT_FLOAT_D | FL_IN_FLOAT_AB | FL_RC_BIT_F | FL_USE_FPU | FL_SET_FPRF, 1, 0, 0, 0}},
{23, Interpreter::fselx, {"fselx", OpType::DoubleFP, FL_INOUT_FLOAT_D | FL_IN_FLOAT_ABC | FL_RC_BIT_F | FL_USE_FPU, 1, 0, 0, 0}},
{23, Interpreter::fselx, {"fselx", OpType::DoubleFP, FL_INOUT_FLOAT_D | FL_IN_FLOAT_ABC | FL_IN_FLOAT_BC_BITEXACT | FL_RC_BIT_F | FL_USE_FPU, 1, 0, 0, 0}},
{25, Interpreter::fmulx, {"fmulx", OpType::DoubleFP, FL_INOUT_FLOAT_D | FL_IN_FLOAT_AC | FL_RC_BIT_F | FL_USE_FPU | FL_SET_FPRF, 1, 0, 0, 0}},
{26, Interpreter::frsqrtex, {"frsqrtex", OpType::DoubleFP, FL_INOUT_FLOAT_D | FL_IN_FLOAT_B | FL_RC_BIT_F | FL_USE_FPU | FL_SET_FPRF, 1, 0, 0, 0}},
{28, Interpreter::fmsubx, {"fmsubx", OpType::DoubleFP, FL_INOUT_FLOAT_D | FL_IN_FLOAT_ABC | FL_RC_BIT_F | FL_USE_FPU | FL_SET_FPRF, 1, 0, 0, 0}},

View File

@ -971,36 +971,62 @@ u32 PPCAnalyzer::Analyze(u32 address, CodeBlock* block, CodeBuffer* buffer, std:
op.fprIsStoreSafe = fprIsStoreSafe;
if (op.fregOut >= 0)
{
fprIsSingle[op.fregOut] = false;
fprIsDuplicated[op.fregOut] = false;
fprIsStoreSafe[op.fregOut] = false;
// Single, duplicated, and doesn't need PPC_FP.
if (op.opinfo->type == OpType::SingleFP)
{
fprIsSingle[op.fregOut] = true;
fprIsDuplicated[op.fregOut] = true;
fprIsStoreSafe[op.fregOut] = true;
}
// Single and duplicated, but might be a denormal (not safe to skip PPC_FP).
// TODO: if we go directly from a load to store, skip conversion entirely?
// TODO: if we go directly from a load to a float instruction, and the value isn't used
// for anything else, we can skip PPC_FP on a load too.
if (!strncmp(op.opinfo->opname, "lfs", 3))
else if (!strncmp(op.opinfo->opname, "lfs", 3))
{
fprIsSingle[op.fregOut] = true;
fprIsDuplicated[op.fregOut] = true;
}
// Paired are still floats, but the top/bottom halves may differ.
if (op.opinfo->type == OpType::PS || op.opinfo->type == OpType::LoadPS)
else if (op.opinfo->type == OpType::PS || op.opinfo->type == OpType::LoadPS)
{
fprIsSingle[op.fregOut] = true;
fprIsStoreSafe[op.fregOut] = true;
fprIsDuplicated[op.fregOut] = false;
}
// Careful: changing the float mode in a block breaks this optimization, since
// a previous float op might have had had FTZ off while the later store has FTZ
// on. So, discard all information we have.
else
{
fprIsSingle[op.fregOut] = false;
fprIsDuplicated[op.fregOut] = false;
}
if (!strncmp(op.opinfo->opname, "mtfs", 4))
{
// Careful: changing the float mode in a block breaks the store-safe optimization,
// since a previous float op might have had FTZ off while the later store has FTZ on.
// So, discard all information we have.
fprIsStoreSafe = BitSet32(0);
}
else if (op.opinfo->flags &
(FL_IN_FLOAT_A_BITEXACT | FL_IN_FLOAT_B_BITEXACT | FL_IN_FLOAT_C_BITEXACT))
{
// If the instruction copies bits between registers (without flushing denormals to zero
// or turning SNaN into QNaN), the output is store-safe if the inputs are.
BitSet32 bitexact_inputs;
if (op.opinfo->flags & FL_IN_FLOAT_A_BITEXACT)
bitexact_inputs[code->inst.FA] = true;
if (op.opinfo->flags & FL_IN_FLOAT_B_BITEXACT)
bitexact_inputs[code->inst.FB] = true;
if (op.opinfo->flags & FL_IN_FLOAT_C_BITEXACT)
bitexact_inputs[code->inst.FC] = true;
fprIsStoreSafe[op.fregOut] = (fprIsStoreSafe & bitexact_inputs) == bitexact_inputs;
}
else
{
// Other FPU instructions are store-safe if they perform a single-precision
// arithmetic operation.
// TODO: if we go directly from a load to store, skip conversion entirely?
// TODO: if we go directly from a load to a float instruction, and the value isn't used
// for anything else, we can use fast single -> double conversion after the load.
fprIsStoreSafe[op.fregOut] =
(op.opinfo->type == OpType::SingleFP || op.opinfo->type == OpType::PS);
}
}
if (op.opinfo->type == OpType::StorePS || op.opinfo->type == OpType::LoadPS)

View File

@ -61,8 +61,9 @@ struct CodeOp // 16B
// instruction)
BitSet32 fprIsDuplicated;
// whether an fpr is the output of a single-precision arithmetic instruction, i.e. whether we can
// safely
// skip PPC_FP.
// convert between single and double formats by just using the host machine's instruction for it.
// (The reason why we can't always do this is because some games rely on the exact bits of
// denormals and SNaNs being preserved as long as no arithmetic operation is performed on them.)
BitSet32 fprIsStoreSafe;
};

View File

@ -59,6 +59,11 @@ enum
FL_OUT_FLOAT_D = (1 << 28), // frD is used as a destination.
// Used in the case of double ops (they don't modify the top half of the output)
FL_INOUT_FLOAT_D = FL_IN_FLOAT_D | FL_OUT_FLOAT_D,
FL_IN_FLOAT_A_BITEXACT = (1 << 29), // The output is based on the exact bits in frA.
FL_IN_FLOAT_B_BITEXACT = (1 << 30), // The output is based on the exact bits in frB.
FL_IN_FLOAT_C_BITEXACT = (1 << 31), // The output is based on the exact bits in frC.
FL_IN_FLOAT_AB_BITEXACT = FL_IN_FLOAT_A_BITEXACT | FL_IN_FLOAT_B_BITEXACT,
FL_IN_FLOAT_BC_BITEXACT = FL_IN_FLOAT_B_BITEXACT | FL_IN_FLOAT_C_BITEXACT,
};
enum class OpType