PPCAnalyst: Rework the store-safe logic
The output of instructions like fabsx and ps_sel is store-safe if and only if the relevant inputs are. The old code was always marking the output as store-safe if the output was a single, and never otherwise. Also, the old code was treating the output of psq_l/psq_lu as store-safe, which seems incorrect (if dequantization is disabled).
This commit is contained in:
parent
e2b5026652
commit
1845c5948d
|
@ -97,16 +97,16 @@ static std::array<GekkoOPTemplate, 13> table4 =
|
|||
{{ //SUBOP10
|
||||
{0, Interpreter::ps_cmpu0, {"ps_cmpu0", OpType::PS, FL_IN_FLOAT_AB | FL_SET_CRn | FL_USE_FPU | FL_READ_FPRF | FL_SET_FPRF, 1, 0, 0, 0}},
|
||||
{32, Interpreter::ps_cmpo0, {"ps_cmpo0", OpType::PS, FL_IN_FLOAT_AB | FL_SET_CRn | FL_USE_FPU | FL_READ_FPRF | FL_SET_FPRF, 1, 0, 0, 0}},
|
||||
{40, Interpreter::ps_neg, {"ps_neg", OpType::PS, FL_OUT_FLOAT_D | FL_IN_FLOAT_B | FL_RC_BIT_F | FL_USE_FPU, 1, 0, 0, 0}},
|
||||
{136, Interpreter::ps_nabs, {"ps_nabs", OpType::PS, FL_OUT_FLOAT_D | FL_IN_FLOAT_B | FL_RC_BIT_F | FL_USE_FPU, 1, 0, 0, 0}},
|
||||
{264, Interpreter::ps_abs, {"ps_abs", OpType::PS, FL_OUT_FLOAT_D | FL_IN_FLOAT_B | FL_RC_BIT_F | FL_USE_FPU, 1, 0, 0, 0}},
|
||||
{40, Interpreter::ps_neg, {"ps_neg", OpType::PS, FL_OUT_FLOAT_D | FL_IN_FLOAT_B | FL_IN_FLOAT_B_BITEXACT | FL_RC_BIT_F | FL_USE_FPU, 1, 0, 0, 0}},
|
||||
{136, Interpreter::ps_nabs, {"ps_nabs", OpType::PS, FL_OUT_FLOAT_D | FL_IN_FLOAT_B | FL_IN_FLOAT_B_BITEXACT | FL_RC_BIT_F | FL_USE_FPU, 1, 0, 0, 0}},
|
||||
{264, Interpreter::ps_abs, {"ps_abs", OpType::PS, FL_OUT_FLOAT_D | FL_IN_FLOAT_B | FL_IN_FLOAT_B_BITEXACT | FL_RC_BIT_F | FL_USE_FPU, 1, 0, 0, 0}},
|
||||
{64, Interpreter::ps_cmpu1, {"ps_cmpu1", OpType::PS, FL_IN_FLOAT_AB | FL_SET_CRn | FL_USE_FPU | FL_READ_FPRF | FL_SET_FPRF, 1, 0, 0, 0}},
|
||||
{72, Interpreter::ps_mr, {"ps_mr", OpType::PS, FL_OUT_FLOAT_D | FL_IN_FLOAT_B | FL_RC_BIT_F | FL_USE_FPU, 1, 0, 0, 0}},
|
||||
{72, Interpreter::ps_mr, {"ps_mr", OpType::PS, FL_OUT_FLOAT_D | FL_IN_FLOAT_B | FL_IN_FLOAT_B_BITEXACT | FL_RC_BIT_F | FL_USE_FPU, 1, 0, 0, 0}},
|
||||
{96, Interpreter::ps_cmpo1, {"ps_cmpo1", OpType::PS, FL_IN_FLOAT_AB | FL_SET_CRn | FL_USE_FPU | FL_READ_FPRF | FL_SET_FPRF, 1, 0, 0, 0}},
|
||||
{528, Interpreter::ps_merge00, {"ps_merge00", OpType::PS, FL_OUT_FLOAT_D | FL_IN_FLOAT_AB | FL_RC_BIT_F | FL_USE_FPU, 1, 0, 0, 0}},
|
||||
{560, Interpreter::ps_merge01, {"ps_merge01", OpType::PS, FL_OUT_FLOAT_D | FL_IN_FLOAT_AB | FL_RC_BIT_F | FL_USE_FPU, 1, 0, 0, 0}},
|
||||
{592, Interpreter::ps_merge10, {"ps_merge10", OpType::PS, FL_OUT_FLOAT_D | FL_IN_FLOAT_AB | FL_RC_BIT_F | FL_USE_FPU, 1, 0, 0, 0}},
|
||||
{624, Interpreter::ps_merge11, {"ps_merge11", OpType::PS, FL_OUT_FLOAT_D | FL_IN_FLOAT_AB | FL_RC_BIT_F | FL_USE_FPU, 1, 0, 0, 0}},
|
||||
{528, Interpreter::ps_merge00, {"ps_merge00", OpType::PS, FL_OUT_FLOAT_D | FL_IN_FLOAT_AB | FL_IN_FLOAT_AB_BITEXACT | FL_RC_BIT_F | FL_USE_FPU, 1, 0, 0, 0}},
|
||||
{560, Interpreter::ps_merge01, {"ps_merge01", OpType::PS, FL_OUT_FLOAT_D | FL_IN_FLOAT_AB | FL_IN_FLOAT_AB_BITEXACT | FL_RC_BIT_F | FL_USE_FPU, 1, 0, 0, 0}},
|
||||
{592, Interpreter::ps_merge10, {"ps_merge10", OpType::PS, FL_OUT_FLOAT_D | FL_IN_FLOAT_AB | FL_IN_FLOAT_AB_BITEXACT | FL_RC_BIT_F | FL_USE_FPU, 1, 0, 0, 0}},
|
||||
{624, Interpreter::ps_merge11, {"ps_merge11", OpType::PS, FL_OUT_FLOAT_D | FL_IN_FLOAT_AB | FL_IN_FLOAT_AB_BITEXACT | FL_RC_BIT_F | FL_USE_FPU, 1, 0, 0, 0}},
|
||||
|
||||
{1014, Interpreter::dcbz_l, {"dcbz_l", OpType::System, FL_IN_A0B | FL_LOADSTORE, 1, 0, 0, 0}},
|
||||
}};
|
||||
|
@ -122,7 +122,7 @@ static std::array<GekkoOPTemplate, 17> table4_2 =
|
|||
{18, Interpreter::ps_div, {"ps_div", OpType::PS, FL_OUT_FLOAT_D | FL_IN_FLOAT_AB | FL_RC_BIT_F | FL_USE_FPU | FL_SET_FPRF, 17, 0, 0, 0}},
|
||||
{20, Interpreter::ps_sub, {"ps_sub", OpType::PS, FL_OUT_FLOAT_D | FL_IN_FLOAT_AB | FL_RC_BIT_F | FL_USE_FPU | FL_SET_FPRF, 1, 0, 0, 0}},
|
||||
{21, Interpreter::ps_add, {"ps_add", OpType::PS, FL_OUT_FLOAT_D | FL_IN_FLOAT_AB | FL_RC_BIT_F | FL_USE_FPU | FL_SET_FPRF, 1, 0, 0, 0}},
|
||||
{23, Interpreter::ps_sel, {"ps_sel", OpType::PS, FL_OUT_FLOAT_D | FL_IN_FLOAT_ABC | FL_RC_BIT_F | FL_USE_FPU, 1, 0, 0, 0}},
|
||||
{23, Interpreter::ps_sel, {"ps_sel", OpType::PS, FL_OUT_FLOAT_D | FL_IN_FLOAT_ABC | FL_IN_FLOAT_BC_BITEXACT | FL_RC_BIT_F | FL_USE_FPU, 1, 0, 0, 0}},
|
||||
{24, Interpreter::ps_res, {"ps_res", OpType::PS, FL_OUT_FLOAT_D | FL_IN_FLOAT_B | FL_RC_BIT_F | FL_USE_FPU | FL_SET_FPRF, 1, 0, 0, 0}},
|
||||
{25, Interpreter::ps_mul, {"ps_mul", OpType::PS, FL_OUT_FLOAT_D | FL_IN_FLOAT_AC | FL_RC_BIT_F | FL_USE_FPU | FL_SET_FPRF, 1, 0, 0, 0}},
|
||||
{26, Interpreter::ps_rsqrte, {"ps_rsqrte", OpType::PS, FL_OUT_FLOAT_D | FL_IN_FLOAT_B | FL_RC_BIT_F | FL_USE_FPU | FL_SET_FPRF, 2, 0, 0, 0}},
|
||||
|
@ -318,7 +318,7 @@ static std::array<GekkoOPTemplate, 9> table59 =
|
|||
|
||||
static std::array<GekkoOPTemplate, 15> table63 =
|
||||
{{
|
||||
{264, Interpreter::fabsx, {"fabsx", OpType::DoubleFP, FL_INOUT_FLOAT_D | FL_IN_FLOAT_B | FL_RC_BIT_F | FL_USE_FPU, 1, 0, 0, 0}},
|
||||
{264, Interpreter::fabsx, {"fabsx", OpType::DoubleFP, FL_INOUT_FLOAT_D | FL_IN_FLOAT_B | FL_IN_FLOAT_B_BITEXACT | FL_RC_BIT_F | FL_USE_FPU, 1, 0, 0, 0}},
|
||||
|
||||
// FIXME: fcmp modifies the FPRF flags, but if the flags are clobbered later,
|
||||
// we don't actually need to calculate or store them here. So FL_READ_FPRF and FL_SET_FPRF is not
|
||||
|
@ -329,9 +329,9 @@ static std::array<GekkoOPTemplate, 15> table63 =
|
|||
|
||||
{14, Interpreter::fctiwx, {"fctiwx", OpType::DoubleFP, FL_INOUT_FLOAT_D | FL_IN_FLOAT_B | FL_RC_BIT_F | FL_USE_FPU, 1, 0, 0, 0}},
|
||||
{15, Interpreter::fctiwzx, {"fctiwzx", OpType::DoubleFP, FL_INOUT_FLOAT_D | FL_IN_FLOAT_B | FL_RC_BIT_F | FL_USE_FPU, 1, 0, 0, 0}},
|
||||
{72, Interpreter::fmrx, {"fmrx", OpType::DoubleFP, FL_INOUT_FLOAT_D | FL_IN_FLOAT_B | FL_RC_BIT_F | FL_USE_FPU, 1, 0, 0, 0}},
|
||||
{136, Interpreter::fnabsx, {"fnabsx", OpType::DoubleFP, FL_INOUT_FLOAT_D | FL_IN_FLOAT_B | FL_RC_BIT_F | FL_USE_FPU, 1, 0, 0, 0}},
|
||||
{40, Interpreter::fnegx, {"fnegx", OpType::DoubleFP, FL_INOUT_FLOAT_D | FL_IN_FLOAT_B | FL_RC_BIT_F | FL_USE_FPU, 1, 0, 0, 0}},
|
||||
{72, Interpreter::fmrx, {"fmrx", OpType::DoubleFP, FL_INOUT_FLOAT_D | FL_IN_FLOAT_B | FL_IN_FLOAT_B_BITEXACT | FL_RC_BIT_F | FL_USE_FPU, 1, 0, 0, 0}},
|
||||
{136, Interpreter::fnabsx, {"fnabsx", OpType::DoubleFP, FL_INOUT_FLOAT_D | FL_IN_FLOAT_B | FL_RC_BIT_F | FL_IN_FLOAT_B_BITEXACT | FL_USE_FPU, 1, 0, 0, 0}},
|
||||
{40, Interpreter::fnegx, {"fnegx", OpType::DoubleFP, FL_INOUT_FLOAT_D | FL_IN_FLOAT_B | FL_RC_BIT_F | FL_IN_FLOAT_B_BITEXACT | FL_USE_FPU, 1, 0, 0, 0}},
|
||||
{12, Interpreter::frspx, {"frspx", OpType::DoubleFP, FL_OUT_FLOAT_D | FL_IN_FLOAT_B | FL_RC_BIT_F | FL_USE_FPU | FL_SET_FPRF, 1, 0, 0, 0}},
|
||||
|
||||
{64, Interpreter::mcrfs, {"mcrfs", OpType::SystemFP, FL_SET_CRn | FL_USE_FPU | FL_READ_FPRF, 1, 0, 0, 0}},
|
||||
|
@ -347,7 +347,7 @@ static std::array<GekkoOPTemplate, 10> table63_2 =
|
|||
{18, Interpreter::fdivx, {"fdivx", OpType::DoubleFP, FL_INOUT_FLOAT_D | FL_IN_FLOAT_AB | FL_RC_BIT_F | FL_USE_FPU | FL_SET_FPRF, 31, 0, 0, 0}},
|
||||
{20, Interpreter::fsubx, {"fsubx", OpType::DoubleFP, FL_INOUT_FLOAT_D | FL_IN_FLOAT_AB | FL_RC_BIT_F | FL_USE_FPU | FL_SET_FPRF, 1, 0, 0, 0}},
|
||||
{21, Interpreter::faddx, {"faddx", OpType::DoubleFP, FL_INOUT_FLOAT_D | FL_IN_FLOAT_AB | FL_RC_BIT_F | FL_USE_FPU | FL_SET_FPRF, 1, 0, 0, 0}},
|
||||
{23, Interpreter::fselx, {"fselx", OpType::DoubleFP, FL_INOUT_FLOAT_D | FL_IN_FLOAT_ABC | FL_RC_BIT_F | FL_USE_FPU, 1, 0, 0, 0}},
|
||||
{23, Interpreter::fselx, {"fselx", OpType::DoubleFP, FL_INOUT_FLOAT_D | FL_IN_FLOAT_ABC | FL_IN_FLOAT_BC_BITEXACT | FL_RC_BIT_F | FL_USE_FPU, 1, 0, 0, 0}},
|
||||
{25, Interpreter::fmulx, {"fmulx", OpType::DoubleFP, FL_INOUT_FLOAT_D | FL_IN_FLOAT_AC | FL_RC_BIT_F | FL_USE_FPU | FL_SET_FPRF, 1, 0, 0, 0}},
|
||||
{26, Interpreter::frsqrtex, {"frsqrtex", OpType::DoubleFP, FL_INOUT_FLOAT_D | FL_IN_FLOAT_B | FL_RC_BIT_F | FL_USE_FPU | FL_SET_FPRF, 1, 0, 0, 0}},
|
||||
{28, Interpreter::fmsubx, {"fmsubx", OpType::DoubleFP, FL_INOUT_FLOAT_D | FL_IN_FLOAT_ABC | FL_RC_BIT_F | FL_USE_FPU | FL_SET_FPRF, 1, 0, 0, 0}},
|
||||
|
|
|
@ -971,36 +971,62 @@ u32 PPCAnalyzer::Analyze(u32 address, CodeBlock* block, CodeBuffer* buffer, std:
|
|||
op.fprIsStoreSafe = fprIsStoreSafe;
|
||||
if (op.fregOut >= 0)
|
||||
{
|
||||
fprIsSingle[op.fregOut] = false;
|
||||
fprIsDuplicated[op.fregOut] = false;
|
||||
fprIsStoreSafe[op.fregOut] = false;
|
||||
// Single, duplicated, and doesn't need PPC_FP.
|
||||
if (op.opinfo->type == OpType::SingleFP)
|
||||
{
|
||||
fprIsSingle[op.fregOut] = true;
|
||||
fprIsDuplicated[op.fregOut] = true;
|
||||
fprIsStoreSafe[op.fregOut] = true;
|
||||
}
|
||||
// Single and duplicated, but might be a denormal (not safe to skip PPC_FP).
|
||||
// TODO: if we go directly from a load to store, skip conversion entirely?
|
||||
// TODO: if we go directly from a load to a float instruction, and the value isn't used
|
||||
// for anything else, we can skip PPC_FP on a load too.
|
||||
if (!strncmp(op.opinfo->opname, "lfs", 3))
|
||||
else if (!strncmp(op.opinfo->opname, "lfs", 3))
|
||||
{
|
||||
fprIsSingle[op.fregOut] = true;
|
||||
fprIsDuplicated[op.fregOut] = true;
|
||||
}
|
||||
// Paired are still floats, but the top/bottom halves may differ.
|
||||
if (op.opinfo->type == OpType::PS || op.opinfo->type == OpType::LoadPS)
|
||||
else if (op.opinfo->type == OpType::PS || op.opinfo->type == OpType::LoadPS)
|
||||
{
|
||||
fprIsSingle[op.fregOut] = true;
|
||||
fprIsStoreSafe[op.fregOut] = true;
|
||||
fprIsDuplicated[op.fregOut] = false;
|
||||
}
|
||||
// Careful: changing the float mode in a block breaks this optimization, since
|
||||
// a previous float op might have had had FTZ off while the later store has FTZ
|
||||
// on. So, discard all information we have.
|
||||
else
|
||||
{
|
||||
fprIsSingle[op.fregOut] = false;
|
||||
fprIsDuplicated[op.fregOut] = false;
|
||||
}
|
||||
|
||||
if (!strncmp(op.opinfo->opname, "mtfs", 4))
|
||||
{
|
||||
// Careful: changing the float mode in a block breaks the store-safe optimization,
|
||||
// since a previous float op might have had FTZ off while the later store has FTZ on.
|
||||
// So, discard all information we have.
|
||||
fprIsStoreSafe = BitSet32(0);
|
||||
}
|
||||
else if (op.opinfo->flags &
|
||||
(FL_IN_FLOAT_A_BITEXACT | FL_IN_FLOAT_B_BITEXACT | FL_IN_FLOAT_C_BITEXACT))
|
||||
{
|
||||
// If the instruction copies bits between registers (without flushing denormals to zero
|
||||
// or turning SNaN into QNaN), the output is store-safe if the inputs are.
|
||||
|
||||
BitSet32 bitexact_inputs;
|
||||
if (op.opinfo->flags & FL_IN_FLOAT_A_BITEXACT)
|
||||
bitexact_inputs[code->inst.FA] = true;
|
||||
if (op.opinfo->flags & FL_IN_FLOAT_B_BITEXACT)
|
||||
bitexact_inputs[code->inst.FB] = true;
|
||||
if (op.opinfo->flags & FL_IN_FLOAT_C_BITEXACT)
|
||||
bitexact_inputs[code->inst.FC] = true;
|
||||
|
||||
fprIsStoreSafe[op.fregOut] = (fprIsStoreSafe & bitexact_inputs) == bitexact_inputs;
|
||||
}
|
||||
else
|
||||
{
|
||||
// Other FPU instructions are store-safe if they perform a single-precision
|
||||
// arithmetic operation.
|
||||
|
||||
// TODO: if we go directly from a load to store, skip conversion entirely?
|
||||
// TODO: if we go directly from a load to a float instruction, and the value isn't used
|
||||
// for anything else, we can use fast single -> double conversion after the load.
|
||||
|
||||
fprIsStoreSafe[op.fregOut] =
|
||||
(op.opinfo->type == OpType::SingleFP || op.opinfo->type == OpType::PS);
|
||||
}
|
||||
}
|
||||
|
||||
if (op.opinfo->type == OpType::StorePS || op.opinfo->type == OpType::LoadPS)
|
||||
|
|
|
@ -61,8 +61,9 @@ struct CodeOp // 16B
|
|||
// instruction)
|
||||
BitSet32 fprIsDuplicated;
|
||||
// whether an fpr is the output of a single-precision arithmetic instruction, i.e. whether we can
|
||||
// safely
|
||||
// skip PPC_FP.
|
||||
// convert between single and double formats by just using the host machine's instruction for it.
|
||||
// (The reason why we can't always do this is because some games rely on the exact bits of
|
||||
// denormals and SNaNs being preserved as long as no arithmetic operation is performed on them.)
|
||||
BitSet32 fprIsStoreSafe;
|
||||
};
|
||||
|
||||
|
|
|
@ -59,6 +59,11 @@ enum
|
|||
FL_OUT_FLOAT_D = (1 << 28), // frD is used as a destination.
|
||||
// Used in the case of double ops (they don't modify the top half of the output)
|
||||
FL_INOUT_FLOAT_D = FL_IN_FLOAT_D | FL_OUT_FLOAT_D,
|
||||
FL_IN_FLOAT_A_BITEXACT = (1 << 29), // The output is based on the exact bits in frA.
|
||||
FL_IN_FLOAT_B_BITEXACT = (1 << 30), // The output is based on the exact bits in frB.
|
||||
FL_IN_FLOAT_C_BITEXACT = (1 << 31), // The output is based on the exact bits in frC.
|
||||
FL_IN_FLOAT_AB_BITEXACT = FL_IN_FLOAT_A_BITEXACT | FL_IN_FLOAT_B_BITEXACT,
|
||||
FL_IN_FLOAT_BC_BITEXACT = FL_IN_FLOAT_B_BITEXACT | FL_IN_FLOAT_C_BITEXACT,
|
||||
};
|
||||
|
||||
enum class OpType
|
||||
|
|
Loading…
Reference in New Issue