JIT: Initial FPRF support

Doesn't support all the FPSCR flags, just the FPRF ones.
Add PPCAnalyzer support to remove unnecessary FPRF calculations.

POV-ray benchmark with enableFPRF forced on for an extreme comparison:
Before: 1500s
After, fmul/fmadd only: 728s
After, all float: 753s

In real games that use FPRF, like F-Zero GX, FPRF previously cost a few percent
of total runtime.

Since FPRF is so much faster now, if enableFPRF is set, just do it for every
float instruction, not just fmul/fmadd like before. I don't know if this will
fix any games, but there's little good reason not to.
This commit is contained in:
Fiora 2014-08-20 02:22:07 -07:00
parent f52888d3ec
commit 7dbc623dc0
13 changed files with 222 additions and 91 deletions

View File

@ -383,6 +383,9 @@ union UReg_MSR
UReg_MSR() { Hex = 0; }
};
#define FPRF_SHIFT 12
#define FPRF_MASK (0x1F << FPRF_SHIFT)
// Floating Point Status and Control Register
union UReg_FPSCR
{

View File

@ -97,14 +97,14 @@ static GekkoOPTemplate primarytable[] =
static GekkoOPTemplate table4[] =
{ //SUBOP10
{0, Interpreter::ps_cmpu0, {"ps_cmpu0", OPTYPE_PS, FL_SET_CRn | FL_USE_FPU, 1, 0, 0, 0}},
{32, Interpreter::ps_cmpo0, {"ps_cmpo0", OPTYPE_PS, FL_SET_CRn | FL_USE_FPU, 1, 0, 0, 0}},
{0, Interpreter::ps_cmpu0, {"ps_cmpu0", OPTYPE_PS, FL_SET_CRn | FL_USE_FPU | FL_SET_FPRF, 1, 0, 0, 0}},
{32, Interpreter::ps_cmpo0, {"ps_cmpo0", OPTYPE_PS, FL_SET_CRn | FL_USE_FPU | FL_SET_FPRF, 1, 0, 0, 0}},
{40, Interpreter::ps_neg, {"ps_neg", OPTYPE_PS, FL_RC_BIT | FL_USE_FPU, 1, 0, 0, 0}},
{136, Interpreter::ps_nabs, {"ps_nabs", OPTYPE_PS, FL_RC_BIT | FL_USE_FPU, 1, 0, 0, 0}},
{264, Interpreter::ps_abs, {"ps_abs", OPTYPE_PS, FL_RC_BIT | FL_USE_FPU, 1, 0, 0, 0}},
{64, Interpreter::ps_cmpu1, {"ps_cmpu1", OPTYPE_PS, FL_RC_BIT | FL_USE_FPU, 1, 0, 0, 0}},
{64, Interpreter::ps_cmpu1, {"ps_cmpu1", OPTYPE_PS, FL_RC_BIT | FL_USE_FPU | FL_SET_FPRF, 1, 0, 0, 0}},
{72, Interpreter::ps_mr, {"ps_mr", OPTYPE_PS, FL_RC_BIT | FL_USE_FPU, 1, 0, 0, 0}},
{96, Interpreter::ps_cmpo1, {"ps_cmpo1", OPTYPE_PS, FL_RC_BIT | FL_USE_FPU, 1, 0, 0, 0}},
{96, Interpreter::ps_cmpo1, {"ps_cmpo1", OPTYPE_PS, FL_RC_BIT | FL_USE_FPU | FL_SET_FPRF, 1, 0, 0, 0}},
{528, Interpreter::ps_merge00, {"ps_merge00", OPTYPE_PS, FL_RC_BIT | FL_USE_FPU, 1, 0, 0, 0}},
{560, Interpreter::ps_merge01, {"ps_merge01", OPTYPE_PS, FL_RC_BIT | FL_USE_FPU, 1, 0, 0, 0}},
{592, Interpreter::ps_merge10, {"ps_merge10", OPTYPE_PS, FL_RC_BIT | FL_USE_FPU, 1, 0, 0, 0}},
@ -115,23 +115,23 @@ static GekkoOPTemplate table4[] =
static GekkoOPTemplate table4_2[] =
{
{10, Interpreter::ps_sum0, {"ps_sum0", OPTYPE_PS, FL_USE_FPU, 1, 0, 0, 0}},
{11, Interpreter::ps_sum1, {"ps_sum1", OPTYPE_PS, FL_USE_FPU, 1, 0, 0, 0}},
{12, Interpreter::ps_muls0, {"ps_muls0", OPTYPE_PS, FL_USE_FPU, 1, 0, 0, 0}},
{13, Interpreter::ps_muls1, {"ps_muls1", OPTYPE_PS, FL_USE_FPU, 1, 0, 0, 0}},
{14, Interpreter::ps_madds0, {"ps_madds0", OPTYPE_PS, FL_USE_FPU, 1, 0, 0, 0}},
{15, Interpreter::ps_madds1, {"ps_madds1", OPTYPE_PS, FL_USE_FPU, 1, 0, 0, 0}},
{18, Interpreter::ps_div, {"ps_div", OPTYPE_PS, FL_USE_FPU, 17, 0, 0, 0}},
{20, Interpreter::ps_sub, {"ps_sub", OPTYPE_PS, FL_USE_FPU, 1, 0, 0, 0}},
{21, Interpreter::ps_add, {"ps_add", OPTYPE_PS, FL_USE_FPU, 1, 0, 0, 0}},
{10, Interpreter::ps_sum0, {"ps_sum0", OPTYPE_PS, FL_USE_FPU | FL_SET_FPRF, 1, 0, 0, 0}},
{11, Interpreter::ps_sum1, {"ps_sum1", OPTYPE_PS, FL_USE_FPU | FL_SET_FPRF, 1, 0, 0, 0}},
{12, Interpreter::ps_muls0, {"ps_muls0", OPTYPE_PS, FL_USE_FPU | FL_SET_FPRF, 1, 0, 0, 0}},
{13, Interpreter::ps_muls1, {"ps_muls1", OPTYPE_PS, FL_USE_FPU | FL_SET_FPRF, 1, 0, 0, 0}},
{14, Interpreter::ps_madds0, {"ps_madds0", OPTYPE_PS, FL_USE_FPU | FL_SET_FPRF, 1, 0, 0, 0}},
{15, Interpreter::ps_madds1, {"ps_madds1", OPTYPE_PS, FL_USE_FPU | FL_SET_FPRF, 1, 0, 0, 0}},
{18, Interpreter::ps_div, {"ps_div", OPTYPE_PS, FL_USE_FPU | FL_SET_FPRF, 17, 0, 0, 0}},
{20, Interpreter::ps_sub, {"ps_sub", OPTYPE_PS, FL_USE_FPU | FL_SET_FPRF, 1, 0, 0, 0}},
{21, Interpreter::ps_add, {"ps_add", OPTYPE_PS, FL_USE_FPU | FL_SET_FPRF, 1, 0, 0, 0}},
{23, Interpreter::ps_sel, {"ps_sel", OPTYPE_PS, FL_USE_FPU, 1, 0, 0, 0}},
{24, Interpreter::ps_res, {"ps_res", OPTYPE_PS, FL_USE_FPU, 1, 0, 0, 0}},
{25, Interpreter::ps_mul, {"ps_mul", OPTYPE_PS, FL_USE_FPU, 1, 0, 0, 0}},
{26, Interpreter::ps_rsqrte, {"ps_rsqrte", OPTYPE_PS, FL_USE_FPU, 2, 0, 0, 0}},
{28, Interpreter::ps_msub, {"ps_msub", OPTYPE_PS, FL_USE_FPU, 1, 0, 0, 0}},
{29, Interpreter::ps_madd, {"ps_madd", OPTYPE_PS, FL_USE_FPU, 1, 0, 0, 0}},
{30, Interpreter::ps_nmsub, {"ps_nmsub", OPTYPE_PS, FL_USE_FPU, 1, 0, 0, 0}},
{31, Interpreter::ps_nmadd, {"ps_nmadd", OPTYPE_PS, FL_USE_FPU, 1, 0, 0, 0}},
{24, Interpreter::ps_res, {"ps_res", OPTYPE_PS, FL_USE_FPU | FL_SET_FPRF, 1, 0, 0, 0}},
{25, Interpreter::ps_mul, {"ps_mul", OPTYPE_PS, FL_USE_FPU | FL_SET_FPRF, 1, 0, 0, 0}},
{26, Interpreter::ps_rsqrte, {"ps_rsqrte", OPTYPE_PS, FL_USE_FPU | FL_SET_FPRF, 2, 0, 0, 0}},
{28, Interpreter::ps_msub, {"ps_msub", OPTYPE_PS, FL_USE_FPU | FL_SET_FPRF, 1, 0, 0, 0}},
{29, Interpreter::ps_madd, {"ps_madd", OPTYPE_PS, FL_USE_FPU | FL_SET_FPRF, 1, 0, 0, 0}},
{30, Interpreter::ps_nmsub, {"ps_nmsub", OPTYPE_PS, FL_USE_FPU | FL_SET_FPRF, 1, 0, 0, 0}},
{31, Interpreter::ps_nmadd, {"ps_nmadd", OPTYPE_PS, FL_USE_FPU | FL_SET_FPRF, 1, 0, 0, 0}},
};
@ -307,51 +307,51 @@ static GekkoOPTemplate table31_2[] =
static GekkoOPTemplate table59[] =
{
{18, Interpreter::fdivsx, {"fdivsx", OPTYPE_FPU, FL_RC_BIT_F | FL_USE_FPU, 17, 0, 0, 0}}, // TODO
{20, Interpreter::fsubsx, {"fsubsx", OPTYPE_FPU, FL_RC_BIT_F | FL_USE_FPU, 1, 0, 0, 0}},
{21, Interpreter::faddsx, {"faddsx", OPTYPE_FPU, FL_RC_BIT_F | FL_USE_FPU, 1, 0, 0, 0}},
//{22, Interpreter::fsqrtsx, {"fsqrtsx", OPTYPE_FPU, FL_RC_BIT_F | FL_USE_FPU, 1, 0, 0, 0}}, // Not implemented on gekko
{24, Interpreter::fresx, {"fresx", OPTYPE_FPU, FL_RC_BIT_F | FL_USE_FPU, 1, 0, 0, 0}},
{25, Interpreter::fmulsx, {"fmulsx", OPTYPE_FPU, FL_RC_BIT_F | FL_USE_FPU, 1, 0, 0, 0}},
{28, Interpreter::fmsubsx, {"fmsubsx", OPTYPE_FPU, FL_RC_BIT_F | FL_USE_FPU, 1, 0, 0, 0}},
{29, Interpreter::fmaddsx, {"fmaddsx", OPTYPE_FPU, FL_RC_BIT_F | FL_USE_FPU, 1, 0, 0, 0}},
{30, Interpreter::fnmsubsx, {"fnmsubsx", OPTYPE_FPU, FL_RC_BIT_F | FL_USE_FPU, 1, 0, 0, 0}},
{31, Interpreter::fnmaddsx, {"fnmaddsx", OPTYPE_FPU, FL_RC_BIT_F | FL_USE_FPU, 1, 0, 0, 0}},
{18, Interpreter::fdivsx, {"fdivsx", OPTYPE_FPU, FL_RC_BIT_F | FL_USE_FPU | FL_SET_FPRF, 17, 0, 0, 0}}, // TODO
{20, Interpreter::fsubsx, {"fsubsx", OPTYPE_FPU, FL_RC_BIT_F | FL_USE_FPU | FL_SET_FPRF, 1, 0, 0, 0}},
{21, Interpreter::faddsx, {"faddsx", OPTYPE_FPU, FL_RC_BIT_F | FL_USE_FPU | FL_SET_FPRF, 1, 0, 0, 0}},
//{22, Interpreter::fsqrtsx, {"fsqrtsx", OPTYPE_FPU, FL_RC_BIT_F | FL_USE_FPU | FL_SET_FPRF, 1, 0, 0, 0}}, // Not implemented on gekko
{24, Interpreter::fresx, {"fresx", OPTYPE_FPU, FL_RC_BIT_F | FL_USE_FPU | FL_SET_FPRF, 1, 0, 0, 0}},
{25, Interpreter::fmulsx, {"fmulsx", OPTYPE_FPU, FL_RC_BIT_F | FL_USE_FPU | FL_SET_FPRF, 1, 0, 0, 0}},
{28, Interpreter::fmsubsx, {"fmsubsx", OPTYPE_FPU, FL_RC_BIT_F | FL_USE_FPU | FL_SET_FPRF, 1, 0, 0, 0}},
{29, Interpreter::fmaddsx, {"fmaddsx", OPTYPE_FPU, FL_RC_BIT_F | FL_USE_FPU | FL_SET_FPRF, 1, 0, 0, 0}},
{30, Interpreter::fnmsubsx, {"fnmsubsx", OPTYPE_FPU, FL_RC_BIT_F | FL_USE_FPU | FL_SET_FPRF, 1, 0, 0, 0}},
{31, Interpreter::fnmaddsx, {"fnmaddsx", OPTYPE_FPU, FL_RC_BIT_F | FL_USE_FPU | FL_SET_FPRF, 1, 0, 0, 0}},
};
static GekkoOPTemplate table63[] =
{
{264, Interpreter::fabsx, {"fabsx", OPTYPE_FPU, FL_RC_BIT_F | FL_USE_FPU, 1, 0, 0, 0}},
{32, Interpreter::fcmpo, {"fcmpo", OPTYPE_FPU, FL_RC_BIT_F | FL_USE_FPU, 1, 0, 0, 0}},
{0, Interpreter::fcmpu, {"fcmpu", OPTYPE_FPU, FL_RC_BIT_F | FL_USE_FPU, 1, 0, 0, 0}},
{32, Interpreter::fcmpo, {"fcmpo", OPTYPE_FPU, FL_RC_BIT_F | FL_USE_FPU | FL_SET_FPRF, 1, 0, 0, 0}},
{0, Interpreter::fcmpu, {"fcmpu", OPTYPE_FPU, FL_RC_BIT_F | FL_USE_FPU | FL_SET_FPRF, 1, 0, 0, 0}},
{14, Interpreter::fctiwx, {"fctiwx", OPTYPE_FPU, FL_RC_BIT_F | FL_USE_FPU, 1, 0, 0, 0}},
{15, Interpreter::fctiwzx, {"fctiwzx", OPTYPE_FPU, FL_RC_BIT_F | FL_USE_FPU, 1, 0, 0, 0}},
{72, Interpreter::fmrx, {"fmrx", OPTYPE_FPU, FL_RC_BIT_F | FL_USE_FPU, 1, 0, 0, 0}},
{136, Interpreter::fnabsx, {"fnabsx", OPTYPE_FPU, FL_RC_BIT_F | FL_USE_FPU, 1, 0, 0, 0}},
{40, Interpreter::fnegx, {"fnegx", OPTYPE_FPU, FL_RC_BIT_F | FL_USE_FPU, 1, 0, 0, 0}},
{12, Interpreter::frspx, {"frspx", OPTYPE_FPU, FL_RC_BIT_F | FL_USE_FPU, 1, 0, 0, 0}},
{12, Interpreter::frspx, {"frspx", OPTYPE_FPU, FL_RC_BIT_F | FL_USE_FPU | FL_SET_FPRF, 1, 0, 0, 0}},
{64, Interpreter::mcrfs, {"mcrfs", OPTYPE_SYSTEMFP, FL_USE_FPU, 1, 0, 0, 0}},
{583, Interpreter::mffsx, {"mffsx", OPTYPE_SYSTEMFP, FL_USE_FPU, 1, 0, 0, 0}},
{70, Interpreter::mtfsb0x, {"mtfsb0x", OPTYPE_SYSTEMFP, FL_USE_FPU, 3, 0, 0, 0}},
{38, Interpreter::mtfsb1x, {"mtfsb1x", OPTYPE_SYSTEMFP, FL_USE_FPU, 3, 0, 0, 0}},
{134, Interpreter::mtfsfix, {"mtfsfix", OPTYPE_SYSTEMFP, FL_USE_FPU, 3, 0, 0, 0}},
{711, Interpreter::mtfsfx, {"mtfsfx", OPTYPE_SYSTEMFP, FL_USE_FPU, 3, 0, 0, 0}},
{64, Interpreter::mcrfs, {"mcrfs", OPTYPE_SYSTEMFP, FL_USE_FPU | FL_READ_FPRF, 1, 0, 0, 0}},
{583, Interpreter::mffsx, {"mffsx", OPTYPE_SYSTEMFP, FL_USE_FPU | FL_READ_FPRF, 1, 0, 0, 0}},
{70, Interpreter::mtfsb0x, {"mtfsb0x", OPTYPE_SYSTEMFP, FL_USE_FPU | FL_READ_FPRF, 3, 0, 0, 0}},
{38, Interpreter::mtfsb1x, {"mtfsb1x", OPTYPE_SYSTEMFP, FL_USE_FPU | FL_READ_FPRF, 3, 0, 0, 0}},
{134, Interpreter::mtfsfix, {"mtfsfix", OPTYPE_SYSTEMFP, FL_USE_FPU | FL_READ_FPRF, 3, 0, 0, 0}},
{711, Interpreter::mtfsfx, {"mtfsfx", OPTYPE_SYSTEMFP, FL_USE_FPU | FL_READ_FPRF, 3, 0, 0, 0}},
};
static GekkoOPTemplate table63_2[] =
{
{18, Interpreter::fdivx, {"fdivx", OPTYPE_FPU, FL_RC_BIT_F | FL_USE_FPU, 31, 0, 0, 0}},
{20, Interpreter::fsubx, {"fsubx", OPTYPE_FPU, FL_RC_BIT_F | FL_USE_FPU, 1, 0, 0, 0}},
{21, Interpreter::faddx, {"faddx", OPTYPE_FPU, FL_RC_BIT_F | FL_USE_FPU, 1, 0, 0, 0}},
{22, Interpreter::fsqrtx, {"fsqrtx", OPTYPE_FPU, FL_RC_BIT_F | FL_USE_FPU, 1, 0, 0, 0}},
{18, Interpreter::fdivx, {"fdivx", OPTYPE_FPU, FL_RC_BIT_F | FL_USE_FPU | FL_SET_FPRF, 31, 0, 0, 0}},
{20, Interpreter::fsubx, {"fsubx", OPTYPE_FPU, FL_RC_BIT_F | FL_USE_FPU | FL_SET_FPRF, 1, 0, 0, 0}},
{21, Interpreter::faddx, {"faddx", OPTYPE_FPU, FL_RC_BIT_F | FL_USE_FPU | FL_SET_FPRF, 1, 0, 0, 0}},
{22, Interpreter::fsqrtx, {"fsqrtx", OPTYPE_FPU, FL_RC_BIT_F | FL_USE_FPU | FL_SET_FPRF, 1, 0, 0, 0}},
{23, Interpreter::fselx, {"fselx", OPTYPE_FPU, FL_RC_BIT_F | FL_USE_FPU, 1, 0, 0, 0}},
{25, Interpreter::fmulx, {"fmulx", OPTYPE_FPU, FL_RC_BIT_F | FL_USE_FPU, 1, 0, 0, 0}},
{26, Interpreter::frsqrtex, {"frsqrtex", OPTYPE_FPU, FL_RC_BIT_F | FL_USE_FPU, 1, 0, 0, 0}},
{28, Interpreter::fmsubx, {"fmsubx", OPTYPE_FPU, FL_RC_BIT_F | FL_USE_FPU, 1, 0, 0, 0}},
{29, Interpreter::fmaddx, {"fmaddx", OPTYPE_FPU, FL_RC_BIT_F | FL_USE_FPU, 1, 0, 0, 0}},
{30, Interpreter::fnmsubx, {"fnmsubx", OPTYPE_FPU, FL_RC_BIT_F | FL_USE_FPU, 1, 0, 0, 0}},
{31, Interpreter::fnmaddx, {"fnmaddx", OPTYPE_FPU, FL_RC_BIT_F | FL_USE_FPU, 1, 0, 0, 0}},
{25, Interpreter::fmulx, {"fmulx", OPTYPE_FPU, FL_RC_BIT_F | FL_USE_FPU | FL_SET_FPRF, 1, 0, 0, 0}},
{26, Interpreter::frsqrtex, {"frsqrtex", OPTYPE_FPU, FL_RC_BIT_F | FL_USE_FPU | FL_SET_FPRF, 1, 0, 0, 0}},
{28, Interpreter::fmsubx, {"fmsubx", OPTYPE_FPU, FL_RC_BIT_F | FL_USE_FPU | FL_SET_FPRF, 1, 0, 0, 0}},
{29, Interpreter::fmaddx, {"fmaddx", OPTYPE_FPU, FL_RC_BIT_F | FL_USE_FPU | FL_SET_FPRF, 1, 0, 0, 0}},
{30, Interpreter::fnmsubx, {"fnmsubx", OPTYPE_FPU, FL_RC_BIT_F | FL_USE_FPU | FL_SET_FPRF, 1, 0, 0, 0}},
{31, Interpreter::fnmaddx, {"fnmaddx", OPTYPE_FPU, FL_RC_BIT_F | FL_USE_FPU | FL_SET_FPRF, 1, 0, 0, 0}},
};
namespace InterpreterTables
{

View File

@ -116,11 +116,12 @@ public:
// Generates a branch that will check if a given bit of a CR register part
// is set or not.
Gen::FixupBranch JumpIfCRFieldBit(int field, int bit, bool jump_if_set = true);
void SetFPRFIfNeeded(UGeckoInstruction inst, Gen::X64Reg xmm);
void tri_op(int d, int a, int b, bool reversible, void (Gen::XEmitter::*op)(Gen::X64Reg, Gen::OpArg), bool roundRHS = false);
void tri_op(int d, int a, int b, bool reversible, void (Gen::XEmitter::*op)(Gen::X64Reg, Gen::OpArg), UGeckoInstruction inst, bool roundRHS = false);
typedef u32 (*Operation)(u32 a, u32 b);
void regimmop(int d, int a, bool binary, u32 value, Operation doop, void (Gen::XEmitter::*op)(int, const Gen::OpArg&, const Gen::OpArg&), bool Rc = false, bool carry = false);
void fp_tri_op(int d, int a, int b, bool reversible, bool single, void (Gen::XEmitter::*op)(Gen::X64Reg, Gen::OpArg), bool roundRHS = false);
void fp_tri_op(int d, int a, int b, bool reversible, bool single, void (Gen::XEmitter::*op)(Gen::X64Reg, Gen::OpArg), UGeckoInstruction inst, bool roundRHS = false);
// OPCODES
void unknown_instruction(UGeckoInstruction _inst);

View File

@ -14,7 +14,7 @@ static const u64 GC_ALIGNED16(psSignBits2[2]) = {0x8000000000000000ULL, 0x800000
static const u64 GC_ALIGNED16(psAbsMask2[2]) = {0x7FFFFFFFFFFFFFFFULL, 0x7FFFFFFFFFFFFFFFULL};
static const double GC_ALIGNED16(half_qnan_and_s32_max[2]) = {0x7FFFFFFF, -0x80000};
void Jit64::fp_tri_op(int d, int a, int b, bool reversible, bool single, void (XEmitter::*op)(Gen::X64Reg, Gen::OpArg), bool roundRHS)
void Jit64::fp_tri_op(int d, int a, int b, bool reversible, bool single, void (XEmitter::*op)(Gen::X64Reg, Gen::OpArg), UGeckoInstruction inst, bool roundRHS)
{
fpr.Lock(d, a, b);
if (roundRHS)
@ -88,25 +88,35 @@ void Jit64::fp_tri_op(int d, int a, int b, bool reversible, bool single, void (X
UNPCKLPD(fpr.RX(d), R(fpr.RX(d)));
}
}
SetFPRFIfNeeded(inst, fpr.RX(d));
fpr.UnlockAll();
}
// We can avoid calculating FPRF if it's not needed; every float operation resets it, so
// if it's going to be clobbered in a future instruction before being read, we can just
// not calculate it.
void Jit64::SetFPRFIfNeeded(UGeckoInstruction inst, X64Reg xmm)
{
// As far as we know, the games that use this flag only need FPRF for fmul and fmadd, but
// FPRF is fast enough in JIT that we might as well just enable it for every float instruction
// if the enableFPRF flag is set.
if (Core::g_CoreStartupParameter.bEnableFPRF && js.op->wantsFPRF)
SetFPRF(xmm);
}
void Jit64::fp_arith(UGeckoInstruction inst)
{
INSTRUCTION_START
JITDISABLE(bJITFloatingPointOff);
FALLBACK_IF(inst.Rc);
// Only the interpreter has "proper" support for (some) FP flags
FALLBACK_IF(inst.SUBOP5 == 25 && Core::g_CoreStartupParameter.bEnableFPRF);
bool single = inst.OPCD == 59;
switch (inst.SUBOP5)
{
case 18: fp_tri_op(inst.FD, inst.FA, inst.FB, false, single, &XEmitter::DIVSD); break; //div
case 20: fp_tri_op(inst.FD, inst.FA, inst.FB, false, single, &XEmitter::SUBSD); break; //sub
case 21: fp_tri_op(inst.FD, inst.FA, inst.FB, true, single, &XEmitter::ADDSD); break; //add
case 25: fp_tri_op(inst.FD, inst.FA, inst.FC, true, single, &XEmitter::MULSD, single); break; //mul
case 18: fp_tri_op(inst.FD, inst.FA, inst.FB, false, single, &XEmitter::DIVSD, inst); break; //div
case 20: fp_tri_op(inst.FD, inst.FA, inst.FB, false, single, &XEmitter::SUBSD, inst); break; //sub
case 21: fp_tri_op(inst.FD, inst.FA, inst.FB, true, single, &XEmitter::ADDSD, inst); break; //add
case 25: fp_tri_op(inst.FD, inst.FA, inst.FC, true, single, &XEmitter::MULSD, inst, single); break; //mul
default:
_assert_msg_(DYNA_REC, 0, "fp_arith WTF!!!");
}
@ -118,9 +128,6 @@ void Jit64::fmaddXX(UGeckoInstruction inst)
JITDISABLE(bJITFloatingPointOff);
FALLBACK_IF(inst.Rc);
// Only the interpreter has "proper" support for (some) FP flags
FALLBACK_IF(inst.SUBOP5 == 29 && Core::g_CoreStartupParameter.bEnableFPRF);
bool single_precision = inst.OPCD == 59;
int a = inst.FA;
@ -165,9 +172,7 @@ void Jit64::fmaddXX(UGeckoInstruction inst)
{
MOVSD(fpr.RX(d), R(XMM0));
}
// SMB checks flags after this op. Let's lie.
//AND(32, M(&PowerPC::ppcState.fpscr), Imm32(~((0x80000000 >> 19) | (0x80000000 >> 15))));
//OR(32, M(&PowerPC::ppcState.fpscr), Imm32((0x80000000 >> 16)));
SetFPRFIfNeeded(inst, fpr.RX(d));
fpr.UnlockAll();
}
@ -241,6 +246,7 @@ void Jit64::fcmpx(UGeckoInstruction inst)
fpr.Lock(a,b);
fpr.BindToRegister(b, true);
AND(32, M(&FPSCR), Imm32(~FPRF_MASK));
// Are we masking sNaN invalid floating point exceptions? If not this could crash if we don't handle the exception?
UCOMISD(fpr.R(b).GetSimpleReg(), fpr.R(a));
@ -264,10 +270,13 @@ void Jit64::fcmpx(UGeckoInstruction inst)
}
MOV(64, R(RAX), Imm64(PPCCRToInternal(CR_EQ)));
OR(32, M(&FPSCR), Imm32(CR_EQ << FPRF_SHIFT));
continue1 = J();
SetJumpTarget(pNaN);
MOV(64, R(RAX), Imm64(PPCCRToInternal(CR_SO)));
OR(32, M(&FPSCR), Imm32(CR_SO << FPRF_SHIFT));
if (a != b)
{
@ -275,10 +284,12 @@ void Jit64::fcmpx(UGeckoInstruction inst)
SetJumpTarget(pGreater);
MOV(64, R(RAX), Imm64(PPCCRToInternal(CR_GT)));
OR(32, M(&FPSCR), Imm32(CR_GT << FPRF_SHIFT));
continue3 = J();
SetJumpTarget(pLesser);
MOV(64, R(RAX), Imm64(PPCCRToInternal(CR_LT)));
OR(32, M(&FPSCR), Imm32(CR_LT << FPRF_SHIFT));
}
SetJumpTarget(continue1);

View File

@ -113,7 +113,7 @@ add a,b,a
*/
//There's still a little bit more optimization that can be squeezed out of this
void Jit64::tri_op(int d, int a, int b, bool reversible, void (XEmitter::*op)(X64Reg, OpArg), bool roundRHS)
void Jit64::tri_op(int d, int a, int b, bool reversible, void (XEmitter::*op)(X64Reg, OpArg), UGeckoInstruction inst, bool roundRHS)
{
fpr.Lock(d, a, b);
@ -163,6 +163,7 @@ void Jit64::tri_op(int d, int a, int b, bool reversible, void (XEmitter::*op)(X6
(this->*op)(fpr.RX(d), fpr.R(b));
}
ForceSinglePrecisionP(fpr.RX(d));
SetFPRFIfNeeded(inst, fpr.RX(d));
fpr.UnlockAll();
}
@ -175,16 +176,16 @@ void Jit64::ps_arith(UGeckoInstruction inst)
switch (inst.SUBOP5)
{
case 18: // div
tri_op(inst.FD, inst.FA, inst.FB, false, &XEmitter::DIVPD);
tri_op(inst.FD, inst.FA, inst.FB, false, &XEmitter::DIVPD, inst);
break;
case 20: // sub
tri_op(inst.FD, inst.FA, inst.FB, false, &XEmitter::SUBPD);
tri_op(inst.FD, inst.FA, inst.FB, false, &XEmitter::SUBPD, inst);
break;
case 21: // add
tri_op(inst.FD, inst.FA, inst.FB, true, &XEmitter::ADDPD);
tri_op(inst.FD, inst.FA, inst.FB, true, &XEmitter::ADDPD, inst);
break;
case 25: // mul
tri_op(inst.FD, inst.FA, inst.FC, true, &XEmitter::MULPD, true);
tri_op(inst.FD, inst.FA, inst.FC, true, &XEmitter::MULPD, inst, true);
break;
default:
_assert_msg_(DYNA_REC, 0, "ps_arith WTF!!!");
@ -228,6 +229,7 @@ void Jit64::ps_sum(UGeckoInstruction inst)
PanicAlert("ps_sum WTF!!!");
}
ForceSinglePrecisionP(fpr.RX(d));
SetFPRFIfNeeded(inst, fpr.RX(d));
fpr.UnlockAll();
}
@ -267,6 +269,7 @@ void Jit64::ps_muls(UGeckoInstruction inst)
PanicAlert("ps_muls WTF!!!");
}
ForceSinglePrecisionP(fpr.RX(d));
SetFPRFIfNeeded(inst, fpr.RX(d));
fpr.UnlockAll();
}
@ -372,5 +375,6 @@ void Jit64::ps_maddXX(UGeckoInstruction inst)
fpr.BindToRegister(d, false);
MOVAPD(fpr.RX(d), Gen::R(XMM0));
ForceSinglePrecisionP(fpr.RX(d));
SetFPRFIfNeeded(inst, fpr.RX(d));
fpr.UnlockAll();
}

View File

@ -284,15 +284,6 @@ void Jit64::mfcr(UGeckoInstruction inst)
gpr.UnlockAllX();
}
// convert flags into 64-bit CR values with a lookup table
static const u64 m_crTable[16] =
{
PPCCRToInternal(0x0), PPCCRToInternal(0x1), PPCCRToInternal(0x2), PPCCRToInternal(0x3),
PPCCRToInternal(0x4), PPCCRToInternal(0x5), PPCCRToInternal(0x6), PPCCRToInternal(0x7),
PPCCRToInternal(0x8), PPCCRToInternal(0x9), PPCCRToInternal(0xA), PPCCRToInternal(0xB),
PPCCRToInternal(0xC), PPCCRToInternal(0xD), PPCCRToInternal(0xE), PPCCRToInternal(0xF),
};
void Jit64::mtcrf(UGeckoInstruction inst)
{
INSTRUCTION_START

View File

@ -6,6 +6,7 @@
#include "Common/Common.h"
#include "Common/CPUDetect.h"
#include "Common/MathUtil.h"
#include "Core/HW/MMIO.h"
#include "Core/PowerPC/JitCommon/Jit_Util.h"
@ -695,6 +696,103 @@ void EmuCodeBlock::ConvertSingleToDouble(X64Reg dst, X64Reg src, bool src_is_gpr
MOVDDUP(dst, R(dst));
}
static const u64 GC_ALIGNED16(psDoubleExp[2]) = {0x7FF0000000000000ULL, 0};
static const u64 GC_ALIGNED16(psDoubleFrac[2]) = {0x000FFFFFFFFFFFFFULL, 0};
static const u64 GC_ALIGNED16(psDoubleNoSign[2]) = {0x7FFFFFFFFFFFFFFFULL, 0};
// TODO: it might be faster to handle FPRF in the same way as CR is currently handled for integer, storing
// the result of each floating point op and calculating it when needed. This is trickier than for integers
// though, because there's 32 possible FPRF bit combinations but only 9 categories of floating point values,
// which makes the whole thing rather trickier.
// Fortunately, PPCAnalyzer can optimize out a large portion of FPRF calculations, so maybe this isn't
// quite that necessary.
void EmuCodeBlock::SetFPRF(Gen::X64Reg xmm)
{
AND(32, M(&FPSCR), Imm32(~FPRF_MASK));
FixupBranch continue1, continue2, continue3, continue4;
if (cpu_info.bSSE4_1)
{
MOVQ_xmm(R(RAX), xmm);
SHR(64, R(RAX), Imm8(63)); // Get the sign bit; almost all the branches need it.
PTEST(xmm, M((void*)psDoubleExp));
FixupBranch maxExponent = J_CC(CC_C);
FixupBranch zeroExponent = J_CC(CC_Z);
// Nice normalized number: sign ? PPC_FPCLASS_NN : PPC_FPCLASS_PN;
LEA(32, EAX, MScaled(EAX, MathUtil::PPC_FPCLASS_NN - MathUtil::PPC_FPCLASS_PN, MathUtil::PPC_FPCLASS_PN));
continue1 = J();
SetJumpTarget(maxExponent);
PTEST(xmm, M((void*)psDoubleFrac));
FixupBranch notNAN = J_CC(CC_Z);
// Max exponent + mantissa: PPC_FPCLASS_QNAN
MOV(32, R(EAX), Imm32(MathUtil::PPC_FPCLASS_QNAN));
continue2 = J();
// Max exponent + no mantissa: sign ? PPC_FPCLASS_NINF : PPC_FPCLASS_PINF;
SetJumpTarget(notNAN);
LEA(32, EAX, MScaled(EAX, MathUtil::PPC_FPCLASS_NINF - MathUtil::PPC_FPCLASS_PINF, MathUtil::PPC_FPCLASS_NINF));
continue3 = J();
SetJumpTarget(zeroExponent);
PTEST(xmm, R(xmm));
FixupBranch zero = J_CC(CC_Z);
// No exponent + mantissa: sign ? PPC_FPCLASS_ND : PPC_FPCLASS_PD;
LEA(32, EAX, MScaled(EAX, MathUtil::PPC_FPCLASS_ND - MathUtil::PPC_FPCLASS_PD, MathUtil::PPC_FPCLASS_ND));
continue4 = J();
// Zero: sign ? PPC_FPCLASS_NZ : PPC_FPCLASS_PZ;
SetJumpTarget(zero);
SHL(32, R(EAX), Imm8(4));
ADD(32, R(EAX), Imm8(MathUtil::PPC_FPCLASS_PZ));
}
else
{
MOVQ_xmm(R(RAX), xmm);
TEST(64, R(RAX), M((void*)psDoubleExp));
FixupBranch zeroExponent = J_CC(CC_Z);
AND(64, R(RAX), M((void*)psDoubleNoSign));
CMP(64, R(RAX), M((void*)psDoubleExp));
FixupBranch nan = J_CC(CC_G); // This works because if the sign bit is set, RAX is negative
FixupBranch infinity = J_CC(CC_E);
MOVQ_xmm(R(RAX), xmm);
SHR(64, R(RAX), Imm8(63));
LEA(32, EAX, MScaled(EAX, MathUtil::PPC_FPCLASS_NN - MathUtil::PPC_FPCLASS_PN, MathUtil::PPC_FPCLASS_PN));
continue1 = J();
SetJumpTarget(nan);
MOVQ_xmm(R(RAX), xmm);
SHR(64, R(RAX), Imm8(63));
MOV(32, R(EAX), Imm32(MathUtil::PPC_FPCLASS_QNAN));
continue2 = J();
SetJumpTarget(infinity);
MOVQ_xmm(R(RAX), xmm);
SHR(64, R(RAX), Imm8(63));
LEA(32, EAX, MScaled(EAX, MathUtil::PPC_FPCLASS_NINF - MathUtil::PPC_FPCLASS_PINF, MathUtil::PPC_FPCLASS_NINF));
continue3 = J();
SetJumpTarget(zeroExponent);
TEST(64, R(RAX), R(RAX));
FixupBranch zero = J_CC(CC_Z);
SHR(64, R(RAX), Imm8(63));
LEA(32, EAX, MScaled(EAX, MathUtil::PPC_FPCLASS_ND - MathUtil::PPC_FPCLASS_PD, MathUtil::PPC_FPCLASS_ND));
continue4 = J();
SetJumpTarget(zero);
SHR(64, R(RAX), Imm8(63));
SHL(32, R(EAX), Imm8(4));
ADD(32, R(EAX), Imm8(MathUtil::PPC_FPCLASS_PZ));
}
SetJumpTarget(continue1);
SetJumpTarget(continue2);
SetJumpTarget(continue3);
SetJumpTarget(continue4);
SHL(32, R(EAX), Imm8(FPRF_SHIFT));
OR(32, M(&FPSCR), R(EAX));
}
void EmuCodeBlock::JitClearCA()
{
AND(32, M(&PowerPC::ppcState.spr[SPR_XER]), Imm32(~XER_CA_MASK)); //XER.CA = 0

View File

@ -62,6 +62,7 @@ public:
void ConvertSingleToDouble(Gen::X64Reg dst, Gen::X64Reg src, bool src_is_gpr = false);
// EAX might get trashed
void ConvertDoubleToSingle(Gen::X64Reg dst, Gen::X64Reg src);
void SetFPRF(Gen::X64Reg xmm);
protected:
std::unordered_map<u8 *, u32> registersInUseAtLoc;
std::unordered_map<u8 *, u32> pcAtLoc;

View File

@ -453,6 +453,10 @@ void PPCAnalyzer::SetInstructionStats(CodeBlock *block, CodeOp *code, GekkoOPInf
else
code->outputCR1 = (opinfo->flags & FL_SET_CR1) ? true : false;
code->wantsFPRF = (opinfo->flags & FL_READ_FPRF) ? true : false;
code->outputFPRF = (opinfo->flags & FL_SET_FPRF) ? true : false;
code->canEndBlock = (opinfo->flags & FL_ENDBLOCK) ? true : false;
int numOut = 0;
int numIn = 0;
if (opinfo->flags & FL_OUT_A)
@ -710,24 +714,25 @@ u32 PPCAnalyzer::Analyze(u32 address, CodeBlock *block, CodeBuffer *buffer, u32
}
// Scan for CR0 dependency
// assume next block wants CR0 to be safe
// assume next block wants flags to be safe
bool wantsCR0 = true;
bool wantsCR1 = true;
bool wantsPS1 = true;
bool wantsFPRF = true;
for (int i = block->m_num_instructions - 1; i >= 0; i--)
{
if (code[i].outputCR0)
wantsCR0 = false;
if (code[i].outputCR1)
wantsCR1 = false;
if (code[i].outputPS1)
wantsPS1 = false;
wantsCR0 |= code[i].wantsCR0;
wantsCR1 |= code[i].wantsCR1;
wantsPS1 |= code[i].wantsPS1;
wantsCR0 |= code[i].wantsCR0 || code[i].canEndBlock;
wantsCR1 |= code[i].wantsCR1 || code[i].canEndBlock;
wantsPS1 |= code[i].wantsPS1 || code[i].canEndBlock;
wantsFPRF |= code[i].wantsFPRF || code[i].canEndBlock;
code[i].wantsCR0 = wantsCR0;
code[i].wantsCR1 = wantsCR1;
code[i].wantsPS1 = wantsPS1;
code[i].wantsFPRF = wantsFPRF;
wantsCR0 &= !code[i].outputCR0;
wantsCR1 &= !code[i].outputCR1;
wantsPS1 &= !code[i].outputPS1;
wantsFPRF &= !code[i].outputFPRF;
}
return address;
}

View File

@ -34,9 +34,12 @@ struct CodeOp //16B
bool wantsCR0;
bool wantsCR1;
bool wantsPS1;
bool wantsFPRF;
bool outputCR0;
bool outputCR1;
bool outputPS1;
bool outputFPRF;
bool canEndBlock;
bool skip; // followed BL-s for example
};

View File

@ -25,6 +25,14 @@ GekkoOPInfo *m_infoTable63[1024];
GekkoOPInfo *m_allInstructions[512];
int m_numInstructions;
const u64 m_crTable[16] =
{
PPCCRToInternal(0x0), PPCCRToInternal(0x1), PPCCRToInternal(0x2), PPCCRToInternal(0x3),
PPCCRToInternal(0x4), PPCCRToInternal(0x5), PPCCRToInternal(0x6), PPCCRToInternal(0x7),
PPCCRToInternal(0x8), PPCCRToInternal(0x9), PPCCRToInternal(0xA), PPCCRToInternal(0xB),
PPCCRToInternal(0xC), PPCCRToInternal(0xD), PPCCRToInternal(0xE), PPCCRToInternal(0xF),
};
GekkoOPInfo *GetOpInfo(UGeckoInstruction _inst)
{
GekkoOPInfo *info = m_infoTable[_inst.OPCD];

View File

@ -36,6 +36,8 @@ enum
FL_EVIL = (1<<17),
FL_USE_FPU = (1<<18),
FL_LOADSTORE = (1<<19),
FL_SET_FPRF = (1<<20),
FL_READ_FPRF = (1<<21),
};
enum

View File

@ -187,11 +187,15 @@ inline u64 PPCCRToInternal(u8 value)
return cr_val;
}
// convert flags into 64-bit CR values with a lookup table
extern const u64 m_crTable[16];
// Warning: these CR operations are fairly slow since they need to convert from
// PowerPC format (4 bit) to our internal 64 bit format. See the definition of
// ppcState.cr_val for more explanations.
inline void SetCRField(int cr_field, int value) {
PowerPC::ppcState.cr_val[cr_field] = PPCCRToInternal(value);
inline void SetCRField(int cr_field, int value)
{
PowerPC::ppcState.cr_val[cr_field] = m_crTable[value];
}
inline u32 GetCRField(int cr_field) {