JIT: Initial FPRF support

Doesn't support all the FPSCR flags, just the FPRF ones. Add PPCAnalyzer support to remove unnecessary FPRF calculations. POV-Ray benchmark with enableFPRF forced on for an extreme comparison: before: 1500s; after, fmul/fmadd only: 728s; after, all float: 753s. In real games that use FPRF, like F-Zero GX, FPRF previously cost a few percent of total runtime. Since FPRF is so much faster now, if enableFPRF is set, just do it for every float instruction, not just fmul/fmadd like before. I don't know if this will fix any games, but there's little good reason not to.
2014-08-20 02:22:07 -07:00 · 2014-08-20 02:22:07 -07:00 · 7dbc623dc0
parent f52888d3ec
commit 7dbc623dc0
13 changed files with 222 additions and 91 deletions
--- a/Source/Core/Core/PowerPC/Gekko.h
+++ b/Source/Core/Core/PowerPC/Gekko.h
@ -383,6 +383,9 @@ union UReg_MSR
 	UReg_MSR()         { Hex = 0; }
 };

+#define FPRF_SHIFT 12 // bit position of the lowest FPRF bit within the 32-bit FPSCR value
+#define FPRF_MASK (0x1F << FPRF_SHIFT) // selects the 5-bit FPRF field of FPSCR
+
 // Floating Point Status and Control Register
 union UReg_FPSCR
 {
--- a/Source/Core/Core/PowerPC/Interpreter/Interpreter_Tables.cpp
+++ b/Source/Core/Core/PowerPC/Interpreter/Interpreter_Tables.cpp
@ -97,14 +97,14 @@ static GekkoOPTemplate primarytable[] =

 static GekkoOPTemplate table4[] =
 {    //SUBOP10
-	{0,    Interpreter::ps_cmpu0,   {"ps_cmpu0",   OPTYPE_PS, FL_SET_CRn | FL_USE_FPU, 1, 0, 0, 0}},
-	{32,   Interpreter::ps_cmpo0,   {"ps_cmpo0",   OPTYPE_PS, FL_SET_CRn | FL_USE_FPU, 1, 0, 0, 0}},
+	{0,    Interpreter::ps_cmpu0,   {"ps_cmpu0",   OPTYPE_PS, FL_SET_CRn | FL_USE_FPU | FL_SET_FPRF, 1, 0, 0, 0}},
+	{32,   Interpreter::ps_cmpo0,   {"ps_cmpo0",   OPTYPE_PS, FL_SET_CRn | FL_USE_FPU | FL_SET_FPRF, 1, 0, 0, 0}},
 	{40,   Interpreter::ps_neg,     {"ps_neg",     OPTYPE_PS, FL_RC_BIT | FL_USE_FPU, 1, 0, 0, 0}},
 	{136,  Interpreter::ps_nabs,    {"ps_nabs",    OPTYPE_PS, FL_RC_BIT | FL_USE_FPU, 1, 0, 0, 0}},
 	{264,  Interpreter::ps_abs,     {"ps_abs",     OPTYPE_PS, FL_RC_BIT | FL_USE_FPU, 1, 0, 0, 0}},
-	{64,   Interpreter::ps_cmpu1,   {"ps_cmpu1",   OPTYPE_PS, FL_RC_BIT | FL_USE_FPU, 1, 0, 0, 0}},
+	{64,   Interpreter::ps_cmpu1,   {"ps_cmpu1",   OPTYPE_PS, FL_RC_BIT | FL_USE_FPU | FL_SET_FPRF, 1, 0, 0, 0}},
 	{72,   Interpreter::ps_mr,      {"ps_mr",      OPTYPE_PS, FL_RC_BIT | FL_USE_FPU, 1, 0, 0, 0}},
-	{96,   Interpreter::ps_cmpo1,   {"ps_cmpo1",   OPTYPE_PS, FL_RC_BIT | FL_USE_FPU, 1, 0, 0, 0}},
+	{96,   Interpreter::ps_cmpo1,   {"ps_cmpo1",   OPTYPE_PS, FL_RC_BIT | FL_USE_FPU | FL_SET_FPRF, 1, 0, 0, 0}},
 	{528,  Interpreter::ps_merge00, {"ps_merge00", OPTYPE_PS, FL_RC_BIT | FL_USE_FPU, 1, 0, 0, 0}},
 	{560,  Interpreter::ps_merge01, {"ps_merge01", OPTYPE_PS, FL_RC_BIT | FL_USE_FPU, 1, 0, 0, 0}},
 	{592,  Interpreter::ps_merge10, {"ps_merge10", OPTYPE_PS, FL_RC_BIT | FL_USE_FPU, 1, 0, 0, 0}},
@ -115,23 +115,23 @@ static GekkoOPTemplate table4[] =

 static GekkoOPTemplate table4_2[] =
 {
-	{10, Interpreter::ps_sum0,      {"ps_sum0",   OPTYPE_PS, FL_USE_FPU, 1, 0, 0, 0}},
-	{11, Interpreter::ps_sum1,      {"ps_sum1",   OPTYPE_PS, FL_USE_FPU, 1, 0, 0, 0}},
-	{12, Interpreter::ps_muls0,     {"ps_muls0",  OPTYPE_PS, FL_USE_FPU, 1, 0, 0, 0}},
-	{13, Interpreter::ps_muls1,     {"ps_muls1",  OPTYPE_PS, FL_USE_FPU, 1, 0, 0, 0}},
-	{14, Interpreter::ps_madds0,    {"ps_madds0", OPTYPE_PS, FL_USE_FPU, 1, 0, 0, 0}},
-	{15, Interpreter::ps_madds1,    {"ps_madds1", OPTYPE_PS, FL_USE_FPU, 1, 0, 0, 0}},
-	{18, Interpreter::ps_div,       {"ps_div",    OPTYPE_PS, FL_USE_FPU, 17, 0, 0, 0}},
-	{20, Interpreter::ps_sub,       {"ps_sub",    OPTYPE_PS, FL_USE_FPU, 1, 0, 0, 0}},
-	{21, Interpreter::ps_add,       {"ps_add",    OPTYPE_PS, FL_USE_FPU, 1, 0, 0, 0}},
+	{10, Interpreter::ps_sum0,      {"ps_sum0",   OPTYPE_PS, FL_USE_FPU | FL_SET_FPRF, 1, 0, 0, 0}},
+	{11, Interpreter::ps_sum1,      {"ps_sum1",   OPTYPE_PS, FL_USE_FPU | FL_SET_FPRF, 1, 0, 0, 0}},
+	{12, Interpreter::ps_muls0,     {"ps_muls0",  OPTYPE_PS, FL_USE_FPU | FL_SET_FPRF, 1, 0, 0, 0}},
+	{13, Interpreter::ps_muls1,     {"ps_muls1",  OPTYPE_PS, FL_USE_FPU | FL_SET_FPRF, 1, 0, 0, 0}},
+	{14, Interpreter::ps_madds0,    {"ps_madds0", OPTYPE_PS, FL_USE_FPU | FL_SET_FPRF, 1, 0, 0, 0}},
+	{15, Interpreter::ps_madds1,    {"ps_madds1", OPTYPE_PS, FL_USE_FPU | FL_SET_FPRF, 1, 0, 0, 0}},
+	{18, Interpreter::ps_div,       {"ps_div",    OPTYPE_PS, FL_USE_FPU | FL_SET_FPRF, 17, 0, 0, 0}},
+	{20, Interpreter::ps_sub,       {"ps_sub",    OPTYPE_PS, FL_USE_FPU | FL_SET_FPRF, 1, 0, 0, 0}},
+	{21, Interpreter::ps_add,       {"ps_add",    OPTYPE_PS, FL_USE_FPU | FL_SET_FPRF, 1, 0, 0, 0}},
 	{23, Interpreter::ps_sel,       {"ps_sel",    OPTYPE_PS, FL_USE_FPU, 1, 0, 0, 0}},
-	{24, Interpreter::ps_res,       {"ps_res",    OPTYPE_PS, FL_USE_FPU, 1, 0, 0, 0}},
-	{25, Interpreter::ps_mul,       {"ps_mul",    OPTYPE_PS, FL_USE_FPU, 1, 0, 0, 0}},
-	{26, Interpreter::ps_rsqrte,    {"ps_rsqrte", OPTYPE_PS, FL_USE_FPU, 2, 0, 0, 0}},
-	{28, Interpreter::ps_msub,      {"ps_msub",   OPTYPE_PS, FL_USE_FPU, 1, 0, 0, 0}},
-	{29, Interpreter::ps_madd,      {"ps_madd",   OPTYPE_PS, FL_USE_FPU, 1, 0, 0, 0}},
-	{30, Interpreter::ps_nmsub,     {"ps_nmsub",  OPTYPE_PS, FL_USE_FPU, 1, 0, 0, 0}},
-	{31, Interpreter::ps_nmadd,     {"ps_nmadd",  OPTYPE_PS, FL_USE_FPU, 1, 0, 0, 0}},
+	{24, Interpreter::ps_res,       {"ps_res",    OPTYPE_PS, FL_USE_FPU | FL_SET_FPRF, 1, 0, 0, 0}},
+	{25, Interpreter::ps_mul,       {"ps_mul",    OPTYPE_PS, FL_USE_FPU | FL_SET_FPRF, 1, 0, 0, 0}},
+	{26, Interpreter::ps_rsqrte,    {"ps_rsqrte", OPTYPE_PS, FL_USE_FPU | FL_SET_FPRF, 2, 0, 0, 0}},
+	{28, Interpreter::ps_msub,      {"ps_msub",   OPTYPE_PS, FL_USE_FPU | FL_SET_FPRF, 1, 0, 0, 0}},
+	{29, Interpreter::ps_madd,      {"ps_madd",   OPTYPE_PS, FL_USE_FPU | FL_SET_FPRF, 1, 0, 0, 0}},
+	{30, Interpreter::ps_nmsub,     {"ps_nmsub",  OPTYPE_PS, FL_USE_FPU | FL_SET_FPRF, 1, 0, 0, 0}},
+	{31, Interpreter::ps_nmadd,     {"ps_nmadd",  OPTYPE_PS, FL_USE_FPU | FL_SET_FPRF, 1, 0, 0, 0}},
 };


@ -307,51 +307,51 @@ static GekkoOPTemplate table31_2[] =

 static GekkoOPTemplate table59[] =
 {
-	{18, Interpreter::fdivsx,       {"fdivsx",   OPTYPE_FPU, FL_RC_BIT_F | FL_USE_FPU, 17, 0, 0, 0}}, // TODO
-	{20, Interpreter::fsubsx,       {"fsubsx",   OPTYPE_FPU, FL_RC_BIT_F | FL_USE_FPU, 1, 0, 0, 0}},
-	{21, Interpreter::faddsx,       {"faddsx",   OPTYPE_FPU, FL_RC_BIT_F | FL_USE_FPU, 1, 0, 0, 0}},
-	//{22, Interpreter::fsqrtsx,      {"fsqrtsx",  OPTYPE_FPU, FL_RC_BIT_F | FL_USE_FPU, 1, 0, 0, 0}}, // Not implemented on gekko
-	{24, Interpreter::fresx,        {"fresx",    OPTYPE_FPU, FL_RC_BIT_F | FL_USE_FPU, 1, 0, 0, 0}},
-	{25, Interpreter::fmulsx,       {"fmulsx",   OPTYPE_FPU, FL_RC_BIT_F | FL_USE_FPU, 1, 0, 0, 0}},
-	{28, Interpreter::fmsubsx,      {"fmsubsx",  OPTYPE_FPU, FL_RC_BIT_F | FL_USE_FPU, 1, 0, 0, 0}},
-	{29, Interpreter::fmaddsx,      {"fmaddsx",  OPTYPE_FPU, FL_RC_BIT_F | FL_USE_FPU, 1, 0, 0, 0}},
-	{30, Interpreter::fnmsubsx,     {"fnmsubsx", OPTYPE_FPU, FL_RC_BIT_F | FL_USE_FPU, 1, 0, 0, 0}},
-	{31, Interpreter::fnmaddsx,     {"fnmaddsx", OPTYPE_FPU, FL_RC_BIT_F | FL_USE_FPU, 1, 0, 0, 0}},
+	{18, Interpreter::fdivsx,       {"fdivsx",   OPTYPE_FPU, FL_RC_BIT_F | FL_USE_FPU | FL_SET_FPRF, 17, 0, 0, 0}}, // TODO
+	{20, Interpreter::fsubsx,       {"fsubsx",   OPTYPE_FPU, FL_RC_BIT_F | FL_USE_FPU | FL_SET_FPRF, 1, 0, 0, 0}},
+	{21, Interpreter::faddsx,       {"faddsx",   OPTYPE_FPU, FL_RC_BIT_F | FL_USE_FPU | FL_SET_FPRF, 1, 0, 0, 0}},
+	//{22, Interpreter::fsqrtsx,      {"fsqrtsx",  OPTYPE_FPU, FL_RC_BIT_F | FL_USE_FPU | FL_SET_FPRF, 1, 0, 0, 0}}, // Not implemented on gekko
+	{24, Interpreter::fresx,        {"fresx",    OPTYPE_FPU, FL_RC_BIT_F | FL_USE_FPU | FL_SET_FPRF, 1, 0, 0, 0}},
+	{25, Interpreter::fmulsx,       {"fmulsx",   OPTYPE_FPU, FL_RC_BIT_F | FL_USE_FPU | FL_SET_FPRF, 1, 0, 0, 0}},
+	{28, Interpreter::fmsubsx,      {"fmsubsx",  OPTYPE_FPU, FL_RC_BIT_F | FL_USE_FPU | FL_SET_FPRF, 1, 0, 0, 0}},
+	{29, Interpreter::fmaddsx,      {"fmaddsx",  OPTYPE_FPU, FL_RC_BIT_F | FL_USE_FPU | FL_SET_FPRF, 1, 0, 0, 0}},
+	{30, Interpreter::fnmsubsx,     {"fnmsubsx", OPTYPE_FPU, FL_RC_BIT_F | FL_USE_FPU | FL_SET_FPRF, 1, 0, 0, 0}},
+	{31, Interpreter::fnmaddsx,     {"fnmaddsx", OPTYPE_FPU, FL_RC_BIT_F | FL_USE_FPU | FL_SET_FPRF, 1, 0, 0, 0}},
 };

 static GekkoOPTemplate table63[] =
 {
 	{264, Interpreter::fabsx,       {"fabsx",   OPTYPE_FPU, FL_RC_BIT_F | FL_USE_FPU, 1, 0, 0, 0}},
-	{32,  Interpreter::fcmpo,       {"fcmpo",   OPTYPE_FPU, FL_RC_BIT_F | FL_USE_FPU, 1, 0, 0, 0}},
-	{0,   Interpreter::fcmpu,       {"fcmpu",   OPTYPE_FPU, FL_RC_BIT_F | FL_USE_FPU, 1, 0, 0, 0}},
+	{32,  Interpreter::fcmpo,       {"fcmpo",   OPTYPE_FPU, FL_RC_BIT_F | FL_USE_FPU | FL_SET_FPRF, 1, 0, 0, 0}},
+	{0,   Interpreter::fcmpu,       {"fcmpu",   OPTYPE_FPU, FL_RC_BIT_F | FL_USE_FPU | FL_SET_FPRF, 1, 0, 0, 0}},
 	{14,  Interpreter::fctiwx,      {"fctiwx",  OPTYPE_FPU, FL_RC_BIT_F | FL_USE_FPU, 1, 0, 0, 0}},
 	{15,  Interpreter::fctiwzx,     {"fctiwzx", OPTYPE_FPU, FL_RC_BIT_F | FL_USE_FPU, 1, 0, 0, 0}},
 	{72,  Interpreter::fmrx,        {"fmrx",    OPTYPE_FPU, FL_RC_BIT_F | FL_USE_FPU, 1, 0, 0, 0}},
 	{136, Interpreter::fnabsx,      {"fnabsx",  OPTYPE_FPU, FL_RC_BIT_F | FL_USE_FPU, 1, 0, 0, 0}},
 	{40,  Interpreter::fnegx,       {"fnegx",   OPTYPE_FPU, FL_RC_BIT_F | FL_USE_FPU, 1, 0, 0, 0}},
-	{12,  Interpreter::frspx,       {"frspx",   OPTYPE_FPU, FL_RC_BIT_F | FL_USE_FPU, 1, 0, 0, 0}},
+	{12,  Interpreter::frspx,       {"frspx",   OPTYPE_FPU, FL_RC_BIT_F | FL_USE_FPU | FL_SET_FPRF, 1, 0, 0, 0}},

-	{64,  Interpreter::mcrfs,       {"mcrfs",   OPTYPE_SYSTEMFP, FL_USE_FPU, 1, 0, 0, 0}},
-	{583, Interpreter::mffsx,       {"mffsx",   OPTYPE_SYSTEMFP, FL_USE_FPU, 1, 0, 0, 0}},
-	{70,  Interpreter::mtfsb0x,     {"mtfsb0x", OPTYPE_SYSTEMFP, FL_USE_FPU, 3, 0, 0, 0}},
-	{38,  Interpreter::mtfsb1x,     {"mtfsb1x", OPTYPE_SYSTEMFP, FL_USE_FPU, 3, 0, 0, 0}},
-	{134, Interpreter::mtfsfix,     {"mtfsfix", OPTYPE_SYSTEMFP, FL_USE_FPU, 3, 0, 0, 0}},
-	{711, Interpreter::mtfsfx,      {"mtfsfx",  OPTYPE_SYSTEMFP, FL_USE_FPU, 3, 0, 0, 0}},
+	{64,  Interpreter::mcrfs,       {"mcrfs",   OPTYPE_SYSTEMFP, FL_USE_FPU | FL_READ_FPRF, 1, 0, 0, 0}},
+	{583, Interpreter::mffsx,       {"mffsx",   OPTYPE_SYSTEMFP, FL_USE_FPU | FL_READ_FPRF, 1, 0, 0, 0}},
+	{70,  Interpreter::mtfsb0x,     {"mtfsb0x", OPTYPE_SYSTEMFP, FL_USE_FPU | FL_READ_FPRF, 3, 0, 0, 0}},
+	{38,  Interpreter::mtfsb1x,     {"mtfsb1x", OPTYPE_SYSTEMFP, FL_USE_FPU | FL_READ_FPRF, 3, 0, 0, 0}},
+	{134, Interpreter::mtfsfix,     {"mtfsfix", OPTYPE_SYSTEMFP, FL_USE_FPU | FL_READ_FPRF, 3, 0, 0, 0}},
+	{711, Interpreter::mtfsfx,      {"mtfsfx",  OPTYPE_SYSTEMFP, FL_USE_FPU | FL_READ_FPRF, 3, 0, 0, 0}},
 };

 static GekkoOPTemplate table63_2[] =
 {
-	{18, Interpreter::fdivx,        {"fdivx",    OPTYPE_FPU, FL_RC_BIT_F | FL_USE_FPU, 31, 0, 0, 0}},
-	{20, Interpreter::fsubx,        {"fsubx",    OPTYPE_FPU, FL_RC_BIT_F | FL_USE_FPU, 1, 0, 0, 0}},
-	{21, Interpreter::faddx,        {"faddx",    OPTYPE_FPU, FL_RC_BIT_F | FL_USE_FPU, 1, 0, 0, 0}},
-	{22, Interpreter::fsqrtx,       {"fsqrtx",   OPTYPE_FPU, FL_RC_BIT_F | FL_USE_FPU, 1, 0, 0, 0}},
+	{18, Interpreter::fdivx,        {"fdivx",    OPTYPE_FPU, FL_RC_BIT_F | FL_USE_FPU | FL_SET_FPRF, 31, 0, 0, 0}},
+	{20, Interpreter::fsubx,        {"fsubx",    OPTYPE_FPU, FL_RC_BIT_F | FL_USE_FPU | FL_SET_FPRF, 1, 0, 0, 0}},
+	{21, Interpreter::faddx,        {"faddx",    OPTYPE_FPU, FL_RC_BIT_F | FL_USE_FPU | FL_SET_FPRF, 1, 0, 0, 0}},
+	{22, Interpreter::fsqrtx,       {"fsqrtx",   OPTYPE_FPU, FL_RC_BIT_F | FL_USE_FPU | FL_SET_FPRF, 1, 0, 0, 0}},
 	{23, Interpreter::fselx,        {"fselx",    OPTYPE_FPU, FL_RC_BIT_F | FL_USE_FPU, 1, 0, 0, 0}},
-	{25, Interpreter::fmulx,        {"fmulx",    OPTYPE_FPU, FL_RC_BIT_F | FL_USE_FPU, 1, 0, 0, 0}},
-	{26, Interpreter::frsqrtex,     {"frsqrtex", OPTYPE_FPU, FL_RC_BIT_F | FL_USE_FPU, 1, 0, 0, 0}},
-	{28, Interpreter::fmsubx,       {"fmsubx",   OPTYPE_FPU, FL_RC_BIT_F | FL_USE_FPU, 1, 0, 0, 0}},
-	{29, Interpreter::fmaddx,       {"fmaddx",   OPTYPE_FPU, FL_RC_BIT_F | FL_USE_FPU, 1, 0, 0, 0}},
-	{30, Interpreter::fnmsubx,      {"fnmsubx",  OPTYPE_FPU, FL_RC_BIT_F | FL_USE_FPU, 1, 0, 0, 0}},
-	{31, Interpreter::fnmaddx,      {"fnmaddx",  OPTYPE_FPU, FL_RC_BIT_F | FL_USE_FPU, 1, 0, 0, 0}},
+	{25, Interpreter::fmulx,        {"fmulx",    OPTYPE_FPU, FL_RC_BIT_F | FL_USE_FPU | FL_SET_FPRF, 1, 0, 0, 0}},
+	{26, Interpreter::frsqrtex,     {"frsqrtex", OPTYPE_FPU, FL_RC_BIT_F | FL_USE_FPU | FL_SET_FPRF, 1, 0, 0, 0}},
+	{28, Interpreter::fmsubx,       {"fmsubx",   OPTYPE_FPU, FL_RC_BIT_F | FL_USE_FPU | FL_SET_FPRF, 1, 0, 0, 0}},
+	{29, Interpreter::fmaddx,       {"fmaddx",   OPTYPE_FPU, FL_RC_BIT_F | FL_USE_FPU | FL_SET_FPRF, 1, 0, 0, 0}},
+	{30, Interpreter::fnmsubx,      {"fnmsubx",  OPTYPE_FPU, FL_RC_BIT_F | FL_USE_FPU | FL_SET_FPRF, 1, 0, 0, 0}},
+	{31, Interpreter::fnmaddx,      {"fnmaddx",  OPTYPE_FPU, FL_RC_BIT_F | FL_USE_FPU | FL_SET_FPRF, 1, 0, 0, 0}},
 };
 namespace InterpreterTables
 {
--- a/Source/Core/Core/PowerPC/Jit64/Jit.h
+++ b/Source/Core/Core/PowerPC/Jit64/Jit.h
@ -116,11 +116,12 @@ public:
 	// Generates a branch that will check if a given bit of a CR register part
 	// is set or not.
 	Gen::FixupBranch JumpIfCRFieldBit(int field, int bit, bool jump_if_set = true);
+	void SetFPRFIfNeeded(UGeckoInstruction inst, Gen::X64Reg xmm);

-	void tri_op(int d, int a, int b, bool reversible, void (Gen::XEmitter::*op)(Gen::X64Reg, Gen::OpArg), bool roundRHS = false);
+	void tri_op(int d, int a, int b, bool reversible, void (Gen::XEmitter::*op)(Gen::X64Reg, Gen::OpArg), UGeckoInstruction inst, bool roundRHS = false);
 	typedef u32 (*Operation)(u32 a, u32 b);
 	void regimmop(int d, int a, bool binary, u32 value, Operation doop, void (Gen::XEmitter::*op)(int, const Gen::OpArg&, const Gen::OpArg&), bool Rc = false, bool carry = false);
-	void fp_tri_op(int d, int a, int b, bool reversible, bool single, void (Gen::XEmitter::*op)(Gen::X64Reg, Gen::OpArg), bool roundRHS = false);
+	void fp_tri_op(int d, int a, int b, bool reversible, bool single, void (Gen::XEmitter::*op)(Gen::X64Reg, Gen::OpArg), UGeckoInstruction inst, bool roundRHS = false);

 	// OPCODES
 	void unknown_instruction(UGeckoInstruction _inst);
--- a/Source/Core/Core/PowerPC/Jit64/Jit_FloatingPoint.cpp
+++ b/Source/Core/Core/PowerPC/Jit64/Jit_FloatingPoint.cpp
@ -14,7 +14,7 @@ static const u64 GC_ALIGNED16(psSignBits2[2]) = {0x8000000000000000ULL, 0x800000
 static const u64 GC_ALIGNED16(psAbsMask2[2])  = {0x7FFFFFFFFFFFFFFFULL, 0x7FFFFFFFFFFFFFFFULL};
 static const double GC_ALIGNED16(half_qnan_and_s32_max[2]) = {0x7FFFFFFF, -0x80000};

-void Jit64::fp_tri_op(int d, int a, int b, bool reversible, bool single, void (XEmitter::*op)(Gen::X64Reg, Gen::OpArg), bool roundRHS)
+void Jit64::fp_tri_op(int d, int a, int b, bool reversible, bool single, void (XEmitter::*op)(Gen::X64Reg, Gen::OpArg), UGeckoInstruction inst, bool roundRHS)
 {
 	fpr.Lock(d, a, b);
 	if (roundRHS)
@ -88,25 +88,35 @@ void Jit64::fp_tri_op(int d, int a, int b, bool reversible, bool single, void (X
 			UNPCKLPD(fpr.RX(d), R(fpr.RX(d)));
 		}
 	}
+	SetFPRFIfNeeded(inst, fpr.RX(d));
 	fpr.UnlockAll();
 }

+// FPRF is overwritten by every float operation, so computing it only pays off when
+// something reads it before the next overwrite. When the analyzer says nobody will,
+// the calculation is skipped altogether.
+void Jit64::SetFPRFIfNeeded(UGeckoInstruction inst, X64Reg xmm)
+{
+	// The games known to depend on this flag only seem to need FPRF for fmul and fmadd, but
+	// the JIT implementation is cheap enough to run for every float instruction whenever
+	// the enableFPRF flag is on.
+	if (!Core::g_CoreStartupParameter.bEnableFPRF)
+		return;
+
+	// Result is clobbered before anything reads it: nothing to emit.
+	if (!js.op->wantsFPRF)
+		return;
+
+	SetFPRF(xmm);
+}
+
 void Jit64::fp_arith(UGeckoInstruction inst)
 {
 	INSTRUCTION_START
 	JITDISABLE(bJITFloatingPointOff);
 	FALLBACK_IF(inst.Rc);

-	// Only the interpreter has "proper" support for (some) FP flags
-	FALLBACK_IF(inst.SUBOP5 == 25 && Core::g_CoreStartupParameter.bEnableFPRF);
-
 	bool single = inst.OPCD == 59;
 	switch (inst.SUBOP5)
 	{
-	case 18: fp_tri_op(inst.FD, inst.FA, inst.FB, false, single, &XEmitter::DIVSD); break; //div
-	case 20: fp_tri_op(inst.FD, inst.FA, inst.FB, false, single, &XEmitter::SUBSD); break; //sub
-	case 21: fp_tri_op(inst.FD, inst.FA, inst.FB, true,  single, &XEmitter::ADDSD); break; //add
-	case 25: fp_tri_op(inst.FD, inst.FA, inst.FC, true, single, &XEmitter::MULSD, single); break; //mul
+	case 18: fp_tri_op(inst.FD, inst.FA, inst.FB, false, single, &XEmitter::DIVSD, inst); break; //div
+	case 20: fp_tri_op(inst.FD, inst.FA, inst.FB, false, single, &XEmitter::SUBSD, inst); break; //sub
+	case 21: fp_tri_op(inst.FD, inst.FA, inst.FB, true, single, &XEmitter::ADDSD, inst); break; //add
+	case 25: fp_tri_op(inst.FD, inst.FA, inst.FC, true, single, &XEmitter::MULSD, inst, single); break; //mul
 	default:
 		_assert_msg_(DYNA_REC, 0, "fp_arith WTF!!!");
 	}
@ -118,9 +128,6 @@ void Jit64::fmaddXX(UGeckoInstruction inst)
 	JITDISABLE(bJITFloatingPointOff);
 	FALLBACK_IF(inst.Rc);

-	// Only the interpreter has "proper" support for (some) FP flags
-	FALLBACK_IF(inst.SUBOP5 == 29 && Core::g_CoreStartupParameter.bEnableFPRF);
-
 	bool single_precision = inst.OPCD == 59;

 	int a = inst.FA;
@ -165,9 +172,7 @@ void Jit64::fmaddXX(UGeckoInstruction inst)
 	{
 		MOVSD(fpr.RX(d), R(XMM0));
 	}
-	// SMB checks flags after this op. Let's lie.
-	//AND(32, M(&PowerPC::ppcState.fpscr), Imm32(~((0x80000000 >> 19) | (0x80000000 >> 15))));
-	//OR(32, M(&PowerPC::ppcState.fpscr), Imm32((0x80000000 >> 16)));
+	SetFPRFIfNeeded(inst, fpr.RX(d));
 	fpr.UnlockAll();
 }

@ -241,6 +246,7 @@ void Jit64::fcmpx(UGeckoInstruction inst)
 	fpr.Lock(a,b);
 	fpr.BindToRegister(b, true);

+	AND(32, M(&FPSCR), Imm32(~FPRF_MASK));
 	// Are we masking sNaN invalid floating point exceptions? If not this could crash if we don't handle the exception?
 	UCOMISD(fpr.R(b).GetSimpleReg(), fpr.R(a));

@ -264,10 +270,13 @@ void Jit64::fcmpx(UGeckoInstruction inst)
 	}

 	MOV(64, R(RAX), Imm64(PPCCRToInternal(CR_EQ)));
+	OR(32, M(&FPSCR), Imm32(CR_EQ << FPRF_SHIFT));
+
 	continue1 = J();

 	SetJumpTarget(pNaN);
 	MOV(64, R(RAX), Imm64(PPCCRToInternal(CR_SO)));
+	OR(32, M(&FPSCR), Imm32(CR_SO << FPRF_SHIFT));

 	if (a != b)
 	{
@ -275,10 +284,12 @@ void Jit64::fcmpx(UGeckoInstruction inst)

 		SetJumpTarget(pGreater);
 		MOV(64, R(RAX), Imm64(PPCCRToInternal(CR_GT)));
+		OR(32, M(&FPSCR), Imm32(CR_GT << FPRF_SHIFT));
 		continue3 = J();

 		SetJumpTarget(pLesser);
 		MOV(64, R(RAX), Imm64(PPCCRToInternal(CR_LT)));
+		OR(32, M(&FPSCR), Imm32(CR_LT << FPRF_SHIFT));
 	}

 	SetJumpTarget(continue1);
--- a/Source/Core/Core/PowerPC/Jit64/Jit_Paired.cpp
+++ b/Source/Core/Core/PowerPC/Jit64/Jit_Paired.cpp
@ -113,7 +113,7 @@ add a,b,a
 */

 //There's still a little bit more optimization that can be squeezed out of this
-void Jit64::tri_op(int d, int a, int b, bool reversible, void (XEmitter::*op)(X64Reg, OpArg), bool roundRHS)
+void Jit64::tri_op(int d, int a, int b, bool reversible, void (XEmitter::*op)(X64Reg, OpArg), UGeckoInstruction inst, bool roundRHS)
 {
 	fpr.Lock(d, a, b);

@ -163,6 +163,7 @@ void Jit64::tri_op(int d, int a, int b, bool reversible, void (XEmitter::*op)(X6
 		(this->*op)(fpr.RX(d), fpr.R(b));
 	}
 	ForceSinglePrecisionP(fpr.RX(d));
+	SetFPRFIfNeeded(inst, fpr.RX(d));
 	fpr.UnlockAll();
 }

@ -175,16 +176,16 @@ void Jit64::ps_arith(UGeckoInstruction inst)
 	switch (inst.SUBOP5)
 	{
 	case 18: // div
-		tri_op(inst.FD, inst.FA, inst.FB, false, &XEmitter::DIVPD);
+		tri_op(inst.FD, inst.FA, inst.FB, false, &XEmitter::DIVPD, inst);
 		break;
 	case 20: // sub
-		tri_op(inst.FD, inst.FA, inst.FB, false, &XEmitter::SUBPD);
+		tri_op(inst.FD, inst.FA, inst.FB, false, &XEmitter::SUBPD, inst);
 		break;
 	case 21: // add
-		tri_op(inst.FD, inst.FA, inst.FB, true,  &XEmitter::ADDPD);
+		tri_op(inst.FD, inst.FA, inst.FB, true, &XEmitter::ADDPD, inst);
 		break;
 	case 25: // mul
-		tri_op(inst.FD, inst.FA, inst.FC, true, &XEmitter::MULPD, true);
+		tri_op(inst.FD, inst.FA, inst.FC, true, &XEmitter::MULPD, inst, true);
 		break;
 	default:
 		_assert_msg_(DYNA_REC, 0, "ps_arith WTF!!!");
@ -228,6 +229,7 @@ void Jit64::ps_sum(UGeckoInstruction inst)
 		PanicAlert("ps_sum WTF!!!");
 	}
 	ForceSinglePrecisionP(fpr.RX(d));
+	SetFPRFIfNeeded(inst, fpr.RX(d));
 	fpr.UnlockAll();
 }

@ -267,6 +269,7 @@ void Jit64::ps_muls(UGeckoInstruction inst)
 		PanicAlert("ps_muls WTF!!!");
 	}
 	ForceSinglePrecisionP(fpr.RX(d));
+	SetFPRFIfNeeded(inst, fpr.RX(d));
 	fpr.UnlockAll();
 }

@ -372,5 +375,6 @@ void Jit64::ps_maddXX(UGeckoInstruction inst)
 	fpr.BindToRegister(d, false);
 	MOVAPD(fpr.RX(d), Gen::R(XMM0));
 	ForceSinglePrecisionP(fpr.RX(d));
+	SetFPRFIfNeeded(inst, fpr.RX(d));
 	fpr.UnlockAll();
 }
--- a/Source/Core/Core/PowerPC/Jit64/Jit_SystemRegisters.cpp
+++ b/Source/Core/Core/PowerPC/Jit64/Jit_SystemRegisters.cpp
@ -284,15 +284,6 @@ void Jit64::mfcr(UGeckoInstruction inst)
 	gpr.UnlockAllX();
 }

-// convert flags into 64-bit CR values with a lookup table
-static const u64 m_crTable[16] =
-{
-	PPCCRToInternal(0x0), PPCCRToInternal(0x1), PPCCRToInternal(0x2), PPCCRToInternal(0x3),
-	PPCCRToInternal(0x4), PPCCRToInternal(0x5), PPCCRToInternal(0x6), PPCCRToInternal(0x7),
-	PPCCRToInternal(0x8), PPCCRToInternal(0x9), PPCCRToInternal(0xA), PPCCRToInternal(0xB),
-	PPCCRToInternal(0xC), PPCCRToInternal(0xD), PPCCRToInternal(0xE), PPCCRToInternal(0xF),
-};
-
 void Jit64::mtcrf(UGeckoInstruction inst)
 {
 	INSTRUCTION_START
--- a/Source/Core/Core/PowerPC/JitCommon/Jit_Util.cpp
+++ b/Source/Core/Core/PowerPC/JitCommon/Jit_Util.cpp
@ -6,6 +6,7 @@

 #include "Common/Common.h"
 #include "Common/CPUDetect.h"
+#include "Common/MathUtil.h"

 #include "Core/HW/MMIO.h"
 #include "Core/PowerPC/JitCommon/Jit_Util.h"
@ -695,6 +696,103 @@ void EmuCodeBlock::ConvertSingleToDouble(X64Reg dst, X64Reg src, bool src_is_gpr
 	MOVDDUP(dst, R(dst));
 }

+static const u64 GC_ALIGNED16(psDoubleExp[2])  = {0x7FF0000000000000ULL, 0};
+static const u64 GC_ALIGNED16(psDoubleFrac[2]) = {0x000FFFFFFFFFFFFFULL, 0};
+static const u64 GC_ALIGNED16(psDoubleNoSign[2]) = {0x7FFFFFFFFFFFFFFFULL, 0};
+
+// TODO: it might be faster to handle FPRF in the same way as CR is currently handled for integer, storing
+// the result of each floating point op and calculating it when needed. This is trickier than for integers
+// though, because there's 32 possible FPRF bit combinations but only 9 categories of floating point values,
+// which makes the whole thing rather trickier.
+// Fortunately, PPCAnalyzer can optimize out a large portion of FPRF calculations, so maybe this isn't
+// quite that necessary.
+//
+// Emits code that classifies the double held in the low 64 bits of `xmm` and writes the matching
+// MathUtil::PPC_FPCLASS_* value into the FPRF field of FPSCR (the previous FPRF bits are cleared first).
+// The upper 64 bits of `xmm` do not influence the result. The generated code clobbers RAX.
+//
+// Every signed class pair is selected with LEA as `sign * (NEGATIVE - POSITIVE) + POSITIVE`, so the
+// base (offset) operand must always be the positive class of the pair.
+void EmuCodeBlock::SetFPRF(Gen::X64Reg xmm)
+{
+	AND(32, M(&FPSCR), Imm32(~FPRF_MASK));
+
+	FixupBranch continue1, continue2, continue3, continue4;
+	if (cpu_info.bSSE4_1)
+	{
+		MOVQ_xmm(R(RAX), xmm);
+		SHR(64, R(RAX), Imm8(63)); // Get the sign bit; almost all the branches need it.
+		// PTEST against the exponent mask: CF is set when every exponent bit is set in xmm,
+		// ZF is set when none of them are.
+		PTEST(xmm, M((void*)psDoubleExp));
+		FixupBranch maxExponent = J_CC(CC_C);
+		FixupBranch zeroExponent = J_CC(CC_Z);
+
+		// Nice normalized number: sign ? PPC_FPCLASS_NN : PPC_FPCLASS_PN;
+		LEA(32, EAX, MScaled(EAX, MathUtil::PPC_FPCLASS_NN - MathUtil::PPC_FPCLASS_PN, MathUtil::PPC_FPCLASS_PN));
+		continue1 = J();
+
+		SetJumpTarget(maxExponent);
+		PTEST(xmm, M((void*)psDoubleFrac));
+		FixupBranch notNAN = J_CC(CC_Z);
+
+		// Max exponent + mantissa: PPC_FPCLASS_QNAN
+		MOV(32, R(EAX), Imm32(MathUtil::PPC_FPCLASS_QNAN));
+		continue2 = J();
+
+		// Max exponent + no mantissa: sign ? PPC_FPCLASS_NINF : PPC_FPCLASS_PINF;
+		SetJumpTarget(notNAN);
+		LEA(32, EAX, MScaled(EAX, MathUtil::PPC_FPCLASS_NINF - MathUtil::PPC_FPCLASS_PINF, MathUtil::PPC_FPCLASS_PINF));
+		continue3 = J();
+
+		// The exponent is known to be zero here, so only the mantissa of the low double decides
+		// between zero and denormal. Testing against the mantissa mask keeps the sign bit (-0.0)
+		// and the upper half of the register out of the decision.
+		SetJumpTarget(zeroExponent);
+		PTEST(xmm, M((void*)psDoubleFrac));
+		FixupBranch zero = J_CC(CC_Z);
+
+		// No exponent + mantissa: sign ? PPC_FPCLASS_ND : PPC_FPCLASS_PD;
+		LEA(32, EAX, MScaled(EAX, MathUtil::PPC_FPCLASS_ND - MathUtil::PPC_FPCLASS_PD, MathUtil::PPC_FPCLASS_PD));
+		continue4 = J();
+
+		// Zero: sign ? PPC_FPCLASS_NZ : PPC_FPCLASS_PZ;
+		SetJumpTarget(zero);
+		SHL(32, R(EAX), Imm8(4));
+		ADD(32, R(EAX), Imm8(MathUtil::PPC_FPCLASS_PZ));
+	}
+	else
+	{
+		MOVQ_xmm(R(RAX), xmm);
+		TEST(64, R(RAX), M((void*)psDoubleExp));
+		FixupBranch zeroExponent = J_CC(CC_Z);
+		AND(64, R(RAX), M((void*)psDoubleNoSign));
+		CMP(64, R(RAX), M((void*)psDoubleExp));
+		// The sign bit was masked off above, so both operands are non-negative and the signed
+		// comparison is safe: greater means max exponent + mantissa, equal means infinity.
+		FixupBranch nan = J_CC(CC_G);
+		FixupBranch infinity = J_CC(CC_E);
+
+		// Nice normalized number: sign ? PPC_FPCLASS_NN : PPC_FPCLASS_PN;
+		MOVQ_xmm(R(RAX), xmm);
+		SHR(64, R(RAX), Imm8(63));
+		LEA(32, EAX, MScaled(EAX, MathUtil::PPC_FPCLASS_NN - MathUtil::PPC_FPCLASS_PN, MathUtil::PPC_FPCLASS_PN));
+		continue1 = J();
+
+		// Max exponent + mantissa: PPC_FPCLASS_QNAN (the sign is irrelevant, so it is not reloaded)
+		SetJumpTarget(nan);
+		MOV(32, R(EAX), Imm32(MathUtil::PPC_FPCLASS_QNAN));
+		continue2 = J();
+
+		// Max exponent + no mantissa: sign ? PPC_FPCLASS_NINF : PPC_FPCLASS_PINF;
+		SetJumpTarget(infinity);
+		MOVQ_xmm(R(RAX), xmm);
+		SHR(64, R(RAX), Imm8(63));
+		LEA(32, EAX, MScaled(EAX, MathUtil::PPC_FPCLASS_NINF - MathUtil::PPC_FPCLASS_PINF, MathUtil::PPC_FPCLASS_PINF));
+		continue3 = J();
+
+		// RAX still holds the raw bits (including the sign) on this path; ignore the sign bit so
+		// that -0.0 is classified as a zero rather than as a denormal.
+		SetJumpTarget(zeroExponent);
+		TEST(64, R(RAX), M((void*)psDoubleNoSign));
+		FixupBranch zero = J_CC(CC_Z);
+
+		// No exponent + mantissa: sign ? PPC_FPCLASS_ND : PPC_FPCLASS_PD;
+		SHR(64, R(RAX), Imm8(63));
+		LEA(32, EAX, MScaled(EAX, MathUtil::PPC_FPCLASS_ND - MathUtil::PPC_FPCLASS_PD, MathUtil::PPC_FPCLASS_PD));
+		continue4 = J();
+
+		// Zero: sign ? PPC_FPCLASS_NZ : PPC_FPCLASS_PZ;
+		SetJumpTarget(zero);
+		SHR(64, R(RAX), Imm8(63));
+		SHL(32, R(EAX), Imm8(4));
+		ADD(32, R(EAX), Imm8(MathUtil::PPC_FPCLASS_PZ));
+	}
+
+	// All paths arrive here with the class value in EAX; move it into the FPRF field.
+	SetJumpTarget(continue1);
+	SetJumpTarget(continue2);
+	SetJumpTarget(continue3);
+	SetJumpTarget(continue4);
+	SHL(32, R(EAX), Imm8(FPRF_SHIFT));
+	OR(32, M(&FPSCR), R(EAX));
+}
+
+
 void EmuCodeBlock::JitClearCA()
 {
 	AND(32, M(&PowerPC::ppcState.spr[SPR_XER]), Imm32(~XER_CA_MASK)); //XER.CA = 0
--- a/Source/Core/Core/PowerPC/JitCommon/Jit_Util.h
+++ b/Source/Core/Core/PowerPC/JitCommon/Jit_Util.h
@ -62,6 +62,7 @@ public:
 	void ConvertSingleToDouble(Gen::X64Reg dst, Gen::X64Reg src, bool src_is_gpr = false);
 	// EAX might get trashed
 	void ConvertDoubleToSingle(Gen::X64Reg dst, Gen::X64Reg src);
+	void SetFPRF(Gen::X64Reg xmm);
 protected:
 	std::unordered_map<u8 *, u32> registersInUseAtLoc;
 	std::unordered_map<u8 *, u32> pcAtLoc;
--- a/Source/Core/Core/PowerPC/PPCAnalyst.cpp
+++ b/Source/Core/Core/PowerPC/PPCAnalyst.cpp
@ -453,6 +453,10 @@ void PPCAnalyzer::SetInstructionStats(CodeBlock *block, CodeOp *code, GekkoOPInf
 	else
 		code->outputCR1 = (opinfo->flags & FL_SET_CR1) ? true : false;

+	code->wantsFPRF = (opinfo->flags & FL_READ_FPRF) ? true : false;
+	code->outputFPRF = (opinfo->flags & FL_SET_FPRF) ? true : false;
+	code->canEndBlock = (opinfo->flags & FL_ENDBLOCK) ? true : false;
+
 	int numOut = 0;
 	int numIn = 0;
 	if (opinfo->flags & FL_OUT_A)
@ -710,24 +714,25 @@ u32 PPCAnalyzer::Analyze(u32 address, CodeBlock *block, CodeBuffer *buffer, u32
 	}

 	// Scan for CR0 dependency
-	// assume next block wants CR0 to be safe
+	// assume next block wants flags to be safe
 	bool wantsCR0 = true;
 	bool wantsCR1 = true;
 	bool wantsPS1 = true;
+	bool wantsFPRF = true;
 	for (int i = block->m_num_instructions - 1; i >= 0; i--)
 	{
-		if (code[i].outputCR0)
-			wantsCR0 = false;
-		if (code[i].outputCR1)
-			wantsCR1 = false;
-		if (code[i].outputPS1)
-			wantsPS1 = false;
-		wantsCR0 |= code[i].wantsCR0;
-		wantsCR1 |= code[i].wantsCR1;
-		wantsPS1 |= code[i].wantsPS1;
+		wantsCR0 |= code[i].wantsCR0 || code[i].canEndBlock;
+		wantsCR1 |= code[i].wantsCR1 || code[i].canEndBlock;
+		wantsPS1 |= code[i].wantsPS1 || code[i].canEndBlock;
+		wantsFPRF |= code[i].wantsFPRF || code[i].canEndBlock;
 		code[i].wantsCR0 = wantsCR0;
 		code[i].wantsCR1 = wantsCR1;
 		code[i].wantsPS1 = wantsPS1;
+		code[i].wantsFPRF = wantsFPRF;
+		wantsCR0 &= !code[i].outputCR0;
+		wantsCR1 &= !code[i].outputCR1;
+		wantsPS1 &= !code[i].outputPS1;
+		wantsFPRF &= !code[i].outputFPRF;
 	}
 	return address;
 }
--- a/Source/Core/Core/PowerPC/PPCAnalyst.h
+++ b/Source/Core/Core/PowerPC/PPCAnalyst.h
@ -34,9 +34,12 @@ struct CodeOp //16B
 	bool wantsCR0;
 	bool wantsCR1;
 	bool wantsPS1;
+	bool wantsFPRF;
 	bool outputCR0;
 	bool outputCR1;
 	bool outputPS1;
+	bool outputFPRF;
+	bool canEndBlock;
 	bool skip;  // followed BL-s for example
 };

--- a/Source/Core/Core/PowerPC/PPCTables.cpp
+++ b/Source/Core/Core/PowerPC/PPCTables.cpp
@ -25,6 +25,14 @@ GekkoOPInfo *m_infoTable63[1024];
 GekkoOPInfo *m_allInstructions[512];
 int m_numInstructions;

+const u64 m_crTable[16] = // PPCCRToInternal() precomputed for each 4-bit CR field value; index = PPC-format value
+{
+	PPCCRToInternal(0x0), PPCCRToInternal(0x1), PPCCRToInternal(0x2), PPCCRToInternal(0x3),
+	PPCCRToInternal(0x4), PPCCRToInternal(0x5), PPCCRToInternal(0x6), PPCCRToInternal(0x7),
+	PPCCRToInternal(0x8), PPCCRToInternal(0x9), PPCCRToInternal(0xA), PPCCRToInternal(0xB),
+	PPCCRToInternal(0xC), PPCCRToInternal(0xD), PPCCRToInternal(0xE), PPCCRToInternal(0xF),
+};
+
 GekkoOPInfo *GetOpInfo(UGeckoInstruction _inst)
 {
 	GekkoOPInfo *info = m_infoTable[_inst.OPCD];
--- a/Source/Core/Core/PowerPC/PPCTables.h
+++ b/Source/Core/Core/PowerPC/PPCTables.h
@ -36,6 +36,8 @@ enum
 	FL_EVIL            = (1<<17),
 	FL_USE_FPU         = (1<<18),
 	FL_LOADSTORE       = (1<<19),
+	FL_SET_FPRF        = (1<<20),
+	FL_READ_FPRF       = (1<<21),
 };

 enum
--- a/Source/Core/Core/PowerPC/PowerPC.h
+++ b/Source/Core/Core/PowerPC/PowerPC.h
@ -187,11 +187,15 @@ inline u64 PPCCRToInternal(u8 value)
 	return cr_val;
 }

+// convert flags into 64-bit CR values with a lookup table
+extern const u64 m_crTable[16];
+
 // Warning: these CR operations are fairly slow since they need to convert from
 // PowerPC format (4 bit) to our internal 64 bit format. See the definition of
 // ppcState.cr_val for more explanations.
-inline void SetCRField(int cr_field, int value) {
-	PowerPC::ppcState.cr_val[cr_field] = PPCCRToInternal(value);
+inline void SetCRField(int cr_field, int value)
+{
+	PowerPC::ppcState.cr_val[cr_field] = m_crTable[value]; // table has 16 entries and the index is not range-checked: value must be 0..15
 }

 inline u32 GetCRField(int cr_field) {