From 7dbc623dc0eb27efc98f001dd6b4e8153c1e413a Mon Sep 17 00:00:00 2001 From: Fiora Date: Wed, 20 Aug 2014 02:22:07 -0700 Subject: [PATCH] JIT: Initial FPRF support Doesn't support all the FPSCR flags, just the FPRF ones. Add PPCAnalyzer support to remove unnecessary FPRF calculations. POV-ray benchmark with enableFPRF forced on for an extreme comparison: Before: 1500s After, fmul/fmadd only: 728s After, all float: 753s In real games that use FPRF, like F-Zero GX, FPRF previously cost a few percent of total runtime. Since FPRF is so much faster now, if enableFPRF is set, just do it for every float instruction, not just fmul/fmadd like before. I don't know if this will fix any games, but there's little good reason not to. --- Source/Core/Core/PowerPC/Gekko.h | 3 + .../Interpreter/Interpreter_Tables.cpp | 98 +++++++++---------- Source/Core/Core/PowerPC/Jit64/Jit.h | 5 +- .../Core/PowerPC/Jit64/Jit_FloatingPoint.cpp | 39 +++++--- Source/Core/Core/PowerPC/Jit64/Jit_Paired.cpp | 14 ++- .../PowerPC/Jit64/Jit_SystemRegisters.cpp | 9 -- .../Core/Core/PowerPC/JitCommon/Jit_Util.cpp | 98 +++++++++++++++++++ Source/Core/Core/PowerPC/JitCommon/Jit_Util.h | 1 + Source/Core/Core/PowerPC/PPCAnalyst.cpp | 25 +++-- Source/Core/Core/PowerPC/PPCAnalyst.h | 3 + Source/Core/Core/PowerPC/PPCTables.cpp | 8 ++ Source/Core/Core/PowerPC/PPCTables.h | 2 + Source/Core/Core/PowerPC/PowerPC.h | 8 +- 13 files changed, 222 insertions(+), 91 deletions(-) diff --git a/Source/Core/Core/PowerPC/Gekko.h b/Source/Core/Core/PowerPC/Gekko.h index 56c3e1bb57..99cc750ee1 100644 --- a/Source/Core/Core/PowerPC/Gekko.h +++ b/Source/Core/Core/PowerPC/Gekko.h @@ -383,6 +383,9 @@ union UReg_MSR UReg_MSR() { Hex = 0; } }; +#define FPRF_SHIFT 12 +#define FPRF_MASK (0x1F << FPRF_SHIFT) + // Floating Point Status and Control Register union UReg_FPSCR { diff --git a/Source/Core/Core/PowerPC/Interpreter/Interpreter_Tables.cpp b/Source/Core/Core/PowerPC/Interpreter/Interpreter_Tables.cpp index 0422462e98..2bf66ae99b 
100644 --- a/Source/Core/Core/PowerPC/Interpreter/Interpreter_Tables.cpp +++ b/Source/Core/Core/PowerPC/Interpreter/Interpreter_Tables.cpp @@ -97,14 +97,14 @@ static GekkoOPTemplate primarytable[] = static GekkoOPTemplate table4[] = { //SUBOP10 - {0, Interpreter::ps_cmpu0, {"ps_cmpu0", OPTYPE_PS, FL_SET_CRn | FL_USE_FPU, 1, 0, 0, 0}}, - {32, Interpreter::ps_cmpo0, {"ps_cmpo0", OPTYPE_PS, FL_SET_CRn | FL_USE_FPU, 1, 0, 0, 0}}, + {0, Interpreter::ps_cmpu0, {"ps_cmpu0", OPTYPE_PS, FL_SET_CRn | FL_USE_FPU | FL_SET_FPRF, 1, 0, 0, 0}}, + {32, Interpreter::ps_cmpo0, {"ps_cmpo0", OPTYPE_PS, FL_SET_CRn | FL_USE_FPU | FL_SET_FPRF, 1, 0, 0, 0}}, {40, Interpreter::ps_neg, {"ps_neg", OPTYPE_PS, FL_RC_BIT | FL_USE_FPU, 1, 0, 0, 0}}, {136, Interpreter::ps_nabs, {"ps_nabs", OPTYPE_PS, FL_RC_BIT | FL_USE_FPU, 1, 0, 0, 0}}, {264, Interpreter::ps_abs, {"ps_abs", OPTYPE_PS, FL_RC_BIT | FL_USE_FPU, 1, 0, 0, 0}}, - {64, Interpreter::ps_cmpu1, {"ps_cmpu1", OPTYPE_PS, FL_RC_BIT | FL_USE_FPU, 1, 0, 0, 0}}, + {64, Interpreter::ps_cmpu1, {"ps_cmpu1", OPTYPE_PS, FL_RC_BIT | FL_USE_FPU | FL_SET_FPRF, 1, 0, 0, 0}}, {72, Interpreter::ps_mr, {"ps_mr", OPTYPE_PS, FL_RC_BIT | FL_USE_FPU, 1, 0, 0, 0}}, - {96, Interpreter::ps_cmpo1, {"ps_cmpo1", OPTYPE_PS, FL_RC_BIT | FL_USE_FPU, 1, 0, 0, 0}}, + {96, Interpreter::ps_cmpo1, {"ps_cmpo1", OPTYPE_PS, FL_RC_BIT | FL_USE_FPU | FL_SET_FPRF, 1, 0, 0, 0}}, {528, Interpreter::ps_merge00, {"ps_merge00", OPTYPE_PS, FL_RC_BIT | FL_USE_FPU, 1, 0, 0, 0}}, {560, Interpreter::ps_merge01, {"ps_merge01", OPTYPE_PS, FL_RC_BIT | FL_USE_FPU, 1, 0, 0, 0}}, {592, Interpreter::ps_merge10, {"ps_merge10", OPTYPE_PS, FL_RC_BIT | FL_USE_FPU, 1, 0, 0, 0}}, @@ -115,23 +115,23 @@ static GekkoOPTemplate table4[] = static GekkoOPTemplate table4_2[] = { - {10, Interpreter::ps_sum0, {"ps_sum0", OPTYPE_PS, FL_USE_FPU, 1, 0, 0, 0}}, - {11, Interpreter::ps_sum1, {"ps_sum1", OPTYPE_PS, FL_USE_FPU, 1, 0, 0, 0}}, - {12, Interpreter::ps_muls0, {"ps_muls0", OPTYPE_PS, FL_USE_FPU, 1, 0, 0, 0}}, 
- {13, Interpreter::ps_muls1, {"ps_muls1", OPTYPE_PS, FL_USE_FPU, 1, 0, 0, 0}}, - {14, Interpreter::ps_madds0, {"ps_madds0", OPTYPE_PS, FL_USE_FPU, 1, 0, 0, 0}}, - {15, Interpreter::ps_madds1, {"ps_madds1", OPTYPE_PS, FL_USE_FPU, 1, 0, 0, 0}}, - {18, Interpreter::ps_div, {"ps_div", OPTYPE_PS, FL_USE_FPU, 17, 0, 0, 0}}, - {20, Interpreter::ps_sub, {"ps_sub", OPTYPE_PS, FL_USE_FPU, 1, 0, 0, 0}}, - {21, Interpreter::ps_add, {"ps_add", OPTYPE_PS, FL_USE_FPU, 1, 0, 0, 0}}, + {10, Interpreter::ps_sum0, {"ps_sum0", OPTYPE_PS, FL_USE_FPU | FL_SET_FPRF, 1, 0, 0, 0}}, + {11, Interpreter::ps_sum1, {"ps_sum1", OPTYPE_PS, FL_USE_FPU | FL_SET_FPRF, 1, 0, 0, 0}}, + {12, Interpreter::ps_muls0, {"ps_muls0", OPTYPE_PS, FL_USE_FPU | FL_SET_FPRF, 1, 0, 0, 0}}, + {13, Interpreter::ps_muls1, {"ps_muls1", OPTYPE_PS, FL_USE_FPU | FL_SET_FPRF, 1, 0, 0, 0}}, + {14, Interpreter::ps_madds0, {"ps_madds0", OPTYPE_PS, FL_USE_FPU | FL_SET_FPRF, 1, 0, 0, 0}}, + {15, Interpreter::ps_madds1, {"ps_madds1", OPTYPE_PS, FL_USE_FPU | FL_SET_FPRF, 1, 0, 0, 0}}, + {18, Interpreter::ps_div, {"ps_div", OPTYPE_PS, FL_USE_FPU | FL_SET_FPRF, 17, 0, 0, 0}}, + {20, Interpreter::ps_sub, {"ps_sub", OPTYPE_PS, FL_USE_FPU | FL_SET_FPRF, 1, 0, 0, 0}}, + {21, Interpreter::ps_add, {"ps_add", OPTYPE_PS, FL_USE_FPU | FL_SET_FPRF, 1, 0, 0, 0}}, {23, Interpreter::ps_sel, {"ps_sel", OPTYPE_PS, FL_USE_FPU, 1, 0, 0, 0}}, - {24, Interpreter::ps_res, {"ps_res", OPTYPE_PS, FL_USE_FPU, 1, 0, 0, 0}}, - {25, Interpreter::ps_mul, {"ps_mul", OPTYPE_PS, FL_USE_FPU, 1, 0, 0, 0}}, - {26, Interpreter::ps_rsqrte, {"ps_rsqrte", OPTYPE_PS, FL_USE_FPU, 2, 0, 0, 0}}, - {28, Interpreter::ps_msub, {"ps_msub", OPTYPE_PS, FL_USE_FPU, 1, 0, 0, 0}}, - {29, Interpreter::ps_madd, {"ps_madd", OPTYPE_PS, FL_USE_FPU, 1, 0, 0, 0}}, - {30, Interpreter::ps_nmsub, {"ps_nmsub", OPTYPE_PS, FL_USE_FPU, 1, 0, 0, 0}}, - {31, Interpreter::ps_nmadd, {"ps_nmadd", OPTYPE_PS, FL_USE_FPU, 1, 0, 0, 0}}, + {24, Interpreter::ps_res, {"ps_res", OPTYPE_PS, FL_USE_FPU | 
FL_SET_FPRF, 1, 0, 0, 0}}, + {25, Interpreter::ps_mul, {"ps_mul", OPTYPE_PS, FL_USE_FPU | FL_SET_FPRF, 1, 0, 0, 0}}, + {26, Interpreter::ps_rsqrte, {"ps_rsqrte", OPTYPE_PS, FL_USE_FPU | FL_SET_FPRF, 2, 0, 0, 0}}, + {28, Interpreter::ps_msub, {"ps_msub", OPTYPE_PS, FL_USE_FPU | FL_SET_FPRF, 1, 0, 0, 0}}, + {29, Interpreter::ps_madd, {"ps_madd", OPTYPE_PS, FL_USE_FPU | FL_SET_FPRF, 1, 0, 0, 0}}, + {30, Interpreter::ps_nmsub, {"ps_nmsub", OPTYPE_PS, FL_USE_FPU | FL_SET_FPRF, 1, 0, 0, 0}}, + {31, Interpreter::ps_nmadd, {"ps_nmadd", OPTYPE_PS, FL_USE_FPU | FL_SET_FPRF, 1, 0, 0, 0}}, }; @@ -307,51 +307,51 @@ static GekkoOPTemplate table31_2[] = static GekkoOPTemplate table59[] = { - {18, Interpreter::fdivsx, {"fdivsx", OPTYPE_FPU, FL_RC_BIT_F | FL_USE_FPU, 17, 0, 0, 0}}, // TODO - {20, Interpreter::fsubsx, {"fsubsx", OPTYPE_FPU, FL_RC_BIT_F | FL_USE_FPU, 1, 0, 0, 0}}, - {21, Interpreter::faddsx, {"faddsx", OPTYPE_FPU, FL_RC_BIT_F | FL_USE_FPU, 1, 0, 0, 0}}, - //{22, Interpreter::fsqrtsx, {"fsqrtsx", OPTYPE_FPU, FL_RC_BIT_F | FL_USE_FPU, 1, 0, 0, 0}}, // Not implemented on gekko - {24, Interpreter::fresx, {"fresx", OPTYPE_FPU, FL_RC_BIT_F | FL_USE_FPU, 1, 0, 0, 0}}, - {25, Interpreter::fmulsx, {"fmulsx", OPTYPE_FPU, FL_RC_BIT_F | FL_USE_FPU, 1, 0, 0, 0}}, - {28, Interpreter::fmsubsx, {"fmsubsx", OPTYPE_FPU, FL_RC_BIT_F | FL_USE_FPU, 1, 0, 0, 0}}, - {29, Interpreter::fmaddsx, {"fmaddsx", OPTYPE_FPU, FL_RC_BIT_F | FL_USE_FPU, 1, 0, 0, 0}}, - {30, Interpreter::fnmsubsx, {"fnmsubsx", OPTYPE_FPU, FL_RC_BIT_F | FL_USE_FPU, 1, 0, 0, 0}}, - {31, Interpreter::fnmaddsx, {"fnmaddsx", OPTYPE_FPU, FL_RC_BIT_F | FL_USE_FPU, 1, 0, 0, 0}}, + {18, Interpreter::fdivsx, {"fdivsx", OPTYPE_FPU, FL_RC_BIT_F | FL_USE_FPU | FL_SET_FPRF, 17, 0, 0, 0}}, // TODO + {20, Interpreter::fsubsx, {"fsubsx", OPTYPE_FPU, FL_RC_BIT_F | FL_USE_FPU | FL_SET_FPRF, 1, 0, 0, 0}}, + {21, Interpreter::faddsx, {"faddsx", OPTYPE_FPU, FL_RC_BIT_F | FL_USE_FPU | FL_SET_FPRF, 1, 0, 0, 0}}, + //{22, Interpreter::fsqrtsx, 
{"fsqrtsx", OPTYPE_FPU, FL_RC_BIT_F | FL_USE_FPU | FL_SET_FPRF, 1, 0, 0, 0}}, // Not implemented on gekko + {24, Interpreter::fresx, {"fresx", OPTYPE_FPU, FL_RC_BIT_F | FL_USE_FPU | FL_SET_FPRF, 1, 0, 0, 0}}, + {25, Interpreter::fmulsx, {"fmulsx", OPTYPE_FPU, FL_RC_BIT_F | FL_USE_FPU | FL_SET_FPRF, 1, 0, 0, 0}}, + {28, Interpreter::fmsubsx, {"fmsubsx", OPTYPE_FPU, FL_RC_BIT_F | FL_USE_FPU | FL_SET_FPRF, 1, 0, 0, 0}}, + {29, Interpreter::fmaddsx, {"fmaddsx", OPTYPE_FPU, FL_RC_BIT_F | FL_USE_FPU | FL_SET_FPRF, 1, 0, 0, 0}}, + {30, Interpreter::fnmsubsx, {"fnmsubsx", OPTYPE_FPU, FL_RC_BIT_F | FL_USE_FPU | FL_SET_FPRF, 1, 0, 0, 0}}, + {31, Interpreter::fnmaddsx, {"fnmaddsx", OPTYPE_FPU, FL_RC_BIT_F | FL_USE_FPU | FL_SET_FPRF, 1, 0, 0, 0}}, }; static GekkoOPTemplate table63[] = { {264, Interpreter::fabsx, {"fabsx", OPTYPE_FPU, FL_RC_BIT_F | FL_USE_FPU, 1, 0, 0, 0}}, - {32, Interpreter::fcmpo, {"fcmpo", OPTYPE_FPU, FL_RC_BIT_F | FL_USE_FPU, 1, 0, 0, 0}}, - {0, Interpreter::fcmpu, {"fcmpu", OPTYPE_FPU, FL_RC_BIT_F | FL_USE_FPU, 1, 0, 0, 0}}, + {32, Interpreter::fcmpo, {"fcmpo", OPTYPE_FPU, FL_RC_BIT_F | FL_USE_FPU | FL_SET_FPRF, 1, 0, 0, 0}}, + {0, Interpreter::fcmpu, {"fcmpu", OPTYPE_FPU, FL_RC_BIT_F | FL_USE_FPU | FL_SET_FPRF, 1, 0, 0, 0}}, {14, Interpreter::fctiwx, {"fctiwx", OPTYPE_FPU, FL_RC_BIT_F | FL_USE_FPU, 1, 0, 0, 0}}, {15, Interpreter::fctiwzx, {"fctiwzx", OPTYPE_FPU, FL_RC_BIT_F | FL_USE_FPU, 1, 0, 0, 0}}, {72, Interpreter::fmrx, {"fmrx", OPTYPE_FPU, FL_RC_BIT_F | FL_USE_FPU, 1, 0, 0, 0}}, {136, Interpreter::fnabsx, {"fnabsx", OPTYPE_FPU, FL_RC_BIT_F | FL_USE_FPU, 1, 0, 0, 0}}, {40, Interpreter::fnegx, {"fnegx", OPTYPE_FPU, FL_RC_BIT_F | FL_USE_FPU, 1, 0, 0, 0}}, - {12, Interpreter::frspx, {"frspx", OPTYPE_FPU, FL_RC_BIT_F | FL_USE_FPU, 1, 0, 0, 0}}, + {12, Interpreter::frspx, {"frspx", OPTYPE_FPU, FL_RC_BIT_F | FL_USE_FPU | FL_SET_FPRF, 1, 0, 0, 0}}, - {64, Interpreter::mcrfs, {"mcrfs", OPTYPE_SYSTEMFP, FL_USE_FPU, 1, 0, 0, 0}}, - {583, Interpreter::mffsx, 
{"mffsx", OPTYPE_SYSTEMFP, FL_USE_FPU, 1, 0, 0, 0}}, - {70, Interpreter::mtfsb0x, {"mtfsb0x", OPTYPE_SYSTEMFP, FL_USE_FPU, 3, 0, 0, 0}}, - {38, Interpreter::mtfsb1x, {"mtfsb1x", OPTYPE_SYSTEMFP, FL_USE_FPU, 3, 0, 0, 0}}, - {134, Interpreter::mtfsfix, {"mtfsfix", OPTYPE_SYSTEMFP, FL_USE_FPU, 3, 0, 0, 0}}, - {711, Interpreter::mtfsfx, {"mtfsfx", OPTYPE_SYSTEMFP, FL_USE_FPU, 3, 0, 0, 0}}, + {64, Interpreter::mcrfs, {"mcrfs", OPTYPE_SYSTEMFP, FL_USE_FPU | FL_READ_FPRF, 1, 0, 0, 0}}, + {583, Interpreter::mffsx, {"mffsx", OPTYPE_SYSTEMFP, FL_USE_FPU | FL_READ_FPRF, 1, 0, 0, 0}}, + {70, Interpreter::mtfsb0x, {"mtfsb0x", OPTYPE_SYSTEMFP, FL_USE_FPU | FL_READ_FPRF, 3, 0, 0, 0}}, + {38, Interpreter::mtfsb1x, {"mtfsb1x", OPTYPE_SYSTEMFP, FL_USE_FPU | FL_READ_FPRF, 3, 0, 0, 0}}, + {134, Interpreter::mtfsfix, {"mtfsfix", OPTYPE_SYSTEMFP, FL_USE_FPU | FL_READ_FPRF, 3, 0, 0, 0}}, + {711, Interpreter::mtfsfx, {"mtfsfx", OPTYPE_SYSTEMFP, FL_USE_FPU | FL_READ_FPRF, 3, 0, 0, 0}}, }; static GekkoOPTemplate table63_2[] = { - {18, Interpreter::fdivx, {"fdivx", OPTYPE_FPU, FL_RC_BIT_F | FL_USE_FPU, 31, 0, 0, 0}}, - {20, Interpreter::fsubx, {"fsubx", OPTYPE_FPU, FL_RC_BIT_F | FL_USE_FPU, 1, 0, 0, 0}}, - {21, Interpreter::faddx, {"faddx", OPTYPE_FPU, FL_RC_BIT_F | FL_USE_FPU, 1, 0, 0, 0}}, - {22, Interpreter::fsqrtx, {"fsqrtx", OPTYPE_FPU, FL_RC_BIT_F | FL_USE_FPU, 1, 0, 0, 0}}, + {18, Interpreter::fdivx, {"fdivx", OPTYPE_FPU, FL_RC_BIT_F | FL_USE_FPU | FL_SET_FPRF, 31, 0, 0, 0}}, + {20, Interpreter::fsubx, {"fsubx", OPTYPE_FPU, FL_RC_BIT_F | FL_USE_FPU | FL_SET_FPRF, 1, 0, 0, 0}}, + {21, Interpreter::faddx, {"faddx", OPTYPE_FPU, FL_RC_BIT_F | FL_USE_FPU | FL_SET_FPRF, 1, 0, 0, 0}}, + {22, Interpreter::fsqrtx, {"fsqrtx", OPTYPE_FPU, FL_RC_BIT_F | FL_USE_FPU | FL_SET_FPRF, 1, 0, 0, 0}}, {23, Interpreter::fselx, {"fselx", OPTYPE_FPU, FL_RC_BIT_F | FL_USE_FPU, 1, 0, 0, 0}}, - {25, Interpreter::fmulx, {"fmulx", OPTYPE_FPU, FL_RC_BIT_F | FL_USE_FPU, 1, 0, 0, 0}}, - {26, Interpreter::frsqrtex, 
{"frsqrtex", OPTYPE_FPU, FL_RC_BIT_F | FL_USE_FPU, 1, 0, 0, 0}}, - {28, Interpreter::fmsubx, {"fmsubx", OPTYPE_FPU, FL_RC_BIT_F | FL_USE_FPU, 1, 0, 0, 0}}, - {29, Interpreter::fmaddx, {"fmaddx", OPTYPE_FPU, FL_RC_BIT_F | FL_USE_FPU, 1, 0, 0, 0}}, - {30, Interpreter::fnmsubx, {"fnmsubx", OPTYPE_FPU, FL_RC_BIT_F | FL_USE_FPU, 1, 0, 0, 0}}, - {31, Interpreter::fnmaddx, {"fnmaddx", OPTYPE_FPU, FL_RC_BIT_F | FL_USE_FPU, 1, 0, 0, 0}}, + {25, Interpreter::fmulx, {"fmulx", OPTYPE_FPU, FL_RC_BIT_F | FL_USE_FPU | FL_SET_FPRF, 1, 0, 0, 0}}, + {26, Interpreter::frsqrtex, {"frsqrtex", OPTYPE_FPU, FL_RC_BIT_F | FL_USE_FPU | FL_SET_FPRF, 1, 0, 0, 0}}, + {28, Interpreter::fmsubx, {"fmsubx", OPTYPE_FPU, FL_RC_BIT_F | FL_USE_FPU | FL_SET_FPRF, 1, 0, 0, 0}}, + {29, Interpreter::fmaddx, {"fmaddx", OPTYPE_FPU, FL_RC_BIT_F | FL_USE_FPU | FL_SET_FPRF, 1, 0, 0, 0}}, + {30, Interpreter::fnmsubx, {"fnmsubx", OPTYPE_FPU, FL_RC_BIT_F | FL_USE_FPU | FL_SET_FPRF, 1, 0, 0, 0}}, + {31, Interpreter::fnmaddx, {"fnmaddx", OPTYPE_FPU, FL_RC_BIT_F | FL_USE_FPU | FL_SET_FPRF, 1, 0, 0, 0}}, }; namespace InterpreterTables { diff --git a/Source/Core/Core/PowerPC/Jit64/Jit.h b/Source/Core/Core/PowerPC/Jit64/Jit.h index 1c6e082fd1..8962a66761 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit.h +++ b/Source/Core/Core/PowerPC/Jit64/Jit.h @@ -116,11 +116,12 @@ public: // Generates a branch that will check if a given bit of a CR register part // is set or not. 
Gen::FixupBranch JumpIfCRFieldBit(int field, int bit, bool jump_if_set = true); + void SetFPRFIfNeeded(UGeckoInstruction inst, Gen::X64Reg xmm); - void tri_op(int d, int a, int b, bool reversible, void (Gen::XEmitter::*op)(Gen::X64Reg, Gen::OpArg), bool roundRHS = false); + void tri_op(int d, int a, int b, bool reversible, void (Gen::XEmitter::*op)(Gen::X64Reg, Gen::OpArg), UGeckoInstruction inst, bool roundRHS = false); typedef u32 (*Operation)(u32 a, u32 b); void regimmop(int d, int a, bool binary, u32 value, Operation doop, void (Gen::XEmitter::*op)(int, const Gen::OpArg&, const Gen::OpArg&), bool Rc = false, bool carry = false); - void fp_tri_op(int d, int a, int b, bool reversible, bool single, void (Gen::XEmitter::*op)(Gen::X64Reg, Gen::OpArg), bool roundRHS = false); + void fp_tri_op(int d, int a, int b, bool reversible, bool single, void (Gen::XEmitter::*op)(Gen::X64Reg, Gen::OpArg), UGeckoInstruction inst, bool roundRHS = false); // OPCODES void unknown_instruction(UGeckoInstruction _inst); diff --git a/Source/Core/Core/PowerPC/Jit64/Jit_FloatingPoint.cpp b/Source/Core/Core/PowerPC/Jit64/Jit_FloatingPoint.cpp index e35d36e814..18bb8e94db 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit_FloatingPoint.cpp +++ b/Source/Core/Core/PowerPC/Jit64/Jit_FloatingPoint.cpp @@ -14,7 +14,7 @@ static const u64 GC_ALIGNED16(psSignBits2[2]) = {0x8000000000000000ULL, 0x800000 static const u64 GC_ALIGNED16(psAbsMask2[2]) = {0x7FFFFFFFFFFFFFFFULL, 0x7FFFFFFFFFFFFFFFULL}; static const double GC_ALIGNED16(half_qnan_and_s32_max[2]) = {0x7FFFFFFF, -0x80000}; -void Jit64::fp_tri_op(int d, int a, int b, bool reversible, bool single, void (XEmitter::*op)(Gen::X64Reg, Gen::OpArg), bool roundRHS) +void Jit64::fp_tri_op(int d, int a, int b, bool reversible, bool single, void (XEmitter::*op)(Gen::X64Reg, Gen::OpArg), UGeckoInstruction inst, bool roundRHS) { fpr.Lock(d, a, b); if (roundRHS) @@ -88,25 +88,35 @@ void Jit64::fp_tri_op(int d, int a, int b, bool reversible, bool single, void 
(X UNPCKLPD(fpr.RX(d), R(fpr.RX(d))); } } + SetFPRFIfNeeded(inst, fpr.RX(d)); fpr.UnlockAll(); } +// We can avoid calculating FPRF if it's not needed; every float operation resets it, so +// if it's going to be clobbered in a future instruction before being read, we can just +// not calculate it. +void Jit64::SetFPRFIfNeeded(UGeckoInstruction inst, X64Reg xmm) +{ + // As far as we know, the games that use this flag only need FPRF for fmul and fmadd, but + // FPRF is fast enough in JIT that we might as well just enable it for every float instruction + // if the enableFPRF flag is set. + if (Core::g_CoreStartupParameter.bEnableFPRF && js.op->wantsFPRF) + SetFPRF(xmm); +} + void Jit64::fp_arith(UGeckoInstruction inst) { INSTRUCTION_START JITDISABLE(bJITFloatingPointOff); FALLBACK_IF(inst.Rc); - // Only the interpreter has "proper" support for (some) FP flags - FALLBACK_IF(inst.SUBOP5 == 25 && Core::g_CoreStartupParameter.bEnableFPRF); - bool single = inst.OPCD == 59; switch (inst.SUBOP5) { - case 18: fp_tri_op(inst.FD, inst.FA, inst.FB, false, single, &XEmitter::DIVSD); break; //div - case 20: fp_tri_op(inst.FD, inst.FA, inst.FB, false, single, &XEmitter::SUBSD); break; //sub - case 21: fp_tri_op(inst.FD, inst.FA, inst.FB, true, single, &XEmitter::ADDSD); break; //add - case 25: fp_tri_op(inst.FD, inst.FA, inst.FC, true, single, &XEmitter::MULSD, single); break; //mul + case 18: fp_tri_op(inst.FD, inst.FA, inst.FB, false, single, &XEmitter::DIVSD, inst); break; //div + case 20: fp_tri_op(inst.FD, inst.FA, inst.FB, false, single, &XEmitter::SUBSD, inst); break; //sub + case 21: fp_tri_op(inst.FD, inst.FA, inst.FB, true, single, &XEmitter::ADDSD, inst); break; //add + case 25: fp_tri_op(inst.FD, inst.FA, inst.FC, true, single, &XEmitter::MULSD, inst, single); break; //mul default: _assert_msg_(DYNA_REC, 0, "fp_arith WTF!!!"); } @@ -118,9 +128,6 @@ void Jit64::fmaddXX(UGeckoInstruction inst) JITDISABLE(bJITFloatingPointOff); FALLBACK_IF(inst.Rc); - // Only the 
interpreter has "proper" support for (some) FP flags - FALLBACK_IF(inst.SUBOP5 == 29 && Core::g_CoreStartupParameter.bEnableFPRF); - bool single_precision = inst.OPCD == 59; int a = inst.FA; @@ -165,9 +172,7 @@ void Jit64::fmaddXX(UGeckoInstruction inst) { MOVSD(fpr.RX(d), R(XMM0)); } - // SMB checks flags after this op. Let's lie. - //AND(32, M(&PowerPC::ppcState.fpscr), Imm32(~((0x80000000 >> 19) | (0x80000000 >> 15)))); - //OR(32, M(&PowerPC::ppcState.fpscr), Imm32((0x80000000 >> 16))); + SetFPRFIfNeeded(inst, fpr.RX(d)); fpr.UnlockAll(); } @@ -241,6 +246,7 @@ void Jit64::fcmpx(UGeckoInstruction inst) fpr.Lock(a,b); fpr.BindToRegister(b, true); + AND(32, M(&FPSCR), Imm32(~FPRF_MASK)); // Are we masking sNaN invalid floating point exceptions? If not this could crash if we don't handle the exception? UCOMISD(fpr.R(b).GetSimpleReg(), fpr.R(a)); @@ -264,10 +270,13 @@ void Jit64::fcmpx(UGeckoInstruction inst) } MOV(64, R(RAX), Imm64(PPCCRToInternal(CR_EQ))); + OR(32, M(&FPSCR), Imm32(CR_EQ << FPRF_SHIFT)); + continue1 = J(); SetJumpTarget(pNaN); MOV(64, R(RAX), Imm64(PPCCRToInternal(CR_SO))); + OR(32, M(&FPSCR), Imm32(CR_SO << FPRF_SHIFT)); if (a != b) { @@ -275,10 +284,12 @@ void Jit64::fcmpx(UGeckoInstruction inst) SetJumpTarget(pGreater); MOV(64, R(RAX), Imm64(PPCCRToInternal(CR_GT))); + OR(32, M(&FPSCR), Imm32(CR_GT << FPRF_SHIFT)); continue3 = J(); SetJumpTarget(pLesser); MOV(64, R(RAX), Imm64(PPCCRToInternal(CR_LT))); + OR(32, M(&FPSCR), Imm32(CR_LT << FPRF_SHIFT)); } SetJumpTarget(continue1); diff --git a/Source/Core/Core/PowerPC/Jit64/Jit_Paired.cpp b/Source/Core/Core/PowerPC/Jit64/Jit_Paired.cpp index b2fc5e4150..88b65fbd72 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit_Paired.cpp +++ b/Source/Core/Core/PowerPC/Jit64/Jit_Paired.cpp @@ -113,7 +113,7 @@ add a,b,a */ //There's still a little bit more optimization that can be squeezed out of this -void Jit64::tri_op(int d, int a, int b, bool reversible, void (XEmitter::*op)(X64Reg, OpArg), bool roundRHS) +void 
Jit64::tri_op(int d, int a, int b, bool reversible, void (XEmitter::*op)(X64Reg, OpArg), UGeckoInstruction inst, bool roundRHS) { fpr.Lock(d, a, b); @@ -163,6 +163,7 @@ void Jit64::tri_op(int d, int a, int b, bool reversible, void (XEmitter::*op)(X6 (this->*op)(fpr.RX(d), fpr.R(b)); } ForceSinglePrecisionP(fpr.RX(d)); + SetFPRFIfNeeded(inst, fpr.RX(d)); fpr.UnlockAll(); } @@ -175,16 +176,16 @@ void Jit64::ps_arith(UGeckoInstruction inst) switch (inst.SUBOP5) { case 18: // div - tri_op(inst.FD, inst.FA, inst.FB, false, &XEmitter::DIVPD); + tri_op(inst.FD, inst.FA, inst.FB, false, &XEmitter::DIVPD, inst); break; case 20: // sub - tri_op(inst.FD, inst.FA, inst.FB, false, &XEmitter::SUBPD); + tri_op(inst.FD, inst.FA, inst.FB, false, &XEmitter::SUBPD, inst); break; case 21: // add - tri_op(inst.FD, inst.FA, inst.FB, true, &XEmitter::ADDPD); + tri_op(inst.FD, inst.FA, inst.FB, true, &XEmitter::ADDPD, inst); break; case 25: // mul - tri_op(inst.FD, inst.FA, inst.FC, true, &XEmitter::MULPD, true); + tri_op(inst.FD, inst.FA, inst.FC, true, &XEmitter::MULPD, inst, true); break; default: _assert_msg_(DYNA_REC, 0, "ps_arith WTF!!!"); @@ -228,6 +229,7 @@ void Jit64::ps_sum(UGeckoInstruction inst) PanicAlert("ps_sum WTF!!!"); } ForceSinglePrecisionP(fpr.RX(d)); + SetFPRFIfNeeded(inst, fpr.RX(d)); fpr.UnlockAll(); } @@ -267,6 +269,7 @@ void Jit64::ps_muls(UGeckoInstruction inst) PanicAlert("ps_muls WTF!!!"); } ForceSinglePrecisionP(fpr.RX(d)); + SetFPRFIfNeeded(inst, fpr.RX(d)); fpr.UnlockAll(); } @@ -372,5 +375,6 @@ void Jit64::ps_maddXX(UGeckoInstruction inst) fpr.BindToRegister(d, false); MOVAPD(fpr.RX(d), Gen::R(XMM0)); ForceSinglePrecisionP(fpr.RX(d)); + SetFPRFIfNeeded(inst, fpr.RX(d)); fpr.UnlockAll(); } diff --git a/Source/Core/Core/PowerPC/Jit64/Jit_SystemRegisters.cpp b/Source/Core/Core/PowerPC/Jit64/Jit_SystemRegisters.cpp index e3e9c7df15..703341ba8a 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit_SystemRegisters.cpp +++ 
b/Source/Core/Core/PowerPC/Jit64/Jit_SystemRegisters.cpp @@ -284,15 +284,6 @@ void Jit64::mfcr(UGeckoInstruction inst) gpr.UnlockAllX(); } -// convert flags into 64-bit CR values with a lookup table -static const u64 m_crTable[16] = -{ - PPCCRToInternal(0x0), PPCCRToInternal(0x1), PPCCRToInternal(0x2), PPCCRToInternal(0x3), - PPCCRToInternal(0x4), PPCCRToInternal(0x5), PPCCRToInternal(0x6), PPCCRToInternal(0x7), - PPCCRToInternal(0x8), PPCCRToInternal(0x9), PPCCRToInternal(0xA), PPCCRToInternal(0xB), - PPCCRToInternal(0xC), PPCCRToInternal(0xD), PPCCRToInternal(0xE), PPCCRToInternal(0xF), -}; - void Jit64::mtcrf(UGeckoInstruction inst) { INSTRUCTION_START diff --git a/Source/Core/Core/PowerPC/JitCommon/Jit_Util.cpp b/Source/Core/Core/PowerPC/JitCommon/Jit_Util.cpp index 01da7f8b1c..e0a2bf7de6 100644 --- a/Source/Core/Core/PowerPC/JitCommon/Jit_Util.cpp +++ b/Source/Core/Core/PowerPC/JitCommon/Jit_Util.cpp @@ -6,6 +6,7 @@ #include "Common/Common.h" #include "Common/CPUDetect.h" +#include "Common/MathUtil.h" #include "Core/HW/MMIO.h" #include "Core/PowerPC/JitCommon/Jit_Util.h" @@ -695,6 +696,103 @@ void EmuCodeBlock::ConvertSingleToDouble(X64Reg dst, X64Reg src, bool src_is_gpr MOVDDUP(dst, R(dst)); } +static const u64 GC_ALIGNED16(psDoubleExp[2]) = {0x7FF0000000000000ULL, 0}; +static const u64 GC_ALIGNED16(psDoubleFrac[2]) = {0x000FFFFFFFFFFFFFULL, 0}; +static const u64 GC_ALIGNED16(psDoubleNoSign[2]) = {0x7FFFFFFFFFFFFFFFULL, 0}; + +// TODO: it might be faster to handle FPRF in the same way as CR is currently handled for integer, storing +// the result of each floating point op and calculating it when needed. This is trickier than for integers +// though, because there's 32 possible FPRF bit combinations but only 9 categories of floating point values, +// which makes the whole thing rather trickier. +// Fortunately, PPCAnalyzer can optimize out a large portion of FPRF calculations, so maybe this isn't +// quite that necessary. 
+void EmuCodeBlock::SetFPRF(Gen::X64Reg xmm) // Classify the low double of xmm and store its class in FPSCR.FPRF; clobbers RAX and flags.
+{
+	AND(32, M(&FPSCR), Imm32(~FPRF_MASK)); // Clear the old FPRF bits; the new class is ORed in at the end.
+
+	FixupBranch continue1, continue2, continue3, continue4;
+	if (cpu_info.bSSE4_1)
+	{
+		MOVQ_xmm(R(RAX), xmm);
+		SHR(64, R(RAX), Imm8(63)); // Get the sign bit; almost all the branches need it.
+		PTEST(xmm, M((void*)psDoubleExp)); // CF: every exponent bit is set; ZF: every exponent bit is clear.
+		FixupBranch maxExponent = J_CC(CC_C);
+		FixupBranch zeroExponent = J_CC(CC_Z);
+
+		// Nice normalized number: sign ? PPC_FPCLASS_NN : PPC_FPCLASS_PN;
+		LEA(32, EAX, MScaled(EAX, MathUtil::PPC_FPCLASS_NN - MathUtil::PPC_FPCLASS_PN, MathUtil::PPC_FPCLASS_PN));
+		continue1 = J();
+
+		SetJumpTarget(maxExponent);
+		PTEST(xmm, M((void*)psDoubleFrac)); // ZF: mantissa is zero, so this is an infinity rather than a NaN.
+		FixupBranch notNAN = J_CC(CC_Z);
+
+		// Max exponent + mantissa: PPC_FPCLASS_QNAN
+		MOV(32, R(EAX), Imm32(MathUtil::PPC_FPCLASS_QNAN));
+		continue2 = J();
+
+		// Max exponent + no mantissa: sign ? PPC_FPCLASS_NINF : PPC_FPCLASS_PINF;
+		SetJumpTarget(notNAN);
+		LEA(32, EAX, MScaled(EAX, MathUtil::PPC_FPCLASS_NINF - MathUtil::PPC_FPCLASS_PINF, MathUtil::PPC_FPCLASS_PINF)); // Base must be the positive class: sign * (N - P) + P.
+		continue3 = J();
+
+		SetJumpTarget(zeroExponent);
+		PTEST(xmm, M((void*)psDoubleNoSign)); // Ignore the sign bit and the upper lane so that -0.0 still counts as zero.
+		FixupBranch zero = J_CC(CC_Z);
+
+		// No exponent + mantissa: sign ? PPC_FPCLASS_ND : PPC_FPCLASS_PD;
+		LEA(32, EAX, MScaled(EAX, MathUtil::PPC_FPCLASS_ND - MathUtil::PPC_FPCLASS_PD, MathUtil::PPC_FPCLASS_PD)); // Base must be the positive class: sign * (N - P) + P.
+		continue4 = J();
+
+		// Zero: sign ? PPC_FPCLASS_NZ : PPC_FPCLASS_PZ;
+		SetJumpTarget(zero);
+		SHL(32, R(EAX), Imm8(4));
+		ADD(32, R(EAX), Imm8(MathUtil::PPC_FPCLASS_PZ));
+	}
+	else
+	{
+		MOVQ_xmm(R(RAX), xmm);
+		TEST(64, R(RAX), M((void*)psDoubleExp)); // ZF: every exponent bit is clear (zero or denormal).
+		FixupBranch zeroExponent = J_CC(CC_Z);
+		AND(64, R(RAX), M((void*)psDoubleNoSign));
+		CMP(64, R(RAX), M((void*)psDoubleExp));
+		FixupBranch nan = J_CC(CC_G); // Signed compare is safe: the AND above cleared the sign bit, so RAX is non-negative.
+		FixupBranch infinity = J_CC(CC_E);
+		MOVQ_xmm(R(RAX), xmm); // Reload: the AND above destroyed the sign bit.
+		SHR(64, R(RAX), Imm8(63));
+		LEA(32, EAX, MScaled(EAX, MathUtil::PPC_FPCLASS_NN - MathUtil::PPC_FPCLASS_PN, MathUtil::PPC_FPCLASS_PN));
+		continue1 = J();
+		SetJumpTarget(nan);
+		MOVQ_xmm(R(RAX), xmm);
+		SHR(64, R(RAX), Imm8(63));
+		MOV(32, R(EAX), Imm32(MathUtil::PPC_FPCLASS_QNAN)); // The class of a NaN does not depend on its sign.
+		continue2 = J();
+		SetJumpTarget(infinity);
+		MOVQ_xmm(R(RAX), xmm);
+		SHR(64, R(RAX), Imm8(63));
+		LEA(32, EAX, MScaled(EAX, MathUtil::PPC_FPCLASS_NINF - MathUtil::PPC_FPCLASS_PINF, MathUtil::PPC_FPCLASS_PINF)); // Base must be the positive class: sign * (N - P) + P.
+		continue3 = J();
+		SetJumpTarget(zeroExponent);
+		TEST(64, R(RAX), M((void*)psDoubleNoSign)); // RAX still holds the raw bits here; ignore the sign so that -0.0 counts as zero.
+		FixupBranch zero = J_CC(CC_Z);
+		SHR(64, R(RAX), Imm8(63));
+		LEA(32, EAX, MScaled(EAX, MathUtil::PPC_FPCLASS_ND - MathUtil::PPC_FPCLASS_PD, MathUtil::PPC_FPCLASS_PD)); // Base must be the positive class: sign * (N - P) + P.
+		continue4 = J();
+		SetJumpTarget(zero);
+		SHR(64, R(RAX), Imm8(63));
+		SHL(32, R(EAX), Imm8(4));
+		ADD(32, R(EAX), Imm8(MathUtil::PPC_FPCLASS_PZ));
+	}
+
+	SetJumpTarget(continue1);
+	SetJumpTarget(continue2);
+	SetJumpTarget(continue3);
+	SetJumpTarget(continue4);
+	SHL(32, R(EAX), Imm8(FPRF_SHIFT)); // Move the class value into the FPRF field position.
+	OR(32, M(&FPSCR), R(EAX));
+}
+
+
 void EmuCodeBlock::JitClearCA()
 {
 	AND(32, M(&PowerPC::ppcState.spr[SPR_XER]), Imm32(~XER_CA_MASK)); //XER.CA = 0
diff --git a/Source/Core/Core/PowerPC/JitCommon/Jit_Util.h b/Source/Core/Core/PowerPC/JitCommon/Jit_Util.h
index 22bd922d30..addce16e93 100644
--- a/Source/Core/Core/PowerPC/JitCommon/Jit_Util.h
+++ b/Source/Core/Core/PowerPC/JitCommon/Jit_Util.h
@@ -62,6 +62,7 @@ public:
 	void
ConvertSingleToDouble(Gen::X64Reg dst, Gen::X64Reg src, bool src_is_gpr = false); // EAX might get trashed void ConvertDoubleToSingle(Gen::X64Reg dst, Gen::X64Reg src); + void SetFPRF(Gen::X64Reg xmm); protected: std::unordered_map registersInUseAtLoc; std::unordered_map pcAtLoc; diff --git a/Source/Core/Core/PowerPC/PPCAnalyst.cpp b/Source/Core/Core/PowerPC/PPCAnalyst.cpp index 19aaf078f8..3217f5e355 100644 --- a/Source/Core/Core/PowerPC/PPCAnalyst.cpp +++ b/Source/Core/Core/PowerPC/PPCAnalyst.cpp @@ -453,6 +453,10 @@ void PPCAnalyzer::SetInstructionStats(CodeBlock *block, CodeOp *code, GekkoOPInf else code->outputCR1 = (opinfo->flags & FL_SET_CR1) ? true : false; + code->wantsFPRF = (opinfo->flags & FL_READ_FPRF) ? true : false; + code->outputFPRF = (opinfo->flags & FL_SET_FPRF) ? true : false; + code->canEndBlock = (opinfo->flags & FL_ENDBLOCK) ? true : false; + int numOut = 0; int numIn = 0; if (opinfo->flags & FL_OUT_A) @@ -710,24 +714,25 @@ u32 PPCAnalyzer::Analyze(u32 address, CodeBlock *block, CodeBuffer *buffer, u32 } // Scan for CR0 dependency - // assume next block wants CR0 to be safe + // assume next block wants flags to be safe bool wantsCR0 = true; bool wantsCR1 = true; bool wantsPS1 = true; + bool wantsFPRF = true; for (int i = block->m_num_instructions - 1; i >= 0; i--) { - if (code[i].outputCR0) - wantsCR0 = false; - if (code[i].outputCR1) - wantsCR1 = false; - if (code[i].outputPS1) - wantsPS1 = false; - wantsCR0 |= code[i].wantsCR0; - wantsCR1 |= code[i].wantsCR1; - wantsPS1 |= code[i].wantsPS1; + wantsCR0 |= code[i].wantsCR0 || code[i].canEndBlock; + wantsCR1 |= code[i].wantsCR1 || code[i].canEndBlock; + wantsPS1 |= code[i].wantsPS1 || code[i].canEndBlock; + wantsFPRF |= code[i].wantsFPRF || code[i].canEndBlock; code[i].wantsCR0 = wantsCR0; code[i].wantsCR1 = wantsCR1; code[i].wantsPS1 = wantsPS1; + code[i].wantsFPRF = wantsFPRF; + wantsCR0 &= !code[i].outputCR0; + wantsCR1 &= !code[i].outputCR1; + wantsPS1 &= !code[i].outputPS1; + wantsFPRF &= 
!code[i].outputFPRF; } return address; } diff --git a/Source/Core/Core/PowerPC/PPCAnalyst.h b/Source/Core/Core/PowerPC/PPCAnalyst.h index f2b6ec7afa..0f7506ba21 100644 --- a/Source/Core/Core/PowerPC/PPCAnalyst.h +++ b/Source/Core/Core/PowerPC/PPCAnalyst.h @@ -34,9 +34,12 @@ struct CodeOp //16B bool wantsCR0; bool wantsCR1; bool wantsPS1; + bool wantsFPRF; bool outputCR0; bool outputCR1; bool outputPS1; + bool outputFPRF; + bool canEndBlock; bool skip; // followed BL-s for example }; diff --git a/Source/Core/Core/PowerPC/PPCTables.cpp b/Source/Core/Core/PowerPC/PPCTables.cpp index 685a9e04e0..e82754ee19 100644 --- a/Source/Core/Core/PowerPC/PPCTables.cpp +++ b/Source/Core/Core/PowerPC/PPCTables.cpp @@ -25,6 +25,14 @@ GekkoOPInfo *m_infoTable63[1024]; GekkoOPInfo *m_allInstructions[512]; int m_numInstructions; +const u64 m_crTable[16] = +{ + PPCCRToInternal(0x0), PPCCRToInternal(0x1), PPCCRToInternal(0x2), PPCCRToInternal(0x3), + PPCCRToInternal(0x4), PPCCRToInternal(0x5), PPCCRToInternal(0x6), PPCCRToInternal(0x7), + PPCCRToInternal(0x8), PPCCRToInternal(0x9), PPCCRToInternal(0xA), PPCCRToInternal(0xB), + PPCCRToInternal(0xC), PPCCRToInternal(0xD), PPCCRToInternal(0xE), PPCCRToInternal(0xF), +}; + GekkoOPInfo *GetOpInfo(UGeckoInstruction _inst) { GekkoOPInfo *info = m_infoTable[_inst.OPCD]; diff --git a/Source/Core/Core/PowerPC/PPCTables.h b/Source/Core/Core/PowerPC/PPCTables.h index a590134d0e..f535817c94 100644 --- a/Source/Core/Core/PowerPC/PPCTables.h +++ b/Source/Core/Core/PowerPC/PPCTables.h @@ -36,6 +36,8 @@ enum FL_EVIL = (1<<17), FL_USE_FPU = (1<<18), FL_LOADSTORE = (1<<19), + FL_SET_FPRF = (1<<20), + FL_READ_FPRF = (1<<21), }; enum diff --git a/Source/Core/Core/PowerPC/PowerPC.h b/Source/Core/Core/PowerPC/PowerPC.h index aa4e351d94..4b6ade70d4 100644 --- a/Source/Core/Core/PowerPC/PowerPC.h +++ b/Source/Core/Core/PowerPC/PowerPC.h @@ -187,11 +187,15 @@ inline u64 PPCCRToInternal(u8 value) return cr_val; } +// convert flags into 64-bit CR values with a 
lookup table
+extern const u64 m_crTable[16];
+
 // Warning: these CR operations are fairly slow since they need to convert from
 // PowerPC format (4 bit) to our internal 64 bit format. See the definition of
 // ppcState.cr_val for more explanations.
-inline void SetCRField(int cr_field, int value) {
-	PowerPC::ppcState.cr_val[cr_field] = PPCCRToInternal(value);
+inline void SetCRField(int cr_field, int value)
+{
+	PowerPC::ppcState.cr_val[cr_field] = m_crTable[value & 0xF]; // Mask to the 4 CR bits so the 16-entry table is never indexed out of range.
 }
 
 inline u32 GetCRField(int cr_field) {