diff --git a/Source/Core/Core/PowerPC/Interpreter/Interpreter_Tables.cpp b/Source/Core/Core/PowerPC/Interpreter/Interpreter_Tables.cpp index 50b68c3bee..d34333761b 100644 --- a/Source/Core/Core/PowerPC/Interpreter/Interpreter_Tables.cpp +++ b/Source/Core/Core/PowerPC/Interpreter/Interpreter_Tables.cpp @@ -285,8 +285,8 @@ static GekkoOPTemplate table31_2[] = {522, Interpreter::addcx, {"addcox", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_SET_CA | FL_RC_BIT, 1, 0, 0, 0}}, {138, Interpreter::addex, {"addex", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_READ_CA | FL_SET_CA | FL_RC_BIT, 1, 0, 0, 0}}, {650, Interpreter::addex, {"addeox", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_READ_CA | FL_SET_CA | FL_RC_BIT | FL_SET_OE, 1, 0, 0, 0}}, - {234, Interpreter::addmex, {"addmex", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_READ_CA | FL_SET_CA | FL_RC_BIT, 1, 0, 0, 0}}, - {202, Interpreter::addzex, {"addzex", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_READ_CA | FL_SET_CA | FL_RC_BIT, 1, 0, 0, 0}}, + {234, Interpreter::addmex, {"addmex", OPTYPE_INTEGER, FL_OUT_D | FL_IN_A | FL_READ_CA | FL_SET_CA | FL_RC_BIT, 1, 0, 0, 0}}, + {202, Interpreter::addzex, {"addzex", OPTYPE_INTEGER, FL_OUT_D | FL_IN_A | FL_READ_CA | FL_SET_CA | FL_RC_BIT, 1, 0, 0, 0}}, {491, Interpreter::divwx, {"divwx", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_RC_BIT, 40, 0, 0, 0}}, {1003, Interpreter::divwx, {"divwox", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_RC_BIT | FL_SET_OE, 40, 0, 0, 0}}, {459, Interpreter::divwux, {"divwux", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_RC_BIT, 40, 0, 0, 0}}, @@ -295,14 +295,14 @@ static GekkoOPTemplate table31_2[] = {11, Interpreter::mulhwux, {"mulhwux", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_RC_BIT, 5, 0, 0, 0}}, {235, Interpreter::mullwx, {"mullwx", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_RC_BIT, 5, 0, 0, 0}}, {747, Interpreter::mullwx, {"mullwox", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_RC_BIT | FL_SET_OE, 5, 0, 0, 0}}, - {104, Interpreter::negx, {"negx", OPTYPE_INTEGER, FL_OUT_D | 
FL_IN_AB | FL_RC_BIT, 1, 0, 0, 0}}, + {104, Interpreter::negx, {"negx", OPTYPE_INTEGER, FL_OUT_D | FL_IN_A | FL_RC_BIT, 1, 0, 0, 0}}, {40, Interpreter::subfx, {"subfx", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_RC_BIT, 1, 0, 0, 0}}, {552, Interpreter::subfx, {"subox", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_RC_BIT | FL_SET_OE, 1, 0, 0, 0}}, {8, Interpreter::subfcx, {"subfcx", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_SET_CA | FL_RC_BIT, 1, 0, 0, 0}}, {520, Interpreter::subfcx, {"subfcox", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_SET_CA | FL_RC_BIT | FL_SET_OE, 1, 0, 0, 0}}, {136, Interpreter::subfex, {"subfex", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_READ_CA | FL_SET_CA | FL_RC_BIT, 1, 0, 0, 0}}, - {232, Interpreter::subfmex, {"subfmex", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_READ_CA | FL_SET_CA | FL_RC_BIT, 1, 0, 0, 0}}, - {200, Interpreter::subfzex, {"subfzex", OPTYPE_INTEGER, FL_OUT_D | FL_IN_AB | FL_READ_CA | FL_SET_CA | FL_RC_BIT, 1, 0, 0, 0}}, + {232, Interpreter::subfmex, {"subfmex", OPTYPE_INTEGER, FL_OUT_D | FL_IN_A | FL_READ_CA | FL_SET_CA | FL_RC_BIT, 1, 0, 0, 0}}, + {200, Interpreter::subfzex, {"subfzex", OPTYPE_INTEGER, FL_OUT_D | FL_IN_A | FL_READ_CA | FL_SET_CA | FL_RC_BIT, 1, 0, 0, 0}}, }; static GekkoOPTemplate table59[] = diff --git a/Source/Core/Core/PowerPC/Jit64/Jit.cpp b/Source/Core/Core/PowerPC/Jit64/Jit.cpp index c9e8057847..2ee1652fa6 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit.cpp +++ b/Source/Core/Core/PowerPC/Jit64/Jit.cpp @@ -414,10 +414,10 @@ void Jit64::WriteBLRExit() bool disturbed = Cleanup(); if (disturbed) MOV(32, R(RSCRATCH), PPCSTATE(pc)); + MOV(32, R(RSCRATCH2), Imm32(js.downcountAmount)); CMP(64, R(RSCRATCH), MDisp(RSP, 8)); - MOV(32, R(RSCRATCH), Imm32(js.downcountAmount)); J_CC(CC_NE, asm_routines.dispatcherMispredictedBLR); - SUB(32, PPCSTATE(downcount), R(RSCRATCH)); + SUB(32, PPCSTATE(downcount), R(RSCRATCH2)); RET(); } diff --git a/Source/Core/Core/PowerPC/Jit64/Jit.h b/Source/Core/Core/PowerPC/Jit64/Jit.h index 
3ec5d83d35..b1efdbd06d 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit.h +++ b/Source/Core/Core/PowerPC/Jit64/Jit.h @@ -137,6 +137,7 @@ public: void GetCRFieldBit(int field, int bit, Gen::X64Reg out, bool negate = false); // Clobbers RDX. void SetCRFieldBit(int field, int bit, Gen::X64Reg in); + void ClearCRFieldBit(int field, int bit); // Generates a branch that will check if a given bit of a CR register part // is set or not. diff --git a/Source/Core/Core/PowerPC/Jit64/JitAsm.cpp b/Source/Core/Core/PowerPC/Jit64/JitAsm.cpp index 6e3799e387..6ff2ca3489 100644 --- a/Source/Core/Core/PowerPC/Jit64/JitAsm.cpp +++ b/Source/Core/Core/PowerPC/Jit64/JitAsm.cpp @@ -48,17 +48,18 @@ void Jit64AsmRoutineManager::Generate() ABI_PopRegistersAndAdjustStack({}, 0); FixupBranch skipToRealDispatch = J(SConfig::GetInstance().m_LocalCoreStartupParameter.bEnableDebugging); //skip the sync and compare first time dispatcherMispredictedBLR = GetCodePtr(); + AND(32, PPCSTATE(pc), Imm32(0xFFFFFFFC)); #if 0 // debug mispredicts MOV(32, R(ABI_PARAM1), MDisp(RSP, 8)); // guessed_pc - ABI_PushRegistersAndAdjustStack(1 << RSCRATCH, 0); + ABI_PushRegistersAndAdjustStack(1 << RSCRATCH2, 0); CALL(reinterpret_cast(&ReportMispredict)); - ABI_PopRegistersAndAdjustStack(1 << RSCRATCH, 0); + ABI_PopRegistersAndAdjustStack(1 << RSCRATCH2, 0); #endif ResetStack(); - SUB(32, PPCSTATE(downcount), R(RSCRATCH)); + SUB(32, PPCSTATE(downcount), R(RSCRATCH2)); dispatcher = GetCodePtr(); // The result of slice decrementation should be in flags if somebody jumped here diff --git a/Source/Core/Core/PowerPC/Jit64/Jit_Branch.cpp b/Source/Core/Core/PowerPC/Jit64/Jit_Branch.cpp index 2508fe1417..4e615cafdc 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit_Branch.cpp +++ b/Source/Core/Core/PowerPC/Jit64/Jit_Branch.cpp @@ -229,7 +229,11 @@ void Jit64::bclrx(UGeckoInstruction inst) #endif MOV(32, R(RSCRATCH), PPCSTATE_LR); - AND(32, R(RSCRATCH), Imm32(0xFFFFFFFC)); + // We don't have to do this because WriteBLRExit 
handles it for us. Specifically, since we only ever push + // divisible-by-four instruction addresses onto the stack, if the return address matches, we're already + // good. If it doesn't match, the mispredicted-BLR code handles the fixup. + if (!m_enable_blr_optimization) + AND(32, R(RSCRATCH), Imm32(0xFFFFFFFC)); if (inst.LK) MOV(32, PPCSTATE_LR, Imm32(js.compilerPC + 4)); diff --git a/Source/Core/Core/PowerPC/Jit64/Jit_FloatingPoint.cpp b/Source/Core/Core/PowerPC/Jit64/Jit_FloatingPoint.cpp index 4e926b4faa..72f278434c 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit_FloatingPoint.cpp +++ b/Source/Core/Core/PowerPC/Jit64/Jit_FloatingPoint.cpp @@ -276,6 +276,22 @@ void Jit64::FloatCompare(UGeckoInstruction inst, bool upper) int a = inst.FA; int b = inst.FB; int crf = inst.CRFD; + // CR field value stored for each compare outcome, indexed by CR_*_BIT. + int output[4] = { CR_SO, CR_EQ, CR_GT, CR_LT }; + + // Merge neighboring fcmp and cror (the primary use of cror). + UGeckoInstruction next = js.next_inst; + if (next.OPCD == 19 && next.SUBOP10 == 449 && (next.CRBA >> 2) == crf && (next.CRBB >> 2) == crf && (next.CRBD >> 2) == crf) + { + js.skipnext = true; + js.downcountAmount++; + int dst = 3 - (next.CRBD & 3); + // cror overwrites its destination bit, so the outcome that used to set that bit must + // no longer do so unless it is also one of the two source conditions (re-added below). + output[dst] &= ~(1 << dst); + output[3 - (next.CRBA & 3)] |= 1 << dst; + output[3 - (next.CRBB & 3)] |= 1 << dst; + } fpr.Lock(a, b); fpr.BindToRegister(b, true, false); @@ -315,14 +331,14 @@ void Jit64::FloatCompare(UGeckoInstruction inst, bool upper) pGreater = J_CC(CC_B); } - MOV(64, R(RSCRATCH), Imm64(PPCCRToInternal(CR_EQ))); + MOV(64, R(RSCRATCH), Imm64(PPCCRToInternal(output[CR_EQ_BIT]))); if (fprf) OR(32, PPCSTATE(fpscr), Imm32(CR_EQ << FPRF_SHIFT)); continue1 = J(); SetJumpTarget(pNaN); - MOV(64, R(RSCRATCH), Imm64(PPCCRToInternal(CR_SO))); + MOV(64, R(RSCRATCH), Imm64(PPCCRToInternal(output[CR_SO_BIT]))); if (fprf) OR(32, PPCSTATE(fpscr), Imm32(CR_SO << FPRF_SHIFT)); @@ -331,13 +347,13 @@ void Jit64::FloatCompare(UGeckoInstruction inst, bool upper) continue2 = J(); SetJumpTarget(pGreater); - MOV(64, R(RSCRATCH), Imm64(PPCCRToInternal(CR_GT))); + 
MOV(64, R(RSCRATCH), Imm64(PPCCRToInternal(output[CR_GT_BIT]))); if (fprf) OR(32, PPCSTATE(fpscr), Imm32(CR_GT << FPRF_SHIFT)); continue3 = J(); SetJumpTarget(pLesser); - MOV(64, R(RSCRATCH), Imm64(PPCCRToInternal(CR_LT))); + MOV(64, R(RSCRATCH), Imm64(PPCCRToInternal(output[CR_LT_BIT]))); if (fprf) OR(32, PPCSTATE(fpscr), Imm32(CR_LT << FPRF_SHIFT)); } diff --git a/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp b/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp index 399020b03d..f28f668bee 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp +++ b/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp @@ -153,7 +153,18 @@ void Jit64::ComputeRC(const Gen::OpArg & arg, bool needs_test, bool needs_sext) else { if (needs_test) + { TEST(32, arg, arg); + } + else + { + // If an operand to the cmp/rc op we're merging with the branch isn't used anymore, it'd be + // better to flush it here so that we don't have to flush it on both sides of the branch. + // We don't want to do this if a test is needed though, because it would interrupt macro-op + // fusion. + for (int j : js.op->gprInUse) + gpr.StoreFromRegister(j); + } DoMergedBranchCondition(); } } @@ -355,7 +366,8 @@ void Jit64::DoMergedBranch() else if ((js.next_inst.OPCD == 19) && (js.next_inst.SUBOP10 == 16)) // bclrx { MOV(32, R(RSCRATCH), M(&LR)); - AND(32, R(RSCRATCH), Imm32(0xFFFFFFFC)); + if (!m_enable_blr_optimization) + AND(32, R(RSCRATCH), Imm32(0xFFFFFFFC)); if (js.next_inst.LK) MOV(32, M(&LR), Imm32(js.next_compilerPC + 4)); WriteBLRExit(); @@ -544,7 +556,16 @@ void Jit64::cmpXX(UGeckoInstruction inst) MOV(64, PPCSTATE(cr_val[crf]), R(input)); // Place the comparison next to the branch for macro-op fusion if (merge_branch) - TEST(64, R(input), R(input)); + { + // We only need to do a 32-bit compare, since the flags set will be the same as a sign-extended + // result. 
+ // We should also test against gpr.R(a) if it's bound, since that's one less cycle of latency + // (the CPU doesn't have to wait for the movsxd to finish to resolve the branch). + if (gpr.R(a).IsSimpleReg()) + TEST(32, gpr.R(a), gpr.R(a)); + else + TEST(32, R(input), R(input)); + } } else { @@ -1007,20 +1028,30 @@ void Jit64::mulhwXx(UGeckoInstruction inst) else gpr.SetImmediate32(d, (u32)((gpr.R(a).offset * gpr.R(b).offset) >> 32)); } - else + else if (sign) { gpr.Lock(a, b, d); // no register choice gpr.FlushLockX(EDX, EAX); - gpr.BindToRegister(d, (d == a || d == b), true); + gpr.BindToRegister(d, d == a || d == b, true); MOV(32, R(EAX), gpr.R(a)); gpr.KillImmediate(b, true, false); - if (sign) - IMUL(32, gpr.R(b)); - else - MUL(32, gpr.R(b)); + IMUL(32, gpr.R(b)); MOV(32, gpr.R(d), R(EDX)); } + else + { + // Not faster for signed because we'd need two movsx. + gpr.Lock(a, b, d); + // We need to bind everything to registers since the top 32 bits need to be zero. + int src = d == b ? 
a : b; + gpr.BindToRegister(d, d == a || d == b, true); + gpr.BindToRegister(src, true, false); + if (d != a && d != b) + MOV(32, gpr.R(d), gpr.R(a)); + IMUL(64, gpr.RX(d), gpr.R(src)); + SHR(64, gpr.R(d), Imm8(32)); + } if (inst.Rc) ComputeRC(gpr.R(d)); gpr.UnlockAll(); diff --git a/Source/Core/Core/PowerPC/Jit64/Jit_SystemRegisters.cpp b/Source/Core/Core/PowerPC/Jit64/Jit_SystemRegisters.cpp index ecedf88be5..69014150d2 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit_SystemRegisters.cpp +++ b/Source/Core/Core/PowerPC/Jit64/Jit_SystemRegisters.cpp @@ -89,6 +89,47 @@ void Jit64::SetCRFieldBit(int field, int bit, Gen::X64Reg in) MOV(64, PPCSTATE(cr_val[field]), R(RSCRATCH2)); } +// Emits code that clears one bit (CR_*_BIT) of CR field `field` in its 64-bit cr_val form. +// Used for crclr, where the value written is always 0. Clobbers RSCRATCH2. +void Jit64::ClearCRFieldBit(int field, int bit) +{ + MOV(64, R(RSCRATCH2), PPCSTATE(cr_val[field])); + + // Setting bit 63 is how GT gets cleared, so do that up front when the value is all zero. + // NOTE(review): presumably because the bit 32 set at the end would otherwise turn GT on - confirm. + if (bit != CR_GT_BIT) + { + TEST(64, R(RSCRATCH2), R(RSCRATCH2)); + FixupBranch dont_clear_gt = J_CC(CC_NZ); + BTS(64, R(RSCRATCH2), Imm8(63)); + SetJumpTarget(dont_clear_gt); + } + + switch (bit) + { + case CR_SO_BIT: // SO lives in bit 61: clear it + BTR(64, R(RSCRATCH2), Imm8(61)); + break; + + case CR_EQ_BIT: // EQ is stored inverted in bit 0: wipe the low 32 bits, then set bit 0 + SHR(64, R(RSCRATCH2), Imm8(32)); + SHL(64, R(RSCRATCH2), Imm8(32)); + BTS(64, R(RSCRATCH2), Imm8(0)); + break; + + case CR_GT_BIT: // GT is stored inverted in bit 63: set it + BTS(64, R(RSCRATCH2), Imm8(63)); + break; + + case CR_LT_BIT: // LT lives in bit 62: clear it + BTR(64, R(RSCRATCH2), Imm8(62)); + break; + } + + BTS(64, R(RSCRATCH2), Imm8(32)); + MOV(64, PPCSTATE(cr_val[field]), R(RSCRATCH2)); +} + FixupBranch Jit64::JumpIfCRFieldBit(int field, int bit, bool jump_if_set) { switch (bit) @@ -472,6 +513,13 @@ void Jit64::crXXX(UGeckoInstruction inst) JITDISABLE(bJITSystemRegistersOff); _dbg_assert_msg_(DYNA_REC, inst.OPCD == 19, "Invalid crXXX"); + // Special case: crclr + if (inst.CRBA == inst.CRBB && inst.CRBA == inst.CRBD && inst.SUBOP10 == 193) + { + ClearCRFieldBit(inst.CRBD >> 2, 3 - (inst.CRBD & 3)); + return; + } + // TODO(delroth): Potential optimizations could be applied here. 
For // instance, if the two CR bits being loaded are the same, two loads are // not required. diff --git a/Source/Core/Core/PowerPC/PPCAnalyst.cpp b/Source/Core/Core/PowerPC/PPCAnalyst.cpp index acc5d372ed..cefba76d93 100644 --- a/Source/Core/Core/PowerPC/PPCAnalyst.cpp +++ b/Source/Core/Core/PowerPC/PPCAnalyst.cpp @@ -406,6 +406,11 @@ static bool isCarryOp(const CodeOp& a) return (a.opinfo->flags & FL_SET_CA) && !(a.opinfo->flags & FL_SET_OE) && a.opinfo->type == OPTYPE_INTEGER; } +static bool isCror(const CodeOp& a) +{ + return a.inst.OPCD == 19 && a.inst.SUBOP10 == 449; +} + void PPCAnalyzer::ReorderInstructionsCore(u32 instructions, CodeOp* code, bool reverse, ReorderType type) { // Bubbling an instruction sometimes reveals another opportunity to bubble an instruction, so do @@ -426,7 +431,7 @@ void PPCAnalyzer::ReorderInstructionsCore(u32 instructions, CodeOp* code, bool r CodeOp &b = code[i + increment]; // Reorder integer compares, rlwinm., and carry-affecting ops // (if we add more merged branch instructions, add them here!) - if ((type == REORDER_CARRY && isCarryOp(a)) || (type == REORDER_CMP && (isCmp(a) || a.outputCR0))) + if ((type == REORDER_CROR && isCror(a)) || (type == REORDER_CARRY && isCarryOp(a)) || (type == REORDER_CMP && (isCmp(a) || a.outputCR0))) { // once we're next to a carry instruction, don't move away! if (type == REORDER_CARRY && i != start) @@ -454,6 +459,10 @@ void PPCAnalyzer::ReorderInstructionsCore(u32 instructions, CodeOp* code, bool r void PPCAnalyzer::ReorderInstructions(u32 instructions, CodeOp *code) { + // Reorder cror instructions upwards (e.g. towards an fcmp). Technically we should be more + // picky about this, but cror seems to almost solely be used for this purpose in real code. + // Additionally, the other boolean ops seem to almost never be used. 
+ ReorderInstructionsCore(instructions, code, true, REORDER_CROR); // For carry, bubble instructions *towards* each other; one direction often isn't enough // to get pairs like addc/adde next to each other. if (HasOption(OPTION_CARRY_MERGE)) diff --git a/Source/Core/Core/PowerPC/PPCAnalyst.h b/Source/Core/Core/PowerPC/PPCAnalyst.h index 8b3f4bc85a..8abf4bbdfe 100644 --- a/Source/Core/Core/PowerPC/PPCAnalyst.h +++ b/Source/Core/Core/PowerPC/PPCAnalyst.h @@ -156,7 +156,8 @@ private: enum ReorderType { REORDER_CARRY, - REORDER_CMP + REORDER_CMP, + REORDER_CROR }; void ReorderInstructionsCore(u32 instructions, CodeOp* code, bool reverse, ReorderType type);