diff --git a/Source/Core/Core/PowerPC/Jit64/Jit.cpp b/Source/Core/Core/PowerPC/Jit64/Jit.cpp index 2ccd045291..dc992a0129 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit.cpp +++ b/Source/Core/Core/PowerPC/Jit64/Jit.cpp @@ -522,6 +522,7 @@ void Jit64::Jit(u32 em_address) jo.enableBlocklink = false; analyzer.ClearOption(PPCAnalyst::PPCAnalyzer::OPTION_CONDITIONAL_CONTINUE); analyzer.ClearOption(PPCAnalyst::PPCAnalyzer::OPTION_BRANCH_MERGE); + analyzer.ClearOption(PPCAnalyst::PPCAnalyzer::OPTION_CROR_MERGE); analyzer.ClearOption(PPCAnalyst::PPCAnalyzer::OPTION_CARRY_MERGE); } Trace(); @@ -603,7 +604,7 @@ const u8* Jit64::DoJit(u32 em_address, PPCAnalyst::CodeBuffer *code_buf, JitBloc if (!SConfig::GetInstance().m_LocalCoreStartupParameter.bEnableDebugging) js.downcountAmount += PatchEngine::GetSpeedhackCycles(code_block.m_address); - js.skipnext = false; + js.skipInstructions = 0; js.carryFlagSet = false; js.carryFlagInverted = false; js.assumeNoPairedQuantize = false; @@ -651,12 +652,9 @@ const u8* Jit64::DoJit(u32 em_address, PPCAnalyst::CodeBuffer *code_buf, JitBloc if (i == (code_block.m_num_instructions - 1)) { - // WARNING - cmp->branch merging will screw this up. - js.isLastInstruction = true; - js.next_inst = 0; - js.next_inst_bp = false; if (Profiler::g_ProfileBlocks) { + // WARNING - cmp->branch merging will screw this up. 
PROFILER_VPUSH; // get end tic PROFILER_QUERY_PERFORMANCE_COUNTER(&b->ticStop); @@ -664,14 +662,7 @@ const u8* Jit64::DoJit(u32 em_address, PPCAnalyst::CodeBuffer *code_buf, JitBloc PROFILER_UPDATE_TIME(b); PROFILER_VPOP; } - } - else - { - // help peephole optimizations - js.next_inst = ops[i + 1].inst; - js.next_compilerPC = ops[i + 1].address; - js.next_op = &ops[i + 1]; - js.next_inst_bp = SConfig::GetInstance().m_LocalCoreStartupParameter.bEnableDebugging && breakpoints.IsAddressBreakPoint(ops[i + 1].address); + js.isLastInstruction = true; } if (jo.optimizeGatherPipe && js.fifoBytesThisBlock >= 32) @@ -856,11 +847,8 @@ const u8* Jit64::DoJit(u32 em_address, PPCAnalyst::CodeBuffer *code_buf, JitBloc //NOTICE_LOG(DYNA_REC, "Unflushed register: %s", ppc_inst.c_str()); } #endif - if (js.skipnext) - { - js.skipnext = false; - i++; // Skip next instruction - } + i += js.skipInstructions; + js.skipInstructions = 0; } u32 function = HLE::GetFunctionIndex(js.blockStart); @@ -919,5 +907,6 @@ void Jit64::EnableOptimization() { analyzer.SetOption(PPCAnalyst::PPCAnalyzer::OPTION_CONDITIONAL_CONTINUE); analyzer.SetOption(PPCAnalyst::PPCAnalyzer::OPTION_BRANCH_MERGE); + analyzer.SetOption(PPCAnalyst::PPCAnalyzer::OPTION_CROR_MERGE); analyzer.SetOption(PPCAnalyst::PPCAnalyzer::OPTION_CARRY_MERGE); } diff --git a/Source/Core/Core/PowerPC/Jit64/Jit.h b/Source/Core/Core/PowerPC/Jit64/Jit.h index 7da231d70e..a2d6c3026f 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit.h +++ b/Source/Core/Core/PowerPC/Jit64/Jit.h @@ -115,6 +115,7 @@ public: void GenerateConstantOverflow(bool overflow); void GenerateConstantOverflow(s64 val); void GenerateOverflow(); + bool MergeAllowedNextInstructions(int count); void FinalizeCarryOverflow(bool oe, bool inv = false); void FinalizeCarry(Gen::CCFlags cond); void FinalizeCarry(bool ca); diff --git a/Source/Core/Core/PowerPC/Jit64/Jit_FloatingPoint.cpp b/Source/Core/Core/PowerPC/Jit64/Jit_FloatingPoint.cpp index 5eb13bec7f..55de795673 100644 --- 
a/Source/Core/Core/PowerPC/Jit64/Jit_FloatingPoint.cpp
+++ b/Source/Core/Core/PowerPC/Jit64/Jit_FloatingPoint.cpp
@@ -346,10 +346,12 @@ void Jit64::FloatCompare(UGeckoInstruction inst, bool upper)
 	int output[4] = { CR_SO, CR_EQ, CR_GT, CR_LT };
 
 	// Merge neighboring fcmp and cror (the primary use of cror).
-	UGeckoInstruction next = js.next_inst;
-	if (next.OPCD == 19 && next.SUBOP10 == 449 && (next.CRBA >> 2) == crf && (next.CRBB >> 2) == crf && (next.CRBD >> 2) == crf)
+	UGeckoInstruction next = js.op[1].inst;
+	if (analyzer.HasOption(PPCAnalyst::PPCAnalyzer::OPTION_CROR_MERGE) &&
+	    MergeAllowedNextInstructions(1) && next.OPCD == 19 && next.SUBOP10 == 449 &&
+	    (next.CRBA >> 2) == crf && (next.CRBB >> 2) == crf && (next.CRBD >> 2) == crf)
 	{
-		js.skipnext = true;
+		js.skipInstructions = 1;
 		js.downcountAmount++;
 		int dst = 3 - (next.CRBD & 3);
 		output[3 - (next.CRBD & 3)] &= ~(1 << dst);
diff --git a/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp b/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp
index c0015a19e0..57e3d110f1 100644
--- a/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp
+++ b/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp
@@ -50,14 +50,38 @@ void Jit64::GenerateOverflow()
 	SetJumpTarget(exit);
 }
 
+// Reports whether the `count` ops that follow the current one (js.op[1] ..
+// js.op[count]) may be merged into it. Returns false when the CPU state is
+// CPU_STEPPING, when js.instructionsLeft is smaller than `count`, or when any
+// of those ops sits on a breakpoint (with bEnableDebugging set) or is a branch target.
+bool Jit64::MergeAllowedNextInstructions(int count)
+{
+	if (PowerPC::GetState() == PowerPC::CPU_STEPPING)
+		return false;
+	if (js.instructionsLeft < count)
+		return false;
+
+	for (int offset = 1; offset <= count; ++offset)
+	{
+		const PPCAnalyst::CodeOp& candidate = js.op[offset];
+		// Be careful: a breakpoint kills flags in between instructions
+		const bool hasBreakpoint =
+			SConfig::GetInstance().m_LocalCoreStartupParameter.bEnableDebugging &&
+			PowerPC::breakpoints.IsAddressBreakPoint(candidate.address);
+		if (hasBreakpoint || candidate.isBranchTarget)
+			return false;
+	}
+	return true;
+}
+
 void Jit64::FinalizeCarry(CCFlags cond)
 {
 	js.carryFlagSet = false;
 	js.carryFlagInverted = false;
 	if (js.op->wantsCA)
 	{
-		// Be careful: a breakpoint kills flags in between instructions
-		if (!js.isLastInstruction && js.next_op->wantsCAInFlags && !js.next_inst_bp)
+		// Not actually merging
instructions, but the effect is equivalent (we can't have breakpoints/etc in between). + if (MergeAllowedNextInstructions(1) && js.op[1].wantsCAInFlags) { if (cond == CC_C || cond == CC_NC) { @@ -86,7 +102,7 @@ void Jit64::FinalizeCarry(bool ca) js.carryFlagInverted = false; if (js.op->wantsCA) { - if (!js.isLastInstruction && js.next_op->wantsCAInFlags && !js.next_inst_bp) + if (MergeAllowedNextInstructions(1) && js.op[1].wantsCAInFlags) { if (ca) STC(); @@ -331,7 +347,10 @@ bool Jit64::CheckMergedBranch(int crf) if (!analyzer.HasOption(PPCAnalyst::PPCAnalyzer::OPTION_BRANCH_MERGE)) return false; - const UGeckoInstruction& next = js.next_inst; + if (!MergeAllowedNextInstructions(1)) + return false; + + const UGeckoInstruction& next = js.op[1].inst; return (((next.OPCD == 16 /* bcx */) || ((next.OPCD == 19) && (next.SUBOP10 == 528) /* bcctrx */) || ((next.OPCD == 19) && (next.SUBOP10 == 16) /* bclrx */)) && @@ -343,33 +362,35 @@ bool Jit64::CheckMergedBranch(int crf) void Jit64::DoMergedBranch() { // Code that handles successful PPC branching. 
- if (js.next_inst.OPCD == 16) // bcx + const UGeckoInstruction& next = js.op[1].inst; + const u32 nextPC = js.op[1].address; + if (next.OPCD == 16) // bcx { - if (js.next_inst.LK) - MOV(32, M(&LR), Imm32(js.next_compilerPC + 4)); + if (next.LK) + MOV(32, M(&LR), Imm32(nextPC + 4)); u32 destination; - if (js.next_inst.AA) - destination = SignExt16(js.next_inst.BD << 2); + if (next.AA) + destination = SignExt16(next.BD << 2); else - destination = js.next_compilerPC + SignExt16(js.next_inst.BD << 2); - WriteExit(destination, js.next_inst.LK, js.next_compilerPC + 4); + destination = nextPC + SignExt16(next.BD << 2); + WriteExit(destination, next.LK, nextPC + 4); } - else if ((js.next_inst.OPCD == 19) && (js.next_inst.SUBOP10 == 528)) // bcctrx + else if ((next.OPCD == 19) && (next.SUBOP10 == 528)) // bcctrx { - if (js.next_inst.LK) - MOV(32, M(&LR), Imm32(js.next_compilerPC + 4)); + if (next.LK) + MOV(32, M(&LR), Imm32(nextPC + 4)); MOV(32, R(RSCRATCH), M(&CTR)); AND(32, R(RSCRATCH), Imm32(0xFFFFFFFC)); - WriteExitDestInRSCRATCH(js.next_inst.LK, js.next_compilerPC + 4); + WriteExitDestInRSCRATCH(next.LK, nextPC + 4); } - else if ((js.next_inst.OPCD == 19) && (js.next_inst.SUBOP10 == 16)) // bclrx + else if ((next.OPCD == 19) && (next.SUBOP10 == 16)) // bclrx { MOV(32, R(RSCRATCH), M(&LR)); if (!m_enable_blr_optimization) AND(32, R(RSCRATCH), Imm32(0xFFFFFFFC)); - if (js.next_inst.LK) - MOV(32, M(&LR), Imm32(js.next_compilerPC + 4)); + if (next.LK) + MOV(32, M(&LR), Imm32(nextPC + 4)); WriteBLRExit(); } else @@ -381,9 +402,11 @@ void Jit64::DoMergedBranch() void Jit64::DoMergedBranchCondition() { js.downcountAmount++; - js.skipnext = true; - int test_bit = 8 >> (js.next_inst.BI & 3); - bool condition = !!(js.next_inst.BO & BO_BRANCH_IF_TRUE); + js.skipInstructions = 1; + const UGeckoInstruction& next = js.op[1].inst; + int test_bit = 8 >> (next.BI & 3); + bool condition = !!(next.BO & BO_BRANCH_IF_TRUE); + const u32 nextPC = js.op[1].address; gpr.UnlockAll(); 
gpr.UnlockAllX(); @@ -408,16 +431,18 @@ void Jit64::DoMergedBranchCondition() { gpr.Flush(); fpr.Flush(); - WriteExit(js.next_compilerPC + 4); + WriteExit(nextPC + 4); } } void Jit64::DoMergedBranchImmediate(s64 val) { js.downcountAmount++; - js.skipnext = true; - int test_bit = 8 >> (js.next_inst.BI & 3); - bool condition = !!(js.next_inst.BO & BO_BRANCH_IF_TRUE); + js.skipInstructions = 1; + const UGeckoInstruction& next = js.op[1].inst; + int test_bit = 8 >> (next.BI & 3); + bool condition = !!(next.BO & BO_BRANCH_IF_TRUE); + const u32 nextPC = js.op[1].address; gpr.UnlockAll(); gpr.UnlockAllX(); @@ -441,7 +466,7 @@ void Jit64::DoMergedBranchImmediate(s64 val) { gpr.Flush(); fpr.Flush(); - WriteExit(js.next_compilerPC + 4); + WriteExit(nextPC + 4); } } diff --git a/Source/Core/Core/PowerPC/Jit64/Jit_LoadStore.cpp b/Source/Core/Core/PowerPC/Jit64/Jit_LoadStore.cpp index c322c2248f..0c0a23acfc 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit_LoadStore.cpp +++ b/Source/Core/Core/PowerPC/Jit64/Jit_LoadStore.cpp @@ -95,15 +95,12 @@ void Jit64::lXXx(UGeckoInstruction inst) } // PowerPC has no 8-bit sign extended load, but x86 does, so merge extsb with the load if we find it. 
- if (accessSize == 8 && js.next_inst.OPCD == 31 && js.next_inst.SUBOP10 == 954 && - js.next_inst.RS == inst.RD && js.next_inst.RA == inst.RD && !js.next_inst.Rc) + if (MergeAllowedNextInstructions(1) && accessSize == 8 && js.op[1].inst.OPCD == 31 && js.op[1].inst.SUBOP10 == 954 && + js.op[1].inst.RS == inst.RD && js.op[1].inst.RA == inst.RD && !js.op[1].inst.Rc) { - if (PowerPC::GetState() != PowerPC::CPU_STEPPING) - { - js.downcountAmount++; - js.skipnext = true; - signExtend = true; - } + js.downcountAmount++; + js.skipInstructions = 1; + signExtend = true; } // TODO(ector): Make it dynamically enable/disable idle skipping where appropriate diff --git a/Source/Core/Core/PowerPC/Jit64/Jit_SystemRegisters.cpp b/Source/Core/Core/PowerPC/Jit64/Jit_SystemRegisters.cpp index 1b3772ff55..46aaddb1f6 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit_SystemRegisters.cpp +++ b/Source/Core/Core/PowerPC/Jit64/Jit_SystemRegisters.cpp @@ -282,38 +282,38 @@ void Jit64::mfspr(UGeckoInstruction inst) ADD(64, R(RAX), R(RDX)); MOV(64, PPCSTATE(spr[SPR_TL]), R(RAX)); - // Two calls of TU/TL next to each other are extremely common in typical usage, so merge them - // if we can. 
- u32 nextIndex = (js.next_inst.SPRU << 5) | (js.next_inst.SPRL & 0x1F); - // Be careful; the actual opcode is for mftb (371), not mfspr (339) - int n = js.next_inst.RD; - if (js.next_inst.OPCD == 31 && js.next_inst.SUBOP10 == 371 && (nextIndex == SPR_TU || nextIndex == SPR_TL) && - PowerPC::GetState() != PowerPC::CPU_STEPPING && n != d) + if (MergeAllowedNextInstructions(1)) { - js.downcountAmount++; - js.skipnext = true; - gpr.Lock(d, n); - gpr.BindToRegister(d, false); - gpr.BindToRegister(n, false); - if (iIndex == SPR_TL) - MOV(32, gpr.R(d), R(RAX)); - if (nextIndex == SPR_TL) - MOV(32, gpr.R(n), R(RAX)); - SHR(64, R(RAX), Imm8(32)); - if (iIndex == SPR_TU) - MOV(32, gpr.R(d), R(RAX)); - if (nextIndex == SPR_TU) - MOV(32, gpr.R(n), R(RAX)); - } - else - { - gpr.Lock(d); - gpr.BindToRegister(d, false); - if (iIndex == SPR_TU) + const UGeckoInstruction& next = js.op[1].inst; + // Two calls of TU/TL next to each other are extremely common in typical usage, so merge them + // if we can. 
+ u32 nextIndex = (next.SPRU << 5) | (next.SPRL & 0x1F); + // Be careful; the actual opcode is for mftb (371), not mfspr (339) + int n = next.RD; + if (next.OPCD == 31 && next.SUBOP10 == 371 && (nextIndex == SPR_TU || nextIndex == SPR_TL) && n != d) + { + js.downcountAmount++; + js.skipInstructions = 1; + gpr.Lock(d, n); + gpr.BindToRegister(d, false); + gpr.BindToRegister(n, false); + if (iIndex == SPR_TL) + MOV(32, gpr.R(d), R(RAX)); + if (nextIndex == SPR_TL) + MOV(32, gpr.R(n), R(RAX)); SHR(64, R(RAX), Imm8(32)); - MOV(32, gpr.R(d), R(RAX)); + if (iIndex == SPR_TU) + MOV(32, gpr.R(d), R(RAX)); + if (nextIndex == SPR_TU) + MOV(32, gpr.R(n), R(RAX)); + break; + } } - gpr.UnlockAllX(); + gpr.Lock(d); + gpr.BindToRegister(d, false); + if (iIndex == SPR_TU) + SHR(64, R(RAX), Imm8(32)); + MOV(32, gpr.R(d), R(RAX)); break; } case SPR_XER: @@ -341,6 +341,7 @@ void Jit64::mfspr(UGeckoInstruction inst) MOV(32, gpr.R(d), PPCSTATE(spr[iIndex])); break; } + gpr.UnlockAllX(); gpr.UnlockAll(); } diff --git a/Source/Core/Core/PowerPC/Jit64IL/JitIL.cpp b/Source/Core/Core/PowerPC/Jit64IL/JitIL.cpp index b30515c101..49863e0104 100644 --- a/Source/Core/Core/PowerPC/Jit64IL/JitIL.cpp +++ b/Source/Core/Core/PowerPC/Jit64IL/JitIL.cpp @@ -610,16 +610,7 @@ const u8* JitIL::DoJit(u32 em_address, PPCAnalyst::CodeBuffer *code_buf, JitBloc js.downcountAmount += opinfo->numCycles; if (i == (code_block.m_num_instructions - 1)) - { js.isLastInstruction = true; - js.next_inst = 0; - } - else - { - // help peephole optimizations - js.next_inst = ops[i + 1].inst; - js.next_compilerPC = ops[i + 1].address; - } u32 function = HLE::GetFunctionIndex(ops[i].address); if (function != 0) diff --git a/Source/Core/Core/PowerPC/JitArm32/Jit.cpp b/Source/Core/Core/PowerPC/JitArm32/Jit.cpp index e3deb71448..ab796df34c 100644 --- a/Source/Core/Core/PowerPC/JitArm32/Jit.cpp +++ b/Source/Core/Core/PowerPC/JitArm32/Jit.cpp @@ -443,7 +443,7 @@ const u8* JitArm::DoJit(u32 em_address, PPCAnalyst::CodeBuffer 
*code_buf, JitBlo if (!SConfig::GetInstance().m_LocalCoreStartupParameter.bEnableDebugging) js.downcountAmount += PatchEngine::GetSpeedhackCycles(em_address); - js.skipnext = false; + js.skipInstructions = 0; js.compilerPC = nextPC; // Translate instructions @@ -459,13 +459,6 @@ const u8* JitArm::DoJit(u32 em_address, PPCAnalyst::CodeBuffer *code_buf, JitBlo { // WARNING - cmp->branch merging will screw this up. js.isLastInstruction = true; - js.next_inst = 0; - } - else - { - // help peephole optimizations - js.next_inst = ops[i + 1].inst; - js.next_compilerPC = ops[i + 1].address; } if (jo.optimizeGatherPipe && js.fifoBytesThisBlock >= 32) diff --git a/Source/Core/Core/PowerPC/JitArm64/Jit.cpp b/Source/Core/Core/PowerPC/JitArm64/Jit.cpp index 29cbb62e1d..543d186588 100644 --- a/Source/Core/Core/PowerPC/JitArm64/Jit.cpp +++ b/Source/Core/Core/PowerPC/JitArm64/Jit.cpp @@ -232,7 +232,7 @@ const u8* JitArm64::DoJit(u32 em_address, PPCAnalyst::CodeBuffer *code_buf, JitB js.blockStart = em_address; js.fifoBytesThisBlock = 0; js.downcountAmount = 0; - js.skipnext = false; + js.skipInstructions = 0; js.curBlock = b; u32 nextPC = em_address; @@ -281,13 +281,6 @@ const u8* JitArm64::DoJit(u32 em_address, PPCAnalyst::CodeBuffer *code_buf, JitB { // WARNING - cmp->branch merging will screw this up. js.isLastInstruction = true; - js.next_inst = 0; - } - else - { - // help peephole optimizations - js.next_inst = ops[i + 1].inst; - js.next_compilerPC = ops[i + 1].address; } if (jo.optimizeGatherPipe && js.fifoBytesThisBlock >= 32) diff --git a/Source/Core/Core/PowerPC/JitCommon/JitBase.h b/Source/Core/Core/PowerPC/JitCommon/JitBase.h index 96dfdf7510..6205f7d420 100644 --- a/Source/Core/Core/PowerPC/JitCommon/JitBase.h +++ b/Source/Core/Core/PowerPC/JitCommon/JitBase.h @@ -65,9 +65,7 @@ protected: struct JitState { u32 compilerPC; - u32 next_compilerPC; u32 blockStart; - UGeckoInstruction next_inst; // for easy peephole opt. 
int instructionNumber; int instructionsLeft; int downcountAmount; @@ -88,10 +86,9 @@ protected: bool firstFPInstructionFound; bool isLastInstruction; bool memcheck; - bool skipnext; + int skipInstructions; bool carryFlagSet; bool carryFlagInverted; - bool next_inst_bp; int fifoBytesThisBlock; @@ -99,7 +96,6 @@ protected: PPCAnalyst::BlockRegStats gpa; PPCAnalyst::BlockRegStats fpa; PPCAnalyst::CodeOp* op; - PPCAnalyst::CodeOp* next_op; u8* rewriteStart; JitBlock *curBlock; diff --git a/Source/Core/Core/PowerPC/PPCAnalyst.cpp b/Source/Core/Core/PowerPC/PPCAnalyst.cpp index c2068a2e61..5233df6c05 100644 --- a/Source/Core/Core/PowerPC/PPCAnalyst.cpp +++ b/Source/Core/Core/PowerPC/PPCAnalyst.cpp @@ -219,6 +219,11 @@ static bool CanSwapAdjacentOps(const CodeOp &a, const CodeOp &b) const GekkoOPInfo *b_info = b.opinfo; int a_flags = a_info->flags; int b_flags = b_info->flags; + + // can't reorder around breakpoints + if (SConfig::GetInstance().m_LocalCoreStartupParameter.bEnableDebugging && + (PowerPC::breakpoints.IsAddressBreakPoint(a.address) || PowerPC::breakpoints.IsAddressBreakPoint(b.address))) + return false; if (b_flags & (FL_SET_CRx | FL_ENDBLOCK | FL_TIMER | FL_EVIL | FL_SET_OE)) return false; if ((b_flags & (FL_RC_BIT | FL_RC_BIT_F)) && (b.inst.Rc)) @@ -462,7 +467,8 @@ void PPCAnalyzer::ReorderInstructions(u32 instructions, CodeOp *code) // Reorder cror instructions upwards (e.g. towards an fcmp). Technically we should be more // picky about this, but cror seems to almost solely be used for this purpose in real code. // Additionally, the other boolean ops seem to almost never be used. - ReorderInstructionsCore(instructions, code, true, REORDER_CROR); + if (HasOption(OPTION_CROR_MERGE)) + ReorderInstructionsCore(instructions, code, true, REORDER_CROR); // For carry, bubble instructions *towards* each other; one direction often isn't enough // to get pairs like addc/adde next to each other. 
if (HasOption(OPTION_CARRY_MERGE)) diff --git a/Source/Core/Core/PowerPC/PPCAnalyst.h b/Source/Core/Core/PowerPC/PPCAnalyst.h index 468d036a8e..6fc074bebb 100644 --- a/Source/Core/Core/PowerPC/PPCAnalyst.h +++ b/Source/Core/Core/PowerPC/PPCAnalyst.h @@ -214,6 +214,9 @@ public: // Reorder carry instructions next to their associated branches and pass // carry flags in the x86 flags between them, instead of in XER. OPTION_CARRY_MERGE = (1 << 5), + + // Reorder cror instructions next to their associated fcmp. + OPTION_CROR_MERGE = (1 << 6), };