diff --git a/Source/Core/Core/PowerPC/Jit64/Jit_LoadStore.cpp b/Source/Core/Core/PowerPC/Jit64/Jit_LoadStore.cpp
index e00f2a7065..a3a428c889 100644
--- a/Source/Core/Core/PowerPC/Jit64/Jit_LoadStore.cpp
+++ b/Source/Core/Core/PowerPC/Jit64/Jit_LoadStore.cpp
@@ -245,9 +245,7 @@ void Jit64::dcbx(UGeckoInstruction inst)
 
   RCOpArg Ra = inst.RA ? gpr.Use(inst.RA, RCMode::Read) : RCOpArg::Imm32(0);
   RCX64Reg Rb = gpr.Bind(inst.RB, make_loop ? RCMode::ReadWrite : RCMode::Read);
-  RCX64Reg tmp = gpr.Scratch();
-  RCX64Reg effective_address = gpr.Scratch();
-  RegCache::Realize(Ra, Rb, tmp, effective_address);
+  RegCache::Realize(Ra, Rb);
 
   RCX64Reg loop_counter;
   if (make_loop)
@@ -259,10 +257,10 @@ void Jit64::dcbx(UGeckoInstruction inst)
     // bdnz afterwards! So if we invalidate a single cache line, we don't adjust the registers at
     // all, if we invalidate 2 cachelines we adjust the registers by one step, and so on.
 
-    RCX64Reg& reg_cycle_count = tmp;
-    RCX64Reg& reg_downcount = effective_address;
+    RCX64Reg reg_cycle_count = gpr.Scratch();
+    RCX64Reg reg_downcount = gpr.Scratch();
     loop_counter = gpr.Scratch();
-    RegCache::Realize(loop_counter);
+    RegCache::Realize(reg_cycle_count, reg_downcount, loop_counter);
 
     // This must be true in order for us to pick up the DIV results and not trash any data.
     static_assert(RSCRATCH == Gen::EAX && RSCRATCH2 == Gen::EDX);
@@ -304,8 +302,8 @@ void Jit64::dcbx(UGeckoInstruction inst)
     LEA(32, loop_counter, MDisp(RSCRATCH2, 1));
   }
 
-  X64Reg value = RSCRATCH;
-  MOV_sum(32, value, Ra, Rb);
+  X64Reg addr = RSCRATCH;
+  MOV_sum(32, addr, Ra, Rb);
 
   if (make_loop)
   {
@@ -315,33 +313,36 @@ void Jit64::dcbx(UGeckoInstruction inst)
     ADD(32, R(Rb), R(RSCRATCH2));  // Rb += (RSCRATCH2 * 32)
   }
 
-  X64Reg addr = RSCRATCH2;
+  X64Reg tmp = RSCRATCH2;
+  RCX64Reg effective_address = gpr.Scratch();
+  RegCache::Realize(effective_address);
+
   FixupBranch bat_lookup_failed;
-  MOV(32, R(effective_address), R(value));
+  MOV(32, R(effective_address), R(addr));
   const u8* loop_start = GetCodePtr();
   if (MSR.IR)
   {
     // Translate effective address to physical address.
-    bat_lookup_failed = BATAddressLookup(value, tmp, PowerPC::ibat_table.data());
-    MOV(32, R(addr), R(effective_address));
-    AND(32, R(addr), Imm32(0x0001ffff));
-    AND(32, R(value), Imm32(0xfffe0000));
-    OR(32, R(value), R(addr));
+    bat_lookup_failed = BATAddressLookup(addr, tmp, PowerPC::ibat_table.data());
+    MOV(32, R(tmp), R(effective_address));
+    AND(32, R(tmp), Imm32(0x0001ffff));
+    AND(32, R(addr), Imm32(0xfffe0000));
+    OR(32, R(addr), R(tmp));
   }
 
-  MOV(32, R(addr), R(value));
   // Check whether a JIT cache line needs to be invalidated.
-  SHR(32, R(value), Imm8(5 + 5));  // >> 5 for cache line size, >> 5 for width of bitset
+  SHR(32, R(addr), Imm8(5 + 5));  // >> 5 for cache line size, >> 5 for width of bitset
   MOV(64, R(tmp), ImmPtr(GetBlockCache()->GetBlockBitSet()));
-  MOV(32, R(value), MComplex(tmp, value, SCALE_4, 0));
-  SHR(32, R(addr), Imm8(5));
-  BT(32, R(value), R(addr));
+  MOV(32, R(addr), MComplex(tmp, addr, SCALE_4, 0));
+  MOV(32, R(tmp), R(effective_address));
+  SHR(32, R(tmp), Imm8(5));
+  BT(32, R(addr), R(tmp));
   FixupBranch invalidate_needed = J_CC(CC_C, true);
 
   if (make_loop)
   {
     ADD(32, R(effective_address), Imm8(32));
-    MOV(32, R(value), R(effective_address));
+    MOV(32, R(addr), R(effective_address));
     SUB(32, R(loop_counter), Imm8(1));
     J_CC(CC_NZ, loop_start);
   }
diff --git a/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStore.cpp b/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStore.cpp
index 29aae5ac59..fc4603811f 100644
--- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStore.cpp
+++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStore.cpp
@@ -553,21 +553,96 @@ void JitArm64::dcbx(UGeckoInstruction inst)
   INSTRUCTION_START
   JITDISABLE(bJITLoadStoreOff);
 
-  gpr.Lock(ARM64Reg::W0, ARM64Reg::W30);
+  u32 a = inst.RA, b = inst.RB;
+
+  // Check if the next instructions match a known looping pattern:
+  // - dcbx rX
+  // - addi rX,rX,32
+  // - bdnz+ -8
+  const bool make_loop = a == 0 && b != 0 && CanMergeNextInstructions(2) &&
+                         (js.op[1].inst.hex & 0xfc00'ffff) == 0x38000020 &&
+                         js.op[1].inst.RA_6 == b && js.op[1].inst.RD_2 == b &&
+                         js.op[2].inst.hex == 0x4200fff8;
+
+  gpr.Lock(ARM64Reg::W0);
+  if (make_loop)
+    gpr.Lock(ARM64Reg::W1);
+
+  ARM64Reg WA = gpr.GetReg();
+
+  if (make_loop)
+    gpr.BindToRegister(b, true);
+
+  ARM64Reg loop_counter = ARM64Reg::INVALID_REG;
+  if (make_loop)
+  {
+    // We'll execute somewhere between one single cacheline invalidation and however many are needed
+    // to reduce the downcount to zero, never exceeding the amount requested by the game.
+    // To stay consistent with the rest of the code we adjust the involved registers (CTR and Rb)
+    // by the amount of cache lines we invalidate minus one -- since we'll run the regular addi and
+    // bdnz afterwards! So if we invalidate a single cache line, we don't adjust the registers at
+    // all, if we invalidate 2 cachelines we adjust the registers by one step, and so on.
+
+    ARM64Reg reg_cycle_count = gpr.GetReg();
+    ARM64Reg reg_downcount = gpr.GetReg();
+    loop_counter = ARM64Reg::W1;
+    ARM64Reg WB = ARM64Reg::W0;
+
+    // Figure out how many loops we want to do.
+    const u8 cycle_count_per_loop =
+        js.op[0].opinfo->numCycles + js.op[1].opinfo->numCycles + js.op[2].opinfo->numCycles;
+
+    LDR(IndexType::Unsigned, reg_downcount, PPC_REG, PPCSTATE_OFF(downcount));
+    MOVI2R(WA, 0);
+    CMP(reg_downcount, 0);                                           // if (downcount <= 0)
+    FixupBranch downcount_is_zero_or_negative = B(CCFlags::CC_LE);   // only do 1 invalidation; else:
+    LDR(IndexType::Unsigned, loop_counter, PPC_REG, PPCSTATE_OFF_SPR(SPR_CTR));
+    MOVI2R(reg_cycle_count, cycle_count_per_loop);
+    SDIV(WB, reg_downcount, reg_cycle_count);  // WB = downcount / cycle_count
+    SUB(WA, loop_counter, 1);                  // WA = CTR - 1
+    // ^ Note that this CTR-1 implicitly handles the CTR == 0 case correctly.
+    CMP(WB, WA);
+    CSEL(WA, WB, WA, CCFlags::CC_LO);  // WA = min(WB, WA)
+
+    // WA now holds the amount of loops to execute minus 1, which is the amount we need to adjust
+    // downcount, CTR, and Rb by to exit the loop construct with the right values in those
+    // registers.
+
+    // CTR -= WA
+    SUB(loop_counter, loop_counter, WA);
+    STR(IndexType::Unsigned, loop_counter, PPC_REG, PPCSTATE_OFF_SPR(SPR_CTR));
+
+    // downcount -= (WA * reg_cycle_count)
+    MUL(WB, WA, reg_cycle_count);
+    // ^ Note that this cannot overflow because it's limited by (downcount/cycle_count).
+    SUB(reg_downcount, reg_downcount, WB);
+    STR(IndexType::Unsigned, reg_downcount, PPC_REG, PPCSTATE_OFF(downcount));
+
+    SetJumpTarget(downcount_is_zero_or_negative);
+
+    // Load the loop_counter register with the amount of invalidations to execute.
+    ADD(loop_counter, WA, 1);
+
+    gpr.Unlock(reg_cycle_count, reg_downcount);
+  }
 
   ARM64Reg effective_addr = ARM64Reg::W0;
-  ARM64Reg physical_addr = MSR.IR ? gpr.GetReg() : effective_addr;
-  ARM64Reg value = gpr.GetReg();
-  ARM64Reg WA = ARM64Reg::W30;
-
-  u32 a = inst.RA, b = inst.RB;
+  ARM64Reg physical_addr = gpr.GetReg();
 
   if (a)
     ADD(effective_addr, gpr.R(a), gpr.R(b));
   else
     MOV(effective_addr, gpr.R(b));
 
+  if (make_loop)
+  {
+    // This is the best place to adjust Rb to what it should be since WA still has the
+    // adjusted loop count and we're done reading from Rb.
+    ADD(gpr.R(b), gpr.R(b), WA, ArithOption(WA, ShiftType::LSL, 5));  // Rb += (WA * 32)
+  }
+
   // Translate effective address to physical address.
+  const u8* loop_start = GetCodePtr();
   FixupBranch bat_lookup_failed;
   if (MSR.IR)
   {
@@ -577,18 +652,27 @@ void JitArm64::dcbx(UGeckoInstruction inst)
   }
 
   // Check whether a JIT cache line needs to be invalidated.
-  LSR(value, physical_addr, 5 + 5);  // >> 5 for cache line size, >> 5 for width of bitset
+  LSR(physical_addr, physical_addr, 5 + 5);  // >> 5 for cache line size, >> 5 for width of bitset
   MOVP2R(EncodeRegTo64(WA), GetBlockCache()->GetBlockBitSet());
-  LDR(value, EncodeRegTo64(WA), ArithOption(EncodeRegTo64(value), true));
+  LDR(physical_addr, EncodeRegTo64(WA), ArithOption(EncodeRegTo64(physical_addr), true));
 
-  LSR(WA, physical_addr, 5);  // mask sizeof cacheline, & 0x1f is the position within the bitset
+  LSR(WA, effective_addr, 5);  // mask sizeof cacheline, & 0x1f is the position within the bitset
 
-  LSRV(value, value, WA);  // move current bit to bit 0
+  LSRV(physical_addr, physical_addr, WA);  // move current bit to bit 0
+
+  FixupBranch bit_not_set = TBZ(physical_addr, 0);
+  FixupBranch invalidate_needed = B();
+  SetJumpTarget(bit_not_set);
+
+  if (make_loop)
+  {
+    ADD(effective_addr, effective_addr, 32);
+    SUBS(loop_counter, loop_counter, 1);
+    B(CCFlags::CC_NEQ, loop_start);
+  }
 
-  FixupBranch bit_not_set = TBZ(value, 0);
-  FixupBranch far_addr = B();
   SwitchToFarCode();
-  SetJumpTarget(far_addr);
+  SetJumpTarget(invalidate_needed);
 
   if (MSR.IR)
     SetJumpTarget(bat_lookup_failed);
@@ -596,27 +680,30 @@ void JitArm64::dcbx(UGeckoInstruction inst)
   BitSet32 fprs_to_push = fpr.GetCallerSavedUsed();
   gprs_to_push[DecodeReg(effective_addr)] = false;
   gprs_to_push[DecodeReg(physical_addr)] = false;
-  gprs_to_push[DecodeReg(value)] = false;
   gprs_to_push[DecodeReg(WA)] = false;
+  if (make_loop)
+    gprs_to_push[DecodeReg(loop_counter)] = false;
 
   ABI_PushRegisters(gprs_to_push);
-  m_float_emit.ABI_PushRegisters(fprs_to_push, ARM64Reg::X30);
+  m_float_emit.ABI_PushRegisters(fprs_to_push, WA);
 
-  // W0 (the function call argument) was already set earlier
-  MOVP2R(ARM64Reg::X8, &JitInterface::InvalidateICacheLine);
+  // The function call arguments are already in the correct registers
+  if (make_loop)
+    MOVP2R(ARM64Reg::X8, &JitInterface::InvalidateICacheLines);
+  else
+    MOVP2R(ARM64Reg::X8, &JitInterface::InvalidateICacheLine);
   BLR(ARM64Reg::X8);
 
-  m_float_emit.ABI_PopRegisters(fprs_to_push, ARM64Reg::X30);
+  m_float_emit.ABI_PopRegisters(fprs_to_push, WA);
   ABI_PopRegisters(gprs_to_push);
 
   FixupBranch near_addr = B();
   SwitchToNearCode();
-  SetJumpTarget(bit_not_set);
   SetJumpTarget(near_addr);
 
-  gpr.Unlock(effective_addr, value, WA);
-  if (MSR.IR)
-    gpr.Unlock(physical_addr);
+  gpr.Unlock(effective_addr, physical_addr, WA);
+  if (make_loop)
+    gpr.Unlock(loop_counter);
 }
 
 void JitArm64::dcbt(UGeckoInstruction inst)
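Note on the merged-loop arithmetic (not part of the patch itself): both back ends compute the same quantities before entering the unrolled dcbx loop. A minimal C++ sketch of that computation follows; the function, struct, and parameter names are hypothetical stand-ins chosen for illustration, they only mirror what the emitted code (SDIV/SUB/CSEL on AArch64, the DIV-based sequence on x86-64) calculates.

// Illustrative sketch only: plain C++ mirroring the loop-count math the JITs emit.
// All names here are hypothetical, not Dolphin APIs.
#include <algorithm>
#include <cstdint>

struct DcbxLoopAdjust
{
  uint32_t invalidations;  // cache lines to invalidate in the merged loop
  uint32_t new_ctr;        // CTR after adjusting for the merged iterations
  int32_t new_downcount;   // downcount after adjusting for the merged iterations
};

DcbxLoopAdjust ComputeDcbxLoopAdjust(int32_t downcount, uint32_t ctr, uint32_t cycles_per_loop)
{
  // Always perform at least one invalidation; only merge extra iterations while the
  // downcount is still positive.
  uint32_t extra = 0;
  if (downcount > 0)
  {
    const uint32_t limit_by_downcount = static_cast<uint32_t>(downcount) / cycles_per_loop;
    const uint32_t limit_by_ctr = ctr - 1;  // unsigned wrap makes CTR == 0 behave like 2^32
    extra = std::min(limit_by_downcount, limit_by_ctr);
  }

  // The guest's own addi/bdnz still run afterwards, so CTR, downcount, and Rb are
  // adjusted by "extra" (loops minus one) rather than by the full loop count.
  return {extra + 1, ctr - extra, downcount - static_cast<int32_t>(extra * cycles_per_loop)};
}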
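Likewise, the cache-line check both versions emit can be read as the sketch below. The helper and the bitset parameter are hypothetical (the real code indexes GetBlockCache()->GetBlockBitSet() directly). The new code takes the word index from the translated physical address but the bit position from the effective address; in the Jit64 hunk the low 17 bits of the translated address are copied from the effective address (the AND/OR pair), and bits 5-9 fall inside that range, so the same bit is selected either way.

// Illustrative sketch only: the bit lookup performed by the emitted code.
// The helper is hypothetical, not part of Dolphin's JIT interface.
#include <cstdint>

bool DcbxLineNeedsJitInvalidation(const uint32_t* bitset, uint32_t physical_addr,
                                  uint32_t effective_addr)
{
  // >> 5 for the 32-byte cache line, >> 5 more for the 32-bit width of each bitset word.
  const uint32_t word = bitset[physical_addr >> (5 + 5)];
  // Bit position within the word; BT with a register operand (x86-64) and LSRV on a
  // W register (AArch64) both take the offset modulo 32, hence the & 31.
  const uint32_t bit = (effective_addr >> 5) & 31;
  return ((word >> bit) & 1) != 0;
}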