Merge pull request #10035 from JosJuice/jitarm64-dcbx-in-loop
JitArm64: dcbx loop detection for improved performance when invalidating large memory regions
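This merge teaches the AArch64 JIT the same optimization the x64 JIT already performs: when a dcbx instruction is followed by an addi and a bdnz that walk through memory one cache line per iteration, the whole loop is recognized up front and handled as one batched invalidation instead of one JIT dispatch per line. The guest idiom being matched, reconstructed from the pattern comments in the diff below (register name illustrative):

    // loop:
    //   dcbx  rX          // flush/invalidate one 32-byte cache line
    //   addi  rX, rX, 32  // advance to the next cache line
    //   bdnz+ loop        // decrement CTR, branch back while CTR != 0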
This commit is contained in:
commit 6659f8019e
@@ -245,9 +245,7 @@ void Jit64::dcbx(UGeckoInstruction inst)
   RCOpArg Ra = inst.RA ? gpr.Use(inst.RA, RCMode::Read) : RCOpArg::Imm32(0);
   RCX64Reg Rb = gpr.Bind(inst.RB, make_loop ? RCMode::ReadWrite : RCMode::Read);
-  RCX64Reg tmp = gpr.Scratch();
-  RCX64Reg effective_address = gpr.Scratch();
-  RegCache::Realize(Ra, Rb, tmp, effective_address);
+  RegCache::Realize(Ra, Rb);

   RCX64Reg loop_counter;
   if (make_loop)
@@ -259,10 +257,10 @@ void Jit64::dcbx(UGeckoInstruction inst)
     // bdnz afterwards! So if we invalidate a single cache line, we don't adjust the registers at
     // all, if we invalidate 2 cachelines we adjust the registers by one step, and so on.

-    RCX64Reg& reg_cycle_count = tmp;
-    RCX64Reg& reg_downcount = effective_address;
+    RCX64Reg reg_cycle_count = gpr.Scratch();
+    RCX64Reg reg_downcount = gpr.Scratch();
     loop_counter = gpr.Scratch();
-    RegCache::Realize(loop_counter);
+    RegCache::Realize(reg_cycle_count, reg_downcount, loop_counter);

     // This must be true in order for us to pick up the DIV results and not trash any data.
     static_assert(RSCRATCH == Gen::EAX && RSCRATCH2 == Gen::EDX);
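The static_assert is needed because the division emitted further down in this block uses the x86 DIV instruction, whose register roles are architecturally fixed. As a reminder of those semantics (standard x86, not something this diff defines), the emitted sequence has roughly this shape; this is a sketch of the constraint, not a verbatim copy of the surrounding code:

    // 32-bit unsigned divide: dividend in EDX:EAX, quotient -> EAX, remainder -> EDX.
    // RSCRATCH/RSCRATCH2 must therefore be EAX/EDX, or DIV would clobber live data.
    //   XOR(32, R(RSCRATCH2), R(RSCRATCH2));   // zero the high half of the dividend
    //   MOV(32, R(RSCRATCH), R(reg_downcount));
    //   DIV(32, R(reg_cycle_count));           // RSCRATCH = downcount / cycle_count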
@@ -304,8 +302,8 @@ void Jit64::dcbx(UGeckoInstruction inst)
     LEA(32, loop_counter, MDisp(RSCRATCH2, 1));
   }

-  X64Reg value = RSCRATCH;
-  MOV_sum(32, value, Ra, Rb);
+  X64Reg addr = RSCRATCH;
+  MOV_sum(32, addr, Ra, Rb);

   if (make_loop)
   {
@@ -315,33 +313,36 @@ void Jit64::dcbx(UGeckoInstruction inst)
     ADD(32, R(Rb), R(RSCRATCH2));  // Rb += (RSCRATCH2 * 32)
   }

-  X64Reg addr = RSCRATCH2;
+  X64Reg tmp = RSCRATCH2;
+  RCX64Reg effective_address = gpr.Scratch();
+  RegCache::Realize(effective_address);

   FixupBranch bat_lookup_failed;
-  MOV(32, R(effective_address), R(value));
+  MOV(32, R(effective_address), R(addr));
   const u8* loop_start = GetCodePtr();
   if (MSR.IR)
   {
     // Translate effective address to physical address.
-    bat_lookup_failed = BATAddressLookup(value, tmp, PowerPC::ibat_table.data());
-    MOV(32, R(addr), R(effective_address));
-    AND(32, R(addr), Imm32(0x0001ffff));
-    AND(32, R(value), Imm32(0xfffe0000));
-    OR(32, R(value), R(addr));
+    bat_lookup_failed = BATAddressLookup(addr, tmp, PowerPC::ibat_table.data());
+    MOV(32, R(tmp), R(effective_address));
+    AND(32, R(tmp), Imm32(0x0001ffff));
+    AND(32, R(addr), Imm32(0xfffe0000));
+    OR(32, R(addr), R(tmp));
   }
-  MOV(32, R(addr), R(value));

   // Check whether a JIT cache line needs to be invalidated.
-  SHR(32, R(value), Imm8(5 + 5));  // >> 5 for cache line size, >> 5 for width of bitset
+  SHR(32, R(addr), Imm8(5 + 5));  // >> 5 for cache line size, >> 5 for width of bitset
   MOV(64, R(tmp), ImmPtr(GetBlockCache()->GetBlockBitSet()));
-  MOV(32, R(value), MComplex(tmp, value, SCALE_4, 0));
-  SHR(32, R(addr), Imm8(5));
-  BT(32, R(value), R(addr));
+  MOV(32, R(addr), MComplex(tmp, addr, SCALE_4, 0));
+  MOV(32, R(tmp), R(effective_address));
+  SHR(32, R(tmp), Imm8(5));
+  BT(32, R(addr), R(tmp));
   FixupBranch invalidate_needed = J_CC(CC_C, true);

   if (make_loop)
   {
     ADD(32, R(effective_address), Imm8(32));
-    MOV(32, R(value), R(effective_address));
+    MOV(32, R(addr), R(effective_address));
     SUB(32, R(loop_counter), Imm8(1));
     J_CC(CC_NZ, loop_start);
   }
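The shift-and-BT sequence above implements a bitset lookup. As plain C++ the check amounts to the following (function and parameter names invented for illustration): each bit in the JIT block bitset covers one 32-byte cache line, and each 32-bit word of the bitset covers 32 such lines, hence the two shifts by 5.

    bool jit_cache_line_needs_invalidation(const u32* block_bitset, u32 addr)
    {
      const u32 word = block_bitset[addr >> (5 + 5)];  // SHR by 5+5, then the MComplex load
      const u32 bit = (addr >> 5) & 31;                // SHR by 5; BT masks the count to 5 bits
      return ((word >> bit) & 1) != 0;                 // BT sets CF; J_CC(CC_C) branches on it
    }

With address translation enabled, the word index comes from the physical address while the bit position is taken from the effective address; the two agree in the low 17 bits (note the 0x0001ffff mask above), which is all the bit position uses.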
@@ -553,21 +553,96 @@ void JitArm64::dcbx(UGeckoInstruction inst)
   INSTRUCTION_START
   JITDISABLE(bJITLoadStoreOff);

-  gpr.Lock(ARM64Reg::W0, ARM64Reg::W30);
+  u32 a = inst.RA, b = inst.RB;
+
+  // Check if the next instructions match a known looping pattern:
+  // - dcbx rX
+  // - addi rX,rX,32
+  // - bdnz+ -8
+  const bool make_loop = a == 0 && b != 0 && CanMergeNextInstructions(2) &&
+                         (js.op[1].inst.hex & 0xfc00'ffff) == 0x38000020 &&
+                         js.op[1].inst.RA_6 == b && js.op[1].inst.RD_2 == b &&
+                         js.op[2].inst.hex == 0x4200fff8;
+
+  gpr.Lock(ARM64Reg::W0);
+  if (make_loop)
+    gpr.Lock(ARM64Reg::W1);
+
+  ARM64Reg WA = gpr.GetReg();
+
+  if (make_loop)
+    gpr.BindToRegister(b, true);
+
+  ARM64Reg loop_counter = ARM64Reg::INVALID_REG;
+  if (make_loop)
+  {
+    // We'll execute somewhere between one single cacheline invalidation and however many are needed
+    // to reduce the downcount to zero, never exceeding the amount requested by the game.
+    // To stay consistent with the rest of the code we adjust the involved registers (CTR and Rb)
+    // by the amount of cache lines we invalidate minus one -- since we'll run the regular addi and
+    // bdnz afterwards! So if we invalidate a single cache line, we don't adjust the registers at
+    // all, if we invalidate 2 cachelines we adjust the registers by one step, and so on.
+
+    ARM64Reg reg_cycle_count = gpr.GetReg();
+    ARM64Reg reg_downcount = gpr.GetReg();
+    loop_counter = ARM64Reg::W1;
+    ARM64Reg WB = ARM64Reg::W0;
+
+    // Figure out how many loops we want to do.
+    const u8 cycle_count_per_loop =
+        js.op[0].opinfo->numCycles + js.op[1].opinfo->numCycles + js.op[2].opinfo->numCycles;
+
+    LDR(IndexType::Unsigned, reg_downcount, PPC_REG, PPCSTATE_OFF(downcount));
+    MOVI2R(WA, 0);
+    CMP(reg_downcount, 0);                                          // if (downcount <= 0)
+    FixupBranch downcount_is_zero_or_negative = B(CCFlags::CC_LE);  // only do 1 invalidation; else:
+    LDR(IndexType::Unsigned, loop_counter, PPC_REG, PPCSTATE_OFF_SPR(SPR_CTR));
+    MOVI2R(reg_cycle_count, cycle_count_per_loop);
+    SDIV(WB, reg_downcount, reg_cycle_count);  // WB = downcount / cycle_count
+    SUB(WA, loop_counter, 1);                  // WA = CTR - 1
+    // ^ Note that this CTR-1 implicitly handles the CTR == 0 case correctly.
+    CMP(WB, WA);
+    CSEL(WA, WB, WA, CCFlags::CC_LO);  // WA = min(WB, WA)
+
+    // WA now holds the amount of loops to execute minus 1, which is the amount we need to adjust
+    // downcount, CTR, and Rb by to exit the loop construct with the right values in those
+    // registers.
+
+    // CTR -= WA
+    SUB(loop_counter, loop_counter, WA);
+    STR(IndexType::Unsigned, loop_counter, PPC_REG, PPCSTATE_OFF_SPR(SPR_CTR));
+
+    // downcount -= (WA * reg_cycle_count)
+    MUL(WB, WA, reg_cycle_count);
+    // ^ Note that this cannot overflow because it's limited by (downcount/cycle_count).
+    SUB(reg_downcount, reg_downcount, WB);
+    STR(IndexType::Unsigned, reg_downcount, PPC_REG, PPCSTATE_OFF(downcount));
+
+    SetJumpTarget(downcount_is_zero_or_negative);
+
+    // Load the loop_counter register with the amount of invalidations to execute.
+    ADD(loop_counter, WA, 1);
+
+    gpr.Unlock(reg_cycle_count, reg_downcount);
+  }

   ARM64Reg effective_addr = ARM64Reg::W0;
-  ARM64Reg physical_addr = MSR.IR ? gpr.GetReg() : effective_addr;
-  ARM64Reg value = gpr.GetReg();
-  ARM64Reg WA = ARM64Reg::W30;
-
-  u32 a = inst.RA, b = inst.RB;
+  ARM64Reg physical_addr = gpr.GetReg();

   if (a)
     ADD(effective_addr, gpr.R(a), gpr.R(b));
   else
     MOV(effective_addr, gpr.R(b));

+  if (make_loop)
+  {
+    // This is the best place to adjust Rb to what it should be since WA still has the
+    // adjusted loop count and we're done reading from Rb.
+    ADD(gpr.R(b), gpr.R(b), WA, ArithOption(WA, ShiftType::LSL, 5));  // Rb += (WA * 32)
+  }
+
   // Translate effective address to physical address.
+  const u8* loop_start = GetCodePtr();
   FixupBranch bat_lookup_failed;
   if (MSR.IR)
   {
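The arithmetic in the make_loop block above is easier to follow as ordinary C++. A conceptual model of what the emitted code computes (invented variable names; the real code works on the registers shown above):

    u32 adjust = 0;                                  // MOVI2R(WA, 0)
    if (downcount > 0)                               // CMP + B(CC_LE) skips the block
    {
      const u32 by_downcount = downcount / cycle_count_per_loop;  // SDIV
      const u32 by_ctr = ctr - 1;  // unsigned wrap makes CTR == 0 behave like "no limit"
      adjust = std::min(by_downcount, by_ctr);       // CMP + CSEL(CC_LO)
      ctr -= adjust;                                 // stored back to SPR_CTR
      downcount -= adjust * cycle_count_per_loop;    // MUL + SUB, cannot overflow
    }
    const u32 invalidations = adjust + 1;            // loop_counter; at least one line

Rb itself is advanced by adjust * 32 a few lines further down, once the effective address has been computed from its old value.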
@@ -577,18 +652,27 @@ void JitArm64::dcbx(UGeckoInstruction inst)
   }

   // Check whether a JIT cache line needs to be invalidated.
-  LSR(value, physical_addr, 5 + 5);  // >> 5 for cache line size, >> 5 for width of bitset
+  LSR(physical_addr, physical_addr, 5 + 5);  // >> 5 for cache line size, >> 5 for width of bitset
   MOVP2R(EncodeRegTo64(WA), GetBlockCache()->GetBlockBitSet());
-  LDR(value, EncodeRegTo64(WA), ArithOption(EncodeRegTo64(value), true));
+  LDR(physical_addr, EncodeRegTo64(WA), ArithOption(EncodeRegTo64(physical_addr), true));

-  LSR(WA, physical_addr, 5);  // mask sizeof cacheline, & 0x1f is the position within the bitset
+  LSR(WA, effective_addr, 5);  // mask sizeof cacheline, & 0x1f is the position within the bitset

-  LSRV(value, value, WA);  // move current bit to bit 0
+  LSRV(physical_addr, physical_addr, WA);  // move current bit to bit 0

-  FixupBranch bit_not_set = TBZ(value, 0);
-  FixupBranch far_addr = B();
+  FixupBranch bit_not_set = TBZ(physical_addr, 0);
+  FixupBranch invalidate_needed = B();
+  SetJumpTarget(bit_not_set);
+
+  if (make_loop)
+  {
+    ADD(effective_addr, effective_addr, 32);
+    SUBS(loop_counter, loop_counter, 1);
+    B(CCFlags::CC_NEQ, loop_start);
+  }
+
   SwitchToFarCode();
-  SetJumpTarget(far_addr);
+  SetJumpTarget(invalidate_needed);
   if (MSR.IR)
     SetJumpTarget(bat_lookup_failed);
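The branch structure leans on the JIT's near/far code split: SwitchToFarCode() redirects emission into a separate cold-code region, so the rarely taken invalidation path never occupies space in the hot loop. Schematically, using only calls that appear in this diff:

    FixupBranch bit_not_set = TBZ(physical_addr, 0);  // hot path: skip if bit clear
    FixupBranch invalidate_needed = B();              // bit set: jump to the cold path
    SetJumpTarget(bit_not_set);
    // ... optional looping back to loop_start ...
    SwitchToFarCode();                                // subsequent code lands in the far region
    SetJumpTarget(invalidate_needed);                 // cold path entry
    // ... save registers, call into C++, restore registers ...
    FixupBranch near_addr = B();
    SwitchToNearCode();
    SetJumpTarget(near_addr);                         // hot path resumes here

Note that when the loop bails out to the far path on the first set bit, the SUBS has not yet run for that iteration, so W1 still holds the number of lines left to process, including the current one.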
@@ -596,27 +680,30 @@ void JitArm64::dcbx(UGeckoInstruction inst)
   BitSet32 gprs_to_push = gpr.GetCallerSavedUsed();
   BitSet32 fprs_to_push = fpr.GetCallerSavedUsed();
   gprs_to_push[DecodeReg(effective_addr)] = false;
   gprs_to_push[DecodeReg(physical_addr)] = false;
-  gprs_to_push[DecodeReg(value)] = false;
   gprs_to_push[DecodeReg(WA)] = false;
+  if (make_loop)
+    gprs_to_push[DecodeReg(loop_counter)] = false;

   ABI_PushRegisters(gprs_to_push);
-  m_float_emit.ABI_PushRegisters(fprs_to_push, ARM64Reg::X30);
+  m_float_emit.ABI_PushRegisters(fprs_to_push, WA);

-  // W0 (the function call argument) was already set earlier
-  MOVP2R(ARM64Reg::X8, &JitInterface::InvalidateICacheLine);
+  // The function call arguments are already in the correct registers
+  if (make_loop)
+    MOVP2R(ARM64Reg::X8, &JitInterface::InvalidateICacheLines);
+  else
+    MOVP2R(ARM64Reg::X8, &JitInterface::InvalidateICacheLine);
   BLR(ARM64Reg::X8);

-  m_float_emit.ABI_PopRegisters(fprs_to_push, ARM64Reg::X30);
+  m_float_emit.ABI_PopRegisters(fprs_to_push, WA);
   ABI_PopRegisters(gprs_to_push);

   FixupBranch near_addr = B();
   SwitchToNearCode();
-  SetJumpTarget(bit_not_set);
   SetJumpTarget(near_addr);

-  gpr.Unlock(effective_addr, value, WA);
-  if (MSR.IR)
-    gpr.Unlock(physical_addr);
+  gpr.Unlock(effective_addr, physical_addr, WA);
+  if (make_loop)
+    gpr.Unlock(loop_counter);
 }

 void JitArm64::dcbt(UGeckoInstruction inst)
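JitInterface::InvalidateICacheLines is the batched counterpart of InvalidateICacheLine. Judging from the register setup (W0 carries the starting effective address, W1 the remaining line count), its contract is presumably equivalent to the sketch below; this is inferred from the diff, not Dolphin's actual implementation:

    // Hypothetical model of the batched entry point:
    void InvalidateICacheLines(u32 address, u32 count)
    {
      // dcbx operates on 32-byte cache lines
      for (u32 i = 0; i < count; ++i)
        InvalidateICacheLine(address + i * 32);
    }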