diff --git a/Source/Core/Core/PowerPC/JitArm64/Jit.cpp b/Source/Core/Core/PowerPC/JitArm64/Jit.cpp
index bbba42c90f..af9ec677c7 100644
--- a/Source/Core/Core/PowerPC/JitArm64/Jit.cpp
+++ b/Source/Core/Core/PowerPC/JitArm64/Jit.cpp
@@ -498,24 +498,41 @@ void JitArm64::WriteExceptionExit(ARM64Reg dest, bool only_external, bool always
   B(dispatcher);
 }
 
-void JitArm64::WriteConditionalExceptionExit(int exception)
+void JitArm64::WriteConditionalExceptionExit(int exception, u64 increment_sp_on_exit)
 {
   ARM64Reg WA = gpr.GetReg();
-  LDR(IndexType::Unsigned, WA, PPC_REG, PPCSTATE_OFF(Exceptions));
-  FixupBranch noException = TBZ(WA, IntLog2(exception));
+  WriteConditionalExceptionExit(exception, WA, Arm64Gen::ARM64Reg::INVALID_REG,
+                                increment_sp_on_exit);
+  gpr.Unlock(WA);
+}
 
-  FixupBranch handleException = B();
-  SwitchToFarCode();
-  SetJumpTarget(handleException);
+void JitArm64::WriteConditionalExceptionExit(int exception, ARM64Reg temp_gpr, ARM64Reg temp_fpr,
+                                             u64 increment_sp_on_exit)
+{
+  LDR(IndexType::Unsigned, temp_gpr, PPC_REG, PPCSTATE_OFF(Exceptions));
+  FixupBranch no_exception = TBZ(temp_gpr, IntLog2(exception));
 
-  gpr.Flush(FlushMode::MaintainState, WA);
-  fpr.Flush(FlushMode::MaintainState, ARM64Reg::INVALID_REG);
+  const bool switch_to_far_code = !IsInFarCode();
+
+  if (switch_to_far_code)
+  {
+    FixupBranch handle_exception = B();
+    SwitchToFarCode();
+    SetJumpTarget(handle_exception);
+  }
+
+  if (increment_sp_on_exit != 0)
+    ADDI2R(ARM64Reg::SP, ARM64Reg::SP, increment_sp_on_exit, temp_gpr);
+
+  gpr.Flush(FlushMode::MaintainState, temp_gpr);
+  fpr.Flush(FlushMode::MaintainState, temp_fpr);
 
   WriteExceptionExit(js.compilerPC, false, true);
 
-  SwitchToNearCode();
-  SetJumpTarget(noException);
-  gpr.Unlock(WA);
+  if (switch_to_far_code)
+    SwitchToNearCode();
+
+  SetJumpTarget(no_exception);
 }
 
 bool JitArm64::HandleFunctionHooking(u32 address)
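Everything the far-code plumbing above accomplishes hinges on one cheap guard: test a single bit of ppcState.Exceptions and fall through when it is clear. A standalone C++ sketch of that guard follows; IntLog2 mirrors the helper the emitted TBZ relies on, while the bit value and function names are illustrative assumptions, not Dolphin's definitions.

#include <cstdint>

// Mirrors the IntLog2 helper: position of the highest set bit.
constexpr int IntLog2(uint64_t value)
{
  int result = -1;
  while (value != 0)
  {
    value >>= 1;
    ++result;
  }
  return result;
}

constexpr uint32_t kExceptionBit = 0x4;  // illustrative stand-in for EXCEPTION_DSI

// TBZ(temp_gpr, IntLog2(exception)) skips the exception exit exactly when
// this predicate is false.
constexpr bool ShouldTakeExceptionExit(uint32_t exceptions)
{
  return ((exceptions >> IntLog2(kExceptionBit)) & 1) != 0;
}

static_assert(ShouldTakeExceptionExit(0x4), "bit set: take the exit");
static_assert(!ShouldTakeExceptionExit(0x2), "bit clear: fall through");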
diff --git a/Source/Core/Core/PowerPC/JitArm64/Jit.h b/Source/Core/Core/PowerPC/JitArm64/Jit.h
index 8d9b140dd0..3ef1955e37 100644
--- a/Source/Core/Core/PowerPC/JitArm64/Jit.h
+++ b/Source/Core/Core/PowerPC/JitArm64/Jit.h
@@ -221,7 +221,7 @@ protected:
                             BitSet32 fprs_to_push = BitSet32(0), bool emitting_routine = false);
   // Loadstore routines
   void SafeLoadToReg(u32 dest, s32 addr, s32 offsetReg, u32 flags, s32 offset, bool update);
-  void SafeStoreFromReg(s32 dest, u32 value, s32 regOffset, u32 flags, s32 offset);
+  void SafeStoreFromReg(s32 dest, u32 value, s32 regOffset, u32 flags, s32 offset, bool update);
   // If lookup succeeds, writes upper 15 bits of physical address to addr_out. If not,
   // jumps to the returned FixupBranch. Clobbers tmp and the 17 lower bits of addr_out.
   Arm64Gen::FixupBranch BATAddressLookup(Arm64Gen::ARM64Reg addr_out, Arm64Gen::ARM64Reg addr_in,
@@ -265,7 +265,10 @@ protected:
                           bool always_exception = false);
   void WriteExceptionExit(Arm64Gen::ARM64Reg dest, bool only_external = false,
                           bool always_exception = false);
-  void WriteConditionalExceptionExit(int exception);
+  void WriteConditionalExceptionExit(int exception, u64 increment_sp_on_exit = 0);
+  void WriteConditionalExceptionExit(int exception, Arm64Gen::ARM64Reg temp_gpr,
+                                     Arm64Gen::ARM64Reg temp_fpr = Arm64Gen::ARM64Reg::INVALID_REG,
+                                     u64 increment_sp_on_exit = 0);
   void FakeLKExit(u32 exit_address_after_return);
   void WriteBLRExit(Arm64Gen::ARM64Reg dest);
 
diff --git a/Source/Core/Core/PowerPC/JitArm64/JitArm64_BackPatch.cpp b/Source/Core/Core/PowerPC/JitArm64/JitArm64_BackPatch.cpp
index db1f18759a..8bcfac7cbf 100644
--- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_BackPatch.cpp
+++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_BackPatch.cpp
@@ -6,6 +6,7 @@
 #include
 #include
 
+#include "Common/Align.h"
 #include "Common/BitSet.h"
 #include "Common/CommonFuncs.h"
 #include "Common/CommonTypes.h"
@@ -15,6 +16,7 @@
 #include "Common/Swap.h"
 
 #include "Core/HW/Memmap.h"
+#include "Core/PowerPC/Gekko.h"
 #include "Core/PowerPC/JitArm64/Jit.h"
 #include "Core/PowerPC/JitArm64/Jit_Util.h"
 #include "Core/PowerPC/JitArmCommon/BackPatch.h"
@@ -56,6 +58,8 @@ void JitArm64::EmitBackpatchRoutine(u32 flags, bool fastmem, bool do_farcode, AR
                                     ARM64Reg addr, BitSet32 gprs_to_push, BitSet32 fprs_to_push,
                                     bool emitting_routine)
 {
+  const u32 access_size = BackPatchInfo::GetFlagSize(flags);
+
   bool in_far_code = false;
   const u8* fastmem_start = GetCodePtr();
   std::optional<FixupBranch> slowmem_fixup;
@@ -75,11 +79,11 @@ void JitArm64::EmitBackpatchRoutine(u32 flags, bool fastmem, bool do_farcode, AR
       ARM64Reg temp = ARM64Reg::D0;
       temp = ByteswapBeforeStore(this, &m_float_emit, temp, EncodeRegToDouble(RS), flags, true);
 
-      m_float_emit.STR(BackPatchInfo::GetFlagSize(flags), temp, MEM_REG, addr);
+      m_float_emit.STR(access_size, temp, MEM_REG, addr);
     }
     else if ((flags & BackPatchInfo::FLAG_LOAD) && (flags & BackPatchInfo::FLAG_FLOAT))
     {
-      m_float_emit.LDR(BackPatchInfo::GetFlagSize(flags), EncodeRegToDouble(RS), MEM_REG, addr);
+      m_float_emit.LDR(access_size, EncodeRegToDouble(RS), MEM_REG, addr);
 
       ByteswapAfterLoad(this, &m_float_emit, EncodeRegToDouble(RS), EncodeRegToDouble(RS), flags,
                         true, false);
@@ -120,6 +124,8 @@ void JitArm64::EmitBackpatchRoutine(u32 flags, bool fastmem, bool do_farcode, AR
 
   if (!fastmem || do_farcode)
   {
+    const bool memcheck = jo.memcheck && !emitting_routine;
+
     if (fastmem && do_farcode)
     {
       in_far_code = true;
@@ -136,12 +142,28 @@ void JitArm64::EmitBackpatchRoutine(u32 flags, bool fastmem, bool do_farcode, AR
     if (slowmem_fixup)
       SetJumpTarget(*slowmem_fixup);
 
-    ABI_PushRegisters(gprs_to_push);
+    const ARM64Reg temp_gpr = flags & BackPatchInfo::FLAG_LOAD ? ARM64Reg::W30 : ARM64Reg::W0;
+    const int temp_gpr_index = DecodeReg(temp_gpr);
+
+    BitSet32 gprs_to_push_early = {};
+    if (memcheck)
+      gprs_to_push_early[temp_gpr_index] = true;
+    if (flags & BackPatchInfo::FLAG_LOAD)
+      gprs_to_push_early[0] = true;
+
+    // If we're already pushing one register in the first PushRegisters call, we can push a
+    // second one for free. Let's do so, since it might save one instruction in the second
+    // PushRegisters call. (Do not do this for caller-saved registers which may be in the register
+    // cache, or else EmitMemcheck will not be able to flush the register cache correctly!)
+    if (gprs_to_push & gprs_to_push_early)
+      gprs_to_push_early[30] = true;
+
+    ABI_PushRegisters(gprs_to_push & gprs_to_push_early);
+    ABI_PushRegisters(gprs_to_push & ~gprs_to_push_early);
     m_float_emit.ABI_PushRegisters(fprs_to_push, ARM64Reg::X30);
 
     if (flags & BackPatchInfo::FLAG_STORE)
     {
-      const u32 access_size = BackPatchInfo::GetFlagSize(flags);
 
       ARM64Reg src_reg = RS;
       const ARM64Reg dst_reg = access_size == 64 ? ARM64Reg::X0 : ARM64Reg::W0;
@@ -185,8 +207,6 @@ void JitArm64::EmitBackpatchRoutine(u32 flags, bool fastmem, bool do_farcode, AR
     }
     else
     {
-      const u32 access_size = BackPatchInfo::GetFlagSize(flags);
-
       if (access_size == 64)
         MOVP2R(ARM64Reg::X8, &PowerPC::Read_U64);
       else if (access_size == 32)
@@ -197,7 +217,21 @@ void JitArm64::EmitBackpatchRoutine(u32 flags, bool fastmem, bool do_farcode, AR
         MOVP2R(ARM64Reg::X8, &PowerPC::Read_U32);
       else if (access_size == 16)
        MOVP2R(ARM64Reg::X8, &PowerPC::Read_U16);
      else
        MOVP2R(ARM64Reg::X8, &PowerPC::Read_U8);

      BLR(ARM64Reg::X8);
+    }
+
+    m_float_emit.ABI_PopRegisters(fprs_to_push, ARM64Reg::X30);
+    ABI_PopRegisters(gprs_to_push & ~gprs_to_push_early);
+
+    if (memcheck)
+    {
+      const ARM64Reg temp_fpr = fprs_to_push[0] ? ARM64Reg::INVALID_REG : ARM64Reg::Q0;
+      const u64 early_push_size = Common::AlignUp(gprs_to_push_early.Count(), 2) * 8;
+
+      WriteConditionalExceptionExit(EXCEPTION_DSI, temp_gpr, temp_fpr, early_push_size);
+    }
+
+    if (flags & BackPatchInfo::FLAG_LOAD)
+    {
       ARM64Reg src_reg = access_size == 64 ? ARM64Reg::X0 : ARM64Reg::W0;
 
       if (flags & BackPatchInfo::FLAG_PAIR)
@@ -221,8 +255,7 @@ void JitArm64::EmitBackpatchRoutine(u32 flags, bool fastmem, bool do_farcode, AR
         ByteswapAfterLoad(this, &m_float_emit, RS, src_reg, flags, false, false);
     }
 
-    m_float_emit.ABI_PopRegisters(fprs_to_push, ARM64Reg::X30);
-    ABI_PopRegisters(gprs_to_push);
+    ABI_PopRegisters(gprs_to_push & gprs_to_push_early);
   }
 
   if (in_far_code)
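Worth spelling out from the BackPatch changes: the early_push_size handed to WriteConditionalExceptionExit is the byte count by which the first ABI_PushRegisters call moved SP. A minimal sketch of that arithmetic, assuming registers are pushed as 8-byte slots rounded to 16-byte pairs so SP stays aligned; AlignUp here reimplements what Common::AlignUp is used for above, and the names are illustrative.

#include <cstdint>

// Same rounding Common::AlignUp performs for the use above.
constexpr uint64_t AlignUp(uint64_t value, uint64_t size)
{
  return (value + size - 1) / size * size;
}

// Bytes the early ABI_PushRegisters call consumed: 8 bytes per register,
// rounded up to a whole 16-byte pair.
constexpr uint64_t EarlyPushSize(uint32_t early_reg_count)
{
  return AlignUp(early_reg_count, 2) * 8;
}

static_assert(EarlyPushSize(1) == 16, "one register still takes a full pair slot");
static_assert(EarlyPushSize(2) == 16, "two registers share one pair slot");
static_assert(EarlyPushSize(3) == 32, "three registers round up to two slots");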
diff --git a/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStore.cpp b/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStore.cpp
index 6fb5cad056..d61f79af7e 100644
--- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStore.cpp
+++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStore.cpp
@@ -26,7 +26,7 @@ void JitArm64::SafeLoadToReg(u32 dest, s32 addr, s32 offsetReg, u32 flags, s32 o
   // We want to make sure to not get LR as a temp register
   gpr.Lock(ARM64Reg::W0, ARM64Reg::W30);
 
-  gpr.BindToRegister(dest, dest == (u32)addr || dest == (u32)offsetReg);
+  gpr.BindToRegister(dest, dest == (u32)addr || dest == (u32)offsetReg, false);
   ARM64Reg dest_reg = gpr.R(dest);
   ARM64Reg up_reg = ARM64Reg::INVALID_REG;
   ARM64Reg off_reg = ARM64Reg::INVALID_REG;
@@ -101,19 +101,26 @@
 
   ARM64Reg XA = EncodeRegTo64(addr_reg);
 
-  if (is_immediate)
-    MOVI2R(XA, imm_addr);
+  bool addr_reg_set = !is_immediate;
+  const auto set_addr_reg_if_needed = [&] {
+    if (!addr_reg_set)
+      MOVI2R(XA, imm_addr);
+  };
 
-  if (update)
+  const bool early_update = !jo.memcheck && dest != static_cast<u32>(addr);
+  if (update && early_update)
   {
     gpr.BindToRegister(addr, false);
+    set_addr_reg_if_needed();
     MOV(gpr.R(addr), addr_reg);
   }
 
   BitSet32 regs_in_use = gpr.GetCallerSavedUsed();
   BitSet32 fprs_in_use = fpr.GetCallerSavedUsed();
-  regs_in_use[DecodeReg(ARM64Reg::W0)] = 0;
-  regs_in_use[DecodeReg(dest_reg)] = 0;
+  if (!update || early_update)
+    regs_in_use[DecodeReg(ARM64Reg::W0)] = 0;
+  if (!jo.memcheck)
+    regs_in_use[DecodeReg(dest_reg)] = 0;
 
   u32 access_size = BackPatchInfo::GetFlagSize(flags);
   u32 mmio_address = 0;
@@ -122,6 +129,7 @@
 
   if (jo.fastmem_arena && is_immediate && PowerPC::IsOptimizableRAMAddress(imm_addr))
   {
+    set_addr_reg_if_needed();
     EmitBackpatchRoutine(flags, true, false, dest_reg, XA, BitSet32(0), BitSet32(0));
   }
   else if (mmio_address)
@@ -131,13 +139,25 @@
   }
   else
   {
+    set_addr_reg_if_needed();
     EmitBackpatchRoutine(flags, jo.fastmem, jo.fastmem, dest_reg, XA, regs_in_use, fprs_in_use);
   }
 
+  gpr.BindToRegister(dest, false, true);
+  ASSERT(dest_reg == gpr.R(dest));
+
+  if (update && !early_update)
+  {
+    gpr.BindToRegister(addr, false);
+    set_addr_reg_if_needed();
+    MOV(gpr.R(addr), addr_reg);
+  }
+
   gpr.Unlock(ARM64Reg::W0, ARM64Reg::W30);
 }
 
-void JitArm64::SafeStoreFromReg(s32 dest, u32 value, s32 regOffset, u32 flags, s32 offset)
+void JitArm64::SafeStoreFromReg(s32 dest, u32 value, s32 regOffset, u32 flags, s32 offset,
+                                bool update)
 {
   // We want to make sure to not get LR as a temp register
   gpr.Lock(ARM64Reg::W0, ARM64Reg::W1, ARM64Reg::W30);
@@ -152,11 +172,6 @@ void JitArm64::SafeStoreFromReg(s32 dest, u32 value, s32 regOffset, u32 flags, s
   if (dest != -1 && !gpr.IsImm(dest))
     reg_dest = gpr.R(dest);
 
-  BitSet32 regs_in_use = gpr.GetCallerSavedUsed();
-  BitSet32 fprs_in_use = fpr.GetCallerSavedUsed();
-  regs_in_use[DecodeReg(ARM64Reg::W0)] = 0;
-  regs_in_use[DecodeReg(ARM64Reg::W1)] = 0;
-
   ARM64Reg addr_reg = ARM64Reg::W1;
 
   u32 imm_addr = 0;
@@ -222,6 +237,26 @@ void JitArm64::SafeStoreFromReg(s32 dest, u32 value, s32 regOffset, u32 flags, s
 
   ARM64Reg XA = EncodeRegTo64(addr_reg);
 
+  bool addr_reg_set = !is_immediate;
+  const auto set_addr_reg_if_needed = [&] {
+    if (!addr_reg_set)
+      MOVI2R(XA, imm_addr);
+  };
+
+  const bool early_update = !jo.memcheck && value != static_cast<u32>(dest);
+  if (update && early_update)
+  {
+    gpr.BindToRegister(dest, false);
+    set_addr_reg_if_needed();
+    MOV(gpr.R(dest), addr_reg);
+  }
+
+  BitSet32 regs_in_use = gpr.GetCallerSavedUsed();
+  BitSet32 fprs_in_use = fpr.GetCallerSavedUsed();
+  regs_in_use[DecodeReg(ARM64Reg::W0)] = 0;
+  if (!update || early_update)
+    regs_in_use[DecodeReg(ARM64Reg::W1)] = 0;
+
   u32 access_size = BackPatchInfo::GetFlagSize(flags);
   u32 mmio_address = 0;
   if (is_immediate)
@@ -255,7 +290,7 @@ void JitArm64::SafeStoreFromReg(s32 dest, u32 value, s32 regOffset, u32 flags, s
   }
   else if (jo.fastmem_arena && is_immediate && PowerPC::IsOptimizableRAMAddress(imm_addr))
   {
-    MOVI2R(XA, imm_addr);
+    set_addr_reg_if_needed();
     EmitBackpatchRoutine(flags, true, false, RS, XA, BitSet32(0), BitSet32(0));
   }
   else if (mmio_address)
@@ -265,12 +300,17 @@ void JitArm64::SafeStoreFromReg(s32 dest, u32 value, s32 regOffset, u32 flags, s
   }
   else
   {
-    if (is_immediate)
-      MOVI2R(XA, imm_addr);
-
+    set_addr_reg_if_needed();
     EmitBackpatchRoutine(flags, jo.fastmem, jo.fastmem, RS, XA, regs_in_use, fprs_in_use);
   }
 
+  if (update && !early_update)
+  {
+    gpr.BindToRegister(dest, false);
+    set_addr_reg_if_needed();
+    MOV(gpr.R(dest), addr_reg);
+  }
+
   gpr.Unlock(ARM64Reg::W0, ARM64Reg::W1, ARM64Reg::W30);
 }
 
@@ -306,7 +346,6 @@ void JitArm64::lXX(UGeckoInstruction inst)
 {
   INSTRUCTION_START
   JITDISABLE(bJITLoadStoreOff);
-  FALLBACK_IF(jo.memcheck);
 
   u32 a = inst.RA, b = inst.RB, d = inst.RD;
   s32 offset = inst.SIMM_16;
@@ -385,7 +424,6 @@ void JitArm64::stX(UGeckoInstruction inst)
 {
   INSTRUCTION_START
   JITDISABLE(bJITLoadStoreOff);
-  FALLBACK_IF(jo.memcheck);
 
   u32 a = inst.RA, b = inst.RB, s = inst.RS;
   s32 offset = inst.SIMM_16;
@@ -444,122 +482,104 @@ void JitArm64::stX(UGeckoInstruction inst)
     break;
   }
 
-  SafeStoreFromReg(update ? a : (a ? a : -1), s, regOffset, flags, offset);
-
-  if (update)
-  {
-    gpr.BindToRegister(a, false);
-
-    ARM64Reg WA = gpr.GetReg();
-    ARM64Reg RB = {};
-    ARM64Reg RA = gpr.R(a);
-    if (regOffset != -1)
-      RB = gpr.R(regOffset);
-    if (regOffset == -1)
-    {
-      ADDI2R(RA, RA, offset, WA);
-    }
-    else
-    {
-      ADD(RA, RA, RB);
-    }
-    gpr.Unlock(WA);
-  }
+  SafeStoreFromReg(update ? a : (a ? a : -1), s, regOffset, flags, offset, update);
 }
 
 void JitArm64::lmw(UGeckoInstruction inst)
 {
   INSTRUCTION_START
   JITDISABLE(bJITLoadStoreOff);
-  FALLBACK_IF(!jo.fastmem || jo.memcheck);
 
-  u32 a = inst.RA;
+  u32 a = inst.RA, d = inst.RD;
+  s32 offset = inst.SIMM_16;
 
-  ARM64Reg WA = gpr.GetReg();
-  ARM64Reg XA = EncodeRegTo64(WA);
+  gpr.Lock(ARM64Reg::W0, ARM64Reg::W30);
+
+  // MMU games make use of a >= d despite this being invalid according to the PEM.
+  // Because of this, make sure to not re-read rA after starting doing the loads.
+  ARM64Reg addr_reg = ARM64Reg::W0;
   if (a)
   {
-    ADDI2R(WA, gpr.R(a), inst.SIMM_16, WA);
-    ADD(XA, XA, MEM_REG);
+    if (gpr.IsImm(a))
+      MOVI2R(addr_reg, gpr.GetImm(a) + offset);
+    else
+      ADDI2R(addr_reg, gpr.R(a), offset, addr_reg);
   }
   else
   {
-    ADDI2R(XA, MEM_REG, (u32)(s32)(s16)inst.SIMM_16, XA);
+    MOVI2R(addr_reg, offset);
   }
 
-  for (int i = inst.RD; i < 32; i++)
+  // TODO: This doesn't handle rollback on DSI correctly
+  constexpr u32 flags = BackPatchInfo::FLAG_LOAD | BackPatchInfo::FLAG_SIZE_32;
+  for (u32 i = d; i < 32; i++)
   {
-    int remaining = 32 - i;
-    if (remaining >= 4)
-    {
-      gpr.BindToRegister(i + 3, false);
-      gpr.BindToRegister(i + 2, false);
-      gpr.BindToRegister(i + 1, false);
-      gpr.BindToRegister(i, false);
-      ARM64Reg RX4 = gpr.R(i + 3);
-      ARM64Reg RX3 = gpr.R(i + 2);
-      ARM64Reg RX2 = gpr.R(i + 1);
-      ARM64Reg RX1 = gpr.R(i);
-      LDP(IndexType::Post, EncodeRegTo64(RX1), EncodeRegTo64(RX3), XA, 16);
-      REV32(EncodeRegTo64(RX1), EncodeRegTo64(RX1));
-      REV32(EncodeRegTo64(RX3), EncodeRegTo64(RX3));
-      LSR(EncodeRegTo64(RX2), EncodeRegTo64(RX1), 32);
-      LSR(EncodeRegTo64(RX4), EncodeRegTo64(RX3), 32);
-      i += 3;
-    }
-    else if (remaining >= 2)
-    {
-      gpr.BindToRegister(i + 1, false);
-      gpr.BindToRegister(i, false);
-      ARM64Reg RX2 = gpr.R(i + 1);
-      ARM64Reg RX1 = gpr.R(i);
-      LDP(IndexType::Post, RX1, RX2, XA, 8);
-      REV32(RX1, RX1);
-      REV32(RX2, RX2);
-      ++i;
-    }
-    else
-    {
-      gpr.BindToRegister(i, false);
-      ARM64Reg RX = gpr.R(i);
-      LDR(IndexType::Post, RX, XA, 4);
-      REV32(RX, RX);
-    }
+    gpr.BindToRegister(i, false, false);
+    ARM64Reg dest_reg = gpr.R(i);
+
+    BitSet32 regs_in_use = gpr.GetCallerSavedUsed();
+    BitSet32 fprs_in_use = fpr.GetCallerSavedUsed();
+    if (i == 31)
+      regs_in_use[DecodeReg(addr_reg)] = 0;
+    if (!jo.memcheck)
+      regs_in_use[DecodeReg(dest_reg)] = 0;
+
+    EmitBackpatchRoutine(flags, jo.fastmem, jo.fastmem, dest_reg, EncodeRegTo64(addr_reg),
+                         regs_in_use, fprs_in_use);
+
+    gpr.BindToRegister(i, false, true);
+    ASSERT(dest_reg == gpr.R(i));
+
+    if (i != 31)
+      ADD(addr_reg, addr_reg, 4);
   }
 
-  gpr.Unlock(WA);
+  gpr.Unlock(ARM64Reg::W0, ARM64Reg::W30);
 }
 
 void JitArm64::stmw(UGeckoInstruction inst)
 {
   INSTRUCTION_START
   JITDISABLE(bJITLoadStoreOff);
-  FALLBACK_IF(!jo.fastmem || jo.memcheck);
 
-  u32 a = inst.RA;
+  u32 a = inst.RA, s = inst.RS;
+  s32 offset = inst.SIMM_16;
 
-  ARM64Reg WA = gpr.GetReg();
-  ARM64Reg XA = EncodeRegTo64(WA);
-  ARM64Reg WB = gpr.GetReg();
+  gpr.Lock(ARM64Reg::W0, ARM64Reg::W1, ARM64Reg::W30);
 
+  ARM64Reg addr_reg = ARM64Reg::W1;
   if (a)
   {
-    ADDI2R(WA, gpr.R(a), inst.SIMM_16, WA);
-    ADD(XA, XA, MEM_REG);
+    if (gpr.IsImm(a))
+      MOVI2R(addr_reg, gpr.GetImm(a) + offset);
+    else
+      ADDI2R(addr_reg, gpr.R(a), offset, addr_reg);
   }
   else
   {
-    ADDI2R(XA, MEM_REG, (u32)(s32)(s16)inst.SIMM_16, XA);
+    MOVI2R(addr_reg, offset);
   }
 
-  for (int i = inst.RD; i < 32; i++)
+  // TODO: This doesn't handle rollback on DSI correctly
+  constexpr u32 flags = BackPatchInfo::FLAG_STORE | BackPatchInfo::FLAG_SIZE_32;
+  for (u32 i = s; i < 32; i++)
   {
-    ARM64Reg RX = gpr.R(i);
-    REV32(WB, RX);
-    STR(IndexType::Unsigned, WB, XA, (i - inst.RD) * 4);
+    ARM64Reg src_reg = gpr.R(i);
+
+    BitSet32 regs_in_use = gpr.GetCallerSavedUsed();
+    BitSet32 fprs_in_use = fpr.GetCallerSavedUsed();
+    regs_in_use[DecodeReg(ARM64Reg::W0)] = 0;
+    if (i == 31)
+      regs_in_use[DecodeReg(addr_reg)] = 0;
+
+    EmitBackpatchRoutine(flags, jo.fastmem, jo.fastmem, src_reg, EncodeRegTo64(addr_reg),
+                         regs_in_use, fprs_in_use);
+
+    if (i != 31)
+      ADD(addr_reg, addr_reg, 4);
   }
 
-  gpr.Unlock(WA, WB);
+  gpr.Unlock(ARM64Reg::W0, ARM64Reg::W1, ARM64Reg::W30);
 }
 
 void JitArm64::dcbx(UGeckoInstruction inst)
@@ -743,7 +763,6 @@ void JitArm64::dcbz(UGeckoInstruction inst)
 {
   INSTRUCTION_START
   JITDISABLE(bJITLoadStoreOff);
-  FALLBACK_IF(jo.memcheck || !jo.fastmem_arena);
   FALLBACK_IF(SConfig::GetInstance().bLowDCBZHack);
 
   int a = inst.RA, b = inst.RB;
@@ -796,7 +815,7 @@
   BitSet32 fprs_to_push = fpr.GetCallerSavedUsed();
   gprs_to_push[DecodeReg(ARM64Reg::W0)] = 0;
 
-  EmitBackpatchRoutine(BackPatchInfo::FLAG_ZERO_256, true, true, ARM64Reg::W0,
+  EmitBackpatchRoutine(BackPatchInfo::FLAG_ZERO_256, jo.fastmem, jo.fastmem, ARM64Reg::W0,
                        EncodeRegTo64(addr_reg), gprs_to_push, fprs_to_push);
 
   gpr.Unlock(ARM64Reg::W0, ARM64Reg::W30);
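A pattern that recurs through the rewritten integer load/store emitters is the split between an early rA update and a late one. A reduced model of that decision, assuming only that a faulting access must leave rA and the load destination untouched; the struct and function names are invented for illustration.

#include <cstdint>

struct UpdatePlan
{
  bool before_access;  // "early update": write rA before emitting the access
  bool after_access;   // late update: write rA only once the access succeeded
};

// Mirrors: const bool early_update = !jo.memcheck && dest != static_cast<u32>(addr);
constexpr UpdatePlan PlanUpdate(bool update, bool memcheck, uint32_t dest, uint32_t addr)
{
  // With memchecks, the access may raise a DSI, so rA must keep its old value
  // until the access is known to succeed. Without them, updating early is fine
  // unless the load's destination is the address register itself.
  const bool early = !memcheck && dest != addr;
  return UpdatePlan{update && early, update && !early};
}

static_assert(PlanUpdate(true, false, 3, 4).before_access, "no memcheck: update early");
static_assert(PlanUpdate(true, true, 3, 4).after_access, "memcheck: update late");
static_assert(PlanUpdate(true, false, 5, 5).after_access, "dest == rA: update late");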
diff --git a/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStoreFloating.cpp b/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStoreFloating.cpp
index 79add2f227..3e03ef35ef 100644
--- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStoreFloating.cpp
+++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStoreFloating.cpp
@@ -21,7 +21,6 @@ void JitArm64::lfXX(UGeckoInstruction inst)
 {
   INSTRUCTION_START
   JITDISABLE(bJITLoadStoreFloatingOff);
-  FALLBACK_IF(jo.memcheck);
 
   u32 a = inst.RA, b = inst.RB;
 
@@ -80,7 +79,7 @@ void JitArm64::lfXX(UGeckoInstruction inst)
   gpr.Lock(ARM64Reg::W0, ARM64Reg::W30);
   fpr.Lock(ARM64Reg::Q0);
 
-  const ARM64Reg VD = fpr.RW(inst.FD, type);
+  const ARM64Reg VD = fpr.RW(inst.FD, type, false);
   ARM64Reg addr_reg = ARM64Reg::W0;
 
   if (update)
@@ -155,7 +154,8 @@ void JitArm64::lfXX(UGeckoInstruction inst)
   if (is_immediate)
     MOVI2R(XA, imm_addr);
 
-  if (update)
+  const bool early_update = !jo.memcheck;
+  if (update && early_update)
   {
     gpr.BindToRegister(a, false);
     MOV(gpr.R(a), addr_reg);
@@ -163,9 +163,11 @@ void JitArm64::lfXX(UGeckoInstruction inst)
   }
 
   BitSet32 regs_in_use = gpr.GetCallerSavedUsed();
   BitSet32 fprs_in_use = fpr.GetCallerSavedUsed();
-  regs_in_use[DecodeReg(ARM64Reg::W0)] = 0;
+  if (!update || early_update)
+    regs_in_use[DecodeReg(ARM64Reg::W0)] = 0;
   fprs_in_use[DecodeReg(ARM64Reg::Q0)] = 0;
-  fprs_in_use[DecodeReg(VD)] = 0;
+  if (!jo.memcheck)
+    fprs_in_use[DecodeReg(VD)] = 0;
 
   if (jo.fastmem_arena && is_immediate && PowerPC::IsOptimizableRAMAddress(imm_addr))
   {
@@ -176,6 +178,15 @@
     EmitBackpatchRoutine(flags, jo.fastmem, jo.fastmem, VD, XA, regs_in_use, fprs_in_use);
   }
 
+  const ARM64Reg VD_again = fpr.RW(inst.FD, type, true);
+  ASSERT(VD == VD_again);
+
+  if (update && !early_update)
+  {
+    gpr.BindToRegister(a, false);
+    MOV(gpr.R(a), addr_reg);
+  }
+
   gpr.Unlock(ARM64Reg::W0, ARM64Reg::W30);
   fpr.Unlock(ARM64Reg::Q0);
 }
 
@@ -184,7 +195,6 @@ void JitArm64::stfXX(UGeckoInstruction inst)
 {
   INSTRUCTION_START
   JITDISABLE(bJITLoadStoreFloatingOff);
-  FALLBACK_IF(jo.memcheck);
 
   u32 a = inst.RA, b = inst.RB;
 
@@ -334,26 +344,25 @@ void JitArm64::stfXX(UGeckoInstruction inst)
 
   ARM64Reg XA = EncodeRegTo64(addr_reg);
 
-  if (is_immediate && !(jo.optimizeGatherPipe && PowerPC::IsOptimizableGatherPipeWrite(imm_addr)))
-  {
-    MOVI2R(XA, imm_addr);
+  bool addr_reg_set = !is_immediate;
+  const auto set_addr_reg_if_needed = [&] {
+    if (!addr_reg_set)
+      MOVI2R(XA, imm_addr);
+  };
 
-    if (update)
-    {
-      gpr.BindToRegister(a, false);
-      MOV(gpr.R(a), addr_reg);
-    }
-  }
-  else if (!is_immediate && update)
+  const bool early_update = !jo.memcheck;
+  if (update && early_update)
   {
     gpr.BindToRegister(a, false);
+    set_addr_reg_if_needed();
     MOV(gpr.R(a), addr_reg);
   }
 
   BitSet32 regs_in_use = gpr.GetCallerSavedUsed();
   BitSet32 fprs_in_use = fpr.GetCallerSavedUsed();
   regs_in_use[DecodeReg(ARM64Reg::W0)] = 0;
-  regs_in_use[DecodeReg(ARM64Reg::W1)] = 0;
+  if (!update || early_update)
+    regs_in_use[DecodeReg(ARM64Reg::W1)] = 0;
   fprs_in_use[DecodeReg(ARM64Reg::Q0)] = 0;
 
   if (is_immediate)
@@ -378,28 +387,31 @@ void JitArm64::stfXX(UGeckoInstruction inst)
       STR(IndexType::Unsigned, ARM64Reg::X0, PPC_REG, PPCSTATE_OFF(gather_pipe_ptr));
 
       js.fifoBytesSinceCheck += accessSize >> 3;
-
-      if (update)
-      {
-        // Chance of this happening is fairly low, but support it
-        gpr.BindToRegister(a, false);
-        MOVI2R(gpr.R(a), imm_addr);
-      }
     }
     else if (jo.fastmem_arena && PowerPC::IsOptimizableRAMAddress(imm_addr))
     {
+      set_addr_reg_if_needed();
       EmitBackpatchRoutine(flags, true, false, V0, XA, BitSet32(0), BitSet32(0));
     }
     else
     {
+      set_addr_reg_if_needed();
      EmitBackpatchRoutine(flags, false, false, V0, XA, regs_in_use, fprs_in_use);
     }
   }
   else
   {
+    set_addr_reg_if_needed();
     EmitBackpatchRoutine(flags, jo.fastmem, jo.fastmem, V0, XA, regs_in_use, fprs_in_use);
   }
 
+  if (update && !early_update)
+  {
+    gpr.BindToRegister(a, false);
+    set_addr_reg_if_needed();
+    MOV(gpr.R(a), addr_reg);
+  }
+
   if (want_single && !have_single)
     fpr.Unlock(V0);
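Another recurring device above is the set_addr_reg_if_needed lambda: it defers materializing an immediate guest address into XA until some path actually needs it, so paths that consume the immediate directly (the gather-pipe store, the MMIO path) never emit the MOVI2R. A standalone sketch, with a printf standing in for the emitter call; the latch inside the lambda is this sketch's own simplification.

#include <cstdint>
#include <cstdio>

int main()
{
  const bool is_immediate = true;
  const uint32_t imm_addr = 0x80001234;  // hypothetical guest address

  bool addr_reg_set = !is_immediate;
  const auto set_addr_reg_if_needed = [&] {
    if (!addr_reg_set)
    {
      addr_reg_set = true;  // latch, so later callers emit nothing
      std::printf("MOVI2R(XA, 0x%08X)\n", imm_addr);  // stand-in for the emitter
    }
  };

  set_addr_reg_if_needed();  // e.g. the early-update path asks for the address
  set_addr_reg_if_needed();  // e.g. the backpatch path: no second MOVI2R
  return 0;
}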
diff --git a/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStorePaired.cpp b/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStorePaired.cpp
index 141db6f487..5a4daa58cc 100644
--- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStorePaired.cpp
+++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStorePaired.cpp
@@ -8,6 +8,7 @@
 #include "Core/Core.h"
 #include "Core/CoreTiming.h"
+#include "Core/PowerPC/Gekko.h"
 #include "Core/PowerPC/JitArm64/Jit.h"
 #include "Core/PowerPC/JitArm64/JitArm64_RegCache.h"
 #include "Core/PowerPC/PPCTables.h"
@@ -19,7 +20,6 @@ void JitArm64::psq_lXX(UGeckoInstruction inst)
 {
   INSTRUCTION_START
   JITDISABLE(bJITLoadStorePairedOff);
-  FALLBACK_IF(jo.memcheck);
 
   // If we have a fastmem arena, the asm routines assume address translation is on.
   FALLBACK_IF(!js.assumeNoPairedQuantize && jo.fastmem_arena && !MSR.DR);
@@ -47,7 +47,7 @@ void JitArm64::psq_lXX(UGeckoInstruction inst)
   constexpr ARM64Reg addr_reg = ARM64Reg::W0;
   constexpr ARM64Reg scale_reg = ARM64Reg::W1;
   constexpr ARM64Reg type_reg = ARM64Reg::W2;
-  ARM64Reg VS = fpr.RW(inst.RS, RegType::Single);
+  ARM64Reg VS = fpr.RW(inst.RS, RegType::Single, false);
 
   if (inst.RA || update)  // Always uses the register on update
   {
@@ -66,7 +66,8 @@ void JitArm64::psq_lXX(UGeckoInstruction inst)
     MOVI2R(addr_reg, (u32)offset);
   }
 
-  if (update)
+  const bool early_update = !jo.memcheck;
+  if (update && early_update)
   {
     gpr.BindToRegister(inst.RA, false);
     MOV(gpr.R(inst.RA), addr_reg);
@@ -78,9 +79,11 @@ void JitArm64::psq_lXX(UGeckoInstruction inst)
   BitSet32 fprs_in_use = fpr.GetCallerSavedUsed();
 
   // Wipe the registers we are using as temporaries
-  gprs_in_use[DecodeReg(ARM64Reg::W0)] = false;
+  if (!update || early_update)
+    gprs_in_use[DecodeReg(ARM64Reg::W0)] = false;
   fprs_in_use[DecodeReg(ARM64Reg::Q0)] = false;
-  fprs_in_use[DecodeReg(VS)] = 0;
+  if (!jo.memcheck)
+    fprs_in_use[DecodeReg(VS)] = 0;
 
   u32 flags = BackPatchInfo::FLAG_LOAD | BackPatchInfo::FLAG_FLOAT | BackPatchInfo::FLAG_SIZE_32;
   if (!w)
@@ -99,6 +102,8 @@ void JitArm64::psq_lXX(UGeckoInstruction inst)
     LDR(EncodeRegTo64(type_reg), ARM64Reg::X30, ArithOption(EncodeRegTo64(type_reg), true));
     BLR(EncodeRegTo64(type_reg));
 
+    WriteConditionalExceptionExit(EXCEPTION_DSI, ARM64Reg::X30, ARM64Reg::Q1);
+
     m_float_emit.ORR(EncodeRegToDouble(VS), ARM64Reg::D0, ARM64Reg::D0);
   }
 
@@ -108,6 +113,15 @@ void JitArm64::psq_lXX(UGeckoInstruction inst)
     m_float_emit.INS(32, VS, 1, ARM64Reg::Q0, 0);
   }
 
+  const ARM64Reg VS_again = fpr.RW(inst.RS, RegType::Single, true);
+  ASSERT(VS == VS_again);
+
+  if (update && !early_update)
+  {
+    gpr.BindToRegister(inst.RA, false);
+    MOV(gpr.R(inst.RA), addr_reg);
+  }
+
   gpr.Unlock(ARM64Reg::W0, ARM64Reg::W30);
   fpr.Unlock(ARM64Reg::Q0);
   if (!js.assumeNoPairedQuantize)
@@ -121,7 +135,6 @@ void JitArm64::psq_stXX(UGeckoInstruction inst)
 {
   INSTRUCTION_START
   JITDISABLE(bJITLoadStorePairedOff);
-  FALLBACK_IF(jo.memcheck);
 
   // If we have a fastmem arena, the asm routines assume address translation is on.
   FALLBACK_IF(!js.assumeNoPairedQuantize && jo.fastmem_arena && !MSR.DR);
@@ -198,7 +211,8 @@ void JitArm64::psq_stXX(UGeckoInstruction inst)
     MOVI2R(addr_reg, (u32)offset);
   }
 
-  if (update)
+  const bool early_update = !jo.memcheck;
+  if (update && early_update)
   {
     gpr.BindToRegister(inst.RA, false);
     MOV(gpr.R(inst.RA), addr_reg);
@@ -211,7 +225,8 @@ void JitArm64::psq_stXX(UGeckoInstruction inst)
 
   // Wipe the registers we are using as temporaries
   gprs_in_use[DecodeReg(ARM64Reg::W0)] = false;
-  gprs_in_use[DecodeReg(ARM64Reg::W1)] = false;
+  if (!update || early_update)
+    gprs_in_use[DecodeReg(ARM64Reg::W1)] = false;
 
   u32 flags = BackPatchInfo::FLAG_STORE | BackPatchInfo::FLAG_FLOAT | BackPatchInfo::FLAG_SIZE_32;
   if (!w)
@@ -229,6 +244,14 @@ void JitArm64::psq_stXX(UGeckoInstruction inst)
     MOVP2R(ARM64Reg::X30, w ? single_store_quantized : paired_store_quantized);
     LDR(EncodeRegTo64(type_reg), ARM64Reg::X30, ArithOption(EncodeRegTo64(type_reg), true));
     BLR(EncodeRegTo64(type_reg));
+
+    WriteConditionalExceptionExit(EXCEPTION_DSI, ARM64Reg::X30, ARM64Reg::Q1);
+  }
+
+  if (update && !early_update)
+  {
+    gpr.BindToRegister(inst.RA, false);
+    MOV(gpr.R(inst.RA), addr_reg);
   }
 
   if (js.assumeNoPairedQuantize && !have_single)
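Both psq_lXX above and the earlier integer and float loads follow the same two-step register-cache protocol once a DSI is possible: bind the destination without marking it dirty, emit the guarded access, then re-bind with the dirty bit once no fault can occur. A minimal model of why the first bind must not set the dirty bit; the types and the fake allocator are stand-ins.

#include <cassert>

struct CachedReg
{
  int host_reg = -1;   // -1: not bound to a host register
  bool dirty = false;  // true: host value must be written back to ppcState
};

int Bind(CachedReg& reg, bool set_dirty)
{
  if (reg.host_reg < 0)
    reg.host_reg = 7;  // pretend the allocator handed out a register
  if (set_dirty)
    reg.dirty = true;
  return reg.host_reg;
}

void EmitGuardedLoad(CachedReg& dest)
{
  // Step 1: bind without dirtying. If the access faults, the DSI exit flushes
  // the cache, and a dirty `dest` would overwrite the guest register with junk.
  const int host = Bind(dest, false);
  // ... emit the load into `host`, then the conditional DSI exit ...
  // Step 2: past the exit the value is real, so now mark the register dirty.
  const int host_again = Bind(dest, true);
  assert(host == host_again);  // same invariant as ASSERT(VD == VD_again)
}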
diff --git a/Source/Core/Core/PowerPC/JitArm64/JitArm64_RegCache.cpp b/Source/Core/Core/PowerPC/JitArm64/JitArm64_RegCache.cpp
index 45249d92ed..24aae47c28 100644
--- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_RegCache.cpp
+++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_RegCache.cpp
@@ -343,25 +343,29 @@ void Arm64GPRCache::SetImmediate(const GuestRegInfo& guest_reg, u32 imm)
   reg.LoadToImm(imm);
 }
 
-void Arm64GPRCache::BindToRegister(const GuestRegInfo& guest_reg, bool do_load)
+void Arm64GPRCache::BindToRegister(const GuestRegInfo& guest_reg, bool do_load, bool set_dirty)
 {
   OpArg& reg = guest_reg.reg;
   const size_t bitsize = guest_reg.bitsize;
 
   reg.ResetLastUsed();
-  reg.SetDirty(true);
 
   const RegType reg_type = reg.GetType();
   if (reg_type == RegType::NotLoaded || reg_type == RegType::Discarded)
   {
     const ARM64Reg host_reg = bitsize != 64 ? GetReg() : EncodeRegTo64(GetReg());
     reg.Load(host_reg);
+    reg.SetDirty(set_dirty);
     if (do_load)
     {
       ASSERT_MSG(DYNA_REC, reg_type != RegType::Discarded, "Attempted to load a discarded value");
       m_emit->LDR(IndexType::Unsigned, host_reg, PPC_REG, u32(guest_reg.ppc_offset));
     }
   }
+  else if (set_dirty)
+  {
+    reg.SetDirty(true);
+  }
 }
 
 void Arm64GPRCache::GetAllocationOrder()
@@ -570,26 +574,15 @@ ARM64Reg Arm64FPRCache::R(size_t preg, RegType type)
   return ARM64Reg::INVALID_REG;
 }
 
-ARM64Reg Arm64FPRCache::RW(size_t preg, RegType type)
+ARM64Reg Arm64FPRCache::RW(size_t preg, RegType type, bool set_dirty)
 {
   OpArg& reg = m_guest_registers[preg];
 
-  bool was_dirty = reg.IsDirty();
-
   IncrementAllUsed();
   reg.ResetLastUsed();
 
-  reg.SetDirty(true);
-
-  // If not loaded at all, just alloc a new one.
-  if (reg.GetType() == RegType::NotLoaded || reg.GetType() == RegType::Discarded)
-  {
-    reg.Load(GetReg(), type);
-    return reg.GetReg();
-  }
-
   // Only the lower value will be overwritten, so we must be extra careful to store PSR1 if dirty.
-  if ((type == RegType::LowerPair || type == RegType::LowerPairSingle) && was_dirty)
+  if (reg.IsDirty() && (type == RegType::LowerPair || type == RegType::LowerPairSingle))
   {
     // We must *not* change host_reg as this register might still be in use. So it's fine to
     // store this register, but it's *not* fine to convert it to double. So for double conversion,
@@ -612,6 +605,7 @@ ARM64Reg Arm64FPRCache::RW(size_t preg, RegType type)
         m_jit->ConvertSingleToDoubleLower(preg, flush_reg, flush_reg, scratch_reg);
         m_float_emit->STR(64, IndexType::Unsigned, flush_reg, PPC_REG, u32(PPCSTATE_OFF_PS1(preg)));
         Unlock(scratch_reg);
+        reg.Load(host_reg, RegType::LowerPairSingle);
         break;
       }
       else
@@ -619,6 +613,7 @@
        m_jit->ConvertSingleToDoublePair(preg, flush_reg, host_reg, flush_reg);
         m_float_emit->STR(128, IndexType::Unsigned, flush_reg, PPC_REG,
                           u32(PPCSTATE_OFF_PS0(preg)));
+        reg.SetDirty(false);
       }
       break;
     case RegType::Register:
@@ -627,6 +622,7 @@
       // It would take longer to do an insert to a temporary and a 64bit store than to just do this.
       m_float_emit->STR(128, IndexType::Unsigned, flush_reg, PPC_REG,
                         static_cast<u32>(PPCSTATE_OFF_PS0(preg)));
+      reg.SetDirty(false);
       break;
     case RegType::DuplicatedSingle:
       flush_reg = GetReg();
@@ -636,6 +632,8 @@
       // Store PSR1 (which is equal to PSR0) in memory.
       m_float_emit->STR(64, IndexType::Unsigned, flush_reg, PPC_REG,
                         static_cast<u32>(PPCSTATE_OFF_PS1(preg)));
+      reg.Load(host_reg, reg.GetType() == RegType::DuplicatedSingle ? RegType::LowerPairSingle :
+                                                                      RegType::LowerPair);
       break;
     default:
       // All other types doesn't store anything in PSR1.
@@ -646,7 +644,18 @@
     Unlock(flush_reg);
   }
 
-  reg.Load(reg.GetReg(), type);
+  if (reg.GetType() == RegType::NotLoaded || reg.GetType() == RegType::Discarded)
+  {
+    // If not loaded at all, just alloc a new one.
+    reg.Load(GetReg(), type);
+    reg.SetDirty(set_dirty);
+  }
+  else if (set_dirty)
+  {
+    reg.Load(reg.GetReg(), type);
+    reg.SetDirty(true);
+  }
+
   return reg.GetReg();
 }
 
diff --git a/Source/Core/Core/PowerPC/JitArm64/JitArm64_RegCache.h b/Source/Core/Core/PowerPC/JitArm64/JitArm64_RegCache.h
index 2ecbcbbffa..16678a04b1 100644
--- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_RegCache.h
+++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_RegCache.h
@@ -266,9 +266,15 @@ public:
   // Gets the immediate that a register is set to, only valid for guest GPRs
   u32 GetImm(size_t preg) const { return GetGuestGPROpArg(preg).GetImm(); }
   // Binds a guest GPR to a host register, optionally loading its value
-  void BindToRegister(size_t preg, bool do_load) { BindToRegister(GetGuestGPR(preg), do_load); }
+  void BindToRegister(size_t preg, bool do_load, bool set_dirty = true)
+  {
+    BindToRegister(GetGuestGPR(preg), do_load, set_dirty);
+  }
   // Binds a guest CR to a host register, optionally loading its value
-  void BindCRToRegister(size_t preg, bool do_load) { BindToRegister(GetGuestCR(preg), do_load); }
+  void BindCRToRegister(size_t preg, bool do_load, bool set_dirty = true)
+  {
+    BindToRegister(GetGuestCR(preg), do_load, set_dirty);
+  }
   BitSet32 GetCallerSavedUsed() const override;
 
   void StoreRegisters(BitSet32 regs, Arm64Gen::ARM64Reg tmp_reg = Arm64Gen::ARM64Reg::INVALID_REG)
@@ -307,7 +313,7 @@ private:
   Arm64Gen::ARM64Reg R(const GuestRegInfo& guest_reg);
   void SetImmediate(const GuestRegInfo& guest_reg, u32 imm);
-  void BindToRegister(const GuestRegInfo& guest_reg, bool do_load);
+  void BindToRegister(const GuestRegInfo& guest_reg, bool do_load, bool set_dirty = true);
 
   void FlushRegisters(BitSet32 regs, bool maintain_state, Arm64Gen::ARM64Reg tmp_reg);
   void FlushCRRegisters(BitSet32 regs, bool maintain_state, Arm64Gen::ARM64Reg tmp_reg);
@@ -326,7 +332,7 @@ public:
 
   // Will dump an immediate to the host register as well
   Arm64Gen::ARM64Reg R(size_t preg, RegType type);
-  Arm64Gen::ARM64Reg RW(size_t preg, RegType type);
+  Arm64Gen::ARM64Reg RW(size_t preg, RegType type, bool set_dirty = true);
 
   BitSet32 GetCallerSavedUsed() const override;
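One subtlety of the new set_dirty parameter, visible in the Arm64GPRCache::BindToRegister body earlier in the diff: binding with set_dirty = false must never clear a dirty bit that an earlier write already set, which is why the code only ever upgrades the flag. A compilable miniature of that contract, with invented types.

#include <cassert>

struct Reg
{
  bool loaded = false;
  bool dirty = false;
};

void Bind(Reg& reg, bool set_dirty)
{
  if (!reg.loaded)
  {
    reg.loaded = true;      // freshly allocated host register
    reg.dirty = set_dirty;  // dirty only if the caller asked for it
  }
  else if (set_dirty)
  {
    reg.dirty = true;  // upgrade, but never downgrade, dirtiness
  }
}

int main()
{
  Reg r;
  Bind(r, true);    // an earlier instruction wrote the register
  Bind(r, false);   // a memcheck-aware load binds without dirtying
  assert(r.dirty);  // ...and the earlier write must not be lost
  return 0;
}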
diff --git a/Source/Core/Core/PowerPC/JitArm64/JitAsm.cpp b/Source/Core/Core/PowerPC/JitArm64/JitAsm.cpp
index 5f5b8826fd..bf650baf15 100644
--- a/Source/Core/Core/PowerPC/JitArm64/JitAsm.cpp
+++ b/Source/Core/Core/PowerPC/JitArm64/JitAsm.cpp
@@ -495,7 +495,9 @@ void JitArm64::GenerateQuantizedLoads()
   // Q1 is a temporary
   ARM64Reg addr_reg = ARM64Reg::X0;
   ARM64Reg scale_reg = ARM64Reg::X1;
-  BitSet32 gprs_to_push = CALLER_SAVED_GPRS & ~BitSet32{0, 2, 3};
+  BitSet32 gprs_to_push = CALLER_SAVED_GPRS & ~BitSet32{2, 3};
+  if (!jo.memcheck)
+    gprs_to_push &= ~BitSet32{0};
   BitSet32 fprs_to_push = BitSet32(0xFFFFFFFF) & ~BitSet32{0, 1};
   ARM64FloatEmitter float_emit(this);
 
@@ -524,8 +526,8 @@ void JitArm64::GenerateQuantizedLoads()
     float_emit.UXTL(16, ARM64Reg::D0, ARM64Reg::D0);
     float_emit.UCVTF(32, ARM64Reg::D0, ARM64Reg::D0);
 
-    MOVP2R(addr_reg, &m_dequantizeTableS);
-    ADD(scale_reg, addr_reg, scale_reg, ArithOption(scale_reg, ShiftType::LSL, 3));
+    MOVP2R(ARM64Reg::X2, &m_dequantizeTableS);
+    ADD(scale_reg, ARM64Reg::X2, scale_reg, ArithOption(scale_reg, ShiftType::LSL, 3));
     float_emit.LDR(32, IndexType::Unsigned, ARM64Reg::D1, scale_reg, 0);
     float_emit.FMUL(32, ARM64Reg::D0, ARM64Reg::D0, ARM64Reg::D1, 0);
     RET(ARM64Reg::X30);
@@ -542,8 +544,8 @@ void JitArm64::GenerateQuantizedLoads()
     float_emit.SXTL(16, ARM64Reg::D0, ARM64Reg::D0);
     float_emit.SCVTF(32, ARM64Reg::D0, ARM64Reg::D0);
 
-    MOVP2R(addr_reg, &m_dequantizeTableS);
-    ADD(scale_reg, addr_reg, scale_reg, ArithOption(scale_reg, ShiftType::LSL, 3));
+    MOVP2R(ARM64Reg::X2, &m_dequantizeTableS);
+    ADD(scale_reg, ARM64Reg::X2, scale_reg, ArithOption(scale_reg, ShiftType::LSL, 3));
     float_emit.LDR(32, IndexType::Unsigned, ARM64Reg::D1, scale_reg, 0);
     float_emit.FMUL(32, ARM64Reg::D0, ARM64Reg::D0, ARM64Reg::D1, 0);
     RET(ARM64Reg::X30);
@@ -559,8 +561,8 @@ void JitArm64::GenerateQuantizedLoads()
     float_emit.UXTL(16, ARM64Reg::D0, ARM64Reg::D0);
     float_emit.UCVTF(32, ARM64Reg::D0, ARM64Reg::D0);
 
-    MOVP2R(addr_reg, &m_dequantizeTableS);
-    ADD(scale_reg, addr_reg, scale_reg, ArithOption(scale_reg, ShiftType::LSL, 3));
+    MOVP2R(ARM64Reg::X2, &m_dequantizeTableS);
+    ADD(scale_reg, ARM64Reg::X2, scale_reg, ArithOption(scale_reg, ShiftType::LSL, 3));
     float_emit.LDR(32, IndexType::Unsigned, ARM64Reg::D1, scale_reg, 0);
     float_emit.FMUL(32, ARM64Reg::D0, ARM64Reg::D0, ARM64Reg::D1, 0);
     RET(ARM64Reg::X30);
@@ -576,8 +578,8 @@ void JitArm64::GenerateQuantizedLoads()
     float_emit.SXTL(16, ARM64Reg::D0, ARM64Reg::D0);
     float_emit.SCVTF(32, ARM64Reg::D0, ARM64Reg::D0);
 
-    MOVP2R(addr_reg, &m_dequantizeTableS);
-    ADD(scale_reg, addr_reg, scale_reg, ArithOption(scale_reg, ShiftType::LSL, 3));
+    MOVP2R(ARM64Reg::X2, &m_dequantizeTableS);
+    ADD(scale_reg, ARM64Reg::X2, scale_reg, ArithOption(scale_reg, ShiftType::LSL, 3));
     float_emit.LDR(32, IndexType::Unsigned, ARM64Reg::D1, scale_reg, 0);
     float_emit.FMUL(32, ARM64Reg::D0, ARM64Reg::D0, ARM64Reg::D1, 0);
     RET(ARM64Reg::X30);
@@ -605,8 +607,8 @@ void JitArm64::GenerateQuantizedLoads()
     float_emit.UXTL(16, ARM64Reg::D0, ARM64Reg::D0);
     float_emit.UCVTF(32, ARM64Reg::D0, ARM64Reg::D0);
 
-    MOVP2R(addr_reg, &m_dequantizeTableS);
-    ADD(scale_reg, addr_reg, scale_reg, ArithOption(scale_reg, ShiftType::LSL, 3));
+    MOVP2R(ARM64Reg::X2, &m_dequantizeTableS);
+    ADD(scale_reg, ARM64Reg::X2, scale_reg, ArithOption(scale_reg, ShiftType::LSL, 3));
     float_emit.LDR(32, IndexType::Unsigned, ARM64Reg::D1, scale_reg, 0);
     float_emit.FMUL(32, ARM64Reg::D0, ARM64Reg::D0, ARM64Reg::D1, 0);
     RET(ARM64Reg::X30);
@@ -623,8 +625,8 @@ void JitArm64::GenerateQuantizedLoads()
     float_emit.SXTL(16, ARM64Reg::D0, ARM64Reg::D0);
     float_emit.SCVTF(32, ARM64Reg::D0, ARM64Reg::D0);
 
-    MOVP2R(addr_reg, &m_dequantizeTableS);
-    ADD(scale_reg, addr_reg, scale_reg, ArithOption(scale_reg, ShiftType::LSL, 3));
+    MOVP2R(ARM64Reg::X2, &m_dequantizeTableS);
+    ADD(scale_reg, ARM64Reg::X2, scale_reg, ArithOption(scale_reg, ShiftType::LSL, 3));
     float_emit.LDR(32, IndexType::Unsigned, ARM64Reg::D1, scale_reg, 0);
     float_emit.FMUL(32, ARM64Reg::D0, ARM64Reg::D0, ARM64Reg::D1, 0);
     RET(ARM64Reg::X30);
@@ -640,8 +642,8 @@ void JitArm64::GenerateQuantizedLoads()
     float_emit.UXTL(16, ARM64Reg::D0, ARM64Reg::D0);
     float_emit.UCVTF(32, ARM64Reg::D0, ARM64Reg::D0);
 
-    MOVP2R(addr_reg, &m_dequantizeTableS);
-    ADD(scale_reg, addr_reg, scale_reg, ArithOption(scale_reg, ShiftType::LSL, 3));
+    MOVP2R(ARM64Reg::X2, &m_dequantizeTableS);
+    ADD(scale_reg, ARM64Reg::X2, scale_reg, ArithOption(scale_reg, ShiftType::LSL, 3));
     float_emit.LDR(32, IndexType::Unsigned, ARM64Reg::D1, scale_reg, 0);
     float_emit.FMUL(32, ARM64Reg::D0, ARM64Reg::D0, ARM64Reg::D1, 0);
     RET(ARM64Reg::X30);
@@ -657,8 +659,8 @@ void JitArm64::GenerateQuantizedLoads()
     float_emit.SXTL(16, ARM64Reg::D0, ARM64Reg::D0);
     float_emit.SCVTF(32, ARM64Reg::D0, ARM64Reg::D0);
 
-    MOVP2R(addr_reg, &m_dequantizeTableS);
-    ADD(scale_reg, addr_reg, scale_reg, ArithOption(scale_reg, ShiftType::LSL, 3));
+    MOVP2R(ARM64Reg::X2, &m_dequantizeTableS);
+    ADD(scale_reg, ARM64Reg::X2, scale_reg, ArithOption(scale_reg, ShiftType::LSL, 3));
     float_emit.LDR(32, IndexType::Unsigned, ARM64Reg::D1, scale_reg, 0);
     float_emit.FMUL(32, ARM64Reg::D0, ARM64Reg::D0, ARM64Reg::D1, 0);
     RET(ARM64Reg::X30);
@@ -701,7 +703,9 @@ void JitArm64::GenerateQuantizedStores()
   // Q1 is a temporary
   ARM64Reg scale_reg = ARM64Reg::X0;
   ARM64Reg addr_reg = ARM64Reg::X1;
-  BitSet32 gprs_to_push = CALLER_SAVED_GPRS & ~BitSet32{0, 1, 2};
+  BitSet32 gprs_to_push = CALLER_SAVED_GPRS & ~BitSet32{0, 2};
+  if (!jo.memcheck)
+    gprs_to_push &= ~BitSet32{1};
   BitSet32 fprs_to_push = BitSet32(0xFFFFFFFF) & ~BitSet32{0, 1};
   ARM64FloatEmitter float_emit(this);
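The JitAsm changes close the loop: with memchecks enabled, the quantized load and store routines must preserve the address register across the call (X0 for loads, X1 for stores) so the caller can still perform its late rA update, which is also why the dequantize-table base moved from addr_reg to X2. A sketch of the save-set arithmetic follows; the caller-saved mask value is an assumption for illustration only, and the helper name is invented.

#include <cstdint>

constexpr uint32_t kCallerSaved = 0x4007FFFF;  // assumed mask: X0-X18 plus X30

constexpr uint32_t LoadRoutineSaveSet(bool memcheck)
{
  // Mirrors: gprs_to_push = CALLER_SAVED_GPRS & ~BitSet32{2, 3};
  //          if (!jo.memcheck) gprs_to_push &= ~BitSet32{0};
  uint32_t gprs = kCallerSaved & ~((1u << 2) | (1u << 3));  // X2/X3 stay scratch
  if (!memcheck)
    gprs &= ~(1u << 0);  // no memcheck: X0 may be clobbered freely
  return gprs;
}

static_assert((LoadRoutineSaveSet(true) & 1u) != 0, "memcheck: X0 survives the call");
static_assert((LoadRoutineSaveSet(false) & 1u) == 0, "otherwise X0 is not pushed");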