From cd84339dfd3044c663a1eceeda82aaaf4a292bf5 Mon Sep 17 00:00:00 2001 From: JosJuice Date: Fri, 9 Jul 2021 10:59:53 +0200 Subject: [PATCH] JitArm64: Use EmitBackpatchRoutine more for psq_l/psq_st In the case of the JitAsm routines, we can't actually use backpatching. Still, I would like to gather all the load and store instructions in one place to make future changes easier. --- .../PowerPC/JitArm64/JitArm64_BackPatch.cpp | 130 ++++++----- .../PowerPC/JitArm64/JitArm64_LoadStore.cpp | 2 +- .../JitArm64/JitArm64_LoadStorePaired.cpp | 26 +-- Source/Core/Core/PowerPC/JitArm64/JitAsm.cpp | 202 ++++++++++-------- .../Core/Core/PowerPC/JitArm64/Jit_Util.cpp | 80 +++++-- Source/Core/Core/PowerPC/JitArm64/Jit_Util.h | 14 +- 6 files changed, 269 insertions(+), 185 deletions(-) diff --git a/Source/Core/Core/PowerPC/JitArm64/JitArm64_BackPatch.cpp b/Source/Core/Core/PowerPC/JitArm64/JitArm64_BackPatch.cpp index cf5712eaab..fa5ed3e67d 100644 --- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_BackPatch.cpp +++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_BackPatch.cpp @@ -60,39 +60,22 @@ void JitArm64::EmitBackpatchRoutine(u32 flags, bool fastmem, bool do_farcode, AR { if ((flags & BackPatchInfo::FLAG_STORE) && (flags & BackPatchInfo::FLAG_FLOAT)) { - if ((flags & BackPatchInfo::FLAG_SIZE_32) && !(flags & BackPatchInfo::FLAG_PAIR)) - { - m_float_emit.REV32(8, ARM64Reg::D0, RS); - m_float_emit.STR(32, ARM64Reg::D0, MEM_REG, addr); - } - else if ((flags & BackPatchInfo::FLAG_SIZE_32) && (flags & BackPatchInfo::FLAG_PAIR)) - { - m_float_emit.REV32(8, ARM64Reg::D0, RS); - m_float_emit.STR(64, ARM64Reg::Q0, MEM_REG, addr); - } - else - { - m_float_emit.REV64(8, ARM64Reg::Q0, RS); - m_float_emit.STR(64, ARM64Reg::Q0, MEM_REG, addr); - } + ARM64Reg temp = ARM64Reg::D0; + temp = ByteswapBeforeStore(this, &m_float_emit, temp, EncodeRegToDouble(RS), flags, true); + + m_float_emit.STR(BackPatchInfo::GetFlagSize(flags), temp, MEM_REG, addr); } else if ((flags & BackPatchInfo::FLAG_LOAD) && (flags & BackPatchInfo::FLAG_FLOAT)) { - if (flags & BackPatchInfo::FLAG_SIZE_32) - { - m_float_emit.LDR(32, EncodeRegToDouble(RS), MEM_REG, addr); - m_float_emit.REV32(8, EncodeRegToDouble(RS), EncodeRegToDouble(RS)); - } - else - { - m_float_emit.LDR(64, EncodeRegToDouble(RS), MEM_REG, addr); - m_float_emit.REV64(8, EncodeRegToDouble(RS), EncodeRegToDouble(RS)); - } + m_float_emit.LDR(BackPatchInfo::GetFlagSize(flags), EncodeRegToDouble(RS), MEM_REG, addr); + + ByteswapAfterLoad(this, &m_float_emit, EncodeRegToDouble(RS), EncodeRegToDouble(RS), flags, + true, false); } else if (flags & BackPatchInfo::FLAG_STORE) { ARM64Reg temp = ARM64Reg::W0; - temp = ByteswapBeforeStore(this, temp, RS, flags, true); + temp = ByteswapBeforeStore(this, &m_float_emit, temp, RS, flags, true); if (flags & BackPatchInfo::FLAG_SIZE_32) STR(temp, MEM_REG, addr); @@ -118,7 +101,7 @@ void JitArm64::EmitBackpatchRoutine(u32 flags, bool fastmem, bool do_farcode, AR else if (flags & BackPatchInfo::FLAG_SIZE_8) LDRB(RS, MEM_REG, addr); - ByteswapAfterLoad(this, RS, RS, flags, true, false); + ByteswapAfterLoad(this, &m_float_emit, RS, RS, flags, true, false); } } const u8* fastmem_end = GetCodePtr(); @@ -158,52 +141,39 @@ void JitArm64::EmitBackpatchRoutine(u32 flags, bool fastmem, bool do_farcode, AR ABI_PushRegisters(gprs_to_push); m_float_emit.ABI_PushRegisters(fprs_to_push, ARM64Reg::X30); - if ((flags & BackPatchInfo::FLAG_STORE) && (flags & BackPatchInfo::FLAG_FLOAT)) + if (flags & BackPatchInfo::FLAG_STORE) { - if ((flags & BackPatchInfo::FLAG_SIZE_32) && !(flags & BackPatchInfo::FLAG_PAIR)) + const u32 access_size = BackPatchInfo::GetFlagSize(flags); + ARM64Reg src_reg = RS; + const ARM64Reg dst_reg = access_size == 64 ? ARM64Reg::X0 : ARM64Reg::W0; + + if (flags & BackPatchInfo::FLAG_FLOAT) { - m_float_emit.UMOV(32, ARM64Reg::W0, RS, 0); - MOVP2R(ARM64Reg::X8, &PowerPC::Write_U32); - BLR(ARM64Reg::X8); + if (access_size == 64) + m_float_emit.FMOV(dst_reg, EncodeRegToDouble(RS)); + else + m_float_emit.FMOV(dst_reg, EncodeRegToSingle(RS)); + + src_reg = dst_reg; } - else if ((flags & BackPatchInfo::FLAG_SIZE_32) && (flags & BackPatchInfo::FLAG_PAIR)) + + if (flags & BackPatchInfo::FLAG_PAIR) { - m_float_emit.UMOV(64, ARM64Reg::X0, RS, 0); - MOVP2R(ARM64Reg::X8, &PowerPC::Write_U64); - ROR(ARM64Reg::X0, ARM64Reg::X0, 32); - BLR(ARM64Reg::X8); + // Compensate for the Write_ functions swapping the whole write instead of each pair + SwapPairs(this, dst_reg, src_reg, flags); + src_reg = dst_reg; } - else - { - m_float_emit.UMOV(64, ARM64Reg::X0, RS, 0); - MOVP2R(ARM64Reg::X8, &PowerPC::Write_U64); - BLR(ARM64Reg::X8); - } - } - else if ((flags & BackPatchInfo::FLAG_LOAD) && (flags & BackPatchInfo::FLAG_FLOAT)) - { - if (flags & BackPatchInfo::FLAG_SIZE_32) - { - MOVP2R(ARM64Reg::X8, &PowerPC::Read_U32); - BLR(ARM64Reg::X8); - m_float_emit.INS(32, RS, 0, ARM64Reg::X0); - } - else - { - MOVP2R(ARM64Reg::X8, &PowerPC::Read_F64); - BLR(ARM64Reg::X8); - m_float_emit.INS(64, RS, 0, ARM64Reg::X0); - } - } - else if (flags & BackPatchInfo::FLAG_STORE) - { - MOV(ARM64Reg::W0, RS); + + if (dst_reg != src_reg) + MOV(dst_reg, src_reg); const bool reverse = (flags & BackPatchInfo::FLAG_REVERSE) != 0; - if (flags & BackPatchInfo::FLAG_SIZE_32) + if (access_size == 64) + MOVP2R(ARM64Reg::X8, reverse ? &PowerPC::Write_U64_Swap : &PowerPC::Write_U64); + else if (access_size == 32) MOVP2R(ARM64Reg::X8, reverse ? &PowerPC::Write_U32_Swap : &PowerPC::Write_U32); - else if (flags & BackPatchInfo::FLAG_SIZE_16) + else if (access_size == 16) MOVP2R(ARM64Reg::X8, reverse ? &PowerPC::Write_U16_Swap : &PowerPC::Write_U16); else MOVP2R(ARM64Reg::X8, &PowerPC::Write_U8); @@ -217,16 +187,40 @@ void JitArm64::EmitBackpatchRoutine(u32 flags, bool fastmem, bool do_farcode, AR } else { - if (flags & BackPatchInfo::FLAG_SIZE_32) + const u32 access_size = BackPatchInfo::GetFlagSize(flags); + + if (access_size == 64) + MOVP2R(ARM64Reg::X8, &PowerPC::Read_U64); + else if (access_size == 32) MOVP2R(ARM64Reg::X8, &PowerPC::Read_U32); - else if (flags & BackPatchInfo::FLAG_SIZE_16) + else if (access_size == 16) MOVP2R(ARM64Reg::X8, &PowerPC::Read_U16); - else if (flags & BackPatchInfo::FLAG_SIZE_8) + else MOVP2R(ARM64Reg::X8, &PowerPC::Read_U8); BLR(ARM64Reg::X8); - ByteswapAfterLoad(this, RS, ARM64Reg::W0, flags, false, false); + ARM64Reg src_reg = access_size == 64 ? ARM64Reg::X0 : ARM64Reg::W0; + + if (flags & BackPatchInfo::FLAG_PAIR) + { + // Compensate for the Read_ functions swapping the whole read instead of each pair + const ARM64Reg dst_reg = flags & BackPatchInfo::FLAG_FLOAT ? src_reg : RS; + SwapPairs(this, dst_reg, src_reg, flags); + src_reg = dst_reg; + } + + if (flags & BackPatchInfo::FLAG_FLOAT) + { + if (access_size == 64) + m_float_emit.FMOV(EncodeRegToDouble(RS), src_reg); + else + m_float_emit.FMOV(EncodeRegToSingle(RS), src_reg); + + src_reg = RS; + } + + ByteswapAfterLoad(this, &m_float_emit, RS, src_reg, flags, false, false); } m_float_emit.ABI_PopRegisters(fprs_to_push, ARM64Reg::X30); diff --git a/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStore.cpp b/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStore.cpp index fc4603811f..0cedefac45 100644 --- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStore.cpp +++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStore.cpp @@ -240,7 +240,7 @@ void JitArm64::SafeStoreFromReg(s32 dest, u32 value, s32 regOffset, u32 flags, s LDR(IndexType::Unsigned, ARM64Reg::X0, PPC_REG, PPCSTATE_OFF(gather_pipe_ptr)); ARM64Reg temp = ARM64Reg::W1; - temp = ByteswapBeforeStore(this, temp, RS, flags, true); + temp = ByteswapBeforeStore(this, &m_float_emit, temp, RS, flags, true); if (accessSize == 32) STR(IndexType::Post, temp, ARM64Reg::X0, 4); diff --git a/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStorePaired.cpp b/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStorePaired.cpp index da6af010d9..305eb8a9f2 100644 --- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStorePaired.cpp +++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStorePaired.cpp @@ -42,7 +42,7 @@ void JitArm64::psq_lXX(UGeckoInstruction inst) constexpr ARM64Reg addr_reg = ARM64Reg::W0; constexpr ARM64Reg scale_reg = ARM64Reg::W1; constexpr ARM64Reg type_reg = ARM64Reg::W2; - ARM64Reg VS; + ARM64Reg VS = fpr.RW(inst.RS, RegType::Single); if (inst.RA || update) // Always uses the register on update { @@ -69,17 +69,20 @@ void JitArm64::psq_lXX(UGeckoInstruction inst) if (js.assumeNoPairedQuantize) { - VS = fpr.RW(inst.RS, RegType::Single); + BitSet32 gprs_in_use = gpr.GetCallerSavedUsed(); + BitSet32 fprs_in_use = fpr.GetCallerSavedUsed(); + + // Wipe the registers we are using as temporaries + gprs_in_use &= BitSet32(~7); + fprs_in_use &= BitSet32(~3); + fprs_in_use[DecodeReg(VS)] = 0; + + u32 flags = BackPatchInfo::FLAG_LOAD | BackPatchInfo::FLAG_FLOAT | BackPatchInfo::FLAG_SIZE_32; if (!w) - { - ADD(EncodeRegTo64(addr_reg), EncodeRegTo64(addr_reg), MEM_REG); - m_float_emit.LD1(32, 1, EncodeRegToDouble(VS), EncodeRegTo64(addr_reg)); - } - else - { - m_float_emit.LDR(32, VS, EncodeRegTo64(addr_reg), MEM_REG); - } - m_float_emit.REV32(8, EncodeRegToDouble(VS), EncodeRegToDouble(VS)); + flags |= BackPatchInfo::FLAG_PAIR; + + EmitBackpatchRoutine(flags, jo.fastmem, jo.fastmem, VS, EncodeRegTo64(addr_reg), gprs_in_use, + fprs_in_use); } else { @@ -91,7 +94,6 @@ void JitArm64::psq_lXX(UGeckoInstruction inst) LDR(EncodeRegTo64(type_reg), ARM64Reg::X30, ArithOption(EncodeRegTo64(type_reg), true)); BLR(EncodeRegTo64(type_reg)); - VS = fpr.RW(inst.RS, RegType::Single); m_float_emit.ORR(EncodeRegToDouble(VS), ARM64Reg::D0, ARM64Reg::D0); } diff --git a/Source/Core/Core/PowerPC/JitArm64/JitAsm.cpp b/Source/Core/Core/PowerPC/JitArm64/JitAsm.cpp index 40129ce202..f083ba3dee 100644 --- a/Source/Core/Core/PowerPC/JitArm64/JitAsm.cpp +++ b/Source/Core/Core/PowerPC/JitArm64/JitAsm.cpp @@ -488,11 +488,14 @@ void JitArm64::GenerateQuantizedLoads() { // X0 is the address // X1 is the scale + // X2 is a temporary // X30 is LR // Q0 is the return // Q1 is a temporary ARM64Reg addr_reg = ARM64Reg::X0; ARM64Reg scale_reg = ARM64Reg::X1; + BitSet32 gprs_to_push = CALLER_SAVED_GPRS & ~BitSet32{0, 2}; + BitSet32 fprs_to_push = BitSet32(0xFFFFFFFF) & ~BitSet32{0, 1}; ARM64FloatEmitter float_emit(this); const u8* start = GetCodePtr(); @@ -500,15 +503,20 @@ void JitArm64::GenerateQuantizedLoads() BRK(100); const u8* loadPairedFloatTwo = GetCodePtr(); { - ADD(addr_reg, addr_reg, MEM_REG); - float_emit.LD1(32, 1, ARM64Reg::D0, addr_reg); - float_emit.REV32(8, ARM64Reg::D0, ARM64Reg::D0); + constexpr u32 flags = BackPatchInfo::FLAG_LOAD | BackPatchInfo::FLAG_FLOAT | + BackPatchInfo::FLAG_PAIR | BackPatchInfo::FLAG_SIZE_32; + + EmitBackpatchRoutine(flags, true, false, ARM64Reg::D0, addr_reg, gprs_to_push & ~BitSet32{1}, + fprs_to_push); RET(ARM64Reg::X30); } const u8* loadPairedU8Two = GetCodePtr(); { - ADD(addr_reg, addr_reg, MEM_REG); - float_emit.LDR(16, IndexType::Unsigned, ARM64Reg::D0, addr_reg, 0); + constexpr u32 flags = BackPatchInfo::FLAG_LOAD | BackPatchInfo::FLAG_FLOAT | + BackPatchInfo::FLAG_PAIR | BackPatchInfo::FLAG_SIZE_8; + + EmitBackpatchRoutine(flags, true, false, ARM64Reg::D0, addr_reg, gprs_to_push, fprs_to_push); + float_emit.UXTL(8, ARM64Reg::D0, ARM64Reg::D0); float_emit.UXTL(16, ARM64Reg::D0, ARM64Reg::D0); float_emit.UCVTF(32, ARM64Reg::D0, ARM64Reg::D0); @@ -521,8 +529,11 @@ void JitArm64::GenerateQuantizedLoads() } const u8* loadPairedS8Two = GetCodePtr(); { - ADD(addr_reg, addr_reg, MEM_REG); - float_emit.LDR(16, IndexType::Unsigned, ARM64Reg::D0, addr_reg, 0); + constexpr u32 flags = BackPatchInfo::FLAG_LOAD | BackPatchInfo::FLAG_FLOAT | + BackPatchInfo::FLAG_PAIR | BackPatchInfo::FLAG_SIZE_8; + + EmitBackpatchRoutine(flags, true, false, ARM64Reg::D0, addr_reg, gprs_to_push, fprs_to_push); + float_emit.SXTL(8, ARM64Reg::D0, ARM64Reg::D0); float_emit.SXTL(16, ARM64Reg::D0, ARM64Reg::D0); float_emit.SCVTF(32, ARM64Reg::D0, ARM64Reg::D0); @@ -535,9 +546,11 @@ void JitArm64::GenerateQuantizedLoads() } const u8* loadPairedU16Two = GetCodePtr(); { - ADD(addr_reg, addr_reg, MEM_REG); - float_emit.LD1(16, 1, ARM64Reg::D0, addr_reg); - float_emit.REV16(8, ARM64Reg::D0, ARM64Reg::D0); + constexpr u32 flags = BackPatchInfo::FLAG_LOAD | BackPatchInfo::FLAG_FLOAT | + BackPatchInfo::FLAG_PAIR | BackPatchInfo::FLAG_SIZE_16; + + EmitBackpatchRoutine(flags, true, false, ARM64Reg::D0, addr_reg, gprs_to_push, fprs_to_push); + float_emit.UXTL(16, ARM64Reg::D0, ARM64Reg::D0); float_emit.UCVTF(32, ARM64Reg::D0, ARM64Reg::D0); @@ -549,9 +562,11 @@ void JitArm64::GenerateQuantizedLoads() } const u8* loadPairedS16Two = GetCodePtr(); { - ADD(addr_reg, addr_reg, MEM_REG); - float_emit.LD1(16, 1, ARM64Reg::D0, addr_reg); - float_emit.REV16(8, ARM64Reg::D0, ARM64Reg::D0); + constexpr u32 flags = BackPatchInfo::FLAG_LOAD | BackPatchInfo::FLAG_FLOAT | + BackPatchInfo::FLAG_PAIR | BackPatchInfo::FLAG_SIZE_16; + + EmitBackpatchRoutine(flags, true, false, ARM64Reg::D0, addr_reg, gprs_to_push, fprs_to_push); + float_emit.SXTL(16, ARM64Reg::D0, ARM64Reg::D0); float_emit.SCVTF(32, ARM64Reg::D0, ARM64Reg::D0); @@ -564,15 +579,20 @@ void JitArm64::GenerateQuantizedLoads() const u8* loadPairedFloatOne = GetCodePtr(); { - ADD(addr_reg, addr_reg, MEM_REG); - float_emit.LDR(32, IndexType::Unsigned, ARM64Reg::D0, addr_reg, 0); - float_emit.REV32(8, ARM64Reg::D0, ARM64Reg::D0); + constexpr u32 flags = + BackPatchInfo::FLAG_LOAD | BackPatchInfo::FLAG_FLOAT | BackPatchInfo::FLAG_SIZE_32; + + EmitBackpatchRoutine(flags, true, false, ARM64Reg::D0, addr_reg, gprs_to_push & ~BitSet32{1}, + fprs_to_push); RET(ARM64Reg::X30); } const u8* loadPairedU8One = GetCodePtr(); { - ADD(addr_reg, addr_reg, MEM_REG); - float_emit.LDR(8, IndexType::Unsigned, ARM64Reg::D0, addr_reg, 0); + constexpr u32 flags = + BackPatchInfo::FLAG_LOAD | BackPatchInfo::FLAG_FLOAT | BackPatchInfo::FLAG_SIZE_8; + + EmitBackpatchRoutine(flags, true, false, ARM64Reg::D0, addr_reg, gprs_to_push, fprs_to_push); + float_emit.UXTL(8, ARM64Reg::D0, ARM64Reg::D0); float_emit.UXTL(16, ARM64Reg::D0, ARM64Reg::D0); float_emit.UCVTF(32, ARM64Reg::D0, ARM64Reg::D0); @@ -585,8 +605,11 @@ void JitArm64::GenerateQuantizedLoads() } const u8* loadPairedS8One = GetCodePtr(); { - ADD(addr_reg, addr_reg, MEM_REG); - float_emit.LDR(8, IndexType::Unsigned, ARM64Reg::D0, addr_reg, 0); + constexpr u32 flags = + BackPatchInfo::FLAG_LOAD | BackPatchInfo::FLAG_FLOAT | BackPatchInfo::FLAG_SIZE_8; + + EmitBackpatchRoutine(flags, true, false, ARM64Reg::D0, addr_reg, gprs_to_push, fprs_to_push); + float_emit.SXTL(8, ARM64Reg::D0, ARM64Reg::D0); float_emit.SXTL(16, ARM64Reg::D0, ARM64Reg::D0); float_emit.SCVTF(32, ARM64Reg::D0, ARM64Reg::D0); @@ -599,9 +622,11 @@ void JitArm64::GenerateQuantizedLoads() } const u8* loadPairedU16One = GetCodePtr(); { - ADD(addr_reg, addr_reg, MEM_REG); - float_emit.LDR(16, IndexType::Unsigned, ARM64Reg::D0, addr_reg, 0); - float_emit.REV16(8, ARM64Reg::D0, ARM64Reg::D0); + constexpr u32 flags = + BackPatchInfo::FLAG_LOAD | BackPatchInfo::FLAG_FLOAT | BackPatchInfo::FLAG_SIZE_16; + + EmitBackpatchRoutine(flags, true, false, ARM64Reg::D0, addr_reg, gprs_to_push, fprs_to_push); + float_emit.UXTL(16, ARM64Reg::D0, ARM64Reg::D0); float_emit.UCVTF(32, ARM64Reg::D0, ARM64Reg::D0); @@ -613,9 +638,11 @@ void JitArm64::GenerateQuantizedLoads() } const u8* loadPairedS16One = GetCodePtr(); { - ADD(addr_reg, addr_reg, MEM_REG); - float_emit.LDR(16, IndexType::Unsigned, ARM64Reg::D0, addr_reg, 0); - float_emit.REV16(8, ARM64Reg::D0, ARM64Reg::D0); + constexpr u32 flags = + BackPatchInfo::FLAG_LOAD | BackPatchInfo::FLAG_FLOAT | BackPatchInfo::FLAG_SIZE_16; + + EmitBackpatchRoutine(flags, true, false, ARM64Reg::D0, addr_reg, gprs_to_push, fprs_to_push); + float_emit.SXTL(16, ARM64Reg::D0, ARM64Reg::D0); float_emit.SCVTF(32, ARM64Reg::D0, ARM64Reg::D0); @@ -663,6 +690,8 @@ void JitArm64::GenerateQuantizedStores() // Q1 is a temporary ARM64Reg scale_reg = ARM64Reg::X0; ARM64Reg addr_reg = ARM64Reg::X1; + BitSet32 gprs_to_push = CALLER_SAVED_GPRS & ~BitSet32{0, 1, 2}; + BitSet32 fprs_to_push = BitSet32(0xFFFFFFFF) & ~BitSet32{0, 1}; ARM64FloatEmitter float_emit(this); const u8* start = GetCodePtr(); @@ -671,17 +700,16 @@ void JitArm64::GenerateQuantizedStores() const u8* storePairedFloat; const u8* storePairedFloatSlow; { + constexpr u32 flags = BackPatchInfo::FLAG_STORE | BackPatchInfo::FLAG_FLOAT | + BackPatchInfo::FLAG_PAIR | BackPatchInfo::FLAG_SIZE_32; + storePairedFloat = GetCodePtr(); - float_emit.REV32(8, ARM64Reg::D0, ARM64Reg::D0); - ADD(addr_reg, addr_reg, MEM_REG); - float_emit.ST1(64, ARM64Reg::Q0, 0, addr_reg, ARM64Reg::SP); + EmitBackpatchRoutine(flags, true, false, ARM64Reg::D0, addr_reg, gprs_to_push, fprs_to_push); RET(ARM64Reg::X30); storePairedFloatSlow = GetCodePtr(); - float_emit.UMOV(64, ARM64Reg::X0, ARM64Reg::Q0, 0); - ROR(ARM64Reg::X0, ARM64Reg::X0, 32); - MOVP2R(ARM64Reg::X2, &PowerPC::Write_U64); - BR(ARM64Reg::X2); + EmitBackpatchRoutine(flags, false, false, ARM64Reg::D0, addr_reg, gprs_to_push, fprs_to_push); + RET(ARM64Reg::X30); } const u8* storePairedU8; @@ -698,18 +726,18 @@ void JitArm64::GenerateQuantizedStores() float_emit.UQXTN(8, ARM64Reg::D0, ARM64Reg::D0); }; + constexpr u32 flags = BackPatchInfo::FLAG_STORE | BackPatchInfo::FLAG_FLOAT | + BackPatchInfo::FLAG_PAIR | BackPatchInfo::FLAG_SIZE_8; + storePairedU8 = GetCodePtr(); emit_quantize(); - ADD(addr_reg, addr_reg, MEM_REG); - float_emit.ST1(16, ARM64Reg::Q0, 0, addr_reg, ARM64Reg::SP); + EmitBackpatchRoutine(flags, true, false, ARM64Reg::D0, addr_reg, gprs_to_push, fprs_to_push); RET(ARM64Reg::X30); storePairedU8Slow = GetCodePtr(); emit_quantize(); - float_emit.UMOV(16, ARM64Reg::W0, ARM64Reg::Q0, 0); - REV16(ARM64Reg::W0, ARM64Reg::W0); - MOVP2R(ARM64Reg::X2, &PowerPC::Write_U16); - BR(ARM64Reg::X2); + EmitBackpatchRoutine(flags, false, false, ARM64Reg::D0, addr_reg, gprs_to_push, fprs_to_push); + RET(ARM64Reg::X30); } const u8* storePairedS8; const u8* storePairedS8Slow; @@ -725,18 +753,18 @@ void JitArm64::GenerateQuantizedStores() float_emit.SQXTN(8, ARM64Reg::D0, ARM64Reg::D0); }; + constexpr u32 flags = BackPatchInfo::FLAG_STORE | BackPatchInfo::FLAG_FLOAT | + BackPatchInfo::FLAG_PAIR | BackPatchInfo::FLAG_SIZE_8; + storePairedS8 = GetCodePtr(); emit_quantize(); - ADD(addr_reg, addr_reg, MEM_REG); - float_emit.ST1(16, ARM64Reg::Q0, 0, addr_reg, ARM64Reg::SP); + EmitBackpatchRoutine(flags, true, false, ARM64Reg::D0, addr_reg, gprs_to_push, fprs_to_push); RET(ARM64Reg::X30); storePairedS8Slow = GetCodePtr(); emit_quantize(); - float_emit.UMOV(16, ARM64Reg::W0, ARM64Reg::Q0, 0); - REV16(ARM64Reg::W0, ARM64Reg::W0); - MOVP2R(ARM64Reg::X2, &PowerPC::Write_U16); - BR(ARM64Reg::X2); + EmitBackpatchRoutine(flags, false, false, ARM64Reg::D0, addr_reg, gprs_to_push, fprs_to_push); + RET(ARM64Reg::X30); } const u8* storePairedU16; @@ -750,21 +778,20 @@ void JitArm64::GenerateQuantizedStores() float_emit.FCVTZU(32, ARM64Reg::D0, ARM64Reg::D0); float_emit.UQXTN(16, ARM64Reg::D0, ARM64Reg::D0); - float_emit.REV16(8, ARM64Reg::D0, ARM64Reg::D0); }; + constexpr u32 flags = BackPatchInfo::FLAG_STORE | BackPatchInfo::FLAG_FLOAT | + BackPatchInfo::FLAG_PAIR | BackPatchInfo::FLAG_SIZE_16; + storePairedU16 = GetCodePtr(); emit_quantize(); - ADD(addr_reg, addr_reg, MEM_REG); - float_emit.ST1(32, ARM64Reg::Q0, 0, addr_reg, ARM64Reg::SP); + EmitBackpatchRoutine(flags, true, false, ARM64Reg::D0, addr_reg, gprs_to_push, fprs_to_push); RET(ARM64Reg::X30); storePairedU16Slow = GetCodePtr(); emit_quantize(); - float_emit.REV32(8, ARM64Reg::D0, ARM64Reg::D0); - float_emit.UMOV(32, ARM64Reg::W0, ARM64Reg::Q0, 0); - MOVP2R(ARM64Reg::X2, &PowerPC::Write_U32); - BR(ARM64Reg::X2); + EmitBackpatchRoutine(flags, false, false, ARM64Reg::D0, addr_reg, gprs_to_push, fprs_to_push); + RET(ARM64Reg::X30); } const u8* storePairedS16; // Used by Viewtiful Joe's intro movie const u8* storePairedS16Slow; @@ -777,36 +804,35 @@ void JitArm64::GenerateQuantizedStores() float_emit.FCVTZS(32, ARM64Reg::D0, ARM64Reg::D0); float_emit.SQXTN(16, ARM64Reg::D0, ARM64Reg::D0); - float_emit.REV16(8, ARM64Reg::D0, ARM64Reg::D0); }; + constexpr u32 flags = BackPatchInfo::FLAG_STORE | BackPatchInfo::FLAG_FLOAT | + BackPatchInfo::FLAG_PAIR | BackPatchInfo::FLAG_SIZE_16; + storePairedS16 = GetCodePtr(); emit_quantize(); - ADD(addr_reg, addr_reg, MEM_REG); - float_emit.ST1(32, ARM64Reg::Q0, 0, addr_reg, ARM64Reg::SP); + EmitBackpatchRoutine(flags, true, false, ARM64Reg::D0, addr_reg, gprs_to_push, fprs_to_push); RET(ARM64Reg::X30); storePairedS16Slow = GetCodePtr(); emit_quantize(); - float_emit.REV32(8, ARM64Reg::D0, ARM64Reg::D0); - float_emit.UMOV(32, ARM64Reg::W0, ARM64Reg::Q0, 0); - MOVP2R(ARM64Reg::X2, &PowerPC::Write_U32); - BR(ARM64Reg::X2); + EmitBackpatchRoutine(flags, false, false, ARM64Reg::D0, addr_reg, gprs_to_push, fprs_to_push); + RET(ARM64Reg::X30); } const u8* storeSingleFloat; const u8* storeSingleFloatSlow; { + constexpr u32 flags = + BackPatchInfo::FLAG_STORE | BackPatchInfo::FLAG_FLOAT | BackPatchInfo::FLAG_SIZE_32; + storeSingleFloat = GetCodePtr(); - float_emit.REV32(8, ARM64Reg::D0, ARM64Reg::D0); - ADD(addr_reg, addr_reg, MEM_REG); - float_emit.STR(32, IndexType::Unsigned, ARM64Reg::D0, addr_reg, 0); + EmitBackpatchRoutine(flags, true, false, ARM64Reg::D0, addr_reg, gprs_to_push, fprs_to_push); RET(ARM64Reg::X30); storeSingleFloatSlow = GetCodePtr(); - float_emit.UMOV(32, ARM64Reg::W0, ARM64Reg::Q0, 0); - MOVP2R(ARM64Reg::X2, &PowerPC::Write_U32); - BR(ARM64Reg::X2); + EmitBackpatchRoutine(flags, false, false, ARM64Reg::D0, addr_reg, gprs_to_push, fprs_to_push); + RET(ARM64Reg::X30); } const u8* storeSingleU8; // Used by MKWii const u8* storeSingleU8Slow; @@ -822,17 +848,18 @@ void JitArm64::GenerateQuantizedStores() float_emit.UQXTN(8, ARM64Reg::D0, ARM64Reg::D0); }; + constexpr u32 flags = + BackPatchInfo::FLAG_STORE | BackPatchInfo::FLAG_FLOAT | BackPatchInfo::FLAG_SIZE_8; + storeSingleU8 = GetCodePtr(); emit_quantize(); - ADD(addr_reg, addr_reg, MEM_REG); - float_emit.ST1(8, ARM64Reg::Q0, 0, addr_reg); + EmitBackpatchRoutine(flags, true, false, ARM64Reg::D0, addr_reg, gprs_to_push, fprs_to_push); RET(ARM64Reg::X30); storeSingleU8Slow = GetCodePtr(); emit_quantize(); - float_emit.UMOV(8, ARM64Reg::W0, ARM64Reg::Q0, 0); - MOVP2R(ARM64Reg::X2, &PowerPC::Write_U8); - BR(ARM64Reg::X2); + EmitBackpatchRoutine(flags, false, false, ARM64Reg::D0, addr_reg, gprs_to_push, fprs_to_push); + RET(ARM64Reg::X30); } const u8* storeSingleS8; const u8* storeSingleS8Slow; @@ -848,17 +875,18 @@ void JitArm64::GenerateQuantizedStores() float_emit.SQXTN(8, ARM64Reg::D0, ARM64Reg::D0); }; + constexpr u32 flags = + BackPatchInfo::FLAG_STORE | BackPatchInfo::FLAG_FLOAT | BackPatchInfo::FLAG_SIZE_8; + storeSingleS8 = GetCodePtr(); emit_quantize(); - ADD(addr_reg, addr_reg, MEM_REG); - float_emit.ST1(8, ARM64Reg::Q0, 0, addr_reg); + EmitBackpatchRoutine(flags, true, false, ARM64Reg::D0, addr_reg, gprs_to_push, fprs_to_push); RET(ARM64Reg::X30); storeSingleS8Slow = GetCodePtr(); emit_quantize(); - float_emit.SMOV(8, ARM64Reg::W0, ARM64Reg::Q0, 0); - MOVP2R(ARM64Reg::X2, &PowerPC::Write_U8); - BR(ARM64Reg::X2); + EmitBackpatchRoutine(flags, false, false, ARM64Reg::D0, addr_reg, gprs_to_push, fprs_to_push); + RET(ARM64Reg::X30); } const u8* storeSingleU16; // Used by MKWii const u8* storeSingleU16Slow; @@ -873,18 +901,18 @@ void JitArm64::GenerateQuantizedStores() float_emit.UQXTN(16, ARM64Reg::D0, ARM64Reg::D0); }; + constexpr u32 flags = + BackPatchInfo::FLAG_STORE | BackPatchInfo::FLAG_FLOAT | BackPatchInfo::FLAG_SIZE_16; + storeSingleU16 = GetCodePtr(); emit_quantize(); - ADD(addr_reg, addr_reg, MEM_REG); - float_emit.REV16(8, ARM64Reg::D0, ARM64Reg::D0); - float_emit.ST1(16, ARM64Reg::Q0, 0, addr_reg); + EmitBackpatchRoutine(flags, true, false, ARM64Reg::D0, addr_reg, gprs_to_push, fprs_to_push); RET(ARM64Reg::X30); storeSingleU16Slow = GetCodePtr(); emit_quantize(); - float_emit.UMOV(16, ARM64Reg::W0, ARM64Reg::Q0, 0); - MOVP2R(ARM64Reg::X2, &PowerPC::Write_U16); - BR(ARM64Reg::X2); + EmitBackpatchRoutine(flags, false, false, ARM64Reg::D0, addr_reg, gprs_to_push, fprs_to_push); + RET(ARM64Reg::X30); } const u8* storeSingleS16; const u8* storeSingleS16Slow; @@ -899,18 +927,18 @@ void JitArm64::GenerateQuantizedStores() float_emit.SQXTN(16, ARM64Reg::D0, ARM64Reg::D0); }; + constexpr u32 flags = + BackPatchInfo::FLAG_STORE | BackPatchInfo::FLAG_FLOAT | BackPatchInfo::FLAG_SIZE_16; + storeSingleS16 = GetCodePtr(); emit_quantize(); - ADD(addr_reg, addr_reg, MEM_REG); - float_emit.REV16(8, ARM64Reg::D0, ARM64Reg::D0); - float_emit.ST1(16, ARM64Reg::Q0, 0, addr_reg); + EmitBackpatchRoutine(flags, true, false, ARM64Reg::D0, addr_reg, gprs_to_push, fprs_to_push); RET(ARM64Reg::X30); storeSingleS16Slow = GetCodePtr(); emit_quantize(); - float_emit.SMOV(16, ARM64Reg::W0, ARM64Reg::Q0, 0); - MOVP2R(ARM64Reg::X2, &PowerPC::Write_U16); - BR(ARM64Reg::X2); + EmitBackpatchRoutine(flags, false, false, ARM64Reg::D0, addr_reg, gprs_to_push, fprs_to_push); + RET(ARM64Reg::X30); } JitRegister::Register(start, GetCodePtr(), "JIT_QuantizedStore"); diff --git a/Source/Core/Core/PowerPC/JitArm64/Jit_Util.cpp b/Source/Core/Core/PowerPC/JitArm64/Jit_Util.cpp index 8c1237c061..f8aceb5615 100644 --- a/Source/Core/Core/PowerPC/JitArm64/Jit_Util.cpp +++ b/Source/Core/Core/PowerPC/JitArm64/Jit_Util.cpp @@ -191,19 +191,47 @@ private: bool m_sign_extend; }; -void ByteswapAfterLoad(ARM64XEmitter* emit, ARM64Reg dst_reg, ARM64Reg src_reg, u32 flags, - bool is_reversed, bool is_extended) +void SwapPairs(ARM64XEmitter* emit, ARM64Reg dst_reg, ARM64Reg src_reg, u32 flags) +{ + if (flags & BackPatchInfo::FLAG_SIZE_32) + emit->ROR(dst_reg, src_reg, 32); + else if (flags & BackPatchInfo::FLAG_SIZE_16) + emit->ROR(dst_reg, src_reg, 16); + else + emit->REV16(dst_reg, src_reg); +} + +void ByteswapAfterLoad(ARM64XEmitter* emit, Arm64Gen::ARM64FloatEmitter* float_emit, + ARM64Reg dst_reg, ARM64Reg src_reg, u32 flags, bool is_reversed, + bool is_extended) { if (is_reversed == !(flags & BackPatchInfo::FLAG_REVERSE)) { - if (flags & BackPatchInfo::FLAG_SIZE_32) + if (flags & BackPatchInfo::FLAG_SIZE_64) { - emit->REV32(dst_reg, src_reg); + if (flags & BackPatchInfo::FLAG_FLOAT) + float_emit->REV64(8, dst_reg, src_reg); + else + emit->REV64(dst_reg, src_reg); + + src_reg = dst_reg; + } + else if (flags & BackPatchInfo::FLAG_SIZE_32) + { + if (flags & BackPatchInfo::FLAG_FLOAT) + float_emit->REV32(8, dst_reg, src_reg); + else + emit->REV32(dst_reg, src_reg); + src_reg = dst_reg; } else if (flags & BackPatchInfo::FLAG_SIZE_16) { - emit->REV16(dst_reg, src_reg); + if (flags & BackPatchInfo::FLAG_FLOAT) + float_emit->REV16(8, dst_reg, src_reg); + else + emit->REV16(dst_reg, src_reg); + src_reg = dst_reg; } } @@ -215,25 +243,47 @@ void ByteswapAfterLoad(ARM64XEmitter* emit, ARM64Reg dst_reg, ARM64Reg src_reg, } if (dst_reg != src_reg) - emit->MOV(dst_reg, src_reg); + { + if (flags & BackPatchInfo::FLAG_FLOAT) + float_emit->ORR(dst_reg, src_reg, src_reg); + else + emit->MOV(dst_reg, src_reg); + } } -ARM64Reg ByteswapBeforeStore(ARM64XEmitter* emit, ARM64Reg tmp_reg, ARM64Reg src_reg, u32 flags, - bool want_reversed) +ARM64Reg ByteswapBeforeStore(ARM64XEmitter* emit, Arm64Gen::ARM64FloatEmitter* float_emit, + ARM64Reg tmp_reg, ARM64Reg src_reg, u32 flags, bool want_reversed) { ARM64Reg dst_reg = src_reg; if (want_reversed == !(flags & BackPatchInfo::FLAG_REVERSE)) { - if (flags & BackPatchInfo::FLAG_SIZE_32) + if (flags & BackPatchInfo::FLAG_SIZE_64) { dst_reg = tmp_reg; - emit->REV32(dst_reg, src_reg); + + if (flags & BackPatchInfo::FLAG_FLOAT) + float_emit->REV64(8, dst_reg, src_reg); + else + emit->REV64(dst_reg, src_reg); + } + else if (flags & BackPatchInfo::FLAG_SIZE_32) + { + dst_reg = tmp_reg; + + if (flags & BackPatchInfo::FLAG_FLOAT) + float_emit->REV32(8, dst_reg, src_reg); + else + emit->REV32(dst_reg, src_reg); } else if (flags & BackPatchInfo::FLAG_SIZE_16) { dst_reg = tmp_reg; - emit->REV16(dst_reg, src_reg); + + if (flags & BackPatchInfo::FLAG_FLOAT) + float_emit->REV16(8, dst_reg, src_reg); + else + emit->REV16(dst_reg, src_reg); } } @@ -243,6 +293,8 @@ ARM64Reg ByteswapBeforeStore(ARM64XEmitter* emit, ARM64Reg tmp_reg, ARM64Reg src void MMIOLoadToReg(MMIO::Mapping* mmio, Arm64Gen::ARM64XEmitter* emit, BitSet32 gprs_in_use, BitSet32 fprs_in_use, ARM64Reg dst_reg, u32 address, u32 flags) { + ASSERT(!(flags & BackPatchInfo::FLAG_FLOAT)); + if (flags & BackPatchInfo::FLAG_SIZE_8) { MMIOReadCodeGenerator gen(emit, gprs_in_use, fprs_in_use, dst_reg, address, @@ -262,13 +314,15 @@ void MMIOLoadToReg(MMIO::Mapping* mmio, Arm64Gen::ARM64XEmitter* emit, BitSet32 mmio->GetHandlerForRead(address).Visit(gen); } - ByteswapAfterLoad(emit, dst_reg, dst_reg, flags, false, true); + ByteswapAfterLoad(emit, nullptr, dst_reg, dst_reg, flags, false, true); } void MMIOWriteRegToAddr(MMIO::Mapping* mmio, Arm64Gen::ARM64XEmitter* emit, BitSet32 gprs_in_use, BitSet32 fprs_in_use, ARM64Reg src_reg, u32 address, u32 flags) { - src_reg = ByteswapBeforeStore(emit, ARM64Reg::W1, src_reg, flags, false); + ASSERT(!(flags & BackPatchInfo::FLAG_FLOAT)); + + src_reg = ByteswapBeforeStore(emit, nullptr, ARM64Reg::W1, src_reg, flags, false); if (flags & BackPatchInfo::FLAG_SIZE_8) { diff --git a/Source/Core/Core/PowerPC/JitArm64/Jit_Util.h b/Source/Core/Core/PowerPC/JitArm64/Jit_Util.h index 47d4b5ce95..9f1b8f8436 100644 --- a/Source/Core/Core/PowerPC/JitArm64/Jit_Util.h +++ b/Source/Core/Core/PowerPC/JitArm64/Jit_Util.h @@ -8,11 +8,17 @@ #include "Core/HW/MMIO.h" -void ByteswapAfterLoad(Arm64Gen::ARM64XEmitter* emit, Arm64Gen::ARM64Reg dst_reg, - Arm64Gen::ARM64Reg src_reg, u32 flags, bool is_reversed, bool is_extended); +void SwapPairs(Arm64Gen::ARM64XEmitter* emit, Arm64Gen::ARM64Reg dst_reg, + Arm64Gen::ARM64Reg src_reg, u32 flags); -Arm64Gen::ARM64Reg ByteswapBeforeStore(Arm64Gen::ARM64XEmitter* emit, Arm64Gen::ARM64Reg tmp_reg, - Arm64Gen::ARM64Reg src_reg, u32 flags, bool want_reversed); +void ByteswapAfterLoad(Arm64Gen::ARM64XEmitter* emit, Arm64Gen::ARM64FloatEmitter* float_emit, + Arm64Gen::ARM64Reg dst_reg, Arm64Gen::ARM64Reg src_reg, u32 flags, + bool is_reversed, bool is_extended); + +Arm64Gen::ARM64Reg ByteswapBeforeStore(Arm64Gen::ARM64XEmitter* emit, + Arm64Gen::ARM64FloatEmitter* float_emit, + Arm64Gen::ARM64Reg tmp_reg, Arm64Gen::ARM64Reg src_reg, + u32 flags, bool want_reversed); void MMIOLoadToReg(MMIO::Mapping* mmio, Arm64Gen::ARM64XEmitter* emit, BitSet32 gprs_in_use, BitSet32 fprs_in_use, Arm64Gen::ARM64Reg dst_reg, u32 address, u32 flags);