From bcde1aa8ff934278c004b59722d09ea855daf295 Mon Sep 17 00:00:00 2001 From: Ryan Houdek Date: Sun, 30 Aug 2015 17:03:54 -0500 Subject: [PATCH] [AArch64] Improve floating point single instructions. Instead of emitting an "INS" instruction after every single-precision instruction to duplicate the bottom 64 bits into the top 64 bits of the register, create a new FPR register cache type that tracks when a register's lower 64 bits are supposed to be duplicated into the high 64 bits, without the host-side register necessarily holding the duplicated value in its upper half. This removes inefficient INS instructions from sequences of single-precision float instructions. In particular, one very heavy block in Animal Crossing went from 712 instructions down to 520 instructions (~27% fewer instructions!) --- .../PowerPC/JitArm64/JitArm64_BackPatch.cpp | 18 +- .../JitArm64/JitArm64_FloatingPoint.cpp | 151 +++++++------ .../JitArm64/JitArm64_LoadStoreFloating.cpp | 14 +- .../JitArm64/JitArm64_LoadStorePaired.cpp | 10 +- .../Core/PowerPC/JitArm64/JitArm64_Paired.cpp | 200 +++++++++--------- .../PowerPC/JitArm64/JitArm64_RegCache.cpp | 114 +++++++++- .../Core/PowerPC/JitArm64/JitArm64_RegCache.h | 14 +- .../Core/PowerPC/JitArmCommon/BackPatch.h | 3 +- 8 files changed, 300 insertions(+), 224 deletions(-) diff --git a/Source/Core/Core/PowerPC/JitArm64/JitArm64_BackPatch.cpp b/Source/Core/Core/PowerPC/JitArm64/JitArm64_BackPatch.cpp index 9f5e830f18..020e72fa0b 100644 --- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_BackPatch.cpp +++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_BackPatch.cpp @@ -76,23 +76,13 @@ void JitArm64::EmitBackpatchRoutine(u32 flags, bool fastmem, bool do_farcode, if (flags & BackPatchInfo::FLAG_SIZE_F32) { m_float_emit.LDR(32, EncodeRegToDouble(RS), X28, addr); - m_float_emit.INS(32, RS, 1, RS, 0); m_float_emit.REV32(8, EncodeRegToDouble(RS), EncodeRegToDouble(RS)); m_float_emit.FCVTL(64, EncodeRegToDouble(RS), EncodeRegToDouble(RS)); } else { - if (flags & 
BackPatchInfo::FLAG_ONLY_LOWER) - { - m_float_emit.LDR(64, EncodeRegToDouble(RS), X28, addr); - m_float_emit.REV64(8, EncodeRegToDouble(RS), EncodeRegToDouble(RS)); - } - else - { - m_float_emit.LDR(64, Q0, X28, addr); - m_float_emit.REV64(8, D0, D0); - m_float_emit.INS(64, RS, 0, Q0, 0); - } + m_float_emit.LDR(64, EncodeRegToDouble(RS), X28, addr); + m_float_emit.REV64(8, EncodeRegToDouble(RS), EncodeRegToDouble(RS)); } } else if (flags & BackPatchInfo::FLAG_STORE) @@ -142,7 +132,7 @@ void JitArm64::EmitBackpatchRoutine(u32 flags, bool fastmem, bool do_farcode, handler.addr_reg = addr; handler.gprs = gprs_to_push; handler.fprs = fprs_to_push; - handler.flags = flags & ~BackPatchInfo::FLAG_ONLY_LOWER; + handler.flags = flags; FastmemArea* fastmem_area = &m_fault_to_handler[fastmem_start]; auto handler_loc_iter = m_handler_to_loc.find(handler); @@ -199,7 +189,7 @@ void JitArm64::EmitBackpatchRoutine(u32 flags, bool fastmem, bool do_farcode, { MOVI2R(X30, (u64)&PowerPC::Read_U32); BLR(X30); - m_float_emit.DUP(32, RS, X0); + m_float_emit.INS(32, RS, 0, X0); m_float_emit.FCVTL(64, RS, RS); } else diff --git a/Source/Core/Core/PowerPC/JitArm64/JitArm64_FloatingPoint.cpp b/Source/Core/Core/PowerPC/JitArm64/JitArm64_FloatingPoint.cpp index 4ee267f55c..099f97b710 100644 --- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_FloatingPoint.cpp +++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_FloatingPoint.cpp @@ -24,7 +24,7 @@ void JitArm64::fabsx(UGeckoInstruction inst) u32 b = inst.FB, d = inst.FD; fpr.BindToRegister(d, d == b); - ARM64Reg VB = fpr.R(b); + ARM64Reg VB = fpr.R(b, REG_IS_LOADED); ARM64Reg VD = fpr.R(d); m_float_emit.FABS(EncodeRegToDouble(VD), EncodeRegToDouble(VB)); @@ -37,14 +37,13 @@ void JitArm64::faddsx(UGeckoInstruction inst) FALLBACK_IF(inst.Rc); u32 a = inst.FA, b = inst.FB, d = inst.FD; - fpr.BindToRegister(d, d == a || d == b, false); + fpr.BindToRegister(d, d == a || d == b, REG_DUP); - ARM64Reg VA = fpr.R(a); - ARM64Reg VB = fpr.R(b); - ARM64Reg VD = 
fpr.R(d, false); + ARM64Reg VA = fpr.R(a, REG_IS_LOADED); + ARM64Reg VB = fpr.R(b, REG_IS_LOADED); + ARM64Reg VD = fpr.R(d, REG_DUP); m_float_emit.FADD(EncodeRegToDouble(VD), EncodeRegToDouble(VA), EncodeRegToDouble(VB)); - m_float_emit.INS(64, VD, 1, VD, 0); } void JitArm64::faddx(UGeckoInstruction inst) @@ -56,8 +55,8 @@ void JitArm64::faddx(UGeckoInstruction inst) u32 a = inst.FA, b = inst.FB, d = inst.FD; fpr.BindToRegister(d, d == a || d == b); - ARM64Reg VA = fpr.R(a); - ARM64Reg VB = fpr.R(b); + ARM64Reg VA = fpr.R(a, REG_IS_LOADED); + ARM64Reg VB = fpr.R(b, REG_IS_LOADED); ARM64Reg VD = fpr.R(d); m_float_emit.FADD(EncodeRegToDouble(VD), EncodeRegToDouble(VA), EncodeRegToDouble(VB)); @@ -70,17 +69,17 @@ void JitArm64::fmaddsx(UGeckoInstruction inst) FALLBACK_IF(inst.Rc); u32 a = inst.FA, b = inst.FB, c = inst.FC, d = inst.FD; - fpr.BindToRegister(d, d == a || d == b || d == c, false); + fpr.BindToRegister(d, d == a || d == b || d == c, REG_DUP); - ARM64Reg VA = fpr.R(a); - ARM64Reg VB = fpr.R(b); - ARM64Reg VC = fpr.R(c); - ARM64Reg VD = fpr.R(d, false); + ARM64Reg VA = fpr.R(a, REG_IS_LOADED); + ARM64Reg VB = fpr.R(b, REG_IS_LOADED); + ARM64Reg VC = fpr.R(c, REG_IS_LOADED); + ARM64Reg VD = fpr.R(d, REG_DUP); ARM64Reg V0 = fpr.GetReg(); m_float_emit.FMUL(EncodeRegToDouble(V0), EncodeRegToDouble(VA), EncodeRegToDouble(VC)); m_float_emit.FADD(EncodeRegToDouble(VD), EncodeRegToDouble(V0), EncodeRegToDouble(VB)); - m_float_emit.INS(64, VD, 1, VD, 0); + fpr.Unlock(V0); } @@ -93,9 +92,9 @@ void JitArm64::fmaddx(UGeckoInstruction inst) u32 a = inst.FA, b = inst.FB, c = inst.FC, d = inst.FD; fpr.BindToRegister(d, d == a || d == b || d == c); - ARM64Reg VA = fpr.R(a); - ARM64Reg VB = fpr.R(b); - ARM64Reg VC = fpr.R(c); + ARM64Reg VA = fpr.R(a, REG_IS_LOADED); + ARM64Reg VB = fpr.R(b, REG_IS_LOADED); + ARM64Reg VC = fpr.R(c, REG_IS_LOADED); ARM64Reg VD = fpr.R(d); m_float_emit.FMADD(EncodeRegToDouble(VD), EncodeRegToDouble(VA), EncodeRegToDouble(VC), 
EncodeRegToDouble(VB)); @@ -110,7 +109,7 @@ void JitArm64::fmrx(UGeckoInstruction inst) u32 b = inst.FB, d = inst.FD; fpr.BindToRegister(d, d == b); - ARM64Reg VB = fpr.R(b); + ARM64Reg VB = fpr.R(b, REG_IS_LOADED); ARM64Reg VD = fpr.R(d); m_float_emit.INS(64, VD, 0, VB, 0); @@ -123,17 +122,17 @@ void JitArm64::fmsubsx(UGeckoInstruction inst) FALLBACK_IF(inst.Rc); u32 a = inst.FA, b = inst.FB, c = inst.FC, d = inst.FD; - fpr.BindToRegister(d, d == a || d == b || d == c, false); + fpr.BindToRegister(d, d == a || d == b || d == c, REG_DUP); - ARM64Reg VA = fpr.R(a); - ARM64Reg VB = fpr.R(b); - ARM64Reg VC = fpr.R(c); - ARM64Reg VD = fpr.R(d, false); + ARM64Reg VA = fpr.R(a, REG_IS_LOADED); + ARM64Reg VB = fpr.R(b, REG_IS_LOADED); + ARM64Reg VC = fpr.R(c, REG_IS_LOADED); + ARM64Reg VD = fpr.R(d, REG_DUP); ARM64Reg V0 = fpr.GetReg(); m_float_emit.FMUL(EncodeRegToDouble(V0), EncodeRegToDouble(VA), EncodeRegToDouble(VC)); m_float_emit.FSUB(EncodeRegToDouble(VD), EncodeRegToDouble(V0), EncodeRegToDouble(VB)); - m_float_emit.INS(64, VD, 1, VD, 0); + fpr.Unlock(V0); } @@ -146,9 +145,9 @@ void JitArm64::fmsubx(UGeckoInstruction inst) u32 a = inst.FA, b = inst.FB, c = inst.FC, d = inst.FD; fpr.BindToRegister(d, d == a || d == b || d == c); - ARM64Reg VA = fpr.R(a); - ARM64Reg VB = fpr.R(b); - ARM64Reg VC = fpr.R(c); + ARM64Reg VA = fpr.R(a, REG_IS_LOADED); + ARM64Reg VB = fpr.R(b, REG_IS_LOADED); + ARM64Reg VC = fpr.R(c, REG_IS_LOADED); ARM64Reg VD = fpr.R(d); m_float_emit.FNMSUB(EncodeRegToDouble(VD), EncodeRegToDouble(VA), EncodeRegToDouble(VC), EncodeRegToDouble(VB)); @@ -161,14 +160,13 @@ void JitArm64::fmulsx(UGeckoInstruction inst) FALLBACK_IF(inst.Rc); u32 a = inst.FA, c = inst.FC, d = inst.FD; - fpr.BindToRegister(d, d == a || d == c, false); + fpr.BindToRegister(d, d == a || d == c, REG_DUP); - ARM64Reg VA = fpr.R(a); - ARM64Reg VC = fpr.R(c); - ARM64Reg VD = fpr.R(d, false); + ARM64Reg VA = fpr.R(a, REG_IS_LOADED); + ARM64Reg VC = fpr.R(c, REG_IS_LOADED); + ARM64Reg 
VD = fpr.R(d, REG_DUP); m_float_emit.FMUL(EncodeRegToDouble(VD), EncodeRegToDouble(VA), EncodeRegToDouble(VC)); - m_float_emit.INS(64, VD, 1, VD, 0); } void JitArm64::fmulx(UGeckoInstruction inst) @@ -180,8 +178,8 @@ void JitArm64::fmulx(UGeckoInstruction inst) u32 a = inst.FA, c = inst.FC, d = inst.FD; fpr.BindToRegister(d, d == a || d == c); - ARM64Reg VA = fpr.R(a); - ARM64Reg VC = fpr.R(c); + ARM64Reg VA = fpr.R(a, REG_IS_LOADED); + ARM64Reg VC = fpr.R(c, REG_IS_LOADED); ARM64Reg VD = fpr.R(d); m_float_emit.FMUL(EncodeRegToDouble(VD), EncodeRegToDouble(VA), EncodeRegToDouble(VC)); @@ -196,7 +194,7 @@ void JitArm64::fnabsx(UGeckoInstruction inst) u32 b = inst.FB, d = inst.FD; fpr.BindToRegister(d, d == b); - ARM64Reg VB = fpr.R(b); + ARM64Reg VB = fpr.R(b, REG_IS_LOADED); ARM64Reg VD = fpr.R(d); m_float_emit.FABS(EncodeRegToDouble(VD), EncodeRegToDouble(VB)); @@ -212,7 +210,7 @@ void JitArm64::fnegx(UGeckoInstruction inst) u32 b = inst.FB, d = inst.FD; fpr.BindToRegister(d, d == b); - ARM64Reg VB = fpr.R(b); + ARM64Reg VB = fpr.R(b, REG_IS_LOADED); ARM64Reg VD = fpr.R(d); m_float_emit.FNEG(EncodeRegToDouble(VD), EncodeRegToDouble(VB)); @@ -225,18 +223,18 @@ void JitArm64::fnmaddsx(UGeckoInstruction inst) FALLBACK_IF(inst.Rc); u32 a = inst.FA, b = inst.FB, c = inst.FC, d = inst.FD; - fpr.BindToRegister(d, d == a || d == b || d == c, false); + fpr.BindToRegister(d, d == a || d == b || d == c, REG_DUP); - ARM64Reg VA = fpr.R(a); - ARM64Reg VB = fpr.R(b); - ARM64Reg VC = fpr.R(c); - ARM64Reg VD = fpr.R(d, false); + ARM64Reg VA = fpr.R(a, REG_IS_LOADED); + ARM64Reg VB = fpr.R(b, REG_IS_LOADED); + ARM64Reg VC = fpr.R(c, REG_IS_LOADED); + ARM64Reg VD = fpr.R(d, REG_DUP); ARM64Reg V0 = fpr.GetReg(); m_float_emit.FMUL(EncodeRegToDouble(V0), EncodeRegToDouble(VA), EncodeRegToDouble(VC)); m_float_emit.FADD(EncodeRegToDouble(VD), EncodeRegToDouble(V0), EncodeRegToDouble(VB)); m_float_emit.FNEG(EncodeRegToDouble(VD), EncodeRegToDouble(VD)); - m_float_emit.INS(64, VD, 1, VD, 
0); + fpr.Unlock(V0); } @@ -249,9 +247,9 @@ void JitArm64::fnmaddx(UGeckoInstruction inst) u32 a = inst.FA, b = inst.FB, c = inst.FC, d = inst.FD; fpr.BindToRegister(d, d == a || d == b || d == c); - ARM64Reg VA = fpr.R(a); - ARM64Reg VB = fpr.R(b); - ARM64Reg VC = fpr.R(c); + ARM64Reg VA = fpr.R(a, REG_IS_LOADED); + ARM64Reg VB = fpr.R(b, REG_IS_LOADED); + ARM64Reg VC = fpr.R(c, REG_IS_LOADED); ARM64Reg VD = fpr.R(d); m_float_emit.FNMADD(EncodeRegToDouble(VD), EncodeRegToDouble(VA), EncodeRegToDouble(VC), EncodeRegToDouble(VB)); @@ -264,18 +262,18 @@ void JitArm64::fnmsubsx(UGeckoInstruction inst) FALLBACK_IF(inst.Rc); u32 a = inst.FA, b = inst.FB, c = inst.FC, d = inst.FD; - fpr.BindToRegister(d, d == a || d == b || d == c, false); + fpr.BindToRegister(d, d == a || d == b || d == c, REG_DUP); - ARM64Reg VA = fpr.R(a); - ARM64Reg VB = fpr.R(b); - ARM64Reg VC = fpr.R(c); - ARM64Reg VD = fpr.R(d, false); + ARM64Reg VA = fpr.R(a, REG_IS_LOADED); + ARM64Reg VB = fpr.R(b, REG_IS_LOADED); + ARM64Reg VC = fpr.R(c, REG_IS_LOADED); + ARM64Reg VD = fpr.R(d, REG_DUP); ARM64Reg V0 = fpr.GetReg(); m_float_emit.FMUL(EncodeRegToDouble(V0), EncodeRegToDouble(VA), EncodeRegToDouble(VC)); m_float_emit.FSUB(EncodeRegToDouble(VD), EncodeRegToDouble(V0), EncodeRegToDouble(VB)); m_float_emit.FNEG(EncodeRegToDouble(VD), EncodeRegToDouble(VD)); - m_float_emit.INS(64, VD, 1, VD, 0); + fpr.Unlock(V0); } @@ -288,9 +286,9 @@ void JitArm64::fnmsubx(UGeckoInstruction inst) u32 a = inst.FA, b = inst.FB, c = inst.FC, d = inst.FD; fpr.BindToRegister(d, d == a || d == b || d == c); - ARM64Reg VA = fpr.R(a); - ARM64Reg VB = fpr.R(b); - ARM64Reg VC = fpr.R(c); + ARM64Reg VA = fpr.R(a, REG_IS_LOADED); + ARM64Reg VB = fpr.R(b, REG_IS_LOADED); + ARM64Reg VC = fpr.R(c, REG_IS_LOADED); ARM64Reg VD = fpr.R(d); m_float_emit.FMSUB(EncodeRegToDouble(VD), EncodeRegToDouble(VA), EncodeRegToDouble(VC), EncodeRegToDouble(VB)); @@ -305,9 +303,9 @@ void JitArm64::fselx(UGeckoInstruction inst) u32 a = inst.FA, b = 
inst.FB, c = inst.FC, d = inst.FD; fpr.BindToRegister(d, d == a || d == b || d == c); - ARM64Reg VD = fpr.R(d); - ARM64Reg VA = fpr.R(a); - ARM64Reg VB = fpr.R(b); + ARM64Reg VD = fpr.R(d, REG_IS_LOADED); + ARM64Reg VA = fpr.R(a, REG_IS_LOADED); + ARM64Reg VB = fpr.R(b, REG_IS_LOADED); ARM64Reg VC = fpr.R(c); m_float_emit.FCMPE(EncodeRegToDouble(VA)); @@ -321,14 +319,13 @@ void JitArm64::fsubsx(UGeckoInstruction inst) FALLBACK_IF(inst.Rc); u32 a = inst.FA, b = inst.FB, d = inst.FD; - fpr.BindToRegister(d, d == a || d == b, false); + fpr.BindToRegister(d, d == a || d == b, REG_DUP); - ARM64Reg VA = fpr.R(a); - ARM64Reg VB = fpr.R(b); - ARM64Reg VD = fpr.R(d, false); + ARM64Reg VA = fpr.R(a, REG_IS_LOADED); + ARM64Reg VB = fpr.R(b, REG_IS_LOADED); + ARM64Reg VD = fpr.R(d, REG_DUP); m_float_emit.FSUB(EncodeRegToDouble(VD), EncodeRegToDouble(VA), EncodeRegToDouble(VB)); - m_float_emit.INS(64, VD, 1, VD, 0); } void JitArm64::fsubx(UGeckoInstruction inst) @@ -340,8 +337,8 @@ void JitArm64::fsubx(UGeckoInstruction inst) u32 a = inst.FA, b = inst.FB, d = inst.FD; fpr.BindToRegister(d, d == a || d == b); - ARM64Reg VA = fpr.R(a); - ARM64Reg VB = fpr.R(b); + ARM64Reg VA = fpr.R(a, REG_IS_LOADED); + ARM64Reg VB = fpr.R(b, REG_IS_LOADED); ARM64Reg VD = fpr.R(d); m_float_emit.FSUB(EncodeRegToDouble(VD), EncodeRegToDouble(VA), EncodeRegToDouble(VB)); @@ -353,14 +350,13 @@ void JitArm64::frspx(UGeckoInstruction inst) JITDISABLE(bJITFloatingPointOff); u32 b = inst.FB, d = inst.FD; - fpr.BindToRegister(d, d == b, false); + fpr.BindToRegister(d, d == b, REG_DUP); - ARM64Reg VB = fpr.R(b); - ARM64Reg VD = fpr.R(d, false); + ARM64Reg VB = fpr.R(b, REG_IS_LOADED); + ARM64Reg VD = fpr.R(d, REG_DUP); m_float_emit.FCVTN(32, EncodeRegToDouble(VD), EncodeRegToDouble(VB)); m_float_emit.FCVTL(64, EncodeRegToDouble(VD), EncodeRegToDouble(VD)); - m_float_emit.INS(64, VD, 1, VD, 0); } void JitArm64::fcmpx(UGeckoInstruction inst) @@ -371,8 +367,8 @@ void JitArm64::fcmpx(UGeckoInstruction inst) 
u32 a = inst.FA, b = inst.FB; int crf = inst.CRFD; - ARM64Reg VA = fpr.R(a); - ARM64Reg VB = fpr.R(b); + ARM64Reg VA = fpr.R(a, REG_IS_LOADED); + ARM64Reg VB = fpr.R(b, REG_IS_LOADED); ARM64Reg WA = gpr.GetReg(); ARM64Reg XA = EncodeRegTo64(WA); @@ -457,7 +453,7 @@ void JitArm64::fctiwzx(UGeckoInstruction inst) u32 b = inst.FB, d = inst.FD; fpr.BindToRegister(d, d == b); - ARM64Reg VB = fpr.R(b); + ARM64Reg VB = fpr.R(b, REG_IS_LOADED); ARM64Reg VD = fpr.R(d); ARM64Reg V0 = fpr.GetReg(); @@ -481,8 +477,8 @@ void JitArm64::fdivx(UGeckoInstruction inst) u32 a = inst.FA, b = inst.FB, d = inst.FD; fpr.BindToRegister(d, d == a || d == b); - ARM64Reg VA = fpr.R(a); - ARM64Reg VB = fpr.R(b); + ARM64Reg VA = fpr.R(a, REG_IS_LOADED); + ARM64Reg VB = fpr.R(b, REG_IS_LOADED); ARM64Reg VD = fpr.R(d); m_float_emit.FDIV(EncodeRegToDouble(VD), EncodeRegToDouble(VA), EncodeRegToDouble(VB)); @@ -495,12 +491,11 @@ void JitArm64::fdivsx(UGeckoInstruction inst) FALLBACK_IF(inst.Rc); u32 a = inst.FA, b = inst.FB, d = inst.FD; - fpr.BindToRegister(d, d == a || d == b, false); + fpr.BindToRegister(d, d == a || d == b, REG_DUP); - ARM64Reg VA = fpr.R(a); - ARM64Reg VB = fpr.R(b); - ARM64Reg VD = fpr.R(d, false); + ARM64Reg VA = fpr.R(a, REG_IS_LOADED); + ARM64Reg VB = fpr.R(b, REG_IS_LOADED); + ARM64Reg VD = fpr.R(d, REG_DUP); m_float_emit.FDIV(EncodeRegToDouble(VD), EncodeRegToDouble(VA), EncodeRegToDouble(VB)); - m_float_emit.INS(64, VD, 1, VD, 0); } diff --git a/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStoreFloating.cpp b/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStoreFloating.cpp index 31bae8de51..5d5f7a750f 100644 --- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStoreFloating.cpp +++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStoreFloating.cpp @@ -71,19 +71,13 @@ void JitArm64::lfXX(UGeckoInstruction inst) u32 imm_addr = 0; bool is_immediate = false; - bool only_lower = !!(flags & BackPatchInfo::FLAG_SIZE_F64); + RegType type = !!(flags & 
BackPatchInfo::FLAG_SIZE_F64) ? REG_LOWER_PAIR : REG_DUP; - fpr.BindToRegister(inst.FD, false, only_lower); + fpr.BindToRegister(inst.FD, false, type); - ARM64Reg VD = fpr.R(inst.FD, only_lower); + ARM64Reg VD = fpr.R(inst.FD, type); ARM64Reg addr_reg = W0; - if (!fpr.IsLower(inst.FD)) - only_lower = false; - - if (only_lower) - flags |= BackPatchInfo::FLAG_ONLY_LOWER; - gpr.Lock(W0, W30); fpr.Lock(Q0); @@ -270,7 +264,7 @@ void JitArm64::stfXX(UGeckoInstruction inst) u32 imm_addr = 0; bool is_immediate = false; - ARM64Reg V0 = fpr.R(inst.FS); + ARM64Reg V0 = fpr.R(inst.FS, REG_IS_LOADED); ARM64Reg addr_reg = W1; gpr.Lock(W0, W1, W30); diff --git a/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStorePaired.cpp b/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStorePaired.cpp index 8bf46e17d4..c31ec67615 100644 --- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStorePaired.cpp +++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStorePaired.cpp @@ -58,7 +58,7 @@ void JitArm64::psq_l(UGeckoInstruction inst) if (update) { - gpr.BindToRegister(inst.RA, false); + gpr.BindToRegister(inst.RA, REG_REG); MOV(arm_addr, addr_reg); } @@ -66,8 +66,8 @@ void JitArm64::psq_l(UGeckoInstruction inst) LDR(X30, X30, ArithOption(EncodeRegTo64(type_reg), true)); BLR(X30); - fpr.BindToRegister(inst.RS, false, false); - ARM64Reg VS = fpr.R(inst.RS, false); + fpr.BindToRegister(inst.RS, false, REG_REG); + ARM64Reg VS = fpr.R(inst.RS, REG_REG); m_float_emit.FCVTL(64, VS, D0); if (inst.W) { @@ -97,7 +97,7 @@ void JitArm64::psq_st(UGeckoInstruction inst) fpr.Lock(Q0, Q1); ARM64Reg arm_addr = gpr.R(inst.RA); - ARM64Reg VS = fpr.R(inst.RS, false); + ARM64Reg VS = fpr.R(inst.RS, REG_REG); ARM64Reg scale_reg = W0; ARM64Reg addr_reg = W1; @@ -129,7 +129,7 @@ void JitArm64::psq_st(UGeckoInstruction inst) if (update) { - gpr.BindToRegister(inst.RA, false); + gpr.BindToRegister(inst.RA, REG_REG); MOV(arm_addr, addr_reg); } diff --git a/Source/Core/Core/PowerPC/JitArm64/JitArm64_Paired.cpp 
b/Source/Core/Core/PowerPC/JitArm64/JitArm64_Paired.cpp index ee27a5caf8..cff1f49ce9 100644 --- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_Paired.cpp +++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_Paired.cpp @@ -23,10 +23,10 @@ void JitArm64::ps_abs(UGeckoInstruction inst) FALLBACK_IF(inst.Rc); u32 b = inst.FB, d = inst.FD; - fpr.BindToRegister(d, d == b, false); + fpr.BindToRegister(d, d == b, REG_REG); - ARM64Reg VB = fpr.R(b, false); - ARM64Reg VD = fpr.R(d, false); + ARM64Reg VB = fpr.R(b, REG_REG); + ARM64Reg VD = fpr.R(d, REG_REG); m_float_emit.FABS(64, VD, VB); } @@ -38,11 +38,11 @@ void JitArm64::ps_add(UGeckoInstruction inst) FALLBACK_IF(inst.Rc); u32 a = inst.FA, b = inst.FB, d = inst.FD; - fpr.BindToRegister(d, d == a || d == b, false); + fpr.BindToRegister(d, d == a || d == b, REG_REG); - ARM64Reg VA = fpr.R(a, false); - ARM64Reg VB = fpr.R(b, false); - ARM64Reg VD = fpr.R(d, false); + ARM64Reg VA = fpr.R(a, REG_REG); + ARM64Reg VB = fpr.R(b, REG_REG); + ARM64Reg VD = fpr.R(d, REG_REG); m_float_emit.FADD(64, VD, VA, VB); } @@ -54,11 +54,11 @@ void JitArm64::ps_div(UGeckoInstruction inst) FALLBACK_IF(inst.Rc); u32 a = inst.FA, b = inst.FB, d = inst.FD; - fpr.BindToRegister(d, d == a || d == b, false); + fpr.BindToRegister(d, d == a || d == b, REG_REG); - ARM64Reg VA = fpr.R(a, false); - ARM64Reg VB = fpr.R(b, false); - ARM64Reg VD = fpr.R(d, false); + ARM64Reg VA = fpr.R(a, REG_REG); + ARM64Reg VB = fpr.R(b, REG_REG); + ARM64Reg VD = fpr.R(d, REG_REG); m_float_emit.FDIV(64, VD, VA, VB); } @@ -70,12 +70,12 @@ void JitArm64::ps_madd(UGeckoInstruction inst) FALLBACK_IF(inst.Rc); u32 a = inst.FA, b = inst.FB, c = inst.FC, d = inst.FD; - fpr.BindToRegister(d, d == a || d == b || d == c, false); + fpr.BindToRegister(d, d == a || d == b || d == c, REG_REG); - ARM64Reg VA = fpr.R(a, false); - ARM64Reg VB = fpr.R(b, false); - ARM64Reg VC = fpr.R(c, false); - ARM64Reg VD = fpr.R(d, false); + ARM64Reg VA = fpr.R(a, REG_REG); + ARM64Reg VB = fpr.R(b, REG_REG); + 
ARM64Reg VC = fpr.R(c, REG_REG); + ARM64Reg VD = fpr.R(d, REG_REG); ARM64Reg V0 = fpr.GetReg(); m_float_emit.FMUL(64, V0, VA, VC); @@ -91,12 +91,12 @@ void JitArm64::ps_madds0(UGeckoInstruction inst) FALLBACK_IF(inst.Rc); u32 a = inst.FA, b = inst.FB, c = inst.FC, d = inst.FD; - fpr.BindToRegister(d, d == a || d == b || d == c, false); + fpr.BindToRegister(d, d == a || d == b || d == c, REG_REG); - ARM64Reg VA = fpr.R(a, false); - ARM64Reg VB = fpr.R(b, false); - ARM64Reg VC = fpr.R(c, false); - ARM64Reg VD = fpr.R(d, false); + ARM64Reg VA = fpr.R(a, REG_REG); + ARM64Reg VB = fpr.R(b, REG_REG); + ARM64Reg VC = fpr.R(c, REG_REG); + ARM64Reg VD = fpr.R(d, REG_REG); ARM64Reg V0 = fpr.GetReg(); m_float_emit.DUP(64, V0, VC, 0); @@ -113,12 +113,12 @@ void JitArm64::ps_madds1(UGeckoInstruction inst) FALLBACK_IF(inst.Rc); u32 a = inst.FA, b = inst.FB, c = inst.FC, d = inst.FD; - fpr.BindToRegister(d, d == a || d == b || d == c, false); + fpr.BindToRegister(d, d == a || d == b || d == c, REG_REG); - ARM64Reg VA = fpr.R(a, false); - ARM64Reg VB = fpr.R(b, false); - ARM64Reg VC = fpr.R(c, false); - ARM64Reg VD = fpr.R(d, false); + ARM64Reg VA = fpr.R(a, REG_REG); + ARM64Reg VB = fpr.R(b, REG_REG); + ARM64Reg VC = fpr.R(c, REG_REG); + ARM64Reg VD = fpr.R(d, REG_REG); ARM64Reg V0 = fpr.GetReg(); m_float_emit.DUP(64, V0, VC, 1); @@ -135,11 +135,11 @@ void JitArm64::ps_merge00(UGeckoInstruction inst) FALLBACK_IF(inst.Rc); u32 a = inst.FA, b = inst.FB, d = inst.FD; - fpr.BindToRegister(d, d == a || d == b, false); + fpr.BindToRegister(d, d == a || d == b, REG_REG); - ARM64Reg VA = fpr.R(a, false); - ARM64Reg VB = fpr.R(b, false); - ARM64Reg VD = fpr.R(d, false); + ARM64Reg VA = fpr.R(a, REG_REG); + ARM64Reg VB = fpr.R(b, REG_REG); + ARM64Reg VD = fpr.R(d, REG_REG); m_float_emit.TRN1(64, VD, VA, VB); } @@ -151,11 +151,11 @@ void JitArm64::ps_merge01(UGeckoInstruction inst) FALLBACK_IF(inst.Rc); u32 a = inst.FA, b = inst.FB, d = inst.FD; - fpr.BindToRegister(d, d == a || d == b, 
false); + fpr.BindToRegister(d, d == a || d == b, REG_REG); - ARM64Reg VA = fpr.R(a, false); - ARM64Reg VB = fpr.R(b, false); - ARM64Reg VD = fpr.R(d, false); + ARM64Reg VA = fpr.R(a, REG_REG); + ARM64Reg VB = fpr.R(b, REG_REG); + ARM64Reg VD = fpr.R(d, REG_REG); m_float_emit.INS(64, VD, 0, VA, 0); m_float_emit.INS(64, VD, 1, VB, 1); @@ -168,11 +168,11 @@ void JitArm64::ps_merge10(UGeckoInstruction inst) FALLBACK_IF(inst.Rc); u32 a = inst.FA, b = inst.FB, d = inst.FD; - fpr.BindToRegister(d, d == a || d == b, false); + fpr.BindToRegister(d, d == a || d == b, REG_REG); - ARM64Reg VA = fpr.R(a, false); - ARM64Reg VB = fpr.R(b, false); - ARM64Reg VD = fpr.R(d, false); + ARM64Reg VA = fpr.R(a, REG_REG); + ARM64Reg VB = fpr.R(b, REG_REG); + ARM64Reg VD = fpr.R(d, REG_REG); if (d != a && d != b) { @@ -196,11 +196,11 @@ void JitArm64::ps_merge11(UGeckoInstruction inst) FALLBACK_IF(inst.Rc); u32 a = inst.FA, b = inst.FB, d = inst.FD; - fpr.BindToRegister(d, d == a || d == b, false); + fpr.BindToRegister(d, d == a || d == b, REG_REG); - ARM64Reg VA = fpr.R(a, false); - ARM64Reg VB = fpr.R(b, false); - ARM64Reg VD = fpr.R(d, false); + ARM64Reg VA = fpr.R(a, REG_REG); + ARM64Reg VB = fpr.R(b, REG_REG); + ARM64Reg VD = fpr.R(d, REG_REG); m_float_emit.TRN2(64, VD, VA, VB); } @@ -216,10 +216,10 @@ void JitArm64::ps_mr(UGeckoInstruction inst) if (d == b) return; - fpr.BindToRegister(d, false, false); + fpr.BindToRegister(d, REG_REG, REG_REG); - ARM64Reg VB = fpr.R(b, false); - ARM64Reg VD = fpr.R(d, false); + ARM64Reg VB = fpr.R(b, REG_REG); + ARM64Reg VD = fpr.R(d, REG_REG); m_float_emit.ORR(VD, VB, VB); } @@ -231,11 +231,11 @@ void JitArm64::ps_mul(UGeckoInstruction inst) FALLBACK_IF(inst.Rc); u32 a = inst.FA, c = inst.FC, d = inst.FD; - fpr.BindToRegister(d, d == a || d == c, false); + fpr.BindToRegister(d, d == a || d == c, REG_REG); - ARM64Reg VA = fpr.R(a, false); - ARM64Reg VC = fpr.R(c, false); - ARM64Reg VD = fpr.R(d, false); + ARM64Reg VA = fpr.R(a, REG_REG); + ARM64Reg 
VC = fpr.R(c, REG_REG); + ARM64Reg VD = fpr.R(d, REG_REG); m_float_emit.FMUL(64, VD, VA, VC); } @@ -247,11 +247,11 @@ void JitArm64::ps_muls0(UGeckoInstruction inst) FALLBACK_IF(inst.Rc); u32 a = inst.FA, c = inst.FC, d = inst.FD; - fpr.BindToRegister(d, d == a || d == c, false); + fpr.BindToRegister(d, d == a || d == c, REG_REG); - ARM64Reg VA = fpr.R(a, false); - ARM64Reg VC = fpr.R(c, false); - ARM64Reg VD = fpr.R(d, false); + ARM64Reg VA = fpr.R(a, REG_REG); + ARM64Reg VC = fpr.R(c, REG_REG); + ARM64Reg VD = fpr.R(d, REG_REG); ARM64Reg V0 = fpr.GetReg(); m_float_emit.DUP(64, V0, VC, 0); @@ -266,11 +266,11 @@ void JitArm64::ps_muls1(UGeckoInstruction inst) FALLBACK_IF(inst.Rc); u32 a = inst.FA, c = inst.FC, d = inst.FD; - fpr.BindToRegister(d, d == a || d == c, false); + fpr.BindToRegister(d, d == a || d == c, REG_REG); - ARM64Reg VA = fpr.R(a, false); - ARM64Reg VC = fpr.R(c, false); - ARM64Reg VD = fpr.R(d, false); + ARM64Reg VA = fpr.R(a, REG_REG); + ARM64Reg VC = fpr.R(c, REG_REG); + ARM64Reg VD = fpr.R(d, REG_REG); ARM64Reg V0 = fpr.GetReg(); m_float_emit.DUP(64, V0, VC, 1); @@ -285,12 +285,12 @@ void JitArm64::ps_msub(UGeckoInstruction inst) FALLBACK_IF(inst.Rc); u32 a = inst.FA, b = inst.FB, c = inst.FC, d = inst.FD; - fpr.BindToRegister(d, d == a || d == b || d == c, false); + fpr.BindToRegister(d, d == a || d == b || d == c, REG_REG); - ARM64Reg VA = fpr.R(a, false); - ARM64Reg VB = fpr.R(b, false); - ARM64Reg VC = fpr.R(c, false); - ARM64Reg VD = fpr.R(d, false); + ARM64Reg VA = fpr.R(a, REG_REG); + ARM64Reg VB = fpr.R(b, REG_REG); + ARM64Reg VC = fpr.R(c, REG_REG); + ARM64Reg VD = fpr.R(d, REG_REG); ARM64Reg V0 = fpr.GetReg(); m_float_emit.FMUL(64, V0, VA, VC); @@ -306,10 +306,10 @@ void JitArm64::ps_nabs(UGeckoInstruction inst) FALLBACK_IF(inst.Rc); u32 b = inst.FB, d = inst.FD; - fpr.BindToRegister(d, d == b, false); + fpr.BindToRegister(d, d == b, REG_REG); - ARM64Reg VB = fpr.R(b, false); - ARM64Reg VD = fpr.R(d, false); + ARM64Reg VB = fpr.R(b, 
REG_REG); + ARM64Reg VD = fpr.R(d, REG_REG); m_float_emit.FABS(64, VD, VB); m_float_emit.FNEG(64, VD, VD); @@ -322,10 +322,10 @@ void JitArm64::ps_neg(UGeckoInstruction inst) FALLBACK_IF(inst.Rc); u32 b = inst.FB, d = inst.FD; - fpr.BindToRegister(d, d == b, false); + fpr.BindToRegister(d, d == b, REG_REG); - ARM64Reg VB = fpr.R(b, false); - ARM64Reg VD = fpr.R(d, false); + ARM64Reg VB = fpr.R(b, REG_REG); + ARM64Reg VD = fpr.R(d, REG_REG); m_float_emit.FNEG(64, VD, VB); } @@ -337,12 +337,12 @@ void JitArm64::ps_nmadd(UGeckoInstruction inst) FALLBACK_IF(inst.Rc); u32 a = inst.FA, b = inst.FB, c = inst.FC, d = inst.FD; - fpr.BindToRegister(d, d == a || d == b || d == c, false); + fpr.BindToRegister(d, d == a || d == b || d == c, REG_REG); - ARM64Reg VA = fpr.R(a, false); - ARM64Reg VB = fpr.R(b, false); - ARM64Reg VC = fpr.R(c, false); - ARM64Reg VD = fpr.R(d, false); + ARM64Reg VA = fpr.R(a, REG_REG); + ARM64Reg VB = fpr.R(b, REG_REG); + ARM64Reg VC = fpr.R(c, REG_REG); + ARM64Reg VD = fpr.R(d, REG_REG); ARM64Reg V0 = fpr.GetReg(); m_float_emit.FMUL(64, V0, VA, VC); @@ -359,12 +359,12 @@ void JitArm64::ps_nmsub(UGeckoInstruction inst) FALLBACK_IF(inst.Rc); u32 a = inst.FA, b = inst.FB, c = inst.FC, d = inst.FD; - fpr.BindToRegister(d, d == a || d == b || d == c, false); + fpr.BindToRegister(d, d == a || d == b || d == c, REG_REG); - ARM64Reg VA = fpr.R(a, false); - ARM64Reg VB = fpr.R(b, false); - ARM64Reg VC = fpr.R(c, false); - ARM64Reg VD = fpr.R(d, false); + ARM64Reg VA = fpr.R(a, REG_REG); + ARM64Reg VB = fpr.R(b, REG_REG); + ARM64Reg VC = fpr.R(c, REG_REG); + ARM64Reg VD = fpr.R(d, REG_REG); ARM64Reg V0 = fpr.GetReg(); m_float_emit.FMUL(64, V0, VA, VC); @@ -381,10 +381,10 @@ void JitArm64::ps_res(UGeckoInstruction inst) FALLBACK_IF(inst.Rc); u32 b = inst.FB, d = inst.FD; - fpr.BindToRegister(d, d == b, false); + fpr.BindToRegister(d, d == b, REG_REG); - ARM64Reg VB = fpr.R(b, false); - ARM64Reg VD = fpr.R(d, false); + ARM64Reg VB = fpr.R(b, REG_REG); + 
ARM64Reg VD = fpr.R(d, REG_REG); m_float_emit.FRSQRTE(64, VD, VB); } @@ -396,12 +396,12 @@ void JitArm64::ps_sel(UGeckoInstruction inst) FALLBACK_IF(inst.Rc); u32 a = inst.FA, b = inst.FB, c = inst.FC, d = inst.FD; - fpr.BindToRegister(d, d == a || d == b || d == c, false); + fpr.BindToRegister(d, d == a || d == b || d == c, REG_REG); - ARM64Reg VA = fpr.R(a, false); - ARM64Reg VB = fpr.R(b, false); - ARM64Reg VC = fpr.R(c, false); - ARM64Reg VD = fpr.R(d, false); + ARM64Reg VA = fpr.R(a, REG_REG); + ARM64Reg VB = fpr.R(b, REG_REG); + ARM64Reg VC = fpr.R(c, REG_REG); + ARM64Reg VD = fpr.R(d, REG_REG); if (d != a && d != b && d != c) { @@ -425,11 +425,11 @@ void JitArm64::ps_sub(UGeckoInstruction inst) FALLBACK_IF(inst.Rc); u32 a = inst.FA, b = inst.FB, d = inst.FD; - fpr.BindToRegister(d, d == a || d == b, false); + fpr.BindToRegister(d, d == a || d == b, REG_REG); - ARM64Reg VA = fpr.R(a, false); - ARM64Reg VB = fpr.R(b, false); - ARM64Reg VD = fpr.R(d, false); + ARM64Reg VA = fpr.R(a, REG_REG); + ARM64Reg VB = fpr.R(b, REG_REG); + ARM64Reg VD = fpr.R(d, REG_REG); m_float_emit.FSUB(64, VD, VA, VB); } @@ -441,12 +441,12 @@ void JitArm64::ps_sum0(UGeckoInstruction inst) FALLBACK_IF(inst.Rc); u32 a = inst.FA, b = inst.FB, c = inst.FC, d = inst.FD; - fpr.BindToRegister(d, d == a || d == b || d == c, false); + fpr.BindToRegister(d, d == a || d == b || d == c, REG_REG); - ARM64Reg VA = fpr.R(a, false); - ARM64Reg VB = fpr.R(b, false); - ARM64Reg VC = fpr.R(c, false); - ARM64Reg VD = fpr.R(d, false); + ARM64Reg VA = fpr.R(a, REG_REG); + ARM64Reg VB = fpr.R(b, REG_REG); + ARM64Reg VC = fpr.R(c, REG_REG); + ARM64Reg VD = fpr.R(d, REG_REG); ARM64Reg V0 = fpr.GetReg(); m_float_emit.DUP(64, V0, VB, 1); @@ -471,12 +471,12 @@ void JitArm64::ps_sum1(UGeckoInstruction inst) FALLBACK_IF(inst.Rc); u32 a = inst.FA, b = inst.FB, c = inst.FC, d = inst.FD; - fpr.BindToRegister(d, d == a || d == b || d == c, false); + fpr.BindToRegister(d, d == a || d == b || d == c, REG_REG); - 
ARM64Reg VA = fpr.R(a, false); - ARM64Reg VB = fpr.R(b, false); - ARM64Reg VC = fpr.R(c, false); - ARM64Reg VD = fpr.R(d, false); + ARM64Reg VA = fpr.R(a, REG_REG); + ARM64Reg VB = fpr.R(b, REG_REG); + ARM64Reg VC = fpr.R(c, REG_REG); + ARM64Reg VD = fpr.R(d, REG_REG); ARM64Reg V0 = fpr.GetReg(); m_float_emit.DUP(64, V0, VA, 0); diff --git a/Source/Core/Core/PowerPC/JitArm64/JitArm64_RegCache.cpp b/Source/Core/Core/PowerPC/JitArm64/JitArm64_RegCache.cpp index c58a3cee46..e33558cccc 100644 --- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_RegCache.cpp +++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_RegCache.cpp @@ -274,7 +274,7 @@ void Arm64FPRCache::Flush(FlushMode mode, PPCAnalyst::CodeOp* op) } } -ARM64Reg Arm64FPRCache::R(u32 preg, bool only_lower) +ARM64Reg Arm64FPRCache::R(u32 preg, RegType type) { OpArg& reg = m_guest_registers[preg]; IncrementAllUsed(); @@ -287,7 +287,7 @@ ARM64Reg Arm64FPRCache::R(u32 preg, bool only_lower) break; case REG_LOWER_PAIR: { - if (!only_lower) + if (type == REG_REG) { // Load the high 64bits from the file and insert them in to the high 64bits of the host register ARM64Reg tmp_reg = GetReg(); @@ -298,18 +298,52 @@ ARM64Reg Arm64FPRCache::R(u32 preg, bool only_lower) // Change it over to a full 128bit register reg.LoadToReg(reg.GetReg()); } + else if (type == REG_DUP) + { + // We already only have the lower 64bits + // Don't do anything + } return reg.GetReg(); } break; + case REG_DUP: + { + ARM64Reg host_reg = reg.GetReg(); + if (type == REG_REG) + { + // We are requesting a full 128bit register + // but we are only available in the lower 64bits + // Duplicate to the top and change over + m_float_emit->INS(64, host_reg, 1, host_reg, 0); + reg.LoadToReg(host_reg); + } + else if (type == REG_LOWER_PAIR) + { + // We are only requesting the lower 64bits of a pair + // We've got to be careful in this instance + // Store our current duplicated high bits to the file + // then convert over to a lower reg + if (reg.IsDirty()) + 
m_float_emit->STR(64, INDEX_UNSIGNED, host_reg, X29, PPCSTATE_OFF(ps[preg][1])); + reg.LoadLowerReg(host_reg); + } + return host_reg; + } + break; case REG_NOTLOADED: // Register isn't loaded at /all/ { ARM64Reg host_reg = GetReg(); u32 load_size; - if (only_lower) + if (type == REG_LOWER_PAIR) { load_size = 64; reg.LoadLowerReg(host_reg); } + else if (type == REG_DUP) + { + load_size = 64; + reg.LoadDup(host_reg); + } else { load_size = 128; @@ -328,7 +362,7 @@ ARM64Reg Arm64FPRCache::R(u32 preg, bool only_lower) return INVALID_REG; } -void Arm64FPRCache::BindToRegister(u32 preg, bool do_load, bool only_lower) +void Arm64FPRCache::BindToRegister(u32 preg, bool do_load, RegType type) { OpArg& reg = m_guest_registers[preg]; @@ -343,12 +377,17 @@ void Arm64FPRCache::BindToRegister(u32 preg, bool do_load, bool only_lower) { ARM64Reg host_reg = GetReg(); u32 load_size; - if (only_lower) + if (type == REG_LOWER_PAIR) { // We only want the lower 64bits load_size = 64; reg.LoadLowerReg(host_reg); } + else if (type == REG_DUP) + { + load_size = 64; + reg.LoadDup(host_reg); + } else { // We want the full 128bit register @@ -361,7 +400,8 @@ void Arm64FPRCache::BindToRegister(u32 preg, bool do_load, bool only_lower) break; case REG_LOWER_PAIR: { - if (!only_lower) + ARM64Reg host_reg = reg.GetReg(); + if (type == REG_REG) { // Okay, we've got the lower reg loaded and we really wanted the full register if (do_load) @@ -369,27 +409,63 @@ void Arm64FPRCache::BindToRegister(u32 preg, bool do_load, bool only_lower) // Load the high 64bits from the file and insert them in to the high 64bits of the host register ARM64Reg tmp_reg = GetReg(); m_float_emit->LDR(64, INDEX_UNSIGNED, tmp_reg, X29, PPCSTATE_OFF(ps[preg][1])); - m_float_emit->INS(64, reg.GetReg(), 1, tmp_reg, 0); + m_float_emit->INS(64, host_reg, 1, tmp_reg, 0); UnlockRegister(tmp_reg); } // Change it over to a full 128bit register - reg.LoadToReg(reg.GetReg()); + reg.LoadToReg(host_reg); + } + else if (type == REG_DUP) + { 
+ // Register is already the lower pair + // Just convert it over to a dup + reg.LoadDup(host_reg); } } break; case REG_REG: { - if (only_lower) + ARM64Reg host_reg = reg.GetReg(); + if (type == REG_LOWER_PAIR) { // If we only want the lower bits, let's store away the high bits and drop to a lower only register // We are doing a full 128bit store because it takes 2 cycles on a Cortex-A57 to do a 128bit store. // It would take longer to do an insert to a temporary and a 64bit store than to just do this. - ARM64Reg host_reg = reg.GetReg(); if (was_dirty) m_float_emit->STR(128, INDEX_UNSIGNED, host_reg, X29, PPCSTATE_OFF(ps[preg][0])); reg.LoadLowerReg(host_reg); } + else if (type == REG_DUP) + { + // If we are going from a full 128bit register to a duplicate + // then we can just change over + reg.LoadDup(host_reg); + } + } + break; + case REG_DUP: + { + ARM64Reg host_reg = reg.GetReg(); + if (type == REG_REG) + { + // We are a duplicated register going to a full 128bit register + // Do an insert of our lower 64bits to the higher 64bits + m_float_emit->INS(64, host_reg, 1, host_reg, 0); + + // Change over to the full 128bit register + reg.LoadToReg(host_reg); + } + else if (type == REG_LOWER_PAIR) + { + // We are duplicated changing over to a lower register + // We've got to be careful in this instance and do a store of our lower 64bits + // to the upper 64bits in the PowerPC state + // That way incase if we hit the path of DUP->LOWER->REG we get the correct bits back + if (was_dirty) + m_float_emit->STR(64, INDEX_UNSIGNED, host_reg, X29, PPCSTATE_OFF(ps[preg][1])); + reg.LoadLowerReg(host_reg); + } } break; default: @@ -454,6 +530,24 @@ void Arm64FPRCache::FlushRegister(u32 preg, bool maintain_state) if (reg.IsDirty()) m_float_emit->STR(store_size, INDEX_UNSIGNED, host_reg, X29, PPCSTATE_OFF(ps[preg][0])); + if (!maintain_state) + { + UnlockRegister(host_reg); + reg.Flush(); + } + } + else if (reg.GetType() == REG_DUP) + { + ARM64Reg host_reg = reg.GetReg(); + if 
(reg.IsDirty()) + { + // If the paired registers were at the start of ppcState we could do an STP here. + // Too bad moving them would break savestate compatibility between x86_64 and AArch64 + //m_float_emit->STP(64, INDEX_SIGNED, host_reg, host_reg, X29, PPCSTATE_OFF(ps[preg][0])); + m_float_emit->STR(64, INDEX_UNSIGNED, host_reg, X29, PPCSTATE_OFF(ps[preg][0])); + m_float_emit->STR(64, INDEX_UNSIGNED, host_reg, X29, PPCSTATE_OFF(ps[preg][1])); + } + if (!maintain_state) { UnlockRegister(host_reg); diff --git a/Source/Core/Core/PowerPC/JitArm64/JitArm64_RegCache.h b/Source/Core/Core/PowerPC/JitArm64/JitArm64_RegCache.h index 433f0dad8f..fdbab5cc4c 100644 --- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_RegCache.h +++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_RegCache.h @@ -20,6 +20,8 @@ enum RegType REG_REG, // Reg type is register REG_IMM, // Reg is really a IMM REG_LOWER_PAIR, // Only the lower pair of a paired register + REG_DUP, // The lower reg is the same as the upper one (physical upper doesn't actually have the duplicated value) + REG_IS_LOADED, // We don't care what type it is, as long as the lower 64bits are loaded }; enum FlushMode @@ -65,6 +67,11 @@ public: m_type = REG_LOWER_PAIR; m_reg = reg; } + void LoadDup(ARM64Reg reg) + { + m_type = REG_DUP; + m_reg = reg; + } void LoadToImm(u32 imm) { m_type = REG_IMM; @@ -262,12 +269,9 @@ public: // Returns a guest register inside of a host register // Will dump an immediate to the host register as well - ARM64Reg R(u32 preg, bool only_lower = true); + ARM64Reg R(u32 preg, RegType type = REG_LOWER_PAIR); - void BindToRegister(u32 preg, bool do_load, bool only_lower = true); - - // Returns if the register is only the lower 64bit register - bool IsLower(u32 preg) const { return m_guest_registers[preg].GetType() == REG_LOWER_PAIR; } + void BindToRegister(u32 preg, bool do_load, RegType type = REG_LOWER_PAIR); BitSet32 GetCallerSavedUsed() override; diff --git 
a/Source/Core/Core/PowerPC/JitArmCommon/BackPatch.h b/Source/Core/Core/PowerPC/JitArmCommon/BackPatch.h index 508b10b45d..d3b6f46c31 100644 --- a/Source/Core/Core/PowerPC/JitArmCommon/BackPatch.h +++ b/Source/Core/Core/PowerPC/JitArmCommon/BackPatch.h @@ -18,8 +18,7 @@ struct BackPatchInfo FLAG_SIZE_F64 = (1 << 6), FLAG_REVERSE = (1 << 7), FLAG_EXTEND = (1 << 8), - FLAG_ONLY_LOWER = (1 << 9), - FLAG_SIZE_F32I = (1 << 10), + FLAG_SIZE_F32I = (1 << 9), }; static u32 GetFlagSize(u32 flags)