From f96ee475e4ce22ec1e18cd7ae1e8ff9fd7d0893a Mon Sep 17 00:00:00 2001
From: JosJuice
Date: Wed, 20 Jan 2021 14:18:05 +0100
Subject: [PATCH 01/12] Implement ArmFPURoundMode.cpp

Fixes https://bugs.dolphin-emu.org/issues/12388. Might also fix other
games that have problems with float/paired instructions in JitArm64,
but I haven't tested any.
---
 Source/Core/Common/ArmCPUDetect.cpp    |  1 +
 Source/Core/Common/ArmFPURoundMode.cpp | 78 ++++++++++++++++++++++++++
 Source/Core/Common/CMakeLists.txt      |  2 +-
 Source/Core/DolphinLib.ARM64.props     |  2 +-
 4 files changed, 81 insertions(+), 2 deletions(-)
 create mode 100644 Source/Core/Common/ArmFPURoundMode.cpp

diff --git a/Source/Core/Common/ArmCPUDetect.cpp b/Source/Core/Common/ArmCPUDetect.cpp
index 7fb0ef6765..a603ff03fd 100644
--- a/Source/Core/Common/ArmCPUDetect.cpp
+++ b/Source/Core/Common/ArmCPUDetect.cpp
@@ -69,6 +69,7 @@ void CPUInfo::Detect()
   CPU64bit = true;
   Mode64bit = true;
   vendor = CPUVendor::ARM;
+  bFlushToZero = true;
 
 #ifdef _WIN32
   num_cores = std::thread::hardware_concurrency();
diff --git a/Source/Core/Common/ArmFPURoundMode.cpp b/Source/Core/Common/ArmFPURoundMode.cpp
new file mode 100644
index 0000000000..323e456ae7
--- /dev/null
+++ b/Source/Core/Common/ArmFPURoundMode.cpp
@@ -0,0 +1,78 @@
+// Copyright 2021 Dolphin Emulator Project
+// Licensed under GPLv2+
+// Refer to the license.txt file included.
+
+#include "Common/CommonTypes.h"
+#include "Common/FPURoundMode.h"
+
+#ifdef _MSC_VER
+#include <intrin.h>
+#endif
+
+static u64 GetFPCR()
+{
+#ifdef _MSC_VER
+  return _ReadStatusReg(ARM64_FPCR);
+#else
+  u64 fpcr;
+  __asm__ __volatile__("mrs %0, fpcr" : "=r"(fpcr));
+  return fpcr;
+#endif
+}
+
+static void SetFPCR(u64 fpcr)
+{
+#ifdef _MSC_VER
+  _WriteStatusReg(ARM64_FPCR, fpcr);
+#else
+  __asm__ __volatile__("msr fpcr, %0" : : "ri"(fpcr));
+#endif
+}
+
+namespace FPURoundMode
+{
+static const u64 default_fpcr = GetFPCR();
+static u64 saved_fpcr = default_fpcr;
+
+void SetRoundMode(int mode)
+{
+  // We don't need to do anything here since SetSIMDMode is always called after calling this
+}
+
+void SetPrecisionMode(PrecisionMode mode)
+{
+}
+
+void SetSIMDMode(int rounding_mode, bool non_ieee_mode)
+{
+  // Flush-To-Zero (non-IEEE mode: denormal outputs are set to +/- 0)
+  constexpr u32 FZ = 1 << 24;
+
+  // lookup table for FPSCR.RN-to-FPCR.RMode translation
+  constexpr u32 rounding_mode_table[] = {
+      (0 << 22),  // nearest
+      (3 << 22),  // zero
+      (1 << 22),  // +inf
+      (2 << 22),  // -inf
+  };
+
+  const u64 base = default_fpcr & ~(0b111 << 22);
+  SetFPCR(base | rounding_mode_table[rounding_mode] | (non_ieee_mode ?
FZ : 0)); +} + +void SaveSIMDState() +{ + saved_fpcr = GetFPCR(); +} + +void LoadSIMDState() +{ + SetFPCR(saved_fpcr); +} + +void LoadDefaultSIMDState() +{ + SetFPCR(default_fpcr); +} + +} // namespace FPURoundMode diff --git a/Source/Core/Common/CMakeLists.txt b/Source/Core/Common/CMakeLists.txt index 9f7486a571..4846601305 100644 --- a/Source/Core/Common/CMakeLists.txt +++ b/Source/Core/Common/CMakeLists.txt @@ -199,7 +199,7 @@ if(_M_ARM_64) Arm64Emitter.h ArmCommon.h ArmCPUDetect.cpp - GenericFPURoundMode.cpp + ArmFPURoundMode.cpp ) else() if(_M_X86) #X86 diff --git a/Source/Core/DolphinLib.ARM64.props b/Source/Core/DolphinLib.ARM64.props index 61d9bd84b0..b7c7558845 100644 --- a/Source/Core/DolphinLib.ARM64.props +++ b/Source/Core/DolphinLib.ARM64.props @@ -13,7 +13,7 @@ - + From fdf7744a53f829d5488003a18964c27e39ef5813 Mon Sep 17 00:00:00 2001 From: JosJuice Date: Tue, 2 Feb 2021 22:17:44 +0100 Subject: [PATCH 02/12] JitArm64: Move float conversion code out of EmitBackpatchRoutine This simplifies some of the following commits. It does require an extra register, but hey, we have 32 of them. Something I think would be nice to add to the register cache in the future is the ability to keep both the single and double version of a guest register in two different host registers when that is useful. That way, the extra register we write to here can be read by a later instruction, saving us from having to perform the same conversion again. --- .../PowerPC/JitArm64/JitArm64_BackPatch.cpp | 31 +--------- .../JitArm64/JitArm64_LoadStoreFloating.cpp | 37 ++++++------ .../JitArm64/JitArm64_LoadStorePaired.cpp | 57 ++++++++++++------- .../Core/PowerPC/JitArmCommon/BackPatch.h | 17 +++--- 4 files changed, 69 insertions(+), 73 deletions(-) diff --git a/Source/Core/Core/PowerPC/JitArm64/JitArm64_BackPatch.cpp b/Source/Core/Core/PowerPC/JitArm64/JitArm64_BackPatch.cpp index de3a8bf683..4f1aca8e60 100644 --- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_BackPatch.cpp +++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_BackPatch.cpp @@ -61,23 +61,11 @@ void JitArm64::EmitBackpatchRoutine(u32 flags, bool fastmem, bool do_farcode, AR if (flags & BackPatchInfo::FLAG_STORE && flags & BackPatchInfo::FLAG_MASK_FLOAT) { if (flags & BackPatchInfo::FLAG_SIZE_F32) - { - m_float_emit.FCVT(32, 64, ARM64Reg::D0, RS); - m_float_emit.REV32(8, ARM64Reg::D0, ARM64Reg::D0); - m_float_emit.STR(32, ARM64Reg::D0, MEM_REG, addr); - } - else if (flags & BackPatchInfo::FLAG_SIZE_F32I) { m_float_emit.REV32(8, ARM64Reg::D0, RS); m_float_emit.STR(32, ARM64Reg::D0, MEM_REG, addr); } else if (flags & BackPatchInfo::FLAG_SIZE_F32X2) - { - m_float_emit.FCVTN(32, ARM64Reg::D0, RS); - m_float_emit.REV32(8, ARM64Reg::D0, ARM64Reg::D0); - m_float_emit.STR(64, ARM64Reg::Q0, MEM_REG, addr); - } - else if (flags & BackPatchInfo::FLAG_SIZE_F32X2I) { m_float_emit.REV32(8, ARM64Reg::D0, RS); m_float_emit.STR(64, ARM64Reg::Q0, MEM_REG, addr); @@ -184,37 +172,22 @@ void JitArm64::EmitBackpatchRoutine(u32 flags, bool fastmem, bool do_farcode, AR if (flags & BackPatchInfo::FLAG_STORE && flags & BackPatchInfo::FLAG_MASK_FLOAT) { if (flags & BackPatchInfo::FLAG_SIZE_F32) - { - m_float_emit.FCVT(32, 64, ARM64Reg::D0, RS); - m_float_emit.UMOV(32, ARM64Reg::W0, ARM64Reg::Q0, 0); - MOVP2R(ARM64Reg::X8, &PowerPC::Write_U32); - BLR(ARM64Reg::X8); - } - else if (flags & BackPatchInfo::FLAG_SIZE_F32I) { m_float_emit.UMOV(32, ARM64Reg::W0, RS, 0); MOVP2R(ARM64Reg::X8, &PowerPC::Write_U32); BLR(ARM64Reg::X8); } else if (flags & BackPatchInfo::FLAG_SIZE_F32X2) - { - 
m_float_emit.FCVTN(32, ARM64Reg::D0, RS); - m_float_emit.UMOV(64, ARM64Reg::X0, ARM64Reg::D0, 0); - ROR(ARM64Reg::X0, ARM64Reg::X0, 32); - MOVP2R(ARM64Reg::X8, &PowerPC::Write_U64); - BLR(ARM64Reg::X8); - } - else if (flags & BackPatchInfo::FLAG_SIZE_F32X2I) { m_float_emit.UMOV(64, ARM64Reg::X0, RS, 0); - ROR(ARM64Reg::X0, ARM64Reg::X0, 32); MOVP2R(ARM64Reg::X8, &PowerPC::Write_U64); + ROR(ARM64Reg::X0, ARM64Reg::X0, 32); BLR(ARM64Reg::X8); } else { - MOVP2R(ARM64Reg::X8, &PowerPC::Write_U64); m_float_emit.UMOV(64, ARM64Reg::X0, RS, 0); + MOVP2R(ARM64Reg::X8, &PowerPC::Write_U64); BLR(ARM64Reg::X8); } } diff --git a/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStoreFloating.cpp b/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStoreFloating.cpp index e881551f64..5056c3b1ca 100644 --- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStoreFloating.cpp +++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStoreFloating.cpp @@ -189,6 +189,7 @@ void JitArm64::stfXX(UGeckoInstruction inst) u32 a = inst.RA, b = inst.RB; + bool want_single = false; s32 offset = inst.SIMM_16; u32 flags = BackPatchInfo::FLAG_STORE; bool update = false; @@ -200,10 +201,12 @@ void JitArm64::stfXX(UGeckoInstruction inst) switch (inst.SUBOP10) { case 663: // stfsx + want_single = true; flags |= BackPatchInfo::FLAG_SIZE_F32; offset_reg = b; break; case 695: // stfsux + want_single = true; flags |= BackPatchInfo::FLAG_SIZE_F32; update = true; offset_reg = b; @@ -218,16 +221,19 @@ void JitArm64::stfXX(UGeckoInstruction inst) offset_reg = b; break; case 983: // stfiwx - flags |= BackPatchInfo::FLAG_SIZE_F32I; + // This instruction writes the lower 32 bits of a double. want_single must be false + flags |= BackPatchInfo::FLAG_SIZE_F32; offset_reg = b; break; } break; case 53: // stfsu + want_single = true; flags |= BackPatchInfo::FLAG_SIZE_F32; update = true; break; case 52: // stfs + want_single = true; flags |= BackPatchInfo::FLAG_SIZE_F32; break; case 55: // stfdu @@ -242,19 +248,22 @@ void JitArm64::stfXX(UGeckoInstruction inst) u32 imm_addr = 0; bool is_immediate = false; - gpr.Lock(ARM64Reg::W0, ARM64Reg::W1, ARM64Reg::W30); fpr.Lock(ARM64Reg::Q0); - const bool single = (flags & BackPatchInfo::FLAG_SIZE_F32) && fpr.IsSingle(inst.FS, true); + const bool have_single = fpr.IsSingle(inst.FS, true); - const ARM64Reg V0 = fpr.R(inst.FS, single ? RegType::LowerPairSingle : RegType::LowerPair); + ARM64Reg V0 = + fpr.R(inst.FS, want_single && have_single ? RegType::LowerPairSingle : RegType::LowerPair); - if (single) + if (want_single && !have_single) { - flags &= ~BackPatchInfo::FLAG_SIZE_F32; - flags |= BackPatchInfo::FLAG_SIZE_F32I; + const ARM64Reg single_reg = fpr.GetReg(); + m_float_emit.FCVT(32, 64, EncodeRegToDouble(single_reg), EncodeRegToDouble(V0)); + V0 = single_reg; } + gpr.Lock(ARM64Reg::W0, ARM64Reg::W1, ARM64Reg::W30); + ARM64Reg addr_reg = ARM64Reg::W1; if (update) @@ -359,19 +368,11 @@ void JitArm64::stfXX(UGeckoInstruction inst) accessSize = 32; LDR(IndexType::Unsigned, ARM64Reg::X0, PPC_REG, PPCSTATE_OFF(gather_pipe_ptr)); + if (flags & BackPatchInfo::FLAG_SIZE_F64) - { m_float_emit.REV64(8, ARM64Reg::Q0, V0); - } else if (flags & BackPatchInfo::FLAG_SIZE_F32) - { - m_float_emit.FCVT(32, 64, ARM64Reg::D0, EncodeRegToDouble(V0)); - m_float_emit.REV32(8, ARM64Reg::D0, ARM64Reg::D0); - } - else if (flags & BackPatchInfo::FLAG_SIZE_F32I) - { m_float_emit.REV32(8, ARM64Reg::D0, V0); - } m_float_emit.STR(accessSize, IndexType::Post, accessSize == 64 ? 
ARM64Reg::Q0 : ARM64Reg::D0, ARM64Reg::X0, accessSize >> 3); @@ -399,6 +400,10 @@ void JitArm64::stfXX(UGeckoInstruction inst) { EmitBackpatchRoutine(flags, jo.fastmem, jo.fastmem, V0, XA, regs_in_use, fprs_in_use); } + + if (want_single && !have_single) + fpr.Unlock(V0); + gpr.Unlock(ARM64Reg::W0, ARM64Reg::W1, ARM64Reg::W30); fpr.Unlock(ARM64Reg::Q0); } diff --git a/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStorePaired.cpp b/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStorePaired.cpp index 1b4fcc3f85..c3778e330e 100644 --- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStorePaired.cpp +++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStorePaired.cpp @@ -116,13 +116,44 @@ void JitArm64::psq_st(UGeckoInstruction inst) const bool update = inst.OPCD == 61; const s32 offset = inst.SIMM_12; - gpr.Lock(ARM64Reg::W0, ARM64Reg::W1, ARM64Reg::W2, ARM64Reg::W30); fpr.Lock(ARM64Reg::Q0, ARM64Reg::Q1); - const bool single = fpr.IsSingle(inst.RS); + const bool have_single = fpr.IsSingle(inst.RS); + + ARM64Reg VS = fpr.R(inst.RS, have_single ? RegType::Single : RegType::Register); + + if (js.assumeNoPairedQuantize) + { + if (!have_single) + { + const ARM64Reg single_reg = fpr.GetReg(); + + if (inst.W) + m_float_emit.FCVT(32, 64, EncodeRegToDouble(single_reg), EncodeRegToDouble(VS)); + else + m_float_emit.FCVTN(32, EncodeRegToDouble(single_reg), EncodeRegToDouble(VS)); + + VS = single_reg; + } + } + else + { + if (have_single) + { + m_float_emit.ORR(ARM64Reg::D0, VS, VS); + } + else + { + if (inst.W) + m_float_emit.FCVT(32, 64, ARM64Reg::D0, VS); + else + m_float_emit.FCVTN(32, ARM64Reg::D0, VS); + } + } + + gpr.Lock(ARM64Reg::W0, ARM64Reg::W1, ARM64Reg::W2, ARM64Reg::W30); const ARM64Reg arm_addr = gpr.R(inst.RA); - const ARM64Reg VS = fpr.R(inst.RS, single ? RegType::Single : RegType::Register); constexpr ARM64Reg scale_reg = ARM64Reg::W0; constexpr ARM64Reg addr_reg = ARM64Reg::W1; @@ -157,28 +188,13 @@ void JitArm64::psq_st(UGeckoInstruction inst) { u32 flags = BackPatchInfo::FLAG_STORE; - if (single) - flags |= (inst.W ? BackPatchInfo::FLAG_SIZE_F32I : BackPatchInfo::FLAG_SIZE_F32X2I); - else - flags |= (inst.W ? BackPatchInfo::FLAG_SIZE_F32 : BackPatchInfo::FLAG_SIZE_F32X2); + flags |= (inst.W ? 
BackPatchInfo::FLAG_SIZE_F32 : BackPatchInfo::FLAG_SIZE_F32X2); EmitBackpatchRoutine(flags, jo.fastmem, jo.fastmem, VS, EncodeRegTo64(addr_reg), gprs_in_use, fprs_in_use); } else { - if (single) - { - m_float_emit.ORR(ARM64Reg::D0, VS, VS); - } - else - { - if (inst.W) - m_float_emit.FCVT(32, 64, ARM64Reg::D0, VS); - else - m_float_emit.FCVTN(32, ARM64Reg::D0, VS); - } - LDR(IndexType::Unsigned, scale_reg, PPC_REG, PPCSTATE_OFF_SPR(SPR_GQR0 + inst.I)); UBFM(type_reg, scale_reg, 0, 2); // Type UBFM(scale_reg, scale_reg, 8, 13); // Scale @@ -212,6 +228,9 @@ void JitArm64::psq_st(UGeckoInstruction inst) SetJumpTarget(continue1); } + if (js.assumeNoPairedQuantize && !have_single) + fpr.Unlock(VS); + gpr.Unlock(ARM64Reg::W0, ARM64Reg::W1, ARM64Reg::W2, ARM64Reg::W30); fpr.Unlock(ARM64Reg::Q0, ARM64Reg::Q1); } diff --git a/Source/Core/Core/PowerPC/JitArmCommon/BackPatch.h b/Source/Core/Core/PowerPC/JitArmCommon/BackPatch.h index 58833eb687..a3a9f8b470 100644 --- a/Source/Core/Core/PowerPC/JitArmCommon/BackPatch.h +++ b/Source/Core/Core/PowerPC/JitArmCommon/BackPatch.h @@ -16,14 +16,11 @@ struct BackPatchInfo FLAG_SIZE_32 = (1 << 4), FLAG_SIZE_F32 = (1 << 5), FLAG_SIZE_F32X2 = (1 << 6), - FLAG_SIZE_F32X2I = (1 << 7), - FLAG_SIZE_F64 = (1 << 8), - FLAG_REVERSE = (1 << 9), - FLAG_EXTEND = (1 << 10), - FLAG_SIZE_F32I = (1 << 11), - FLAG_ZERO_256 = (1 << 12), - FLAG_MASK_FLOAT = - FLAG_SIZE_F32 | FLAG_SIZE_F32X2 | FLAG_SIZE_F32X2I | FLAG_SIZE_F64 | FLAG_SIZE_F32I, + FLAG_SIZE_F64 = (1 << 7), + FLAG_REVERSE = (1 << 8), + FLAG_EXTEND = (1 << 9), + FLAG_ZERO_256 = (1 << 10), + FLAG_MASK_FLOAT = FLAG_SIZE_F32 | FLAG_SIZE_F32X2 | FLAG_SIZE_F64, }; static u32 GetFlagSize(u32 flags) @@ -34,8 +31,10 @@ struct BackPatchInfo return 16; if (flags & FLAG_SIZE_32) return 32; - if (flags & FLAG_SIZE_F32 || flags & FLAG_SIZE_F32I) + if (flags & FLAG_SIZE_F32) return 32; + if (flags & FLAG_SIZE_F32X2) + return 64; if (flags & FLAG_SIZE_F64) return 64; if (flags & FLAG_ZERO_256) From 949686bbe7edaabe3b9239b282b545c3f04047b1 Mon Sep 17 00:00:00 2001 From: JosJuice Date: Thu, 21 Jan 2021 19:11:06 +0100 Subject: [PATCH 03/12] JitArm64: Factor out single/double conversion code to functions Preparation for following commits. This commit intentionally doesn't touch paired stores, since paired stores are supposed to flush to zero. (Consistent with Jit64.) 
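
Note: a quick illustration of the flush-to-zero distinction mentioned above. Once patch 01
lets the JIT actually set FPCR.FZ for non-IEEE mode, a hardware double-to-single conversion
flushes denormal results to zero, which is what the quantized paired stores want but not what
stfs-style stores may do. The snippet below is a standalone host-side sketch (plain C++, not
Dolphin code) showing the bit pattern that has to survive:

#include <cmath>
#include <cstdint>
#include <cstdio>
#include <cstring>

int main()
{
  // 2^-149 is an ordinary normal double (biased exponent 874), but as a single
  // it is the smallest positive denormal.
  const double tiny = std::ldexp(1.0, -149);

  // A bit-exact double->single conversion (what stfs has to produce) yields
  // single bits 0x00000001. With the default FP environment this cast is exact.
  const float as_single = static_cast<float>(tiny);
  uint32_t bits;
  std::memcpy(&bits, &as_single, sizeof(bits));
  std::printf("bit-exact double->single: 0x%08x\n", bits);  // prints 0x00000001

  // Doing the same conversion with FPCR.FZ set would flush the result to +0.0
  // (0x00000000) instead -- acceptable for the quantized paired stores, not for
  // stfs, which is why those keep their flush-to-zero path here.
  return 0;
}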
--- Source/Core/Core/PowerPC/JitArm64/Jit.h | 5 +++++ .../JitArm64/JitArm64_FloatingPoint.cpp | 20 +++++++++++++++++++ .../JitArm64/JitArm64_LoadStoreFloating.cpp | 2 +- .../PowerPC/JitArm64/JitArm64_RegCache.cpp | 19 +++++++++--------- .../Core/PowerPC/JitArm64/JitArm64_RegCache.h | 6 +++++- 5 files changed, 41 insertions(+), 11 deletions(-) diff --git a/Source/Core/Core/PowerPC/JitArm64/Jit.h b/Source/Core/Core/PowerPC/JitArm64/Jit.h index aa5d389827..4f8ea466f4 100644 --- a/Source/Core/Core/PowerPC/JitArm64/Jit.h +++ b/Source/Core/Core/PowerPC/JitArm64/Jit.h @@ -152,6 +152,11 @@ public: void psq_l(UGeckoInstruction inst); void psq_st(UGeckoInstruction inst); + void ConvertDoubleToSingleLower(Arm64Gen::ARM64Reg dest_reg, Arm64Gen::ARM64Reg src_reg); + void ConvertDoubleToSinglePair(Arm64Gen::ARM64Reg dest_reg, Arm64Gen::ARM64Reg src_reg); + void ConvertSingleToDoubleLower(Arm64Gen::ARM64Reg dest_reg, Arm64Gen::ARM64Reg src_reg); + void ConvertSingleToDoublePair(Arm64Gen::ARM64Reg dest_reg, Arm64Gen::ARM64Reg src_reg); + private: struct SlowmemHandler { diff --git a/Source/Core/Core/PowerPC/JitArm64/JitArm64_FloatingPoint.cpp b/Source/Core/Core/PowerPC/JitArm64/JitArm64_FloatingPoint.cpp index b6a99f1c0b..482095d251 100644 --- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_FloatingPoint.cpp +++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_FloatingPoint.cpp @@ -386,3 +386,23 @@ void JitArm64::fctiwzx(UGeckoInstruction inst) ASSERT_MSG(DYNA_REC, b == d || single == fpr.IsSingle(b, true), "Register allocation turned singles into doubles in the middle of fctiwzx"); } + +void JitArm64::ConvertDoubleToSingleLower(ARM64Reg dest_reg, ARM64Reg src_reg) +{ + m_float_emit.FCVT(32, 64, EncodeRegToDouble(dest_reg), EncodeRegToDouble(src_reg)); +} + +void JitArm64::ConvertDoubleToSinglePair(ARM64Reg dest_reg, ARM64Reg src_reg) +{ + m_float_emit.FCVTN(32, EncodeRegToDouble(dest_reg), EncodeRegToDouble(src_reg)); +} + +void JitArm64::ConvertSingleToDoubleLower(ARM64Reg dest_reg, ARM64Reg src_reg) +{ + m_float_emit.FCVT(64, 32, EncodeRegToDouble(dest_reg), EncodeRegToDouble(src_reg)); +} + +void JitArm64::ConvertSingleToDoublePair(ARM64Reg dest_reg, ARM64Reg src_reg) +{ + m_float_emit.FCVTL(64, EncodeRegToDouble(dest_reg), EncodeRegToDouble(src_reg)); +} diff --git a/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStoreFloating.cpp b/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStoreFloating.cpp index 5056c3b1ca..3509df1936 100644 --- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStoreFloating.cpp +++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStoreFloating.cpp @@ -258,7 +258,7 @@ void JitArm64::stfXX(UGeckoInstruction inst) if (want_single && !have_single) { const ARM64Reg single_reg = fpr.GetReg(); - m_float_emit.FCVT(32, 64, EncodeRegToDouble(single_reg), EncodeRegToDouble(V0)); + ConvertDoubleToSingleLower(single_reg, V0); V0 = single_reg; } diff --git a/Source/Core/Core/PowerPC/JitArm64/JitArm64_RegCache.cpp b/Source/Core/Core/PowerPC/JitArm64/JitArm64_RegCache.cpp index e41ffdbfa8..4b2ecd81e7 100644 --- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_RegCache.cpp +++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_RegCache.cpp @@ -17,9 +17,10 @@ using namespace Arm64Gen; -void Arm64RegCache::Init(ARM64XEmitter* emitter) +void Arm64RegCache::Init(JitArm64* jit) { - m_emit = emitter; + m_jit = jit; + m_emit = jit; m_float_emit.reset(new ARM64FloatEmitter(m_emit)); GetAllocationOrder(); } @@ -467,7 +468,7 @@ ARM64Reg Arm64FPRCache::R(size_t preg, RegType type) return host_reg; // Else convert this 
register back to doubles. - m_float_emit->FCVTL(64, EncodeRegToDouble(host_reg), EncodeRegToDouble(host_reg)); + m_jit->ConvertSingleToDoublePair(host_reg, host_reg); reg.Load(host_reg, RegType::Register); [[fallthrough]]; } @@ -482,7 +483,7 @@ ARM64Reg Arm64FPRCache::R(size_t preg, RegType type) return host_reg; // Else convert this register back to a double. - m_float_emit->FCVT(64, 32, EncodeRegToDouble(host_reg), EncodeRegToDouble(host_reg)); + m_jit->ConvertSingleToDoubleLower(host_reg, host_reg); reg.Load(host_reg, RegType::LowerPair); [[fallthrough]]; } @@ -516,7 +517,7 @@ ARM64Reg Arm64FPRCache::R(size_t preg, RegType type) return host_reg; } - m_float_emit->FCVT(64, 32, EncodeRegToDouble(host_reg), EncodeRegToDouble(host_reg)); + m_jit->ConvertSingleToDoubleLower(host_reg, host_reg); reg.Load(host_reg, RegType::Duplicated); [[fallthrough]]; } @@ -593,7 +594,7 @@ ARM64Reg Arm64FPRCache::RW(size_t preg, RegType type) { case RegType::Single: flush_reg = GetReg(); - m_float_emit->FCVTL(64, EncodeRegToDouble(flush_reg), EncodeRegToDouble(host_reg)); + m_jit->ConvertSingleToDoublePair(flush_reg, host_reg); [[fallthrough]]; case RegType::Register: // We are doing a full 128bit store because it takes 2 cycles on a Cortex-A57 to do a 128bit @@ -604,7 +605,7 @@ ARM64Reg Arm64FPRCache::RW(size_t preg, RegType type) break; case RegType::DuplicatedSingle: flush_reg = GetReg(); - m_float_emit->FCVT(64, 32, EncodeRegToDouble(flush_reg), EncodeRegToDouble(host_reg)); + m_jit->ConvertSingleToDoubleLower(flush_reg, host_reg); [[fallthrough]]; case RegType::Duplicated: // Store PSR1 (which is equal to PSR0) in memory. @@ -712,13 +713,13 @@ void Arm64FPRCache::FlushRegister(size_t preg, bool maintain_state) if (type == RegType::Single) { if (dirty) - m_float_emit->FCVTL(64, EncodeRegToDouble(host_reg), EncodeRegToDouble(host_reg)); + m_jit->ConvertSingleToDoublePair(host_reg, host_reg); type = RegType::Register; } if (type == RegType::DuplicatedSingle || type == RegType::LowerPairSingle) { if (dirty) - m_float_emit->FCVT(64, 32, EncodeRegToDouble(host_reg), EncodeRegToDouble(host_reg)); + m_jit->ConvertSingleToDoubleLower(host_reg, host_reg); if (type == RegType::DuplicatedSingle) type = RegType::Duplicated; diff --git a/Source/Core/Core/PowerPC/JitArm64/JitArm64_RegCache.h b/Source/Core/Core/PowerPC/JitArm64/JitArm64_RegCache.h index edee9f2a85..465e8fef67 100644 --- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_RegCache.h +++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_RegCache.h @@ -15,6 +15,8 @@ #include "Core/PowerPC/PPCAnalyst.h" #include "Core/PowerPC/PowerPC.h" +class JitArm64; + // Dedicated host registers // memory base register @@ -150,7 +152,7 @@ public: explicit Arm64RegCache(size_t guest_reg_count) : m_guest_registers(guest_reg_count) {} virtual ~Arm64RegCache() = default; - void Init(Arm64Gen::ARM64XEmitter* emitter); + void Init(JitArm64* jit); virtual void Start(PPCAnalyst::BlockRegStats& stats) {} void DiscardRegisters(BitSet32 regs); @@ -218,6 +220,8 @@ protected: reg.IncrementLastUsed(); } + JitArm64* m_jit = nullptr; + // Code emitter Arm64Gen::ARM64XEmitter* m_emit = nullptr; From 39eccf6603b673816f52beeea88d991195d330af Mon Sep 17 00:00:00 2001 From: JosJuice Date: Mon, 25 Jan 2021 22:41:09 +0100 Subject: [PATCH 04/12] JitArm64: Call RW before FCMPE in fselx Needed because the next commit will make RW clobber flags. 
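
Note on the ordering: fselx implements the select as FCMPE (which sets NZCV) immediately
followed by FCSEL (which reads NZCV). Since the next commit lets the register write-back emit
a conversion that clobbers the flags, everything register-related now has to be resolved
before the compare so the flag producer and consumer stay adjacent. For reference, a tiny
standalone model of the selection the FCMPE/FCSEL pair computes (my own sketch of fsel
semantics, function name mine, not Dolphin code):

#include <cmath>

// d = (a >= 0.0) ? c : b, with a NaN 'a' selecting b. FCSEL with condition GE
// consumes exactly the flags FCMPE just produced, including the unordered case,
// which is why no flag-clobbering code may be emitted in between.
double fsel_model(double a, double b, double c)
{
  return (!std::isnan(a) && a >= 0.0) ? c : b;
}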
--- .../JitArm64/JitArm64_FloatingPoint.cpp | 36 +++++++++---------- 1 file changed, 17 insertions(+), 19 deletions(-) diff --git a/Source/Core/Core/PowerPC/JitArm64/JitArm64_FloatingPoint.cpp b/Source/Core/Core/PowerPC/JitArm64/JitArm64_FloatingPoint.cpp index 482095d251..93ed5aa504 100644 --- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_FloatingPoint.cpp +++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_FloatingPoint.cpp @@ -220,30 +220,28 @@ void JitArm64::fselx(UGeckoInstruction inst) const u32 c = inst.FC; const u32 d = inst.FD; - const bool a_single = fpr.IsSingle(a, true); - if (a_single) - { - const ARM64Reg VA = fpr.R(a, RegType::LowerPairSingle); - m_float_emit.FCMPE(EncodeRegToSingle(VA)); - } - else - { - const ARM64Reg VA = fpr.R(a, RegType::LowerPair); - m_float_emit.FCMPE(EncodeRegToDouble(VA)); - } + const bool b_and_c_singles = fpr.IsSingle(b, true) && fpr.IsSingle(c, true); + const RegType b_and_c_type = b_and_c_singles ? RegType::LowerPairSingle : RegType::LowerPair; + const auto b_and_c_reg_encoder = b_and_c_singles ? EncodeRegToSingle : EncodeRegToDouble; + const bool a_single = fpr.IsSingle(a, true) && (b_and_c_singles || (a != b && a != c)); + const RegType a_type = a_single ? RegType::LowerPairSingle : RegType::LowerPair; + const auto a_reg_encoder = a_single ? EncodeRegToSingle : EncodeRegToDouble; + + const ARM64Reg VA = fpr.R(a, a_type); + const ARM64Reg VB = fpr.R(b, b_and_c_type); + const ARM64Reg VC = fpr.R(c, b_and_c_type); + + // If a == d, the RW call below may change the type of a to double. This is okay, because the + // actual value in the register is not altered by RW. So let's just assert before calling RW. ASSERT_MSG(DYNA_REC, a_single == fpr.IsSingle(a, true), "Register allocation turned singles into doubles in the middle of fselx"); - const bool b_and_c_singles = fpr.IsSingle(b, true) && fpr.IsSingle(c, true); - const RegType type = b_and_c_singles ? RegType::LowerPairSingle : RegType::LowerPair; - const auto reg_encoder = b_and_c_singles ? EncodeRegToSingle : EncodeRegToDouble; + const ARM64Reg VD = fpr.RW(d, b_and_c_type); - const ARM64Reg VB = fpr.R(b, type); - const ARM64Reg VC = fpr.R(c, type); - const ARM64Reg VD = fpr.RW(d, type); - - m_float_emit.FCSEL(reg_encoder(VD), reg_encoder(VC), reg_encoder(VB), CC_GE); + m_float_emit.FCMPE(a_reg_encoder(VA)); + m_float_emit.FCSEL(b_and_c_reg_encoder(VD), b_and_c_reg_encoder(VC), b_and_c_reg_encoder(VB), + CC_GE); ASSERT_MSG(DYNA_REC, b_and_c_singles == (fpr.IsSingle(b, true) && fpr.IsSingle(c, true)), "Register allocation turned singles into doubles in the middle of fselx"); From 6e0a5876ef6ddaef16c71fab9c7606d966707883 Mon Sep 17 00:00:00 2001 From: JosJuice Date: Thu, 21 Jan 2021 22:02:53 +0100 Subject: [PATCH 05/12] JitArm64: Use accurate single/double conversions Our old conversion approach became a lot more inaccurate when enabling flush-to-zero, to the point of obviously breaking games. 
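
Note: the bit-exact conversion being switched to here (and open-coded in assembly two patches
later) is essentially an integer re-packing of the IEEE 754 fields rather than a floating-point
operation, so FPCR.FZ and the rounding mode cannot affect it. Below is a rough C model of the
double-to-single direction, with the exponent thresholds taken from the assembly in the later
ConvertDoubleToSingle patch; treat it as an illustrative sketch (names mine), not the
interpreter's exact source:

#include <cstdint>

uint32_t convert_to_single_model(uint64_t x)
{
  const uint32_t exp = static_cast<uint32_t>((x >> 52) & 0x7ff);

  // Doubles whose biased exponent falls in [874, 896] map to single-precision
  // denormals: materialize the implicit bit and shift the mantissa into place.
  if (exp - 874 <= 896 - 874)  // unsigned compare; exp < 874 wraps and fails
  {
    uint32_t t = 0x80000000u | static_cast<uint32_t>((x & 0x000fffffffffffffULL) >> 21);
    t >>= 905 - exp;
    return t | static_cast<uint32_t>((x >> 32) & 0x80000000u);
  }

  // Everything else (normals in single range, zeroes, infinities, NaNs, and the
  // "undefined" tiny range): keep the sign bit and the exponent MSB, then copy
  // the next 30 bits (low exponent bits plus the top of the mantissa) straight
  // across. No rounding, no flushing, and SNaN payloads pass through untouched.
  return static_cast<uint32_t>(((x >> 32) & 0xc0000000u) | ((x >> 29) & 0x3fffffffu));
}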
--- .../JitArm64/JitArm64_FloatingPoint.cpp | 60 +++++++++++++++++-- 1 file changed, 56 insertions(+), 4 deletions(-) diff --git a/Source/Core/Core/PowerPC/JitArm64/JitArm64_FloatingPoint.cpp b/Source/Core/Core/PowerPC/JitArm64/JitArm64_FloatingPoint.cpp index 93ed5aa504..4a4830ef63 100644 --- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_FloatingPoint.cpp +++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_FloatingPoint.cpp @@ -9,6 +9,7 @@ #include "Core/ConfigManager.h" #include "Core/Core.h" #include "Core/CoreTiming.h" +#include "Core/PowerPC/Interpreter/Interpreter_FPUtils.h" #include "Core/PowerPC/JitArm64/Jit.h" #include "Core/PowerPC/JitArm64/JitArm64_RegCache.h" #include "Core/PowerPC/PPCTables.h" @@ -385,22 +386,73 @@ void JitArm64::fctiwzx(UGeckoInstruction inst) "Register allocation turned singles into doubles in the middle of fctiwzx"); } +// Since the following float conversion functions are used in non-arithmetic PPC float +// instructions, they must convert floats bitexact and never flush denormals to zero or turn SNaNs +// into QNaNs. This means we can't just use FCVT/FCVTL/FCVTN. + +// When calling the conversion functions, we are cheating a little and not +// saving the FPRs since we know the functions happen to not use them. + void JitArm64::ConvertDoubleToSingleLower(ARM64Reg dest_reg, ARM64Reg src_reg) { - m_float_emit.FCVT(32, 64, EncodeRegToDouble(dest_reg), EncodeRegToDouble(src_reg)); + FlushCarry(); + + const BitSet32 gpr_saved = gpr.GetCallerSavedUsed(); + ABI_PushRegisters(gpr_saved); + + m_float_emit.UMOV(64, ARM64Reg::X0, src_reg, 0); + QuickCallFunction(ARM64Reg::X1, &ConvertToSingle); + m_float_emit.INS(32, dest_reg, 0, ARM64Reg::W0); + + ABI_PopRegisters(gpr_saved); } void JitArm64::ConvertDoubleToSinglePair(ARM64Reg dest_reg, ARM64Reg src_reg) { - m_float_emit.FCVTN(32, EncodeRegToDouble(dest_reg), EncodeRegToDouble(src_reg)); + FlushCarry(); + + const BitSet32 gpr_saved = gpr.GetCallerSavedUsed(); + ABI_PushRegisters(gpr_saved); + + m_float_emit.UMOV(64, ARM64Reg::X0, src_reg, 0); + QuickCallFunction(ARM64Reg::X1, &ConvertToSingle); + m_float_emit.INS(32, dest_reg, 0, ARM64Reg::W0); + + m_float_emit.UMOV(64, ARM64Reg::X0, src_reg, 1); + QuickCallFunction(ARM64Reg::X1, &ConvertToSingle); + m_float_emit.INS(32, dest_reg, 1, ARM64Reg::W0); + + ABI_PopRegisters(gpr_saved); } void JitArm64::ConvertSingleToDoubleLower(ARM64Reg dest_reg, ARM64Reg src_reg) { - m_float_emit.FCVT(64, 32, EncodeRegToDouble(dest_reg), EncodeRegToDouble(src_reg)); + FlushCarry(); + + const BitSet32 gpr_saved = gpr.GetCallerSavedUsed(); + ABI_PushRegisters(gpr_saved); + + m_float_emit.UMOV(32, ARM64Reg::W0, src_reg, 0); + QuickCallFunction(ARM64Reg::X1, &ConvertToDouble); + m_float_emit.INS(64, dest_reg, 0, ARM64Reg::X0); + + ABI_PopRegisters(gpr_saved); } void JitArm64::ConvertSingleToDoublePair(ARM64Reg dest_reg, ARM64Reg src_reg) { - m_float_emit.FCVTL(64, EncodeRegToDouble(dest_reg), EncodeRegToDouble(src_reg)); + FlushCarry(); + + const BitSet32 gpr_saved = gpr.GetCallerSavedUsed(); + ABI_PushRegisters(gpr_saved); + + m_float_emit.UMOV(32, ARM64Reg::W0, src_reg, 1); + QuickCallFunction(ARM64Reg::X1, &ConvertToDouble); + m_float_emit.INS(64, dest_reg, 1, ARM64Reg::X0); + + m_float_emit.UMOV(32, ARM64Reg::W0, src_reg, 0); + QuickCallFunction(ARM64Reg::X1, &ConvertToDouble); + m_float_emit.INS(64, dest_reg, 0, ARM64Reg::X0); + + ABI_PopRegisters(gpr_saved); } From 28e4869c432e9f2ebaf90ec1e27aca16bea2dd21 Mon Sep 17 00:00:00 2001 From: JosJuice Date: Sun, 24 Jan 2021 20:18:43 +0100 
Subject: [PATCH 06/12] JitArm64: Optimize ConvertDoubleToSingle --- Source/Core/Core/PowerPC/JitArm64/Jit.h | 2 ++ .../JitArm64/JitArm64_FloatingPoint.cpp | 16 ++++----- Source/Core/Core/PowerPC/JitArm64/JitAsm.cpp | 35 +++++++++++++++++-- 3 files changed, 43 insertions(+), 10 deletions(-) diff --git a/Source/Core/Core/PowerPC/JitArm64/Jit.h b/Source/Core/Core/PowerPC/JitArm64/Jit.h index 4f8ea466f4..9d98910660 100644 --- a/Source/Core/Core/PowerPC/JitArm64/Jit.h +++ b/Source/Core/Core/PowerPC/JitArm64/Jit.h @@ -220,6 +220,8 @@ private: // AsmRoutines void GenerateAsm(); void GenerateCommonAsm(); + void GenerateConvertDoubleToSingle(); + void GenerateQuantizedLoadStores(); // Profiling void BeginTimeProfile(JitBlock* b); diff --git a/Source/Core/Core/PowerPC/JitArm64/JitArm64_FloatingPoint.cpp b/Source/Core/Core/PowerPC/JitArm64/JitArm64_FloatingPoint.cpp index 4a4830ef63..50c9d1f85c 100644 --- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_FloatingPoint.cpp +++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_FloatingPoint.cpp @@ -397,12 +397,12 @@ void JitArm64::ConvertDoubleToSingleLower(ARM64Reg dest_reg, ARM64Reg src_reg) { FlushCarry(); - const BitSet32 gpr_saved = gpr.GetCallerSavedUsed(); + const BitSet32 gpr_saved = gpr.GetCallerSavedUsed() & BitSet32{0, 1, 2, 3, 30}; ABI_PushRegisters(gpr_saved); m_float_emit.UMOV(64, ARM64Reg::X0, src_reg, 0); - QuickCallFunction(ARM64Reg::X1, &ConvertToSingle); - m_float_emit.INS(32, dest_reg, 0, ARM64Reg::W0); + BL(cdts); + m_float_emit.INS(32, dest_reg, 0, ARM64Reg::W1); ABI_PopRegisters(gpr_saved); } @@ -411,16 +411,16 @@ void JitArm64::ConvertDoubleToSinglePair(ARM64Reg dest_reg, ARM64Reg src_reg) { FlushCarry(); - const BitSet32 gpr_saved = gpr.GetCallerSavedUsed(); + const BitSet32 gpr_saved = gpr.GetCallerSavedUsed() & BitSet32{0, 1, 2, 3, 30}; ABI_PushRegisters(gpr_saved); m_float_emit.UMOV(64, ARM64Reg::X0, src_reg, 0); - QuickCallFunction(ARM64Reg::X1, &ConvertToSingle); - m_float_emit.INS(32, dest_reg, 0, ARM64Reg::W0); + BL(cdts); + m_float_emit.INS(32, dest_reg, 0, ARM64Reg::W1); m_float_emit.UMOV(64, ARM64Reg::X0, src_reg, 1); - QuickCallFunction(ARM64Reg::X1, &ConvertToSingle); - m_float_emit.INS(32, dest_reg, 1, ARM64Reg::W0); + BL(cdts); + m_float_emit.INS(32, dest_reg, 1, ARM64Reg::W1); ABI_PopRegisters(gpr_saved); } diff --git a/Source/Core/Core/PowerPC/JitArm64/JitAsm.cpp b/Source/Core/Core/PowerPC/JitArm64/JitAsm.cpp index 3b426a6214..c686c31ce4 100644 --- a/Source/Core/Core/PowerPC/JitArm64/JitAsm.cpp +++ b/Source/Core/Core/PowerPC/JitArm64/JitAsm.cpp @@ -194,6 +194,39 @@ void JitArm64::GenerateAsm() } void JitArm64::GenerateCommonAsm() +{ + GetAsmRoutines()->cdts = GetCodePtr(); + GenerateConvertDoubleToSingle(); + JitRegister::Register(GetAsmRoutines()->cdts, GetCodePtr(), "JIT_cdts"); + + GenerateQuantizedLoadStores(); +} + +// Input in X0, output in W1, clobbers X0-X3 and flags. 
+void JitArm64::GenerateConvertDoubleToSingle() +{ + UBFX(ARM64Reg::X2, ARM64Reg::X0, 52, 11); + SUB(ARM64Reg::W3, ARM64Reg::W2, 874); + CMP(ARM64Reg::W3, 896 - 874); + LSR(ARM64Reg::X1, ARM64Reg::X0, 32); + FixupBranch denormal = B(CCFlags::CC_LS); + + ANDI2R(ARM64Reg::X1, ARM64Reg::X1, 0xc0000000); + BFXIL(ARM64Reg::X1, ARM64Reg::X0, 29, 30); + RET(); + + SetJumpTarget(denormal); + LSR(ARM64Reg::X3, ARM64Reg::X0, 21); + MOVZ(ARM64Reg::X0, 905); + ORRI2R(ARM64Reg::W3, ARM64Reg::W3, 0x80000000); + SUB(ARM64Reg::W2, ARM64Reg::W0, ARM64Reg::W2); + LSRV(ARM64Reg::W2, ARM64Reg::W3, ARM64Reg::W2); + ANDI2R(ARM64Reg::X3, ARM64Reg::X1, 0x80000000); + ORR(ARM64Reg::X1, ARM64Reg::X3, ARM64Reg::X2); + RET(); +} + +void JitArm64::GenerateQuantizedLoadStores() { // X0 is the scale // X1 is address @@ -654,6 +687,4 @@ void JitArm64::GenerateCommonAsm() paired_store_quantized[29] = storeSingleU16Slow; paired_store_quantized[30] = storeSingleS8Slow; paired_store_quantized[31] = storeSingleS16Slow; - - GetAsmRoutines()->mfcr = nullptr; } From 018e24762481fd59e9c3e9e0bf2a52dddfdecdca Mon Sep 17 00:00:00 2001 From: JosJuice Date: Sun, 31 Jan 2021 19:20:02 +0100 Subject: [PATCH 07/12] JitArm64: Optimize ConvertSingleToDouble, part 1 --- Source/Core/Core/PowerPC/JitArm64/Jit.h | 1 + .../JitArm64/JitArm64_FloatingPoint.cpp | 15 +++--- Source/Core/Core/PowerPC/JitArm64/JitAsm.cpp | 46 +++++++++++++++++++ .../Core/PowerPC/JitCommon/JitAsmCommon.h | 1 + 4 files changed, 54 insertions(+), 9 deletions(-) diff --git a/Source/Core/Core/PowerPC/JitArm64/Jit.h b/Source/Core/Core/PowerPC/JitArm64/Jit.h index 9d98910660..cc65155ccd 100644 --- a/Source/Core/Core/PowerPC/JitArm64/Jit.h +++ b/Source/Core/Core/PowerPC/JitArm64/Jit.h @@ -221,6 +221,7 @@ private: void GenerateAsm(); void GenerateCommonAsm(); void GenerateConvertDoubleToSingle(); + void GenerateConvertSingleToDouble(); void GenerateQuantizedLoadStores(); // Profiling diff --git a/Source/Core/Core/PowerPC/JitArm64/JitArm64_FloatingPoint.cpp b/Source/Core/Core/PowerPC/JitArm64/JitArm64_FloatingPoint.cpp index 50c9d1f85c..d3d5f7ddbf 100644 --- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_FloatingPoint.cpp +++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_FloatingPoint.cpp @@ -9,7 +9,6 @@ #include "Core/ConfigManager.h" #include "Core/Core.h" #include "Core/CoreTiming.h" -#include "Core/PowerPC/Interpreter/Interpreter_FPUtils.h" #include "Core/PowerPC/JitArm64/Jit.h" #include "Core/PowerPC/JitArm64/JitArm64_RegCache.h" #include "Core/PowerPC/PPCTables.h" @@ -390,9 +389,6 @@ void JitArm64::fctiwzx(UGeckoInstruction inst) // instructions, they must convert floats bitexact and never flush denormals to zero or turn SNaNs // into QNaNs. This means we can't just use FCVT/FCVTL/FCVTN. -// When calling the conversion functions, we are cheating a little and not -// saving the FPRs since we know the functions happen to not use them. 
- void JitArm64::ConvertDoubleToSingleLower(ARM64Reg dest_reg, ARM64Reg src_reg) { FlushCarry(); @@ -429,11 +425,11 @@ void JitArm64::ConvertSingleToDoubleLower(ARM64Reg dest_reg, ARM64Reg src_reg) { FlushCarry(); - const BitSet32 gpr_saved = gpr.GetCallerSavedUsed(); + const BitSet32 gpr_saved = gpr.GetCallerSavedUsed() & BitSet32{0, 1, 2, 3, 4, 30}; ABI_PushRegisters(gpr_saved); m_float_emit.UMOV(32, ARM64Reg::W0, src_reg, 0); - QuickCallFunction(ARM64Reg::X1, &ConvertToDouble); + BL(cstd); m_float_emit.INS(64, dest_reg, 0, ARM64Reg::X0); ABI_PopRegisters(gpr_saved); @@ -443,15 +439,16 @@ void JitArm64::ConvertSingleToDoublePair(ARM64Reg dest_reg, ARM64Reg src_reg) { FlushCarry(); - const BitSet32 gpr_saved = gpr.GetCallerSavedUsed(); + // Save X0-X4 and X30 if they're in use + const BitSet32 gpr_saved = gpr.GetCallerSavedUsed() & BitSet32{0, 1, 2, 3, 4, 30}; ABI_PushRegisters(gpr_saved); m_float_emit.UMOV(32, ARM64Reg::W0, src_reg, 1); - QuickCallFunction(ARM64Reg::X1, &ConvertToDouble); + BL(cstd); m_float_emit.INS(64, dest_reg, 1, ARM64Reg::X0); m_float_emit.UMOV(32, ARM64Reg::W0, src_reg, 0); - QuickCallFunction(ARM64Reg::X1, &ConvertToDouble); + BL(cstd); m_float_emit.INS(64, dest_reg, 0, ARM64Reg::X0); ABI_PopRegisters(gpr_saved); diff --git a/Source/Core/Core/PowerPC/JitArm64/JitAsm.cpp b/Source/Core/Core/PowerPC/JitArm64/JitAsm.cpp index c686c31ce4..57fa59d9df 100644 --- a/Source/Core/Core/PowerPC/JitArm64/JitAsm.cpp +++ b/Source/Core/Core/PowerPC/JitArm64/JitAsm.cpp @@ -199,6 +199,10 @@ void JitArm64::GenerateCommonAsm() GenerateConvertDoubleToSingle(); JitRegister::Register(GetAsmRoutines()->cdts, GetCodePtr(), "JIT_cdts"); + GetAsmRoutines()->cstd = GetCodePtr(); + GenerateConvertSingleToDouble(); + JitRegister::Register(GetAsmRoutines()->cdts, GetCodePtr(), "JIT_cstd"); + GenerateQuantizedLoadStores(); } @@ -226,6 +230,48 @@ void JitArm64::GenerateConvertDoubleToSingle() RET(); } +// Input in W0, output in X0, clobbers X0-X4 and flags. 
+void JitArm64::GenerateConvertSingleToDouble() +{ + UBFX(ARM64Reg::W1, ARM64Reg::W0, 23, 8); + FixupBranch normal_or_nan = CBNZ(ARM64Reg::W1); + + ANDI2R(ARM64Reg::W1, ARM64Reg::W0, 0x007fffff); + FixupBranch denormal = CBNZ(ARM64Reg::W1); + + // Zero + LSL(ARM64Reg::X0, ARM64Reg::X0, 32); + RET(); + + SetJumpTarget(denormal); + ANDI2R(ARM64Reg::W2, ARM64Reg::W0, 0x80000000); + CLZ(ARM64Reg::X3, ARM64Reg::X1); + LSL(ARM64Reg::X2, ARM64Reg::X2, 32); + ORRI2R(ARM64Reg::X4, ARM64Reg::X3, 0xffffffffffffffc0); + SUB(ARM64Reg::X2, ARM64Reg::X2, ARM64Reg::X3, ArithOption(ARM64Reg::X3, ShiftType::LSL, 52)); + ADD(ARM64Reg::X3, ARM64Reg::X4, 23); + LSLV(ARM64Reg::X1, ARM64Reg::X1, ARM64Reg::X3); + BFI(ARM64Reg::X2, ARM64Reg::X1, 30, 22); + MOVI2R(ARM64Reg::X1, 0x3a90000000000000); + ADD(ARM64Reg::X0, ARM64Reg::X2, ARM64Reg::X1); + RET(); + + SetJumpTarget(normal_or_nan); + CMP(ARM64Reg::W1, 0xff); + ANDI2R(ARM64Reg::W2, ARM64Reg::W0, 0x40000000); + CSET(ARM64Reg::W4, CCFlags::CC_NEQ); + ANDI2R(ARM64Reg::W3, ARM64Reg::W0, 0xc0000000); + EOR(ARM64Reg::W2, ARM64Reg::W4, ARM64Reg::W2, ArithOption(ARM64Reg::W2, ShiftType::LSR, 30)); + MOVI2R(ARM64Reg::X1, 0x3800000000000000); + ANDI2R(ARM64Reg::W4, ARM64Reg::W0, 0x3fffffff); + LSL(ARM64Reg::X3, ARM64Reg::X3, 32); + CMP(ARM64Reg::W2, 0); + CSEL(ARM64Reg::X1, ARM64Reg::X1, ARM64Reg::ZR, CCFlags::CC_NEQ); + BFI(ARM64Reg::X3, ARM64Reg::X4, 29, 30); + ORR(ARM64Reg::X0, ARM64Reg::X3, ARM64Reg::X1); + RET(); +} + void JitArm64::GenerateQuantizedLoadStores() { // X0 is the scale diff --git a/Source/Core/Core/PowerPC/JitCommon/JitAsmCommon.h b/Source/Core/Core/PowerPC/JitCommon/JitAsmCommon.h index d8e22a0a3a..c525e7849c 100644 --- a/Source/Core/Core/PowerPC/JitCommon/JitAsmCommon.h +++ b/Source/Core/Core/PowerPC/JitCommon/JitAsmCommon.h @@ -26,6 +26,7 @@ struct CommonAsmRoutinesBase const u8* fres; const u8* mfcr; const u8* cdts; + const u8* cstd; // In: array index: GQR to use. // In: ECX: Address to read from. From 1d106ceaf5615f9e7fc3697776c27dcfd9431e5d Mon Sep 17 00:00:00 2001 From: JosJuice Date: Mon, 1 Feb 2021 22:14:16 +0100 Subject: [PATCH 08/12] JitArm64: Optimize ConvertSingleToDouble, part 2 If we can prove that FCVT will provide a correct conversion, we can use FCVT. This makes the common case a bit faster and the less likely cases (unfortunately including zero, which FCVT actually can convert correctly) a bit slower. 
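
Note: the "prove FCVT is correct" test is a flush-to-zero-aware check that the input is a
nonzero, non-NaN value (FABS plus a compare against zero for the lower element, MOVI plus
FACGT for the pair). With FPCR.FZ set, a denormal input compares equal to zero and a NaN
compares unordered, so both fall through to the slow bit-exact routine, and, as the message
says, plain zeroes unfortunately do too. A hypothetical scalar model of the predicate,
assuming FZ is in effect (my own sketch, names mine, not Dolphin code):

#include <cmath>
#include <cstdint>
#include <cstring>

// True only for inputs where plain FCVT widening is already bit-exact: normal
// (or infinite) values. NaNs fail because the compare is unordered; denormals
// fail because FZ treats them as zero; zeroes fail as collateral damage.
bool can_use_fcvt(float x)
{
  uint32_t bits;
  std::memcpy(&bits, &x, sizeof(bits));

  const bool zero_or_denormal = (bits & 0x7f800000u) == 0;  // exponent field == 0
  return !std::isnan(x) && !zero_or_denormal;
}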
--- Source/Core/Common/Arm64Emitter.cpp | 8 ++ Source/Core/Common/Arm64Emitter.h | 2 + Source/Core/Core/PowerPC/JitArm64/Jit.h | 11 ++- .../JitArm64/JitArm64_FloatingPoint.cpp | 97 ++++++++++++++++++- .../PowerPC/JitArm64/JitArm64_RegCache.cpp | 29 ++++-- .../Core/PowerPC/JitArm64/JitArm64_RegCache.h | 6 +- 6 files changed, 139 insertions(+), 14 deletions(-) diff --git a/Source/Core/Common/Arm64Emitter.cpp b/Source/Core/Common/Arm64Emitter.cpp index 837d7efdc8..1a718d1e3e 100644 --- a/Source/Core/Common/Arm64Emitter.cpp +++ b/Source/Core/Common/Arm64Emitter.cpp @@ -3601,6 +3601,14 @@ void ARM64FloatEmitter::FCMLT(u8 size, ARM64Reg Rd, ARM64Reg Rn) { Emit2RegMisc(IsQuad(Rd), 0, 2 | (size >> 6), 0xE, Rd, Rn); } +void ARM64FloatEmitter::FACGE(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + EmitThreeSame(1, size >> 6, 0x1D, Rd, Rn, Rm); +} +void ARM64FloatEmitter::FACGT(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + EmitThreeSame(1, 2 | (size >> 6), 0x1D, Rd, Rn, Rm); +} void ARM64FloatEmitter::FCSEL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, CCFlags cond) { diff --git a/Source/Core/Common/Arm64Emitter.h b/Source/Core/Common/Arm64Emitter.h index a076098cb2..58caec8d08 100644 --- a/Source/Core/Common/Arm64Emitter.h +++ b/Source/Core/Common/Arm64Emitter.h @@ -1094,6 +1094,8 @@ public: void FCMGT(u8 size, ARM64Reg Rd, ARM64Reg Rn); void FCMLE(u8 size, ARM64Reg Rd, ARM64Reg Rn); void FCMLT(u8 size, ARM64Reg Rd, ARM64Reg Rn); + void FACGE(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void FACGT(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); // Conditional select void FCSEL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, CCFlags cond); diff --git a/Source/Core/Core/PowerPC/JitArm64/Jit.h b/Source/Core/Core/PowerPC/JitArm64/Jit.h index cc65155ccd..1c60ae0aaf 100644 --- a/Source/Core/Core/PowerPC/JitArm64/Jit.h +++ b/Source/Core/Core/PowerPC/JitArm64/Jit.h @@ -154,8 +154,10 @@ public: void ConvertDoubleToSingleLower(Arm64Gen::ARM64Reg dest_reg, Arm64Gen::ARM64Reg src_reg); void ConvertDoubleToSinglePair(Arm64Gen::ARM64Reg dest_reg, Arm64Gen::ARM64Reg src_reg); - void ConvertSingleToDoubleLower(Arm64Gen::ARM64Reg dest_reg, Arm64Gen::ARM64Reg src_reg); - void ConvertSingleToDoublePair(Arm64Gen::ARM64Reg dest_reg, Arm64Gen::ARM64Reg src_reg); + void ConvertSingleToDoubleLower(Arm64Gen::ARM64Reg dest_reg, Arm64Gen::ARM64Reg src_reg, + Arm64Gen::ARM64Reg scratch_reg = Arm64Gen::ARM64Reg::INVALID_REG); + void ConvertSingleToDoublePair(Arm64Gen::ARM64Reg dest_reg, Arm64Gen::ARM64Reg src_reg, + Arm64Gen::ARM64Reg scratch_reg = Arm64Gen::ARM64Reg::INVALID_REG); private: struct SlowmemHandler @@ -189,14 +191,18 @@ private: nearcode = GetWritableCodePtr(); SetCodePtrUnsafe(farcode.GetWritableCodePtr()); AlignCode16(); + m_in_farcode = true; } void SwitchToNearCode() { farcode.SetCodePtrUnsafe(GetWritableCodePtr()); SetCodePtrUnsafe(nearcode); + m_in_farcode = false; } + bool IsInFarCode() const { return m_in_farcode; } + // Dump a memory range of code void DumpCode(const u8* start, const u8* end); @@ -262,6 +268,7 @@ private: Arm64Gen::ARM64CodeBlock farcode; u8* nearcode; // Backed up when we switch to far code. 
+ bool m_in_farcode = false; bool m_enable_blr_optimization; bool m_cleanup_after_stackfault = false; diff --git a/Source/Core/Core/PowerPC/JitArm64/JitArm64_FloatingPoint.cpp b/Source/Core/Core/PowerPC/JitArm64/JitArm64_FloatingPoint.cpp index d3d5f7ddbf..59e27431cd 100644 --- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_FloatingPoint.cpp +++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_FloatingPoint.cpp @@ -421,10 +421,35 @@ void JitArm64::ConvertDoubleToSinglePair(ARM64Reg dest_reg, ARM64Reg src_reg) ABI_PopRegisters(gpr_saved); } -void JitArm64::ConvertSingleToDoubleLower(ARM64Reg dest_reg, ARM64Reg src_reg) +void JitArm64::ConvertSingleToDoubleLower(ARM64Reg dest_reg, ARM64Reg src_reg, ARM64Reg scratch_reg) { + ASSERT(scratch_reg != src_reg); + + const bool switch_to_farcode = !IsInFarCode(); + FlushCarry(); + // Do we know that the input isn't NaN, and that the input isn't denormal or FPCR.FZ is not set? + // (This check unfortunately also catches zeroes) + + FixupBranch fast; + if (scratch_reg != ARM64Reg::INVALID_REG) + { + m_float_emit.FABS(EncodeRegToSingle(scratch_reg), EncodeRegToSingle(src_reg)); + m_float_emit.FCMP(EncodeRegToSingle(scratch_reg)); + fast = B(CCFlags::CC_GT); + + if (switch_to_farcode) + { + FixupBranch slow = B(); + + SwitchToFarCode(); + SetJumpTarget(slow); + } + } + + // If no (or if we don't have a scratch register), call the bit-exact routine + const BitSet32 gpr_saved = gpr.GetCallerSavedUsed() & BitSet32{0, 1, 2, 3, 4, 30}; ABI_PushRegisters(gpr_saved); @@ -433,12 +458,65 @@ void JitArm64::ConvertSingleToDoubleLower(ARM64Reg dest_reg, ARM64Reg src_reg) m_float_emit.INS(64, dest_reg, 0, ARM64Reg::X0); ABI_PopRegisters(gpr_saved); + + // If yes, do a fast conversion with FCVT + + if (scratch_reg != ARM64Reg::INVALID_REG) + { + FixupBranch continue1 = B(); + + if (switch_to_farcode) + SwitchToNearCode(); + + SetJumpTarget(fast); + + m_float_emit.FCVT(64, 32, EncodeRegToDouble(dest_reg), EncodeRegToDouble(src_reg)); + + SetJumpTarget(continue1); + } } -void JitArm64::ConvertSingleToDoublePair(ARM64Reg dest_reg, ARM64Reg src_reg) +void JitArm64::ConvertSingleToDoublePair(ARM64Reg dest_reg, ARM64Reg src_reg, ARM64Reg scratch_reg) { + ASSERT(scratch_reg != src_reg); + + const bool switch_to_farcode = !IsInFarCode(); + FlushCarry(); + // Do we know that neither input is NaN, and that neither input is denormal or FPCR.FZ is not set? + // (This check unfortunately also catches zeroes) + + FixupBranch fast; + if (scratch_reg != ARM64Reg::INVALID_REG) + { + // Set each 32-bit element of scratch_reg to 0x0000'0000 or 0xFFFF'FFFF depending on whether + // the absolute value of the corresponding element in src_reg compares greater than 0 + m_float_emit.MOVI(8, EncodeRegToDouble(scratch_reg), 0); + m_float_emit.FACGT(32, EncodeRegToDouble(scratch_reg), EncodeRegToDouble(src_reg), + EncodeRegToDouble(scratch_reg)); + + // 0x0000'0000'0000'0000 (zero) -> 0x0000'0000'0000'0000 (zero) + // 0x0000'0000'FFFF'FFFF (denormal) -> 0xFF00'0000'FFFF'FFFF (normal) + // 0xFFFF'FFFF'0000'0000 (NaN) -> 0x00FF'FFFF'0000'0000 (normal) + // 0xFFFF'FFFF'FFFF'FFFF (NaN) -> 0xFFFF'FFFF'FFFF'FFFF (NaN) + m_float_emit.INS(8, EncodeRegToDouble(scratch_reg), 7, EncodeRegToDouble(scratch_reg), 0); + + // Is scratch_reg a NaN (0xFFFF'FFFF'FFFF'FFFF)? 
+ m_float_emit.FCMP(EncodeRegToDouble(scratch_reg)); + fast = B(CCFlags::CC_VS); + + if (switch_to_farcode) + { + FixupBranch slow = B(); + + SwitchToFarCode(); + SetJumpTarget(slow); + } + } + + // If no (or if we don't have a scratch register), call the bit-exact routine + // Save X0-X4 and X30 if they're in use const BitSet32 gpr_saved = gpr.GetCallerSavedUsed() & BitSet32{0, 1, 2, 3, 4, 30}; ABI_PushRegisters(gpr_saved); @@ -452,4 +530,19 @@ void JitArm64::ConvertSingleToDoublePair(ARM64Reg dest_reg, ARM64Reg src_reg) m_float_emit.INS(64, dest_reg, 0, ARM64Reg::X0); ABI_PopRegisters(gpr_saved); + + // If yes, do a fast conversion with FCVTL + + if (scratch_reg != ARM64Reg::INVALID_REG) + { + FixupBranch continue1 = B(); + + if (switch_to_farcode) + SwitchToNearCode(); + + SetJumpTarget(fast); + m_float_emit.FCVTL(64, EncodeRegToDouble(dest_reg), EncodeRegToDouble(src_reg)); + + SetJumpTarget(continue1); + } } diff --git a/Source/Core/Core/PowerPC/JitArm64/JitArm64_RegCache.cpp b/Source/Core/Core/PowerPC/JitArm64/JitArm64_RegCache.cpp index 4b2ecd81e7..1363863286 100644 --- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_RegCache.cpp +++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_RegCache.cpp @@ -468,7 +468,10 @@ ARM64Reg Arm64FPRCache::R(size_t preg, RegType type) return host_reg; // Else convert this register back to doubles. - m_jit->ConvertSingleToDoublePair(host_reg, host_reg); + const ARM64Reg tmp_reg = GetReg(); + m_jit->ConvertSingleToDoublePair(host_reg, host_reg, tmp_reg); + UnlockRegister(tmp_reg); + reg.Load(host_reg, RegType::Register); [[fallthrough]]; } @@ -483,7 +486,10 @@ ARM64Reg Arm64FPRCache::R(size_t preg, RegType type) return host_reg; // Else convert this register back to a double. - m_jit->ConvertSingleToDoubleLower(host_reg, host_reg); + const ARM64Reg tmp_reg = GetReg(); + m_jit->ConvertSingleToDoubleLower(host_reg, host_reg, tmp_reg); + UnlockRegister(tmp_reg); + reg.Load(host_reg, RegType::LowerPair); [[fallthrough]]; } @@ -517,7 +523,10 @@ ARM64Reg Arm64FPRCache::R(size_t preg, RegType type) return host_reg; } - m_jit->ConvertSingleToDoubleLower(host_reg, host_reg); + const ARM64Reg tmp_reg = GetReg(); + m_jit->ConvertSingleToDoubleLower(host_reg, host_reg, tmp_reg); + UnlockRegister(tmp_reg); + reg.Load(host_reg, RegType::Duplicated); [[fallthrough]]; } @@ -594,7 +603,7 @@ ARM64Reg Arm64FPRCache::RW(size_t preg, RegType type) { case RegType::Single: flush_reg = GetReg(); - m_jit->ConvertSingleToDoublePair(flush_reg, host_reg); + m_jit->ConvertSingleToDoublePair(flush_reg, host_reg, flush_reg); [[fallthrough]]; case RegType::Register: // We are doing a full 128bit store because it takes 2 cycles on a Cortex-A57 to do a 128bit @@ -605,7 +614,7 @@ ARM64Reg Arm64FPRCache::RW(size_t preg, RegType type) break; case RegType::DuplicatedSingle: flush_reg = GetReg(); - m_jit->ConvertSingleToDoubleLower(flush_reg, host_reg); + m_jit->ConvertSingleToDoubleLower(flush_reg, host_reg, flush_reg); [[fallthrough]]; case RegType::Duplicated: // Store PSR1 (which is equal to PSR0) in memory. @@ -709,17 +718,20 @@ void Arm64FPRCache::FlushRegister(size_t preg, bool maintain_state) const bool dirty = reg.IsDirty(); RegType type = reg.GetType(); + // If FlushRegister calls GetReg with all registers locked, we can get infinite recursion + const ARM64Reg tmp_reg = GetUnlockedRegisterCount() > 0 ? GetReg() : ARM64Reg::INVALID_REG; + // If we're in single mode, just convert it back to a double. 
if (type == RegType::Single) { if (dirty) - m_jit->ConvertSingleToDoublePair(host_reg, host_reg); + m_jit->ConvertSingleToDoublePair(host_reg, host_reg, tmp_reg); type = RegType::Register; } if (type == RegType::DuplicatedSingle || type == RegType::LowerPairSingle) { if (dirty) - m_jit->ConvertSingleToDoubleLower(host_reg, host_reg); + m_jit->ConvertSingleToDoubleLower(host_reg, host_reg, tmp_reg); if (type == RegType::DuplicatedSingle) type = RegType::Duplicated; @@ -771,6 +783,9 @@ void Arm64FPRCache::FlushRegister(size_t preg, bool maintain_state) reg.Flush(); } } + + if (tmp_reg != ARM64Reg::INVALID_REG) + UnlockRegister(tmp_reg); } void Arm64FPRCache::FlushRegisters(BitSet32 regs, bool maintain_state) diff --git a/Source/Core/Core/PowerPC/JitArm64/JitArm64_RegCache.h b/Source/Core/Core/PowerPC/JitArm64/JitArm64_RegCache.h index 465e8fef67..8375687c87 100644 --- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_RegCache.h +++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_RegCache.h @@ -168,6 +168,9 @@ public: void UpdateLastUsed(BitSet32 regs_used); + // Get available host registers + u32 GetUnlockedRegisterCount() const; + // Locks a register so a cache cannot use it // Useful for function calls template @@ -211,9 +214,6 @@ protected: void DiscardRegister(size_t preg); virtual void FlushRegister(size_t preg, bool maintain_state) = 0; - // Get available host registers - u32 GetUnlockedRegisterCount() const; - void IncrementAllUsed() { for (auto& reg : m_guest_registers) From 2a9d88739c6c9c3e7a9fc5f3443e0cb07c4eefe0 Mon Sep 17 00:00:00 2001 From: JosJuice Date: Tue, 2 Feb 2021 21:43:50 +0100 Subject: [PATCH 09/12] JitArm64: Skip accurate single/double conversion if store-safe --- Source/Core/Core/PowerPC/Jit64/Jit.cpp | 3 ++ .../PowerPC/Jit64/Jit_LoadStoreFloating.cpp | 2 +- Source/Core/Core/PowerPC/JitArm64/Jit.cpp | 4 +++ Source/Core/Core/PowerPC/JitArm64/Jit.h | 12 ++++--- .../JitArm64/JitArm64_FloatingPoint.cpp | 34 ++++++++++++++++--- .../JitArm64/JitArm64_LoadStoreFloating.cpp | 2 +- .../PowerPC/JitArm64/JitArm64_RegCache.cpp | 18 +++++----- Source/Core/Core/PowerPC/JitCommon/JitBase.h | 2 ++ Source/Core/Core/PowerPC/PPCAnalyst.cpp | 3 +- Source/Core/Core/PowerPC/PPCAnalyst.h | 3 +- 10 files changed, 62 insertions(+), 21 deletions(-) diff --git a/Source/Core/Core/PowerPC/Jit64/Jit.cpp b/Source/Core/Core/PowerPC/Jit64/Jit.cpp index ac8b60afee..ed69b5d4cd 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit.cpp +++ b/Source/Core/Core/PowerPC/Jit64/Jit.cpp @@ -982,6 +982,7 @@ bool Jit64::DoJit(u32 em_address, JitBlock* b, u32 nextPC) js.compilerPC = op.address; js.op = &op; + js.fpr_is_store_safe = op.fprIsStoreSafeBeforeInst; js.instructionNumber = i; js.instructionsLeft = (code_block.m_num_instructions - 1) - i; const GekkoOPInfo* opinfo = op.opinfo; @@ -1118,6 +1119,8 @@ bool Jit64::DoJit(u32 em_address, JitBlock* b, u32 nextPC) CompileInstruction(op); + js.fpr_is_store_safe = op.fprIsStoreSafeAfterInst; + if (jo.memcheck && (opinfo->flags & FL_LOADSTORE)) { // If we have a fastmem loadstore, we can omit the exception check and let fastmem handle diff --git a/Source/Core/Core/PowerPC/Jit64/Jit_LoadStoreFloating.cpp b/Source/Core/Core/PowerPC/Jit64/Jit_LoadStoreFloating.cpp index 2ce40f08c8..347f67e2c6 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit_LoadStoreFloating.cpp +++ b/Source/Core/Core/PowerPC/Jit64/Jit_LoadStoreFloating.cpp @@ -105,7 +105,7 @@ void Jit64::stfXXX(UGeckoInstruction inst) if (single) { - if (js.op->fprIsStoreSafe[s]) + if (js.fpr_is_store_safe[s]) { RCOpArg Rs = 
fpr.Use(s, RCMode::Read); RegCache::Realize(Rs); diff --git a/Source/Core/Core/PowerPC/JitArm64/Jit.cpp b/Source/Core/Core/PowerPC/JitArm64/Jit.cpp index 9c1b98d2f0..2d838d7ba6 100644 --- a/Source/Core/Core/PowerPC/JitArm64/Jit.cpp +++ b/Source/Core/Core/PowerPC/JitArm64/Jit.cpp @@ -695,6 +695,7 @@ void JitArm64::DoJit(u32 em_address, JitBlock* b, u32 nextPC) js.compilerPC = op.address; js.op = &op; + js.fpr_is_store_safe = op.fprIsStoreSafeBeforeInst; js.instructionNumber = i; js.instructionsLeft = (code_block.m_num_instructions - 1) - i; const GekkoOPInfo* opinfo = op.opinfo; @@ -830,6 +831,9 @@ void JitArm64::DoJit(u32 em_address, JitBlock* b, u32 nextPC) } CompileInstruction(op); + + js.fpr_is_store_safe = op.fprIsStoreSafeAfterInst; + if (!CanMergeNextInstructions(1) || js.op[1].opinfo->type != ::OpType::Integer) FlushCarry(); diff --git a/Source/Core/Core/PowerPC/JitArm64/Jit.h b/Source/Core/Core/PowerPC/JitArm64/Jit.h index 1c60ae0aaf..f8b4b5f146 100644 --- a/Source/Core/Core/PowerPC/JitArm64/Jit.h +++ b/Source/Core/Core/PowerPC/JitArm64/Jit.h @@ -152,11 +152,15 @@ public: void psq_l(UGeckoInstruction inst); void psq_st(UGeckoInstruction inst); - void ConvertDoubleToSingleLower(Arm64Gen::ARM64Reg dest_reg, Arm64Gen::ARM64Reg src_reg); - void ConvertDoubleToSinglePair(Arm64Gen::ARM64Reg dest_reg, Arm64Gen::ARM64Reg src_reg); - void ConvertSingleToDoubleLower(Arm64Gen::ARM64Reg dest_reg, Arm64Gen::ARM64Reg src_reg, + void ConvertDoubleToSingleLower(size_t guest_reg, Arm64Gen::ARM64Reg dest_reg, + Arm64Gen::ARM64Reg src_reg); + void ConvertDoubleToSinglePair(size_t guest_reg, Arm64Gen::ARM64Reg dest_reg, + Arm64Gen::ARM64Reg src_reg); + void ConvertSingleToDoubleLower(size_t guest_reg, Arm64Gen::ARM64Reg dest_reg, + Arm64Gen::ARM64Reg src_reg, Arm64Gen::ARM64Reg scratch_reg = Arm64Gen::ARM64Reg::INVALID_REG); - void ConvertSingleToDoublePair(Arm64Gen::ARM64Reg dest_reg, Arm64Gen::ARM64Reg src_reg, + void ConvertSingleToDoublePair(size_t guest_reg, Arm64Gen::ARM64Reg dest_reg, + Arm64Gen::ARM64Reg src_reg, Arm64Gen::ARM64Reg scratch_reg = Arm64Gen::ARM64Reg::INVALID_REG); private: diff --git a/Source/Core/Core/PowerPC/JitArm64/JitArm64_FloatingPoint.cpp b/Source/Core/Core/PowerPC/JitArm64/JitArm64_FloatingPoint.cpp index 59e27431cd..8d3553afd7 100644 --- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_FloatingPoint.cpp +++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_FloatingPoint.cpp @@ -389,8 +389,14 @@ void JitArm64::fctiwzx(UGeckoInstruction inst) // instructions, they must convert floats bitexact and never flush denormals to zero or turn SNaNs // into QNaNs. This means we can't just use FCVT/FCVTL/FCVTN. 
-void JitArm64::ConvertDoubleToSingleLower(ARM64Reg dest_reg, ARM64Reg src_reg) +void JitArm64::ConvertDoubleToSingleLower(size_t guest_reg, ARM64Reg dest_reg, ARM64Reg src_reg) { + if (js.fpr_is_store_safe[guest_reg]) + { + m_float_emit.FCVT(32, 64, EncodeRegToDouble(dest_reg), EncodeRegToDouble(src_reg)); + return; + } + FlushCarry(); const BitSet32 gpr_saved = gpr.GetCallerSavedUsed() & BitSet32{0, 1, 2, 3, 30}; @@ -403,8 +409,14 @@ void JitArm64::ConvertDoubleToSingleLower(ARM64Reg dest_reg, ARM64Reg src_reg) ABI_PopRegisters(gpr_saved); } -void JitArm64::ConvertDoubleToSinglePair(ARM64Reg dest_reg, ARM64Reg src_reg) +void JitArm64::ConvertDoubleToSinglePair(size_t guest_reg, ARM64Reg dest_reg, ARM64Reg src_reg) { + if (js.fpr_is_store_safe[guest_reg]) + { + m_float_emit.FCVTN(32, EncodeRegToDouble(dest_reg), EncodeRegToDouble(src_reg)); + return; + } + FlushCarry(); const BitSet32 gpr_saved = gpr.GetCallerSavedUsed() & BitSet32{0, 1, 2, 3, 30}; @@ -421,10 +433,17 @@ void JitArm64::ConvertDoubleToSinglePair(ARM64Reg dest_reg, ARM64Reg src_reg) ABI_PopRegisters(gpr_saved); } -void JitArm64::ConvertSingleToDoubleLower(ARM64Reg dest_reg, ARM64Reg src_reg, ARM64Reg scratch_reg) +void JitArm64::ConvertSingleToDoubleLower(size_t guest_reg, ARM64Reg dest_reg, ARM64Reg src_reg, + ARM64Reg scratch_reg) { ASSERT(scratch_reg != src_reg); + if (js.fpr_is_store_safe[guest_reg]) + { + m_float_emit.FCVT(64, 32, EncodeRegToDouble(dest_reg), EncodeRegToDouble(src_reg)); + return; + } + const bool switch_to_farcode = !IsInFarCode(); FlushCarry(); @@ -476,10 +495,17 @@ void JitArm64::ConvertSingleToDoubleLower(ARM64Reg dest_reg, ARM64Reg src_reg, A } } -void JitArm64::ConvertSingleToDoublePair(ARM64Reg dest_reg, ARM64Reg src_reg, ARM64Reg scratch_reg) +void JitArm64::ConvertSingleToDoublePair(size_t guest_reg, ARM64Reg dest_reg, ARM64Reg src_reg, + ARM64Reg scratch_reg) { ASSERT(scratch_reg != src_reg); + if (js.fpr_is_store_safe[guest_reg]) + { + m_float_emit.FCVTL(64, EncodeRegToDouble(dest_reg), EncodeRegToDouble(src_reg)); + return; + } + const bool switch_to_farcode = !IsInFarCode(); FlushCarry(); diff --git a/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStoreFloating.cpp b/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStoreFloating.cpp index 3509df1936..068d61d0fb 100644 --- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStoreFloating.cpp +++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStoreFloating.cpp @@ -258,7 +258,7 @@ void JitArm64::stfXX(UGeckoInstruction inst) if (want_single && !have_single) { const ARM64Reg single_reg = fpr.GetReg(); - ConvertDoubleToSingleLower(single_reg, V0); + ConvertDoubleToSingleLower(inst.FS, single_reg, V0); V0 = single_reg; } diff --git a/Source/Core/Core/PowerPC/JitArm64/JitArm64_RegCache.cpp b/Source/Core/Core/PowerPC/JitArm64/JitArm64_RegCache.cpp index 1363863286..3715c897d2 100644 --- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_RegCache.cpp +++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_RegCache.cpp @@ -469,7 +469,7 @@ ARM64Reg Arm64FPRCache::R(size_t preg, RegType type) // Else convert this register back to doubles. const ARM64Reg tmp_reg = GetReg(); - m_jit->ConvertSingleToDoublePair(host_reg, host_reg, tmp_reg); + m_jit->ConvertSingleToDoublePair(preg, host_reg, host_reg, tmp_reg); UnlockRegister(tmp_reg); reg.Load(host_reg, RegType::Register); @@ -487,7 +487,7 @@ ARM64Reg Arm64FPRCache::R(size_t preg, RegType type) // Else convert this register back to a double. 
const ARM64Reg tmp_reg = GetReg(); - m_jit->ConvertSingleToDoubleLower(host_reg, host_reg, tmp_reg); + m_jit->ConvertSingleToDoubleLower(preg, host_reg, host_reg, tmp_reg); UnlockRegister(tmp_reg); reg.Load(host_reg, RegType::LowerPair); @@ -524,7 +524,7 @@ ARM64Reg Arm64FPRCache::R(size_t preg, RegType type) } const ARM64Reg tmp_reg = GetReg(); - m_jit->ConvertSingleToDoubleLower(host_reg, host_reg, tmp_reg); + m_jit->ConvertSingleToDoubleLower(preg, host_reg, host_reg, tmp_reg); UnlockRegister(tmp_reg); reg.Load(host_reg, RegType::Duplicated); @@ -594,7 +594,7 @@ ARM64Reg Arm64FPRCache::RW(size_t preg, RegType type) if ((type == RegType::LowerPair || type == RegType::LowerPairSingle) && was_dirty) { // We must *not* change host_reg as this register might still be in use. So it's fine to - // store this register, but it's *not* fine to convert it to double. So for double convertion, + // store this register, but it's *not* fine to convert it to double. So for double conversion, // a temporary register needs to be used. ARM64Reg host_reg = reg.GetReg(); ARM64Reg flush_reg = host_reg; @@ -603,7 +603,7 @@ ARM64Reg Arm64FPRCache::RW(size_t preg, RegType type) { case RegType::Single: flush_reg = GetReg(); - m_jit->ConvertSingleToDoublePair(flush_reg, host_reg, flush_reg); + m_jit->ConvertSingleToDoublePair(preg, flush_reg, host_reg, flush_reg); [[fallthrough]]; case RegType::Register: // We are doing a full 128bit store because it takes 2 cycles on a Cortex-A57 to do a 128bit @@ -614,7 +614,7 @@ ARM64Reg Arm64FPRCache::RW(size_t preg, RegType type) break; case RegType::DuplicatedSingle: flush_reg = GetReg(); - m_jit->ConvertSingleToDoubleLower(flush_reg, host_reg, flush_reg); + m_jit->ConvertSingleToDoubleLower(preg, flush_reg, host_reg, flush_reg); [[fallthrough]]; case RegType::Duplicated: // Store PSR1 (which is equal to PSR0) in memory. 
@@ -725,13 +725,13 @@ void Arm64FPRCache::FlushRegister(size_t preg, bool maintain_state) if (type == RegType::Single) { if (dirty) - m_jit->ConvertSingleToDoublePair(host_reg, host_reg, tmp_reg); + m_jit->ConvertSingleToDoublePair(preg, host_reg, host_reg, tmp_reg); type = RegType::Register; } if (type == RegType::DuplicatedSingle || type == RegType::LowerPairSingle) { if (dirty) - m_jit->ConvertSingleToDoubleLower(host_reg, host_reg, tmp_reg); + m_jit->ConvertSingleToDoubleLower(preg, host_reg, host_reg, tmp_reg); if (type == RegType::DuplicatedSingle) type = RegType::Duplicated; @@ -822,7 +822,7 @@ void Arm64FPRCache::FixSinglePrecision(size_t preg) m_float_emit->FCVT(32, 64, EncodeRegToDouble(host_reg), EncodeRegToDouble(host_reg)); reg.Load(host_reg, RegType::DuplicatedSingle); break; - case RegType::Register: // PS0 and PS1 needs to be converted + case RegType::Register: // PS0 and PS1 need to be converted m_float_emit->FCVTN(32, EncodeRegToDouble(host_reg), EncodeRegToDouble(host_reg)); reg.Load(host_reg, RegType::Single); break; diff --git a/Source/Core/Core/PowerPC/JitCommon/JitBase.h b/Source/Core/Core/PowerPC/JitCommon/JitBase.h index 558beec9b9..2f1686b832 100644 --- a/Source/Core/Core/PowerPC/JitCommon/JitBase.h +++ b/Source/Core/Core/PowerPC/JitCommon/JitBase.h @@ -8,6 +8,7 @@ #include #include +#include "Common/BitSet.h" #include "Common/CommonTypes.h" #include "Common/x64Emitter.h" #include "Core/ConfigManager.h" @@ -98,6 +99,7 @@ protected: PPCAnalyst::BlockRegStats gpa; PPCAnalyst::BlockRegStats fpa; PPCAnalyst::CodeOp* op; + BitSet32 fpr_is_store_safe; JitBlock* curBlock; diff --git a/Source/Core/Core/PowerPC/PPCAnalyst.cpp b/Source/Core/Core/PowerPC/PPCAnalyst.cpp index 0017d636e7..caa9f0c398 100644 --- a/Source/Core/Core/PowerPC/PPCAnalyst.cpp +++ b/Source/Core/Core/PowerPC/PPCAnalyst.cpp @@ -976,7 +976,7 @@ u32 PPCAnalyzer::Analyze(u32 address, CodeBlock* block, CodeBuffer* buffer, std: op.fprIsSingle = fprIsSingle; op.fprIsDuplicated = fprIsDuplicated; - op.fprIsStoreSafe = fprIsStoreSafe; + op.fprIsStoreSafeBeforeInst = fprIsStoreSafe; if (op.fregOut >= 0) { if (op.opinfo->type == OpType::SingleFP) @@ -1036,6 +1036,7 @@ u32 PPCAnalyzer::Analyze(u32 address, CodeBlock* block, CodeBuffer* buffer, std: (op.opinfo->type == OpType::SingleFP || op.opinfo->type == OpType::PS); } } + op.fprIsStoreSafeAfterInst = fprIsStoreSafe; if (op.opinfo->type == OpType::StorePS || op.opinfo->type == OpType::LoadPS) { diff --git a/Source/Core/Core/PowerPC/PPCAnalyst.h b/Source/Core/Core/PowerPC/PPCAnalyst.h index d1154baee6..740e23848f 100644 --- a/Source/Core/Core/PowerPC/PPCAnalyst.h +++ b/Source/Core/Core/PowerPC/PPCAnalyst.h @@ -66,7 +66,8 @@ struct CodeOp // 16B // convert between single and double formats by just using the host machine's instruction for it. // (The reason why we can't always do this is because some games rely on the exact bits of // denormals and SNaNs being preserved as long as no arithmetic operation is performed on them.) 
-  BitSet32 fprIsStoreSafe;
+  BitSet32 fprIsStoreSafeBeforeInst;
+  BitSet32 fprIsStoreSafeAfterInst;
 
   BitSet32 GetFregsOut() const
   {

From 9d6263f306848ba960c13627e6a54e3c1eebe7b8 Mon Sep 17 00:00:00 2001
From: JosJuice
Date: Sat, 13 Feb 2021 12:38:20 +0100
Subject: [PATCH 10/12] JitArm64: Add unit tests for single/double conversion

---
 Source/Core/Core/PowerPC/JitArm64/Jit.h       |   2 +-
 Source/UnitTests/Core/CMakeLists.txt          |   1 +
 .../PowerPC/JitArm64/ConvertSingleDouble.cpp  | 273 ++++++++++++++++++
 Source/UnitTests/UnitTests.vcxproj            |   1 +
 4 files changed, 276 insertions(+), 1 deletion(-)
 create mode 100644 Source/UnitTests/Core/PowerPC/JitArm64/ConvertSingleDouble.cpp

diff --git a/Source/Core/Core/PowerPC/JitArm64/Jit.h b/Source/Core/Core/PowerPC/JitArm64/Jit.h
index f8b4b5f146..815e808847 100644
--- a/Source/Core/Core/PowerPC/JitArm64/Jit.h
+++ b/Source/Core/Core/PowerPC/JitArm64/Jit.h
@@ -163,7 +163,7 @@ public:
                                   Arm64Gen::ARM64Reg src_reg,
                                   Arm64Gen::ARM64Reg scratch_reg = Arm64Gen::ARM64Reg::INVALID_REG);
 
-private:
+protected:
   struct SlowmemHandler
   {
     Arm64Gen::ARM64Reg dest_reg;
diff --git a/Source/UnitTests/Core/CMakeLists.txt b/Source/UnitTests/Core/CMakeLists.txt
index 01bfd99fbe..1ea9bee6fb 100644
--- a/Source/UnitTests/Core/CMakeLists.txt
+++ b/Source/UnitTests/Core/CMakeLists.txt
@@ -21,6 +21,7 @@ if(_M_X86)
   )
 elseif(_M_ARM_64)
   add_dolphin_test(PowerPCTest
+    PowerPC/JitArm64/ConvertSingleDouble.cpp
     PowerPC/JitArm64/MovI2R.cpp
   )
 endif()
diff --git a/Source/UnitTests/Core/PowerPC/JitArm64/ConvertSingleDouble.cpp b/Source/UnitTests/Core/PowerPC/JitArm64/ConvertSingleDouble.cpp
new file mode 100644
index 0000000000..0c81ab1e8b
--- /dev/null
+++ b/Source/UnitTests/Core/PowerPC/JitArm64/ConvertSingleDouble.cpp
@@ -0,0 +1,273 @@
+// Copyright 2021 Dolphin Emulator Project
+// Licensed under GPLv2+
+// Refer to the license.txt file included.
+
+#include <functional>
+#include <vector>
+
+#include "Common/Arm64Emitter.h"
+#include "Common/BitUtils.h"
+#include "Common/CommonTypes.h"
+#include "Common/FPURoundMode.h"
+#include "Core/PowerPC/Interpreter/Interpreter_FPUtils.h"
+#include "Core/PowerPC/JitArm64/Jit.h"
+
+#include <fmt/format.h>
+#include <gtest/gtest.h>
+
+namespace
+{
+using namespace Arm64Gen;
+
+// The ABI situation for returning an std::tuple seems annoying. Let's use this struct instead
+template <typename T>
+struct Pair
+{
+  T value1;
+  T value2;
+};
+
+class TestConversion : private JitArm64
+{
+public:
+  TestConversion()
+  {
+    AllocCodeSpace(4096);
+    AddChildCodeSpace(&farcode, 2048);
+
+    gpr.Init(this);
+    fpr.Init(this);
+
+    js.fpr_is_store_safe = BitSet32(0);
+
+    GetAsmRoutines()->cdts = GetCodePtr();
+    GenerateConvertDoubleToSingle();
+    GetAsmRoutines()->cstd = GetCodePtr();
+    GenerateConvertSingleToDouble();
+
+    gpr.Lock(ARM64Reg::W30);
+    fpr.Lock(ARM64Reg::Q0, ARM64Reg::Q1);
+
+    convert_single_to_double_lower = Common::BitCast<u64 (*)(u32)>(GetCodePtr());
+    m_float_emit.INS(32, ARM64Reg::S0, 0, ARM64Reg::W0);
+    ConvertSingleToDoubleLower(0, ARM64Reg::D0, ARM64Reg::S0, ARM64Reg::Q1);
+    m_float_emit.UMOV(64, ARM64Reg::X0, ARM64Reg::D0, 0);
+    RET();
+
+    convert_single_to_double_pair = Common::BitCast<Pair<u64> (*)(u32, u32)>(GetCodePtr());
+    m_float_emit.INS(32, ARM64Reg::D0, 0, ARM64Reg::W0);
+    m_float_emit.INS(32, ARM64Reg::D0, 1, ARM64Reg::W1);
+    ConvertSingleToDoublePair(0, ARM64Reg::Q0, ARM64Reg::D0, ARM64Reg::Q1);
+    m_float_emit.UMOV(64, ARM64Reg::X0, ARM64Reg::Q0, 0);
+    m_float_emit.UMOV(64, ARM64Reg::X1, ARM64Reg::Q0, 1);
+    RET();
+
+    convert_double_to_single_lower = Common::BitCast<u32 (*)(u64)>(GetCodePtr());
+    m_float_emit.INS(64, ARM64Reg::D0, 0, ARM64Reg::X0);
+    ConvertDoubleToSingleLower(0, ARM64Reg::S0, ARM64Reg::D0);
+    m_float_emit.UMOV(32, ARM64Reg::W0, ARM64Reg::S0, 0);
+    RET();
+
+    convert_double_to_single_pair = Common::BitCast<Pair<u32> (*)(u64, u64)>(GetCodePtr());
+    m_float_emit.INS(64, ARM64Reg::Q0, 0, ARM64Reg::X0);
+    m_float_emit.INS(64, ARM64Reg::Q0, 1, ARM64Reg::X1);
+    ConvertDoubleToSinglePair(0, ARM64Reg::D0, ARM64Reg::Q0);
+    m_float_emit.UMOV(64, ARM64Reg::X0, ARM64Reg::D0, 0);
+    RET();
+
+    gpr.Unlock(ARM64Reg::W30);
+    fpr.Unlock(ARM64Reg::Q0, ARM64Reg::Q1);
+
+    FlushIcache();
+
+    // Set the rounding mode to something that's as annoying as possible to handle
+    // (flush-to-zero enabled, and rounding not symmetric about the origin)
+    FPURoundMode::SetSIMDMode(FPURoundMode::RoundMode::ROUND_UP, true);
+  }
+
+  ~TestConversion() override
+  {
+    FPURoundMode::LoadDefaultSIMDState();
+
+    FreeCodeSpace();
+  }
+
+  u64 ConvertSingleToDouble(u32 value) { return convert_single_to_double_lower(value); }
+
+  Pair<u64> ConvertSingleToDouble(u32 value1, u32 value2)
+  {
+    return convert_single_to_double_pair(value1, value2);
+  }
+
+  u32 ConvertDoubleToSingle(u64 value) { return convert_double_to_single_lower(value); }
+
+  Pair<u32> ConvertDoubleToSingle(u64 value1, u64 value2)
+  {
+    return convert_double_to_single_pair(value1, value2);
+  }
+
+private:
+  std::function<u64(u32)> convert_single_to_double_lower;
+  std::function<Pair<u64>(u32, u32)> convert_single_to_double_pair;
+  std::function<u32(u64)> convert_double_to_single_lower;
+  std::function<Pair<u32>(u64, u64)> convert_double_to_single_pair;
+};
+
+}  // namespace
+
+TEST(JitArm64, ConvertDoubleToSingle)
+{
+  TestConversion test;
+
+  const std::vector<u64> input_values{
+      // Special values
+      0x0000'0000'0000'0000,  // positive zero
+      0x0000'0000'0000'0001,  // smallest positive denormal
+      0x0000'0000'0100'0000,
+      0x000F'FFFF'FFFF'FFFF,  // largest positive denormal
+      0x0010'0000'0000'0000,  // smallest positive normal
+      0x0010'0000'0000'0002,
+      0x3FF0'0000'0000'0000,  // 1.0
+      0x7FEF'FFFF'FFFF'FFFF,  // largest positive normal
+      0x7FF0'0000'0000'0000,  // positive infinity
+      0x7FF0'0000'0000'0001,  // first positive SNaN
+      0x7FF7'FFFF'FFFF'FFFF,  // last positive SNaN
+      0x7FF8'0000'0000'0000,  // first positive QNaN
+      0x7FFF'FFFF'FFFF'FFFF,  // last positive QNaN
+      0x8000'0000'0000'0000,  // negative zero
+      0x8000'0000'0000'0001,  // smallest negative denormal
+      0x8000'0000'0100'0000,
+      0x800F'FFFF'FFFF'FFFF,  // largest negative denormal
+      0x8010'0000'0000'0000,  // smallest negative normal
+      0x8010'0000'0000'0002,
+      0xBFF0'0000'0000'0000,  // -1.0
+      0xFFEF'FFFF'FFFF'FFFF,  // largest negative normal
+      0xFFF0'0000'0000'0000,  // negative infinity
+      0xFFF0'0000'0000'0001,  // first negative SNaN
+      0xFFF7'FFFF'FFFF'FFFF,  // last negative SNaN
+      0xFFF8'0000'0000'0000,  // first negative QNaN
+      0xFFFF'FFFF'FFFF'FFFF,  // last negative QNaN
+
+      // (exp > 896) Boundary Case
+      0x3800'0000'0000'0000,  // 2^(-127) = Denormal in single-prec
+      0x3810'0000'0000'0000,  // 2^(-126) = Smallest single-prec normal
+      0xB800'0000'0000'0000,  // -2^(-127) = Denormal in single-prec
+      0xB810'0000'0000'0000,  // -2^(-126) = Smallest single-prec normal
+      0x3800'1234'5678'9ABC, 0x3810'1234'5678'9ABC, 0xB800'1234'5678'9ABC, 0xB810'1234'5678'9ABC,
+
+      // (exp >= 874) Boundary Case
+      0x3680'0000'0000'0000,  // 2^(-150) = Unrepresentable in single-prec
+      0x36A0'0000'0000'0000,  // 2^(-149) = Smallest single-prec denormal
+      0x36B0'0000'0000'0000,  // 2^(-148) = Single-prec denormal
+      0xB680'0000'0000'0000,  // -2^(-150) = Unrepresentable in single-prec
+      0xB6A0'0000'0000'0000,  // -2^(-149) = Smallest single-prec denormal
+      0xB6B0'0000'0000'0000,  // -2^(-148) = Single-prec denormal
+      0x3680'1234'5678'9ABC, 0x36A0'1234'5678'9ABC, 0x36B0'1234'5678'9ABC, 0xB680'1234'5678'9ABC,
+      0xB6A0'1234'5678'9ABC, 0xB6B0'1234'5678'9ABC,
+
+      // Some typical numbers
+      0x3FF8'0000'0000'0000,  // 1.5
+      0x408F'4000'0000'0000,  // 1000
+      0xC008'0000'0000'0000,  // -3
+  };
+
+  for (const u64 input : input_values)
+  {
+    const u32 expected = ConvertToSingle(input);
+    const u32 actual = test.ConvertDoubleToSingle(input);
+
+    if (expected != actual)
+      fmt::print("{:016x} -> {:08x} == {:08x}\n", input, actual, expected);
+
+    EXPECT_EQ(expected, actual);
+  }
+
+  for (const u64 input1 : input_values)
+  {
+    for (const u64 input2 : input_values)
+    {
+      const u32 expected1 = ConvertToSingle(input1);
+      const u32 expected2 = ConvertToSingle(input2);
+      const auto [actual1, actual2] = test.ConvertDoubleToSingle(input1, input2);
+
+      if (expected1 != actual1 || expected2 != actual2)
+      {
+        fmt::print("{:016x} -> {:08x} == {:08x},\n", input1, actual1, expected1);
+        fmt::print("{:016x} -> {:08x} == {:08x}\n", input2, actual2, expected2);
+      }
+
+      EXPECT_EQ(expected1, actual1);
+      EXPECT_EQ(expected2, actual2);
+    }
+  }
+}
+
+TEST(JitArm64, ConvertSingleToDouble)
+{
+  TestConversion test;
+
+  const std::vector<u32> input_values{
+      // Special values
+      0x0000'0000,  // positive zero
+      0x0000'0001,  // smallest positive denormal
+      0x0000'1000,
+      0x007F'FFFF,  // largest positive denormal
+      0x0080'0000,  // smallest positive normal
+      0x0080'0002,
+      0x3F80'0000,  // 1.0
+      0x7F7F'FFFF,  // largest positive normal
+      0x7F80'0000,  // positive infinity
+      0x7F80'0001,  // first positive SNaN
+      0x7FBF'FFFF,  // last positive SNaN
+      0x7FC0'0000,  // first positive QNaN
+      0x7FFF'FFFF,  // last positive QNaN
+      0x8000'0000,  // negative zero
+      0x8000'0001,  // smallest negative denormal
+      0x8000'1000,
+      0x807F'FFFF,  // largest negative denormal
+      0x8080'0000,  // smallest negative normal
+      0x8080'0002,
+      0xBF80'0000,  // -1.0
+      0xFF7F'FFFF,  // largest negative normal
+      0xFF80'0000,  // negative infinity
+      0xFF80'0001,  // first negative SNaN
+      0xFFBF'FFFF,  // last negative SNaN
+      0xFFC0'0000,  // first negative QNaN
+      0xFFFF'FFFF,  // last negative QNaN
+
+      // Some typical numbers
+      0x3FC0'0000,  // 1.5
+      0x447A'0000,  // 1000
+      0xC040'0000,  // -3
+  };
+
+  for (const u32 input : input_values)
+  {
+    const u64 expected = ConvertToDouble(input);
+    const u64 actual = test.ConvertSingleToDouble(input);
+
+    if (expected != actual)
+      fmt::print("{:08x} -> {:016x} == {:016x}\n", input, actual, expected);
+
+    EXPECT_EQ(expected, actual);
+  }
+
+  for (const u32 input1 : input_values)
+  {
+    for (const u32 input2 : input_values)
+    {
+      const u64 expected1 = ConvertToDouble(input1);
+      const u64 expected2 = ConvertToDouble(input2);
+      const auto [actual1, actual2] = test.ConvertSingleToDouble(input1, input2);
+
+      if (expected1 != actual1 || expected2 != actual2)
+      {
+        fmt::print("{:08x} -> {:016x} == {:016x},\n", input1, actual1, expected1);
+        fmt::print("{:08x} -> {:016x} == {:016x}\n", input2, actual2, expected2);
+      }
+
+      EXPECT_EQ(expected1, actual1);
+      EXPECT_EQ(expected2, actual2);
+    }
+  }
+}
diff --git a/Source/UnitTests/UnitTests.vcxproj b/Source/UnitTests/UnitTests.vcxproj
index a178911b22..697625be2a 100644
--- a/Source/UnitTests/UnitTests.vcxproj
+++ b/Source/UnitTests/UnitTests.vcxproj
@@ -81,6 +81,7 @@
+    <ClCompile Include="Core\PowerPC\JitArm64\ConvertSingleDouble.cpp" />

From 54451ac731149641969dfc54b40745e312b59c2a Mon Sep 17 00:00:00 2001
From: JosJuice
Date: Sun, 21 Feb 2021 11:21:02 +0100
Subject: [PATCH 11/12] JitArm64: Use ConvertSingleToDoubleLower in RW when faster

---
 Source/Core/Core/PowerPC/JitArm64/Jit.h       |  2 ++
 .../JitArm64/JitArm64_FloatingPoint.cpp       |  5 +++++
 .../PowerPC/JitArm64/JitArm64_RegCache.cpp    | 22 +++++++++++++++++--
 3 files changed, 27 insertions(+), 2 deletions(-)

diff --git a/Source/Core/Core/PowerPC/JitArm64/Jit.h b/Source/Core/Core/PowerPC/JitArm64/Jit.h
index 815e808847..2ec565943b 100644
--- a/Source/Core/Core/PowerPC/JitArm64/Jit.h
+++ b/Source/Core/Core/PowerPC/JitArm64/Jit.h
@@ -163,6 +163,8 @@ public:
                                   Arm64Gen::ARM64Reg src_reg,
                                   Arm64Gen::ARM64Reg scratch_reg = Arm64Gen::ARM64Reg::INVALID_REG);
 
+  bool IsFPRStoreSafe(size_t guest_reg) const;
+
 protected:
   struct SlowmemHandler
   {
diff --git a/Source/Core/Core/PowerPC/JitArm64/JitArm64_FloatingPoint.cpp b/Source/Core/Core/PowerPC/JitArm64/JitArm64_FloatingPoint.cpp
index 8d3553afd7..7ba7dd12ea 100644
--- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_FloatingPoint.cpp
+++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_FloatingPoint.cpp
@@ -572,3 +572,8 @@ void JitArm64::ConvertSingleToDoublePair(size_t guest_reg, ARM64Reg dest_reg, AR
     SetJumpTarget(continue1);
   }
 }
+
+bool JitArm64::IsFPRStoreSafe(size_t guest_reg) const
+{
+  return js.fpr_is_store_safe[guest_reg];
+}
diff --git a/Source/Core/Core/PowerPC/JitArm64/JitArm64_RegCache.cpp b/Source/Core/Core/PowerPC/JitArm64/JitArm64_RegCache.cpp
index 3715c897d2..fdcb55cfb9 100644
--- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_RegCache.cpp
+++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_RegCache.cpp
@@ -602,9 +602,27 @@ ARM64Reg Arm64FPRCache::RW(size_t preg, RegType type)
     switch (reg.GetType())
     {
     case RegType::Single:
+      // For a store-safe register, conversion is just one instruction regardless of whether
+      // we're converting a pair, so ConvertSingleToDoublePair followed by a
+      // 128-bit store is faster than INS followed by ConvertSingleToDoubleLower and a
+      // 64-bit store. But for registers which are not store-safe, the latter is better.
flush_reg = GetReg(); - m_jit->ConvertSingleToDoublePair(preg, flush_reg, host_reg, flush_reg); - [[fallthrough]]; + if (!m_jit->IsFPRStoreSafe(preg)) + { + ARM64Reg scratch_reg = GetReg(); + m_float_emit->INS(32, flush_reg, 0, host_reg, 1); + m_jit->ConvertSingleToDoubleLower(preg, flush_reg, flush_reg, scratch_reg); + m_float_emit->STR(64, IndexType::Unsigned, flush_reg, PPC_REG, u32(PPCSTATE_OFF_PS1(preg))); + Unlock(scratch_reg); + break; + } + else + { + m_jit->ConvertSingleToDoublePair(preg, flush_reg, host_reg, flush_reg); + m_float_emit->STR(128, IndexType::Unsigned, flush_reg, PPC_REG, + u32(PPCSTATE_OFF_PS0(preg))); + } + break; case RegType::Register: // We are doing a full 128bit store because it takes 2 cycles on a Cortex-A57 to do a 128bit // store. From 69c14d6ec39815db22c8a246b9fffb5beb5f5931 Mon Sep 17 00:00:00 2001 From: JosJuice Date: Sat, 23 Jan 2021 20:13:45 +0100 Subject: [PATCH 12/12] JitArm64: Fix frspx with single precision source I haven't observed this breaking any game, but it didn't match the behavior of the interpreter as far as I could tell from reading the code, in that denormals weren't being flushed. --- .../Core/Core/PowerPC/JitArm64/JitArm64_FloatingPoint.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/Source/Core/Core/PowerPC/JitArm64/JitArm64_FloatingPoint.cpp b/Source/Core/Core/PowerPC/JitArm64/JitArm64_FloatingPoint.cpp index 7ba7dd12ea..4410ecd564 100644 --- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_FloatingPoint.cpp +++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_FloatingPoint.cpp @@ -258,7 +258,7 @@ void JitArm64::frspx(UGeckoInstruction inst) const u32 d = inst.FD; const bool single = fpr.IsSingle(b, true); - if (single) + if (single && js.fpr_is_store_safe[b]) { // Source is already in single precision, so no need to do anything but to copy to PSR1. const ARM64Reg VB = fpr.R(b, RegType::LowerPairSingle); @@ -266,6 +266,9 @@ void JitArm64::frspx(UGeckoInstruction inst) if (b != d) m_float_emit.FMOV(EncodeRegToSingle(VD), EncodeRegToSingle(VB)); + + ASSERT_MSG(DYNA_REC, fpr.IsSingle(b, true), + "Register allocation turned singles into doubles in the middle of frspx"); } else { @@ -274,9 +277,6 @@ void JitArm64::frspx(UGeckoInstruction inst) m_float_emit.FCVT(32, 64, EncodeRegToDouble(VD), EncodeRegToDouble(VB)); } - - ASSERT_MSG(DYNA_REC, b == d || single == fpr.IsSingle(b, true), - "Register allocation turned singles into doubles in the middle of frspx"); } void JitArm64::fcmpX(UGeckoInstruction inst)
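
A note on what the ConvertSingleDouble tests above are exercising: non-arithmetic PowerPC float instructions have to convert between double and single precision bit-exactly, which a plain FCVT cannot do for denormals, SNaNs and out-of-range exponents, and the tests compare the JIT routines against the interpreter's ConvertToSingle/ConvertToDouble helpers. The sketch below is an illustrative, self-contained C++ rendition of that kind of bit-exact double-to-single repacking, built around the same exponent boundaries (exp > 896 and exp >= 874) that the tests' input lists call out. The function name ConvertToSingleSketch, the main() driver, and the handling of the below-874 case are assumptions made for illustration only; this is not the interpreter's actual implementation.

#include <cstdint>
#include <cstdio>

// Bit-exact double -> single repacking in the spirit of the interpreter helper the
// tests compare against. The thresholds 896 and 874 are the exponent boundary cases
// listed in the test data; the names and the below-874 fallback are illustrative.
static uint32_t ConvertToSingleSketch(uint64_t x)
{
  const uint32_t exp = static_cast<uint32_t>((x >> 52) & 0x7ff);
  if (exp > 896 || (x & ~0x8000'0000'0000'0000ULL) == 0)
  {
    // Zeroes, normals, infinities and NaNs: keep the sign, the top exponent bit and the
    // next 30 bits (7 low exponent bits plus the top 23 fraction bits), drop the rest.
    return static_cast<uint32_t>(((x >> 32) & 0xc0000000) | ((x >> 29) & 0x3fffffff));
  }
  if (exp >= 874)
  {
    // The result is a single-precision denormal: shift the implicit 1 plus fraction
    // right by the exponent deficit (truncating low bits) and reattach the sign bit.
    uint32_t t = 0x80000000 | static_cast<uint32_t>((x & 0x000f'ffff'ffff'ffffULL) >> 21);
    t >>= 905 - exp;
    return t | static_cast<uint32_t>((x >> 32) & 0x80000000);
  }
  // Too small even for a single denormal; pass the top bits through like the normal case.
  return static_cast<uint32_t>(((x >> 32) & 0xc0000000) | ((x >> 29) & 0x3fffffff));
}

int main()
{
  std::printf("%08x\n", ConvertToSingleSketch(0x36A0'0000'0000'0000ULL));  // 00000001
  std::printf("%08x\n", ConvertToSingleSketch(0x3FF0'0000'0000'0000ULL));  // 3f800000
}

Compiled as C++14 or later, this prints 00000001 for 0x36A0'0000'0000'0000 (the "smallest single-prec denormal" row in the test data) and 3f800000 for the double 1.0, matching what the tests expect from the JIT's accurate conversion path.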