From 018e24762481fd59e9c3e9e0bf2a52dddfdecdca Mon Sep 17 00:00:00 2001
From: JosJuice
Date: Sun, 31 Jan 2021 19:20:02 +0100
Subject: [PATCH] JitArm64: Optimize ConvertSingleToDouble, part 1

---
 Source/Core/Core/PowerPC/JitArm64/Jit.h       |  1 +
 .../JitArm64/JitArm64_FloatingPoint.cpp       | 15 +++---
 Source/Core/Core/PowerPC/JitArm64/JitAsm.cpp  | 46 +++++++++++++++++++
 .../Core/PowerPC/JitCommon/JitAsmCommon.h     |  1 +
 4 files changed, 54 insertions(+), 9 deletions(-)

diff --git a/Source/Core/Core/PowerPC/JitArm64/Jit.h b/Source/Core/Core/PowerPC/JitArm64/Jit.h
index 9d98910660..cc65155ccd 100644
--- a/Source/Core/Core/PowerPC/JitArm64/Jit.h
+++ b/Source/Core/Core/PowerPC/JitArm64/Jit.h
@@ -221,6 +221,7 @@ private:
   void GenerateAsm();
   void GenerateCommonAsm();
   void GenerateConvertDoubleToSingle();
+  void GenerateConvertSingleToDouble();
   void GenerateQuantizedLoadStores();
 
   // Profiling
diff --git a/Source/Core/Core/PowerPC/JitArm64/JitArm64_FloatingPoint.cpp b/Source/Core/Core/PowerPC/JitArm64/JitArm64_FloatingPoint.cpp
index 50c9d1f85c..d3d5f7ddbf 100644
--- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_FloatingPoint.cpp
+++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_FloatingPoint.cpp
@@ -9,7 +9,6 @@
 #include "Core/ConfigManager.h"
 #include "Core/Core.h"
 #include "Core/CoreTiming.h"
-#include "Core/PowerPC/Interpreter/Interpreter_FPUtils.h"
 #include "Core/PowerPC/JitArm64/Jit.h"
 #include "Core/PowerPC/JitArm64/JitArm64_RegCache.h"
 #include "Core/PowerPC/PPCTables.h"
@@ -390,9 +389,6 @@ void JitArm64::fctiwzx(UGeckoInstruction inst)
 // instructions, they must convert floats bitexact and never flush denormals to zero or turn SNaNs
 // into QNaNs. This means we can't just use FCVT/FCVTL/FCVTN.
 
-// When calling the conversion functions, we are cheating a little and not
-// saving the FPRs since we know the functions happen to not use them.
-
 void JitArm64::ConvertDoubleToSingleLower(ARM64Reg dest_reg, ARM64Reg src_reg)
 {
   FlushCarry();
@@ -429,11 +425,11 @@ void JitArm64::ConvertSingleToDoubleLower(ARM64Reg dest_reg, ARM64Reg src_reg)
 {
   FlushCarry();
 
-  const BitSet32 gpr_saved = gpr.GetCallerSavedUsed();
+  const BitSet32 gpr_saved = gpr.GetCallerSavedUsed() & BitSet32{0, 1, 2, 3, 4, 30};
   ABI_PushRegisters(gpr_saved);
 
   m_float_emit.UMOV(32, ARM64Reg::W0, src_reg, 0);
-  QuickCallFunction(ARM64Reg::X1, &ConvertToDouble);
+  BL(cstd);
   m_float_emit.INS(64, dest_reg, 0, ARM64Reg::X0);
 
   ABI_PopRegisters(gpr_saved);
@@ -443,15 +439,16 @@ void JitArm64::ConvertSingleToDoublePair(ARM64Reg dest_reg, ARM64Reg src_reg)
 {
   FlushCarry();
 
-  const BitSet32 gpr_saved = gpr.GetCallerSavedUsed();
+  // Save X0-X4 and X30 if they're in use
+  const BitSet32 gpr_saved = gpr.GetCallerSavedUsed() & BitSet32{0, 1, 2, 3, 4, 30};
   ABI_PushRegisters(gpr_saved);
 
   m_float_emit.UMOV(32, ARM64Reg::W0, src_reg, 1);
-  QuickCallFunction(ARM64Reg::X1, &ConvertToDouble);
+  BL(cstd);
   m_float_emit.INS(64, dest_reg, 1, ARM64Reg::X0);
 
   m_float_emit.UMOV(32, ARM64Reg::W0, src_reg, 0);
-  QuickCallFunction(ARM64Reg::X1, &ConvertToDouble);
+  BL(cstd);
   m_float_emit.INS(64, dest_reg, 0, ARM64Reg::X0);
 
   ABI_PopRegisters(gpr_saved);
diff --git a/Source/Core/Core/PowerPC/JitArm64/JitAsm.cpp b/Source/Core/Core/PowerPC/JitArm64/JitAsm.cpp
index c686c31ce4..57fa59d9df 100644
--- a/Source/Core/Core/PowerPC/JitArm64/JitAsm.cpp
+++ b/Source/Core/Core/PowerPC/JitArm64/JitAsm.cpp
@@ -199,6 +199,10 @@ void JitArm64::GenerateCommonAsm()
   GenerateConvertDoubleToSingle();
   JitRegister::Register(GetAsmRoutines()->cdts, GetCodePtr(), "JIT_cdts");
 
+  GetAsmRoutines()->cstd = GetCodePtr();
+  GenerateConvertSingleToDouble();
+  JitRegister::Register(GetAsmRoutines()->cstd, GetCodePtr(), "JIT_cstd");
+
   GenerateQuantizedLoadStores();
 }
 
@@ -226,6 +230,48 @@ void JitArm64::GenerateConvertDoubleToSingle()
   RET();
 }
 
+// Input in W0, output in X0, clobbers X0-X4 and flags.
+void JitArm64::GenerateConvertSingleToDouble()
+{
+  UBFX(ARM64Reg::W1, ARM64Reg::W0, 23, 8);
+  FixupBranch normal_or_nan = CBNZ(ARM64Reg::W1);
+
+  ANDI2R(ARM64Reg::W1, ARM64Reg::W0, 0x007fffff);
+  FixupBranch denormal = CBNZ(ARM64Reg::W1);
+
+  // Zero
+  LSL(ARM64Reg::X0, ARM64Reg::X0, 32);
+  RET();
+
+  SetJumpTarget(denormal);
+  ANDI2R(ARM64Reg::W2, ARM64Reg::W0, 0x80000000);
+  CLZ(ARM64Reg::X3, ARM64Reg::X1);
+  LSL(ARM64Reg::X2, ARM64Reg::X2, 32);
+  ORRI2R(ARM64Reg::X4, ARM64Reg::X3, 0xffffffffffffffc0);
+  SUB(ARM64Reg::X2, ARM64Reg::X2, ARM64Reg::X3, ArithOption(ARM64Reg::X3, ShiftType::LSL, 52));
+  ADD(ARM64Reg::X3, ARM64Reg::X4, 23);
+  LSLV(ARM64Reg::X1, ARM64Reg::X1, ARM64Reg::X3);
+  BFI(ARM64Reg::X2, ARM64Reg::X1, 30, 22);
+  MOVI2R(ARM64Reg::X1, 0x3a90000000000000);
+  ADD(ARM64Reg::X0, ARM64Reg::X2, ARM64Reg::X1);
+  RET();
+
+  SetJumpTarget(normal_or_nan);
+  CMP(ARM64Reg::W1, 0xff);
+  ANDI2R(ARM64Reg::W2, ARM64Reg::W0, 0x40000000);
+  CSET(ARM64Reg::W4, CCFlags::CC_NEQ);
+  ANDI2R(ARM64Reg::W3, ARM64Reg::W0, 0xc0000000);
+  EOR(ARM64Reg::W2, ARM64Reg::W4, ARM64Reg::W2, ArithOption(ARM64Reg::W2, ShiftType::LSR, 30));
+  MOVI2R(ARM64Reg::X1, 0x3800000000000000);
+  ANDI2R(ARM64Reg::W4, ARM64Reg::W0, 0x3fffffff);
+  LSL(ARM64Reg::X3, ARM64Reg::X3, 32);
+  CMP(ARM64Reg::W2, 0);
+  CSEL(ARM64Reg::X1, ARM64Reg::X1, ARM64Reg::ZR, CCFlags::CC_NEQ);
+  BFI(ARM64Reg::X3, ARM64Reg::X4, 29, 30);
+  ORR(ARM64Reg::X0, ARM64Reg::X3, ARM64Reg::X1);
+  RET();
+}
+
 void JitArm64::GenerateQuantizedLoadStores()
 {
   // X0 is the scale
diff --git a/Source/Core/Core/PowerPC/JitCommon/JitAsmCommon.h b/Source/Core/Core/PowerPC/JitCommon/JitAsmCommon.h
index d8e22a0a3a..c525e7849c 100644
--- a/Source/Core/Core/PowerPC/JitCommon/JitAsmCommon.h
+++ b/Source/Core/Core/PowerPC/JitCommon/JitAsmCommon.h
@@ -26,6 +26,7 @@ struct CommonAsmRoutinesBase
   const u8* fres;
   const u8* mfcr;
   const u8* cdts;
+  const u8* cstd;
 
   // In: array index: GQR to use.
   // In: ECX: Address to read from.
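
Reviewer note (not part of the patch): the new routine replaces the out-of-line QuickCallFunction call into the Interpreter_FPUtils.h conversion helper with hand-written AArch64 code, but the conversion it performs is the same bit-exact single-to-double widening. The standalone C++ sketch below walks the same three paths (zero, denormal, normal/Inf/NaN); the function name and the <cstdint> types are illustrative only and appear nowhere in the patch.

#include <cstdint>

// Reference sketch of what GenerateConvertSingleToDouble computes. It has to be
// bit-exact: denormals are not flushed and SNaNs stay signaling, which is why a
// plain FCVTL cannot be used.
uint64_t ConvertSingleToDoubleReference(uint32_t x)
{
  const uint64_t exp = (x >> 23) & 0xff;
  uint64_t frac = x & 0x007fffff;

  if (exp == 0 && frac == 0)
  {
    // Zero: only the sign bit survives (the "LSL X0, X0, 32" path above).
    return static_cast<uint64_t>(x) << 32;
  }

  if (exp == 0)
  {
    // Denormal: normalize the fraction and rebias the exponent (the CLZ path).
    uint64_t e = 1023 - 126;
    do
    {
      frac <<= 1;
      e--;
    } while ((frac & 0x00800000) == 0);
    return (static_cast<uint64_t>(x & 0x80000000) << 32) | (e << 52) |
           ((frac & 0x007fffff) << 29);
  }

  // Normal, infinity or NaN: keep the sign and the exponent MSB, and set the
  // three exponent bits below it (bits 61:59) when the MSB is clear (rebias
  // from 127 to 1023) or when exp == 0xff, so Inf/NaN keep an all-ones
  // exponent. This is what the CSET/EOR/CSEL sequence in the normal_or_nan
  // path decides.
  const bool fill = (exp != 0xff) != ((x & 0x40000000) != 0);
  return (static_cast<uint64_t>(x & 0xc0000000) << 32) |
         (fill ? 0x3800000000000000ULL : 0) |
         (static_cast<uint64_t>(x & 0x3fffffff) << 29);
}

Two sanity checks against this sketch: the smallest positive denormal 0x00000001 widens to 0x36A0000000000000 instead of being flushed to zero, and the SNaN 0x7F800001 widens to 0x7FF0000020000000, which is still signaling.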