Merge pull request #9637 from JosJuice/jitarm64-fprf

JitArm64: Implement FPRF updates
This commit is contained in:
Mat M 2021-05-13 06:39:28 -04:00 committed by GitHub
commit 725ea3d9c1
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
10 changed files with 273 additions and 34 deletions

View File

@ -3039,28 +3039,31 @@ void ARM64FloatEmitter::FMOV(ARM64Reg Rd, ARM64Reg Rn, bool top)
{
EmitScalar1Source(0, 0, IsDouble(Rd), 0, Rd, Rn);
}
else
else if (IsGPR(Rd) != IsGPR(Rn))
{
ASSERT_MSG(DYNA_REC, !IsQuad(Rd) && !IsQuad(Rn), "FMOV can't move to/from quads");
int rmode = 0;
int opcode = 6;
int sf = 0;
if (IsSingle(Rd) && !Is64Bit(Rn) && !top)
{
// GPR to scalar single
opcode |= 1;
}
else if (!Is64Bit(Rd) && IsSingle(Rn) && !top)
{
// Scalar single to GPR - defaults are correct
const ARM64Reg gpr = IsGPR(Rn) ? Rn : Rd;
const ARM64Reg fpr = IsGPR(Rn) ? Rd : Rn;
const int sf = Is64Bit(gpr) ? 1 : 0;
const int type = Is64Bit(gpr) ? (top ? 2 : 1) : 0;
const int rmode = top ? 1 : 0;
const int opcode = IsGPR(Rn) ? 7 : 6;
ASSERT_MSG(DYNA_REC, !top || IsQuad(fpr), "FMOV: top can only be used with quads");
// TODO: Should this check be more lenient? Sometimes you do want to do things like
// read the lower 32 bits of a double
ASSERT_MSG(DYNA_REC,
(!Is64Bit(gpr) && IsSingle(fpr)) ||
(Is64Bit(gpr) && ((IsDouble(fpr) && !top) || (IsQuad(fpr) && top))),
"FMOV: Mismatched sizes");
Write32((sf << 31) | (0x1e << 24) | (type << 22) | (1 << 21) | (rmode << 19) | (opcode << 16) |
(DecodeReg(Rn) << 5) | DecodeReg(Rd));
}
else
{
// TODO
ASSERT_MSG(DYNA_REC, 0, "FMOV: Unhandled case");
}
Write32((sf << 31) | (0x1e2 << 20) | (rmode << 19) | (opcode << 16) | (DecodeReg(Rn) << 5) |
DecodeReg(Rd));
ASSERT_MSG(DYNA_REC, 0, "FMOV: Unsupported case");
}
}

View File

@ -399,6 +399,7 @@ union UReg_MSR
};
#define FPRF_SHIFT 12
#define FPRF_WIDTH 5
#define FPRF_MASK (0x1F << FPRF_SHIFT)
// FPSCR exception flags

View File

@ -234,6 +234,7 @@ protected:
void GenerateCommonAsm();
void GenerateConvertDoubleToSingle();
void GenerateConvertSingleToDouble();
void GenerateFPRF(bool single);
void GenerateQuantizedLoadStores();
// Profiling
@ -262,6 +263,8 @@ protected:
Arm64Gen::ARM64Reg),
bool Rc = false);
void SetFPRFIfNeeded(bool single, Arm64Gen::ARM64Reg reg);
// <Fastmem fault location, slowmem handler location>
std::map<const u8*, FastmemArea> m_fault_to_handler;
std::map<SlowmemHandler, const u8*> m_handler_to_loc;

View File

@ -9,6 +9,7 @@
#include "Core/ConfigManager.h"
#include "Core/Core.h"
#include "Core/CoreTiming.h"
#include "Core/PowerPC/Gekko.h"
#include "Core/PowerPC/JitArm64/Jit.h"
#include "Core/PowerPC/JitArm64/JitArm64_RegCache.h"
#include "Core/PowerPC/PPCTables.h"
@ -16,12 +17,25 @@
using namespace Arm64Gen;
// Emits a call to the shared FPRF routine for the value held in `reg`, but only when FPRF
// emulation is enabled in the config and the current instruction wants FPRF to be updated.
//
// `single` selects whether `reg` is viewed as a single (and fprf_single is called) or as a
// double (and fprf_double is called). The value's raw bits are moved into W0/X0 as the
// routine's input. W0-W4 and W30 are locked in the GPR cache around the call and unlocked
// again afterwards; BL itself writes the return address to X30.
void JitArm64::SetFPRFIfNeeded(bool single, ARM64Reg reg)
{
  const bool fprf_wanted = SConfig::GetInstance().bFPRF && js.op->wantsFPRF;
  if (!fprf_wanted)
    return;

  gpr.Lock(ARM64Reg::W0, ARM64Reg::W1, ARM64Reg::W2, ARM64Reg::W3, ARM64Reg::W4, ARM64Reg::W30);

  if (single)
  {
    // 32-bit input: copy the single's bits into W0
    m_float_emit.FMOV(ARM64Reg::W0, EncodeRegToSingle(reg));
    BL(GetAsmRoutines()->fprf_single);
  }
  else
  {
    // 64-bit input: copy the double's bits into X0
    m_float_emit.FMOV(ARM64Reg::X0, EncodeRegToDouble(reg));
    BL(GetAsmRoutines()->fprf_double);
  }

  gpr.Unlock(ARM64Reg::W0, ARM64Reg::W1, ARM64Reg::W2, ARM64Reg::W3, ARM64Reg::W4, ARM64Reg::W30);
}
void JitArm64::fp_arith(UGeckoInstruction inst)
{
INSTRUCTION_START
JITDISABLE(bJITFloatingPointOff);
FALLBACK_IF(inst.Rc);
FALLBACK_IF(SConfig::GetInstance().bFPRF && js.op->wantsFPRF);
u32 a = inst.FA, b = inst.FB, c = inst.FC, d = inst.FD;
u32 op5 = inst.SUBOP5;
@ -120,13 +134,17 @@ void JitArm64::fp_arith(UGeckoInstruction inst)
}
}
if (single || packed)
const bool outputs_are_singles = single || packed;
if (outputs_are_singles)
{
ASSERT_MSG(DYNA_REC, inputs_are_singles == inputs_are_singles_func(),
"Register allocation turned singles into doubles in the middle of fp_arith");
fpr.FixSinglePrecision(d);
}
SetFPRFIfNeeded(outputs_are_singles, VD);
}
void JitArm64::fp_logic(UGeckoInstruction inst)
@ -252,7 +270,6 @@ void JitArm64::frspx(UGeckoInstruction inst)
INSTRUCTION_START
JITDISABLE(bJITFloatingPointOff);
FALLBACK_IF(inst.Rc);
FALLBACK_IF(SConfig::GetInstance().bFPRF && js.op->wantsFPRF);
const u32 b = inst.FB;
const u32 d = inst.FD;
@ -269,6 +286,8 @@ void JitArm64::frspx(UGeckoInstruction inst)
ASSERT_MSG(DYNA_REC, fpr.IsSingle(b, true),
"Register allocation turned singles into doubles in the middle of frspx");
SetFPRFIfNeeded(true, VD);
}
else
{
@ -276,6 +295,8 @@ void JitArm64::frspx(UGeckoInstruction inst)
const ARM64Reg VD = fpr.RW(d, RegType::DuplicatedSingle);
m_float_emit.FCVT(32, 64, EncodeRegToDouble(VD), EncodeRegToDouble(VB));
SetFPRFIfNeeded(true, VD);
}
}
@ -283,7 +304,8 @@ void JitArm64::fcmpX(UGeckoInstruction inst)
{
INSTRUCTION_START
JITDISABLE(bJITFloatingPointOff);
FALLBACK_IF(SConfig::GetInstance().bFPRF && js.op->wantsFPRF);
const bool fprf = SConfig::GetInstance().bFPRF && js.op->wantsFPRF;
const u32 a = inst.FA;
const u32 b = inst.FB;
@ -299,6 +321,14 @@ void JitArm64::fcmpX(UGeckoInstruction inst)
gpr.BindCRToRegister(crf, false);
const ARM64Reg XA = gpr.CR(crf);
ARM64Reg fpscr_reg;
if (fprf)
{
fpscr_reg = gpr.GetReg();
LDR(IndexType::Unsigned, fpscr_reg, PPC_REG, PPCSTATE_OFF(fpscr));
ANDI2R(fpscr_reg, fpscr_reg, ~FPRF_MASK);
}
FixupBranch pNaN, pLesser, pGreater;
FixupBranch continue1, continue2, continue3;
ORR(XA, ARM64Reg::ZR, 32, 0, true);
@ -317,11 +347,16 @@ void JitArm64::fcmpX(UGeckoInstruction inst)
// A == B
ORR(XA, XA, 64 - 63, 0, true);
if (fprf)
ORRI2R(fpscr_reg, fpscr_reg, PowerPC::CR_EQ << FPRF_SHIFT);
continue1 = B();
SetJumpTarget(pNaN);
MOVI2R(XA, PowerPC::ConditionRegister::PPCToInternal(PowerPC::CR_SO));
if (fprf)
ORRI2R(fpscr_reg, fpscr_reg, PowerPC::CR_SO << FPRF_SHIFT);
if (a != b)
{
@ -329,12 +364,16 @@ void JitArm64::fcmpX(UGeckoInstruction inst)
SetJumpTarget(pGreater);
ORR(XA, XA, 0, 0, true);
if (fprf)
ORRI2R(fpscr_reg, fpscr_reg, PowerPC::CR_GT << FPRF_SHIFT);
continue3 = B();
SetJumpTarget(pLesser);
ORR(XA, XA, 64 - 62, 1, true);
ORR(XA, XA, 0, 0, true);
if (fprf)
ORRI2R(fpscr_reg, fpscr_reg, PowerPC::CR_LT << FPRF_SHIFT);
SetJumpTarget(continue2);
SetJumpTarget(continue3);
@ -343,6 +382,12 @@ void JitArm64::fcmpX(UGeckoInstruction inst)
ASSERT_MSG(DYNA_REC, singles == (fpr.IsSingle(a, true) && fpr.IsSingle(b, true)),
"Register allocation turned singles into doubles in the middle of fcmpX");
if (fprf)
{
STR(IndexType::Unsigned, fpscr_reg, PPC_REG, PPCSTATE_OFF(fpscr));
gpr.Unlock(fpscr_reg);
}
}
void JitArm64::fctiwzx(UGeckoInstruction inst)
@ -371,12 +416,12 @@ void JitArm64::fctiwzx(UGeckoInstruction inst)
}
else
{
const ARM64Reg V1 = gpr.GetReg();
const ARM64Reg WA = gpr.GetReg();
m_float_emit.FCVTS(V1, EncodeRegToDouble(VB), RoundingMode::Z);
m_float_emit.FMOV(EncodeRegToSingle(VD), V1);
m_float_emit.FCVTS(WA, EncodeRegToDouble(VB), RoundingMode::Z);
m_float_emit.FMOV(EncodeRegToSingle(VD), WA);
gpr.Unlock(V1);
gpr.Unlock(WA);
}
m_float_emit.ORR(EncodeRegToDouble(VD), EncodeRegToDouble(VD), EncodeRegToDouble(V0));
fpr.Unlock(V0);

View File

@ -76,7 +76,6 @@ void JitArm64::ps_mulsX(UGeckoInstruction inst)
INSTRUCTION_START
JITDISABLE(bJITPairedOff);
FALLBACK_IF(inst.Rc);
FALLBACK_IF(SConfig::GetInstance().bFPRF && js.op->wantsFPRF);
const u32 a = inst.FA;
const u32 c = inst.FC;
@ -99,6 +98,8 @@ void JitArm64::ps_mulsX(UGeckoInstruction inst)
"Register allocation turned singles into doubles in the middle of ps_mulsX");
fpr.FixSinglePrecision(d);
SetFPRFIfNeeded(true, VD);
}
void JitArm64::ps_maddXX(UGeckoInstruction inst)
@ -106,7 +107,6 @@ void JitArm64::ps_maddXX(UGeckoInstruction inst)
INSTRUCTION_START
JITDISABLE(bJITPairedOff);
FALLBACK_IF(inst.Rc);
FALLBACK_IF(SConfig::GetInstance().bFPRF && js.op->wantsFPRF);
const u32 a = inst.FA;
const u32 b = inst.FB;
@ -257,13 +257,15 @@ void JitArm64::ps_maddXX(UGeckoInstruction inst)
break;
}
if (V0Q != ARM64Reg::INVALID_REG)
fpr.Unlock(V0Q);
ASSERT_MSG(DYNA_REC, singles == (fpr.IsSingle(a) && fpr.IsSingle(b) && fpr.IsSingle(c)),
"Register allocation turned singles into doubles in the middle of ps_maddXX");
fpr.FixSinglePrecision(d);
if (V0Q != ARM64Reg::INVALID_REG)
fpr.Unlock(V0Q);
SetFPRFIfNeeded(true, VD);
}
void JitArm64::ps_sel(UGeckoInstruction inst)
@ -311,7 +313,6 @@ void JitArm64::ps_sumX(UGeckoInstruction inst)
INSTRUCTION_START
JITDISABLE(bJITPairedOff);
FALLBACK_IF(inst.Rc);
FALLBACK_IF(SConfig::GetInstance().bFPRF && js.op->wantsFPRF);
const u32 a = inst.FA;
const u32 b = inst.FB;
@ -343,10 +344,12 @@ void JitArm64::ps_sumX(UGeckoInstruction inst)
m_float_emit.INS(size, VD, upper ? 1 : 0, V0, upper ? 1 : 0);
}
fpr.Unlock(V0);
ASSERT_MSG(DYNA_REC, singles == (fpr.IsSingle(a) && fpr.IsSingle(b) && fpr.IsSingle(c)),
"Register allocation turned singles into doubles in the middle of ps_sumX");
fpr.FixSinglePrecision(d);
fpr.Unlock(V0);
SetFPRFIfNeeded(true, VD);
}

View File

@ -4,11 +4,14 @@
#include "Common/Arm64Emitter.h"
#include "Common/CommonTypes.h"
#include "Common/FloatUtils.h"
#include "Common/JitRegister.h"
#include "Common/MathUtil.h"
#include "Core/CoreTiming.h"
#include "Core/HW/CPU.h"
#include "Core/HW/Memmap.h"
#include "Core/PowerPC/Gekko.h"
#include "Core/PowerPC/JitArm64/Jit.h"
#include "Core/PowerPC/JitCommon/JitAsmCommon.h"
#include "Core/PowerPC/JitCommon/JitCache.h"
@ -203,6 +206,12 @@ void JitArm64::GenerateCommonAsm()
GenerateConvertSingleToDouble();
JitRegister::Register(GetAsmRoutines()->cstd, GetCodePtr(), "JIT_cstd");
GetAsmRoutines()->fprf_single = GetCodePtr();
GenerateFPRF(true);
GetAsmRoutines()->fprf_double = GetCodePtr();
GenerateFPRF(false);
JitRegister::Register(GetAsmRoutines()->fprf_single, GetCodePtr(), "JIT_FPRF");
GenerateQuantizedLoadStores();
}
@ -272,6 +281,91 @@ void JitArm64::GenerateConvertSingleToDouble()
RET();
}
// Emits a routine that classifies a floating-point bit pattern and writes the resulting class
// into the FPRF field (FPRF_WIDTH bits at FPRF_SHIFT) of the FPSCR stored in PPCState, leaving
// the other FPSCR bits as they were loaded.
//
// `single` chooses the flavor of the emitted routine: when true the input is handled as a
// 32-bit pattern in W0 (float exponent/fraction masks), when false as a 64-bit pattern in X0
// (double masks).
//
// Input in X0. Outputs to memory (PPCState). Clobbers X0-X4 and flags.
void JitArm64::GenerateFPRF(bool single)
{
// Pick 32-bit or 64-bit views of the registers that touch the input value
const auto reg_encoder = single ? EncodeRegTo32 : EncodeRegTo64;
const ARM64Reg input_reg = reg_encoder(ARM64Reg::W0);
const ARM64Reg temp_reg = reg_encoder(ARM64Reg::W1);
const ARM64Reg exp_reg = reg_encoder(ARM64Reg::W2);
// The class value and the FPSCR value are always handled as 32-bit quantities
constexpr ARM64Reg fprf_reg = ARM64Reg::W3;
constexpr ARM64Reg fpscr_reg = ARM64Reg::W4;
const auto INPUT_EXP_MASK = single ? Common::FLOAT_EXP : Common::DOUBLE_EXP;
const auto INPUT_FRAC_MASK = single ? Common::FLOAT_FRAC : Common::DOUBLE_FRAC;
// Bits of the class value that are masked out of the PPC_FPCLASS_* constants below before they
// are combined with the sign-dependent value already held in fprf_reg.
// NOTE(review): presumably these are the bits in which PPC_FPCLASS_PN and PPC_FPCLASS_NN differ;
// the constants are defined outside this view - confirm against Common/FloatUtils.h.
constexpr u32 OUTPUT_SIGN_MASK = 0xC;
// This code is duplicated for the most common cases for performance.
// For the less common cases, we branch to an existing copy of this code.
// It inserts the low FPRF_WIDTH bits of fprf_reg into fpscr_reg at FPRF_SHIFT, stores the
// merged value back to PPCState and returns to the caller.
auto emit_write_fprf_and_ret = [&] {
BFI(fpscr_reg, fprf_reg, FPRF_SHIFT, FPRF_WIDTH);
STR(IndexType::Unsigned, fpscr_reg, PPC_REG, PPCSTATE_OFF(fpscr));
RET();
};
// First of all, start the load of the old FPSCR value, in case it takes a while
LDR(IndexType::Unsigned, fpscr_reg, PPC_REG, PPCSTATE_OFF(fpscr));
CMP(input_reg, 0); // Grab sign bit (conveniently the same bit for floats as for integers)
ANDI2R(exp_reg, input_reg, INPUT_EXP_MASK); // Grab exponent
// Most branches handle the sign in the same way. Perform that handling before branching
// fprf_reg = (input compared less than zero as an integer) ? PPC_FPCLASS_NN : PPC_FPCLASS_PN.
// The flags consumed by CSEL are the ones set by the CMP above.
MOVI2R(ARM64Reg::W3, Common::PPC_FPCLASS_PN);
MOVI2R(ARM64Reg::W1, Common::PPC_FPCLASS_NN);
CSEL(fprf_reg, ARM64Reg::W1, ARM64Reg::W3, CCFlags::CC_LT);
FixupBranch zero_or_denormal = CBZ(exp_reg);
// exp != 0
MOVI2R(temp_reg, INPUT_EXP_MASK);
CMP(exp_reg, temp_reg);
FixupBranch nan_or_inf = B(CCFlags::CC_EQ);
// exp != 0 && exp != EXP_MASK
// The sign-dependent value selected above is written out unchanged. The address is remembered
// so that the single-precision denormal case below can branch back to this copy.
const u8* normal = GetCodePtr();
emit_write_fprf_and_ret();
// exp == 0
SetJumpTarget(zero_or_denormal);
// Test the fraction bits; the result is consumed by the conditional branch emitted below
TSTI2R(input_reg, INPUT_FRAC_MASK);
FixupBranch denormal;
if (single)
{
// To match the interpreter, what we output should be based on how the input would be classified
// after conversion to double. Converting a denormal single to a double always results in a
// normal double, so for denormal singles we need to output PPC_FPCLASS_PN/PPC_FPCLASS_NN.
// TODO: Hardware test that the interpreter actually is correct.
B(CCFlags::CC_NEQ, normal);
}
else
{
denormal = B(CCFlags::CC_NEQ);
}
// exp == 0 && frac == 0
// Move bit 3 of the sign-dependent value down to bit 0 of W1, load the zero class with the
// OUTPUT_SIGN_MASK bits cleared, then insert that single bit at bit 4 of the result.
// NOTE(review): this relies on bit 3 being the bit that is set only in PPC_FPCLASS_NN and on
// bit 4 being the sign bit of the zero classes - confirm against the PPC_FPCLASS_* definitions.
LSR(ARM64Reg::W1, fprf_reg, 3);
MOVI2R(fprf_reg, Common::PPC_FPCLASS_PZ & ~OUTPUT_SIGN_MASK);
BFI(fprf_reg, ARM64Reg::W1, 4, 1);
// Second copy of the write-back sequence; the two less common cases below branch here
const u8* write_fprf_and_ret = GetCodePtr();
emit_write_fprf_and_ret();
// exp == 0 && frac != 0
// Only the double flavor has a forward branch to resolve here; the single flavor already
// branched to `normal` for denormal inputs.
if (!single)
SetJumpTarget(denormal);
// Combine the denormal class (without its OUTPUT_SIGN_MASK bits) with the sign-dependent value
ORRI2R(fprf_reg, fprf_reg, Common::PPC_FPCLASS_PD & ~OUTPUT_SIGN_MASK);
B(write_fprf_and_ret);
// exp == EXP_MASK
SetJumpTarget(nan_or_inf);
// frac == 0 selects the infinity class (combined with the sign-dependent value, held in W1);
// any other fraction selects PPC_FPCLASS_QNAN (held in W2) regardless of sign.
TSTI2R(input_reg, INPUT_FRAC_MASK);
ORRI2R(ARM64Reg::W1, fprf_reg, Common::PPC_FPCLASS_PINF & ~OUTPUT_SIGN_MASK);
MOVI2R(ARM64Reg::W2, Common::PPC_FPCLASS_QNAN);
CSEL(fprf_reg, ARM64Reg::W1, ARM64Reg::W2, CCFlags::CC_EQ);
B(write_fprf_and_ret);
}
void JitArm64::GenerateQuantizedLoadStores()
{
// X0 is the scale

View File

@ -27,6 +27,8 @@ struct CommonAsmRoutinesBase
const u8* mfcr;
const u8* cdts;
const u8* cstd;
const u8* fprf_single;
const u8* fprf_double;
// In: array index: GQR to use.
// In: ECX: Address to read from.

View File

@ -24,6 +24,7 @@ elseif(_M_ARM_64)
add_dolphin_test(PowerPCTest
PowerPC/DivUtilsTest.cpp
PowerPC/JitArm64/ConvertSingleDouble.cpp
PowerPC/JitArm64/FPRF.cpp
PowerPC/JitArm64/MovI2R.cpp
)
else()

View File

@ -0,0 +1,86 @@
// Copyright 2021 Dolphin Emulator Project
// Licensed under GPLv2+
// Refer to the license.txt file included.
#include <cinttypes>
#include <functional>
#include <vector>
#include "Common/Arm64Emitter.h"
#include "Common/BitUtils.h"
#include "Common/CommonTypes.h"
#include "Core/PowerPC/Interpreter/Interpreter_FPUtils.h"
#include "Core/PowerPC/JitArm64/Jit.h"
#include "Core/PowerPC/PowerPC.h"
#include "../TestValues.h"
#include <gtest/gtest.h>
namespace
{
using namespace Arm64Gen;
class TestFPRF : public JitArm64
{
public:
TestFPRF()
{
AllocCodeSpace(4096);
const u8* raw_fprf_single = GetCodePtr();
GenerateFPRF(true);
const u8* raw_fprf_double = GetCodePtr();
GenerateFPRF(false);
fprf_single = Common::BitCast<void (*)(u32)>(GetCodePtr());
MOV(ARM64Reg::X15, ARM64Reg::X30);
MOV(ARM64Reg::X14, PPC_REG);
MOVP2R(PPC_REG, &PowerPC::ppcState);
BL(raw_fprf_single);
MOV(ARM64Reg::X30, ARM64Reg::X15);
MOV(PPC_REG, ARM64Reg::X14);
RET();
fprf_double = Common::BitCast<void (*)(u64)>(GetCodePtr());
MOV(ARM64Reg::X15, ARM64Reg::X30);
MOV(ARM64Reg::X14, PPC_REG);
MOVP2R(PPC_REG, &PowerPC::ppcState);
BL(raw_fprf_double);
MOV(ARM64Reg::X30, ARM64Reg::X15);
MOV(PPC_REG, ARM64Reg::X14);
RET();
}
std::function<void(u32)> fprf_single;
std::function<void(u64)> fprf_double;
};
} // namespace
// Seeds the global FPSCR with a fixed bit pattern, runs `f`, and returns the complete FPSCR
// value afterwards. Returning the whole register (not just FPRF) lets callers detect any
// unintended change to the other FPSCR bits as well.
static u32 RunUpdateFPRF(const std::function<void()>& f)
{
  auto& fpscr = PowerPC::ppcState.fpscr;

  fpscr.Hex = 0x12345678;
  f();

  return fpscr.Hex;
}
// Checks, for every entry of double_test_values, that the JIT-generated FPRF routines leave the
// same FPSCR value behind as PowerPC::UpdateFPRF does - once for the value as a double and once
// for the value after conversion to single precision.
TEST(JitArm64, FPRF)
{
  TestFPRF test;

  for (const u64 double_bits : double_test_values)
  {
    // Double-precision input: reference implementation vs. generated routine
    const auto reference_double = [&] {
      PowerPC::UpdateFPRF(Common::BitCast<double>(double_bits));
    };
    const auto generated_double = [&] { test.fprf_double(double_bits); };

    const u32 expected_double = RunUpdateFPRF(reference_double);
    const u32 actual_double = RunUpdateFPRF(generated_double);
    EXPECT_EQ(expected_double, actual_double);

    // Single-precision input: the reference classifies the single after widening it back to a
    // double, while the generated routine receives the 32-bit pattern directly
    const u32 single_bits = ConvertToSingle(double_bits);
    const auto reference_single = [&] {
      PowerPC::UpdateFPRF(Common::BitCast<double>(ConvertToDouble(single_bits)));
    };
    const auto generated_single = [&] { test.fprf_single(single_bits); };

    const u32 expected_single = RunUpdateFPRF(reference_single);
    const u32 actual_single = RunUpdateFPRF(generated_single);
    EXPECT_EQ(expected_single, actual_single);
  }
}

View File

@ -83,6 +83,7 @@
</ItemGroup>
<ItemGroup Condition="'$(Platform)'=='ARM64'">
<ClCompile Include="Core\PowerPC\JitArm64\ConvertSingleDouble.cpp" />
<ClCompile Include="Core\PowerPC\JitArm64\FPRF.cpp" />
<ClCompile Include="Core\PowerPC\JitArm64\MovI2R.cpp" />
</ItemGroup>
<ItemGroup>