Merge pull request #9815 from JosJuice/jitarm64-fmaless
JitArm64: Implement FMA-less path for FMA instructions
This commit is contained in:
commit
bcd1831339
|
@ -6,8 +6,10 @@
|
||||||
#include "Common/Arm64Emitter.h"
|
#include "Common/Arm64Emitter.h"
|
||||||
#include "Common/CPUDetect.h"
|
#include "Common/CPUDetect.h"
|
||||||
#include "Common/CommonTypes.h"
|
#include "Common/CommonTypes.h"
|
||||||
|
#include "Common/Config/Config.h"
|
||||||
#include "Common/StringUtil.h"
|
#include "Common/StringUtil.h"
|
||||||
|
|
||||||
|
#include "Core/Config/SessionSettings.h"
|
||||||
#include "Core/ConfigManager.h"
|
#include "Core/ConfigManager.h"
|
||||||
#include "Core/Core.h"
|
#include "Core/Core.h"
|
||||||
#include "Core/CoreTiming.h"
|
#include "Core/CoreTiming.h"
|
||||||
|
@ -91,6 +93,7 @@ void JitArm64::fp_arith(UGeckoInstruction inst)
|
||||||
ARM64Reg VA{}, VB{}, VC{}, VD{};
|
ARM64Reg VA{}, VB{}, VC{}, VD{};
|
||||||
|
|
||||||
ARM64Reg V0Q = ARM64Reg::INVALID_REG;
|
ARM64Reg V0Q = ARM64Reg::INVALID_REG;
|
||||||
|
ARM64Reg V1Q = ARM64Reg::INVALID_REG;
|
||||||
|
|
||||||
if (packed)
|
if (packed)
|
||||||
{
|
{
|
||||||
|
@ -153,17 +156,26 @@ void JitArm64::fp_arith(UGeckoInstruction inst)
|
||||||
VC = reg_encoder(fpr.R(c, type));
|
VC = reg_encoder(fpr.R(c, type));
|
||||||
VD = reg_encoder(fpr.RW(d, type_out));
|
VD = reg_encoder(fpr.RW(d, type_out));
|
||||||
|
|
||||||
|
const bool inaccurate_fma = op5 > 25 && !Config::Get(Config::SESSION_USE_FMA);
|
||||||
|
|
||||||
if (round_c)
|
if (round_c)
|
||||||
{
|
{
|
||||||
ASSERT_MSG(DYNA_REC, !inputs_are_singles, "Tried to apply 25-bit precision to single");
|
ASSERT_MSG(DYNA_REC, !inputs_are_singles, "Tried to apply 25-bit precision to single");
|
||||||
|
|
||||||
V0Q = fpr.GetReg();
|
V0Q = fpr.GetReg();
|
||||||
const ARM64Reg V1Q = fpr.GetReg();
|
V1Q = fpr.GetReg();
|
||||||
|
|
||||||
Force25BitPrecision(reg_encoder(V0Q), VC, reg_encoder(V1Q));
|
Force25BitPrecision(reg_encoder(V1Q), VC, reg_encoder(V0Q));
|
||||||
VC = reg_encoder(V0Q);
|
VC = reg_encoder(V1Q);
|
||||||
|
}
|
||||||
|
|
||||||
fpr.Unlock(V1Q);
|
ARM64Reg inaccurate_fma_temp_reg = VD;
|
||||||
|
if (inaccurate_fma && d == b)
|
||||||
|
{
|
||||||
|
if (V0Q == ARM64Reg::INVALID_REG)
|
||||||
|
V0Q = fpr.GetReg();
|
||||||
|
|
||||||
|
inaccurate_fma_temp_reg = reg_encoder(V0Q);
|
||||||
}
|
}
|
||||||
|
|
||||||
switch (op5)
|
switch (op5)
|
||||||
|
@ -180,23 +192,37 @@ void JitArm64::fp_arith(UGeckoInstruction inst)
|
||||||
case 25:
|
case 25:
|
||||||
m_float_emit.FMUL(VD, VA, VC);
|
m_float_emit.FMUL(VD, VA, VC);
|
||||||
break;
|
break;
|
||||||
case 28: // fmsub: "D = A*C - B" vs "Vd = (-Va) + Vn*Vm"
|
|
||||||
m_float_emit.FNMSUB(VD, VA, VC, VB);
|
|
||||||
break;
|
|
||||||
case 29: // fmadd: "D = A*C + B" vs "Vd = Va + Vn*Vm"
|
|
||||||
m_float_emit.FMADD(VD, VA, VC, VB);
|
|
||||||
break;
|
|
||||||
// While it may seem like PowerPC's nmadd/nmsub map to AArch64's nmadd/msub [sic],
|
// While it may seem like PowerPC's nmadd/nmsub map to AArch64's nmadd/msub [sic],
|
||||||
// the subtly different definitions affect how signed zeroes are handled.
|
// the subtly different definitions affect how signed zeroes are handled.
|
||||||
// Also, PowerPC's nmadd/nmsub perform rounding before the final negation.
|
// Also, PowerPC's nmadd/nmsub perform rounding before the final negation.
|
||||||
// So, negate using a separate instruction instead of using AArch64's nmadd/msub.
|
// So, we negate using a separate FNEG instruction instead of using AArch64's nmadd/msub.
|
||||||
|
case 28: // fmsub: "D = A*C - B" vs "Vd = (-Va) + Vn*Vm"
|
||||||
case 30: // fnmsub: "D = -(A*C - B)" vs "Vd = -((-Va) + Vn*Vm)"
|
case 30: // fnmsub: "D = -(A*C - B)" vs "Vd = -((-Va) + Vn*Vm)"
|
||||||
m_float_emit.FNMSUB(VD, VA, VC, VB);
|
if (inaccurate_fma)
|
||||||
m_float_emit.FNEG(VD, VD);
|
{
|
||||||
|
m_float_emit.FMUL(inaccurate_fma_temp_reg, VA, VC);
|
||||||
|
m_float_emit.FSUB(VD, inaccurate_fma_temp_reg, VB);
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
m_float_emit.FNMSUB(VD, VA, VC, VB);
|
||||||
|
}
|
||||||
|
if (op5 == 30)
|
||||||
|
m_float_emit.FNEG(VD, VD);
|
||||||
break;
|
break;
|
||||||
|
case 29: // fmadd: "D = A*C + B" vs "Vd = Va + Vn*Vm"
|
||||||
case 31: // fnmadd: "D = -(A*C + B)" vs "Vd = -(Va + Vn*Vm)"
|
case 31: // fnmadd: "D = -(A*C + B)" vs "Vd = -(Va + Vn*Vm)"
|
||||||
m_float_emit.FMADD(VD, VA, VC, VB);
|
if (inaccurate_fma)
|
||||||
m_float_emit.FNEG(VD, VD);
|
{
|
||||||
|
m_float_emit.FMUL(inaccurate_fma_temp_reg, VA, VC);
|
||||||
|
m_float_emit.FADD(VD, inaccurate_fma_temp_reg, VB);
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
m_float_emit.FMADD(VD, VA, VC, VB);
|
||||||
|
}
|
||||||
|
if (op5 == 31)
|
||||||
|
m_float_emit.FNEG(VD, VD);
|
||||||
break;
|
break;
|
||||||
default:
|
default:
|
||||||
ASSERT_MSG(DYNA_REC, 0, "fp_arith");
|
ASSERT_MSG(DYNA_REC, 0, "fp_arith");
|
||||||
|
@ -206,6 +232,8 @@ void JitArm64::fp_arith(UGeckoInstruction inst)
|
||||||
|
|
||||||
if (V0Q != ARM64Reg::INVALID_REG)
|
if (V0Q != ARM64Reg::INVALID_REG)
|
||||||
fpr.Unlock(V0Q);
|
fpr.Unlock(V0Q);
|
||||||
|
if (V1Q != ARM64Reg::INVALID_REG)
|
||||||
|
fpr.Unlock(V1Q);
|
||||||
|
|
||||||
if (outputs_are_singles)
|
if (outputs_are_singles)
|
||||||
{
|
{
|
||||||
|
|
|
@ -5,8 +5,10 @@
|
||||||
|
|
||||||
#include "Common/Arm64Emitter.h"
|
#include "Common/Arm64Emitter.h"
|
||||||
#include "Common/CommonTypes.h"
|
#include "Common/CommonTypes.h"
|
||||||
|
#include "Common/Config/Config.h"
|
||||||
#include "Common/StringUtil.h"
|
#include "Common/StringUtil.h"
|
||||||
|
|
||||||
|
#include "Core/Config/SessionSettings.h"
|
||||||
#include "Core/ConfigManager.h"
|
#include "Core/ConfigManager.h"
|
||||||
#include "Core/Core.h"
|
#include "Core/Core.h"
|
||||||
#include "Core/CoreTiming.h"
|
#include "Core/CoreTiming.h"
|
||||||
|
@ -135,6 +137,7 @@ void JitArm64::ps_maddXX(UGeckoInstruction inst)
|
||||||
const u32 d = inst.FD;
|
const u32 d = inst.FD;
|
||||||
const u32 op5 = inst.SUBOP5;
|
const u32 op5 = inst.SUBOP5;
|
||||||
|
|
||||||
|
const bool inaccurate_fma = !Config::Get(Config::SESSION_USE_FMA);
|
||||||
const bool singles = fpr.IsSingle(a) && fpr.IsSingle(b) && fpr.IsSingle(c);
|
const bool singles = fpr.IsSingle(a) && fpr.IsSingle(b) && fpr.IsSingle(c);
|
||||||
const bool round_c = !js.op->fprIsSingle[inst.FC];
|
const bool round_c = !js.op->fprIsSingle[inst.FC];
|
||||||
const RegType type = singles ? RegType::Single : RegType::Register;
|
const RegType type = singles ? RegType::Single : RegType::Register;
|
||||||
|
@ -169,11 +172,23 @@ void JitArm64::ps_maddXX(UGeckoInstruction inst)
|
||||||
VC = reg_encoder(V1Q);
|
VC = reg_encoder(V1Q);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
ARM64Reg inaccurate_fma_temp_reg = VD;
|
||||||
|
if (inaccurate_fma && d == b)
|
||||||
|
{
|
||||||
|
allocate_v0_if_needed();
|
||||||
|
inaccurate_fma_temp_reg = V0;
|
||||||
|
}
|
||||||
|
|
||||||
ARM64Reg result_reg = VD;
|
ARM64Reg result_reg = VD;
|
||||||
switch (op5)
|
switch (op5)
|
||||||
{
|
{
|
||||||
case 14: // ps_madds0: d = a * c.ps0 + b
|
case 14: // ps_madds0: d = a * c.ps0 + b
|
||||||
if (VD == VB)
|
if (inaccurate_fma)
|
||||||
|
{
|
||||||
|
m_float_emit.FMUL(size, inaccurate_fma_temp_reg, VA, VC, 0);
|
||||||
|
m_float_emit.FADD(size, VD, inaccurate_fma_temp_reg, VB);
|
||||||
|
}
|
||||||
|
else if (VD == VB)
|
||||||
{
|
{
|
||||||
m_float_emit.FMLA(size, VD, VA, VC, 0);
|
m_float_emit.FMLA(size, VD, VA, VC, 0);
|
||||||
}
|
}
|
||||||
|
@ -191,7 +206,12 @@ void JitArm64::ps_maddXX(UGeckoInstruction inst)
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
case 15: // ps_madds1: d = a * c.ps1 + b
|
case 15: // ps_madds1: d = a * c.ps1 + b
|
||||||
if (VD == VB)
|
if (inaccurate_fma)
|
||||||
|
{
|
||||||
|
m_float_emit.FMUL(size, inaccurate_fma_temp_reg, VA, VC, 1);
|
||||||
|
m_float_emit.FADD(size, VD, inaccurate_fma_temp_reg, VB);
|
||||||
|
}
|
||||||
|
else if (VD == VB)
|
||||||
{
|
{
|
||||||
m_float_emit.FMLA(size, VD, VA, VC, 1);
|
m_float_emit.FMLA(size, VD, VA, VC, 1);
|
||||||
}
|
}
|
||||||
|
@ -210,7 +230,12 @@ void JitArm64::ps_maddXX(UGeckoInstruction inst)
|
||||||
break;
|
break;
|
||||||
case 28: // ps_msub: d = a * c - b
|
case 28: // ps_msub: d = a * c - b
|
||||||
case 30: // ps_nmsub: d = -(a * c - b)
|
case 30: // ps_nmsub: d = -(a * c - b)
|
||||||
if (VD != VA && VD != VC)
|
if (inaccurate_fma)
|
||||||
|
{
|
||||||
|
m_float_emit.FMUL(size, inaccurate_fma_temp_reg, VA, VC);
|
||||||
|
m_float_emit.FSUB(size, VD, inaccurate_fma_temp_reg, VB);
|
||||||
|
}
|
||||||
|
else if (VD != VA && VD != VC)
|
||||||
{
|
{
|
||||||
m_float_emit.FNEG(size, VD, VB);
|
m_float_emit.FNEG(size, VD, VB);
|
||||||
m_float_emit.FMLA(size, VD, VA, VC);
|
m_float_emit.FMLA(size, VD, VA, VC);
|
||||||
|
@ -225,7 +250,12 @@ void JitArm64::ps_maddXX(UGeckoInstruction inst)
|
||||||
break;
|
break;
|
||||||
case 29: // ps_madd: d = a * c + b
|
case 29: // ps_madd: d = a * c + b
|
||||||
case 31: // ps_nmadd: d = -(a * c + b)
|
case 31: // ps_nmadd: d = -(a * c + b)
|
||||||
if (VD == VB)
|
if (inaccurate_fma)
|
||||||
|
{
|
||||||
|
m_float_emit.FMUL(size, inaccurate_fma_temp_reg, VA, VC);
|
||||||
|
m_float_emit.FADD(size, VD, inaccurate_fma_temp_reg, VB);
|
||||||
|
}
|
||||||
|
else if (VD == VB)
|
||||||
{
|
{
|
||||||
m_float_emit.FMLA(size, VD, VA, VC);
|
m_float_emit.FMLA(size, VD, VA, VC);
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue