Merge pull request #11147 from JosJuice/jitarm64-arith-org
JitArm64: Merge ps_mulsX, ps_maddXX, and parts of fp_arith
This commit is contained in:
commit
5b69c67b3a
|
@ -152,9 +152,8 @@ public:
|
||||||
void frsqrtex(UGeckoInstruction inst);
|
void frsqrtex(UGeckoInstruction inst);
|
||||||
|
|
||||||
// Paired
|
// Paired
|
||||||
void ps_maddXX(UGeckoInstruction inst);
|
|
||||||
void ps_mergeXX(UGeckoInstruction inst);
|
void ps_mergeXX(UGeckoInstruction inst);
|
||||||
void ps_mulsX(UGeckoInstruction inst);
|
void ps_arith(UGeckoInstruction inst);
|
||||||
void ps_sel(UGeckoInstruction inst);
|
void ps_sel(UGeckoInstruction inst);
|
||||||
void ps_sumX(UGeckoInstruction inst);
|
void ps_sumX(UGeckoInstruction inst);
|
||||||
void ps_res(UGeckoInstruction inst);
|
void ps_res(UGeckoInstruction inst);
|
||||||
|
|
|
@ -69,86 +69,35 @@ void JitArm64::fp_arith(UGeckoInstruction inst)
|
||||||
u32 a = inst.FA, b = inst.FB, c = inst.FC, d = inst.FD;
|
u32 a = inst.FA, b = inst.FB, c = inst.FC, d = inst.FD;
|
||||||
u32 op5 = inst.SUBOP5;
|
u32 op5 = inst.SUBOP5;
|
||||||
|
|
||||||
bool single = inst.OPCD == 59;
|
|
||||||
bool packed = inst.OPCD == 4;
|
|
||||||
|
|
||||||
const bool use_c = op5 >= 25; // fmul and all kind of fmaddXX
|
const bool use_c = op5 >= 25; // fmul and all kind of fmaddXX
|
||||||
const bool use_b = op5 != 25; // fmul uses no B
|
const bool use_b = op5 != 25; // fmul uses no B
|
||||||
|
|
||||||
const bool outputs_are_singles = single || packed;
|
const bool output_is_single = inst.OPCD == 59;
|
||||||
const bool round_c = use_c && outputs_are_singles && !js.op->fprIsSingle[inst.FC];
|
const bool inaccurate_fma = op5 > 25 && !Config::Get(Config::SESSION_USE_FMA);
|
||||||
|
const bool round_c = use_c && output_is_single && !js.op->fprIsSingle[inst.FC];
|
||||||
|
|
||||||
const auto inputs_are_singles_func = [&] {
|
const auto inputs_are_singles_func = [&] {
|
||||||
return fpr.IsSingle(a, !packed) && (!use_b || fpr.IsSingle(b, !packed)) &&
|
return fpr.IsSingle(a, true) && (!use_b || fpr.IsSingle(b, true)) &&
|
||||||
(!use_c || fpr.IsSingle(c, !packed));
|
(!use_c || fpr.IsSingle(c, true));
|
||||||
};
|
};
|
||||||
const bool inputs_are_singles = inputs_are_singles_func();
|
const bool inputs_are_singles = inputs_are_singles_func();
|
||||||
|
|
||||||
ARM64Reg VA{}, VB{}, VC{}, VD{};
|
const RegType type =
|
||||||
|
(inputs_are_singles && output_is_single) ? RegType::LowerPairSingle : RegType::LowerPair;
|
||||||
|
const RegType type_out =
|
||||||
|
output_is_single ? (inputs_are_singles ? RegType::DuplicatedSingle : RegType::Duplicated) :
|
||||||
|
RegType::LowerPair;
|
||||||
|
const auto reg_encoder =
|
||||||
|
(inputs_are_singles && output_is_single) ? EncodeRegToSingle : EncodeRegToDouble;
|
||||||
|
|
||||||
|
const ARM64Reg VA = reg_encoder(fpr.R(a, type));
|
||||||
|
const ARM64Reg VB = use_b ? reg_encoder(fpr.R(b, type)) : ARM64Reg::INVALID_REG;
|
||||||
|
ARM64Reg VC = use_c ? reg_encoder(fpr.R(c, type)) : ARM64Reg::INVALID_REG;
|
||||||
|
const ARM64Reg VD = reg_encoder(fpr.RW(d, type_out));
|
||||||
|
|
||||||
ARM64Reg V0Q = ARM64Reg::INVALID_REG;
|
ARM64Reg V0Q = ARM64Reg::INVALID_REG;
|
||||||
ARM64Reg V1Q = ARM64Reg::INVALID_REG;
|
ARM64Reg V1Q = ARM64Reg::INVALID_REG;
|
||||||
|
|
||||||
if (packed)
|
|
||||||
{
|
|
||||||
const RegType type = inputs_are_singles ? RegType::Single : RegType::Register;
|
|
||||||
const u8 size = inputs_are_singles ? 32 : 64;
|
|
||||||
const auto reg_encoder = inputs_are_singles ? EncodeRegToDouble : EncodeRegToQuad;
|
|
||||||
|
|
||||||
VA = reg_encoder(fpr.R(a, type));
|
|
||||||
if (use_b)
|
|
||||||
VB = reg_encoder(fpr.R(b, type));
|
|
||||||
if (use_c)
|
|
||||||
VC = reg_encoder(fpr.R(c, type));
|
|
||||||
VD = reg_encoder(fpr.RW(d, type));
|
|
||||||
|
|
||||||
if (round_c)
|
|
||||||
{
|
|
||||||
ASSERT_MSG(DYNA_REC, !inputs_are_singles, "Tried to apply 25-bit precision to single");
|
|
||||||
|
|
||||||
V0Q = fpr.GetReg();
|
|
||||||
|
|
||||||
Force25BitPrecision(reg_encoder(V0Q), VC);
|
|
||||||
VC = reg_encoder(V0Q);
|
|
||||||
}
|
|
||||||
|
|
||||||
switch (op5)
|
|
||||||
{
|
|
||||||
case 18:
|
|
||||||
m_float_emit.FDIV(size, VD, VA, VB);
|
|
||||||
break;
|
|
||||||
case 20:
|
|
||||||
m_float_emit.FSUB(size, VD, VA, VB);
|
|
||||||
break;
|
|
||||||
case 21:
|
|
||||||
m_float_emit.FADD(size, VD, VA, VB);
|
|
||||||
break;
|
|
||||||
case 25:
|
|
||||||
m_float_emit.FMUL(size, VD, VA, VC);
|
|
||||||
break;
|
|
||||||
default:
|
|
||||||
ASSERT_MSG(DYNA_REC, 0, "fp_arith");
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
const RegType type =
|
|
||||||
(inputs_are_singles && single) ? RegType::LowerPairSingle : RegType::LowerPair;
|
|
||||||
const RegType type_out =
|
|
||||||
single ? (inputs_are_singles ? RegType::DuplicatedSingle : RegType::Duplicated) :
|
|
||||||
RegType::LowerPair;
|
|
||||||
const auto reg_encoder = (inputs_are_singles && single) ? EncodeRegToSingle : EncodeRegToDouble;
|
|
||||||
|
|
||||||
VA = reg_encoder(fpr.R(a, type));
|
|
||||||
if (use_b)
|
|
||||||
VB = reg_encoder(fpr.R(b, type));
|
|
||||||
if (use_c)
|
|
||||||
VC = reg_encoder(fpr.R(c, type));
|
|
||||||
VD = reg_encoder(fpr.RW(d, type_out));
|
|
||||||
|
|
||||||
const bool inaccurate_fma = op5 > 25 && !Config::Get(Config::SESSION_USE_FMA);
|
|
||||||
|
|
||||||
if (round_c)
|
if (round_c)
|
||||||
{
|
{
|
||||||
ASSERT_MSG(DYNA_REC, !inputs_are_singles, "Tried to apply 25-bit precision to single");
|
ASSERT_MSG(DYNA_REC, !inputs_are_singles, "Tried to apply 25-bit precision to single");
|
||||||
|
@ -217,14 +166,13 @@ void JitArm64::fp_arith(UGeckoInstruction inst)
|
||||||
ASSERT_MSG(DYNA_REC, 0, "fp_arith");
|
ASSERT_MSG(DYNA_REC, 0, "fp_arith");
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
}
|
|
||||||
|
|
||||||
if (V0Q != ARM64Reg::INVALID_REG)
|
if (V0Q != ARM64Reg::INVALID_REG)
|
||||||
fpr.Unlock(V0Q);
|
fpr.Unlock(V0Q);
|
||||||
if (V1Q != ARM64Reg::INVALID_REG)
|
if (V1Q != ARM64Reg::INVALID_REG)
|
||||||
fpr.Unlock(V1Q);
|
fpr.Unlock(V1Q);
|
||||||
|
|
||||||
if (outputs_are_singles)
|
if (output_is_single)
|
||||||
{
|
{
|
||||||
ASSERT_MSG(DYNA_REC, inputs_are_singles == inputs_are_singles_func(),
|
ASSERT_MSG(DYNA_REC, inputs_are_singles == inputs_are_singles_func(),
|
||||||
"Register allocation turned singles into doubles in the middle of fp_arith");
|
"Register allocation turned singles into doubles in the middle of fp_arith");
|
||||||
|
@ -232,7 +180,7 @@ void JitArm64::fp_arith(UGeckoInstruction inst)
|
||||||
fpr.FixSinglePrecision(d);
|
fpr.FixSinglePrecision(d);
|
||||||
}
|
}
|
||||||
|
|
||||||
SetFPRFIfNeeded(outputs_are_singles, VD);
|
SetFPRFIfNeeded(output_is_single, VD);
|
||||||
}
|
}
|
||||||
|
|
||||||
void JitArm64::fp_logic(UGeckoInstruction inst)
|
void JitArm64::fp_logic(UGeckoInstruction inst)
|
||||||
|
|
|
@ -73,55 +73,7 @@ void JitArm64::ps_mergeXX(UGeckoInstruction inst)
|
||||||
"Register allocation turned singles into doubles in the middle of ps_mergeXX");
|
"Register allocation turned singles into doubles in the middle of ps_mergeXX");
|
||||||
}
|
}
|
||||||
|
|
||||||
void JitArm64::ps_mulsX(UGeckoInstruction inst)
|
void JitArm64::ps_arith(UGeckoInstruction inst)
|
||||||
{
|
|
||||||
INSTRUCTION_START
|
|
||||||
JITDISABLE(bJITPairedOff);
|
|
||||||
FALLBACK_IF(inst.Rc);
|
|
||||||
FALLBACK_IF(jo.fp_exceptions);
|
|
||||||
|
|
||||||
const u32 a = inst.FA;
|
|
||||||
const u32 c = inst.FC;
|
|
||||||
const u32 d = inst.FD;
|
|
||||||
|
|
||||||
const bool upper = inst.SUBOP5 == 13;
|
|
||||||
|
|
||||||
const bool singles = fpr.IsSingle(a) && fpr.IsSingle(c);
|
|
||||||
const bool round_c = !js.op->fprIsSingle[inst.FC];
|
|
||||||
const RegType type = singles ? RegType::Single : RegType::Register;
|
|
||||||
const u8 size = singles ? 32 : 64;
|
|
||||||
const auto reg_encoder = singles ? EncodeRegToDouble : EncodeRegToQuad;
|
|
||||||
|
|
||||||
const ARM64Reg VA = fpr.R(a, type);
|
|
||||||
ARM64Reg VC = fpr.R(c, type);
|
|
||||||
const ARM64Reg VD = fpr.RW(d, type);
|
|
||||||
|
|
||||||
ARM64Reg V0Q = ARM64Reg::INVALID_REG;
|
|
||||||
|
|
||||||
if (round_c)
|
|
||||||
{
|
|
||||||
ASSERT_MSG(DYNA_REC, !singles, "Tried to apply 25-bit precision to single");
|
|
||||||
|
|
||||||
V0Q = fpr.GetReg();
|
|
||||||
|
|
||||||
Force25BitPrecision(reg_encoder(V0Q), reg_encoder(VC));
|
|
||||||
VC = reg_encoder(V0Q);
|
|
||||||
}
|
|
||||||
|
|
||||||
m_float_emit.FMUL(size, reg_encoder(VD), reg_encoder(VA), reg_encoder(VC), upper ? 1 : 0);
|
|
||||||
|
|
||||||
if (V0Q != ARM64Reg::INVALID_REG)
|
|
||||||
fpr.Unlock(V0Q);
|
|
||||||
|
|
||||||
ASSERT_MSG(DYNA_REC, singles == (fpr.IsSingle(a) && fpr.IsSingle(c)),
|
|
||||||
"Register allocation turned singles into doubles in the middle of ps_mulsX");
|
|
||||||
|
|
||||||
fpr.FixSinglePrecision(d);
|
|
||||||
|
|
||||||
SetFPRFIfNeeded(true, VD);
|
|
||||||
}
|
|
||||||
|
|
||||||
void JitArm64::ps_maddXX(UGeckoInstruction inst)
|
|
||||||
{
|
{
|
||||||
INSTRUCTION_START
|
INSTRUCTION_START
|
||||||
JITDISABLE(bJITPairedOff);
|
JITDISABLE(bJITPairedOff);
|
||||||
|
@ -134,16 +86,23 @@ void JitArm64::ps_maddXX(UGeckoInstruction inst)
|
||||||
const u32 d = inst.FD;
|
const u32 d = inst.FD;
|
||||||
const u32 op5 = inst.SUBOP5;
|
const u32 op5 = inst.SUBOP5;
|
||||||
|
|
||||||
|
const bool use_c = op5 == 25 || (op5 & ~0x13) == 12; // mul, muls, and all kinds of maddXX
|
||||||
|
const bool use_b = op5 != 25 && (op5 & ~0x1) != 12; // mul and muls don't use B
|
||||||
|
|
||||||
|
const auto singles_func = [&] {
|
||||||
|
return fpr.IsSingle(a) && (!use_b || fpr.IsSingle(b)) && (!use_c || fpr.IsSingle(c));
|
||||||
|
};
|
||||||
|
const bool singles = singles_func();
|
||||||
|
|
||||||
const bool inaccurate_fma = !Config::Get(Config::SESSION_USE_FMA);
|
const bool inaccurate_fma = !Config::Get(Config::SESSION_USE_FMA);
|
||||||
const bool singles = fpr.IsSingle(a) && fpr.IsSingle(b) && fpr.IsSingle(c);
|
const bool round_c = use_c && !js.op->fprIsSingle[inst.FC];
|
||||||
const bool round_c = !js.op->fprIsSingle[inst.FC];
|
|
||||||
const RegType type = singles ? RegType::Single : RegType::Register;
|
const RegType type = singles ? RegType::Single : RegType::Register;
|
||||||
const u8 size = singles ? 32 : 64;
|
const u8 size = singles ? 32 : 64;
|
||||||
const auto reg_encoder = singles ? EncodeRegToDouble : EncodeRegToQuad;
|
const auto reg_encoder = singles ? EncodeRegToDouble : EncodeRegToQuad;
|
||||||
|
|
||||||
const ARM64Reg VA = reg_encoder(fpr.R(a, type));
|
const ARM64Reg VA = reg_encoder(fpr.R(a, type));
|
||||||
const ARM64Reg VB = reg_encoder(fpr.R(b, type));
|
const ARM64Reg VB = use_b ? reg_encoder(fpr.R(b, type)) : ARM64Reg::INVALID_REG;
|
||||||
ARM64Reg VC = reg_encoder(fpr.R(c, type));
|
ARM64Reg VC = use_c ? reg_encoder(fpr.R(c, type)) : ARM64Reg::INVALID_REG;
|
||||||
const ARM64Reg VD = reg_encoder(fpr.RW(d, type));
|
const ARM64Reg VD = reg_encoder(fpr.RW(d, type));
|
||||||
|
|
||||||
ARM64Reg V0Q = ARM64Reg::INVALID_REG;
|
ARM64Reg V0Q = ARM64Reg::INVALID_REG;
|
||||||
|
@ -178,6 +137,12 @@ void JitArm64::ps_maddXX(UGeckoInstruction inst)
|
||||||
ARM64Reg result_reg = VD;
|
ARM64Reg result_reg = VD;
|
||||||
switch (op5)
|
switch (op5)
|
||||||
{
|
{
|
||||||
|
case 12: // ps_muls0: d = a * c.ps0
|
||||||
|
m_float_emit.FMUL(size, VD, VA, VC, 0);
|
||||||
|
break;
|
||||||
|
case 13: // ps_muls1: d = a * c.ps1
|
||||||
|
m_float_emit.FMUL(size, VD, VA, VC, 1);
|
||||||
|
break;
|
||||||
case 14: // ps_madds0: d = a * c.ps0 + b
|
case 14: // ps_madds0: d = a * c.ps0 + b
|
||||||
if (inaccurate_fma)
|
if (inaccurate_fma)
|
||||||
{
|
{
|
||||||
|
@ -224,6 +189,18 @@ void JitArm64::ps_maddXX(UGeckoInstruction inst)
|
||||||
result_reg = V0;
|
result_reg = V0;
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
|
case 18: // ps_div
|
||||||
|
m_float_emit.FDIV(size, VD, VA, VB);
|
||||||
|
break;
|
||||||
|
case 20: // ps_sub
|
||||||
|
m_float_emit.FSUB(size, VD, VA, VB);
|
||||||
|
break;
|
||||||
|
case 21: // ps_add
|
||||||
|
m_float_emit.FADD(size, VD, VA, VB);
|
||||||
|
break;
|
||||||
|
case 25: // ps_mul
|
||||||
|
m_float_emit.FMUL(size, VD, VA, VC);
|
||||||
|
break;
|
||||||
case 28: // ps_msub: d = a * c - b
|
case 28: // ps_msub: d = a * c - b
|
||||||
case 30: // ps_nmsub: d = -(a * c - b)
|
case 30: // ps_nmsub: d = -(a * c - b)
|
||||||
if (inaccurate_fma)
|
if (inaccurate_fma)
|
||||||
|
@ -269,7 +246,7 @@ void JitArm64::ps_maddXX(UGeckoInstruction inst)
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
default:
|
default:
|
||||||
ASSERT_MSG(DYNA_REC, 0, "ps_madd - invalid op");
|
ASSERT_MSG(DYNA_REC, 0, "ps_arith - invalid op");
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -292,8 +269,8 @@ void JitArm64::ps_maddXX(UGeckoInstruction inst)
|
||||||
if (V1Q != ARM64Reg::INVALID_REG)
|
if (V1Q != ARM64Reg::INVALID_REG)
|
||||||
fpr.Unlock(V1Q);
|
fpr.Unlock(V1Q);
|
||||||
|
|
||||||
ASSERT_MSG(DYNA_REC, singles == (fpr.IsSingle(a) && fpr.IsSingle(b) && fpr.IsSingle(c)),
|
ASSERT_MSG(DYNA_REC, singles == singles_func(),
|
||||||
"Register allocation turned singles into doubles in the middle of ps_maddXX");
|
"Register allocation turned singles into doubles in the middle of ps_arith");
|
||||||
|
|
||||||
fpr.FixSinglePrecision(d);
|
fpr.FixSinglePrecision(d);
|
||||||
|
|
||||||
|
|
|
@ -108,21 +108,21 @@ constexpr std::array<GekkoOPTemplate, 13> table4{{
|
||||||
constexpr std::array<GekkoOPTemplate, 17> table4_2{{
|
constexpr std::array<GekkoOPTemplate, 17> table4_2{{
|
||||||
{10, &JitArm64::ps_sumX}, // ps_sum0
|
{10, &JitArm64::ps_sumX}, // ps_sum0
|
||||||
{11, &JitArm64::ps_sumX}, // ps_sum1
|
{11, &JitArm64::ps_sumX}, // ps_sum1
|
||||||
{12, &JitArm64::ps_mulsX}, // ps_muls0
|
{12, &JitArm64::ps_arith}, // ps_muls0
|
||||||
{13, &JitArm64::ps_mulsX}, // ps_muls1
|
{13, &JitArm64::ps_arith}, // ps_muls1
|
||||||
{14, &JitArm64::ps_maddXX}, // ps_madds0
|
{14, &JitArm64::ps_arith}, // ps_madds0
|
||||||
{15, &JitArm64::ps_maddXX}, // ps_madds1
|
{15, &JitArm64::ps_arith}, // ps_madds1
|
||||||
{18, &JitArm64::fp_arith}, // ps_div
|
{18, &JitArm64::ps_arith}, // ps_div
|
||||||
{20, &JitArm64::fp_arith}, // ps_sub
|
{20, &JitArm64::ps_arith}, // ps_sub
|
||||||
{21, &JitArm64::fp_arith}, // ps_add
|
{21, &JitArm64::ps_arith}, // ps_add
|
||||||
{23, &JitArm64::ps_sel}, // ps_sel
|
{23, &JitArm64::ps_sel}, // ps_sel
|
||||||
{24, &JitArm64::ps_res}, // ps_res
|
{24, &JitArm64::ps_res}, // ps_res
|
||||||
{25, &JitArm64::fp_arith}, // ps_mul
|
{25, &JitArm64::ps_arith}, // ps_mul
|
||||||
{26, &JitArm64::ps_rsqrte}, // ps_rsqrte
|
{26, &JitArm64::ps_rsqrte}, // ps_rsqrte
|
||||||
{28, &JitArm64::ps_maddXX}, // ps_msub
|
{28, &JitArm64::ps_arith}, // ps_msub
|
||||||
{29, &JitArm64::ps_maddXX}, // ps_madd
|
{29, &JitArm64::ps_arith}, // ps_madd
|
||||||
{30, &JitArm64::ps_maddXX}, // ps_nmsub
|
{30, &JitArm64::ps_arith}, // ps_nmsub
|
||||||
{31, &JitArm64::ps_maddXX}, // ps_nmadd
|
{31, &JitArm64::ps_arith}, // ps_nmadd
|
||||||
}};
|
}};
|
||||||
|
|
||||||
constexpr std::array<GekkoOPTemplate, 4> table4_3{{
|
constexpr std::array<GekkoOPTemplate, 4> table4_3{{
|
||||||
|
|
Loading…
Reference in New Issue