Merge pull request #9712 from JosJuice/jitarm64-fmul-rounding
JitArm64: Fix fmul rounding issues
This commit is contained in:
commit
1054abc9cc
|
@ -2294,6 +2294,15 @@ void ARM64FloatEmitter::EmitScalar2Source(bool M, bool S, u32 type, u32 opcode,
|
||||||
(opcode << 12) | (1 << 11) | (DecodeReg(Rn) << 5) | DecodeReg(Rd));
|
(opcode << 12) | (1 << 11) | (DecodeReg(Rn) << 5) | DecodeReg(Rd));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void ARM64FloatEmitter::EmitScalarThreeSame(bool U, u32 size, u32 opcode, ARM64Reg Rd, ARM64Reg Rn,
|
||||||
|
ARM64Reg Rm)
|
||||||
|
{
|
||||||
|
ASSERT_MSG(DYNA_REC, !IsQuad(Rd), "%s only supports double and single registers!", __func__);
|
||||||
|
|
||||||
|
Write32((1 << 30) | (U << 29) | (0b11110001 << 21) | (size << 22) | (DecodeReg(Rm) << 16) |
|
||||||
|
(opcode << 11) | (1 << 10) | (DecodeReg(Rn) << 5) | DecodeReg(Rd));
|
||||||
|
}
|
||||||
|
|
||||||
void ARM64FloatEmitter::EmitThreeSame(bool U, u32 size, u32 opcode, ARM64Reg Rd, ARM64Reg Rn,
|
void ARM64FloatEmitter::EmitThreeSame(bool U, u32 size, u32 opcode, ARM64Reg Rd, ARM64Reg Rn,
|
||||||
ARM64Reg Rm)
|
ARM64Reg Rm)
|
||||||
{
|
{
|
||||||
|
@ -3118,6 +3127,11 @@ void ARM64FloatEmitter::FRSQRTE(ARM64Reg Rd, ARM64Reg Rn)
|
||||||
}
|
}
|
||||||
|
|
||||||
// Scalar - 2 Source
|
// Scalar - 2 Source
|
||||||
|
// Scalar ADD. Only double (D) registers are accepted as the destination; the instruction is
// emitted through EmitScalarThreeSame with U = 0, size = 3 and opcode = 0b10000.
void ARM64FloatEmitter::ADD(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm)
{
  ASSERT_MSG(DYNA_REC, IsDouble(Rd), "%s only supports double registers!", __func__);

  constexpr bool u_bit = false;
  constexpr u32 size_field = 3;
  constexpr u32 add_opcode = 0b10000;
  EmitScalarThreeSame(u_bit, size_field, add_opcode, Rd, Rn, Rm);
}
|
||||||
void ARM64FloatEmitter::FADD(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm)
|
void ARM64FloatEmitter::FADD(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm)
|
||||||
{
|
{
|
||||||
EmitScalar2Source(0, 0, IsDouble(Rd), 2, Rd, Rn, Rm);
|
EmitScalar2Source(0, 0, IsDouble(Rd), 2, Rd, Rn, Rm);
|
||||||
|
@ -3189,10 +3203,18 @@ void ARM64FloatEmitter::FMOV(ARM64Reg Rd, uint8_t imm8)
|
||||||
}
|
}
|
||||||
|
|
||||||
// Vector
|
// Vector
|
||||||
|
// Vector integer ADD: adds Rn and Rm lane by lane and writes the result to Rd.
//
// size is the lane width in bits and must be 8, 16, 32 or 64.
//
// Unlike the floating point vector instructions (which only have a one-bit "sz" field, so
// `size >> 6` is enough there), integer ADD has a two-bit size field:
//   0b00 = 8-bit lanes, 0b01 = 16-bit lanes, 0b10 = 32-bit lanes, 0b11 = 64-bit lanes.
// Using `size >> 6` here would encode a 64-bit add as a 16-bit-lane add and lose any carry
// that crosses a 16-bit boundary.
void ARM64FloatEmitter::ADD(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm)
{
  u32 encoded_size = 0;
  switch (size)
  {
  case 8:
    encoded_size = 0;
    break;
  case 16:
    encoded_size = 1;
    break;
  case 32:
    encoded_size = 2;
    break;
  case 64:
    encoded_size = 3;
    break;
  default:
    ASSERT_MSG(DYNA_REC, false, "%s only supports sizes of {8, 16, 32, 64}!", __func__);
    break;
  }

  EmitThreeSame(0, encoded_size, 0b10000, Rd, Rn, Rm);
}
|
||||||
void ARM64FloatEmitter::AND(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm)
|
void ARM64FloatEmitter::AND(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm)
|
||||||
{
|
{
|
||||||
EmitThreeSame(0, 0, 3, Rd, Rn, Rm);
|
EmitThreeSame(0, 0, 3, Rd, Rn, Rm);
|
||||||
}
|
}
|
||||||
|
// Vector BIC, register form.
// Shares opcode 3 with the other register-form logical operations in this file; the value
// passed as EmitThreeSame's size argument picks the operation (AND = 0, BIC = 1, ORR = 2,
// ORN = 3).
void ARM64FloatEmitter::BIC(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm)
{
  constexpr u32 logical_opcode = 3;
  constexpr u32 bic_selector = 1;
  EmitThreeSame(false, bic_selector, logical_opcode, Rd, Rn, Rm);
}
|
||||||
void ARM64FloatEmitter::BSL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm)
|
void ARM64FloatEmitter::BSL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm)
|
||||||
{
|
{
|
||||||
EmitThreeSame(1, 1, 3, Rd, Rn, Rm);
|
EmitThreeSame(1, 1, 3, Rd, Rn, Rm);
|
||||||
|
@ -3300,6 +3322,10 @@ void ARM64FloatEmitter::ORR(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm)
|
||||||
{
|
{
|
||||||
EmitThreeSame(0, 2, 3, Rd, Rn, Rm);
|
EmitThreeSame(0, 2, 3, Rd, Rn, Rm);
|
||||||
}
|
}
|
||||||
|
// Vector ORN, register form.
// Shares opcode 3 with the other register-form logical operations in this file; the value
// passed as EmitThreeSame's size argument picks the operation (AND = 0, BIC = 1, ORR = 2,
// ORN = 3).
void ARM64FloatEmitter::ORN(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm)
{
  constexpr u32 logical_opcode = 3;
  constexpr u32 orn_selector = 3;
  EmitThreeSame(false, orn_selector, logical_opcode, Rd, Rn, Rm);
}
|
||||||
void ARM64FloatEmitter::REV16(u8 size, ARM64Reg Rd, ARM64Reg Rn)
|
void ARM64FloatEmitter::REV16(u8 size, ARM64Reg Rd, ARM64Reg Rn)
|
||||||
{
|
{
|
||||||
Emit2RegMisc(IsQuad(Rd), 0, size >> 4, 1, Rd, Rn);
|
Emit2RegMisc(IsQuad(Rd), 0, size >> 4, 1, Rd, Rn);
|
||||||
|
@ -3879,11 +3905,10 @@ void ARM64FloatEmitter::MOVI(u8 size, ARM64Reg Rd, u64 imm, u8 shift)
|
||||||
EncodeModImm(Q, op, cmode, 0, Rd, abcdefgh);
|
EncodeModImm(Q, op, cmode, 0, Rd, abcdefgh);
|
||||||
}
|
}
|
||||||
|
|
||||||
void ARM64FloatEmitter::BIC(u8 size, ARM64Reg Rd, u8 imm, u8 shift)
|
void ARM64FloatEmitter::ORR_BIC(u8 size, ARM64Reg Rd, u8 imm, u8 shift, u8 op)
|
||||||
{
|
{
|
||||||
bool Q = IsQuad(Rd);
|
bool Q = IsQuad(Rd);
|
||||||
u8 cmode = 1;
|
u8 cmode = 1;
|
||||||
u8 op = 1;
|
|
||||||
if (size == 16)
|
if (size == 16)
|
||||||
{
|
{
|
||||||
ASSERT_MSG(DYNA_REC, shift == 0 || shift == 8, "%s(size16) only supports shift of {0, 8}!",
|
ASSERT_MSG(DYNA_REC, shift == 0 || shift == 8, "%s(size16) only supports shift of {0, 8}!",
|
||||||
|
@ -3919,6 +3944,16 @@ void ARM64FloatEmitter::BIC(u8 size, ARM64Reg Rd, u8 imm, u8 shift)
|
||||||
EncodeModImm(Q, op, cmode, 0, Rd, imm);
|
EncodeModImm(Q, op, cmode, 0, Rd, imm);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Vector ORR, immediate form. Delegates to the shared ORR/BIC immediate encoder, passing
// op = 0 to select ORR.
void ARM64FloatEmitter::ORR(u8 size, ARM64Reg Rd, u8 imm, u8 shift)
{
  constexpr u8 orr_op = 0;
  ORR_BIC(size, Rd, imm, shift, orr_op);
}
|
||||||
|
|
||||||
|
// Vector BIC, immediate form. Delegates to the shared ORR/BIC immediate encoder, passing
// op = 1 to select BIC.
void ARM64FloatEmitter::BIC(u8 size, ARM64Reg Rd, u8 imm, u8 shift)
{
  constexpr u8 bic_op = 1;
  ORR_BIC(size, Rd, imm, shift, bic_op);
}
|
||||||
|
|
||||||
void ARM64FloatEmitter::ABI_PushRegisters(BitSet32 registers, ARM64Reg tmp)
|
void ARM64FloatEmitter::ABI_PushRegisters(BitSet32 registers, ARM64Reg tmp)
|
||||||
{
|
{
|
||||||
bool bundled_loadstore = false;
|
bool bundled_loadstore = false;
|
||||||
|
|
|
@ -1000,6 +1000,7 @@ public:
|
||||||
void FRSQRTE(ARM64Reg Rd, ARM64Reg Rn);
|
void FRSQRTE(ARM64Reg Rd, ARM64Reg Rn);
|
||||||
|
|
||||||
// Scalar - 2 Source
|
// Scalar - 2 Source
|
||||||
|
void ADD(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
|
||||||
void FADD(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
|
void FADD(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
|
||||||
void FMUL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
|
void FMUL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
|
||||||
void FSUB(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
|
void FSUB(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
|
||||||
|
@ -1020,7 +1021,9 @@ public:
|
||||||
void FMOV(ARM64Reg Rd, uint8_t imm8);
|
void FMOV(ARM64Reg Rd, uint8_t imm8);
|
||||||
|
|
||||||
// Vector
|
// Vector
|
||||||
|
void ADD(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
|
||||||
void AND(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
|
void AND(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
|
||||||
|
void BIC(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
|
||||||
void BSL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
|
void BSL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
|
||||||
void DUP(u8 size, ARM64Reg Rd, ARM64Reg Rn, u8 index);
|
void DUP(u8 size, ARM64Reg Rd, ARM64Reg Rn, u8 index);
|
||||||
void FABS(u8 size, ARM64Reg Rd, ARM64Reg Rn);
|
void FABS(u8 size, ARM64Reg Rd, ARM64Reg Rn);
|
||||||
|
@ -1043,6 +1046,7 @@ public:
|
||||||
void FSUB(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
|
void FSUB(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
|
||||||
void NOT(ARM64Reg Rd, ARM64Reg Rn);
|
void NOT(ARM64Reg Rd, ARM64Reg Rn);
|
||||||
void ORR(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
|
void ORR(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
|
||||||
|
void ORN(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
|
||||||
void MOV(ARM64Reg Rd, ARM64Reg Rn) { ORR(Rd, Rn, Rn); }
|
void MOV(ARM64Reg Rd, ARM64Reg Rn) { ORR(Rd, Rn, Rn); }
|
||||||
void REV16(u8 size, ARM64Reg Rd, ARM64Reg Rn);
|
void REV16(u8 size, ARM64Reg Rd, ARM64Reg Rn);
|
||||||
void REV32(u8 size, ARM64Reg Rd, ARM64Reg Rn);
|
void REV32(u8 size, ARM64Reg Rd, ARM64Reg Rn);
|
||||||
|
@ -1128,6 +1132,7 @@ public:
|
||||||
|
|
||||||
// Modified Immediate
|
// Modified Immediate
|
||||||
void MOVI(u8 size, ARM64Reg Rd, u64 imm, u8 shift = 0);
|
void MOVI(u8 size, ARM64Reg Rd, u64 imm, u8 shift = 0);
|
||||||
|
void ORR(u8 size, ARM64Reg Rd, u8 imm, u8 shift = 0);
|
||||||
void BIC(u8 size, ARM64Reg Rd, u8 imm, u8 shift = 0);
|
void BIC(u8 size, ARM64Reg Rd, u8 imm, u8 shift = 0);
|
||||||
|
|
||||||
void MOVI2F(ARM64Reg Rd, float value, ARM64Reg scratch = ARM64Reg::INVALID_REG,
|
void MOVI2F(ARM64Reg Rd, float value, ARM64Reg scratch = ARM64Reg::INVALID_REG,
|
||||||
|
@ -1145,6 +1150,7 @@ private:
|
||||||
void EmitLoadStoreImmediate(u8 size, u32 opc, IndexType type, ARM64Reg Rt, ARM64Reg Rn, s32 imm);
|
void EmitLoadStoreImmediate(u8 size, u32 opc, IndexType type, ARM64Reg Rt, ARM64Reg Rn, s32 imm);
|
||||||
void EmitScalar2Source(bool M, bool S, u32 type, u32 opcode, ARM64Reg Rd, ARM64Reg Rn,
|
void EmitScalar2Source(bool M, bool S, u32 type, u32 opcode, ARM64Reg Rd, ARM64Reg Rn,
|
||||||
ARM64Reg Rm);
|
ARM64Reg Rm);
|
||||||
|
void EmitScalarThreeSame(bool U, u32 size, u32 opcode, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
|
||||||
void EmitThreeSame(bool U, u32 size, u32 opcode, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
|
void EmitThreeSame(bool U, u32 size, u32 opcode, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
|
||||||
void EmitCopy(bool Q, u32 op, u32 imm5, u32 imm4, ARM64Reg Rd, ARM64Reg Rn);
|
void EmitCopy(bool Q, u32 op, u32 imm5, u32 imm4, ARM64Reg Rd, ARM64Reg Rn);
|
||||||
void EmitScalar2RegMisc(bool U, u32 size, u32 opcode, ARM64Reg Rd, ARM64Reg Rn);
|
void EmitScalar2RegMisc(bool U, u32 size, u32 opcode, ARM64Reg Rd, ARM64Reg Rn);
|
||||||
|
@ -1178,6 +1184,8 @@ private:
|
||||||
void EncodeLoadStoreRegisterOffset(u32 size, bool load, ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm);
|
void EncodeLoadStoreRegisterOffset(u32 size, bool load, ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm);
|
||||||
void EncodeModImm(bool Q, u8 op, u8 cmode, u8 o2, ARM64Reg Rd, u8 abcdefgh);
|
void EncodeModImm(bool Q, u8 op, u8 cmode, u8 o2, ARM64Reg Rd, u8 abcdefgh);
|
||||||
|
|
||||||
|
void ORR_BIC(u8 size, ARM64Reg Rd, u8 imm, u8 shift, u8 op);
|
||||||
|
|
||||||
void SSHLL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift, bool upper);
|
void SSHLL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift, bool upper);
|
||||||
void USHLL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift, bool upper);
|
void USHLL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift, bool upper);
|
||||||
void SHRN(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift, bool upper);
|
void SHRN(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift, bool upper);
|
||||||
|
|
|
@ -270,6 +270,8 @@ protected:
|
||||||
bool Rc = false);
|
bool Rc = false);
|
||||||
|
|
||||||
void SetFPRFIfNeeded(bool single, Arm64Gen::ARM64Reg reg);
|
void SetFPRFIfNeeded(bool single, Arm64Gen::ARM64Reg reg);
|
||||||
|
void Force25BitPrecision(Arm64Gen::ARM64Reg output, Arm64Gen::ARM64Reg input,
|
||||||
|
Arm64Gen::ARM64Reg temp);
|
||||||
|
|
||||||
// <Fastmem fault location, slowmem handler location>
|
// <Fastmem fault location, slowmem handler location>
|
||||||
std::map<const u8*, FastmemArea> m_fault_to_handler;
|
std::map<const u8*, FastmemArea> m_fault_to_handler;
|
||||||
|
|
|
@ -39,6 +39,29 @@ void JitArm64::SetFPRFIfNeeded(bool single, ARM64Reg reg)
|
||||||
gpr.Unlock(ARM64Reg::W0, ARM64Reg::W1, ARM64Reg::W2, ARM64Reg::W3, ARM64Reg::W4, ARM64Reg::W30);
|
gpr.Unlock(ARM64Reg::W0, ARM64Reg::W1, ARM64Reg::W2, ARM64Reg::W3, ARM64Reg::W4, ARM64Reg::W30);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Emulate the odd truncation/rounding that the PowerPC does on the RHS operand before
|
||||||
|
// a single precision multiply. To be precise, it drops the low 28 bits of the mantissa,
|
||||||
|
// rounding to nearest as it does.
|
||||||
|
void JitArm64::Force25BitPrecision(ARM64Reg output, ARM64Reg input, ARM64Reg temp)
|
||||||
|
{
|
||||||
|
ASSERT(output != input && output != temp && input != temp);
|
||||||
|
|
||||||
|
// temp = 0x0000'0000'0800'0000ULL
|
||||||
|
// output = 0xFFFF'FFFF'F800'0000ULL
|
||||||
|
m_float_emit.MOVI(32, temp, 0x08, 24);
|
||||||
|
m_float_emit.MOVI(64, output, 0xFFFF'FFFF'0000'0000ULL);
|
||||||
|
m_float_emit.BIC(temp, temp, output);
|
||||||
|
m_float_emit.ORR(32, output, 0xF8, 24);
|
||||||
|
|
||||||
|
// output = (input & ~0xFFFFFFF) + ((input & (1ULL << 27)) << 1)
|
||||||
|
m_float_emit.AND(temp, input, temp);
|
||||||
|
m_float_emit.AND(output, input, output);
|
||||||
|
if (IsQuad(input))
|
||||||
|
m_float_emit.ADD(64, output, output, temp);
|
||||||
|
else
|
||||||
|
m_float_emit.ADD(output, output, temp);
|
||||||
|
}
|
||||||
|
|
||||||
void JitArm64::fp_arith(UGeckoInstruction inst)
|
void JitArm64::fp_arith(UGeckoInstruction inst)
|
||||||
{
|
{
|
||||||
INSTRUCTION_START
|
INSTRUCTION_START
|
||||||
|
@ -51,8 +74,11 @@ void JitArm64::fp_arith(UGeckoInstruction inst)
|
||||||
bool single = inst.OPCD == 59;
|
bool single = inst.OPCD == 59;
|
||||||
bool packed = inst.OPCD == 4;
|
bool packed = inst.OPCD == 4;
|
||||||
|
|
||||||
bool use_c = op5 >= 25; // fmul and all kind of fmaddXX
|
const bool use_c = op5 >= 25; // fmul and all kind of fmaddXX
|
||||||
bool use_b = op5 != 25; // fmul uses no B
|
const bool use_b = op5 != 25; // fmul uses no B
|
||||||
|
|
||||||
|
const bool outputs_are_singles = single || packed;
|
||||||
|
const bool round_c = use_c && outputs_are_singles && !js.op->fprIsSingle[inst.FC];
|
||||||
|
|
||||||
const auto inputs_are_singles_func = [&] {
|
const auto inputs_are_singles_func = [&] {
|
||||||
return fpr.IsSingle(a, !packed) && (!use_b || fpr.IsSingle(b, !packed)) &&
|
return fpr.IsSingle(a, !packed) && (!use_b || fpr.IsSingle(b, !packed)) &&
|
||||||
|
@ -62,6 +88,8 @@ void JitArm64::fp_arith(UGeckoInstruction inst)
|
||||||
|
|
||||||
ARM64Reg VA{}, VB{}, VC{}, VD{};
|
ARM64Reg VA{}, VB{}, VC{}, VD{};
|
||||||
|
|
||||||
|
ARM64Reg V0Q = ARM64Reg::INVALID_REG;
|
||||||
|
|
||||||
if (packed)
|
if (packed)
|
||||||
{
|
{
|
||||||
const RegType type = inputs_are_singles ? RegType::Single : RegType::Register;
|
const RegType type = inputs_are_singles ? RegType::Single : RegType::Register;
|
||||||
|
@ -75,6 +103,19 @@ void JitArm64::fp_arith(UGeckoInstruction inst)
|
||||||
VC = reg_encoder(fpr.R(c, type));
|
VC = reg_encoder(fpr.R(c, type));
|
||||||
VD = reg_encoder(fpr.RW(d, type));
|
VD = reg_encoder(fpr.RW(d, type));
|
||||||
|
|
||||||
|
if (round_c)
|
||||||
|
{
|
||||||
|
ASSERT_MSG(DYNA_REC, !inputs_are_singles, "Tried to apply 25-bit precision to single");
|
||||||
|
|
||||||
|
V0Q = fpr.GetReg();
|
||||||
|
const ARM64Reg V1Q = fpr.GetReg();
|
||||||
|
|
||||||
|
Force25BitPrecision(reg_encoder(V0Q), VC, reg_encoder(V1Q));
|
||||||
|
VC = reg_encoder(V0Q);
|
||||||
|
|
||||||
|
fpr.Unlock(V1Q);
|
||||||
|
}
|
||||||
|
|
||||||
switch (op5)
|
switch (op5)
|
||||||
{
|
{
|
||||||
case 18:
|
case 18:
|
||||||
|
@ -110,6 +151,19 @@ void JitArm64::fp_arith(UGeckoInstruction inst)
|
||||||
VC = reg_encoder(fpr.R(c, type));
|
VC = reg_encoder(fpr.R(c, type));
|
||||||
VD = reg_encoder(fpr.RW(d, type_out));
|
VD = reg_encoder(fpr.RW(d, type_out));
|
||||||
|
|
||||||
|
if (round_c)
|
||||||
|
{
|
||||||
|
ASSERT_MSG(DYNA_REC, !inputs_are_singles, "Tried to apply 25-bit precision to single");
|
||||||
|
|
||||||
|
V0Q = fpr.GetReg();
|
||||||
|
const ARM64Reg V1Q = fpr.GetReg();
|
||||||
|
|
||||||
|
Force25BitPrecision(reg_encoder(V0Q), VC, reg_encoder(V1Q));
|
||||||
|
VC = reg_encoder(V0Q);
|
||||||
|
|
||||||
|
fpr.Unlock(V1Q);
|
||||||
|
}
|
||||||
|
|
||||||
switch (op5)
|
switch (op5)
|
||||||
{
|
{
|
||||||
case 18:
|
case 18:
|
||||||
|
@ -142,7 +196,8 @@ void JitArm64::fp_arith(UGeckoInstruction inst)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
const bool outputs_are_singles = single || packed;
|
if (V0Q != ARM64Reg::INVALID_REG)
|
||||||
|
fpr.Unlock(V0Q);
|
||||||
|
|
||||||
if (outputs_are_singles)
|
if (outputs_are_singles)
|
||||||
{
|
{
|
||||||
|
|
|
@ -84,16 +84,35 @@ void JitArm64::ps_mulsX(UGeckoInstruction inst)
|
||||||
const bool upper = inst.SUBOP5 == 13;
|
const bool upper = inst.SUBOP5 == 13;
|
||||||
|
|
||||||
const bool singles = fpr.IsSingle(a) && fpr.IsSingle(c);
|
const bool singles = fpr.IsSingle(a) && fpr.IsSingle(c);
|
||||||
|
const bool round_c = !js.op->fprIsSingle[inst.FC];
|
||||||
const RegType type = singles ? RegType::Single : RegType::Register;
|
const RegType type = singles ? RegType::Single : RegType::Register;
|
||||||
const u8 size = singles ? 32 : 64;
|
const u8 size = singles ? 32 : 64;
|
||||||
const auto reg_encoder = singles ? EncodeRegToDouble : EncodeRegToQuad;
|
const auto reg_encoder = singles ? EncodeRegToDouble : EncodeRegToQuad;
|
||||||
|
|
||||||
const ARM64Reg VA = fpr.R(a, type);
|
const ARM64Reg VA = fpr.R(a, type);
|
||||||
const ARM64Reg VC = fpr.R(c, type);
|
ARM64Reg VC = fpr.R(c, type);
|
||||||
const ARM64Reg VD = fpr.RW(d, type);
|
const ARM64Reg VD = fpr.RW(d, type);
|
||||||
|
|
||||||
|
ARM64Reg V0Q = ARM64Reg::INVALID_REG;
|
||||||
|
|
||||||
|
if (round_c)
|
||||||
|
{
|
||||||
|
ASSERT_MSG(DYNA_REC, !singles, "Tried to apply 25-bit precision to single");
|
||||||
|
|
||||||
|
V0Q = fpr.GetReg();
|
||||||
|
const ARM64Reg V1Q = fpr.GetReg();
|
||||||
|
|
||||||
|
Force25BitPrecision(reg_encoder(V0Q), reg_encoder(VC), reg_encoder(V1Q));
|
||||||
|
VC = reg_encoder(V0Q);
|
||||||
|
|
||||||
|
fpr.Unlock(V1Q);
|
||||||
|
}
|
||||||
|
|
||||||
m_float_emit.FMUL(size, reg_encoder(VD), reg_encoder(VA), reg_encoder(VC), upper ? 1 : 0);
|
m_float_emit.FMUL(size, reg_encoder(VD), reg_encoder(VA), reg_encoder(VC), upper ? 1 : 0);
|
||||||
|
|
||||||
|
if (V0Q != ARM64Reg::INVALID_REG)
|
||||||
|
fpr.Unlock(V0Q);
|
||||||
|
|
||||||
ASSERT_MSG(DYNA_REC, singles == (fpr.IsSingle(a) && fpr.IsSingle(c)),
|
ASSERT_MSG(DYNA_REC, singles == (fpr.IsSingle(a) && fpr.IsSingle(c)),
|
||||||
"Register allocation turned singles into doubles in the middle of ps_mulsX");
|
"Register allocation turned singles into doubles in the middle of ps_mulsX");
|
||||||
|
|
||||||
|
@ -115,31 +134,45 @@ void JitArm64::ps_maddXX(UGeckoInstruction inst)
|
||||||
const u32 op5 = inst.SUBOP5;
|
const u32 op5 = inst.SUBOP5;
|
||||||
|
|
||||||
const bool singles = fpr.IsSingle(a) && fpr.IsSingle(b) && fpr.IsSingle(c);
|
const bool singles = fpr.IsSingle(a) && fpr.IsSingle(b) && fpr.IsSingle(c);
|
||||||
|
const bool round_c = !js.op->fprIsSingle[inst.FC];
|
||||||
const RegType type = singles ? RegType::Single : RegType::Register;
|
const RegType type = singles ? RegType::Single : RegType::Register;
|
||||||
const u8 size = singles ? 32 : 64;
|
const u8 size = singles ? 32 : 64;
|
||||||
const auto reg_encoder = singles ? EncodeRegToDouble : EncodeRegToQuad;
|
const auto reg_encoder = singles ? EncodeRegToDouble : EncodeRegToQuad;
|
||||||
|
|
||||||
const ARM64Reg VA = reg_encoder(fpr.R(a, type));
|
const ARM64Reg VA = reg_encoder(fpr.R(a, type));
|
||||||
const ARM64Reg VB = reg_encoder(fpr.R(b, type));
|
const ARM64Reg VB = reg_encoder(fpr.R(b, type));
|
||||||
const ARM64Reg VC = reg_encoder(fpr.R(c, type));
|
ARM64Reg VC = reg_encoder(fpr.R(c, type));
|
||||||
const ARM64Reg VD = reg_encoder(fpr.RW(d, type));
|
const ARM64Reg VD = reg_encoder(fpr.RW(d, type));
|
||||||
|
|
||||||
ARM64Reg V0Q = ARM64Reg::INVALID_REG;
|
ARM64Reg V0Q = ARM64Reg::INVALID_REG;
|
||||||
ARM64Reg V0 = ARM64Reg::INVALID_REG;
|
ARM64Reg V0 = ARM64Reg::INVALID_REG;
|
||||||
if (d != b && (d == a || d == c))
|
ARM64Reg V1Q = ARM64Reg::INVALID_REG;
|
||||||
|
|
||||||
|
if (round_c || (d != b && (d == a || d == c)))
|
||||||
{
|
{
|
||||||
V0Q = fpr.GetReg();
|
V0Q = fpr.GetReg();
|
||||||
V0 = reg_encoder(V0Q);
|
V0 = reg_encoder(V0Q);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (round_c)
|
||||||
|
{
|
||||||
|
ASSERT_MSG(DYNA_REC, !singles, "Tried to apply 25-bit precision to single");
|
||||||
|
|
||||||
|
V1Q = fpr.GetReg();
|
||||||
|
|
||||||
|
Force25BitPrecision(reg_encoder(V1Q), VC, V0);
|
||||||
|
VC = reg_encoder(V1Q);
|
||||||
|
}
|
||||||
|
|
||||||
switch (op5)
|
switch (op5)
|
||||||
{
|
{
|
||||||
case 14: // ps_madds0
|
case 14: // ps_madds0
|
||||||
// d = a * c.ps0 + b
|
// d = a * c.ps0 + b
|
||||||
if (d == b)
|
if (VD == VB)
|
||||||
{
|
{
|
||||||
m_float_emit.FMLA(size, VD, VA, VC, 0);
|
m_float_emit.FMLA(size, VD, VA, VC, 0);
|
||||||
}
|
}
|
||||||
else if (d != a && d != c)
|
else if (VD != VA && VD != VC)
|
||||||
{
|
{
|
||||||
m_float_emit.MOV(VD, VB);
|
m_float_emit.MOV(VD, VB);
|
||||||
m_float_emit.FMLA(size, VD, VA, VC, 0);
|
m_float_emit.FMLA(size, VD, VA, VC, 0);
|
||||||
|
@ -153,11 +186,11 @@ void JitArm64::ps_maddXX(UGeckoInstruction inst)
|
||||||
break;
|
break;
|
||||||
case 15: // ps_madds1
|
case 15: // ps_madds1
|
||||||
// d = a * c.ps1 + b
|
// d = a * c.ps1 + b
|
||||||
if (d == b)
|
if (VD == VB)
|
||||||
{
|
{
|
||||||
m_float_emit.FMLA(size, VD, VA, VC, 1);
|
m_float_emit.FMLA(size, VD, VA, VC, 1);
|
||||||
}
|
}
|
||||||
else if (d != a && d != c)
|
else if (VD != VA && VD != VC)
|
||||||
{
|
{
|
||||||
m_float_emit.MOV(VD, VB);
|
m_float_emit.MOV(VD, VB);
|
||||||
m_float_emit.FMLA(size, VD, VA, VC, 1);
|
m_float_emit.FMLA(size, VD, VA, VC, 1);
|
||||||
|
@ -171,14 +204,14 @@ void JitArm64::ps_maddXX(UGeckoInstruction inst)
|
||||||
break;
|
break;
|
||||||
case 28: // ps_msub
|
case 28: // ps_msub
|
||||||
// d = a * c - b
|
// d = a * c - b
|
||||||
if (d == b)
|
if (VD == VB)
|
||||||
{
|
{
|
||||||
// d = -(-a * c + b)
|
// d = -(-a * c + b)
|
||||||
// rounding is incorrect if the rounding mode is +/- infinity
|
// rounding is incorrect if the rounding mode is +/- infinity
|
||||||
m_float_emit.FMLS(size, VD, VA, VC);
|
m_float_emit.FMLS(size, VD, VA, VC);
|
||||||
m_float_emit.FNEG(size, VD, VD);
|
m_float_emit.FNEG(size, VD, VD);
|
||||||
}
|
}
|
||||||
else if (d != a && d != c)
|
else if (VD != VA && VD != VC)
|
||||||
{
|
{
|
||||||
m_float_emit.FNEG(size, VD, VB);
|
m_float_emit.FNEG(size, VD, VB);
|
||||||
m_float_emit.FMLA(size, VD, VA, VC);
|
m_float_emit.FMLA(size, VD, VA, VC);
|
||||||
|
@ -192,11 +225,11 @@ void JitArm64::ps_maddXX(UGeckoInstruction inst)
|
||||||
break;
|
break;
|
||||||
case 29: // ps_madd
|
case 29: // ps_madd
|
||||||
// d = a * c + b
|
// d = a * c + b
|
||||||
if (d == b)
|
if (VD == VB)
|
||||||
{
|
{
|
||||||
m_float_emit.FMLA(size, VD, VA, VC);
|
m_float_emit.FMLA(size, VD, VA, VC);
|
||||||
}
|
}
|
||||||
else if (d != a && d != c)
|
else if (VD != VA && VD != VC)
|
||||||
{
|
{
|
||||||
m_float_emit.MOV(VD, VB);
|
m_float_emit.MOV(VD, VB);
|
||||||
m_float_emit.FMLA(size, VD, VA, VC);
|
m_float_emit.FMLA(size, VD, VA, VC);
|
||||||
|
@ -215,11 +248,11 @@ void JitArm64::ps_maddXX(UGeckoInstruction inst)
|
||||||
// Note: PowerPC rounds before the final negation.
|
// Note: PowerPC rounds before the final negation.
|
||||||
// We don't handle this at the moment because it's
|
// We don't handle this at the moment because it's
|
||||||
// only relevant when rounding to +/- infinity.
|
// only relevant when rounding to +/- infinity.
|
||||||
if (d == b)
|
if (VD == VB)
|
||||||
{
|
{
|
||||||
m_float_emit.FMLS(size, VD, VA, VC);
|
m_float_emit.FMLS(size, VD, VA, VC);
|
||||||
}
|
}
|
||||||
else if (d != a && d != c)
|
else if (VD != VA && VD != VC)
|
||||||
{
|
{
|
||||||
m_float_emit.MOV(VD, VB);
|
m_float_emit.MOV(VD, VB);
|
||||||
m_float_emit.FMLS(size, VD, VA, VC);
|
m_float_emit.FMLS(size, VD, VA, VC);
|
||||||
|
@ -233,12 +266,12 @@ void JitArm64::ps_maddXX(UGeckoInstruction inst)
|
||||||
break;
|
break;
|
||||||
case 31: // ps_nmadd
|
case 31: // ps_nmadd
|
||||||
// d = -(a * c + b)
|
// d = -(a * c + b)
|
||||||
if (d == b)
|
if (VD == VB)
|
||||||
{
|
{
|
||||||
m_float_emit.FMLA(size, VD, VA, VC);
|
m_float_emit.FMLA(size, VD, VA, VC);
|
||||||
m_float_emit.FNEG(size, VD, VD);
|
m_float_emit.FNEG(size, VD, VD);
|
||||||
}
|
}
|
||||||
else if (d != a && d != c)
|
else if (VD != VA && VD != VC)
|
||||||
{
|
{
|
||||||
// d = -a * c - b
|
// d = -a * c - b
|
||||||
// See rounding note at ps_nmsub.
|
// See rounding note at ps_nmsub.
|
||||||
|
@ -259,6 +292,8 @@ void JitArm64::ps_maddXX(UGeckoInstruction inst)
|
||||||
|
|
||||||
if (V0Q != ARM64Reg::INVALID_REG)
|
if (V0Q != ARM64Reg::INVALID_REG)
|
||||||
fpr.Unlock(V0Q);
|
fpr.Unlock(V0Q);
|
||||||
|
if (V1Q != ARM64Reg::INVALID_REG)
|
||||||
|
fpr.Unlock(V1Q);
|
||||||
|
|
||||||
ASSERT_MSG(DYNA_REC, singles == (fpr.IsSingle(a) && fpr.IsSingle(b) && fpr.IsSingle(c)),
|
ASSERT_MSG(DYNA_REC, singles == (fpr.IsSingle(a) && fpr.IsSingle(b) && fpr.IsSingle(c)),
|
||||||
"Register allocation turned singles into doubles in the middle of ps_maddXX");
|
"Register allocation turned singles into doubles in the middle of ps_maddXX");
|
||||||
|
|
|
@ -979,7 +979,19 @@ u32 PPCAnalyzer::Analyze(u32 address, CodeBlock* block, CodeBuffer* buffer, std:
|
||||||
op.fprIsStoreSafeBeforeInst = fprIsStoreSafe;
|
op.fprIsStoreSafeBeforeInst = fprIsStoreSafe;
|
||||||
if (op.fregOut >= 0)
|
if (op.fregOut >= 0)
|
||||||
{
|
{
|
||||||
if (op.opinfo->type == OpType::SingleFP)
|
BitSet32 bitexact_inputs;
|
||||||
|
if (op.opinfo->flags &
|
||||||
|
(FL_IN_FLOAT_A_BITEXACT | FL_IN_FLOAT_B_BITEXACT | FL_IN_FLOAT_C_BITEXACT))
|
||||||
|
{
|
||||||
|
if (op.opinfo->flags & FL_IN_FLOAT_A_BITEXACT)
|
||||||
|
bitexact_inputs[op.inst.FA] = true;
|
||||||
|
if (op.opinfo->flags & FL_IN_FLOAT_B_BITEXACT)
|
||||||
|
bitexact_inputs[op.inst.FB] = true;
|
||||||
|
if (op.opinfo->flags & FL_IN_FLOAT_C_BITEXACT)
|
||||||
|
bitexact_inputs[op.inst.FC] = true;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (op.opinfo->type == OpType::SingleFP || !strncmp(op.opinfo->opname, "frsp", 4))
|
||||||
{
|
{
|
||||||
fprIsSingle[op.fregOut] = true;
|
fprIsSingle[op.fregOut] = true;
|
||||||
fprIsDuplicated[op.fregOut] = true;
|
fprIsDuplicated[op.fregOut] = true;
|
||||||
|
@ -989,6 +1001,11 @@ u32 PPCAnalyzer::Analyze(u32 address, CodeBlock* block, CodeBuffer* buffer, std:
|
||||||
fprIsSingle[op.fregOut] = true;
|
fprIsSingle[op.fregOut] = true;
|
||||||
fprIsDuplicated[op.fregOut] = true;
|
fprIsDuplicated[op.fregOut] = true;
|
||||||
}
|
}
|
||||||
|
else if (bitexact_inputs)
|
||||||
|
{
|
||||||
|
fprIsSingle[op.fregOut] = (fprIsSingle & bitexact_inputs) == bitexact_inputs;
|
||||||
|
fprIsDuplicated[op.fregOut] = false;
|
||||||
|
}
|
||||||
else if (op.opinfo->type == OpType::PS || op.opinfo->type == OpType::LoadPS)
|
else if (op.opinfo->type == OpType::PS || op.opinfo->type == OpType::LoadPS)
|
||||||
{
|
{
|
||||||
fprIsSingle[op.fregOut] = true;
|
fprIsSingle[op.fregOut] = true;
|
||||||
|
@ -1007,20 +1024,10 @@ u32 PPCAnalyzer::Analyze(u32 address, CodeBlock* block, CodeBuffer* buffer, std:
|
||||||
// So, discard all information we have.
|
// So, discard all information we have.
|
||||||
fprIsStoreSafe = BitSet32(0);
|
fprIsStoreSafe = BitSet32(0);
|
||||||
}
|
}
|
||||||
else if (op.opinfo->flags &
|
else if (bitexact_inputs)
|
||||||
(FL_IN_FLOAT_A_BITEXACT | FL_IN_FLOAT_B_BITEXACT | FL_IN_FLOAT_C_BITEXACT))
|
|
||||||
{
|
{
|
||||||
// If the instruction copies bits between registers (without flushing denormals to zero
|
// If the instruction copies bits between registers (without flushing denormals to zero
|
||||||
// or turning SNaN into QNaN), the output is store-safe if the inputs are.
|
// or turning SNaN into QNaN), the output is store-safe if the inputs are.
|
||||||
|
|
||||||
BitSet32 bitexact_inputs;
|
|
||||||
if (op.opinfo->flags & FL_IN_FLOAT_A_BITEXACT)
|
|
||||||
bitexact_inputs[op.inst.FA] = true;
|
|
||||||
if (op.opinfo->flags & FL_IN_FLOAT_B_BITEXACT)
|
|
||||||
bitexact_inputs[op.inst.FB] = true;
|
|
||||||
if (op.opinfo->flags & FL_IN_FLOAT_C_BITEXACT)
|
|
||||||
bitexact_inputs[op.inst.FC] = true;
|
|
||||||
|
|
||||||
fprIsStoreSafe[op.fregOut] = (fprIsStoreSafe & bitexact_inputs) == bitexact_inputs;
|
fprIsStoreSafe[op.fregOut] = (fprIsStoreSafe & bitexact_inputs) == bitexact_inputs;
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
|
@ -1032,8 +1039,9 @@ u32 PPCAnalyzer::Analyze(u32 address, CodeBlock* block, CodeBuffer* buffer, std:
|
||||||
// TODO: if we go directly from a load to a float instruction, and the value isn't used
|
// TODO: if we go directly from a load to a float instruction, and the value isn't used
|
||||||
// for anything else, we can use fast single -> double conversion after the load.
|
// for anything else, we can use fast single -> double conversion after the load.
|
||||||
|
|
||||||
fprIsStoreSafe[op.fregOut] =
|
fprIsStoreSafe[op.fregOut] = op.opinfo->type == OpType::SingleFP ||
|
||||||
(op.opinfo->type == OpType::SingleFP || op.opinfo->type == OpType::PS);
|
op.opinfo->type == OpType::PS ||
|
||||||
|
!strncmp(op.opinfo->opname, "frsp", 4);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
op.fprIsStoreSafeAfterInst = fprIsStoreSafe;
|
op.fprIsStoreSafeAfterInst = fprIsStoreSafe;
|
||||||
|
|
Loading…
Reference in New Issue