Jit64: Fix FPRF handling of denormal singles
This commit is contained in:
parent
8d2c069c34
commit
ccd8233ea3
|
@ -121,8 +121,11 @@ public:
|
|||
// Generates a branch that will check if a given bit of a CR register part
|
||||
// is set or not.
|
||||
Gen::FixupBranch JumpIfCRFieldBit(int field, int bit, bool jump_if_set = true);
|
||||
void SetFPRFIfNeeded(Gen::X64Reg xmm);
|
||||
|
||||
void SetFPRFIfNeeded(const Gen::OpArg& xmm, bool single);
|
||||
void FinalizeSingleResult(Gen::X64Reg output, const Gen::OpArg& input, bool packed = true,
|
||||
bool duplicate = false);
|
||||
void FinalizeDoubleResult(Gen::X64Reg output, const Gen::OpArg& input);
|
||||
void HandleNaNs(UGeckoInstruction inst, Gen::X64Reg xmm_out, Gen::X64Reg xmm_in,
|
||||
Gen::X64Reg clobber = Gen::XMM0);
|
||||
|
||||
|
|
|
@ -33,13 +33,63 @@ alignas(16) static const double half_qnan_and_s32_max[2] = {0x7FFFFFFF, -0x80000
|
|||
// We can avoid calculating FPRF if it's not needed; every float operation resets it, so
|
||||
// if it's going to be clobbered in a future instruction before being read, we can just
|
||||
// not calculate it.
|
||||
void Jit64::SetFPRFIfNeeded(X64Reg xmm)
|
||||
// Emits the FPRF update for a freshly computed result, unless it can be skipped.
//
// Nothing is emitted when FPRF emulation is disabled (SConfig bFPRF) or when the
// current op is flagged as not needing FPRF (js.op->wantsFPRF).
//
// input:  operand holding the value to classify. A simple register is used directly;
//         any other operand is first loaded into XMM0 with MOVSD.
// single: forwarded to SetFPRF, which uses it to pick the 32-bit or 64-bit input size.
void Jit64::SetFPRFIfNeeded(const OpArg& input, bool single)
|
||||
{
|
||||
// As far as we know, the games that use this flag only need FPRF for fmul and fmadd, but
|
||||
// FPRF is fast enough in JIT that we might as well just enable it for every float instruction
|
||||
// if the FPRF flag is set.
|
||||
// NOTE(review): the next two statements call the one-argument SetFPRF with `xmm` before it is
// declared; they look like the pre-change body retained by the diff view - confirm.
if (SConfig::GetInstance().bFPRF && js.op->wantsFPRF)
|
||||
SetFPRF(xmm);
|
||||
// Early out: FPRF is either disabled globally or not wanted for this op.
if (!SConfig::GetInstance().bFPRF || !js.op->wantsFPRF)
|
||||
return;
|
||||
|
||||
// SetFPRF takes an XMM register, so default to XMM0 and only load into it when the
// operand is not already a plain register.
X64Reg xmm = XMM0;
|
||||
if (input.IsSimpleReg())
|
||||
xmm = input.GetSimpleReg();
|
||||
else
|
||||
MOVSD(xmm, input);
|
||||
|
||||
SetFPRF(xmm, single);
|
||||
}
|
||||
|
||||
// Writes the result of a single-precision operation into `output` and emits the FPRF update
// for it when one is needed.
//
// output:    destination XMM register.
// input:     operand holding the double-format result of the operation.
// packed:    with accurate single precision, round both lanes (CVTPD2PS/CVTPS2PD) instead of
//            only the low lane (CVTSD2SS/CVTSS2SD).
// duplicate: in the scalar accurate path, copy the low lane into the high lane afterwards; in
//            the non-accurate path, use MOVDDUP instead of MOVAPD when a move is required.
void Jit64::FinalizeSingleResult(X64Reg output, const OpArg& input, bool packed, bool duplicate)
{
  // Most games don't need these. Zelda requires it though - some platforms get stuck without them.
  if (jo.accurateSinglePrecision)
  {
    if (packed)
    {
      CVTPD2PS(output, input);
      // Classify while the value is still in 32-bit single format, before widening it back.
      SetFPRFIfNeeded(R(output), true);
      CVTPS2PD(output, R(output));
    }
    else
    {
      CVTSD2SS(output, input);
      // Classify while the value is still in 32-bit single format, before widening it back.
      SetFPRFIfNeeded(R(output), true);
      CVTSS2SD(output, R(output));
      if (duplicate)
        MOVDDUP(output, R(output));
    }
  }
  else
  {
    if (!input.IsSimpleReg(output))
    {
      if (duplicate)
        MOVDDUP(output, input);
      else
        MOVAPD(output, input);
    }

    // No conversion to single was emitted on this path, so `input` still holds a 64-bit
    // double. It must be classified as a double; classifying it as a single would interpret
    // the low 32 mantissa bits as a float and yield a bogus FPRF class.
    SetFPRFIfNeeded(input, false);
  }
}
|
||||
|
||||
// Places a double-precision result in `output` and emits the FPRF update for it when needed.
//
// output: destination XMM register.
// input:  operand holding the double result; copied with MOVSD unless it already is `output`.
void Jit64::FinalizeDoubleResult(X64Reg output, const OpArg& input)
{
  const bool already_in_place = input.IsSimpleReg(output);
  if (!already_in_place)
  {
    MOVSD(output, input);
  }

  // The value is a 64-bit double, so request the double classification.
  constexpr bool is_single = false;
  SetFPRFIfNeeded(input, is_single);
}
|
||||
|
||||
void Jit64::HandleNaNs(UGeckoInstruction inst, X64Reg xmm_out, X64Reg xmm, X64Reg clobber)
|
||||
|
@ -210,8 +260,9 @@ void Jit64::fp_arith(UGeckoInstruction inst)
|
|||
|
||||
HandleNaNs(inst, Rd, dest);
|
||||
if (single)
|
||||
ForceSinglePrecision(Rd, Rd, packed, true);
|
||||
SetFPRFIfNeeded(Rd);
|
||||
FinalizeSingleResult(Rd, Rd, packed, true);
|
||||
else
|
||||
FinalizeDoubleResult(Rd, Rd);
|
||||
};
|
||||
|
||||
switch (inst.SUBOP5)
|
||||
|
@ -452,14 +503,13 @@ void Jit64::fmaddXX(UGeckoInstruction inst)
|
|||
if (single)
|
||||
{
|
||||
HandleNaNs(inst, result_reg, result_reg, result_reg == XMM1 ? XMM0 : XMM1);
|
||||
ForceSinglePrecision(Rd, R(result_reg), packed, true);
|
||||
FinalizeSingleResult(Rd, R(result_reg), packed, true);
|
||||
}
|
||||
else
|
||||
{
|
||||
HandleNaNs(inst, result_reg, result_reg, XMM1);
|
||||
MOVSD(Rd, R(result_reg));
|
||||
FinalizeDoubleResult(Rd, R(result_reg));
|
||||
}
|
||||
SetFPRFIfNeeded(Rd);
|
||||
}
|
||||
|
||||
void Jit64::fsign(UGeckoInstruction inst)
|
||||
|
@ -763,12 +813,11 @@ void Jit64::frspx(UGeckoInstruction inst)
|
|||
int d = inst.FD;
|
||||
bool packed = js.op->fprIsDuplicated[b] && !cpu_info.bAtom;
|
||||
|
||||
RCOpArg Rb = fpr.Use(b, RCMode::Read);
|
||||
RCOpArg Rb = fpr.Bind(b, RCMode::Read);
|
||||
RCX64Reg Rd = fpr.Bind(d, RCMode::Write);
|
||||
RegCache::Realize(Rb, Rd);
|
||||
|
||||
ForceSinglePrecision(Rd, Rb, packed, true);
|
||||
SetFPRFIfNeeded(Rd);
|
||||
FinalizeSingleResult(Rd, Rb, packed, true);
|
||||
}
|
||||
|
||||
void Jit64::frsqrtex(UGeckoInstruction inst)
|
||||
|
@ -786,8 +835,7 @@ void Jit64::frsqrtex(UGeckoInstruction inst)
|
|||
|
||||
MOVAPD(XMM0, Rb);
|
||||
CALL(asm_routines.frsqrte);
|
||||
MOVSD(Rd, XMM0);
|
||||
SetFPRFIfNeeded(Rd);
|
||||
FinalizeDoubleResult(Rd, R(XMM0));
|
||||
}
|
||||
|
||||
void Jit64::fresx(UGeckoInstruction inst)
|
||||
|
@ -806,5 +854,5 @@ void Jit64::fresx(UGeckoInstruction inst)
|
|||
MOVAPD(XMM0, Rb);
|
||||
CALL(asm_routines.fres);
|
||||
MOVDDUP(Rd, R(XMM0));
|
||||
SetFPRFIfNeeded(Rd);
|
||||
SetFPRFIfNeeded(R(XMM0), true);
|
||||
}
|
||||
|
|
|
@ -77,8 +77,7 @@ void Jit64::ps_sum(UGeckoInstruction inst)
|
|||
PanicAlertFmt("ps_sum WTF!!!");
|
||||
}
|
||||
HandleNaNs(inst, Rd, tmp, tmp == XMM1 ? XMM0 : XMM1);
|
||||
ForceSinglePrecision(Rd, Rd);
|
||||
SetFPRFIfNeeded(Rd);
|
||||
FinalizeSingleResult(Rd, Rd);
|
||||
}
|
||||
|
||||
void Jit64::ps_muls(UGeckoInstruction inst)
|
||||
|
@ -112,8 +111,7 @@ void Jit64::ps_muls(UGeckoInstruction inst)
|
|||
Force25BitPrecision(XMM1, R(XMM1), XMM0);
|
||||
MULPD(XMM1, Ra);
|
||||
HandleNaNs(inst, Rd, XMM1);
|
||||
ForceSinglePrecision(Rd, Rd);
|
||||
SetFPRFIfNeeded(Rd);
|
||||
FinalizeSingleResult(Rd, Rd);
|
||||
}
|
||||
|
||||
void Jit64::ps_mergeXX(UGeckoInstruction inst)
|
||||
|
@ -171,8 +169,7 @@ void Jit64::ps_rsqrte(UGeckoInstruction inst)
|
|||
CALL(asm_routines.frsqrte);
|
||||
MOVLHPS(Rd, XMM0);
|
||||
|
||||
ForceSinglePrecision(Rd, Rd);
|
||||
SetFPRFIfNeeded(Rd);
|
||||
FinalizeSingleResult(Rd, Rd);
|
||||
}
|
||||
|
||||
void Jit64::ps_res(UGeckoInstruction inst)
|
||||
|
@ -196,8 +193,7 @@ void Jit64::ps_res(UGeckoInstruction inst)
|
|||
CALL(asm_routines.fres);
|
||||
MOVLHPS(Rd, XMM0);
|
||||
|
||||
ForceSinglePrecision(Rd, Rd);
|
||||
SetFPRFIfNeeded(Rd);
|
||||
FinalizeSingleResult(Rd, Rd);
|
||||
}
|
||||
|
||||
void Jit64::ps_cmpXX(UGeckoInstruction inst)
|
||||
|
|
|
@ -727,34 +727,6 @@ void EmuCodeBlock::JitClearCA()
|
|||
MOV(8, PPCSTATE(xer_ca), Imm8(0));
|
||||
}
|
||||
|
||||
// Moves `input` into `output`, rounding it to single precision when the JIT option
// accurateSinglePrecision is enabled.
//
// output:    destination XMM register.
// input:     operand holding the double-format value.
// packed:    in accurate mode, round both lanes rather than only the low one.
// duplicate: in accurate scalar mode, copy the low lane into the high lane afterwards; in
//            non-accurate mode, use MOVDDUP instead of MOVAPD when a move is required.
void EmuCodeBlock::ForceSinglePrecision(X64Reg output, const OpArg& input, bool packed,
                                        bool duplicate)
{
  // Most games don't need these. Zelda requires it though - some platforms get stuck without them.
  if (!m_jit.jo.accurateSinglePrecision)
  {
    // Fast path: no rounding, just get the value into the destination if it is not there yet.
    if (input.IsSimpleReg(output))
      return;

    if (duplicate)
      MOVDDUP(output, input);
    else
      MOVAPD(output, input);
    return;
  }

  if (packed)
  {
    // Round-trip both lanes through single precision.
    CVTPD2PS(output, input);
    CVTPS2PD(output, R(output));
    return;
  }

  // Round-trip only the low lane through single precision.
  CVTSD2SS(output, input);
  CVTSS2SD(output, R(output));
  if (duplicate)
    MOVDDUP(output, R(output));
}
|
||||
|
||||
// Abstract between AVX and SSE: automatically handle 3-operand instructions
|
||||
void EmuCodeBlock::avx_op(void (XEmitter::*avxOp)(X64Reg, X64Reg, const OpArg&),
|
||||
void (XEmitter::*sseOp)(X64Reg, const OpArg&), X64Reg regOp,
|
||||
|
@ -907,30 +879,35 @@ void EmuCodeBlock::ConvertSingleToDouble(X64Reg dst, X64Reg src, bool src_is_gpr
|
|||
MOVDDUP(dst, R(dst));
|
||||
}
|
||||
|
||||
alignas(16) static const u64 psDoubleExp[2] = {0x7FF0000000000000ULL, 0};
|
||||
alignas(16) static const u64 psDoubleFrac[2] = {0x000FFFFFFFFFFFFFULL, 0};
|
||||
alignas(16) static const u64 psDoubleNoSign[2] = {0x7FFFFFFFFFFFFFFFULL, 0};
|
||||
alignas(16) static const u64 psDoubleExp[2] = {Common::DOUBLE_EXP, 0};
|
||||
alignas(16) static const u64 psDoubleFrac[2] = {Common::DOUBLE_FRAC, 0};
|
||||
alignas(16) static const u64 psDoubleNoSign[2] = {~Common::DOUBLE_SIGN, 0};
|
||||
|
||||
alignas(16) static const u32 psFloatExp[4] = {Common::FLOAT_EXP, 0, 0, 0};
|
||||
alignas(16) static const u32 psFloatFrac[4] = {Common::FLOAT_FRAC, 0, 0, 0};
|
||||
alignas(16) static const u32 psFloatNoSign[4] = {~Common::FLOAT_SIGN, 0, 0, 0};
|
||||
|
||||
// TODO: it might be faster to handle FPRF in the same way as CR is currently handled for integer,
|
||||
// storing
|
||||
// the result of each floating point op and calculating it when needed. This is trickier than for
|
||||
// integers
|
||||
// though, because there's 32 possible FPRF bit combinations but only 9 categories of floating point
|
||||
// values,
|
||||
// which makes the whole thing rather trickier.
|
||||
// Fortunately, PPCAnalyzer can optimize out a large portion of FPRF calculations, so maybe this
|
||||
// isn't
|
||||
// quite that necessary.
|
||||
void EmuCodeBlock::SetFPRF(Gen::X64Reg xmm)
|
||||
// storing the result of each floating point op and calculating it when needed. This is trickier
|
||||
// than for integers though, because there's 32 possible FPRF bit combinations but only 9 categories
|
||||
// of floating point values. Fortunately, PPCAnalyzer can optimize out a large portion of FPRF
|
||||
// calculations, so maybe this isn't quite that necessary.
|
||||
void EmuCodeBlock::SetFPRF(Gen::X64Reg xmm, bool single)
|
||||
{
|
||||
const int input_size = single ? 32 : 64;
|
||||
|
||||
AND(32, PPCSTATE(fpscr), Imm32(~FPRF_MASK));
|
||||
|
||||
FixupBranch continue1, continue2, continue3, continue4;
|
||||
if (cpu_info.bSSE4_1)
|
||||
{
|
||||
MOVQ_xmm(R(RSCRATCH), xmm);
|
||||
SHR(64, R(RSCRATCH), Imm8(63)); // Get the sign bit; almost all the branches need it.
|
||||
PTEST(xmm, MConst(psDoubleExp));
|
||||
// Get the sign bit; almost all the branches need it.
|
||||
SHR(input_size, R(RSCRATCH), Imm8(input_size - 1));
|
||||
if (single)
|
||||
PTEST(xmm, MConst(psFloatExp));
|
||||
else
|
||||
PTEST(xmm, MConst(psDoubleExp));
|
||||
FixupBranch maxExponent = J_CC(CC_C);
|
||||
FixupBranch zeroExponent = J_CC(CC_Z);
|
||||
|
||||
|
@ -940,7 +917,10 @@ void EmuCodeBlock::SetFPRF(Gen::X64Reg xmm)
|
|||
continue1 = J();
|
||||
|
||||
SetJumpTarget(maxExponent);
|
||||
PTEST(xmm, MConst(psDoubleFrac));
|
||||
if (single)
|
||||
PTEST(xmm, MConst(psFloatFrac));
|
||||
else
|
||||
PTEST(xmm, MConst(psDoubleFrac));
|
||||
FixupBranch notNAN = J_CC(CC_Z);
|
||||
|
||||
// Max exponent + mantissa: PPC_FPCLASS_QNAN
|
||||
|
@ -955,7 +935,10 @@ void EmuCodeBlock::SetFPRF(Gen::X64Reg xmm)
|
|||
continue3 = J();
|
||||
|
||||
SetJumpTarget(zeroExponent);
|
||||
PTEST(xmm, MConst(psDoubleNoSign));
|
||||
if (single)
|
||||
PTEST(xmm, MConst(psFloatNoSign));
|
||||
else
|
||||
PTEST(xmm, MConst(psDoubleNoSign));
|
||||
FixupBranch zero = J_CC(CC_Z);
|
||||
|
||||
// No exponent + mantissa: sign ? PPC_FPCLASS_ND : PPC_FPCLASS_PD;
|
||||
|
@ -971,37 +954,58 @@ void EmuCodeBlock::SetFPRF(Gen::X64Reg xmm)
|
|||
else
|
||||
{
|
||||
MOVQ_xmm(R(RSCRATCH), xmm);
|
||||
TEST(64, R(RSCRATCH), MConst(psDoubleExp));
|
||||
if (single)
|
||||
TEST(32, R(RSCRATCH), Imm32(Common::FLOAT_EXP));
|
||||
else
|
||||
TEST(64, R(RSCRATCH), MConst(psDoubleExp));
|
||||
FixupBranch zeroExponent = J_CC(CC_Z);
|
||||
AND(64, R(RSCRATCH), MConst(psDoubleNoSign));
|
||||
CMP(64, R(RSCRATCH), MConst(psDoubleExp));
|
||||
|
||||
if (single)
|
||||
{
|
||||
AND(32, R(RSCRATCH), Imm32(~Common::FLOAT_SIGN));
|
||||
CMP(32, R(RSCRATCH), Imm32(Common::FLOAT_EXP));
|
||||
}
|
||||
else
|
||||
{
|
||||
AND(64, R(RSCRATCH), MConst(psDoubleNoSign));
|
||||
CMP(64, R(RSCRATCH), MConst(psDoubleExp));
|
||||
}
|
||||
FixupBranch nan =
|
||||
J_CC(CC_G); // This works because if the sign bit is set, RSCRATCH is negative
|
||||
FixupBranch infinity = J_CC(CC_E);
|
||||
|
||||
MOVQ_xmm(R(RSCRATCH), xmm);
|
||||
SHR(64, R(RSCRATCH), Imm8(63));
|
||||
SHR(input_size, R(RSCRATCH), Imm8(input_size - 1));
|
||||
LEA(32, RSCRATCH,
|
||||
MScaled(RSCRATCH, Common::PPC_FPCLASS_NN - Common::PPC_FPCLASS_PN, Common::PPC_FPCLASS_PN));
|
||||
continue1 = J();
|
||||
|
||||
SetJumpTarget(nan);
|
||||
MOV(32, R(RSCRATCH), Imm32(Common::PPC_FPCLASS_QNAN));
|
||||
continue2 = J();
|
||||
|
||||
SetJumpTarget(infinity);
|
||||
MOVQ_xmm(R(RSCRATCH), xmm);
|
||||
SHR(64, R(RSCRATCH), Imm8(63));
|
||||
SHR(input_size, R(RSCRATCH), Imm8(input_size - 1));
|
||||
LEA(32, RSCRATCH,
|
||||
MScaled(RSCRATCH, Common::PPC_FPCLASS_NINF - Common::PPC_FPCLASS_PINF,
|
||||
Common::PPC_FPCLASS_PINF));
|
||||
continue3 = J();
|
||||
|
||||
SetJumpTarget(zeroExponent);
|
||||
TEST(64, R(RSCRATCH), MConst(psDoubleNoSign));
|
||||
if (single)
|
||||
TEST(input_size, R(RSCRATCH), Imm32(~Common::FLOAT_SIGN));
|
||||
else
|
||||
TEST(input_size, R(RSCRATCH), MConst(psDoubleNoSign));
|
||||
FixupBranch zero = J_CC(CC_Z);
|
||||
SHR(64, R(RSCRATCH), Imm8(63));
|
||||
|
||||
SHR(input_size, R(RSCRATCH), Imm8(input_size - 1));
|
||||
LEA(32, RSCRATCH,
|
||||
MScaled(RSCRATCH, Common::PPC_FPCLASS_ND - Common::PPC_FPCLASS_PD, Common::PPC_FPCLASS_PD));
|
||||
continue4 = J();
|
||||
|
||||
SetJumpTarget(zero);
|
||||
SHR(64, R(RSCRATCH), Imm8(63));
|
||||
SHR(input_size, R(RSCRATCH), Imm8(input_size - 1));
|
||||
SHL(32, R(RSCRATCH), Imm8(4));
|
||||
ADD(32, R(RSCRATCH), Imm8(Common::PPC_FPCLASS_PZ));
|
||||
}
|
||||
|
|
|
@ -117,14 +117,12 @@ public:
|
|||
void (Gen::XEmitter::*sseOp)(Gen::X64Reg, const Gen::OpArg&, u8), Gen::X64Reg regOp,
|
||||
const Gen::OpArg& arg1, const Gen::OpArg& arg2, u8 imm);
|
||||
|
||||
void ForceSinglePrecision(Gen::X64Reg output, const Gen::OpArg& input, bool packed = true,
|
||||
bool duplicate = false);
|
||||
void Force25BitPrecision(Gen::X64Reg output, const Gen::OpArg& input, Gen::X64Reg tmp);
|
||||
|
||||
// RSCRATCH might get trashed
|
||||
void ConvertSingleToDouble(Gen::X64Reg dst, Gen::X64Reg src, bool src_is_gpr = false);
|
||||
void ConvertDoubleToSingle(Gen::X64Reg dst, Gen::X64Reg src);
|
||||
void SetFPRF(Gen::X64Reg xmm);
|
||||
void SetFPRF(Gen::X64Reg xmm, bool single);
|
||||
void Clear();
|
||||
|
||||
protected:
|
||||
|
|
Loading…
Reference in New Issue