Jit64: Preserve inputs when software_fma && m_accurate_nans
When writing the software FMA code, I didn't realize that we can't overwrite d if d is the same register as one of the inputs and HandleNaNs is going to be called. This fixes that.
This commit is contained in:
parent
4312840a4b
commit
5e58a46361
|
@ -107,8 +107,6 @@ void Jit64::HandleNaNs(UGeckoInstruction inst, X64Reg xmm, X64Reg clobber, std::
|
||||||
if (!m_accurate_nans)
|
if (!m_accurate_nans)
|
||||||
return;
|
return;
|
||||||
|
|
||||||
ASSERT(xmm != clobber);
|
|
||||||
|
|
||||||
if (inst.OPCD != 4)
|
if (inst.OPCD != 4)
|
||||||
{
|
{
|
||||||
// not paired-single
|
// not paired-single
|
||||||
|
@ -148,6 +146,8 @@ void Jit64::HandleNaNs(UGeckoInstruction inst, X64Reg xmm, X64Reg clobber, std::
|
||||||
{
|
{
|
||||||
// paired-single
|
// paired-single
|
||||||
|
|
||||||
|
ASSERT(xmm != clobber);
|
||||||
|
|
||||||
if (cpu_info.bSSE4_1)
|
if (cpu_info.bSSE4_1)
|
||||||
{
|
{
|
||||||
avx_op(&XEmitter::VCMPPD, &XEmitter::CMPPD, clobber, R(xmm), R(xmm), CMP_UNORD);
|
avx_op(&XEmitter::VCMPPD, &XEmitter::CMPPD, clobber, R(xmm), R(xmm), CMP_UNORD);
|
||||||
|
@ -325,7 +325,7 @@ void Jit64::fmaddXX(UGeckoInstruction inst)
|
||||||
FALLBACK_IF(jo.fp_exceptions);
|
FALLBACK_IF(jo.fp_exceptions);
|
||||||
|
|
||||||
// We would like to emulate FMA instructions accurately without rounding error if possible, but
|
// We would like to emulate FMA instructions accurately without rounding error if possible, but
|
||||||
// unfortunately emulating FMA in software is just too slow on CPUs that are too old to have FMA
|
// unfortunately, emulating FMA in software is just too slow on CPUs that are too old to have FMA
|
||||||
// instructions, so we have the Config::SESSION_USE_FMA setting to determine whether we should
|
// instructions, so we have the Config::SESSION_USE_FMA setting to determine whether we should
|
||||||
// emulate FMA instructions accurately or by a performing a multiply followed by a separate add.
|
// emulate FMA instructions accurately or by a performing a multiply followed by a separate add.
|
||||||
//
|
//
|
||||||
|
@ -346,6 +346,8 @@ void Jit64::fmaddXX(UGeckoInstruction inst)
|
||||||
int d = inst.FD;
|
int d = inst.FD;
|
||||||
bool single = inst.OPCD == 4 || inst.OPCD == 59;
|
bool single = inst.OPCD == 4 || inst.OPCD == 59;
|
||||||
bool round_input = single && !js.op->fprIsSingle[c];
|
bool round_input = single && !js.op->fprIsSingle[c];
|
||||||
|
bool preserve_inputs = m_accurate_nans;
|
||||||
|
bool preserve_d = preserve_inputs && (a == d || b == d || c == d);
|
||||||
bool packed =
|
bool packed =
|
||||||
inst.OPCD == 4 || (!cpu_info.bAtom && !software_fma && single && js.op->fprIsDuplicated[a] &&
|
inst.OPCD == 4 || (!cpu_info.bAtom && !software_fma && single && js.op->fprIsDuplicated[a] &&
|
||||||
js.op->fprIsDuplicated[b] && js.op->fprIsDuplicated[c]);
|
js.op->fprIsDuplicated[b] && js.op->fprIsDuplicated[c]);
|
||||||
|
@ -356,21 +358,35 @@ void Jit64::fmaddXX(UGeckoInstruction inst)
|
||||||
const bool madds1 = inst.SUBOP5 == 15;
|
const bool madds1 = inst.SUBOP5 == 15;
|
||||||
const bool madds_accurate_nans = m_accurate_nans && (madds0 || madds1);
|
const bool madds_accurate_nans = m_accurate_nans && (madds0 || madds1);
|
||||||
|
|
||||||
|
X64Reg scratch_xmm = XMM0;
|
||||||
|
X64Reg result_xmm = XMM1;
|
||||||
|
X64Reg Rc_duplicated = XMM2;
|
||||||
|
|
||||||
RCOpArg Ra;
|
RCOpArg Ra;
|
||||||
RCOpArg Rb;
|
RCOpArg Rb;
|
||||||
RCOpArg Rc;
|
RCOpArg Rc;
|
||||||
RCX64Reg Rd;
|
RCX64Reg Rd;
|
||||||
RCX64Reg scratch_guard;
|
RCX64Reg xmm2_guard;
|
||||||
|
RCX64Reg result_xmm_guard;
|
||||||
RCX64Reg Rc_duplicated_guard;
|
RCX64Reg Rc_duplicated_guard;
|
||||||
X64Reg Rc_duplicated = XMM2;
|
|
||||||
if (software_fma)
|
if (software_fma)
|
||||||
{
|
{
|
||||||
scratch_guard = fpr.Scratch(XMM2);
|
xmm2_guard = fpr.Scratch(XMM2);
|
||||||
Ra = packed ? fpr.Bind(a, RCMode::Read) : fpr.Use(a, RCMode::Read);
|
Ra = packed ? fpr.Bind(a, RCMode::Read) : fpr.Use(a, RCMode::Read);
|
||||||
Rb = packed ? fpr.Bind(b, RCMode::Read) : fpr.Use(b, RCMode::Read);
|
Rb = packed ? fpr.Bind(b, RCMode::Read) : fpr.Use(b, RCMode::Read);
|
||||||
Rc = packed ? fpr.Bind(c, RCMode::Read) : fpr.Use(c, RCMode::Read);
|
Rc = packed ? fpr.Bind(c, RCMode::Read) : fpr.Use(c, RCMode::Read);
|
||||||
Rd = fpr.Bind(d, single ? RCMode::Write : RCMode::ReadWrite);
|
Rd = fpr.Bind(d, single ? RCMode::Write : RCMode::ReadWrite);
|
||||||
RegCache::Realize(Ra, Rb, Rc, Rd, scratch_guard);
|
if (preserve_d && packed)
|
||||||
|
{
|
||||||
|
result_xmm_guard = fpr.Scratch();
|
||||||
|
RegCache::Realize(Ra, Rb, Rc, Rd, xmm2_guard, result_xmm_guard);
|
||||||
|
result_xmm = Gen::X64Reg(result_xmm_guard);
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
RegCache::Realize(Ra, Rb, Rc, Rd, xmm2_guard);
|
||||||
|
result_xmm = packed ? Gen::X64Reg(Rd) : XMM0;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
|
@ -391,8 +407,6 @@ void Jit64::fmaddXX(UGeckoInstruction inst)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
X64Reg scratch_xmm = XMM0;
|
|
||||||
X64Reg result_xmm = XMM1;
|
|
||||||
if (software_fma)
|
if (software_fma)
|
||||||
{
|
{
|
||||||
for (size_t i = (packed ? 1 : 0); i != std::numeric_limits<size_t>::max(); --i)
|
for (size_t i = (packed ? 1 : 0); i != std::numeric_limits<size_t>::max(); --i)
|
||||||
|
@ -411,10 +425,11 @@ void Jit64::fmaddXX(UGeckoInstruction inst)
|
||||||
Force25BitPrecision(XMM1, R(XMM1), XMM2);
|
Force25BitPrecision(XMM1, R(XMM1), XMM2);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Write the result from the previous loop iteration into Rd so we don't lose it.
|
// Write the result from the previous loop iteration into result_xmm so we don't lose it.
|
||||||
// It's important that this is done after reading Rc above, in case we have madds1 and c == d.
|
// It's important that this is done after reading Rc above, in case we have madds1 and
|
||||||
|
// result_xmm == Rd == Rc.
|
||||||
if (packed && i == 0)
|
if (packed && i == 0)
|
||||||
MOVLHPS(Rd, XMM0);
|
MOVLHPS(result_xmm, XMM0);
|
||||||
|
|
||||||
if (i == 0)
|
if (i == 0)
|
||||||
{
|
{
|
||||||
|
@ -437,14 +452,9 @@ void Jit64::fmaddXX(UGeckoInstruction inst)
|
||||||
}
|
}
|
||||||
|
|
||||||
if (packed)
|
if (packed)
|
||||||
{
|
MOVSD(R(result_xmm), XMM0);
|
||||||
MOVSD(Rd, XMM0);
|
|
||||||
result_xmm = Rd;
|
|
||||||
}
|
|
||||||
else
|
else
|
||||||
{
|
DEBUG_ASSERT(result_xmm == XMM0);
|
||||||
result_xmm = XMM0;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (madds_accurate_nans)
|
if (madds_accurate_nans)
|
||||||
{
|
{
|
||||||
|
@ -530,6 +540,7 @@ void Jit64::fmaddXX(UGeckoInstruction inst)
|
||||||
// HandleNaNs needs to clobber XMM0
|
// HandleNaNs needs to clobber XMM0
|
||||||
MOVAPD(Rd, R(result_xmm));
|
MOVAPD(Rd, R(result_xmm));
|
||||||
result_xmm = Rd;
|
result_xmm = Rd;
|
||||||
|
DEBUG_ASSERT(!preserve_d);
|
||||||
}
|
}
|
||||||
|
|
||||||
// If packed, the clobber register must be XMM0. If not packed, the clobber register is unused.
|
// If packed, the clobber register must be XMM0. If not packed, the clobber register is unused.
|
||||||
|
|
Loading…
Reference in New Issue