[AArch64] Implement fctiwzx

Improves the povray benchmark time by 5.6%
This commit is contained in:
Ryan Houdek 2015-08-23 15:35:18 -05:00
parent 4fa23abbe1
commit 561744819e
3 changed files with 37 additions and 1 deletions

View File

@ -147,6 +147,7 @@ public:
void fsubx(UGeckoInstruction inst);
void fcmpx(UGeckoInstruction inst);
void frspx(UGeckoInstruction inst);
void fctiwzx(UGeckoInstruction inst);
// Paired
void ps_abs(UGeckoInstruction inst);

View File

@ -558,3 +558,38 @@ void JitArm64::fcmpx(UGeckoInstruction inst)
gpr.Unlock(WA);
}
// Emits AArch64 code for the PowerPC fctiwzx instruction.
//
// The emitted sequence narrows the double in FB to single precision, converts
// that single with ROUND_Z, ORs the result with the constant
// 0xFFF8000000000000 and places it in FD. Instructions with the Rc bit set are
// handed to the fallback path instead.
//
// NOTE(review): the value is narrowed to single precision before the integer
// conversion; this presumably loses low-order bits for large-magnitude
// doubles -- confirm against the interpreter's fctiwzx behavior.
void JitArm64::fctiwzx(UGeckoInstruction inst)
{
	INSTRUCTION_START
	JITDISABLE(bJITFloatingPointOff);
	FALLBACK_IF(inst.Rc);

	const u32 src_idx = inst.FB;
	const u32 dst_idx = inst.FD;

	// The second argument is true only when the destination doubles as the source.
	fpr.BindToRegister(dst_idx, dst_idx == src_idx);

	ARM64Reg src_reg = fpr.R(src_idx);
	ARM64Reg dst_reg = fpr.R(dst_idx);

	// Scratch register holding the constant that is ORed into the result.
	ARM64Reg mask_reg = fpr.GetReg();

	// Build 0xFFF8000000000000ULL: fill the top 16 bits with ones, then clear
	// the low three bits of each 16-bit lane (0xFFFF -> 0xFFF8 in the top lane).
	m_float_emit.MOVI(64, EncodeRegToDouble(mask_reg), 0xFFFF000000000000ULL);
	m_float_emit.BIC(16, EncodeRegToDouble(mask_reg), 0x7);

	// When fpr.IsLower(d) holds, the conversion is done directly in the
	// destination register. Otherwise it is done in a temporary and only
	// element 0 is inserted into the destination afterwards.
	// NOTE(review): presumably this keeps the destination's upper element
	// intact -- confirm against the register cache.
	const bool convert_in_place = fpr.IsLower(dst_idx);
	ARM64Reg work_reg = convert_in_place ? dst_reg : fpr.GetReg();

	m_float_emit.FCVTN(32, EncodeRegToDouble(work_reg), EncodeRegToDouble(src_reg));
	m_float_emit.FCVTS(EncodeRegToSingle(work_reg), EncodeRegToSingle(work_reg), ROUND_Z);
	m_float_emit.ORR(EncodeRegToDouble(work_reg), EncodeRegToDouble(work_reg), EncodeRegToDouble(mask_reg));

	if (!convert_in_place)
	{
		m_float_emit.INS(64, dst_reg, 0, work_reg, 0);
		fpr.Unlock(work_reg);
	}

	fpr.Unlock(mask_reg);
}

View File

@ -330,7 +330,7 @@ static GekkoOPTemplate table63[] =
{32, &JitArm64::fcmpx}, // fcmpo
{0, &JitArm64::fcmpx}, // fcmpu
{14, &JitArm64::FallBackToInterpreter}, // fctiwx
{15, &JitArm64::FallBackToInterpreter}, // fctiwzx
{15, &JitArm64::fctiwzx}, // fctiwzx
{72, &JitArm64::fmrx}, // fmrx
{136, &JitArm64::fnabsx}, // fnabsx
{40, &JitArm64::fnegx}, // fnegx