[AArch64] Implement fdivx/fdivsx/mfcr/mtcrf.

Gets the povray bench to better times than the Wii.
This commit is contained in:
Ryan Houdek 2015-08-24 15:27:12 -05:00
parent d96be9250c
commit 0666c0750b
6 changed files with 148 additions and 4 deletions

View File

@ -119,6 +119,8 @@ public:
void mftb(UGeckoInstruction inst); void mftb(UGeckoInstruction inst);
void mtspr(UGeckoInstruction inst); void mtspr(UGeckoInstruction inst);
void crXXX(UGeckoInstruction inst); void crXXX(UGeckoInstruction inst);
void mfcr(UGeckoInstruction inst);
void mtcrf(UGeckoInstruction inst);
// LoadStore // LoadStore
void lXX(UGeckoInstruction inst); void lXX(UGeckoInstruction inst);
@ -154,6 +156,8 @@ public:
void fcmpx(UGeckoInstruction inst); void fcmpx(UGeckoInstruction inst);
void frspx(UGeckoInstruction inst); void frspx(UGeckoInstruction inst);
void fctiwzx(UGeckoInstruction inst); void fctiwzx(UGeckoInstruction inst);
void fdivx(UGeckoInstruction inst);
void fdivsx(UGeckoInstruction inst);
// Paired // Paired
void ps_abs(UGeckoInstruction inst); void ps_abs(UGeckoInstruction inst);

View File

@ -593,3 +593,46 @@ void JitArm64::fctiwzx(UGeckoInstruction inst)
} }
fpr.Unlock(V0); fpr.Unlock(V0);
} }
// Emits AArch64 code for the PowerPC fdivx instruction: a double-precision
// divide of FA by FB whose result is placed in 64-bit lane 0 of FD.
// The record form (inst.Rc set) is not handled here; FALLBACK_IF bails out
// for it (presumably to the interpreter - confirm against the macro).
void JitArm64::fdivx(UGeckoInstruction inst)
{
INSTRUCTION_START
JITDISABLE(bJITFloatingPointOff);
FALLBACK_IF(inst.Rc);
u32 a = inst.FA, b = inst.FB, d = inst.FD;
// NOTE(review): the second argument looks like a "load current value" flag;
// it is unconditionally true here, which fits the else-branch below that
// only overwrites lane 0 of the destination - confirm against the cache API.
fpr.BindToRegister(d, true);
ARM64Reg VA = fpr.R(a);
ARM64Reg VB = fpr.R(b);
ARM64Reg VD = fpr.R(d);
if (fpr.IsLower(d))
{
// Divide straight into the destination register's double view.
// NOTE(review): IsLower(d) presumably means only the lower lane of d is
// tracked, so nothing in the upper lane needs preserving - confirm.
m_float_emit.FDIV(EncodeRegToDouble(VD), EncodeRegToDouble(VA), EncodeRegToDouble(VB));
}
else
{
// Compute into a scratch register, then insert only 64-bit lane 0 into VD
// so the other lane of the destination is left as it was.
ARM64Reg V0 = fpr.GetReg();
m_float_emit.FDIV(EncodeRegToDouble(V0), EncodeRegToDouble(VA), EncodeRegToDouble(VB));
m_float_emit.INS(64, VD, 0, V0, 0);
fpr.Unlock(V0);
}
}
// Emits AArch64 code for the PowerPC fdivsx instruction: divides FA by FB and
// writes the quotient to both 64-bit lanes of FD.
// The record form (inst.Rc set) is not handled here; FALLBACK_IF bails out
// for it (presumably to the interpreter - confirm against the macro).
void JitArm64::fdivsx(UGeckoInstruction inst)
{
INSTRUCTION_START
JITDISABLE(bJITFloatingPointOff);
FALLBACK_IF(inst.Rc);
u32 a = inst.FA, b = inst.FB, d = inst.FD;
// The second argument is true only when d doubles as a source operand.
// NOTE(review): this looks like a "load existing value" flag, and the
// trailing false (also passed to fpr.R(d, false) below) looks like a
// "lower lane only" selector being turned off - confirm against the cache API.
fpr.BindToRegister(d, d == a || d == b, false);
ARM64Reg VA = fpr.R(a);
ARM64Reg VB = fpr.R(b);
ARM64Reg VD = fpr.R(d, false);
// NOTE(review): the divide is emitted on the double-precision register views
// and no rounding to single precision is visible in this block - confirm
// whether that is intended for the single-precision form of the instruction.
m_float_emit.FDIV(EncodeRegToDouble(VD), EncodeRegToDouble(VA), EncodeRegToDouble(VB));
// Duplicate the result from 64-bit lane 0 into lane 1 of the destination.
m_float_emit.INS(64, VD, 1, VD, 0);
}

View File

@ -579,3 +579,56 @@ void JitArm64::crXXX(UGeckoInstruction inst)
gpr.Unlock(WA); gpr.Unlock(WA);
gpr.Unlock(WB); gpr.Unlock(WB);
} }
// Emits AArch64 code for the PowerPC mfcr instruction by calling the shared
// asm_routines.mfcr routine and copying its W0 result into the register
// bound to RD.
void JitArm64::mfcr(UGeckoInstruction inst)
{
INSTRUCTION_START
JITDISABLE(bJITSystemRegistersOff);
// Reserve the registers involved in the call: W0 carries the call target and
// then the result, W1/W2 are locked alongside it (presumably because the
// routine uses them as scratch - confirm against its definition), and W30 is
// overwritten with the return address by BLR.
gpr.Lock(W0, W1, W2, W30);
MOVI2R(X0, (u64)asm_routines.mfcr);
BLR(X0);
// W0 stays locked until the result has been copied out below.
gpr.Unlock(W1, W2, W30);
// NOTE(review): the false argument presumably skips loading RD's old value,
// since it is fully overwritten by the MOV - confirm against the cache API.
gpr.BindToRegister(inst.RD, false);
MOV(gpr.R(inst.RD), W0);
gpr.Unlock(W0);
}
// Emits AArch64 code for the PowerPC mtcrf instruction. For every CR field
// selected by the CRM mask, the matching 4-bit nibble of RS is used as an
// index into m_crTable and the 64-bit table entry is stored to that field's
// cr_val slot in the PPC state. Nothing is emitted when CRM is zero.
void JitArm64::mtcrf(UGeckoInstruction inst)
{
INSTRUCTION_START
JITDISABLE(bJITSystemRegistersOff);
u32 crm = inst.CRM;
if (crm != 0)
{
ARM64Reg RS = gpr.R(inst.RS);
// WA/XA: scratch for the nibble index, then the loaded table entry.
ARM64Reg WA = gpr.GetReg();
ARM64Reg XA = EncodeRegTo64(WA);
// WB/XB: base address of m_crTable, loaded once for all fields.
ARM64Reg WB = gpr.GetReg();
ARM64Reg XB = EncodeRegTo64(WB);
MOVI2R(XB, (u64)m_crTable);
for (int i = 0; i < 8; ++i)
{
// CRM bit 0x80 selects field 0, 0x40 field 1, ... 0x01 field 7.
if ((crm & (0x80 >> i)) != 0)
{
// Field i occupies bits [31 - 4*i : 28 - 4*i] of RS. Field 7 is already
// in the low nibble, so it needs no shift.
if (i != 7)
LSR(WA, RS, 28 - i * 4);
// Field 0 needs no mask: a 32-bit shift right by 28 leaves only 4 bits.
if (i != 0)
{
if (i != 7)
UBFX(WA, WA, 0, 4);
else
UBFX(WA, RS, 0, 4);
}
// Load the table entry selected by the nibble.
// NOTE(review): the `true` in ArithOption presumably requests an index
// scaled by the access size (8 bytes here) - confirm against the emitter.
LDR(XA, XB, ArithOption(XA, true));
// Each cr_val slot is 8 bytes wide, hence the 8 * i offset.
STR(INDEX_UNSIGNED, XA, X29, PPCSTATE_OFF(cr_val) + 8 * i);
}
}
gpr.Unlock(WA, WB);
}
}

View File

@ -286,9 +286,9 @@ static GekkoOPTemplate table31[] =
{759, &JitArm64::stfXX}, // stfdux {759, &JitArm64::stfXX}, // stfdux
{983, &JitArm64::FallBackToInterpreter}, // stfiwx {983, &JitArm64::FallBackToInterpreter}, // stfiwx
{19, &JitArm64::FallBackToInterpreter}, // mfcr {19, &JitArm64::mfcr}, // mfcr
{83, &JitArm64::mfmsr}, // mfmsr {83, &JitArm64::mfmsr}, // mfmsr
{144, &JitArm64::FallBackToInterpreter}, // mtcrf {144, &JitArm64::mtcrf}, // mtcrf
{146, &JitArm64::mtmsr}, // mtmsr {146, &JitArm64::mtmsr}, // mtmsr
{210, &JitArm64::mtsr}, // mtsr {210, &JitArm64::mtsr}, // mtsr
{242, &JitArm64::mtsrin}, // mtsrin {242, &JitArm64::mtsrin}, // mtsrin
@ -313,7 +313,7 @@ static GekkoOPTemplate table31[] =
static GekkoOPTemplate table59[] = static GekkoOPTemplate table59[] =
{ {
{18, &JitArm64::FallBackToInterpreter}, // fdivsx {18, &JitArm64::fdivsx}, // fdivsx
{20, &JitArm64::fsubsx}, // fsubsx {20, &JitArm64::fsubsx}, // fsubsx
{21, &JitArm64::faddsx}, // faddsx {21, &JitArm64::faddsx}, // faddsx
{24, &JitArm64::FallBackToInterpreter}, // fresx {24, &JitArm64::FallBackToInterpreter}, // fresx
@ -346,7 +346,7 @@ static GekkoOPTemplate table63[] =
static GekkoOPTemplate table63_2[] = static GekkoOPTemplate table63_2[] =
{ {
{18, &JitArm64::FallBackToInterpreter}, // fdivx {18, &JitArm64::fdivx}, // fdivx
{20, &JitArm64::fsubx}, // fsubx {20, &JitArm64::fsubx}, // fsubx
{21, &JitArm64::faddx}, // faddx {21, &JitArm64::faddx}, // faddx
{23, &JitArm64::fselx}, // fselx {23, &JitArm64::fselx}, // fselx

View File

@ -569,4 +569,47 @@ void JitArm64AsmRoutineManager::GenerateCommon()
pairedStoreQuantized[30] = storeSingleS8Slow; pairedStoreQuantized[30] = storeSingleS8Slow;
pairedStoreQuantized[31] = storeSingleS16Slow; pairedStoreQuantized[31] = storeSingleS16Slow;
mfcr = AlignCode16();
GenMfcr();
}
// Generates the shared routine that packs the eight 64-bit cr_val slots of
// the PPC state into a single 32-bit value in W0, four bits per field, with
// field 0 ending up in the most significant nibble. Within each nibble the
// bits are SO = 1<<0, EQ = 1<<1, GT = 1<<2, LT = 1<<3. The generated code is
// registered under the name "JIT_Mfcr".
void JitArm64AsmRoutineManager::GenMfcr()
{
// Input: Nothing
// Returns: W0
// Clobbers: X1, X2
const u8* start = GetCodePtr();
for (int i = 0; i < 8; i++)
{
// X1 = internal 64-bit representation of CR field i.
LDR(INDEX_UNSIGNED, X1, X29, PPCSTATE_OFF(cr_val) + 8 * i);
// SO
// SO is taken from bit 61 of the internal value.
if (i == 0)
{
// First field: this extract also initialises the accumulator X0.
UBFX(X0, X1, 61, 1);
}
else
{
// Make room for the next nibble (W0 = W0 << 4), then OR in the SO bit.
ORR(W0, WZR, W0, ArithOption(W0, ST_LSL, 4));
UBFX(X2, X1, 61, 1);
ORR(X0, X0, X2);
}
// EQ
// EQ is set when the low 32 bits of the internal value are zero.
// NOTE(review): the two numeric ORR arguments are presumably the raw
// immr/imms fields of the logical-immediate encoding - confirm.
ORR(W2, W0, 32 - 1, 0); // W0 | 1<<1
CMP(W1, WZR);
CSEL(W0, W2, W0, CC_EQ);
// GT
// GT is set when the full 64-bit internal value is greater than zero (signed).
ORR(W2, W0, 32 - 2, 0); // W0 | 1<<2
CMP(X1, ZR);
CSEL(W0, W2, W0, CC_GT);
// LT
// LT is taken from bit 62 of the internal value and placed at bit 3.
UBFX(X2, X1, 62, 1);
ORR(W0, W0, W2, ArithOption(W2, ST_LSL, 3));
}
RET(X30);
JitRegister::Register(start, GetCodePtr(), "JIT_Mfcr");
} }

View File

@ -12,6 +12,7 @@ class JitArm64AsmRoutineManager : public CommonAsmRoutinesBase, public Arm64Gen:
private: private:
void Generate(); void Generate();
void GenerateCommon(); void GenerateCommon();
void GenMfcr();
public: public:
void Init() void Init()