Merge pull request #3629 from degasus/arm

JitArm64: Single precision tracking.
This commit is contained in:
Ryan Houdek 2016-02-25 18:10:15 -05:00
commit a0c51806ec
8 changed files with 412 additions and 263 deletions

View File

@ -73,6 +73,11 @@ void JitArm64::EmitBackpatchRoutine(u32 flags, bool fastmem, bool do_farcode,
m_float_emit.REV32(8, D0, D0);
m_float_emit.STR(64, Q0, X28, addr);
}
else if (flags & BackPatchInfo::FLAG_SIZE_F32X2I)
{
m_float_emit.REV32(8, D0, RS);
m_float_emit.STR(64, Q0, X28, addr);
}
else
{
m_float_emit.REV64(8, Q0, RS);
@ -86,7 +91,6 @@ void JitArm64::EmitBackpatchRoutine(u32 flags, bool fastmem, bool do_farcode,
{
m_float_emit.LDR(32, EncodeRegToDouble(RS), X28, addr);
m_float_emit.REV32(8, EncodeRegToDouble(RS), EncodeRegToDouble(RS));
m_float_emit.FCVT(64, 32, EncodeRegToDouble(RS), EncodeRegToDouble(RS));
}
else
{
@ -198,6 +202,13 @@ void JitArm64::EmitBackpatchRoutine(u32 flags, bool fastmem, bool do_farcode,
MOVI2R(X30, (u64)PowerPC::Write_U64);
BLR(X30);
}
else if (flags & BackPatchInfo::FLAG_SIZE_F32X2I)
{
m_float_emit.UMOV(64, X0, RS, 0);
ORR(X0, SP, X0, ArithOption(X0, ST_ROR, 32));
MOVI2R(X30, (u64)PowerPC::Write_U64);
BLR(X30);
}
else
{
MOVI2R(X30, (u64)&PowerPC::Write_U64);
@ -214,7 +225,6 @@ void JitArm64::EmitBackpatchRoutine(u32 flags, bool fastmem, bool do_farcode,
MOVI2R(X30, (u64)&PowerPC::Read_U32);
BLR(X30);
m_float_emit.INS(32, RS, 0, X0);
m_float_emit.FCVT(64, 32, EncodeRegToDouble(RS), EncodeRegToDouble(RS));
}
else
{

View File

@ -33,34 +33,44 @@ void JitArm64::fp_arith(UGeckoInstruction inst)
bool use_c = op5 >= 25; // fmul and all kind of fmaddXX
bool use_b = op5 != 25; // fmul uses no B
bool inputs_are_singles = fpr.IsSingle(a, !packed) && (!use_b || fpr.IsSingle(b, !packed)) && (!use_c || fpr.IsSingle(c, !packed));
ARM64Reg VA, VB, VC, VD;
if (packed)
{
VA = fpr.R(a, REG_REG);
RegType type = inputs_are_singles ? REG_REG_SINGLE : REG_REG;
u8 size = inputs_are_singles ? 32 : 64;
ARM64Reg (*reg_encoder)(ARM64Reg) = inputs_are_singles ? EncodeRegToDouble : EncodeRegToQuad;
VA = reg_encoder(fpr.R(a, type));
if (use_b)
VB = fpr.R(b, REG_REG);
VB = reg_encoder(fpr.R(b, type));
if (use_c)
VC = fpr.R(c, REG_REG);
VD = fpr.RW(d, REG_REG);
VC = reg_encoder(fpr.R(c, type));
VD = reg_encoder(fpr.RW(d, type));
switch (op5)
{
case 18: m_float_emit.FDIV(64, VD, VA, VB); break;
case 20: m_float_emit.FSUB(64, VD, VA, VB); break;
case 21: m_float_emit.FADD(64, VD, VA, VB); break;
case 25: m_float_emit.FMUL(64, VD, VA, VC); break;
case 18: m_float_emit.FDIV(size, VD, VA, VB); break;
case 20: m_float_emit.FSUB(size, VD, VA, VB); break;
case 21: m_float_emit.FADD(size, VD, VA, VB); break;
case 25: m_float_emit.FMUL(size, VD, VA, VC); break;
default: _assert_msg_(DYNA_REC, 0, "fp_arith"); break;
}
}
else
{
VA = EncodeRegToDouble(fpr.R(a, REG_IS_LOADED));
RegType type = (inputs_are_singles && single) ? REG_LOWER_PAIR_SINGLE : REG_LOWER_PAIR;
RegType type_out = single ? (inputs_are_singles ? REG_DUP_SINGLE : REG_DUP) : REG_LOWER_PAIR;
ARM64Reg (*reg_encoder)(ARM64Reg) = (inputs_are_singles && single) ? EncodeRegToSingle : EncodeRegToDouble;
VA = reg_encoder(fpr.R(a, type));
if (use_b)
VB = EncodeRegToDouble(fpr.R(b, REG_IS_LOADED));
VB = reg_encoder(fpr.R(b, type));
if (use_c)
VC = EncodeRegToDouble(fpr.R(c, REG_IS_LOADED));
VD = EncodeRegToDouble(fpr.RW(d, single ? REG_DUP : REG_LOWER_PAIR));
VC = reg_encoder(fpr.R(c, type));
VD = reg_encoder(fpr.RW(d, type_out));
switch (op5)
{
@ -95,33 +105,42 @@ void JitArm64::fp_logic(UGeckoInstruction inst)
if (op10 == 72 && b == d)
return;
bool single = fpr.IsSingle(b, !packed);
u8 size = single ? 32 : 64;
if (packed)
{
ARM64Reg VB = fpr.R(b, REG_REG);
ARM64Reg VD = fpr.RW(d, REG_REG);
RegType type = single ? REG_REG_SINGLE : REG_REG;
ARM64Reg (*reg_encoder)(ARM64Reg) = single ? EncodeRegToDouble : EncodeRegToQuad;
ARM64Reg VB = reg_encoder(fpr.R(b, type));
ARM64Reg VD = reg_encoder(fpr.RW(d, type));
switch (op10)
{
case 40: m_float_emit.FNEG(64, VD, VB); break;
case 40: m_float_emit.FNEG(size, VD, VB); break;
case 72: m_float_emit.ORR(VD, VB, VB); break;
case 136: m_float_emit.FABS(64, VD, VB);
m_float_emit.FNEG(64, VD, VD); break;
case 264: m_float_emit.FABS(64, VD, VB); break;
case 136: m_float_emit.FABS(size, VD, VB);
m_float_emit.FNEG(size, VD, VD); break;
case 264: m_float_emit.FABS(size, VD, VB); break;
default: _assert_msg_(DYNA_REC, 0, "fp_logic"); break;
}
}
else
{
ARM64Reg VB = fpr.R(b, REG_IS_LOADED);
ARM64Reg VD = fpr.RW(d);
RegType type = single ? REG_LOWER_PAIR_SINGLE : REG_LOWER_PAIR;
ARM64Reg (*reg_encoder)(ARM64Reg) = single ? EncodeRegToSingle : EncodeRegToDouble;
ARM64Reg VB = fpr.R(b, type);
ARM64Reg VD = fpr.RW(d, type);
switch (op10)
{
case 40: m_float_emit.FNEG(EncodeRegToDouble(VD), EncodeRegToDouble(VB)); break;
case 72: m_float_emit.INS(64, VD, 0, VB, 0); break;
case 136: m_float_emit.FABS(EncodeRegToDouble(VD), EncodeRegToDouble(VB));
m_float_emit.FNEG(EncodeRegToDouble(VD), EncodeRegToDouble(VD)); break;
case 264: m_float_emit.FABS(EncodeRegToDouble(VD), EncodeRegToDouble(VB)); break;
case 40: m_float_emit.FNEG(reg_encoder(VD), reg_encoder(VB)); break;
case 72: m_float_emit.INS(size, VD, 0, VB, 0); break;
case 136: m_float_emit.FABS(reg_encoder(VD), reg_encoder(VB));
m_float_emit.FNEG(reg_encoder(VD), reg_encoder(VD)); break;
case 264: m_float_emit.FABS(reg_encoder(VD), reg_encoder(VB)); break;
default: _assert_msg_(DYNA_REC, 0, "fp_logic"); break;
}
}
@ -135,13 +154,26 @@ void JitArm64::fselx(UGeckoInstruction inst)
u32 a = inst.FA, b = inst.FB, c = inst.FC, d = inst.FD;
ARM64Reg VA = fpr.R(a, REG_IS_LOADED);
ARM64Reg VB = fpr.R(b, REG_IS_LOADED);
ARM64Reg VC = fpr.R(c, REG_IS_LOADED);
ARM64Reg VD = fpr.RW(d);
if (fpr.IsSingle(a, true))
{
ARM64Reg VA = fpr.R(a, REG_LOWER_PAIR_SINGLE);
m_float_emit.FCMPE(EncodeRegToSingle(VA));
}
else
{
ARM64Reg VA = fpr.R(a, REG_LOWER_PAIR);
m_float_emit.FCMPE(EncodeRegToDouble(VA));
}
m_float_emit.FCMPE(EncodeRegToDouble(VA));
m_float_emit.FCSEL(EncodeRegToDouble(VD), EncodeRegToDouble(VC), EncodeRegToDouble(VB), CC_GE);
bool single = fpr.IsSingle(b, true) && fpr.IsSingle(c, true);
RegType type = single ? REG_LOWER_PAIR_SINGLE : REG_LOWER_PAIR;
ARM64Reg (*reg_encoder)(ARM64Reg) = single ? EncodeRegToSingle : EncodeRegToDouble;
ARM64Reg VB = fpr.R(b, type);
ARM64Reg VC = fpr.R(c, type);
ARM64Reg VD = fpr.RW(d, type);
m_float_emit.FCSEL(reg_encoder(VD), reg_encoder(VC), reg_encoder(VB), CC_GE);
}
void JitArm64::frspx(UGeckoInstruction inst)
@ -153,11 +185,22 @@ void JitArm64::frspx(UGeckoInstruction inst)
u32 b = inst.FB, d = inst.FD;
ARM64Reg VB = fpr.R(b, REG_IS_LOADED);
ARM64Reg VD = fpr.RW(d, REG_DUP);
if (fpr.IsSingle(b, true))
{
// Source is already in single precision, so no need to do anything but to copy to PSR1.
ARM64Reg VB = fpr.R(b, REG_LOWER_PAIR_SINGLE);
ARM64Reg VD = fpr.RW(d, REG_DUP_SINGLE);
m_float_emit.FCVT(32, 64, EncodeRegToDouble(VD), EncodeRegToDouble(VB));
m_float_emit.FCVT(64, 32, EncodeRegToDouble(VD), EncodeRegToDouble(VD));
if (b != d)
m_float_emit.FMOV(EncodeRegToSingle(VD), EncodeRegToSingle(VB));
}
else
{
ARM64Reg VB = fpr.R(b, REG_LOWER_PAIR);
ARM64Reg VD = fpr.RW(d, REG_DUP_SINGLE);
m_float_emit.FCVT(32, 64, EncodeRegToDouble(VD), EncodeRegToDouble(VB));
}
}
void JitArm64::fcmpX(UGeckoInstruction inst)
@ -169,8 +212,12 @@ void JitArm64::fcmpX(UGeckoInstruction inst)
u32 a = inst.FA, b = inst.FB;
int crf = inst.CRFD;
ARM64Reg VA = fpr.R(a, REG_IS_LOADED);
ARM64Reg VB = fpr.R(b, REG_IS_LOADED);
bool singles = fpr.IsSingle(a, true) && fpr.IsSingle(b, true);
RegType type = singles ? REG_LOWER_PAIR_SINGLE : REG_LOWER_PAIR;
ARM64Reg (*reg_encoder)(ARM64Reg) = singles ? EncodeRegToSingle : EncodeRegToDouble;
ARM64Reg VA = reg_encoder(fpr.R(a, type));
ARM64Reg VB = reg_encoder(fpr.R(b, type));
ARM64Reg WA = gpr.GetReg();
ARM64Reg XA = EncodeRegTo64(WA);
@ -179,7 +226,7 @@ void JitArm64::fcmpX(UGeckoInstruction inst)
FixupBranch continue1, continue2, continue3;
ORR(XA, ZR, 32, 0, true);
m_float_emit.FCMP(EncodeRegToDouble(VA), EncodeRegToDouble(VB));
m_float_emit.FCMP(VA, VB);
if (a != b)
{
@ -231,7 +278,9 @@ void JitArm64::fctiwzx(UGeckoInstruction inst)
u32 b = inst.FB, d = inst.FD;
ARM64Reg VB = fpr.R(b, REG_IS_LOADED);
bool single = fpr.IsSingle(b, true);
ARM64Reg VB = fpr.R(b, single ? REG_LOWER_PAIR_SINGLE : REG_LOWER_PAIR);
ARM64Reg VD = fpr.RW(d);
ARM64Reg V0 = fpr.GetReg();
@ -240,8 +289,15 @@ void JitArm64::fctiwzx(UGeckoInstruction inst)
m_float_emit.MOVI(64, EncodeRegToDouble(V0), 0xFFFF000000000000ULL);
m_float_emit.BIC(16, EncodeRegToDouble(V0), 0x7);
m_float_emit.FCVT(32, 64, EncodeRegToDouble(VD), EncodeRegToDouble(VB));
m_float_emit.FCVTS(EncodeRegToSingle(VD), EncodeRegToSingle(VD), ROUND_Z);
if (single)
{
m_float_emit.FCVTS(EncodeRegToSingle(VD), EncodeRegToSingle(VB), ROUND_Z);
}
else
{
m_float_emit.FCVT(32, 64, EncodeRegToDouble(VD), EncodeRegToDouble(VB));
m_float_emit.FCVTS(EncodeRegToSingle(VD), EncodeRegToSingle(VD), ROUND_Z);
}
m_float_emit.ORR(EncodeRegToDouble(VD), EncodeRegToDouble(VD), EncodeRegToDouble(V0));
fpr.Unlock(V0);
}

View File

@ -76,7 +76,7 @@ void JitArm64::lfXX(UGeckoInstruction inst)
u32 imm_addr = 0;
bool is_immediate = false;
RegType type = !!(flags & BackPatchInfo::FLAG_SIZE_F64) ? REG_LOWER_PAIR : REG_DUP;
RegType type = !!(flags & BackPatchInfo::FLAG_SIZE_F64) ? REG_LOWER_PAIR : REG_DUP_SINGLE;
gpr.Lock(W0, W30);
fpr.Lock(Q0);
@ -270,7 +270,16 @@ void JitArm64::stfXX(UGeckoInstruction inst)
gpr.Lock(W0, W1, W30);
fpr.Lock(Q0);
ARM64Reg V0 = fpr.R(inst.FS, REG_IS_LOADED);
bool single = (flags & BackPatchInfo::FLAG_SIZE_F32) && fpr.IsSingle(inst.FS, true);
ARM64Reg V0 = fpr.R(inst.FS, single ? REG_LOWER_PAIR_SINGLE : REG_LOWER_PAIR);
if (single)
{
flags &= ~BackPatchInfo::FLAG_SIZE_F32;
flags |= BackPatchInfo::FLAG_SIZE_F32I;
}
ARM64Reg addr_reg = W1;
if (update)
@ -407,24 +416,29 @@ void JitArm64::stfXX(UGeckoInstruction inst)
ADD(X1, X30, pipe_off);
LDR(INDEX_UNSIGNED, W0, X30, count_off);
if (accessSize == 64)
if (flags & BackPatchInfo::FLAG_SIZE_F64)
{
m_float_emit.REV64(8, Q0, V0);
if (pipe_off)
m_float_emit.STR(64, Q0, X1, ArithOption(X0));
else
m_float_emit.STR(64, Q0, X30, ArithOption(X0));
}
else if (accessSize == 32)
else if (flags & BackPatchInfo::FLAG_SIZE_F32)
{
m_float_emit.FCVT(32, 64, D0, EncodeRegToDouble(V0));
m_float_emit.REV32(8, D0, D0);
if (pipe_off)
m_float_emit.STR(32, D0, X1, ArithOption(X0));
else
m_float_emit.STR(32, D0, X30, ArithOption(X0));
}
else if (flags & BackPatchInfo::FLAG_SIZE_F32I)
{
m_float_emit.REV32(8, D0, V0);
}
if (pipe_off)
{
m_float_emit.STR(accessSize, accessSize == 64 ? Q0 : D0, X1, ArithOption(X0));
}
else
{
m_float_emit.STR(accessSize, accessSize == 64 ? Q0 : D0, X30, ArithOption(X0));
}
ADD(W0, W0, accessSize >> 3);
STR(INDEX_UNSIGNED, W0, X30, count_off);
js.fifoBytesThisBlock += accessSize >> 3;

View File

@ -62,20 +62,17 @@ void JitArm64::psq_l(UGeckoInstruction inst)
if (js.assumeNoPairedQuantize)
{
VS = fpr.RW(inst.RS, REG_REG);
VS = fpr.RW(inst.RS, REG_REG_SINGLE);
if (!inst.W)
{
ADD(EncodeRegTo64(addr_reg), EncodeRegTo64(addr_reg), X28);
m_float_emit.LD1(32, 1, EncodeRegToDouble(VS), EncodeRegTo64(addr_reg));
m_float_emit.REV32(8, VS, VS);
m_float_emit.FCVTL(64, VS, VS);
}
else
{
m_float_emit.LDR(32, VS, EncodeRegTo64(addr_reg), X28);
m_float_emit.REV32(8, VS, VS);
m_float_emit.FCVT(64, 32, EncodeRegToDouble(VS), EncodeRegToDouble(VS));
}
m_float_emit.REV32(8, EncodeRegToDouble(VS), EncodeRegToDouble(VS));
}
else
{
@ -87,17 +84,14 @@ void JitArm64::psq_l(UGeckoInstruction inst)
LDR(X30, X30, ArithOption(EncodeRegTo64(type_reg), true));
BLR(X30);
VS = fpr.RW(inst.RS, REG_REG);
if (!inst.W)
m_float_emit.FCVTL(64, VS, D0);
else
m_float_emit.FCVT(64, 32, EncodeRegToDouble(VS), D0);
VS = fpr.RW(inst.RS, REG_REG_SINGLE);
m_float_emit.ORR(EncodeRegToDouble(VS), D0, D0);
}
if (inst.W)
{
m_float_emit.FMOV(D0, 0x70); // 1.0 as a Double
m_float_emit.INS(64, VS, 1, Q0, 0);
m_float_emit.FMOV(S0, 0x70); // 1.0 as a Single
m_float_emit.INS(32, VS, 1, Q0, 0);
}
gpr.Unlock(W0, W1, W2, W30);
@ -121,8 +115,10 @@ void JitArm64::psq_st(UGeckoInstruction inst)
gpr.Lock(W0, W1, W2, W30);
fpr.Lock(Q0, Q1);
bool single = fpr.IsSingle(inst.RS);
ARM64Reg arm_addr = gpr.R(inst.RA);
ARM64Reg VS = fpr.R(inst.RS, REG_REG);
ARM64Reg VS = fpr.R(inst.RS, single ? REG_REG_SINGLE : REG_REG);
ARM64Reg scale_reg = W0;
ARM64Reg addr_reg = W1;
@ -156,7 +152,12 @@ void JitArm64::psq_st(UGeckoInstruction inst)
if (js.assumeNoPairedQuantize)
{
u32 flags = BackPatchInfo::FLAG_STORE;
flags |= (inst.W ? BackPatchInfo::FLAG_SIZE_F32 : BackPatchInfo::FLAG_SIZE_F32X2);
if (single)
flags |= (inst.W ? BackPatchInfo::FLAG_SIZE_F32I : BackPatchInfo::FLAG_SIZE_F32X2I);
else
flags |= (inst.W ? BackPatchInfo::FLAG_SIZE_F32 : BackPatchInfo::FLAG_SIZE_F32X2);
EmitBackpatchRoutine(flags,
jo.fastmem,
jo.fastmem,
@ -166,10 +167,17 @@ void JitArm64::psq_st(UGeckoInstruction inst)
}
else
{
if (inst.W)
m_float_emit.FCVT(32, 64, D0, VS);
if (single)
{
m_float_emit.ORR(D0, VS, VS);
}
else
m_float_emit.FCVTN(32, D0, VS);
{
if (inst.W)
m_float_emit.FCVT(32, 64, D0, VS);
else
m_float_emit.FCVTN(32, D0, VS);
}
LDR(INDEX_UNSIGNED, scale_reg, X29, PPCSTATE_OFF(spr[SPR_GQR0 + inst.I]));
UBFM(type_reg, scale_reg, 0, 2); // Type

View File

@ -25,36 +25,41 @@ void JitArm64::ps_mergeXX(UGeckoInstruction inst)
u32 a = inst.FA, b = inst.FB, d = inst.FD;
ARM64Reg VA = fpr.R(a, REG_REG);
ARM64Reg VB = fpr.R(b, REG_REG);
ARM64Reg VD = fpr.RW(d, REG_REG);
bool singles = fpr.IsSingle(a) && fpr.IsSingle(b);
RegType type = singles ? REG_REG_SINGLE : REG_REG;
u8 size = singles ? 32 : 64;
ARM64Reg (*reg_encoder)(ARM64Reg) = singles ? EncodeRegToDouble : EncodeRegToQuad;
ARM64Reg VA = fpr.R(a, type);
ARM64Reg VB = fpr.R(b, type);
ARM64Reg VD = fpr.RW(d, type);
switch (inst.SUBOP10)
{
case 528: //00
m_float_emit.TRN1(64, VD, VA, VB);
m_float_emit.TRN1(size, VD, VA, VB);
break;
case 560: //01
m_float_emit.INS(64, VD, 0, VA, 0);
m_float_emit.INS(64, VD, 1, VB, 1);
m_float_emit.INS(size, VD, 0, VA, 0);
m_float_emit.INS(size, VD, 1, VB, 1);
break;
case 592: //10
if (d != a && d != b)
{
m_float_emit.INS(64, VD, 0, VA, 1);
m_float_emit.INS(64, VD, 1, VB, 0);
m_float_emit.INS(size, VD, 0, VA, 1);
m_float_emit.INS(size, VD, 1, VB, 0);
}
else
{
ARM64Reg V0 = fpr.GetReg();
m_float_emit.INS(64, V0, 0, VA, 1);
m_float_emit.INS(64, V0, 1, VB, 0);
m_float_emit.ORR(VD, V0, V0);
m_float_emit.INS(size, V0, 0, VA, 1);
m_float_emit.INS(size, V0, 1, VB, 0);
m_float_emit.ORR(reg_encoder(VD), reg_encoder(V0), reg_encoder(V0));
fpr.Unlock(V0);
}
break;
case 624: //11
m_float_emit.TRN2(64, VD, VA, VB);
m_float_emit.TRN2(size, VD, VA, VB);
break;
default:
_assert_msg_(DYNA_REC, 0, "ps_merge - invalid op");
@ -73,13 +78,19 @@ void JitArm64::ps_mulsX(UGeckoInstruction inst)
bool upper = inst.SUBOP5 == 13;
ARM64Reg VA = fpr.R(a, REG_REG);
ARM64Reg VC = fpr.R(c, REG_REG);
ARM64Reg VD = fpr.RW(d, REG_REG);
bool singles = fpr.IsSingle(a) && fpr.IsSingle(c);
RegType type = singles ? REG_REG_SINGLE : REG_REG;
u8 size = singles ? 32 : 64;
ARM64Reg (*reg_encoder)(ARM64Reg) = singles ? EncodeRegToDouble : EncodeRegToQuad;
ARM64Reg VA = fpr.R(a, type);
ARM64Reg VC = fpr.R(c, type);
ARM64Reg VD = fpr.RW(d, type);
ARM64Reg V0 = fpr.GetReg();
m_float_emit.DUP(64, V0, VC, upper ? 1 : 0);
m_float_emit.FMUL(64, VD, VA, V0);
m_float_emit.DUP(size, reg_encoder(V0), reg_encoder(VC), upper ? 1 : 0);
m_float_emit.FMUL(size, reg_encoder(VD), reg_encoder(VA), reg_encoder(V0));
fpr.FixSinglePrecision(d);
fpr.Unlock(V0);
}
@ -94,41 +105,49 @@ void JitArm64::ps_maddXX(UGeckoInstruction inst)
u32 a = inst.FA, b = inst.FB, c = inst.FC, d = inst.FD;
u32 op5 = inst.SUBOP5;
ARM64Reg VA = fpr.R(a, REG_REG);
ARM64Reg VB = fpr.R(b, REG_REG);
ARM64Reg VC = fpr.R(c, REG_REG);
ARM64Reg VD = fpr.RW(d, REG_REG);
ARM64Reg V0 = fpr.GetReg();
bool singles = fpr.IsSingle(a) && fpr.IsSingle(b) && fpr.IsSingle(c);
RegType type = singles ? REG_REG_SINGLE : REG_REG;
u8 size = singles ? 32 : 64;
ARM64Reg (*reg_encoder)(ARM64Reg) = singles ? EncodeRegToDouble : EncodeRegToQuad;
ARM64Reg VA = reg_encoder(fpr.R(a, type));
ARM64Reg VB = reg_encoder(fpr.R(b, type));
ARM64Reg VC = reg_encoder(fpr.R(c, type));
ARM64Reg VD = reg_encoder(fpr.RW(d, type));
ARM64Reg V0Q = fpr.GetReg();
ARM64Reg V0 = reg_encoder(V0Q);
// TODO: Do FMUL and FADD/FSUB in *one* host call to save accuracy.
switch (op5)
{
case 14: // ps_madds0
m_float_emit.DUP(64, V0, VC, 0);
m_float_emit.FMUL(64, V0, V0, VA);
m_float_emit.FADD(64, VD, V0, VB);
m_float_emit.DUP(size, V0, VC, 0);
m_float_emit.FMUL(size, V0, V0, VA);
m_float_emit.FADD(size, VD, V0, VB);
break;
case 15: // ps_madds1
m_float_emit.DUP(64, V0, VC, 1);
m_float_emit.FMUL(64, V0, V0, VA);
m_float_emit.FADD(64, VD, V0, VB);
m_float_emit.DUP(size, V0, VC, 1);
m_float_emit.FMUL(size, V0, V0, VA);
m_float_emit.FADD(size, VD, V0, VB);
break;
case 28: // ps_msub
m_float_emit.FMUL(64, V0, VA, VC);
m_float_emit.FSUB(64, VD, V0, VB);
m_float_emit.FMUL(size, V0, VA, VC);
m_float_emit.FSUB(size, VD, V0, VB);
break;
case 29: // ps_madd
m_float_emit.FMUL(64, V0, VA, VC);
m_float_emit.FADD(64, VD, V0, VB);
m_float_emit.FMUL(size, V0, VA, VC);
m_float_emit.FADD(size, VD, V0, VB);
break;
case 30: // ps_nmsub
m_float_emit.FMUL(64, V0, VA, VC);
m_float_emit.FSUB(64, VD, V0, VB);
m_float_emit.FNEG(64, VD, VD);
m_float_emit.FMUL(size, V0, VA, VC);
m_float_emit.FSUB(size, VD, V0, VB);
m_float_emit.FNEG(size, VD, VD);
break;
case 31: // ps_nmadd
m_float_emit.FMUL(64, V0, VA, VC);
m_float_emit.FADD(64, VD, V0, VB);
m_float_emit.FNEG(64, VD, VD);
m_float_emit.FMUL(size, V0, VA, VC);
m_float_emit.FADD(size, VD, V0, VB);
m_float_emit.FNEG(size, VD, VD);
break;
default:
_assert_msg_(DYNA_REC, 0, "ps_madd - invalid op");
@ -136,7 +155,7 @@ void JitArm64::ps_maddXX(UGeckoInstruction inst)
}
fpr.FixSinglePrecision(d);
fpr.Unlock(V0);
fpr.Unlock(V0Q);
}
void JitArm64::ps_res(UGeckoInstruction inst)
@ -148,10 +167,16 @@ void JitArm64::ps_res(UGeckoInstruction inst)
u32 b = inst.FB, d = inst.FD;
ARM64Reg VB = fpr.R(b, REG_REG);
ARM64Reg VD = fpr.RW(d, REG_REG);
bool singles = fpr.IsSingle(b);
RegType type = singles ? REG_REG_SINGLE : REG_REG;
u8 size = singles ? 32 : 64;
ARM64Reg (*reg_encoder)(ARM64Reg) = singles ? EncodeRegToDouble : EncodeRegToQuad;
ARM64Reg VB = fpr.R(b, type);
ARM64Reg VD = fpr.RW(d, type);
m_float_emit.FRSQRTE(size, reg_encoder(VD), reg_encoder(VB));
m_float_emit.FRSQRTE(64, VD, VB);
fpr.FixSinglePrecision(d);
}
@ -163,23 +188,29 @@ void JitArm64::ps_sel(UGeckoInstruction inst)
u32 a = inst.FA, b = inst.FB, c = inst.FC, d = inst.FD;
ARM64Reg VA = fpr.R(a, REG_REG);
ARM64Reg VB = fpr.R(b, REG_REG);
ARM64Reg VC = fpr.R(c, REG_REG);
ARM64Reg VD = fpr.RW(d, REG_REG);
bool singles = fpr.IsSingle(a) && fpr.IsSingle(b) && fpr.IsSingle(c);
RegType type = singles ? REG_REG_SINGLE : REG_REG;
u8 size = singles ? 32 : 64;
ARM64Reg (*reg_encoder)(ARM64Reg) = singles ? EncodeRegToDouble : EncodeRegToQuad;
if (d != a && d != b && d != c)
ARM64Reg VA = reg_encoder(fpr.R(a, type));
ARM64Reg VB = reg_encoder(fpr.R(b, type));
ARM64Reg VC = reg_encoder(fpr.R(c, type));
ARM64Reg VD = reg_encoder(fpr.RW(d, type));
if (d != b && d != c)
{
m_float_emit.FCMGE(64, VD, VA);
m_float_emit.FCMGE(size, VD, VA);
m_float_emit.BSL(VD, VC, VB);
}
else
{
ARM64Reg V0 = fpr.GetReg();
m_float_emit.FCMGE(64, V0, VA);
ARM64Reg V0Q = fpr.GetReg();
ARM64Reg V0 = reg_encoder(V0Q);
m_float_emit.FCMGE(size, V0, VA);
m_float_emit.BSL(V0, VC, VB);
m_float_emit.ORR(VD, V0, V0);
fpr.Unlock(V0);
fpr.Unlock(V0Q);
}
}
@ -194,23 +225,29 @@ void JitArm64::ps_sumX(UGeckoInstruction inst)
bool upper = inst.SUBOP5 == 11;
ARM64Reg VA = fpr.R(a, REG_REG);
ARM64Reg VB = fpr.R(b, REG_REG);
ARM64Reg VC = fpr.R(c, REG_REG);
ARM64Reg VD = fpr.RW(d, REG_REG);
bool singles = fpr.IsSingle(a) && fpr.IsSingle(b) && fpr.IsSingle(c);
RegType type = singles ? REG_REG_SINGLE : REG_REG;
u8 size = singles ? 32 : 64;
ARM64Reg (*reg_encoder)(ARM64Reg) = singles ? EncodeRegToDouble : EncodeRegToQuad;
ARM64Reg VA = fpr.R(a, type);
ARM64Reg VB = fpr.R(b, type);
ARM64Reg VC = fpr.R(c, type);
ARM64Reg VD = fpr.RW(d, type);
ARM64Reg V0 = fpr.GetReg();
m_float_emit.DUP(64, V0, upper ? VA : VB, upper ? 0 : 1);
m_float_emit.DUP(size, reg_encoder(V0), reg_encoder(upper ? VA : VB), upper ? 0 : 1);
if (d != c)
{
m_float_emit.FADD(64, VD, V0, upper ? VB : VA);
m_float_emit.INS(64, VD, upper ? 0 : 1, VC, upper ? 0 : 1);
m_float_emit.FADD(size, reg_encoder(VD), reg_encoder(V0), reg_encoder(upper ? VB : VA));
m_float_emit.INS(size, VD, upper ? 0 : 1, VC, upper ? 0 : 1);
}
else
{
m_float_emit.FADD(64, V0, V0, upper ? VB : VA);
m_float_emit.INS(64, VD, upper ? 1 : 0, V0, upper ? 1 : 0);
m_float_emit.FADD(size, reg_encoder(V0), reg_encoder(V0), reg_encoder(upper ? VB : VA));
m_float_emit.INS(size, VD, upper ? 1 : 0, V0, upper ? 1 : 0);
}
fpr.FixSinglePrecision(d);
fpr.Unlock(V0);

View File

@ -198,7 +198,7 @@ ARM64Reg Arm64GPRCache::R(u32 preg)
{
ARM64Reg host_reg = GetReg();
m_emit->MOVI2R(host_reg, reg.GetImm());
reg.LoadToReg(host_reg);
reg.Load(host_reg);
reg.SetDirty(true);
return host_reg;
}
@ -208,7 +208,7 @@ ARM64Reg Arm64GPRCache::R(u32 preg)
// This is a bit annoying. We try to keep these preloaded as much as possible
// This can also happen in cases where PPCAnalyst isn't feeding us proper register usage statistics
ARM64Reg host_reg = GetReg();
reg.LoadToReg(host_reg);
reg.Load(host_reg);
reg.SetDirty(false);
m_emit->LDR(INDEX_UNSIGNED, host_reg, X29, PPCSTATE_OFF(gpr[preg]));
return host_reg;
@ -240,7 +240,7 @@ void Arm64GPRCache::BindToRegister(u32 preg, bool do_load)
if (reg.GetType() == REG_NOTLOADED)
{
ARM64Reg host_reg = GetReg();
reg.LoadToReg(host_reg);
reg.Load(host_reg);
if (do_load)
m_emit->LDR(INDEX_UNSIGNED, host_reg, X29, PPCSTATE_OFF(gpr[preg]));
}
@ -307,12 +307,38 @@ ARM64Reg Arm64FPRCache::R(u32 preg, RegType type)
OpArg& reg = m_guest_registers[preg];
IncrementAllUsed();
reg.ResetLastUsed();
ARM64Reg host_reg = reg.GetReg();
switch (reg.GetType())
{
case REG_REG_SINGLE:
{
// We're asked for singles, so just return the register.
if (type == REG_REG_SINGLE || type == REG_LOWER_PAIR_SINGLE)
return host_reg;
// Else convert this register back to doubles.
m_float_emit->FCVTL(64, EncodeRegToDouble(host_reg), EncodeRegToDouble(host_reg));
reg.Load(host_reg, REG_REG);
// fall through
}
case REG_REG: // already in a reg
return reg.GetReg();
break;
{
return host_reg;
}
case REG_LOWER_PAIR_SINGLE:
{
// We're asked for the lower single, so just return the register.
if (type == REG_LOWER_PAIR_SINGLE)
return host_reg;
// Else convert this register back to a double.
m_float_emit->FCVT(64, 32, EncodeRegToDouble(host_reg), EncodeRegToDouble(host_reg));
reg.Load(host_reg, REG_LOWER_PAIR);
// fall through
}
case REG_LOWER_PAIR:
{
if (type == REG_REG)
@ -320,48 +346,62 @@ ARM64Reg Arm64FPRCache::R(u32 preg, RegType type)
// Load the high 64bits from the file and insert them in to the high 64bits of the host register
ARM64Reg tmp_reg = GetReg();
m_float_emit->LDR(64, INDEX_UNSIGNED, tmp_reg, X29, PPCSTATE_OFF(ps[preg][1]));
m_float_emit->INS(64, reg.GetReg(), 1, tmp_reg, 0);
m_float_emit->INS(64, host_reg, 1, tmp_reg, 0);
UnlockRegister(tmp_reg);
// Change it over to a full 128bit register
reg.LoadToReg(reg.GetReg());
reg.Load(host_reg, REG_REG);
}
return reg.GetReg();
return host_reg;
}
case REG_DUP_SINGLE:
{
if (type == REG_LOWER_PAIR_SINGLE)
return host_reg;
if (type == REG_REG_SINGLE)
{
// Duplicate to the top and change over
m_float_emit->INS(32, host_reg, 1, host_reg, 0);
reg.Load(host_reg, REG_REG_SINGLE);
return host_reg;
}
m_float_emit->FCVT(64, 32, EncodeRegToDouble(host_reg), EncodeRegToDouble(host_reg));
reg.Load(host_reg, REG_DUP);
// fall through
}
break;
case REG_DUP:
{
ARM64Reg host_reg = reg.GetReg();
if (type == REG_REG)
{
// We are requesting a full 128bit register
// but we are only available in the lower 64bits
// Duplicate to the top and change over
m_float_emit->INS(64, host_reg, 1, host_reg, 0);
reg.LoadToReg(host_reg);
reg.Load(host_reg, REG_REG);
}
return host_reg;
}
break;
case REG_NOTLOADED: // Register isn't loaded at /all/
{
ARM64Reg host_reg = GetReg();
host_reg = GetReg();
u32 load_size;
if (type == REG_REG)
{
load_size = 128;
reg.LoadToReg(host_reg);
reg.Load(host_reg, REG_REG);
}
else
{
load_size = 64;
reg.LoadLowerReg(host_reg);
reg.Load(host_reg, REG_LOWER_PAIR);
}
reg.SetDirty(false);
m_float_emit->LDR(load_size, INDEX_UNSIGNED, host_reg, X29, PPCSTATE_OFF(ps[preg][0]));
return host_reg;
}
break;
default:
_dbg_assert_msg_(DYNA_REC, false, "Invalid OpArg Type!");
break;
@ -380,90 +420,52 @@ ARM64Reg Arm64FPRCache::RW(u32 preg, RegType type)
reg.ResetLastUsed();
reg.SetDirty(true);
switch (reg.GetType())
// If not loaded at all, just alloc a new one.
if (reg.GetType() == REG_NOTLOADED)
{
case REG_NOTLOADED:
{
ARM64Reg host_reg = GetReg();
if (type == REG_LOWER_PAIR)
{
reg.LoadLowerReg(host_reg);
}
else if (type == REG_DUP)
{
reg.LoadDup(host_reg);
}
else
{
reg.LoadToReg(host_reg);
}
reg.Load(GetReg(), type);
return reg.GetReg();
}
break;
case REG_LOWER_PAIR:
// Only the lower value will be overwritten, so we must be extra careful to store PSR1 if dirty.
if ((type == REG_LOWER_PAIR || type == REG_LOWER_PAIR_SINGLE) && was_dirty)
{
// We must *not* change host_reg as this register might still be in use. So it's fine to
// store this register, but it's *not* fine to convert it to double. So for double conversion,
// a temporary register needs to be used.
ARM64Reg host_reg = reg.GetReg();
if (type == REG_REG)
ARM64Reg flush_reg = host_reg;
switch (reg.GetType())
{
// Change it over to a full 128bit register
reg.LoadToReg(host_reg);
}
else if (type == REG_DUP)
{
// Register is already the lower pair
// Just convert it over to a dup
reg.LoadDup(host_reg);
}
}
break;
case REG_REG:
{
ARM64Reg host_reg = reg.GetReg();
if (type == REG_LOWER_PAIR)
{
// If we only want the lower bits, let's store away the high bits and drop to a lower only register
case REG_REG_SINGLE:
flush_reg = GetReg();
m_float_emit->FCVTL(64, EncodeRegToDouble(flush_reg), EncodeRegToDouble(host_reg));
// fall through
case REG_REG:
// We are doing a full 128bit store because it takes 2 cycles on a Cortex-A57 to do a 128bit store.
// It would take longer to do an insert to a temporary and a 64bit store than to just do this.
if (was_dirty)
m_float_emit->STR(128, INDEX_UNSIGNED, host_reg, X29, PPCSTATE_OFF(ps[preg][0]));
reg.LoadLowerReg(host_reg);
m_float_emit->STR(128, INDEX_UNSIGNED, flush_reg, X29, PPCSTATE_OFF(ps[preg][0]));
break;
case REG_DUP_SINGLE:
flush_reg = GetReg();
m_float_emit->FCVT(64, 32, EncodeRegToDouble(flush_reg), EncodeRegToDouble(host_reg));
// fall through
case REG_DUP:
// Store PSR1 (which is equal to PSR0) in memory.
m_float_emit->STR(64, INDEX_UNSIGNED, flush_reg, X29, PPCSTATE_OFF(ps[preg][1]));
break;
default:
// All other types don't store anything in PSR1.
break;
}
else if (type == REG_DUP)
{
// If we are going from a full 128bit register to a duplicate
// then we can just change over
reg.LoadDup(host_reg);
}
}
break;
case REG_DUP:
{
ARM64Reg host_reg = reg.GetReg();
if (type == REG_REG)
{
// We are a duplicated register going to a full 128bit register
// Do an insert of our lower 64bits to the higher 64bits
m_float_emit->INS(64, host_reg, 1, host_reg, 0);
// Change over to the full 128bit register
reg.LoadToReg(host_reg);
}
else if (type == REG_LOWER_PAIR)
{
// We are duplicated changing over to a lower register
// We've got to be careful in this instance and do a store of our lower 64bits
// to the upper 64bits in the PowerPC state
// That way incase if we hit the path of DUP->LOWER->REG we get the correct bits back
if (was_dirty)
m_float_emit->STR(64, INDEX_UNSIGNED, host_reg, X29, PPCSTATE_OFF(ps[preg][1]));
reg.LoadLowerReg(host_reg);
}
}
break;
default:
// Do nothing
break;
if (host_reg != flush_reg)
Unlock(flush_reg);
}
reg.Load(reg.GetReg(), type);
return reg.GetReg();
}
@ -510,17 +512,37 @@ bool Arm64FPRCache::IsCalleeSaved(ARM64Reg reg)
void Arm64FPRCache::FlushRegister(u32 preg, bool maintain_state)
{
OpArg& reg = m_guest_registers[preg];
if (reg.GetType() == REG_REG ||
reg.GetType() == REG_LOWER_PAIR)
ARM64Reg host_reg = reg.GetReg();
RegType type = reg.GetType();
bool dirty = reg.IsDirty();
// If we're in single mode, just convert it back to a double.
if (type == REG_REG_SINGLE)
{
if (dirty)
m_float_emit->FCVTL(64, EncodeRegToDouble(host_reg), EncodeRegToDouble(host_reg));
type = REG_REG;
}
if (type == REG_DUP_SINGLE || type == REG_LOWER_PAIR_SINGLE)
{
if (dirty)
m_float_emit->FCVT(64, 32, EncodeRegToDouble(host_reg), EncodeRegToDouble(host_reg));
if (type == REG_DUP_SINGLE)
type = REG_DUP;
else
type = REG_LOWER_PAIR;
}
if (type == REG_REG || type == REG_LOWER_PAIR)
{
ARM64Reg host_reg = reg.GetReg();
u32 store_size;
if (reg.GetType() == REG_REG)
if (type == REG_REG)
store_size = 128;
else
store_size = 64;
if (reg.IsDirty())
if (dirty)
m_float_emit->STR(store_size, INDEX_UNSIGNED, host_reg, X29, PPCSTATE_OFF(ps[preg][0]));
if (!maintain_state)
@ -529,10 +551,9 @@ void Arm64FPRCache::FlushRegister(u32 preg, bool maintain_state)
reg.Flush();
}
}
else if (reg.GetType() == REG_DUP)
else if (type == REG_DUP)
{
ARM64Reg host_reg = reg.GetReg();
if (reg.IsDirty())
if (dirty)
{
// If the paired registers were at the start of ppcState we could do an STP here.
// Too bad moving them would break savestate compatibility between x86_64 and AArch64
@ -564,18 +585,25 @@ BitSet32 Arm64FPRCache::GetCallerSavedUsed()
return registers;
}
// Reports whether guest FPR `preg` is currently cached in a single-precision
// register type. REG_REG_SINGLE and REG_DUP_SINGLE always count;
// REG_LOWER_PAIR_SINGLE counts only when the caller sets `lower_only`.
bool Arm64FPRCache::IsSingle(u32 preg, bool lower_only)
{
	switch (m_guest_registers[preg].GetType())
	{
	case REG_REG_SINGLE:
	case REG_DUP_SINGLE:
		return true;
	case REG_LOWER_PAIR_SINGLE:
		return lower_only;
	default:
		return false;
	}
}
void Arm64FPRCache::FixSinglePrecision(u32 preg)
{
ARM64Reg host_reg = m_guest_registers[preg].GetReg();
switch (m_guest_registers[preg].GetType())
OpArg& reg = m_guest_registers[preg];
ARM64Reg host_reg = reg.GetReg();
switch (reg.GetType())
{
case REG_DUP: // only PS0 needs to be converted
m_float_emit->FCVT(32, 64, EncodeRegToDouble(host_reg), EncodeRegToDouble(host_reg));
m_float_emit->FCVT(64, 32, EncodeRegToDouble(host_reg), EncodeRegToDouble(host_reg));
reg.Load(host_reg, REG_DUP_SINGLE);
break;
case REG_REG: // PS0 and PS1 needs to be converted
m_float_emit->FCVTN(32, EncodeRegToDouble(host_reg), EncodeRegToDouble(host_reg));
m_float_emit->FCVTL(64, EncodeRegToDouble(host_reg), EncodeRegToDouble(host_reg));
reg.Load(host_reg, REG_REG_SINGLE);
break;
default:
break;

View File

@ -22,7 +22,9 @@ enum RegType
REG_IMM, // Reg is really a IMM
REG_LOWER_PAIR, // Only the lower pair of a paired register
REG_DUP, // The lower reg is the same as the upper one (physical upper doesn't actually have the duplicated value)
REG_IS_LOADED, // We don't care what type it is, as long as the lower 64bits are loaded
REG_REG_SINGLE, // Both registers are loaded as single
REG_LOWER_PAIR_SINGLE, // Only the lower pair of a paired register, as single
REG_DUP_SINGLE, // The lower one contains both registers, as single
};
enum FlushMode
@ -56,19 +58,9 @@ public:
{
return m_value;
}
void LoadToReg(ARM64Reg reg)
void Load(ARM64Reg reg, RegType type = REG_REG)
{
m_type = REG_REG;
m_reg = reg;
}
void LoadLowerReg(ARM64Reg reg)
{
m_type = REG_LOWER_PAIR;
m_reg = reg;
}
void LoadDup(ARM64Reg reg)
{
m_type = REG_DUP;
m_type = type;
m_reg = reg;
}
void LoadToImm(u32 imm)
@ -278,6 +270,8 @@ public:
BitSet32 GetCallerSavedUsed() override;
bool IsSingle(u32 preg, bool lower_only = false);
void FixSinglePrecision(u32 preg);
protected:

View File

@ -9,22 +9,24 @@ struct BackPatchInfo
{
enum
{
FLAG_STORE = (1 << 0),
FLAG_LOAD = (1 << 1),
FLAG_SIZE_8 = (1 << 2),
FLAG_SIZE_16 = (1 << 3),
FLAG_SIZE_32 = (1 << 4),
FLAG_SIZE_F32 = (1 << 5),
FLAG_SIZE_F32X2 = (1 << 6),
FLAG_SIZE_F64 = (1 << 7),
FLAG_REVERSE = (1 << 8),
FLAG_EXTEND = (1 << 9),
FLAG_SIZE_F32I = (1 << 10),
FLAG_ZERO_256 = (1 << 11),
FLAG_MASK_FLOAT = FLAG_SIZE_F32 |
FLAG_SIZE_F32X2 |
FLAG_SIZE_F64 |
FLAG_SIZE_F32I,
FLAG_STORE = (1 << 0),
FLAG_LOAD = (1 << 1),
FLAG_SIZE_8 = (1 << 2),
FLAG_SIZE_16 = (1 << 3),
FLAG_SIZE_32 = (1 << 4),
FLAG_SIZE_F32 = (1 << 5),
FLAG_SIZE_F32X2 = (1 << 6),
FLAG_SIZE_F32X2I = (1 << 7),
FLAG_SIZE_F64 = (1 << 8),
FLAG_REVERSE = (1 << 9),
FLAG_EXTEND = (1 << 10),
FLAG_SIZE_F32I = (1 << 11),
FLAG_ZERO_256 = (1 << 12),
FLAG_MASK_FLOAT = FLAG_SIZE_F32 |
FLAG_SIZE_F32X2 |
FLAG_SIZE_F32X2I |
FLAG_SIZE_F64 |
FLAG_SIZE_F32I,
};
static u32 GetFlagSize(u32 flags)