A bit more WIP JIT work, mostly floating-point improvements. Performance should now be reasonably comparable to the current JIT on most workloads, though I don't have exact numbers because I haven't benchmarked floating-point-heavy code against the current JIT yet. The floating-point support is still relatively unoptimized, and a few relatively common functions still aren't compiled, but the most critical work is done.

git-svn-id: https://dolphin-emu.googlecode.com/svn/trunk@1807 8ced0084-cf51-0410-be5f-012b33b47a6e
This commit is contained in:
parent acae6e6b69
commit ce2b4bead9
@@ -305,6 +305,10 @@ InstLoc IRBuilder::FoldUOp(unsigned Opcode, InstLoc Op1, unsigned extra) {
		if (getOpcode(*Op1) == ExpandPackedToMReg)
			return getOp1(Op1);
	}
	if (Opcode == DoubleToSingle) {
		if (getOpcode(*Op1) == DupSingleToMReg)
			return getOp1(Op1);
	}

	return EmitUOp(Opcode, Op1, extra);
}
@@ -590,6 +594,11 @@ static OpArg fregLocForSlot(RegInfo& RI, unsigned slot) {
	return M(&FSlotSet[slot*16]);
}

// Used for accessing the top half of a spilled double
static OpArg fregLocForSlotPlusFour(RegInfo& RI, unsigned slot) {
	return M(&FSlotSet[slot*16+4]);
}

static unsigned fregCreateSpill(RegInfo& RI, InstLoc I) {
	unsigned newSpill = ++RI.numFSpills;
	RI.IInfo[I - RI.FirstI] |= newSpill << 16;
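As an aside (not part of the commit), a minimal sketch of the spill-slot layout these helpers assume: each FP spill slot is 16 bytes, one full XMM value, and on a little-endian host the upper 32 bits of the low double sit 4 bytes into the slot. The names below are illustrative only.

#include <cstdint>

// Hypothetical pool of 16-byte spill slots, standing in for FSlotSet.
static uint8_t FSlotSet[1024 * 16];

// Start of a slot (mirrors fregLocForSlot).
static void* SlotBase(unsigned slot) { return &FSlotSet[slot * 16]; }

// Upper 32 bits of the low double in a slot (mirrors fregLocForSlotPlusFour);
// on a little-endian host the high half of a double lives at byte offset 4.
static void* SlotHighHalf(unsigned slot) { return &FSlotSet[slot * 16 + 4]; }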
@@ -679,6 +688,16 @@ static X64Reg regEnsureInReg(RegInfo& RI, InstLoc I) {
	return loc.GetSimpleReg();
}

static X64Reg fregEnsureInReg(RegInfo& RI, InstLoc I) {
	OpArg loc = fregLocForInst(RI, I);
	if (!loc.IsSimpleReg()) {
		X64Reg newReg = fregFindFreeReg(RI);
		RI.Jit->MOVAPD(newReg, loc);
		loc = R(newReg);
	}
	return loc.GetSimpleReg();
}

static void regSpillCallerSaved(RegInfo& RI) {
	regSpill(RI, EDX);
	regSpill(RI, ECX);
@@ -1044,6 +1063,8 @@ static void DoWriteCode(IRBuilder* ibuild, Jit64* Jit, bool UseProfile) {
	case ExpandPackedToMReg:
	case CompactMRegToPacked:
	case FPNeg:
	case FSNeg:
	case FDNeg:
		if (thisUsed)
			regMarkUse(RI, I, getOp1(I), 1);
		break;
@@ -1052,6 +1073,7 @@ static void DoWriteCode(IRBuilder* ibuild, Jit64* Jit, bool UseProfile) {
	case Load32:
		regMarkMemAddress(RI, I, getOp1(I), 1);
		break;
	case LoadDouble:
	case LoadSingle:
	case LoadPaired:
		regMarkUse(RI, I, getOp1(I), 1);
@@ -1086,9 +1108,17 @@ static void DoWriteCode(IRBuilder* ibuild, Jit64* Jit, bool UseProfile) {
	case ICmpSgt:
	case FSMul:
	case FSAdd:
	case FSSub:
	case FDMul:
	case FDAdd:
	case FDSub:
	case FPAdd:
	case FPMul:
	case FPSub:
	case FPMerge00:
	case FPMerge01:
	case FPMerge10:
	case FPMerge11:
	case InsertDoubleInMReg:
		if (thisUsed) {
			regMarkUse(RI, I, getOp1(I), 1);
@@ -1104,6 +1134,7 @@ static void DoWriteCode(IRBuilder* ibuild, Jit64* Jit, bool UseProfile) {
		regMarkMemAddress(RI, I, getOp2(I), 2);
		break;
	case StoreSingle:
	case StoreDouble:
	case StorePaired:
		regMarkUse(RI, I, getOp1(I), 1);
		regMarkUse(RI, I, getOp2(I), 2);
@@ -1417,6 +1448,21 @@ static void DoWriteCode(IRBuilder* ibuild, Jit64* Jit, bool UseProfile) {
		regNormalRegClear(RI, I);
		break;
	}
	case LoadDouble: {
		if (!thisUsed) break;
		X64Reg reg = fregFindFreeReg(RI);
		Jit->MOV(32, R(ECX), regLocForInst(RI, getOp1(I)));
		Jit->ADD(32, R(ECX), Imm8(4));
		RI.Jit->UnsafeLoadRegToReg(ECX, ECX, 32, 0, false);
		Jit->MOVD_xmm(reg, R(ECX));
		Jit->MOV(32, R(ECX), regLocForInst(RI, getOp1(I)));
		RI.Jit->UnsafeLoadRegToReg(ECX, ECX, 32, 0, false);
		Jit->MOVD_xmm(XMM0, R(ECX));
		Jit->PUNPCKLDQ(reg, R(XMM0));
		RI.fregs[reg] = I;
		regNormalRegClear(RI, I);
		break;
	}
	case LoadPaired: {
		if (!thisUsed) break;
		regSpill(RI, EAX);
@@ -1449,6 +1495,30 @@ static void DoWriteCode(IRBuilder* ibuild, Jit64* Jit, bool UseProfile) {
			regClearInst(RI, getOp2(I));
		break;
	}
	case StoreDouble: {
		regSpill(RI, EAX);
		// FIXME: Use 64-bit where possible
		// FIXME: Use unsafe write with pshufb where possible
		unsigned fspill = fregGetSpill(RI, getOp1(I));
		if (!fspill) {
			// Force the value to spill, so we can use
			// memory operations to load it
			fspill = fregCreateSpill(RI, getOp1(I));
			X64Reg reg = fregLocForInst(RI, getOp1(I)).GetSimpleReg();
			RI.Jit->MOVAPD(fregLocForSlot(RI, fspill), reg);
		}
		Jit->MOV(32, R(EAX), fregLocForSlotPlusFour(RI, fspill));
		Jit->MOV(32, R(ECX), regLocForInst(RI, getOp2(I)));
		RI.Jit->SafeWriteRegToReg(EAX, ECX, 32, 0);
		Jit->MOV(32, R(EAX), fregLocForSlot(RI, fspill));
		Jit->MOV(32, R(ECX), regLocForInst(RI, getOp2(I)));
		RI.Jit->SafeWriteRegToReg(EAX, ECX, 32, 4);
		if (RI.IInfo[I - RI.FirstI] & 4)
			fregClearInst(RI, getOp1(I));
		if (RI.IInfo[I - RI.FirstI] & 8)
			regClearInst(RI, getOp2(I));
		break;
	}
	case StorePaired: {
		regSpill(RI, EAX);
		regSpill(RI, EDX);
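For context (not part of the diff), a rough plain-C++ sketch of what the StoreDouble lowering above does: the spilled double is written to guest memory as two byte-swapped 32-bit halves, high word at the address and low word four bytes later, matching the PowerPC's big-endian layout. The helper names and the flat memory buffer here are illustrative assumptions.

#include <cstdint>
#include <cstring>

// Byte-swap and store one 32-bit word into a flat emulated-memory buffer.
static void WriteBE32(uint8_t* mem, uint32_t addr, uint32_t value)
{
	mem[addr + 0] = (uint8_t)(value >> 24);
	mem[addr + 1] = (uint8_t)(value >> 16);
	mem[addr + 2] = (uint8_t)(value >> 8);
	mem[addr + 3] = (uint8_t)(value);
}

// Store a host double as the JIT's StoreDouble case does: two 32-bit writes,
// each byte-swapped, with the high half of the double at addr and the low
// half at addr + 4.
static void StoreDoubleBE(uint8_t* mem, uint32_t addr, double d)
{
	uint64_t bits;
	std::memcpy(&bits, &d, sizeof(bits));             // host (little-endian) bit pattern
	WriteBE32(mem, addr,     (uint32_t)(bits >> 32)); // high 32 bits first
	WriteBE32(mem, addr + 4, (uint32_t)bits);         // then the low 32 bits
}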
@@ -1501,6 +1571,28 @@ static void DoWriteCode(IRBuilder* ibuild, Jit64* Jit, bool UseProfile) {
		fregNormalRegClear(RI, I);
		break;
	}
	case FSNeg: {
		if (!thisUsed) break;
		X64Reg reg = fregFindFreeReg(RI);
		Jit->MOVAPD(reg, fregLocForInst(RI, getOp1(I)));
		static const u32 GC_ALIGNED16(ssSignBits[4]) =
			{0x80000000};
		Jit->PXOR(reg, M((void*)&ssSignBits));
		RI.fregs[reg] = I;
		fregNormalRegClear(RI, I);
		break;
	}
	case FDNeg: {
		if (!thisUsed) break;
		X64Reg reg = fregFindFreeReg(RI);
		Jit->MOVAPD(reg, fregLocForInst(RI, getOp1(I)));
		static const u64 GC_ALIGNED16(ssSignBits[2]) =
			{0x8000000000000000ULL};
		Jit->PXOR(reg, M((void*)&ssSignBits));
		RI.fregs[reg] = I;
		fregNormalRegClear(RI, I);
		break;
	}
	case FPNeg: {
		if (!thisUsed) break;
		X64Reg reg = fregFindFreeReg(RI);
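As an aside (not in the diff), the FSNeg and FDNeg cases above negate by XORing the IEEE-754 sign bit with PXOR rather than performing a floating-point subtract. A minimal C++ sketch of the same trick:

#include <cstdint>
#include <cstring>

// Flip the sign of a float by toggling its sign bit, as the FSNeg case does
// with a 0x80000000 mask.
static float NegateSingle(float x)
{
	uint32_t bits;
	std::memcpy(&bits, &x, sizeof(bits));
	bits ^= 0x80000000u;
	std::memcpy(&x, &bits, sizeof(bits));
	return x;
}

// Same idea for a double, matching the FDNeg case's 0x8000000000000000 mask.
static double NegateDouble(double x)
{
	uint64_t bits;
	std::memcpy(&bits, &x, sizeof(bits));
	bits ^= 0x8000000000000000ull;
	std::memcpy(&x, &bits, sizeof(bits));
	return x;
}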
@@ -1522,8 +1614,8 @@ static void DoWriteCode(IRBuilder* ibuild, Jit64* Jit, bool UseProfile) {
	}
	case StoreFReg: {
		unsigned ppcreg = *I >> 16;
		Jit->MOVAPD(XMM0, fregLocForInst(RI, getOp1(I)));
		Jit->MOVAPD(M(&PowerPC::ppcState.ps[ppcreg]), XMM0);
		Jit->MOVAPD(M(&PowerPC::ppcState.ps[ppcreg]),
			    fregEnsureInReg(RI, getOp1(I)));
		fregNormalRegClear(RI, I);
		break;
	}
@@ -1553,6 +1645,42 @@ static void DoWriteCode(IRBuilder* ibuild, Jit64* Jit, bool UseProfile) {
		fregNormalRegClear(RI, I);
		break;
	}
	case FSSub: {
		if (!thisUsed) break;
		X64Reg reg = fregFindFreeReg(RI);
		Jit->MOVAPD(reg, fregLocForInst(RI, getOp1(I)));
		Jit->SUBSS(reg, fregLocForInst(RI, getOp2(I)));
		RI.fregs[reg] = I;
		fregNormalRegClear(RI, I);
		break;
	}
	case FDMul: {
		if (!thisUsed) break;
		X64Reg reg = fregFindFreeReg(RI);
		Jit->MOVAPD(reg, fregLocForInst(RI, getOp1(I)));
		Jit->MULSD(reg, fregLocForInst(RI, getOp2(I)));
		RI.fregs[reg] = I;
		fregNormalRegClear(RI, I);
		break;
	}
	case FDAdd: {
		if (!thisUsed) break;
		X64Reg reg = fregFindFreeReg(RI);
		Jit->MOVAPD(reg, fregLocForInst(RI, getOp1(I)));
		Jit->ADDSD(reg, fregLocForInst(RI, getOp2(I)));
		RI.fregs[reg] = I;
		fregNormalRegClear(RI, I);
		break;
	}
	case FDSub: {
		if (!thisUsed) break;
		X64Reg reg = fregFindFreeReg(RI);
		Jit->MOVAPD(reg, fregLocForInst(RI, getOp1(I)));
		Jit->SUBSD(reg, fregLocForInst(RI, getOp2(I)));
		RI.fregs[reg] = I;
		fregNormalRegClear(RI, I);
		break;
	}
	case FPAdd: {
		if (!thisUsed) break;
		X64Reg reg = fregFindFreeReg(RI);
@@ -1580,6 +1708,47 @@ static void DoWriteCode(IRBuilder* ibuild, Jit64* Jit, bool UseProfile) {
		fregNormalRegClear(RI, I);
		break;
	}
	case FPMerge00: {
		if (!thisUsed) break;
		X64Reg reg = fregFindFreeReg(RI);
		Jit->MOVAPD(reg, fregLocForInst(RI, getOp1(I)));
		Jit->PUNPCKLDQ(reg, fregLocForInst(RI, getOp2(I)));
		RI.fregs[reg] = I;
		fregNormalRegClear(RI, I);
		break;
	}
	case FPMerge01: {
		if (!thisUsed) break;
		X64Reg reg = fregFindFreeReg(RI);
		// Note reversed operands!
		Jit->MOVAPD(reg, fregLocForInst(RI, getOp2(I)));
		Jit->MOVAPD(XMM0, fregLocForInst(RI, getOp1(I)));
		Jit->MOVSS(reg, R(XMM0));
		RI.fregs[reg] = I;
		fregNormalRegClear(RI, I);
		break;
	}
	case FPMerge10: {
		if (!thisUsed) break;
		X64Reg reg = fregFindFreeReg(RI);
		Jit->MOVAPD(reg, fregLocForInst(RI, getOp1(I)));
		Jit->MOVAPD(XMM0, fregLocForInst(RI, getOp2(I)));
		Jit->MOVSS(reg, R(XMM0));
		Jit->SHUFPS(reg, R(reg), 0xF1);
		RI.fregs[reg] = I;
		fregNormalRegClear(RI, I);
		break;
	}
	case FPMerge11: {
		if (!thisUsed) break;
		X64Reg reg = fregFindFreeReg(RI);
		Jit->MOVAPD(reg, fregLocForInst(RI, getOp1(I)));
		Jit->PUNPCKLDQ(reg, fregLocForInst(RI, getOp2(I)));
		Jit->SHUFPD(reg, R(reg), 0x1);
		RI.fregs[reg] = I;
		fregNormalRegClear(RI, I);
		break;
	}
	case CInt32:
	case CInt16: {
		if (!thisUsed) break;
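For readers unfamiliar with GameCube paired singles, here is a small sketch (not part of the diff) of what the four FPMerge operations compute on inputs a = (a0, a1) and b = (b0, b1); the mapping follows the ps_mergeXX semantics the cases above implement:

#include <utility>

// ps_mergeXY produces (aX, bY) from a = (a0, a1) and b = (b0, b1).
using PS = std::pair<float, float>;

static PS Merge00(PS a, PS b) { return {a.first,  b.first};  }
static PS Merge01(PS a, PS b) { return {a.first,  b.second}; }
static PS Merge10(PS a, PS b) { return {a.second, b.first};  }
static PS Merge11(PS a, PS b) { return {a.second, b.second}; }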
@@ -157,13 +157,24 @@ namespace IREmitter {
		LoadFReg,
		FSMul,
		FSAdd,
		FSSub,
		FSNeg,
		FPAdd,
		FPMul,
		FPSub,
		FPNeg,
		FDMul,
		FDAdd,
		FDSub,
		FDNeg,
		FPMerge00,
		FPMerge01,
		FPMerge10,
		FPMerge11,
		FResult_End,
		StorePaired,
		StoreSingle,
		StoreDouble,
		StoreFReg,

		// "Trinary" operators
@@ -380,6 +391,9 @@ namespace IREmitter {
	InstLoc EmitStoreSingle(InstLoc value, InstLoc addr) {
		return FoldBiOp(StoreSingle, value, addr);
	}
	InstLoc EmitStoreDouble(InstLoc value, InstLoc addr) {
		return FoldBiOp(StoreDouble, value, addr);
	}
	InstLoc EmitStorePaired(InstLoc value, InstLoc addr, unsigned quantReg) {
		return FoldBiOp(StorePaired, value, addr, quantReg);
	}
@@ -410,6 +424,24 @@ namespace IREmitter {
	InstLoc EmitFSAdd(InstLoc op1, InstLoc op2) {
		return FoldBiOp(FSAdd, op1, op2);
	}
	InstLoc EmitFSSub(InstLoc op1, InstLoc op2) {
		return FoldBiOp(FSSub, op1, op2);
	}
	InstLoc EmitFSNeg(InstLoc op1) {
		return FoldUOp(FSNeg, op1);
	}
	InstLoc EmitFDMul(InstLoc op1, InstLoc op2) {
		return FoldBiOp(FDMul, op1, op2);
	}
	InstLoc EmitFDAdd(InstLoc op1, InstLoc op2) {
		return FoldBiOp(FDAdd, op1, op2);
	}
	InstLoc EmitFDSub(InstLoc op1, InstLoc op2) {
		return FoldBiOp(FDSub, op1, op2);
	}
	InstLoc EmitFDNeg(InstLoc op1) {
		return FoldUOp(FDNeg, op1);
	}
	InstLoc EmitFPAdd(InstLoc op1, InstLoc op2) {
		return FoldBiOp(FPAdd, op1, op2);
	}
@@ -419,6 +451,18 @@ namespace IREmitter {
	InstLoc EmitFPSub(InstLoc op1, InstLoc op2) {
		return FoldBiOp(FPSub, op1, op2);
	}
	InstLoc EmitFPMerge00(InstLoc op1, InstLoc op2) {
		return FoldBiOp(FPMerge00, op1, op2);
	}
	InstLoc EmitFPMerge01(InstLoc op1, InstLoc op2) {
		return FoldBiOp(FPMerge01, op1, op2);
	}
	InstLoc EmitFPMerge10(InstLoc op1, InstLoc op2) {
		return FoldBiOp(FPMerge10, op1, op2);
	}
	InstLoc EmitFPMerge11(InstLoc op1, InstLoc op2) {
		return FoldBiOp(FPMerge11, op1, op2);
	}
	InstLoc EmitFPNeg(InstLoc op1) {
		return FoldUOp(FPNeg, op1);
	}
@@ -28,31 +28,40 @@

void Jit64::fp_arith_s(UGeckoInstruction inst)
{
	if (inst.Rc || inst.OPCD != 59 || inst.SUBOP5 != 25) {
	if (inst.Rc || inst.OPCD != 59 || (inst.SUBOP5 != 25 && inst.SUBOP5 != 20 && inst.SUBOP5 != 21)) {
		Default(inst); return;
	}
	IREmitter::InstLoc val = ibuild.EmitLoadFReg(inst.FA);
	val = ibuild.EmitDoubleToSingle(val);
	bool dupe = inst.OPCD == 59;
	switch (inst.SUBOP5)
	{
	case 25: //mul
		val = ibuild.EmitFSMul(val, ibuild.EmitDoubleToSingle(ibuild.EmitLoadFReg(inst.FC)));
		val = ibuild.EmitFDMul(val, ibuild.EmitLoadFReg(inst.FC));
		break;
	case 18: //div
	case 20: //sub
		val = ibuild.EmitFDSub(val, ibuild.EmitLoadFReg(inst.FB));
		break;
	case 21: //add
		val = ibuild.EmitFDAdd(val, ibuild.EmitLoadFReg(inst.FB));
		break;
	case 23: //sel
	case 24: //res
	default:
		_assert_msg_(DYNA_REC, 0, "fp_arith_s WTF!!!");
	}
	val = ibuild.EmitDupSingleToMReg(val);

	if (inst.OPCD == 59) {
		val = ibuild.EmitDoubleToSingle(val);
		val = ibuild.EmitDupSingleToMReg(val);
	} else {
		val = ibuild.EmitInsertDoubleInMReg(val, ibuild.EmitLoadFReg(inst.FD));
	}
	ibuild.EmitStoreFReg(val, inst.FD);
}

void Jit64::fmaddXX(UGeckoInstruction inst)
{
	if (inst.Rc || inst.OPCD != 59 || inst.SUBOP5 != 29) {
	if (inst.Rc || inst.OPCD != 59) {
		Default(inst); return;
	}
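Not part of the commit, but as a sketch of the writeback logic in the hunk above: single-precision ops (OPCD 59) round to single and duplicate the result into both paired slots, while double-precision ops (OPCD 63) replace only ps0 and leave ps1 untouched. The names below are illustrative.

// Illustrative model of the FPR writeback (ps0/ps1 are the paired slots).
struct PairedFPR { double ps0, ps1; };

static void WritebackResult(PairedFPR& fd, double result, bool isSingleOp /* OPCD == 59 */)
{
	if (isSingleOp) {
		// DoubleToSingle + DupSingleToMReg: round and copy into both halves.
		double rounded = (double)(float)result;
		fd.ps0 = rounded;
		fd.ps1 = rounded;
	} else {
		// InsertDoubleInMReg: only the low double changes.
		fd.ps0 = result;
	}
}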
@@ -61,7 +70,12 @@
	IREmitter::InstLoc val = ibuild.EmitLoadFReg(inst.FA);
	val = ibuild.EmitDoubleToSingle(val);
	val = ibuild.EmitFSMul(val, ibuild.EmitDoubleToSingle(ibuild.EmitLoadFReg(inst.FC)));
	val = ibuild.EmitFSAdd(val, ibuild.EmitDoubleToSingle(ibuild.EmitLoadFReg(inst.FB)));
	if (inst.SUBOP5 & 1)
		val = ibuild.EmitFSAdd(val, ibuild.EmitDoubleToSingle(ibuild.EmitLoadFReg(inst.FB)));
	else
		val = ibuild.EmitFSSub(val, ibuild.EmitDoubleToSingle(ibuild.EmitLoadFReg(inst.FB)));
	if (inst.SUBOP5 & 2)
		val = ibuild.EmitFSNeg(val);
	val = ibuild.EmitDupSingleToMReg(val);
	ibuild.EmitStoreFReg(val, inst.FD);
}
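A brief sketch (not part of the diff) of how the two low SUBOP5 bits tested above select the four fused multiply-add variants (fmadds, fmsubs, fnmadds, fnmsubs):

// Bit 0 chooses add vs. subtract of B; bit 1 negates the final result.
static float MaddVariant(float a, float b, float c, unsigned subop5)
{
	float result = a * c;
	result = (subop5 & 1) ? (result + b)   // fmadds / fnmadds
	                      : (result - b);  // fmsubs / fnmsubs
	if (subop5 & 2)
		result = -result;                  // fnmadds / fnmsubs
	return result;
}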
@@ -78,57 +92,6 @@

void Jit64::fcmpx(UGeckoInstruction inst)
{
	if(Core::g_CoreStartupParameter.bJITOff || Core::g_CoreStartupParameter.bJITFloatingPointOff)
		{Default(inst); return;} // turn off from debugger
	INSTRUCTION_START;
	if (jo.fpAccurateFlags)
	{
		Default(inst);
		return;
	}
	bool ordered = inst.SUBOP10 == 32;
	/*
	double fa = rPS0(_inst.FA);
	double fb = rPS0(_inst.FB);
	u32 compareResult;

	if(IsNAN(fa) || IsNAN(fb)) compareResult = 1;
	else if(fa < fb) compareResult = 8;
	else if(fa > fb) compareResult = 4;
	else compareResult = 2;

	FPSCR.FPRF = compareResult;
	CR = (CR & (~(0xf0000000 >> (_inst.CRFD * 4)))) | (compareResult << ((7 - _inst.CRFD) * 4));
	*/
	int a = inst.FA;
	int b = inst.FB;
	int crf = inst.CRFD;
	int shift = crf * 4;
	//FPSCR
	//XOR(32,R(EAX),R(EAX));

	fpr.Lock(a,b);
	if (a != b)
		fpr.LoadToX64(a, true);

	// USES_CR
	if (ordered)
		COMISD(fpr.R(a).GetSimpleReg(), fpr.R(b));
	else
		UCOMISD(fpr.R(a).GetSimpleReg(), fpr.R(b));
	FixupBranch pLesser = J_CC(CC_B);
	FixupBranch pGreater = J_CC(CC_A);
	// _x86Reg == 0
	MOV(8, M(&PowerPC::ppcState.cr_fast[crf]), Imm8(0x2));
	FixupBranch continue1 = J();
	// _x86Reg > 0
	SetJumpTarget(pGreater);
	MOV(8, M(&PowerPC::ppcState.cr_fast[crf]), Imm8(0x4));
	FixupBranch continue2 = J();
	// _x86Reg < 0
	SetJumpTarget(pLesser);
	MOV(8, M(&PowerPC::ppcState.cr_fast[crf]), Imm8(0x8));
	SetJumpTarget(continue1);
	SetJumpTarget(continue2);
	fpr.UnlockAll();
	Default(inst);
	return;
}
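For context (not in the diff), the condition-register encoding that the old fcmpx code writes into cr_fast, sketched in plain C++; it matches the commented-out interpreter snippet above, and the new version simply falls back to the interpreter via Default() for now.

#include <cmath>
#include <cstdint>

// CR field encoding: 0x8 = less than, 0x4 = greater than, 0x2 = equal,
// 0x1 = unordered (at least one NaN).
static uint8_t CompareResult(double fa, double fb)
{
	if (std::isnan(fa) || std::isnan(fb)) return 0x1;
	if (fa < fb) return 0x8;
	if (fa > fb) return 0x4;
	return 0x2;
}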
@@ -65,121 +65,26 @@ void Jit64::lfs(UGeckoInstruction inst)

void Jit64::lfd(UGeckoInstruction inst)
{
	if (Core::g_CoreStartupParameter.bJITOff || Core::g_CoreStartupParameter.bJITLoadStoreFloatingOff)
		{Default(inst); return;} // turn off from debugger
	INSTRUCTION_START;

	int d = inst.RD;
	int a = inst.RA;
	if (!a)
	{
		Default(inst);
		return;
	}
	s32 offset = (s32)(s16)inst.SIMM_16;
	gpr.FlushLockX(ABI_PARAM1);
	gpr.Lock(a);
	MOV(32, R(ABI_PARAM1), gpr.R(a));
	// TODO - optimize. This has to load the previous value - upper double should stay unmodified.
	fpr.LoadToX64(d, true);
	fpr.Lock(d);
	X64Reg xd = fpr.RX(d);
	if (cpu_info.bSSSE3) {
#ifdef _M_X64
		MOVQ_xmm(XMM0, MComplex(RBX, ABI_PARAM1, SCALE_1, offset));
#else
		AND(32, R(ABI_PARAM1), Imm32(Memory::MEMVIEW32_MASK));
		MOVQ_xmm(XMM0, MDisp(ABI_PARAM1, (u32)Memory::base + offset));
#endif
		PSHUFB(XMM0, M((void *)bswapShuffle1x8Dupe));
		MOVSD(xd, R(XMM0));
	} else {
#ifdef _M_X64
		MOV(64, R(EAX), MComplex(RBX, ABI_PARAM1, SCALE_1, offset));
		BSWAP(64, EAX);
		MOV(64, M(&temp64), R(EAX));
		MOVSD(XMM0, M(&temp64));
		MOVSD(xd, R(XMM0));
#else
		AND(32, R(ABI_PARAM1), Imm32(Memory::MEMVIEW32_MASK));
		MOV(32, R(EAX), MDisp(ABI_PARAM1, (u32)Memory::base + offset));
		BSWAP(32, EAX);
		MOV(32, M((void*)((u32)&temp64+4)), R(EAX));
		MOV(32, R(EAX), MDisp(ABI_PARAM1, (u32)Memory::base + offset + 4));
		BSWAP(32, EAX);
		MOV(32, M(&temp64), R(EAX));
		MOVSD(XMM0, M(&temp64));
		MOVSD(xd, R(XMM0));
#if 0
		// Alternate implementation; possibly faster
		AND(32, R(ABI_PARAM1), Imm32(Memory::MEMVIEW32_MASK));
		MOVQ_xmm(XMM0, MDisp(ABI_PARAM1, (u32)Memory::base + offset));
		PSHUFLW(XMM0, R(XMM0), 0x1B);
		PSRLW(XMM0, 8);
		MOVSD(xd, R(XMM0));
		MOVQ_xmm(XMM0, MDisp(ABI_PARAM1, (u32)Memory::base + offset));
		PSHUFLW(XMM0, R(XMM0), 0x1B);
		PSLLW(XMM0, 8);
		POR(xd, R(XMM0));
#endif
#endif
	}
	gpr.UnlockAll();
	gpr.UnlockAllX();
	fpr.UnlockAll();
	IREmitter::InstLoc addr = ibuild.EmitIntConst(inst.SIMM_16), val;
	if (inst.RA)
		addr = ibuild.EmitAdd(addr, ibuild.EmitLoadGReg(inst.RA));
	val = ibuild.EmitLoadFReg(inst.RD);
	val = ibuild.EmitInsertDoubleInMReg(ibuild.EmitLoadDouble(addr), val);
	ibuild.EmitStoreFReg(val, inst.RD);
	return;
}


void Jit64::stfd(UGeckoInstruction inst)
{
	if (Core::g_CoreStartupParameter.bJITOff || Core::g_CoreStartupParameter.bJITLoadStoreFloatingOff)
		{Default(inst); return;} // turn off from debugger
	INSTRUCTION_START;

	int s = inst.RS;
	int a = inst.RA;
	if (!a)
	{
		Default(inst);
		return;
	}
	s32 offset = (s32)(s16)inst.SIMM_16;
	gpr.FlushLockX(ABI_PARAM1);
	gpr.Lock(a);
	fpr.Lock(s);
	MOV(32, R(ABI_PARAM1), gpr.R(a));
#ifdef _M_IX86
	AND(32, R(ABI_PARAM1), Imm32(Memory::MEMVIEW32_MASK));
#endif
	if (cpu_info.bSSSE3) {
		MOVAPD(XMM0, fpr.R(s));
		PSHUFB(XMM0, M((void *)bswapShuffle1x8));
#ifdef _M_X64
		MOVQ_xmm(MComplex(RBX, ABI_PARAM1, SCALE_1, offset), XMM0);
#else
		MOVQ_xmm(MDisp(ABI_PARAM1, (u32)Memory::base + offset), XMM0);
#endif
	} else {
#ifdef _M_X64
		fpr.LoadToX64(s, true, false);
		MOVSD(M(&temp64), fpr.RX(s));
		MOV(64, R(EAX), M(&temp64));
		BSWAP(64, EAX);
		MOV(64, MComplex(RBX, ABI_PARAM1, SCALE_1, offset), R(EAX));
#else
		fpr.LoadToX64(s, true, false);
		MOVSD(M(&temp64), fpr.RX(s));
		MOV(32, R(EAX), M(&temp64));
		BSWAP(32, EAX);
		MOV(32, MDisp(ABI_PARAM1, (u32)Memory::base + offset + 4), R(EAX));
		MOV(32, R(EAX), M((void*)((u32)&temp64 + 4)));
		BSWAP(32, EAX);
		MOV(32, MDisp(ABI_PARAM1, (u32)Memory::base + offset), R(EAX));
#endif
	}
	gpr.UnlockAll();
	gpr.UnlockAllX();
	fpr.UnlockAll();
	IREmitter::InstLoc addr = ibuild.EmitIntConst(inst.SIMM_16),
		val = ibuild.EmitLoadFReg(inst.RS);
	if (inst.RA)
		addr = ibuild.EmitAdd(addr, ibuild.EmitLoadGReg(inst.RA));
	if (inst.OPCD & 1)
		ibuild.EmitStoreGReg(addr, inst.RA);
	ibuild.EmitStoreDouble(val, addr);
	return;
}
@@ -270,42 +270,35 @@
//TODO: find easy cases and optimize them, do a breakout like ps_arith
void Jit64::ps_mergeXX(UGeckoInstruction inst)
{
	if(Core::g_CoreStartupParameter.bJITOff || Core::g_CoreStartupParameter.bJITPairedOff)
		{Default(inst); return;} // turn off from debugger
	INSTRUCTION_START;
	if (inst.Rc) {
		Default(inst); return;
	}
	int d = inst.FD;
	int a = inst.FA;
	int b = inst.FB;
	fpr.Lock(a,b,d);

	MOVAPD(XMM0, fpr.R(a));
	IREmitter::InstLoc val = ibuild.EmitCompactMRegToPacked(ibuild.EmitLoadFReg(inst.FA)),
		rhs = ibuild.EmitCompactMRegToPacked(ibuild.EmitLoadFReg(inst.FB));

	switch (inst.SUBOP10)
	{
	case 528:
		UNPCKLPD(XMM0, fpr.R(b)); //unpck is faster than shuf
		val = ibuild.EmitFPMerge00(val, rhs);
		break; //00
	case 560:
		SHUFPD(XMM0, fpr.R(b), 2); //must use shuf here
		val = ibuild.EmitFPMerge01(val, rhs);
		break; //01
	case 592:
		SHUFPD(XMM0, fpr.R(b), 1);
		val = ibuild.EmitFPMerge10(val, rhs);
		break; //10
	case 624:
		UNPCKHPD(XMM0, fpr.R(b));
		val = ibuild.EmitFPMerge11(val, rhs);
		break; //11
	default:
		_assert_msg_(DYNA_REC, 0, "ps_merge - invalid op");
	}
	fpr.LoadToX64(d, false);
	MOVAPD(fpr.RX(d), Gen::R(XMM0));
	fpr.UnlockAll();
	val = ibuild.EmitExpandPackedToMReg(val);
	ibuild.EmitStoreFReg(val, inst.FD);
}


//TODO: add optimized cases
void Jit64::ps_maddXX(UGeckoInstruction inst)
{
	if (inst.Rc || (inst.SUBOP5 != 28 && inst.SUBOP5 != 29 && inst.SUBOP5 != 30)) {