A bit more progress on my JIT WIP: the biggest change is some substantial
work on floating-point.

git-svn-id: https://dolphin-emu.googlecode.com/svn/trunk@1743 8ced0084-cf51-0410-be5f-012b33b47a6e
This commit is contained in:
parent
35128bb041
commit
29a033e1dd
|
@ -97,8 +97,9 @@ Inter-block dead condition register elimination (Likely significant win
|
|||
Optimize conditions for conditional branches.
|
||||
General dead register elimination.
|
||||
Inter-block inlining.
|
||||
Track down a few correctness bugs (I think there's something wrong
|
||||
with my branches, but I haven't been able to figure it out).
|
||||
Track down issues with new JIT + dual-core mode (I think I'm going to
|
||||
need help with this one; I'm not very familiar with the
|
||||
dual-core code.)
|
||||
Specialized slw/srw/sraw; I think there are some tricks that could
|
||||
have a non-trivial effect, and there are significantly shorter
|
||||
implementations for 64-bit involving abusing 64-bit shifts.
|
||||
|
@ -502,16 +503,21 @@ struct RegInfo {
|
|||
InstLoc FirstI;
|
||||
std::vector<unsigned> IInfo;
|
||||
InstLoc regs[16];
|
||||
InstLoc fregs[16];
|
||||
unsigned numSpills;
|
||||
unsigned numFSpills;
|
||||
bool MakeProfile;
|
||||
bool UseProfile;
|
||||
unsigned numProfiledLoads;
|
||||
unsigned exitNumber;
|
||||
|
||||
RegInfo(Jit64* j, InstLoc f, unsigned insts) : Jit(j), FirstI(f), IInfo(insts) {
|
||||
for (unsigned i = 0; i < 16; i++)
|
||||
for (unsigned i = 0; i < 16; i++) {
|
||||
regs[i] = 0;
|
||||
fregs[i] = 0;
|
||||
}
|
||||
numSpills = 0;
|
||||
numFSpills = 0;
|
||||
numProfiledLoads = 0;
|
||||
exitNumber = 0;
|
||||
MakeProfile = UseProfile = false;
|
||||
|
@ -533,6 +539,7 @@ static unsigned regReadUse(RegInfo& R, InstLoc I) {
|
|||
|
||||
static unsigned SlotSet[1000];
|
||||
static unsigned ProfiledLoads[1000];
|
||||
static u8 GC_ALIGNED16(FSlotSet[16*1000]);
|
||||
|
||||
static OpArg regLocForSlot(RegInfo& RI, unsigned slot) {
|
||||
return M(&SlotSet[slot - 1]);
|
||||
|
@ -558,57 +565,86 @@ static void regSpill(RegInfo& RI, X64Reg reg) {
|
|||
RI.regs[reg] = 0;
|
||||
}
|
||||
|
||||
static X64Reg regFindFreeReg(RegInfo& RI) {
|
||||
if (RI.regs[EDI] == 0) return EDI;
|
||||
if (RI.regs[ESI] == 0) return ESI;
|
||||
if (RI.regs[EBP] == 0) return EBP;
|
||||
if (RI.regs[EBX] == 0) return EBX;
|
||||
if (RI.regs[EDX] == 0) return EDX;
|
||||
if (RI.regs[EAX] == 0) return EAX;
|
||||
static OpArg fregLocForSlot(RegInfo& RI, unsigned slot) {
|
||||
return M(&FSlotSet[slot*16]);
|
||||
}
|
||||
|
||||
static unsigned fregCreateSpill(RegInfo& RI, InstLoc I) {
|
||||
unsigned newSpill = ++RI.numFSpills;
|
||||
RI.IInfo[I - RI.FirstI] |= newSpill << 16;
|
||||
return newSpill;
|
||||
}
|
||||
|
||||
static unsigned fregGetSpill(RegInfo& RI, InstLoc I) {
|
||||
return RI.IInfo[I - RI.FirstI] >> 16;
|
||||
}
|
||||
|
||||
static void fregSpill(RegInfo& RI, X64Reg reg) {
|
||||
if (!RI.fregs[reg]) return;
|
||||
unsigned slot = fregGetSpill(RI, RI.fregs[reg]);
|
||||
if (!slot) {
|
||||
slot = fregCreateSpill(RI, RI.fregs[reg]);
|
||||
RI.Jit->MOVAPD(fregLocForSlot(RI, slot), reg);
|
||||
}
|
||||
RI.fregs[reg] = 0;
|
||||
}
|
||||
|
||||
// ECX is scratch, so we don't allocate it
|
||||
static X64Reg regs[] = {EDI, ESI, EBP, EBX, EDX, EAX};
|
||||
static X64Reg RegAllocOrder[] = {EDI, ESI, EBP, EBX, EDX, EAX};
|
||||
static unsigned RegAllocSize = sizeof(RegAllocOrder) / sizeof(X64Reg);
|
||||
static X64Reg FRegAllocOrder[] = {XMM2, XMM3, XMM4, XMM5, XMM6, XMM7};
|
||||
static unsigned FRegAllocSize = sizeof(FRegAllocOrder) / sizeof(X64Reg);
|
||||
|
||||
static X64Reg regFindFreeReg(RegInfo& RI) {
|
||||
for (unsigned i = 0; i < RegAllocSize; i++)
|
||||
if (RI.regs[RegAllocOrder[i]] == 0)
|
||||
return RegAllocOrder[i];
|
||||
|
||||
static unsigned nextReg = 0;
|
||||
X64Reg reg = regs[nextReg++ % 6];
|
||||
X64Reg reg = RegAllocOrder[nextReg++ % RegAllocSize];
|
||||
regSpill(RI, reg);
|
||||
return reg;
|
||||
}
|
||||
|
||||
static X64Reg fregFindFreeReg(RegInfo& RI) {
|
||||
for (unsigned i = 0; i < FRegAllocSize; i++)
|
||||
if (RI.fregs[FRegAllocOrder[i]] == 0)
|
||||
return FRegAllocOrder[i];
|
||||
// XMM0/1 are scratch, so we don't allocate them
|
||||
fregSpill(RI, XMM7);
|
||||
return XMM7;
|
||||
}
|
||||
|
||||
static OpArg regLocForInst(RegInfo& RI, InstLoc I) {
|
||||
if (RI.regs[EDI] == I) return R(EDI);
|
||||
if (RI.regs[ESI] == I) return R(ESI);
|
||||
if (RI.regs[EBP] == I) return R(EBP);
|
||||
if (RI.regs[EBX] == I) return R(EBX);
|
||||
if (RI.regs[EDX] == I) return R(EDX);
|
||||
if (RI.regs[EAX] == I) return R(EAX);
|
||||
if (RI.regs[ECX] == I) return R(ECX);
|
||||
for (unsigned i = 0; i < RegAllocSize; i++)
|
||||
if (RI.regs[RegAllocOrder[i]] == I)
|
||||
return R(RegAllocOrder[i]);
|
||||
|
||||
if (regGetSpill(RI, I) == 0)
|
||||
PanicAlert("Retrieving unknown spill slot?!");
|
||||
return regLocForSlot(RI, regGetSpill(RI, I));
|
||||
}
|
||||
|
||||
static OpArg fregLocForInst(RegInfo& RI, InstLoc I) {
|
||||
for (unsigned i = 0; i < FRegAllocSize; i++)
|
||||
if (RI.fregs[FRegAllocOrder[i]] == I)
|
||||
return R(FRegAllocOrder[i]);
|
||||
|
||||
if (fregGetSpill(RI, I) == 0)
|
||||
PanicAlert("Retrieving unknown spill slot?!");
|
||||
return fregLocForSlot(RI, fregGetSpill(RI, I));
|
||||
}
|
||||
|
||||
static void regClearInst(RegInfo& RI, InstLoc I) {
|
||||
if (RI.regs[EDI] == I) {
|
||||
RI.regs[EDI] = 0;
|
||||
}
|
||||
if (RI.regs[ESI] == I) {
|
||||
RI.regs[ESI] = 0;
|
||||
}
|
||||
if (RI.regs[EBP] == I) {
|
||||
RI.regs[EBP] = 0;
|
||||
}
|
||||
if (RI.regs[EBX] == I) {
|
||||
RI.regs[EBX] = 0;
|
||||
}
|
||||
if (RI.regs[EDX] == I) {
|
||||
RI.regs[EDX] = 0;
|
||||
}
|
||||
if (RI.regs[EAX] == I) {
|
||||
RI.regs[EAX] = 0;
|
||||
}
|
||||
if (RI.regs[ECX] == I) {
|
||||
RI.regs[ECX] = 0;
|
||||
for (unsigned i = 0; i < RegAllocSize; i++)
|
||||
if (RI.regs[RegAllocOrder[i]] == I)
|
||||
RI.regs[RegAllocOrder[i]] = 0;
|
||||
}
|
||||
|
||||
static void fregClearInst(RegInfo& RI, InstLoc I) {
|
||||
for (unsigned i = 0; i < FRegAllocSize; i++)
|
||||
if (RI.fregs[FRegAllocOrder[i]] == I)
|
||||
RI.fregs[FRegAllocOrder[i]] = 0;
|
||||
}
|
||||
|
||||
static X64Reg regEnsureInReg(RegInfo& RI, InstLoc I) {
|
||||
|
@ -645,6 +681,20 @@ static X64Reg regBinLHSReg(RegInfo& RI, InstLoc I) {
|
|||
return reg;
|
||||
}
|
||||
|
||||
static void regNormalRegClear(RegInfo& RI, InstLoc I) {
|
||||
if (RI.IInfo[I - RI.FirstI] & 4)
|
||||
regClearInst(RI, getOp1(I));
|
||||
if (RI.IInfo[I - RI.FirstI] & 8)
|
||||
regClearInst(RI, getOp2(I));
|
||||
}
|
||||
|
||||
static void fregNormalRegClear(RegInfo& RI, InstLoc I) {
|
||||
if (RI.IInfo[I - RI.FirstI] & 4)
|
||||
fregClearInst(RI, getOp1(I));
|
||||
if (RI.IInfo[I - RI.FirstI] & 8)
|
||||
fregClearInst(RI, getOp2(I));
|
||||
}
|
||||
|
||||
static void regEmitBinInst(RegInfo& RI, InstLoc I,
|
||||
void (Jit64::*op)(int, const OpArg&,
|
||||
const OpArg&)) {
|
||||
|
@ -660,11 +710,11 @@ static void regEmitBinInst(RegInfo& RI, InstLoc I,
|
|||
(RI.Jit->*op)(32, R(reg), regLocForInst(RI, getOp2(I)));
|
||||
}
|
||||
RI.regs[reg] = I;
|
||||
regNormalRegClear(RI, I);
|
||||
}
|
||||
|
||||
// Mark and calculation routines for profiled load/store addresses
|
||||
// Could be extended to unprofiled addresses.
|
||||
// FIXME: Finish/activate!
|
||||
static void regMarkMemAddress(RegInfo& RI, InstLoc I, InstLoc AI, unsigned OpNum) {
|
||||
if (isImm(*AI)) {
|
||||
unsigned addr = RI.Build->GetImmValue(AI);
|
||||
|
@ -743,7 +793,6 @@ static OpArg regBuildMemAddress(RegInfo& RI, InstLoc I, InstLoc AI,
|
|||
}
|
||||
return MDisp(baseReg, offset);
|
||||
}
|
||||
// end FIXME
|
||||
|
||||
static void regEmitMemLoad(RegInfo& RI, InstLoc I, unsigned Size) {
|
||||
if (RI.UseProfile) {
|
||||
|
@ -844,7 +893,6 @@ static void regEmitMemStore(RegInfo& RI, InstLoc I, unsigned Size) {
|
|||
RI.Jit->js.fifoBytesThisBlock += Size >> 3;
|
||||
if (RI.IInfo[I - RI.FirstI] & 4)
|
||||
regClearInst(RI, getOp1(I));
|
||||
//regBuildMemAddress(RI, I, getOp2(I), 2, Size, 0, false);
|
||||
regClearDeadMemAddress(RI, I, getOp2(I), 2);
|
||||
return;
|
||||
}
|
||||
|
@ -878,6 +926,7 @@ static void regEmitShiftInst(RegInfo& RI, InstLoc I,
|
|||
RI.Jit->MOV(32, R(ECX), regLocForInst(RI, getOp2(I)));
|
||||
(RI.Jit->*op)(32, R(reg), R(ECX));
|
||||
RI.regs[reg] = I;
|
||||
regNormalRegClear(RI, I);
|
||||
}
|
||||
|
||||
static void regStoreInstToConstLoc(RegInfo& RI, unsigned width, InstLoc I,
|
||||
|
@ -930,7 +979,7 @@ static void DoWriteCode(IRBuilder* ibuild, Jit64* Jit, bool UseProfile) {
|
|||
RegInfo RI(Jit, ibuild->getFirstInst(), ibuild->getNumInsts());
|
||||
RI.Build = ibuild;
|
||||
RI.UseProfile = UseProfile;
|
||||
RI.MakeProfile = !RI.UseProfile;
|
||||
RI.MakeProfile = false;//!RI.UseProfile;
|
||||
// Pass to compute liveness
|
||||
ibuild->StartBackPass();
|
||||
for (unsigned index = RI.IInfo.size() - 1; index != -1U; --index) {
|
||||
|
@ -949,12 +998,14 @@ static void DoWriteCode(IRBuilder* ibuild, Jit64* Jit, bool UseProfile) {
|
|||
case LoadCarry:
|
||||
case LoadCTR:
|
||||
case LoadMSR:
|
||||
case LoadFReg:
|
||||
case BlockEnd:
|
||||
case BlockStart:
|
||||
case InterpreterFallback:
|
||||
case SystemCall:
|
||||
case RFIExit:
|
||||
case InterpreterBranch:
|
||||
case IdleLoop:
|
||||
// No liveness effects
|
||||
break;
|
||||
case Tramp:
|
||||
|
@ -965,6 +1016,9 @@ static void DoWriteCode(IRBuilder* ibuild, Jit64* Jit, bool UseProfile) {
|
|||
case SExt16:
|
||||
case BSwap32:
|
||||
case BSwap16:
|
||||
case DupSingleToMReg:
|
||||
case DoubleToSingle:
|
||||
case ExpandPackedToMReg:
|
||||
if (thisUsed)
|
||||
regMarkUse(RI, I, getOp1(I), 1);
|
||||
break;
|
||||
|
@ -973,6 +1027,10 @@ static void DoWriteCode(IRBuilder* ibuild, Jit64* Jit, bool UseProfile) {
|
|||
case Load32:
|
||||
regMarkMemAddress(RI, I, getOp1(I), 1);
|
||||
break;
|
||||
case LoadSingle:
|
||||
case LoadPaired:
|
||||
regMarkUse(RI, I, getOp1(I), 1);
|
||||
break;
|
||||
case StoreCR:
|
||||
case StoreCarry:
|
||||
regMarkUse(RI, I, getOp1(I), 1);
|
||||
|
@ -981,6 +1039,7 @@ static void DoWriteCode(IRBuilder* ibuild, Jit64* Jit, bool UseProfile) {
|
|||
case StoreLink:
|
||||
case StoreCTR:
|
||||
case StoreMSR:
|
||||
case StoreFReg:
|
||||
if (!isImm(*getOp1(I)))
|
||||
regMarkUse(RI, I, getOp1(I), 1);
|
||||
break;
|
||||
|
@ -1000,6 +1059,9 @@ static void DoWriteCode(IRBuilder* ibuild, Jit64* Jit, bool UseProfile) {
|
|||
case ICmpUgt:
|
||||
case ICmpSle:
|
||||
case ICmpSgt:
|
||||
case FSMul:
|
||||
case FSAdd:
|
||||
case InsertDoubleInMReg:
|
||||
if (thisUsed) {
|
||||
regMarkUse(RI, I, getOp1(I), 1);
|
||||
if (!isImm(*getOp2(I)))
|
||||
|
@ -1041,6 +1103,9 @@ static void DoWriteCode(IRBuilder* ibuild, Jit64* Jit, bool UseProfile) {
|
|||
case InterpreterFallback: {
|
||||
unsigned InstCode = ibuild->GetImmValue(getOp1(I));
|
||||
unsigned InstLoc = ibuild->GetImmValue(getOp2(I));
|
||||
// There really shouldn't be anything live across an
|
||||
// interpreter call at the moment, but optimizing interpreter
|
||||
// calls isn't completely out of the question...
|
||||
regSpillCallerSaved(RI);
|
||||
Jit->MOV(32, M(&PC), Imm32(InstLoc));
|
||||
Jit->MOV(32, M(&NPC), Imm32(InstLoc+4));
|
||||
|
@ -1089,6 +1154,7 @@ static void DoWriteCode(IRBuilder* ibuild, Jit64* Jit, bool UseProfile) {
|
|||
unsigned ppcreg = *I >> 16;
|
||||
regStoreInstToConstLoc(RI, 32, getOp1(I),
|
||||
&PowerPC::ppcState.gpr[ppcreg]);
|
||||
regNormalRegClear(RI, I);
|
||||
break;
|
||||
}
|
||||
case StoreCR: {
|
||||
|
@ -1096,18 +1162,22 @@ static void DoWriteCode(IRBuilder* ibuild, Jit64* Jit, bool UseProfile) {
|
|||
unsigned ppcreg = *I >> 16;
|
||||
// CAUTION: uses 8-bit reg!
|
||||
Jit->MOV(8, M(&PowerPC::ppcState.cr_fast[ppcreg]), R(ECX));
|
||||
regNormalRegClear(RI, I);
|
||||
break;
|
||||
}
|
||||
case StoreLink: {
|
||||
regStoreInstToConstLoc(RI, 32, getOp1(I), &LR);
|
||||
regNormalRegClear(RI, I);
|
||||
break;
|
||||
}
|
||||
case StoreCTR: {
|
||||
regStoreInstToConstLoc(RI, 32, getOp1(I), &CTR);
|
||||
regNormalRegClear(RI, I);
|
||||
break;
|
||||
}
|
||||
case StoreMSR: {
|
||||
regStoreInstToConstLoc(RI, 32, getOp1(I), &MSR);
|
||||
regNormalRegClear(RI, I);
|
||||
break;
|
||||
}
|
||||
case StoreCarry: {
|
||||
|
@ -1118,6 +1188,7 @@ static void DoWriteCode(IRBuilder* ibuild, Jit64* Jit, bool UseProfile) {
|
|||
Jit->SetJumpTarget(nocarry);
|
||||
Jit->JitClearCA();
|
||||
Jit->SetJumpTarget(cont);
|
||||
regNormalRegClear(RI, I);
|
||||
break;
|
||||
}
|
||||
case Load8: {
|
||||
|
@ -1150,6 +1221,7 @@ static void DoWriteCode(IRBuilder* ibuild, Jit64* Jit, bool UseProfile) {
|
|||
Jit->MOV(32, R(ECX), regLocForInst(RI, getOp1(I)));
|
||||
Jit->MOVSX(32, 8, reg, R(ECX));
|
||||
RI.regs[reg] = I;
|
||||
regNormalRegClear(RI, I);
|
||||
break;
|
||||
}
|
||||
case SExt16: {
|
||||
|
@ -1157,6 +1229,7 @@ static void DoWriteCode(IRBuilder* ibuild, Jit64* Jit, bool UseProfile) {
|
|||
X64Reg reg = regUReg(RI, I);
|
||||
Jit->MOVSX(32, 16, reg, regLocForInst(RI, getOp1(I)));
|
||||
RI.regs[reg] = I;
|
||||
regNormalRegClear(RI, I);
|
||||
break;
|
||||
}
|
||||
case And: {
|
||||
|
@ -1199,6 +1272,7 @@ static void DoWriteCode(IRBuilder* ibuild, Jit64* Jit, bool UseProfile) {
|
|||
Jit->IMUL(32, reg, regLocForInst(RI, getOp2(I)));
|
||||
}
|
||||
RI.regs[reg] = I;
|
||||
regNormalRegClear(RI, I);
|
||||
break;
|
||||
}
|
||||
case Rol: {
|
||||
|
@ -1228,6 +1302,7 @@ static void DoWriteCode(IRBuilder* ibuild, Jit64* Jit, bool UseProfile) {
|
|||
X64Reg reg = regFindFreeReg(RI);
|
||||
Jit->MOVZX(32, 8, reg, R(ECX));
|
||||
RI.regs[reg] = I;
|
||||
regNormalRegClear(RI, I);
|
||||
break;
|
||||
}
|
||||
case ICmpUgt: {
|
||||
|
@ -1237,6 +1312,7 @@ static void DoWriteCode(IRBuilder* ibuild, Jit64* Jit, bool UseProfile) {
|
|||
X64Reg reg = regFindFreeReg(RI);
|
||||
Jit->MOVZX(32, 8, reg, R(ECX));
|
||||
RI.regs[reg] = I;
|
||||
regNormalRegClear(RI, I);
|
||||
break;
|
||||
}
|
||||
case ICmpSle: {
|
||||
|
@ -1246,6 +1322,7 @@ static void DoWriteCode(IRBuilder* ibuild, Jit64* Jit, bool UseProfile) {
|
|||
X64Reg reg = regFindFreeReg(RI);
|
||||
Jit->MOVZX(32, 8, reg, R(ECX));
|
||||
RI.regs[reg] = I;
|
||||
regNormalRegClear(RI, I);
|
||||
break;
|
||||
}
|
||||
case ICmpCRUnsigned: {
|
||||
|
@ -1264,6 +1341,7 @@ static void DoWriteCode(IRBuilder* ibuild, Jit64* Jit, bool UseProfile) {
|
|||
Jit->SetJumpTarget(continue1);
|
||||
Jit->SetJumpTarget(continue2);
|
||||
RI.regs[reg] = I;
|
||||
regNormalRegClear(RI, I);
|
||||
break;
|
||||
}
|
||||
case ICmpCRSigned: {
|
||||
|
@ -1282,6 +1360,102 @@ static void DoWriteCode(IRBuilder* ibuild, Jit64* Jit, bool UseProfile) {
|
|||
Jit->SetJumpTarget(continue1);
|
||||
Jit->SetJumpTarget(continue2);
|
||||
RI.regs[reg] = I;
|
||||
regNormalRegClear(RI, I);
|
||||
break;
|
||||
}
|
||||
case LoadSingle: {
|
||||
if (!thisUsed) break;
|
||||
X64Reg reg = fregFindFreeReg(RI);
|
||||
Jit->MOV(32, R(ECX), regLocForInst(RI, getOp1(I)));
|
||||
RI.Jit->UnsafeLoadRegToReg(ECX, ECX, 32, 0, false);
|
||||
Jit->MOVD_xmm(reg, R(ECX));
|
||||
RI.fregs[reg] = I;
|
||||
regNormalRegClear(RI, I);
|
||||
break;
|
||||
}
|
||||
case LoadPaired: {
|
||||
if (!thisUsed) break;
|
||||
regSpill(RI, EAX);
|
||||
regSpill(RI, EDX);
|
||||
X64Reg reg = fregFindFreeReg(RI);
|
||||
unsigned quantreg = *I >> 16;
|
||||
Jit->MOVZX(32, 16, EAX, M(((char *)&PowerPC::ppcState.spr[SPR_GQR0 + quantreg]) + 2));
|
||||
Jit->MOVZX(32, 8, EDX, R(AL));
|
||||
// FIXME: Fix ModR/M encoding to allow [EDX*4+disp32]!
|
||||
Jit->SHL(32, R(EDX), Imm8(2));
|
||||
Jit->MOV(32, R(ECX), regLocForInst(RI, getOp1(I)));
|
||||
Jit->CALLptr(MDisp(EDX, (u32)asm_routines.pairedLoadQuantized));
|
||||
Jit->MOVAPD(reg, R(XMM0));
|
||||
RI.fregs[reg] = I;
|
||||
regNormalRegClear(RI, I);
|
||||
break;
|
||||
}
|
||||
case DupSingleToMReg: {
|
||||
if (!thisUsed) break;
|
||||
X64Reg reg = fregFindFreeReg(RI);
|
||||
Jit->CVTSS2SD(reg, fregLocForInst(RI, getOp1(I)));
|
||||
Jit->MOVDDUP(reg, R(reg));
|
||||
RI.fregs[reg] = I;
|
||||
fregNormalRegClear(RI, I);
|
||||
break;
|
||||
}
|
||||
case InsertDoubleInMReg: {
|
||||
if (!thisUsed) break;
|
||||
X64Reg reg = fregFindFreeReg(RI);
|
||||
Jit->MOVAPD(reg, fregLocForInst(RI, getOp2(I)));
|
||||
Jit->MOVAPD(XMM0, fregLocForInst(RI, getOp1(I)));
|
||||
Jit->MOVSD(reg, R(XMM0));
|
||||
RI.fregs[reg] = I;
|
||||
fregNormalRegClear(RI, I);
|
||||
break;
|
||||
}
|
||||
case ExpandPackedToMReg: {
|
||||
if (!thisUsed) break;
|
||||
X64Reg reg = fregFindFreeReg(RI);
|
||||
Jit->CVTPS2PD(reg, fregLocForInst(RI, getOp1(I)));
|
||||
RI.fregs[reg] = I;
|
||||
fregNormalRegClear(RI, I);
|
||||
break;
|
||||
}
|
||||
case LoadFReg: {
|
||||
if (!thisUsed) break;
|
||||
X64Reg reg = fregFindFreeReg(RI);
|
||||
unsigned ppcreg = *I >> 8;
|
||||
Jit->MOVAPD(reg, M(&PowerPC::ppcState.ps[ppcreg]));
|
||||
RI.fregs[reg] = I;
|
||||
break;
|
||||
}
|
||||
case StoreFReg: {
|
||||
unsigned ppcreg = *I >> 16;
|
||||
Jit->MOVAPD(XMM0, fregLocForInst(RI, getOp1(I)));
|
||||
Jit->MOVAPD(M(&PowerPC::ppcState.ps[ppcreg]), XMM0);
|
||||
fregNormalRegClear(RI, I);
|
||||
break;
|
||||
}
|
||||
case DoubleToSingle: {
|
||||
if (!thisUsed) break;
|
||||
X64Reg reg = fregFindFreeReg(RI);
|
||||
Jit->CVTSD2SS(reg, fregLocForInst(RI, getOp1(I)));
|
||||
RI.fregs[reg] = I;
|
||||
fregNormalRegClear(RI, I);
|
||||
break;
|
||||
}
|
||||
case FSMul: {
|
||||
if (!thisUsed) break;
|
||||
X64Reg reg = fregFindFreeReg(RI);
|
||||
Jit->MOVAPD(reg, fregLocForInst(RI, getOp1(I)));
|
||||
Jit->MULSS(reg, fregLocForInst(RI, getOp2(I)));
|
||||
RI.fregs[reg] = I;
|
||||
fregNormalRegClear(RI, I);
|
||||
break;
|
||||
}
|
||||
case FSAdd: {
|
||||
if (!thisUsed) break;
|
||||
X64Reg reg = fregFindFreeReg(RI);
|
||||
Jit->MOVAPD(reg, fregLocForInst(RI, getOp1(I)));
|
||||
Jit->ADDSS(reg, fregLocForInst(RI, getOp2(I)));
|
||||
RI.fregs[reg] = I;
|
||||
fregNormalRegClear(RI, I);
|
||||
break;
|
||||
}
|
||||
case CInt32:
|
||||
|
@ -1328,6 +1502,15 @@ static void DoWriteCode(IRBuilder* ibuild, Jit64* Jit, bool UseProfile) {
|
|||
}
|
||||
case BranchUncond: {
|
||||
regWriteExit(RI, getOp1(I));
|
||||
regNormalRegClear(RI, I);
|
||||
break;
|
||||
}
|
||||
case IdleLoop: {
|
||||
unsigned IdleParam = ibuild->GetImmValue(getOp1(I));
|
||||
unsigned InstLoc = ibuild->GetImmValue(getOp2(I));
|
||||
Jit->ABI_CallFunctionC((void *)&PowerPC::OnIdle, IdleParam);
|
||||
Jit->MOV(32, M(&PowerPC::ppcState.pc), Imm32(InstLoc + 12));
|
||||
Jit->JMP(asm_routines.testExceptions, true);
|
||||
break;
|
||||
}
|
||||
case SystemCall: {
|
||||
|
@ -1378,26 +1561,16 @@ static void DoWriteCode(IRBuilder* ibuild, Jit64* Jit, bool UseProfile) {
|
|||
PanicAlert("Unknown JIT instruction; aborting!");
|
||||
exit(1);
|
||||
}
|
||||
if (getOpcode(*I) != Tramp &&
|
||||
getOpcode(*I) != BranchCond &&
|
||||
getOpcode(*I) != Load8 &&
|
||||
getOpcode(*I) != Load16 &&
|
||||
getOpcode(*I) != Load32 &&
|
||||
getOpcode(*I) != Store8 &&
|
||||
getOpcode(*I) != Store16 &&
|
||||
getOpcode(*I) != Store32 &&
|
||||
1) {
|
||||
if (RI.IInfo[I - RI.FirstI] & 4)
|
||||
regClearInst(RI, getOp1(I));
|
||||
if (RI.IInfo[I - RI.FirstI] & 8)
|
||||
regClearInst(RI, getOp2(I));
|
||||
}
|
||||
}
|
||||
for (unsigned i = 0; i < 8; i++) {
|
||||
if (RI.regs[i]) {
|
||||
PanicAlert("Incomplete cleanup!");
|
||||
exit(1);
|
||||
}
|
||||
if (RI.fregs[i]) {
|
||||
PanicAlert("Incomplete cleanup!");
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
|
||||
if (UseProfile && RI.numSpills)
|
||||
|
@ -1412,8 +1585,8 @@ void Jit64::WriteCode() {
|
|||
|
||||
void ProfiledReJit() {
|
||||
u8* x = (u8*)jit.GetCodePtr();
|
||||
jit.SetCodePtr(jit.js.normalEntry);
|
||||
jit.SetCodePtr(jit.js.rewriteStart);
|
||||
DoWriteCode(&jit.ibuild, &jit, true);
|
||||
jit.js.curBlock->codeSize = jit.GetCodePtr() - jit.js.normalEntry;
|
||||
jit.js.curBlock->codeSize = jit.GetCodePtr() - jit.js.rewriteStart;
|
||||
jit.SetCodePtr(x);
|
||||
}
|
||||
|
|
|
@ -80,6 +80,7 @@ namespace IREmitter {
|
|||
Store16,
|
||||
Store32,
|
||||
BranchCond,
|
||||
#if 0
|
||||
// Floating-point
|
||||
// There are three floating-point formats: single, double,
|
||||
// and packed. For any operation where the format of the
|
||||
|
@ -141,8 +142,18 @@ namespace IREmitter {
|
|||
ForceToSingle,
|
||||
ForceToDouble,
|
||||
ForceToMReg,
|
||||
LoadFPReg,
|
||||
StoreFPReg,
|
||||
#endif
|
||||
LoadSingle,
|
||||
LoadDouble,
|
||||
LoadPaired, // This handles quantizers itself
|
||||
DoubleToSingle,
|
||||
DupSingleToMReg,
|
||||
InsertDoubleInMReg,
|
||||
ExpandPackedToMReg,
|
||||
LoadFReg,
|
||||
StoreFReg,
|
||||
FSMul,
|
||||
FSAdd,
|
||||
|
||||
// "Trinary" operators
|
||||
// FIXME: Need to change representation!
|
||||
|
@ -156,6 +167,7 @@ namespace IREmitter {
|
|||
SystemCall,
|
||||
RFIExit,
|
||||
InterpreterBranch,
|
||||
IdleLoop,
|
||||
|
||||
// "Opcode" representing a register too far away to
|
||||
// reference directly; this is a size optimization
|
||||
|
@ -365,6 +377,42 @@ namespace IREmitter {
|
|||
InstLoc EmitRFIExit() {
|
||||
return FoldZeroOp(RFIExit, 0);
|
||||
}
|
||||
InstLoc EmitIdleLoop(InstLoc idleParam, InstLoc pc) {
|
||||
return FoldBiOp(IdleLoop, idleParam, pc);
|
||||
}
|
||||
InstLoc EmitLoadSingle(InstLoc addr) {
|
||||
return FoldUOp(LoadSingle, addr);
|
||||
}
|
||||
InstLoc EmitLoadDouble(InstLoc addr) {
|
||||
return FoldUOp(LoadDouble, addr);
|
||||
}
|
||||
InstLoc EmitLoadPaired(InstLoc addr, unsigned quantReg) {
|
||||
return FoldUOp(LoadPaired, addr, quantReg);
|
||||
}
|
||||
InstLoc EmitLoadFReg(unsigned freg) {
|
||||
return FoldZeroOp(LoadFReg, freg);
|
||||
}
|
||||
InstLoc EmitStoreFReg(InstLoc val, unsigned freg) {
|
||||
return FoldUOp(StoreFReg, val, freg);
|
||||
}
|
||||
InstLoc EmitDupSingleToMReg(InstLoc val) {
|
||||
return FoldUOp(DupSingleToMReg, val);
|
||||
}
|
||||
InstLoc EmitInsertDoubleInMReg(InstLoc val, InstLoc reg) {
|
||||
return FoldBiOp(InsertDoubleInMReg, val, reg);
|
||||
}
|
||||
InstLoc EmitExpandPackedToMReg(InstLoc val) {
|
||||
return FoldUOp(ExpandPackedToMReg, val);
|
||||
}
|
||||
InstLoc EmitFSMul(InstLoc op1, InstLoc op2) {
|
||||
return FoldBiOp(FSMul, op1, op2);
|
||||
}
|
||||
InstLoc EmitFSAdd(InstLoc op1, InstLoc op2) {
|
||||
return FoldBiOp(FSAdd, op1, op2);
|
||||
}
|
||||
InstLoc EmitDoubleToSingle(InstLoc op1) {
|
||||
return FoldUOp(DoubleToSingle, op1);
|
||||
}
|
||||
|
||||
void StartBackPass() { curReadPtr = &InstList[InstList.size()]; }
|
||||
void StartForwardPass() { curReadPtr = &InstList[0]; }
|
||||
|
|
|
@ -420,12 +420,11 @@ namespace CPUCompare
|
|||
SetJumpTarget(skip);
|
||||
|
||||
const u8 *normalEntry = GetCodePtr();
|
||||
js.normalEntry = (u8*)normalEntry;
|
||||
|
||||
if (ImHereDebug)
|
||||
ABI_CallFunction((void *)&ImHere); //Used to get a trace of the last few blocks before a crash, sometimes VERY useful
|
||||
|
||||
if (false && js.fpa.any)
|
||||
if (js.fpa.any)
|
||||
{
|
||||
//This block uses FPU - needs to add FP exception bailout
|
||||
TEST(32, M(&PowerPC::ppcState.msr), Imm32(1 << 13)); //Test FP enabled bit
|
||||
|
@ -445,24 +444,10 @@ namespace CPUCompare
|
|||
SetJumpTarget(b1);
|
||||
}
|
||||
|
||||
// Conditionally add profiling code.
|
||||
if (Profiler::g_ProfileBlocks) {
|
||||
ADD(32, M(&b->runCount), Imm8(1));
|
||||
#ifdef _WIN32
|
||||
b->ticCounter.QuadPart = 0;
|
||||
b->ticStart.QuadPart = 0;
|
||||
b->ticStop.QuadPart = 0;
|
||||
#else
|
||||
//TODO
|
||||
#endif
|
||||
// get start tic
|
||||
PROFILER_QUERY_PERFORMACE_COUNTER(&b->ticStart);
|
||||
}
|
||||
js.rewriteStart = (u8*)GetCodePtr();
|
||||
|
||||
//Start up the register allocators
|
||||
//They use the information in gpa/fpa to preload commonly used registers.
|
||||
//gpr.Start(js.gpa);
|
||||
//fpr.Start(js.fpa);
|
||||
// Start up IR builder (structure that collects the
|
||||
// instructions processed by the JIT routines)
|
||||
ibuild.Reset();
|
||||
|
||||
js.downcountAmount = js.st.numCycles + PatchEngine::GetSpeedhackCycles(em_address);
|
||||
|
@ -519,6 +504,7 @@ namespace CPUCompare
|
|||
break;
|
||||
}
|
||||
|
||||
// Perform actual code generation
|
||||
WriteCode();
|
||||
|
||||
b->flags = js.block_flags;
|
||||
|
|
|
@ -95,7 +95,7 @@ private:
|
|||
PPCAnalyst::BlockRegStats gpa;
|
||||
PPCAnalyst::BlockRegStats fpa;
|
||||
PPCAnalyst::CodeOp *op;
|
||||
u8* normalEntry;
|
||||
u8* rewriteStart;
|
||||
|
||||
JitBlock *curBlock;
|
||||
};
|
||||
|
|
|
@ -23,6 +23,7 @@
|
|||
#include "../PowerPC.h"
|
||||
#include "../../CoreTiming.h"
|
||||
#include "MemoryUtil.h"
|
||||
#include "CPUDetect.h"
|
||||
|
||||
#include "ABI.h"
|
||||
#include "Jit.h"
|
||||
|
@ -168,6 +169,176 @@ void AsmRoutineManager::Generate()
|
|||
GenerateCommon();
|
||||
}
|
||||
|
||||
const u8 GC_ALIGNED16(pbswapShuffle2x4[16]) = {3, 2, 1, 0, 7, 6, 5, 4, 8, 9, 10, 11, 12, 13, 14, 15};
|
||||
|
||||
const float m_quantizeTableS[] =
|
||||
{
|
||||
(1 << 0), (1 << 1), (1 << 2), (1 << 3),
|
||||
(1 << 4), (1 << 5), (1 << 6), (1 << 7),
|
||||
(1 << 8), (1 << 9), (1 << 10), (1 << 11),
|
||||
(1 << 12), (1 << 13), (1 << 14), (1 << 15),
|
||||
(1 << 16), (1 << 17), (1 << 18), (1 << 19),
|
||||
(1 << 20), (1 << 21), (1 << 22), (1 << 23),
|
||||
(1 << 24), (1 << 25), (1 << 26), (1 << 27),
|
||||
(1 << 28), (1 << 29), (1 << 30), (1 << 31),
|
||||
1.0 / (1ULL << 32), 1.0 / (1 << 31), 1.0 / (1 << 30), 1.0 / (1 << 29),
|
||||
1.0 / (1 << 28), 1.0 / (1 << 27), 1.0 / (1 << 26), 1.0 / (1 << 25),
|
||||
1.0 / (1 << 24), 1.0 / (1 << 23), 1.0 / (1 << 22), 1.0 / (1 << 21),
|
||||
1.0 / (1 << 20), 1.0 / (1 << 19), 1.0 / (1 << 18), 1.0 / (1 << 17),
|
||||
1.0 / (1 << 16), 1.0 / (1 << 15), 1.0 / (1 << 14), 1.0 / (1 << 13),
|
||||
1.0 / (1 << 12), 1.0 / (1 << 11), 1.0 / (1 << 10), 1.0 / (1 << 9),
|
||||
1.0 / (1 << 8), 1.0 / (1 << 7), 1.0 / (1 << 6), 1.0 / (1 << 5),
|
||||
1.0 / (1 << 4), 1.0 / (1 << 3), 1.0 / (1 << 2), 1.0 / (1 << 1),
|
||||
};
|
||||
|
||||
const float m_dequantizeTableS[] =
|
||||
{
|
||||
1.0 / (1 << 0), 1.0 / (1 << 1), 1.0 / (1 << 2), 1.0 / (1 << 3),
|
||||
1.0 / (1 << 4), 1.0 / (1 << 5), 1.0 / (1 << 6), 1.0 / (1 << 7),
|
||||
1.0 / (1 << 8), 1.0 / (1 << 9), 1.0 / (1 << 10), 1.0 / (1 << 11),
|
||||
1.0 / (1 << 12), 1.0 / (1 << 13), 1.0 / (1 << 14), 1.0 / (1 << 15),
|
||||
1.0 / (1 << 16), 1.0 / (1 << 17), 1.0 / (1 << 18), 1.0 / (1 << 19),
|
||||
1.0 / (1 << 20), 1.0 / (1 << 21), 1.0 / (1 << 22), 1.0 / (1 << 23),
|
||||
1.0 / (1 << 24), 1.0 / (1 << 25), 1.0 / (1 << 26), 1.0 / (1 << 27),
|
||||
1.0 / (1 << 28), 1.0 / (1 << 29), 1.0 / (1 << 30), 1.0 / (1 << 31),
|
||||
(1ULL << 32), (1 << 31), (1 << 30), (1 << 29),
|
||||
(1 << 28), (1 << 27), (1 << 26), (1 << 25),
|
||||
(1 << 24), (1 << 23), (1 << 22), (1 << 21),
|
||||
(1 << 20), (1 << 19), (1 << 18), (1 << 17),
|
||||
(1 << 16), (1 << 15), (1 << 14), (1 << 13),
|
||||
(1 << 12), (1 << 11), (1 << 10), (1 << 9),
|
||||
(1 << 8), (1 << 7), (1 << 6), (1 << 5),
|
||||
(1 << 4), (1 << 3), (1 << 2), (1 << 1),
|
||||
};
|
||||
|
||||
float psTemp[2];
|
||||
|
||||
void AsmRoutineManager::GenQuantizedLoads() {
|
||||
const u8* loadPairedIllegal = AlignCode4();
|
||||
UD2();
|
||||
const u8* loadPairedFloat = AlignCode4();
|
||||
if (cpu_info.bSSSE3) {
|
||||
#ifdef _M_X64
|
||||
MOVQ_xmm(XMM0, MComplex(RBX, RCX, 1, 0));
|
||||
#else
|
||||
AND(32, R(ECX), Imm32(Memory::MEMVIEW32_MASK));
|
||||
MOVQ_xmm(XMM0, MDisp(ECX, (u32)Memory::base));
|
||||
#endif
|
||||
PSHUFB(XMM0, M((void *)pbswapShuffle2x4));
|
||||
} else {
|
||||
#ifdef _M_X64
|
||||
MOV(64, R(RCX), MComplex(RBX, RCX, 1, 0));
|
||||
BSWAP(64, RCX);
|
||||
ROL(64, RCX, Imm8(32));
|
||||
MOVQ_xmm(XMM0, R(RCX));
|
||||
#else
|
||||
#if 0
|
||||
AND(32, R(ECX), Imm32(Memory::MEMVIEW32_MASK));
|
||||
MOVQ_xmm(XMM0, MDisp(ECX, (u32)Memory::base));
|
||||
PXOR(XMM1, R(XMM1));
|
||||
PSHUFLW(XMM0, R(XMM0), 0xB1);
|
||||
MOVAPD(XMM1, R(XMM0));
|
||||
PSRLW(XMM0, 8);
|
||||
PSLLW(XMM1, 8);
|
||||
POR(XMM0, R(XMM1));
|
||||
#else
|
||||
AND(32, R(ECX), Imm32(Memory::MEMVIEW32_MASK));
|
||||
MOV(32, R(EAX), MDisp(ECX, (u32)Memory::base));
|
||||
BSWAP(32, EAX);
|
||||
MOV(32, M(&psTemp[0]), R(RAX));
|
||||
MOV(32, R(EAX), MDisp(ECX, (u32)Memory::base + 4));
|
||||
BSWAP(32, EAX);
|
||||
MOV(32, M(((float *)&psTemp[0]) + 1), R(RAX));
|
||||
MOVQ_xmm(XMM0, M(&psTemp[0]));
|
||||
#endif
|
||||
#endif
|
||||
}
|
||||
RET();
|
||||
|
||||
const u8* loadPairedU8 = AlignCode4();
|
||||
#ifdef _M_X64
|
||||
MOVZX(32, 16, ECX, MComplex(RBX, RCX, 1, 0));
|
||||
#else
|
||||
AND(32, R(ECX), Imm32(Memory::MEMVIEW32_MASK));
|
||||
MOVZX(32, 16, ECX, MDisp(ECX, (u32)Memory::base));
|
||||
#endif
|
||||
MOVD_xmm(XMM0, R(ECX));
|
||||
PXOR(XMM1, R(XMM1));
|
||||
PUNPCKLBW(XMM0, R(XMM1));
|
||||
PUNPCKLWD(XMM0, R(XMM1));
|
||||
CVTDQ2PS(XMM0, R(XMM0));
|
||||
SHR(32, R(EAX), Imm8(6));
|
||||
MOVSS(XMM1, MDisp(EAX, (u32)m_dequantizeTableS));
|
||||
PUNPCKLDQ(XMM1, R(XMM1));
|
||||
MULPS(XMM0, R(XMM1));
|
||||
RET();
|
||||
|
||||
const u8* loadPairedS8 = AlignCode4();
|
||||
#ifdef _M_X64
|
||||
MOVZX(32, 16, ECX, MComplex(RBX, RCX, 1, 0));
|
||||
#else
|
||||
AND(32, R(ECX), Imm32(Memory::MEMVIEW32_MASK));
|
||||
MOVZX(32, 16, ECX, MDisp(ECX, (u32)Memory::base));
|
||||
#endif
|
||||
MOVD_xmm(XMM0, R(ECX));
|
||||
PUNPCKLBW(XMM0, R(XMM0));
|
||||
PUNPCKLWD(XMM0, R(XMM0));
|
||||
PSRAD(XMM0, 24);
|
||||
CVTDQ2PS(XMM0, R(XMM0));
|
||||
SHR(32, R(EAX), Imm8(6));
|
||||
MOVSS(XMM1, MDisp(EAX, (u32)m_dequantizeTableS));
|
||||
PUNPCKLDQ(XMM1, R(XMM1));
|
||||
MULPS(XMM0, R(XMM1));
|
||||
RET();
|
||||
|
||||
const u8* loadPairedU16 = AlignCode4();
|
||||
#ifdef _M_X64
|
||||
MOV(32, R(ECX), MComplex(RBX, RCX, 1, 0));
|
||||
#else
|
||||
AND(32, R(ECX), Imm32(Memory::MEMVIEW32_MASK));
|
||||
MOV(32, R(ECX), MDisp(ECX, (u32)Memory::base));
|
||||
#endif
|
||||
BSWAP(32, ECX);
|
||||
ROL(32, R(ECX), Imm8(16));
|
||||
MOVD_xmm(XMM0, R(ECX));
|
||||
PXOR(XMM1, R(XMM1));
|
||||
PUNPCKLWD(XMM0, R(XMM1));
|
||||
CVTDQ2PS(XMM0, R(XMM0));
|
||||
SHR(32, R(EAX), Imm8(6));
|
||||
MOVSS(XMM1, MDisp(EAX, (u32)m_dequantizeTableS));
|
||||
PUNPCKLDQ(XMM1, R(XMM1));
|
||||
MULPS(XMM0, R(XMM1));
|
||||
RET();
|
||||
|
||||
const u8* loadPairedS16 = AlignCode4();
|
||||
#ifdef _M_X64
|
||||
MOV(32, R(ECX), MComplex(RBX, RCX, 1, 0));
|
||||
#else
|
||||
AND(32, R(ECX), Imm32(Memory::MEMVIEW32_MASK));
|
||||
MOV(32, R(ECX), MDisp(ECX, (u32)Memory::base));
|
||||
#endif
|
||||
BSWAP(32, ECX);
|
||||
ROL(32, R(ECX), Imm8(16));
|
||||
MOVD_xmm(XMM0, R(ECX));
|
||||
PUNPCKLWD(XMM0, R(XMM0));
|
||||
PSRAD(XMM0, 16);
|
||||
CVTDQ2PS(XMM0, R(XMM0));
|
||||
SHR(32, R(EAX), Imm8(6));
|
||||
AND(32, R(EAX), Imm32(0xFC));
|
||||
MOVSS(XMM1, MDisp(EAX, (u32)m_dequantizeTableS));
|
||||
PUNPCKLDQ(XMM1, R(XMM1));
|
||||
MULPS(XMM0, R(XMM1));
|
||||
RET();
|
||||
|
||||
pairedLoadQuantized[0] = loadPairedFloat;
|
||||
pairedLoadQuantized[1] = loadPairedIllegal;
|
||||
pairedLoadQuantized[2] = loadPairedIllegal;
|
||||
pairedLoadQuantized[3] = loadPairedIllegal;
|
||||
pairedLoadQuantized[4] = loadPairedU8;
|
||||
pairedLoadQuantized[5] = loadPairedU16;
|
||||
pairedLoadQuantized[6] = loadPairedS8;
|
||||
pairedLoadQuantized[7] = loadPairedS16;
|
||||
}
|
||||
|
||||
void AsmRoutineManager::GenFifoWrite(int size)
|
||||
{
|
||||
|
@ -257,6 +428,8 @@ void AsmRoutineManager::GenerateCommon()
|
|||
SUB(32, M(&CoreTiming::downcount), Imm8(0));
|
||||
JMP(dispatcher, true);
|
||||
|
||||
GenQuantizedLoads();
|
||||
|
||||
computeRcFp = AlignCode16();
|
||||
//CMPSD(R(XMM0), M(&zero),
|
||||
// TODO
|
||||
|
|
|
@ -42,6 +42,7 @@ private:
|
|||
void GenFifoWrite(int size);
|
||||
void GenFifoFloatWrite();
|
||||
void GenFifoXmm64Write();
|
||||
void GenQuantizedLoads();
|
||||
|
||||
public:
|
||||
void Init() {
|
||||
|
@ -80,6 +81,8 @@ public:
|
|||
|
||||
const u8 *doReJit;
|
||||
|
||||
const u8 *pairedLoadQuantized[8];
|
||||
|
||||
bool compareEnabled;
|
||||
};
|
||||
|
||||
|
|
|
@ -39,6 +39,9 @@
|
|||
|
||||
// Zelda and many more games seem to pass the Acid Test.
|
||||
|
||||
//#define NORMALBRANCH_START Default(inst); ibuild.EmitInterpreterBranch(); return;
|
||||
#define NORMALBRANCH_START
|
||||
|
||||
using namespace Gen;
|
||||
|
||||
void Jit64::sc(UGeckoInstruction inst)
|
||||
|
@ -53,6 +56,7 @@ using namespace Gen;
|
|||
|
||||
void Jit64::bx(UGeckoInstruction inst)
|
||||
{
|
||||
NORMALBRANCH_START
|
||||
if (inst.LK)
|
||||
ibuild.EmitStoreLink(ibuild.EmitIntConst(js.compilerPC + 4));
|
||||
|
||||
|
@ -67,6 +71,7 @@ using namespace Gen;
|
|||
|
||||
void Jit64::bcx(UGeckoInstruction inst)
|
||||
{
|
||||
NORMALBRANCH_START
|
||||
if (inst.LK)
|
||||
ibuild.EmitStoreLink(
|
||||
ibuild.EmitIntConst(js.compilerPC + 4));
|
||||
|
@ -117,6 +122,7 @@ using namespace Gen;
|
|||
|
||||
void Jit64::bcctrx(UGeckoInstruction inst)
|
||||
{
|
||||
NORMALBRANCH_START
|
||||
Default(inst);
|
||||
ibuild.EmitInterpreterBranch();
|
||||
return;
|
||||
|
@ -124,6 +130,7 @@ using namespace Gen;
|
|||
|
||||
void Jit64::bclrx(UGeckoInstruction inst)
|
||||
{
|
||||
NORMALBRANCH_START
|
||||
if (inst.hex == 0x4e800020) {
|
||||
ibuild.EmitBranchUncond(ibuild.EmitLoadLink());
|
||||
return;
|
||||
|
|
|
@ -29,141 +29,54 @@
|
|||
#define INSTRUCTION_START
|
||||
// #define INSTRUCTION_START Default(inst); return;
|
||||
|
||||
// Two 64-bit lanes with only bit 63 set in each. fmaddXX XORs this into XMM0
// (XORPD) on its nmsub/nmadd paths.
const u64 GC_ALIGNED16(psSignBits2[2]) = {0x8000000000000000ULL, 0x8000000000000000ULL};
|
||||
// Two 64-bit lanes with every bit except bit 63 set.
const u64 GC_ALIGNED16(psAbsMask2[2]) = {0x7FFFFFFFFFFFFFFFULL, 0x7FFFFFFFFFFFFFFFULL};
|
||||
// The pair of doubles {1.0, 1.0}.
const double GC_ALIGNED16(psOneOne2[2]) = {1.0, 1.0};
|
||||
|
||||
// Emits x86 code for a scalar double operation of the form fpr[d] = fpr[a] <op> fpr[b].
//
//   d, a, b     guest FPR indices of the destination and the two sources.
//   reversible  true when the operands may be swapped (callers pass true for
//               ADDSD/MULSD and false for SUBSD/DIVSD).
//   dupe        when true, the result is passed through ForceSinglePrecisionS and
//               then copied into both halves of the destination with MOVDDUP.
//   op          XEmitter member that emits the instruction (dest register, source).
//
// The path taken when d aliases b and the operation is not reversible uses XMM0
// and XMM1 as scratch registers.
void Jit64::fp_tri_op(int d, int a, int b, bool reversible, bool dupe, void (XEmitter::*op)(Gen::X64Reg, Gen::OpArg))
{
	fpr.Lock(d, a, b);
	if (d == a)
	{
		// Destination already holds the first operand: operate in place.
		fpr.LoadToX64(d, true);
		(this->*op)(fpr.RX(d), fpr.R(b));
	}
	else if (d == b && reversible)
	{
		// Destination holds the second operand, but the operands may be swapped.
		fpr.LoadToX64(d, true);
		(this->*op)(fpr.RX(d), fpr.R(a));
	}
	else if (b != d)
	{
		// Sources different from d (a != d is already known here), can use
		// the rather quick solution.
		fpr.LoadToX64(d, !dupe);
		MOVSD(fpr.RX(d), fpr.R(a));
		(this->*op)(fpr.RX(d), fpr.R(b));
	}
	else
	{
		// d == b and the operation is not reversible: both sources must be
		// copied out before d is touched, so two temps are needed.
		// (The former "else if (b != d)" case in between could never be
		// reached, because a != d and b != d together are handled above.)
		MOVSD(XMM0, fpr.R(a));
		MOVSD(XMM1, fpr.R(b));
		fpr.LoadToX64(d, !dupe);
		(this->*op)(XMM0, Gen::R(XMM1));
		MOVSD(fpr.RX(d), Gen::R(XMM0));
	}
	if (dupe)
	{
		ForceSinglePrecisionS(fpr.RX(d));
		MOVDDUP(fpr.RX(d), fpr.R(d));
	}
	fpr.UnlockAll();
}
|
||||
|
||||
void Jit64::fp_arith_s(UGeckoInstruction inst)
|
||||
{
|
||||
if(Core::g_CoreStartupParameter.bJITOff || Core::g_CoreStartupParameter.bJITFloatingPointOff)
|
||||
{Default(inst); return;} // turn off from debugger
|
||||
INSTRUCTION_START;
|
||||
if (inst.Rc) {
|
||||
if (inst.Rc || inst.OPCD != 59 || inst.SUBOP5 != 25) {
|
||||
Default(inst); return;
|
||||
}
|
||||
IREmitter::InstLoc val = ibuild.EmitLoadFReg(inst.FA);
|
||||
val = ibuild.EmitDoubleToSingle(val);
|
||||
bool dupe = inst.OPCD == 59;
|
||||
switch (inst.SUBOP5)
|
||||
{
|
||||
case 18: fp_tri_op(inst.FD, inst.FA, inst.FB, false, dupe, &XEmitter::DIVSD); break; //div
|
||||
case 20: fp_tri_op(inst.FD, inst.FA, inst.FB, false, dupe, &XEmitter::SUBSD); break; //sub
|
||||
case 21: fp_tri_op(inst.FD, inst.FA, inst.FB, true, dupe, &XEmitter::ADDSD); break; //add
|
||||
case 25: //mul
|
||||
val = ibuild.EmitFSMul(val, ibuild.EmitDoubleToSingle(ibuild.EmitLoadFReg(inst.FC)));
|
||||
case 18: //div
|
||||
case 20: //sub
|
||||
case 21: //add
|
||||
case 23: //sel
|
||||
Default(inst);
|
||||
break;
|
||||
case 24: //res
|
||||
Default(inst);
|
||||
break;
|
||||
case 25: fp_tri_op(inst.FD, inst.FA, inst.FC, true, dupe, &XEmitter::MULSD); break; //mul
|
||||
default:
|
||||
_assert_msg_(DYNA_REC, 0, "fp_arith_s WTF!!!");
|
||||
}
|
||||
val = ibuild.EmitDupSingleToMReg(val);
|
||||
ibuild.EmitStoreFReg(val, inst.FD);
|
||||
}
|
||||
|
||||
void Jit64::fmaddXX(UGeckoInstruction inst)
|
||||
{
|
||||
if(Core::g_CoreStartupParameter.bJITOff || Core::g_CoreStartupParameter.bJITFloatingPointOff)
|
||||
{Default(inst); return;} // turn off from debugger
|
||||
INSTRUCTION_START;
|
||||
if (inst.Rc) {
|
||||
if (inst.Rc || inst.OPCD != 59 || inst.SUBOP5 != 29) {
|
||||
Default(inst); return;
|
||||
}
|
||||
|
||||
bool single_precision = inst.OPCD == 59;
|
||||
|
||||
int a = inst.FA;
|
||||
int b = inst.FB;
|
||||
int c = inst.FC;
|
||||
int d = inst.FD;
|
||||
|
||||
fpr.Lock(a, b, c, d);
|
||||
MOVSD(XMM0, fpr.R(a));
|
||||
switch (inst.SUBOP5)
|
||||
{
|
||||
case 28: //msub
|
||||
MULSD(XMM0, fpr.R(c));
|
||||
SUBSD(XMM0, fpr.R(b));
|
||||
break;
|
||||
case 29: //madd
|
||||
MULSD(XMM0, fpr.R(c));
|
||||
ADDSD(XMM0, fpr.R(b));
|
||||
break;
|
||||
case 30: //nmsub
|
||||
MULSD(XMM0, fpr.R(c));
|
||||
SUBSD(XMM0, fpr.R(b));
|
||||
XORPD(XMM0, M((void*)&psSignBits2));
|
||||
break;
|
||||
case 31: //nmadd
|
||||
MULSD(XMM0, fpr.R(c));
|
||||
ADDSD(XMM0, fpr.R(b));
|
||||
XORPD(XMM0, M((void*)&psSignBits2));
|
||||
break;
|
||||
}
|
||||
fpr.LoadToX64(d, false);
|
||||
//YES it is necessary to dupe the result :(
|
||||
//TODO : analysis - does the top reg get used? If so, dupe, if not, don't.
|
||||
if (single_precision) {
|
||||
ForceSinglePrecisionS(XMM0);
|
||||
MOVDDUP(fpr.RX(d), R(XMM0));
|
||||
} else {
|
||||
MOVSD(fpr.RX(d), R(XMM0));
|
||||
}
|
||||
fpr.UnlockAll();
|
||||
IREmitter::InstLoc val = ibuild.EmitLoadFReg(inst.FA);
|
||||
val = ibuild.EmitDoubleToSingle(val);
|
||||
val = ibuild.EmitFSMul(val, ibuild.EmitDoubleToSingle(ibuild.EmitLoadFReg(inst.FC)));
|
||||
val = ibuild.EmitFSAdd(val, ibuild.EmitDoubleToSingle(ibuild.EmitLoadFReg(inst.FB)));
|
||||
val = ibuild.EmitDupSingleToMReg(val);
|
||||
ibuild.EmitStoreFReg(val, inst.FD);
|
||||
}
|
||||
|
||||
void Jit64::fmrx(UGeckoInstruction inst)
|
||||
{
|
||||
if(Core::g_CoreStartupParameter.bJITOff || Core::g_CoreStartupParameter.bJITFloatingPointOff)
|
||||
{Default(inst); return;} // turn off from debugger
|
||||
INSTRUCTION_START;
|
||||
if (inst.Rc) {
|
||||
Default(inst); return;
|
||||
}
|
||||
int d = inst.FD;
|
||||
int b = inst.FB;
|
||||
fpr.LoadToX64(d, true); // we don't want to destroy the high bit
|
||||
MOVSD(fpr.RX(d), fpr.R(b));
|
||||
IREmitter::InstLoc val = ibuild.EmitLoadFReg(inst.FB);
|
||||
val = ibuild.EmitInsertDoubleInMReg(val, ibuild.EmitLoadFReg(inst.FD));
|
||||
ibuild.EmitStoreFReg(val, inst.FD);
|
||||
}
|
||||
|
||||
void Jit64::fcmpx(UGeckoInstruction inst)
|
||||
|
|
|
@ -71,6 +71,20 @@ void Jit64::lhax(UGeckoInstruction inst)
|
|||
void Jit64::lXz(UGeckoInstruction inst)
|
||||
{
|
||||
INSTRUCTION_START
|
||||
|
||||
if (Core::GetStartupParameter().bSkipIdle &&
|
||||
inst.OPCD == 32 &&
|
||||
(inst.hex & 0xFFFF0000) == 0x800D0000 &&
|
||||
(Memory::ReadUnchecked_U32(js.compilerPC + 4) == 0x28000000 ||
|
||||
(Core::GetStartupParameter().bWii && Memory::ReadUnchecked_U32(js.compilerPC + 4) == 0x2C000000)) &&
|
||||
Memory::ReadUnchecked_U32(js.compilerPC + 8) == 0x4182fff8)
|
||||
{
|
||||
ibuild.EmitIdleLoop(ibuild.EmitIntConst(PowerPC::ppcState.gpr[inst.RA] + (s32)(s16)inst.SIMM_16),
|
||||
ibuild.EmitIntConst(js.compilerPC));
|
||||
js.compilerPC += 8;
|
||||
return;
|
||||
}
|
||||
|
||||
IREmitter::InstLoc addr = ibuild.EmitIntConst(inst.SIMM_16);
|
||||
if (inst.RA)
|
||||
addr = ibuild.EmitAdd(addr, ibuild.EmitLoadGReg(inst.RA));
|
||||
|
|
|
@ -57,39 +57,13 @@ u32 GC_ALIGNED16(temp32);
|
|||
|
||||
void Jit64::lfs(UGeckoInstruction inst)
|
||||
{
|
||||
if (Core::g_CoreStartupParameter.bJITOff || Core::g_CoreStartupParameter.bJITLoadStoreFloatingOff)
|
||||
{Default(inst); return;} // turn off from debugger
|
||||
INSTRUCTION_START;
|
||||
|
||||
int d = inst.RD;
|
||||
int a = inst.RA;
|
||||
if (!a)
|
||||
{
|
||||
Default(inst);
|
||||
IREmitter::InstLoc addr = ibuild.EmitIntConst(inst.SIMM_16), val;
|
||||
if (inst.RA)
|
||||
addr = ibuild.EmitAdd(addr, ibuild.EmitLoadGReg(inst.RA));
|
||||
val = ibuild.EmitDupSingleToMReg(ibuild.EmitLoadSingle(addr));
|
||||
ibuild.EmitStoreFReg(val, inst.RD);
|
||||
return;
|
||||
}
|
||||
s32 offset = (s32)(s16)inst.SIMM_16;
|
||||
gpr.FlushLockX(ABI_PARAM1);
|
||||
gpr.Lock(a);
|
||||
MOV(32, R(ABI_PARAM1), gpr.R(a));
|
||||
if (jo.assumeFPLoadFromMem)
|
||||
{
|
||||
UnsafeLoadRegToReg(ABI_PARAM1, EAX, 32, offset, false);
|
||||
}
|
||||
else
|
||||
{
|
||||
SafeLoadRegToEAX(ABI_PARAM1, 32, offset);
|
||||
}
|
||||
|
||||
MOV(32, M(&temp32), R(EAX));
|
||||
fpr.Lock(d);
|
||||
fpr.LoadToX64(d, false);
|
||||
CVTSS2SD(fpr.RX(d), M(&temp32));
|
||||
MOVDDUP(fpr.RX(d), fpr.R(d));
|
||||
gpr.UnlockAll();
|
||||
gpr.UnlockAllX();
|
||||
fpr.UnlockAll();
|
||||
}
|
||||
|
||||
|
||||
void Jit64::lfd(UGeckoInstruction inst)
|
||||
|
@ -291,32 +265,10 @@ void Jit64::stfsx(UGeckoInstruction inst)
|
|||
|
||||
void Jit64::lfsx(UGeckoInstruction inst)
|
||||
{
|
||||
if (Core::g_CoreStartupParameter.bJITOff || Core::g_CoreStartupParameter.bJITLoadStoreFloatingOff)
|
||||
{Default(inst); return;} // turn off from debugger
|
||||
INSTRUCTION_START;
|
||||
|
||||
fpr.Lock(inst.RS);
|
||||
fpr.LoadToX64(inst.RS, false, true);
|
||||
MOV(32, R(EAX), gpr.R(inst.RB));
|
||||
IREmitter::InstLoc addr = ibuild.EmitLoadGReg(inst.RB), val;
|
||||
if (inst.RA)
|
||||
ADD(32, R(EAX), gpr.R(inst.RA));
|
||||
if (cpu_info.bSSSE3) {
|
||||
X64Reg r = fpr.R(inst.RS).GetSimpleReg();
|
||||
#ifdef _M_IX86
|
||||
AND(32, R(EAX), Imm32(Memory::MEMVIEW32_MASK));
|
||||
MOVD_xmm(r, MDisp(EAX, (u32)Memory::base));
|
||||
#else
|
||||
MOVD_xmm(r, MComplex(RBX, EAX, SCALE_1, 0));
|
||||
#endif
|
||||
PSHUFB(r, M((void *)bswapShuffle1x4));
|
||||
CVTSS2SD(r, R(r));
|
||||
MOVDDUP(r, R(r));
|
||||
} else {
|
||||
UnsafeLoadRegToReg(EAX, EAX, 32, false);
|
||||
MOV(32, M(&temp32), R(EAX));
|
||||
CVTSS2SD(XMM0, M(&temp32));
|
||||
MOVDDUP(fpr.R(inst.RS).GetSimpleReg(), R(XMM0));
|
||||
}
|
||||
fpr.UnlockAll();
|
||||
addr = ibuild.EmitAdd(addr, ibuild.EmitLoadGReg(inst.RA));
|
||||
val = ibuild.EmitDupSingleToMReg(ibuild.EmitLoadSingle(addr));
|
||||
ibuild.EmitStoreFReg(val, inst.RD);
|
||||
}
|
||||
|
||||
|
|
|
@ -40,419 +40,20 @@
|
|||
#define INSTRUCTION_START
|
||||
// #define INSTRUCTION_START Default(inst); return;
|
||||
|
||||
// PSHUFB control mask: reverses the byte order inside each of the two low
// 32-bit words (bytes 0-3 and 4-7) and leaves bytes 8-15 where they are.
const u8 GC_ALIGNED16(pbswapShuffle2x4[16]) = {3, 2, 1, 0, 7, 6, 5, 4, 8, 9, 10, 11, 12, 13, 14, 15};
|
||||
// Identity byte permutation (every index maps to itself).
const u8 GC_ALIGNED16(pbswapShuffleNoop[16]) = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
|
||||
|
||||
// Memory scratch area: psq_l stores byte-swapped words here and then reads
// them back with CVTPS2PD.
static double GC_ALIGNED16(psTemp[2]) = {1.0, 1.0};
|
||||
// Memory scratch slot used by psq_st/psq_l to move values between XMM and
// general-purpose registers.
static u64 GC_ALIGNED16(temp64);
|
||||
|
||||
// TODO(ector): Improve 64-bit version
|
||||
// Stores a 64-bit value as two 32-bit writes through Memory::Write_U32:
// the upper 32 bits go to `address`, the lower 32 bits to `address + 4`.
static void WriteDual32(u64 value, u32 address)
{
	const u32 upper = (u32)(value >> 32);
	const u32 lower = (u32)value;
	Memory::Write_U32(upper, address);
	Memory::Write_U32(lower, address + 4);
}
|
||||
|
||||
// Multipliers applied when quantizing on store; psq_st indexes this table with
// the GQR ST_SCALE field. Entry i is 2^i for i in 0..31 and 2^(i-64) for i in
// 32..63 (presumably the scale is a 6-bit two's-complement value - confirm
// against UGQR).
//
// The 2^31 terms are written with 1ULL: (1 << 31) shifts into the sign bit of
// int and would yield a negative table entry.
const double GC_ALIGNED16(m_quantizeTableD[]) =
{
	(1 <<  0), (1 <<  1), (1 <<  2), (1 <<  3),
	(1 <<  4), (1 <<  5), (1 <<  6), (1 <<  7),
	(1 <<  8), (1 <<  9), (1 << 10), (1 << 11),
	(1 << 12), (1 << 13), (1 << 14), (1 << 15),
	(1 << 16), (1 << 17), (1 << 18), (1 << 19),
	(1 << 20), (1 << 21), (1 << 22), (1 << 23),
	(1 << 24), (1 << 25), (1 << 26), (1 << 27),
	(1 << 28), (1 << 29), (1 << 30), (1ULL << 31),
	1.0 / (1ULL << 32), 1.0 / (1ULL << 31), 1.0 / (1 << 30), 1.0 / (1 << 29),
	1.0 / (1 << 28), 1.0 / (1 << 27), 1.0 / (1 << 26), 1.0 / (1 << 25),
	1.0 / (1 << 24), 1.0 / (1 << 23), 1.0 / (1 << 22), 1.0 / (1 << 21),
	1.0 / (1 << 20), 1.0 / (1 << 19), 1.0 / (1 << 18), 1.0 / (1 << 17),
	1.0 / (1 << 16), 1.0 / (1 << 15), 1.0 / (1 << 14), 1.0 / (1 << 13),
	1.0 / (1 << 12), 1.0 / (1 << 11), 1.0 / (1 << 10), 1.0 / (1 <<  9),
	1.0 / (1 <<  8), 1.0 / (1 <<  7), 1.0 / (1 <<  6), 1.0 / (1 <<  5),
	1.0 / (1 <<  4), 1.0 / (1 <<  3), 1.0 / (1 <<  2), 1.0 / (1 <<  1),
};
|
||||
|
||||
// Multipliers applied when dequantizing on load; psq_l indexes this table with
// the GQR LD_SCALE field. Each entry is the reciprocal of the matching
// quantize multiplier: 2^-i for i in 0..31 and 2^(64-i) for i in 32..63.
//
// The 2^31 terms are written with 1ULL: (1 << 31) shifts into the sign bit of
// int and would yield a negative table entry.
const double GC_ALIGNED16(m_dequantizeTableD[]) =
{
	1.0 / (1 <<  0), 1.0 / (1 <<  1), 1.0 / (1 <<  2), 1.0 / (1 <<  3),
	1.0 / (1 <<  4), 1.0 / (1 <<  5), 1.0 / (1 <<  6), 1.0 / (1 <<  7),
	1.0 / (1 <<  8), 1.0 / (1 <<  9), 1.0 / (1 << 10), 1.0 / (1 << 11),
	1.0 / (1 << 12), 1.0 / (1 << 13), 1.0 / (1 << 14), 1.0 / (1 << 15),
	1.0 / (1 << 16), 1.0 / (1 << 17), 1.0 / (1 << 18), 1.0 / (1 << 19),
	1.0 / (1 << 20), 1.0 / (1 << 21), 1.0 / (1 << 22), 1.0 / (1 << 23),
	1.0 / (1 << 24), 1.0 / (1 << 25), 1.0 / (1 << 26), 1.0 / (1 << 27),
	1.0 / (1 << 28), 1.0 / (1 << 29), 1.0 / (1 << 30), 1.0 / (1ULL << 31),
	(1ULL << 32), (1ULL << 31), (1 << 30), (1 << 29),
	(1 << 28), (1 << 27), (1 << 26), (1 << 25),
	(1 << 24), (1 << 23), (1 << 22), (1 << 21),
	(1 << 20), (1 << 19), (1 << 18), (1 << 17),
	(1 << 16), (1 << 15), (1 << 14), (1 << 13),
	(1 << 12), (1 << 11), (1 << 10), (1 <<  9),
	(1 <<  8), (1 <<  7), (1 <<  6), (1 <<  5),
	(1 <<  4), (1 <<  3), (1 <<  2), (1 <<  1),
};
|
||||
|
||||
// The big problem is likely instructions that set the quantizers in the same block.
|
||||
// We will have to break block after quantizers are written to.
|
||||
void Jit64::psq_st(UGeckoInstruction inst)
|
||||
{
|
||||
if(Core::g_CoreStartupParameter.bJITOff || Core::g_CoreStartupParameter.bJITLoadStorePairedOff)
|
||||
{Default(inst); return;} // turn off from debugger
|
||||
INSTRUCTION_START;
|
||||
js.block_flags |= BLOCK_USE_GQR0 << inst.I;
|
||||
|
||||
if (js.blockSetsQuantizers || !Core::GetStartupParameter().bOptimizeQuantizers)
|
||||
{
|
||||
Default(inst);
|
||||
return;
|
||||
}
|
||||
if (!inst.RA)
|
||||
{
|
||||
// This really should never happen. Unless we change this to also support stwux
|
||||
Default(inst);
|
||||
return;
|
||||
}
|
||||
|
||||
const UGQR gqr(rSPR(SPR_GQR0 + inst.I));
|
||||
const EQuantizeType stType = static_cast<EQuantizeType>(gqr.ST_TYPE);
|
||||
int stScale = gqr.ST_SCALE;
|
||||
bool update = inst.OPCD == 61;
|
||||
|
||||
int offset = inst.SIMM_12;
|
||||
int a = inst.RA;
|
||||
int s = inst.RS; // Fp numbers
|
||||
|
||||
if (inst.W) {
|
||||
// PanicAlert("W=1: stType %i stScale %i update %i", (int)stType, (int)stScale, (int)update);
|
||||
// It's fairly common that games write stuff to the pipe using this. Then, it's pretty much only
|
||||
// floats so that's what we'll work on.
|
||||
switch (stType)
|
||||
{
|
||||
case QUANTIZE_FLOAT:
|
||||
{
|
||||
// This one has quite a bit of optimization potential.
|
||||
if (gpr.R(a).IsImm())
|
||||
{
|
||||
PanicAlert("Imm: %08x", gpr.R(a).offset);
|
||||
}
|
||||
gpr.FlushLockX(ABI_PARAM1, ABI_PARAM2);
|
||||
gpr.Lock(a);
|
||||
fpr.Lock(s);
|
||||
if (update)
|
||||
gpr.LoadToX64(a, true, true);
|
||||
MOV(32, R(ABI_PARAM2), gpr.R(a));
|
||||
if (offset)
|
||||
ADD(32, R(ABI_PARAM2), Imm32((u32)offset));
|
||||
TEST(32, R(ABI_PARAM2), Imm32(0x0C000000));
|
||||
if (update && offset)
|
||||
MOV(32, gpr.R(a), R(ABI_PARAM2));
|
||||
CVTSD2SS(XMM0, fpr.R(s));
|
||||
MOVD_xmm(M(&temp64), XMM0);
|
||||
MOV(32, R(ABI_PARAM1), M(&temp64));
|
||||
FixupBranch argh = J_CC(CC_NZ);
|
||||
BSWAP(32, ABI_PARAM1);
|
||||
#ifdef _M_X64
|
||||
MOV(32, MComplex(RBX, ABI_PARAM2, SCALE_1, 0), R(ABI_PARAM1));
|
||||
#else
|
||||
MOV(32, R(EAX), R(ABI_PARAM2));
|
||||
AND(32, R(EAX), Imm32(Memory::MEMVIEW32_MASK));
|
||||
MOV(32, MDisp(EAX, (u32)Memory::base), R(ABI_PARAM1));
|
||||
#endif
|
||||
FixupBranch skip_call = J();
|
||||
SetJumpTarget(argh);
|
||||
ABI_CallFunctionRR(thunks.ProtectFunction((void *)&Memory::Write_U32, 2), ABI_PARAM1, ABI_PARAM2);
|
||||
SetJumpTarget(skip_call);
|
||||
gpr.UnlockAll();
|
||||
gpr.UnlockAllX();
|
||||
fpr.UnlockAll();
|
||||
return;
|
||||
}
|
||||
default:
|
||||
Default(inst);
|
||||
return;
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
if (stType == QUANTIZE_FLOAT)
|
||||
{
|
||||
if (gpr.R(a).IsImm() && !update && cpu_info.bSSSE3)
|
||||
{
|
||||
u32 addr = (u32)(gpr.R(a).offset + offset);
|
||||
if (addr == 0xCC008000) {
|
||||
// Writing to FIFO. Let's do fast method.
|
||||
CVTPD2PS(XMM0, fpr.R(s));
|
||||
PSHUFB(XMM0, M((void*)&pbswapShuffle2x4));
|
||||
CALL((void*)asm_routines.fifoDirectWriteXmm64);
|
||||
js.fifoBytesThisBlock += 8;
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
gpr.FlushLockX(ABI_PARAM1, ABI_PARAM2);
|
||||
gpr.Lock(a);
|
||||
fpr.Lock(s);
|
||||
if (update)
|
||||
gpr.LoadToX64(a, true, true);
|
||||
MOV(32, R(ABI_PARAM2), gpr.R(a));
|
||||
if (offset)
|
||||
ADD(32, R(ABI_PARAM2), Imm32((u32)offset));
|
||||
TEST(32, R(ABI_PARAM2), Imm32(0x0C000000));
|
||||
if (update && offset)
|
||||
MOV(32, gpr.R(a), R(ABI_PARAM2));
|
||||
CVTPD2PS(XMM0, fpr.R(s));
|
||||
SHUFPS(XMM0, R(XMM0), 1);
|
||||
MOVQ_xmm(M(&temp64), XMM0);
|
||||
#ifdef _M_X64
|
||||
MOV(64, R(ABI_PARAM1), M(&temp64));
|
||||
FixupBranch argh = J_CC(CC_NZ);
|
||||
BSWAP(64, ABI_PARAM1);
|
||||
MOV(64, MComplex(RBX, ABI_PARAM2, SCALE_1, 0), R(ABI_PARAM1));
|
||||
FixupBranch arg2 = J();
|
||||
SetJumpTarget(argh);
|
||||
CALL(thunks.ProtectFunction((void *)&WriteDual32, 0));
|
||||
#else
|
||||
FixupBranch argh = J_CC(CC_NZ);
|
||||
MOV(32, R(ABI_PARAM1), M(((char*)&temp64) + 4));
|
||||
BSWAP(32, ABI_PARAM1);
|
||||
AND(32, R(ABI_PARAM2), Imm32(Memory::MEMVIEW32_MASK));
|
||||
MOV(32, MDisp(ABI_PARAM2, (u32)Memory::base), R(ABI_PARAM1));
|
||||
MOV(32, R(ABI_PARAM1), M(&temp64));
|
||||
BSWAP(32, ABI_PARAM1);
|
||||
MOV(32, MDisp(ABI_PARAM2, 4+(u32)Memory::base), R(ABI_PARAM1));
|
||||
FixupBranch arg2 = J();
|
||||
SetJumpTarget(argh);
|
||||
MOV(32, R(ABI_PARAM1), M(((char*)&temp64) + 4));
|
||||
ABI_CallFunctionRR(thunks.ProtectFunction((void *)&Memory::Write_U32, 2), ABI_PARAM1, ABI_PARAM2);
|
||||
MOV(32, R(ABI_PARAM1), M(((char*)&temp64)));
|
||||
ADD(32, R(ABI_PARAM2), Imm32(4));
|
||||
ABI_CallFunctionRR(thunks.ProtectFunction((void *)&Memory::Write_U32, 2), ABI_PARAM1, ABI_PARAM2);
|
||||
#endif
|
||||
SetJumpTarget(arg2);
|
||||
gpr.UnlockAll();
|
||||
gpr.UnlockAllX();
|
||||
fpr.UnlockAll();
|
||||
}
|
||||
else if (stType == QUANTIZE_U8)
|
||||
{
|
||||
gpr.FlushLockX(ABI_PARAM1, ABI_PARAM2);
|
||||
gpr.Lock(a);
|
||||
fpr.Lock(s);
|
||||
if (update)
|
||||
gpr.LoadToX64(a, true, update);
|
||||
MOV(32, R(ABI_PARAM2), gpr.R(a));
|
||||
if (offset)
|
||||
ADD(32, R(ABI_PARAM2), Imm32((u32)offset));
|
||||
if (update && offset)
|
||||
MOV(32, gpr.R(a), R(ABI_PARAM2));
|
||||
MOVAPD(XMM0, fpr.R(s));
|
||||
MOVDDUP(XMM1, M((void*)&m_quantizeTableD[stScale]));
|
||||
MULPD(XMM0, R(XMM1));
|
||||
CVTPD2DQ(XMM0, R(XMM0));
|
||||
PACKSSDW(XMM0, R(XMM0));
|
||||
PACKUSWB(XMM0, R(XMM0));
|
||||
MOVD_xmm(M(&temp64), XMM0);
|
||||
MOV(16, R(ABI_PARAM1), M(&temp64));
|
||||
#ifdef _M_X64
|
||||
MOV(16, MComplex(RBX, ABI_PARAM2, SCALE_1, 0), R(ABI_PARAM1));
|
||||
#else
|
||||
MOV(32, R(EAX), R(ABI_PARAM2));
|
||||
AND(32, R(EAX), Imm32(Memory::MEMVIEW32_MASK));
|
||||
MOV(16, MDisp(EAX, (u32)Memory::base), R(ABI_PARAM1));
|
||||
#endif
|
||||
if (update)
|
||||
MOV(32, gpr.R(a), R(ABI_PARAM2));
|
||||
gpr.UnlockAll();
|
||||
gpr.UnlockAllX();
|
||||
fpr.UnlockAll();
|
||||
}
|
||||
else if (stType == QUANTIZE_S16)
|
||||
{
|
||||
gpr.FlushLockX(ABI_PARAM1, ABI_PARAM2);
|
||||
gpr.Lock(a);
|
||||
fpr.Lock(s);
|
||||
if (update)
|
||||
gpr.LoadToX64(a, true, update);
|
||||
MOV(32, R(ABI_PARAM2), gpr.R(a));
|
||||
if (offset)
|
||||
ADD(32, R(ABI_PARAM2), Imm32((u32)offset));
|
||||
if (update)
|
||||
MOV(32, gpr.R(a), R(ABI_PARAM2));
|
||||
MOVAPD(XMM0, fpr.R(s));
|
||||
MOVDDUP(XMM1, M((void*)&m_quantizeTableD[stScale]));
|
||||
MULPD(XMM0, R(XMM1));
|
||||
SHUFPD(XMM0, R(XMM0), 1);
|
||||
CVTPD2DQ(XMM0, R(XMM0));
|
||||
PACKSSDW(XMM0, R(XMM0));
|
||||
MOVD_xmm(M(&temp64), XMM0);
|
||||
MOV(32, R(ABI_PARAM1), M(&temp64));
|
||||
BSWAP(32, ABI_PARAM1);
|
||||
#ifdef _M_X64
|
||||
MOV(32, MComplex(RBX, ABI_PARAM2, SCALE_1, 0), R(ABI_PARAM1));
|
||||
#else
|
||||
MOV(32, R(EAX), R(ABI_PARAM2));
|
||||
AND(32, R(EAX), Imm32(Memory::MEMVIEW32_MASK));
|
||||
MOV(32, MDisp(EAX, (u32)Memory::base), R(ABI_PARAM1));
|
||||
#endif
|
||||
gpr.UnlockAll();
|
||||
gpr.UnlockAllX();
|
||||
fpr.UnlockAll();
|
||||
}
|
||||
else {
|
||||
// Dodger uses this.
|
||||
// mario tennis
|
||||
//PanicAlert("st %i:%i", stType, inst.W);
|
||||
Default(inst);
|
||||
}
|
||||
Default(inst); return;
|
||||
}
|
||||
|
||||
void Jit64::psq_l(UGeckoInstruction inst)
|
||||
{
|
||||
if(Core::g_CoreStartupParameter.bJITOff || Core::g_CoreStartupParameter.bJITLoadStorePairedOff)
|
||||
{Default(inst); return;} // turn off from debugger
|
||||
INSTRUCTION_START;
|
||||
|
||||
js.block_flags |= BLOCK_USE_GQR0 << inst.I;
|
||||
|
||||
if (js.blockSetsQuantizers || !Core::GetStartupParameter().bOptimizeQuantizers)
|
||||
{
|
||||
Default(inst);
|
||||
return;
|
||||
}
|
||||
|
||||
const UGQR gqr(rSPR(SPR_GQR0 + inst.I));
|
||||
const EQuantizeType ldType = static_cast<EQuantizeType>(gqr.LD_TYPE);
|
||||
int ldScale = gqr.LD_SCALE;
|
||||
bool update = inst.OPCD == 57;
|
||||
if (!inst.RA || inst.W)
|
||||
{
|
||||
// 0 1 during load
|
||||
//PanicAlert("ld:%i %i", ldType, (int)inst.W);
|
||||
Default(inst);
|
||||
return;
|
||||
}
|
||||
int offset = inst.SIMM_12;
|
||||
switch (ldType) {
|
||||
case QUANTIZE_FLOAT: // We know this is from RAM, so we don't need to check the address.
|
||||
{
|
||||
#ifdef _M_X64
|
||||
gpr.LoadToX64(inst.RA, true, update);
|
||||
fpr.LoadToX64(inst.RS, false);
|
||||
if (cpu_info.bSSSE3) {
|
||||
X64Reg xd = fpr.R(inst.RS).GetSimpleReg();
|
||||
MOVQ_xmm(xd, MComplex(RBX, gpr.R(inst.RA).GetSimpleReg(), 1, offset));
|
||||
PSHUFB(xd, M((void *)pbswapShuffle2x4));
|
||||
CVTPS2PD(xd, R(xd));
|
||||
} else {
|
||||
MOV(64, R(RAX), MComplex(RBX, gpr.R(inst.RA).GetSimpleReg(), 1, offset));
|
||||
BSWAP(64, RAX);
|
||||
MOV(64, M(&psTemp[0]), R(RAX));
|
||||
X64Reg r = fpr.R(inst.RS).GetSimpleReg();
|
||||
CVTPS2PD(r, M(&psTemp[0]));
|
||||
SHUFPD(r, R(r), 1);
|
||||
}
|
||||
if (update && offset != 0)
|
||||
ADD(32, gpr.R(inst.RA), Imm32(offset));
|
||||
break;
|
||||
#else
|
||||
if (cpu_info.bSSSE3) {
|
||||
gpr.LoadToX64(inst.RA, true, update);
|
||||
fpr.LoadToX64(inst.RS, false);
|
||||
X64Reg xd = fpr.R(inst.RS).GetSimpleReg();
|
||||
MOV(32, R(EAX), gpr.R(inst.RA));
|
||||
AND(32, R(EAX), Imm32(Memory::MEMVIEW32_MASK));
|
||||
MOVQ_xmm(xd, MDisp(EAX, (u32)Memory::base + offset));
|
||||
PSHUFB(xd, M((void *)pbswapShuffle2x4));
|
||||
CVTPS2PD(xd, R(xd));
|
||||
} else {
|
||||
gpr.FlushLockX(ECX);
|
||||
gpr.LoadToX64(inst.RA, true, update);
|
||||
// This can probably be optimized somewhat.
|
||||
LEA(32, ECX, MDisp(gpr.R(inst.RA).GetSimpleReg(), offset));
|
||||
AND(32, R(ECX), Imm32(Memory::MEMVIEW32_MASK));
|
||||
MOV(32, R(EAX), MDisp(ECX, (u32)Memory::base));
|
||||
BSWAP(32, RAX);
|
||||
MOV(32, M(&psTemp[0]), R(RAX));
|
||||
MOV(32, R(EAX), MDisp(ECX, (u32)Memory::base + 4));
|
||||
BSWAP(32, RAX);
|
||||
MOV(32, M(((float *)&psTemp[0]) + 1), R(RAX));
|
||||
fpr.LoadToX64(inst.RS, false, true);
|
||||
X64Reg r = fpr.R(inst.RS).GetSimpleReg();
|
||||
CVTPS2PD(r, M(&psTemp[0]));
|
||||
gpr.UnlockAllX();
|
||||
}
|
||||
if (update && offset != 0)
|
||||
ADD(32, gpr.R(inst.RA), Imm32(offset));
|
||||
break;
|
||||
#endif
|
||||
}
|
||||
case QUANTIZE_U8:
|
||||
{
|
||||
gpr.LoadToX64(inst.RA, true, update);
|
||||
#ifdef _M_X64
|
||||
MOVZX(32, 16, EAX, MComplex(RBX, gpr.R(inst.RA).GetSimpleReg(), 1, offset));
|
||||
#else
|
||||
LEA(32, EAX, MDisp(gpr.R(inst.RA).GetSimpleReg(), offset));
|
||||
AND(32, R(EAX), Imm32(Memory::MEMVIEW32_MASK));
|
||||
MOVZX(32, 16, EAX, MDisp(EAX, (u32)Memory::base));
|
||||
#endif
|
||||
MOV(32, M(&temp64), R(EAX));
|
||||
MOVD_xmm(XMM0, M(&temp64));
|
||||
// SSE4 optimization opportunity here.
|
||||
PXOR(XMM1, R(XMM1));
|
||||
PUNPCKLBW(XMM0, R(XMM1));
|
||||
PUNPCKLWD(XMM0, R(XMM1));
|
||||
CVTDQ2PD(XMM0, R(XMM0));
|
||||
fpr.LoadToX64(inst.RS, false, true);
|
||||
X64Reg r = fpr.R(inst.RS).GetSimpleReg();
|
||||
MOVDDUP(r, M((void *)&m_dequantizeTableD[ldScale]));
|
||||
MULPD(r, R(XMM0));
|
||||
if (update && offset != 0)
|
||||
ADD(32, gpr.R(inst.RA), Imm32(offset));
|
||||
}
|
||||
break;
|
||||
case QUANTIZE_S16:
|
||||
{
|
||||
gpr.LoadToX64(inst.RA, true, update);
|
||||
#ifdef _M_X64
|
||||
MOV(32, R(EAX), MComplex(RBX, gpr.R(inst.RA).GetSimpleReg(), 1, offset));
|
||||
#else
|
||||
LEA(32, EAX, MDisp(gpr.R(inst.RA).GetSimpleReg(), offset));
|
||||
AND(32, R(EAX), Imm32(Memory::MEMVIEW32_MASK));
|
||||
MOV(32, R(EAX), MDisp(EAX, (u32)Memory::base));
|
||||
#endif
|
||||
BSWAP(32, EAX);
|
||||
MOV(32, M(&temp64), R(EAX));
|
||||
fpr.LoadToX64(inst.RS, false, true);
|
||||
X64Reg r = fpr.R(inst.RS).GetSimpleReg();
|
||||
MOVD_xmm(XMM0, M(&temp64));
|
||||
PUNPCKLWD(XMM0, R(XMM0)); // unpack to higher word in each dword..
|
||||
PSRAD(XMM0, 16); // then use this signed shift to sign extend. clever eh? :P
|
||||
CVTDQ2PD(XMM0, R(XMM0));
|
||||
MOVDDUP(r, M((void*)&m_dequantizeTableD[ldScale]));
|
||||
MULPD(r, R(XMM0));
|
||||
SHUFPD(r, R(r), 1);
|
||||
if (update && offset != 0)
|
||||
ADD(32, gpr.R(inst.RA), Imm32(offset));
|
||||
}
|
||||
break;
|
||||
|
||||
/*
|
||||
Dynamic quantizer. Todo when we have a test set.
|
||||
MOVZX(32, 8, EAX, M(((char *)&PowerPC::ppcState.spr[SPR_GQR0 + inst.I]) + 3)); // it's in the high byte.
|
||||
AND(32, R(EAX), Imm8(0x3F));
|
||||
MOV(32, R(ECX), Imm32((u32)&m_dequantizeTableD));
|
||||
MOVDDUP(r, MComplex(RCX, EAX, 8, 0));
|
||||
*/
|
||||
default:
|
||||
// 4 0
|
||||
// 6 0 //power tennis
|
||||
// 5 0
|
||||
// PanicAlert("ld:%i %i", ldType, (int)inst.W);
|
||||
Default(inst);
|
||||
return;
|
||||
}
|
||||
|
||||
//u32 EA = (m_GPR[_inst.RA] + _inst.SIMM_12) : _inst.SIMM_12;
|
||||
if (inst.W) {Default(inst); return;}
|
||||
IREmitter::InstLoc addr = ibuild.EmitIntConst(inst.SIMM_12), val;
|
||||
if (inst.RA)
|
||||
addr = ibuild.EmitAdd(addr, ibuild.EmitLoadGReg(inst.RA));
|
||||
val = ibuild.EmitLoadPaired(addr, inst.I);
|
||||
val = ibuild.EmitExpandPackedToMReg(val);
|
||||
ibuild.EmitStoreFReg(val, inst.RD);
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue