A bit more progress on my JIT WIP: the biggest change is some substantial
work on floating-point.

git-svn-id: https://dolphin-emu.googlecode.com/svn/trunk@1743 8ced0084-cf51-0410-be5f-012b33b47a6e
This commit is contained in:
parent
35128bb041
commit
29a033e1dd
|
@ -97,8 +97,9 @@ Inter-block dead condition register elimination (Likely significant win
|
||||||
Optimize conditions for conditional branches.
|
Optimize conditions for conditional branches.
|
||||||
General dead register elimination.
|
General dead register elimination.
|
||||||
Inter-block inlining.
|
Inter-block inlining.
|
||||||
Track down a few correctness bugs (I think there's something wrong
|
Track down issues with new JIT + dual-core mode (I think I'm going to
|
||||||
with my branches, but I haven't been able to figure it out).
|
need help with this one; I'm not very familiar with the
|
||||||
|
dual-core code.)
|
||||||
Specialized slw/srw/sraw; I think there are some tricks that could
|
Specialized slw/srw/sraw; I think there are some tricks that could
|
||||||
have a non-trivial effect, and there are significantly shorter
|
have a non-trivial effect, and there are significantly shorter
|
||||||
implementations for 64-bit involving abusing 64-bit shifts.
|
implementations for 64-bit involving abusing 64-bit shifts.
|
||||||
|
@ -502,16 +503,21 @@ struct RegInfo {
|
||||||
InstLoc FirstI;
|
InstLoc FirstI;
|
||||||
std::vector<unsigned> IInfo;
|
std::vector<unsigned> IInfo;
|
||||||
InstLoc regs[16];
|
InstLoc regs[16];
|
||||||
|
InstLoc fregs[16];
|
||||||
unsigned numSpills;
|
unsigned numSpills;
|
||||||
|
unsigned numFSpills;
|
||||||
bool MakeProfile;
|
bool MakeProfile;
|
||||||
bool UseProfile;
|
bool UseProfile;
|
||||||
unsigned numProfiledLoads;
|
unsigned numProfiledLoads;
|
||||||
unsigned exitNumber;
|
unsigned exitNumber;
|
||||||
|
|
||||||
RegInfo(Jit64* j, InstLoc f, unsigned insts) : Jit(j), FirstI(f), IInfo(insts) {
|
RegInfo(Jit64* j, InstLoc f, unsigned insts) : Jit(j), FirstI(f), IInfo(insts) {
|
||||||
for (unsigned i = 0; i < 16; i++)
|
for (unsigned i = 0; i < 16; i++) {
|
||||||
regs[i] = 0;
|
regs[i] = 0;
|
||||||
|
fregs[i] = 0;
|
||||||
|
}
|
||||||
numSpills = 0;
|
numSpills = 0;
|
||||||
|
numFSpills = 0;
|
||||||
numProfiledLoads = 0;
|
numProfiledLoads = 0;
|
||||||
exitNumber = 0;
|
exitNumber = 0;
|
||||||
MakeProfile = UseProfile = false;
|
MakeProfile = UseProfile = false;
|
||||||
|
@ -533,6 +539,7 @@ static unsigned regReadUse(RegInfo& R, InstLoc I) {
|
||||||
|
|
||||||
static unsigned SlotSet[1000];
|
static unsigned SlotSet[1000];
|
||||||
static unsigned ProfiledLoads[1000];
|
static unsigned ProfiledLoads[1000];
|
||||||
|
static u8 GC_ALIGNED16(FSlotSet[16*1000]);
|
||||||
|
|
||||||
static OpArg regLocForSlot(RegInfo& RI, unsigned slot) {
|
static OpArg regLocForSlot(RegInfo& RI, unsigned slot) {
|
||||||
return M(&SlotSet[slot - 1]);
|
return M(&SlotSet[slot - 1]);
|
||||||
|
@ -558,57 +565,86 @@ static void regSpill(RegInfo& RI, X64Reg reg) {
|
||||||
RI.regs[reg] = 0;
|
RI.regs[reg] = 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Returns the memory operand for floating-point spill slot `slot`.
// Each slot is 16 bytes wide inside the 16-byte-aligned FSlotSet buffer.
// NOTE(review): regLocForSlot indexes with `slot - 1`, but this uses `slot`
// directly; since slots are numbered from 1 (see fregCreateSpill), slot 1000
// would address past the end of FSlotSet[16*1000] -- confirm this is intended.
static OpArg fregLocForSlot(RegInfo& RI, unsigned slot) {
|
||||||
|
return M(&FSlotSet[slot*16]);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Allocates a new floating-point spill slot for instruction I.
// The slot number (pre-incremented RI.numFSpills, so the first slot is 1) is
// OR-ed into bits 16 and up of the instruction's IInfo entry and returned.
static unsigned fregCreateSpill(RegInfo& RI, InstLoc I) {
|
||||||
|
unsigned newSpill = ++RI.numFSpills;
|
||||||
|
RI.IInfo[I - RI.FirstI] |= newSpill << 16;
|
||||||
|
return newSpill;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Returns the floating-point spill slot recorded for instruction I (the bits
// of its IInfo entry above bit 15). Callers treat 0 as "no slot assigned".
static unsigned fregGetSpill(RegInfo& RI, InstLoc I) {
|
||||||
|
return RI.IInfo[I - RI.FirstI] >> 16;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Frees XMM register `reg` for reuse.
// If the register holds no instruction value, nothing happens. Otherwise the
// value is written to a spill slot with MOVAPD -- but only when the
// instruction has no slot yet; an existing slot is not rewritten -- and the
// register is then marked free.
static void fregSpill(RegInfo& RI, X64Reg reg) {
|
||||||
|
if (!RI.fregs[reg]) return;
|
||||||
|
unsigned slot = fregGetSpill(RI, RI.fregs[reg]);
|
||||||
|
if (!slot) {
|
||||||
|
slot = fregCreateSpill(RI, RI.fregs[reg]);
|
||||||
|
RI.Jit->MOVAPD(fregLocForSlot(RI, slot), reg);
|
||||||
|
}
|
||||||
|
RI.fregs[reg] = 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Register allocation tables, scanned in order by the allocator helpers
// (regFindFreeReg, regLocForInst, regClearInst and their freg counterparts).
// ECX is scratch, so we don't allocate it
|
||||||
|
static X64Reg RegAllocOrder[] = {EDI, ESI, EBP, EBX, EDX, EAX};
|
||||||
|
static unsigned RegAllocSize = sizeof(RegAllocOrder) / sizeof(X64Reg);
|
||||||
|
// Floating-point allocation order; XMM0 and XMM1 are left out of the table.
static X64Reg FRegAllocOrder[] = {XMM2, XMM3, XMM4, XMM5, XMM6, XMM7};
|
||||||
|
static unsigned FRegAllocSize = sizeof(FRegAllocOrder) / sizeof(X64Reg);
|
||||||
|
|
||||||
static X64Reg regFindFreeReg(RegInfo& RI) {
|
static X64Reg regFindFreeReg(RegInfo& RI) {
|
||||||
if (RI.regs[EDI] == 0) return EDI;
|
for (unsigned i = 0; i < RegAllocSize; i++)
|
||||||
if (RI.regs[ESI] == 0) return ESI;
|
if (RI.regs[RegAllocOrder[i]] == 0)
|
||||||
if (RI.regs[EBP] == 0) return EBP;
|
return RegAllocOrder[i];
|
||||||
if (RI.regs[EBX] == 0) return EBX;
|
|
||||||
if (RI.regs[EDX] == 0) return EDX;
|
|
||||||
if (RI.regs[EAX] == 0) return EAX;
|
|
||||||
// ECX is scratch, so we don't allocate it
|
|
||||||
static X64Reg regs[] = {EDI, ESI, EBP, EBX, EDX, EAX};
|
|
||||||
static unsigned nextReg = 0;
|
static unsigned nextReg = 0;
|
||||||
X64Reg reg = regs[nextReg++ % 6];
|
X64Reg reg = RegAllocOrder[nextReg++ % RegAllocSize];
|
||||||
regSpill(RI, reg);
|
regSpill(RI, reg);
|
||||||
return reg;
|
return reg;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Returns an XMM register that is free to receive a new value.
// The first unoccupied register in FRegAllocOrder wins. If every register in
// the table is occupied, XMM7 is spilled and returned (there is no rotation
// of the victim, unlike the nextReg counter used by regFindFreeReg).
static X64Reg fregFindFreeReg(RegInfo& RI) {
|
||||||
|
for (unsigned i = 0; i < FRegAllocSize; i++)
|
||||||
|
if (RI.fregs[FRegAllocOrder[i]] == 0)
|
||||||
|
return FRegAllocOrder[i];
|
||||||
|
// XMM0/1 are scratch, so we don't allocate them
|
||||||
|
fregSpill(RI, XMM7);
|
||||||
|
return XMM7;
|
||||||
|
}
|
||||||
|
|
||||||
static OpArg regLocForInst(RegInfo& RI, InstLoc I) {
|
static OpArg regLocForInst(RegInfo& RI, InstLoc I) {
|
||||||
if (RI.regs[EDI] == I) return R(EDI);
|
for (unsigned i = 0; i < RegAllocSize; i++)
|
||||||
if (RI.regs[ESI] == I) return R(ESI);
|
if (RI.regs[RegAllocOrder[i]] == I)
|
||||||
if (RI.regs[EBP] == I) return R(EBP);
|
return R(RegAllocOrder[i]);
|
||||||
if (RI.regs[EBX] == I) return R(EBX);
|
|
||||||
if (RI.regs[EDX] == I) return R(EDX);
|
|
||||||
if (RI.regs[EAX] == I) return R(EAX);
|
|
||||||
if (RI.regs[ECX] == I) return R(ECX);
|
|
||||||
|
|
||||||
if (regGetSpill(RI, I) == 0)
|
if (regGetSpill(RI, I) == 0)
|
||||||
PanicAlert("Retrieving unknown spill slot?!");
|
PanicAlert("Retrieving unknown spill slot?!");
|
||||||
return regLocForSlot(RI, regGetSpill(RI, I));
|
return regLocForSlot(RI, regGetSpill(RI, I));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Returns the operand where the floating-point value of instruction I lives:
// the XMM register currently holding it, or otherwise its spill slot.
// If the value is in no register and has no spill slot, a PanicAlert is
// raised and execution continues with slot 0.
static OpArg fregLocForInst(RegInfo& RI, InstLoc I) {
|
||||||
|
for (unsigned i = 0; i < FRegAllocSize; i++)
|
||||||
|
if (RI.fregs[FRegAllocOrder[i]] == I)
|
||||||
|
return R(FRegAllocOrder[i]);
|
||||||
|
|
||||||
|
if (fregGetSpill(RI, I) == 0)
|
||||||
|
PanicAlert("Retrieving unknown spill slot?!");
|
||||||
|
return fregLocForSlot(RI, fregGetSpill(RI, I));
|
||||||
|
}
|
||||||
|
|
||||||
static void regClearInst(RegInfo& RI, InstLoc I) {
|
static void regClearInst(RegInfo& RI, InstLoc I) {
|
||||||
if (RI.regs[EDI] == I) {
|
for (unsigned i = 0; i < RegAllocSize; i++)
|
||||||
RI.regs[EDI] = 0;
|
if (RI.regs[RegAllocOrder[i]] == I)
|
||||||
}
|
RI.regs[RegAllocOrder[i]] = 0;
|
||||||
if (RI.regs[ESI] == I) {
|
}
|
||||||
RI.regs[ESI] = 0;
|
|
||||||
}
|
static void fregClearInst(RegInfo& RI, InstLoc I) {
|
||||||
if (RI.regs[EBP] == I) {
|
for (unsigned i = 0; i < FRegAllocSize; i++)
|
||||||
RI.regs[EBP] = 0;
|
if (RI.fregs[FRegAllocOrder[i]] == I)
|
||||||
}
|
RI.fregs[FRegAllocOrder[i]] = 0;
|
||||||
if (RI.regs[EBX] == I) {
|
|
||||||
RI.regs[EBX] = 0;
|
|
||||||
}
|
|
||||||
if (RI.regs[EDX] == I) {
|
|
||||||
RI.regs[EDX] = 0;
|
|
||||||
}
|
|
||||||
if (RI.regs[EAX] == I) {
|
|
||||||
RI.regs[EAX] = 0;
|
|
||||||
}
|
|
||||||
if (RI.regs[ECX] == I) {
|
|
||||||
RI.regs[ECX] = 0;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
static X64Reg regEnsureInReg(RegInfo& RI, InstLoc I) {
|
static X64Reg regEnsureInReg(RegInfo& RI, InstLoc I) {
|
||||||
|
@ -645,6 +681,20 @@ static X64Reg regBinLHSReg(RegInfo& RI, InstLoc I) {
|
||||||
return reg;
|
return reg;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Releases the integer registers of I's operands when I's IInfo flags say so:
// flag bit 4 releases operand 1, flag bit 8 releases operand 2.
// NOTE(review): these bits presumably mark the last use of each operand, as
// set by the liveness pass -- confirm against regMarkUse.
static void regNormalRegClear(RegInfo& RI, InstLoc I) {
|
||||||
|
if (RI.IInfo[I - RI.FirstI] & 4)
|
||||||
|
regClearInst(RI, getOp1(I));
|
||||||
|
if (RI.IInfo[I - RI.FirstI] & 8)
|
||||||
|
regClearInst(RI, getOp2(I));
|
||||||
|
}
|
||||||
|
|
||||||
|
// Floating-point counterpart of regNormalRegClear: releases the XMM registers
// of I's operands when I's IInfo flags say so (bit 4 for operand 1, bit 8 for
// operand 2).
static void fregNormalRegClear(RegInfo& RI, InstLoc I) {
|
||||||
|
if (RI.IInfo[I - RI.FirstI] & 4)
|
||||||
|
fregClearInst(RI, getOp1(I));
|
||||||
|
if (RI.IInfo[I - RI.FirstI] & 8)
|
||||||
|
fregClearInst(RI, getOp2(I));
|
||||||
|
}
|
||||||
|
|
||||||
static void regEmitBinInst(RegInfo& RI, InstLoc I,
|
static void regEmitBinInst(RegInfo& RI, InstLoc I,
|
||||||
void (Jit64::*op)(int, const OpArg&,
|
void (Jit64::*op)(int, const OpArg&,
|
||||||
const OpArg&)) {
|
const OpArg&)) {
|
||||||
|
@ -660,11 +710,11 @@ static void regEmitBinInst(RegInfo& RI, InstLoc I,
|
||||||
(RI.Jit->*op)(32, R(reg), regLocForInst(RI, getOp2(I)));
|
(RI.Jit->*op)(32, R(reg), regLocForInst(RI, getOp2(I)));
|
||||||
}
|
}
|
||||||
RI.regs[reg] = I;
|
RI.regs[reg] = I;
|
||||||
|
regNormalRegClear(RI, I);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Mark and calculation routines for profiled load/store addresses
|
// Mark and calculation routines for profiled load/store addresses
|
||||||
// Could be extended to unprofiled addresses.
|
// Could be extended to unprofiled addresses.
|
||||||
// FIXME: Finish/activate!
|
|
||||||
static void regMarkMemAddress(RegInfo& RI, InstLoc I, InstLoc AI, unsigned OpNum) {
|
static void regMarkMemAddress(RegInfo& RI, InstLoc I, InstLoc AI, unsigned OpNum) {
|
||||||
if (isImm(*AI)) {
|
if (isImm(*AI)) {
|
||||||
unsigned addr = RI.Build->GetImmValue(AI);
|
unsigned addr = RI.Build->GetImmValue(AI);
|
||||||
|
@ -743,7 +793,6 @@ static OpArg regBuildMemAddress(RegInfo& RI, InstLoc I, InstLoc AI,
|
||||||
}
|
}
|
||||||
return MDisp(baseReg, offset);
|
return MDisp(baseReg, offset);
|
||||||
}
|
}
|
||||||
// end FIXME
|
|
||||||
|
|
||||||
static void regEmitMemLoad(RegInfo& RI, InstLoc I, unsigned Size) {
|
static void regEmitMemLoad(RegInfo& RI, InstLoc I, unsigned Size) {
|
||||||
if (RI.UseProfile) {
|
if (RI.UseProfile) {
|
||||||
|
@ -844,7 +893,6 @@ static void regEmitMemStore(RegInfo& RI, InstLoc I, unsigned Size) {
|
||||||
RI.Jit->js.fifoBytesThisBlock += Size >> 3;
|
RI.Jit->js.fifoBytesThisBlock += Size >> 3;
|
||||||
if (RI.IInfo[I - RI.FirstI] & 4)
|
if (RI.IInfo[I - RI.FirstI] & 4)
|
||||||
regClearInst(RI, getOp1(I));
|
regClearInst(RI, getOp1(I));
|
||||||
//regBuildMemAddress(RI, I, getOp2(I), 2, Size, 0, false);
|
|
||||||
regClearDeadMemAddress(RI, I, getOp2(I), 2);
|
regClearDeadMemAddress(RI, I, getOp2(I), 2);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
@ -878,6 +926,7 @@ static void regEmitShiftInst(RegInfo& RI, InstLoc I,
|
||||||
RI.Jit->MOV(32, R(ECX), regLocForInst(RI, getOp2(I)));
|
RI.Jit->MOV(32, R(ECX), regLocForInst(RI, getOp2(I)));
|
||||||
(RI.Jit->*op)(32, R(reg), R(ECX));
|
(RI.Jit->*op)(32, R(reg), R(ECX));
|
||||||
RI.regs[reg] = I;
|
RI.regs[reg] = I;
|
||||||
|
regNormalRegClear(RI, I);
|
||||||
}
|
}
|
||||||
|
|
||||||
static void regStoreInstToConstLoc(RegInfo& RI, unsigned width, InstLoc I,
|
static void regStoreInstToConstLoc(RegInfo& RI, unsigned width, InstLoc I,
|
||||||
|
@ -930,7 +979,7 @@ static void DoWriteCode(IRBuilder* ibuild, Jit64* Jit, bool UseProfile) {
|
||||||
RegInfo RI(Jit, ibuild->getFirstInst(), ibuild->getNumInsts());
|
RegInfo RI(Jit, ibuild->getFirstInst(), ibuild->getNumInsts());
|
||||||
RI.Build = ibuild;
|
RI.Build = ibuild;
|
||||||
RI.UseProfile = UseProfile;
|
RI.UseProfile = UseProfile;
|
||||||
RI.MakeProfile = !RI.UseProfile;
|
RI.MakeProfile = false;//!RI.UseProfile;
|
||||||
// Pass to compute liveness
|
// Pass to compute liveness
|
||||||
ibuild->StartBackPass();
|
ibuild->StartBackPass();
|
||||||
for (unsigned index = RI.IInfo.size() - 1; index != -1U; --index) {
|
for (unsigned index = RI.IInfo.size() - 1; index != -1U; --index) {
|
||||||
|
@ -949,12 +998,14 @@ static void DoWriteCode(IRBuilder* ibuild, Jit64* Jit, bool UseProfile) {
|
||||||
case LoadCarry:
|
case LoadCarry:
|
||||||
case LoadCTR:
|
case LoadCTR:
|
||||||
case LoadMSR:
|
case LoadMSR:
|
||||||
|
case LoadFReg:
|
||||||
case BlockEnd:
|
case BlockEnd:
|
||||||
case BlockStart:
|
case BlockStart:
|
||||||
case InterpreterFallback:
|
case InterpreterFallback:
|
||||||
case SystemCall:
|
case SystemCall:
|
||||||
case RFIExit:
|
case RFIExit:
|
||||||
case InterpreterBranch:
|
case InterpreterBranch:
|
||||||
|
case IdleLoop:
|
||||||
// No liveness effects
|
// No liveness effects
|
||||||
break;
|
break;
|
||||||
case Tramp:
|
case Tramp:
|
||||||
|
@ -965,6 +1016,9 @@ static void DoWriteCode(IRBuilder* ibuild, Jit64* Jit, bool UseProfile) {
|
||||||
case SExt16:
|
case SExt16:
|
||||||
case BSwap32:
|
case BSwap32:
|
||||||
case BSwap16:
|
case BSwap16:
|
||||||
|
case DupSingleToMReg:
|
||||||
|
case DoubleToSingle:
|
||||||
|
case ExpandPackedToMReg:
|
||||||
if (thisUsed)
|
if (thisUsed)
|
||||||
regMarkUse(RI, I, getOp1(I), 1);
|
regMarkUse(RI, I, getOp1(I), 1);
|
||||||
break;
|
break;
|
||||||
|
@ -973,6 +1027,10 @@ static void DoWriteCode(IRBuilder* ibuild, Jit64* Jit, bool UseProfile) {
|
||||||
case Load32:
|
case Load32:
|
||||||
regMarkMemAddress(RI, I, getOp1(I), 1);
|
regMarkMemAddress(RI, I, getOp1(I), 1);
|
||||||
break;
|
break;
|
||||||
|
case LoadSingle:
|
||||||
|
case LoadPaired:
|
||||||
|
regMarkUse(RI, I, getOp1(I), 1);
|
||||||
|
break;
|
||||||
case StoreCR:
|
case StoreCR:
|
||||||
case StoreCarry:
|
case StoreCarry:
|
||||||
regMarkUse(RI, I, getOp1(I), 1);
|
regMarkUse(RI, I, getOp1(I), 1);
|
||||||
|
@ -981,6 +1039,7 @@ static void DoWriteCode(IRBuilder* ibuild, Jit64* Jit, bool UseProfile) {
|
||||||
case StoreLink:
|
case StoreLink:
|
||||||
case StoreCTR:
|
case StoreCTR:
|
||||||
case StoreMSR:
|
case StoreMSR:
|
||||||
|
case StoreFReg:
|
||||||
if (!isImm(*getOp1(I)))
|
if (!isImm(*getOp1(I)))
|
||||||
regMarkUse(RI, I, getOp1(I), 1);
|
regMarkUse(RI, I, getOp1(I), 1);
|
||||||
break;
|
break;
|
||||||
|
@ -1000,6 +1059,9 @@ static void DoWriteCode(IRBuilder* ibuild, Jit64* Jit, bool UseProfile) {
|
||||||
case ICmpUgt:
|
case ICmpUgt:
|
||||||
case ICmpSle:
|
case ICmpSle:
|
||||||
case ICmpSgt:
|
case ICmpSgt:
|
||||||
|
case FSMul:
|
||||||
|
case FSAdd:
|
||||||
|
case InsertDoubleInMReg:
|
||||||
if (thisUsed) {
|
if (thisUsed) {
|
||||||
regMarkUse(RI, I, getOp1(I), 1);
|
regMarkUse(RI, I, getOp1(I), 1);
|
||||||
if (!isImm(*getOp2(I)))
|
if (!isImm(*getOp2(I)))
|
||||||
|
@ -1041,6 +1103,9 @@ static void DoWriteCode(IRBuilder* ibuild, Jit64* Jit, bool UseProfile) {
|
||||||
case InterpreterFallback: {
|
case InterpreterFallback: {
|
||||||
unsigned InstCode = ibuild->GetImmValue(getOp1(I));
|
unsigned InstCode = ibuild->GetImmValue(getOp1(I));
|
||||||
unsigned InstLoc = ibuild->GetImmValue(getOp2(I));
|
unsigned InstLoc = ibuild->GetImmValue(getOp2(I));
|
||||||
|
// There really shouldn't be anything live across an
|
||||||
|
// interpreter call at the moment, but optimizing interpreter
|
||||||
|
// calls isn't completely out of the question...
|
||||||
regSpillCallerSaved(RI);
|
regSpillCallerSaved(RI);
|
||||||
Jit->MOV(32, M(&PC), Imm32(InstLoc));
|
Jit->MOV(32, M(&PC), Imm32(InstLoc));
|
||||||
Jit->MOV(32, M(&NPC), Imm32(InstLoc+4));
|
Jit->MOV(32, M(&NPC), Imm32(InstLoc+4));
|
||||||
|
@ -1089,6 +1154,7 @@ static void DoWriteCode(IRBuilder* ibuild, Jit64* Jit, bool UseProfile) {
|
||||||
unsigned ppcreg = *I >> 16;
|
unsigned ppcreg = *I >> 16;
|
||||||
regStoreInstToConstLoc(RI, 32, getOp1(I),
|
regStoreInstToConstLoc(RI, 32, getOp1(I),
|
||||||
&PowerPC::ppcState.gpr[ppcreg]);
|
&PowerPC::ppcState.gpr[ppcreg]);
|
||||||
|
regNormalRegClear(RI, I);
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
case StoreCR: {
|
case StoreCR: {
|
||||||
|
@ -1096,18 +1162,22 @@ static void DoWriteCode(IRBuilder* ibuild, Jit64* Jit, bool UseProfile) {
|
||||||
unsigned ppcreg = *I >> 16;
|
unsigned ppcreg = *I >> 16;
|
||||||
// CAUTION: uses 8-bit reg!
|
// CAUTION: uses 8-bit reg!
|
||||||
Jit->MOV(8, M(&PowerPC::ppcState.cr_fast[ppcreg]), R(ECX));
|
Jit->MOV(8, M(&PowerPC::ppcState.cr_fast[ppcreg]), R(ECX));
|
||||||
|
regNormalRegClear(RI, I);
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
case StoreLink: {
|
case StoreLink: {
|
||||||
regStoreInstToConstLoc(RI, 32, getOp1(I), &LR);
|
regStoreInstToConstLoc(RI, 32, getOp1(I), &LR);
|
||||||
|
regNormalRegClear(RI, I);
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
case StoreCTR: {
|
case StoreCTR: {
|
||||||
regStoreInstToConstLoc(RI, 32, getOp1(I), &CTR);
|
regStoreInstToConstLoc(RI, 32, getOp1(I), &CTR);
|
||||||
|
regNormalRegClear(RI, I);
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
case StoreMSR: {
|
case StoreMSR: {
|
||||||
regStoreInstToConstLoc(RI, 32, getOp1(I), &MSR);
|
regStoreInstToConstLoc(RI, 32, getOp1(I), &MSR);
|
||||||
|
regNormalRegClear(RI, I);
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
case StoreCarry: {
|
case StoreCarry: {
|
||||||
|
@ -1118,6 +1188,7 @@ static void DoWriteCode(IRBuilder* ibuild, Jit64* Jit, bool UseProfile) {
|
||||||
Jit->SetJumpTarget(nocarry);
|
Jit->SetJumpTarget(nocarry);
|
||||||
Jit->JitClearCA();
|
Jit->JitClearCA();
|
||||||
Jit->SetJumpTarget(cont);
|
Jit->SetJumpTarget(cont);
|
||||||
|
regNormalRegClear(RI, I);
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
case Load8: {
|
case Load8: {
|
||||||
|
@ -1150,6 +1221,7 @@ static void DoWriteCode(IRBuilder* ibuild, Jit64* Jit, bool UseProfile) {
|
||||||
Jit->MOV(32, R(ECX), regLocForInst(RI, getOp1(I)));
|
Jit->MOV(32, R(ECX), regLocForInst(RI, getOp1(I)));
|
||||||
Jit->MOVSX(32, 8, reg, R(ECX));
|
Jit->MOVSX(32, 8, reg, R(ECX));
|
||||||
RI.regs[reg] = I;
|
RI.regs[reg] = I;
|
||||||
|
regNormalRegClear(RI, I);
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
case SExt16: {
|
case SExt16: {
|
||||||
|
@ -1157,6 +1229,7 @@ static void DoWriteCode(IRBuilder* ibuild, Jit64* Jit, bool UseProfile) {
|
||||||
X64Reg reg = regUReg(RI, I);
|
X64Reg reg = regUReg(RI, I);
|
||||||
Jit->MOVSX(32, 16, reg, regLocForInst(RI, getOp1(I)));
|
Jit->MOVSX(32, 16, reg, regLocForInst(RI, getOp1(I)));
|
||||||
RI.regs[reg] = I;
|
RI.regs[reg] = I;
|
||||||
|
regNormalRegClear(RI, I);
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
case And: {
|
case And: {
|
||||||
|
@ -1199,6 +1272,7 @@ static void DoWriteCode(IRBuilder* ibuild, Jit64* Jit, bool UseProfile) {
|
||||||
Jit->IMUL(32, reg, regLocForInst(RI, getOp2(I)));
|
Jit->IMUL(32, reg, regLocForInst(RI, getOp2(I)));
|
||||||
}
|
}
|
||||||
RI.regs[reg] = I;
|
RI.regs[reg] = I;
|
||||||
|
regNormalRegClear(RI, I);
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
case Rol: {
|
case Rol: {
|
||||||
|
@ -1228,6 +1302,7 @@ static void DoWriteCode(IRBuilder* ibuild, Jit64* Jit, bool UseProfile) {
|
||||||
X64Reg reg = regFindFreeReg(RI);
|
X64Reg reg = regFindFreeReg(RI);
|
||||||
Jit->MOVZX(32, 8, reg, R(ECX));
|
Jit->MOVZX(32, 8, reg, R(ECX));
|
||||||
RI.regs[reg] = I;
|
RI.regs[reg] = I;
|
||||||
|
regNormalRegClear(RI, I);
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
case ICmpUgt: {
|
case ICmpUgt: {
|
||||||
|
@ -1237,6 +1312,7 @@ static void DoWriteCode(IRBuilder* ibuild, Jit64* Jit, bool UseProfile) {
|
||||||
X64Reg reg = regFindFreeReg(RI);
|
X64Reg reg = regFindFreeReg(RI);
|
||||||
Jit->MOVZX(32, 8, reg, R(ECX));
|
Jit->MOVZX(32, 8, reg, R(ECX));
|
||||||
RI.regs[reg] = I;
|
RI.regs[reg] = I;
|
||||||
|
regNormalRegClear(RI, I);
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
case ICmpSle: {
|
case ICmpSle: {
|
||||||
|
@ -1246,6 +1322,7 @@ static void DoWriteCode(IRBuilder* ibuild, Jit64* Jit, bool UseProfile) {
|
||||||
X64Reg reg = regFindFreeReg(RI);
|
X64Reg reg = regFindFreeReg(RI);
|
||||||
Jit->MOVZX(32, 8, reg, R(ECX));
|
Jit->MOVZX(32, 8, reg, R(ECX));
|
||||||
RI.regs[reg] = I;
|
RI.regs[reg] = I;
|
||||||
|
regNormalRegClear(RI, I);
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
case ICmpCRUnsigned: {
|
case ICmpCRUnsigned: {
|
||||||
|
@ -1264,6 +1341,7 @@ static void DoWriteCode(IRBuilder* ibuild, Jit64* Jit, bool UseProfile) {
|
||||||
Jit->SetJumpTarget(continue1);
|
Jit->SetJumpTarget(continue1);
|
||||||
Jit->SetJumpTarget(continue2);
|
Jit->SetJumpTarget(continue2);
|
||||||
RI.regs[reg] = I;
|
RI.regs[reg] = I;
|
||||||
|
regNormalRegClear(RI, I);
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
case ICmpCRSigned: {
|
case ICmpCRSigned: {
|
||||||
|
@ -1282,6 +1360,102 @@ static void DoWriteCode(IRBuilder* ibuild, Jit64* Jit, bool UseProfile) {
|
||||||
Jit->SetJumpTarget(continue1);
|
Jit->SetJumpTarget(continue1);
|
||||||
Jit->SetJumpTarget(continue2);
|
Jit->SetJumpTarget(continue2);
|
||||||
RI.regs[reg] = I;
|
RI.regs[reg] = I;
|
||||||
|
regNormalRegClear(RI, I);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
case LoadSingle: {
|
||||||
|
if (!thisUsed) break;
|
||||||
|
X64Reg reg = fregFindFreeReg(RI);
|
||||||
|
Jit->MOV(32, R(ECX), regLocForInst(RI, getOp1(I)));
|
||||||
|
RI.Jit->UnsafeLoadRegToReg(ECX, ECX, 32, 0, false);
|
||||||
|
Jit->MOVD_xmm(reg, R(ECX));
|
||||||
|
RI.fregs[reg] = I;
|
||||||
|
regNormalRegClear(RI, I);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
case LoadPaired: {
|
||||||
|
if (!thisUsed) break;
|
||||||
|
regSpill(RI, EAX);
|
||||||
|
regSpill(RI, EDX);
|
||||||
|
X64Reg reg = fregFindFreeReg(RI);
|
||||||
|
unsigned quantreg = *I >> 16;
|
||||||
|
Jit->MOVZX(32, 16, EAX, M(((char *)&PowerPC::ppcState.spr[SPR_GQR0 + quantreg]) + 2));
|
||||||
|
Jit->MOVZX(32, 8, EDX, R(AL));
|
||||||
|
// FIXME: Fix ModR/M encoding to allow [EDX*4+disp32]!
|
||||||
|
Jit->SHL(32, R(EDX), Imm8(2));
|
||||||
|
Jit->MOV(32, R(ECX), regLocForInst(RI, getOp1(I)));
|
||||||
|
Jit->CALLptr(MDisp(EDX, (u32)asm_routines.pairedLoadQuantized));
|
||||||
|
Jit->MOVAPD(reg, R(XMM0));
|
||||||
|
RI.fregs[reg] = I;
|
||||||
|
regNormalRegClear(RI, I);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
case DupSingleToMReg: {
|
||||||
|
if (!thisUsed) break;
|
||||||
|
X64Reg reg = fregFindFreeReg(RI);
|
||||||
|
Jit->CVTSS2SD(reg, fregLocForInst(RI, getOp1(I)));
|
||||||
|
Jit->MOVDDUP(reg, R(reg));
|
||||||
|
RI.fregs[reg] = I;
|
||||||
|
fregNormalRegClear(RI, I);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
case InsertDoubleInMReg: {
|
||||||
|
if (!thisUsed) break;
|
||||||
|
X64Reg reg = fregFindFreeReg(RI);
|
||||||
|
Jit->MOVAPD(reg, fregLocForInst(RI, getOp2(I)));
|
||||||
|
Jit->MOVAPD(XMM0, fregLocForInst(RI, getOp1(I)));
|
||||||
|
Jit->MOVSD(reg, R(XMM0));
|
||||||
|
RI.fregs[reg] = I;
|
||||||
|
fregNormalRegClear(RI, I);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
case ExpandPackedToMReg: {
|
||||||
|
if (!thisUsed) break;
|
||||||
|
X64Reg reg = fregFindFreeReg(RI);
|
||||||
|
Jit->CVTPS2PD(reg, fregLocForInst(RI, getOp1(I)));
|
||||||
|
RI.fregs[reg] = I;
|
||||||
|
fregNormalRegClear(RI, I);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
case LoadFReg: {
|
||||||
|
if (!thisUsed) break;
|
||||||
|
X64Reg reg = fregFindFreeReg(RI);
|
||||||
|
unsigned ppcreg = *I >> 8;
|
||||||
|
Jit->MOVAPD(reg, M(&PowerPC::ppcState.ps[ppcreg]));
|
||||||
|
RI.fregs[reg] = I;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
case StoreFReg: {
|
||||||
|
unsigned ppcreg = *I >> 16;
|
||||||
|
Jit->MOVAPD(XMM0, fregLocForInst(RI, getOp1(I)));
|
||||||
|
Jit->MOVAPD(M(&PowerPC::ppcState.ps[ppcreg]), XMM0);
|
||||||
|
fregNormalRegClear(RI, I);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
case DoubleToSingle: {
|
||||||
|
if (!thisUsed) break;
|
||||||
|
X64Reg reg = fregFindFreeReg(RI);
|
||||||
|
Jit->CVTSD2SS(reg, fregLocForInst(RI, getOp1(I)));
|
||||||
|
RI.fregs[reg] = I;
|
||||||
|
fregNormalRegClear(RI, I);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
case FSMul: {
|
||||||
|
if (!thisUsed) break;
|
||||||
|
X64Reg reg = fregFindFreeReg(RI);
|
||||||
|
Jit->MOVAPD(reg, fregLocForInst(RI, getOp1(I)));
|
||||||
|
Jit->MULSS(reg, fregLocForInst(RI, getOp2(I)));
|
||||||
|
RI.fregs[reg] = I;
|
||||||
|
fregNormalRegClear(RI, I);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
case FSAdd: {
|
||||||
|
if (!thisUsed) break;
|
||||||
|
X64Reg reg = fregFindFreeReg(RI);
|
||||||
|
Jit->MOVAPD(reg, fregLocForInst(RI, getOp1(I)));
|
||||||
|
Jit->ADDSS(reg, fregLocForInst(RI, getOp2(I)));
|
||||||
|
RI.fregs[reg] = I;
|
||||||
|
fregNormalRegClear(RI, I);
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
case CInt32:
|
case CInt32:
|
||||||
|
@ -1328,6 +1502,15 @@ static void DoWriteCode(IRBuilder* ibuild, Jit64* Jit, bool UseProfile) {
|
||||||
}
|
}
|
||||||
case BranchUncond: {
|
case BranchUncond: {
|
||||||
regWriteExit(RI, getOp1(I));
|
regWriteExit(RI, getOp1(I));
|
||||||
|
regNormalRegClear(RI, I);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
case IdleLoop: {
|
||||||
|
unsigned IdleParam = ibuild->GetImmValue(getOp1(I));
|
||||||
|
unsigned InstLoc = ibuild->GetImmValue(getOp2(I));
|
||||||
|
Jit->ABI_CallFunctionC((void *)&PowerPC::OnIdle, IdleParam);
|
||||||
|
Jit->MOV(32, M(&PowerPC::ppcState.pc), Imm32(InstLoc + 12));
|
||||||
|
Jit->JMP(asm_routines.testExceptions, true);
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
case SystemCall: {
|
case SystemCall: {
|
||||||
|
@ -1378,26 +1561,16 @@ static void DoWriteCode(IRBuilder* ibuild, Jit64* Jit, bool UseProfile) {
|
||||||
PanicAlert("Unknown JIT instruction; aborting!");
|
PanicAlert("Unknown JIT instruction; aborting!");
|
||||||
exit(1);
|
exit(1);
|
||||||
}
|
}
|
||||||
if (getOpcode(*I) != Tramp &&
|
|
||||||
getOpcode(*I) != BranchCond &&
|
|
||||||
getOpcode(*I) != Load8 &&
|
|
||||||
getOpcode(*I) != Load16 &&
|
|
||||||
getOpcode(*I) != Load32 &&
|
|
||||||
getOpcode(*I) != Store8 &&
|
|
||||||
getOpcode(*I) != Store16 &&
|
|
||||||
getOpcode(*I) != Store32 &&
|
|
||||||
1) {
|
|
||||||
if (RI.IInfo[I - RI.FirstI] & 4)
|
|
||||||
regClearInst(RI, getOp1(I));
|
|
||||||
if (RI.IInfo[I - RI.FirstI] & 8)
|
|
||||||
regClearInst(RI, getOp2(I));
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
for (unsigned i = 0; i < 8; i++) {
|
for (unsigned i = 0; i < 8; i++) {
|
||||||
if (RI.regs[i]) {
|
if (RI.regs[i]) {
|
||||||
PanicAlert("Incomplete cleanup!");
|
PanicAlert("Incomplete cleanup!");
|
||||||
exit(1);
|
exit(1);
|
||||||
}
|
}
|
||||||
|
if (RI.fregs[i]) {
|
||||||
|
PanicAlert("Incomplete cleanup!");
|
||||||
|
exit(1);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (UseProfile && RI.numSpills)
|
if (UseProfile && RI.numSpills)
|
||||||
|
@ -1412,8 +1585,8 @@ void Jit64::WriteCode() {
|
||||||
|
|
||||||
void ProfiledReJit() {
|
void ProfiledReJit() {
|
||||||
u8* x = (u8*)jit.GetCodePtr();
|
u8* x = (u8*)jit.GetCodePtr();
|
||||||
jit.SetCodePtr(jit.js.normalEntry);
|
jit.SetCodePtr(jit.js.rewriteStart);
|
||||||
DoWriteCode(&jit.ibuild, &jit, true);
|
DoWriteCode(&jit.ibuild, &jit, true);
|
||||||
jit.js.curBlock->codeSize = jit.GetCodePtr() - jit.js.normalEntry;
|
jit.js.curBlock->codeSize = jit.GetCodePtr() - jit.js.rewriteStart;
|
||||||
jit.SetCodePtr(x);
|
jit.SetCodePtr(x);
|
||||||
}
|
}
|
||||||
|
|
|
@ -80,6 +80,7 @@ namespace IREmitter {
|
||||||
Store16,
|
Store16,
|
||||||
Store32,
|
Store32,
|
||||||
BranchCond,
|
BranchCond,
|
||||||
|
#if 0
|
||||||
// Floating-point
|
// Floating-point
|
||||||
// There are three floating-point formats: single, double,
|
// There are three floating-point formats: single, double,
|
||||||
// and packed. For any operation where the format of the
|
// and packed. For any operation where the format of the
|
||||||
|
@ -141,8 +142,18 @@ namespace IREmitter {
|
||||||
ForceToSingle,
|
ForceToSingle,
|
||||||
ForceToDouble,
|
ForceToDouble,
|
||||||
ForceToMReg,
|
ForceToMReg,
|
||||||
LoadFPReg,
|
#endif
|
||||||
StoreFPReg,
|
LoadSingle,
|
||||||
|
LoadDouble,
|
||||||
|
LoadPaired, // This handles quantizers itself
|
||||||
|
DoubleToSingle,
|
||||||
|
DupSingleToMReg,
|
||||||
|
InsertDoubleInMReg,
|
||||||
|
ExpandPackedToMReg,
|
||||||
|
LoadFReg,
|
||||||
|
StoreFReg,
|
||||||
|
FSMul,
|
||||||
|
FSAdd,
|
||||||
|
|
||||||
// "Trinary" operators
|
// "Trinary" operators
|
||||||
// FIXME: Need to change representation!
|
// FIXME: Need to change representation!
|
||||||
|
@ -156,6 +167,7 @@ namespace IREmitter {
|
||||||
SystemCall,
|
SystemCall,
|
||||||
RFIExit,
|
RFIExit,
|
||||||
InterpreterBranch,
|
InterpreterBranch,
|
||||||
|
IdleLoop,
|
||||||
|
|
||||||
// "Opcode" representing a register too far away to
|
// "Opcode" representing a register too far away to
|
||||||
// reference directly; this is a size optimization
|
// reference directly; this is a size optimization
|
||||||
|
@ -365,6 +377,42 @@ namespace IREmitter {
|
||||||
InstLoc EmitRFIExit() {
|
InstLoc EmitRFIExit() {
|
||||||
return FoldZeroOp(RFIExit, 0);
|
return FoldZeroOp(RFIExit, 0);
|
||||||
}
|
}
|
||||||
|
// Builds an IdleLoop instruction with idleParam and pc as its two operands.
// The code generator reads both operands back with GetImmValue.
InstLoc EmitIdleLoop(InstLoc idleParam, InstLoc pc) {
|
||||||
|
return FoldBiOp(IdleLoop, idleParam, pc);
|
||||||
|
}
|
||||||
|
// Builds a LoadSingle instruction whose single operand is the address value.
InstLoc EmitLoadSingle(InstLoc addr) {
|
||||||
|
return FoldUOp(LoadSingle, addr);
|
||||||
|
}
|
||||||
|
// Builds a LoadDouble instruction whose single operand is the address value.
// NOTE(review): no LoadDouble case is visible in the code generator hunks of
// this change -- confirm the backend handles it before emitting it.
InstLoc EmitLoadDouble(InstLoc addr) {
|
||||||
|
return FoldUOp(LoadDouble, addr);
|
||||||
|
}
|
||||||
|
// Builds a LoadPaired instruction for the given address; quantReg is passed
// as the extra value and is used by the code generator to pick the quantizer
// register (SPR_GQR0 + quantReg).
InstLoc EmitLoadPaired(InstLoc addr, unsigned quantReg) {
|
||||||
|
return FoldUOp(LoadPaired, addr, quantReg);
|
||||||
|
}
|
||||||
|
// Builds a LoadFReg instruction (no operands) that reads guest floating-point
// register number freg.
InstLoc EmitLoadFReg(unsigned freg) {
|
||||||
|
return FoldZeroOp(LoadFReg, freg);
|
||||||
|
}
|
||||||
|
// Builds a StoreFReg instruction that writes val to guest floating-point
// register number freg.
InstLoc EmitStoreFReg(InstLoc val, unsigned freg) {
|
||||||
|
return FoldUOp(StoreFReg, val, freg);
|
||||||
|
}
|
||||||
|
// Builds a DupSingleToMReg instruction on val. The code generator converts
// the single to double (CVTSS2SD) and duplicates it into both halves
// (MOVDDUP).
InstLoc EmitDupSingleToMReg(InstLoc val) {
|
||||||
|
return FoldUOp(DupSingleToMReg, val);
|
||||||
|
}
|
||||||
|
// Builds an InsertDoubleInMReg instruction with val as operand 1 and reg as
// operand 2. The code generator starts from a copy of reg and overwrites its
// low double with the low double of val (MOVSD).
InstLoc EmitInsertDoubleInMReg(InstLoc val, InstLoc reg) {
|
||||||
|
return FoldBiOp(InsertDoubleInMReg, val, reg);
|
||||||
|
}
|
||||||
|
// Builds an ExpandPackedToMReg instruction on val. The code generator widens
// the packed singles to packed doubles with CVTPS2PD.
InstLoc EmitExpandPackedToMReg(InstLoc val) {
|
||||||
|
return FoldUOp(ExpandPackedToMReg, val);
|
||||||
|
}
|
||||||
|
// Builds an FSMul instruction: single-precision multiply of op1 by op2
// (emitted as MULSS by the code generator).
InstLoc EmitFSMul(InstLoc op1, InstLoc op2) {
|
||||||
|
return FoldBiOp(FSMul, op1, op2);
|
||||||
|
}
|
||||||
|
// Builds an FSAdd instruction: single-precision add of op1 and op2
// (emitted as ADDSS by the code generator).
InstLoc EmitFSAdd(InstLoc op1, InstLoc op2) {
|
||||||
|
return FoldBiOp(FSAdd, op1, op2);
|
||||||
|
}
|
||||||
|
// Builds a DoubleToSingle instruction on op1 (emitted as CVTSD2SS by the
// code generator).
InstLoc EmitDoubleToSingle(InstLoc op1) {
|
||||||
|
return FoldUOp(DoubleToSingle, op1);
|
||||||
|
}
|
||||||
|
|
||||||
void StartBackPass() { curReadPtr = &InstList[InstList.size()]; }
|
void StartBackPass() { curReadPtr = &InstList[InstList.size()]; }
|
||||||
void StartForwardPass() { curReadPtr = &InstList[0]; }
|
void StartForwardPass() { curReadPtr = &InstList[0]; }
|
||||||
|
|
|
@ -420,12 +420,11 @@ namespace CPUCompare
|
||||||
SetJumpTarget(skip);
|
SetJumpTarget(skip);
|
||||||
|
|
||||||
const u8 *normalEntry = GetCodePtr();
|
const u8 *normalEntry = GetCodePtr();
|
||||||
js.normalEntry = (u8*)normalEntry;
|
|
||||||
|
|
||||||
if (ImHereDebug)
|
if (ImHereDebug)
|
||||||
ABI_CallFunction((void *)&ImHere); //Used to get a trace of the last few blocks before a crash, sometimes VERY useful
|
ABI_CallFunction((void *)&ImHere); //Used to get a trace of the last few blocks before a crash, sometimes VERY useful
|
||||||
|
|
||||||
if (false && js.fpa.any)
|
if (js.fpa.any)
|
||||||
{
|
{
|
||||||
//This block uses FPU - needs to add FP exception bailout
|
//This block uses FPU - needs to add FP exception bailout
|
||||||
TEST(32, M(&PowerPC::ppcState.msr), Imm32(1 << 13)); //Test FP enabled bit
|
TEST(32, M(&PowerPC::ppcState.msr), Imm32(1 << 13)); //Test FP enabled bit
|
||||||
|
@ -445,24 +444,10 @@ namespace CPUCompare
|
||||||
SetJumpTarget(b1);
|
SetJumpTarget(b1);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Conditionally add profiling code.
|
js.rewriteStart = (u8*)GetCodePtr();
|
||||||
if (Profiler::g_ProfileBlocks) {
|
|
||||||
ADD(32, M(&b->runCount), Imm8(1));
|
|
||||||
#ifdef _WIN32
|
|
||||||
b->ticCounter.QuadPart = 0;
|
|
||||||
b->ticStart.QuadPart = 0;
|
|
||||||
b->ticStop.QuadPart = 0;
|
|
||||||
#else
|
|
||||||
//TODO
|
|
||||||
#endif
|
|
||||||
// get start tic
|
|
||||||
PROFILER_QUERY_PERFORMACE_COUNTER(&b->ticStart);
|
|
||||||
}
|
|
||||||
|
|
||||||
//Start up the register allocators
|
// Start up IR builder (structure that collects the
|
||||||
//They use the information in gpa/fpa to preload commonly used registers.
|
// instruction processed by the JIT routines)
|
||||||
//gpr.Start(js.gpa);
|
|
||||||
//fpr.Start(js.fpa);
|
|
||||||
ibuild.Reset();
|
ibuild.Reset();
|
||||||
|
|
||||||
js.downcountAmount = js.st.numCycles + PatchEngine::GetSpeedhackCycles(em_address);
|
js.downcountAmount = js.st.numCycles + PatchEngine::GetSpeedhackCycles(em_address);
|
||||||
|
@ -519,6 +504,7 @@ namespace CPUCompare
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Perform actual code generation
|
||||||
WriteCode();
|
WriteCode();
|
||||||
|
|
||||||
b->flags = js.block_flags;
|
b->flags = js.block_flags;
|
||||||
|
|
|
@ -95,7 +95,7 @@ private:
|
||||||
PPCAnalyst::BlockRegStats gpa;
|
PPCAnalyst::BlockRegStats gpa;
|
||||||
PPCAnalyst::BlockRegStats fpa;
|
PPCAnalyst::BlockRegStats fpa;
|
||||||
PPCAnalyst::CodeOp *op;
|
PPCAnalyst::CodeOp *op;
|
||||||
u8* normalEntry;
|
u8* rewriteStart;
|
||||||
|
|
||||||
JitBlock *curBlock;
|
JitBlock *curBlock;
|
||||||
};
|
};
|
||||||
|
|
|
@ -23,6 +23,7 @@
|
||||||
#include "../PowerPC.h"
|
#include "../PowerPC.h"
|
||||||
#include "../../CoreTiming.h"
|
#include "../../CoreTiming.h"
|
||||||
#include "MemoryUtil.h"
|
#include "MemoryUtil.h"
|
||||||
|
#include "CPUDetect.h"
|
||||||
|
|
||||||
#include "ABI.h"
|
#include "ABI.h"
|
||||||
#include "Jit.h"
|
#include "Jit.h"
|
||||||
|
@ -168,6 +169,176 @@ void AsmRoutineManager::Generate()
|
||||||
GenerateCommon();
|
GenerateCommon();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
const u8 GC_ALIGNED16(pbswapShuffle2x4[16]) = {3, 2, 1, 0, 7, 6, 5, 4, 8, 9, 10, 11, 12, 13, 14, 15};
|
||||||
|
|
||||||
|
const float m_quantizeTableS[] =
|
||||||
|
{
|
||||||
|
(1 << 0), (1 << 1), (1 << 2), (1 << 3),
|
||||||
|
(1 << 4), (1 << 5), (1 << 6), (1 << 7),
|
||||||
|
(1 << 8), (1 << 9), (1 << 10), (1 << 11),
|
||||||
|
(1 << 12), (1 << 13), (1 << 14), (1 << 15),
|
||||||
|
(1 << 16), (1 << 17), (1 << 18), (1 << 19),
|
||||||
|
(1 << 20), (1 << 21), (1 << 22), (1 << 23),
|
||||||
|
(1 << 24), (1 << 25), (1 << 26), (1 << 27),
|
||||||
|
(1 << 28), (1 << 29), (1 << 30), (1 << 31),
|
||||||
|
1.0 / (1ULL << 32), 1.0 / (1 << 31), 1.0 / (1 << 30), 1.0 / (1 << 29),
|
||||||
|
1.0 / (1 << 28), 1.0 / (1 << 27), 1.0 / (1 << 26), 1.0 / (1 << 25),
|
||||||
|
1.0 / (1 << 24), 1.0 / (1 << 23), 1.0 / (1 << 22), 1.0 / (1 << 21),
|
||||||
|
1.0 / (1 << 20), 1.0 / (1 << 19), 1.0 / (1 << 18), 1.0 / (1 << 17),
|
||||||
|
1.0 / (1 << 16), 1.0 / (1 << 15), 1.0 / (1 << 14), 1.0 / (1 << 13),
|
||||||
|
1.0 / (1 << 12), 1.0 / (1 << 11), 1.0 / (1 << 10), 1.0 / (1 << 9),
|
||||||
|
1.0 / (1 << 8), 1.0 / (1 << 7), 1.0 / (1 << 6), 1.0 / (1 << 5),
|
||||||
|
1.0 / (1 << 4), 1.0 / (1 << 3), 1.0 / (1 << 2), 1.0 / (1 << 1),
|
||||||
|
};
|
||||||
|
|
||||||
|
const float m_dequantizeTableS[] =
|
||||||
|
{
|
||||||
|
1.0 / (1 << 0), 1.0 / (1 << 1), 1.0 / (1 << 2), 1.0 / (1 << 3),
|
||||||
|
1.0 / (1 << 4), 1.0 / (1 << 5), 1.0 / (1 << 6), 1.0 / (1 << 7),
|
||||||
|
1.0 / (1 << 8), 1.0 / (1 << 9), 1.0 / (1 << 10), 1.0 / (1 << 11),
|
||||||
|
1.0 / (1 << 12), 1.0 / (1 << 13), 1.0 / (1 << 14), 1.0 / (1 << 15),
|
||||||
|
1.0 / (1 << 16), 1.0 / (1 << 17), 1.0 / (1 << 18), 1.0 / (1 << 19),
|
||||||
|
1.0 / (1 << 20), 1.0 / (1 << 21), 1.0 / (1 << 22), 1.0 / (1 << 23),
|
||||||
|
1.0 / (1 << 24), 1.0 / (1 << 25), 1.0 / (1 << 26), 1.0 / (1 << 27),
|
||||||
|
1.0 / (1 << 28), 1.0 / (1 << 29), 1.0 / (1 << 30), 1.0 / (1 << 31),
|
||||||
|
(1ULL << 32), (1 << 31), (1 << 30), (1 << 29),
|
||||||
|
(1 << 28), (1 << 27), (1 << 26), (1 << 25),
|
||||||
|
(1 << 24), (1 << 23), (1 << 22), (1 << 21),
|
||||||
|
(1 << 20), (1 << 19), (1 << 18), (1 << 17),
|
||||||
|
(1 << 16), (1 << 15), (1 << 14), (1 << 13),
|
||||||
|
(1 << 12), (1 << 11), (1 << 10), (1 << 9),
|
||||||
|
(1 << 8), (1 << 7), (1 << 6), (1 << 5),
|
||||||
|
(1 << 4), (1 << 3), (1 << 2), (1 << 1),
|
||||||
|
};
|
||||||
|
|
||||||
|
float psTemp[2];
|
||||||
|
|
||||||
|
void AsmRoutineManager::GenQuantizedLoads() {
|
||||||
|
const u8* loadPairedIllegal = AlignCode4();
|
||||||
|
UD2();
|
||||||
|
const u8* loadPairedFloat = AlignCode4();
|
||||||
|
if (cpu_info.bSSSE3) {
|
||||||
|
#ifdef _M_X64
|
||||||
|
MOVQ_xmm(XMM0, MComplex(RBX, RCX, 1, 0));
|
||||||
|
#else
|
||||||
|
AND(32, R(ECX), Imm32(Memory::MEMVIEW32_MASK));
|
||||||
|
MOVQ_xmm(XMM0, MDisp(ECX, (u32)Memory::base));
|
||||||
|
#endif
|
||||||
|
PSHUFB(XMM0, M((void *)pbswapShuffle2x4));
|
||||||
|
} else {
|
||||||
|
#ifdef _M_X64
|
||||||
|
MOV(64, R(RCX), MComplex(RBX, RCX, 1, 0));
|
||||||
|
BSWAP(64, RCX);
|
||||||
|
ROL(64, RCX, Imm8(32));
|
||||||
|
MOVQ_xmm(XMM0, R(RCX));
|
||||||
|
#else
|
||||||
|
#if 0
|
||||||
|
AND(32, R(ECX), Imm32(Memory::MEMVIEW32_MASK));
|
||||||
|
MOVQ_xmm(XMM0, MDisp(ECX, (u32)Memory::base));
|
||||||
|
PXOR(XMM1, R(XMM1));
|
||||||
|
PSHUFLW(XMM0, R(XMM0), 0xB1);
|
||||||
|
MOVAPD(XMM1, R(XMM0));
|
||||||
|
PSRLW(XMM0, 8);
|
||||||
|
PSLLW(XMM1, 8);
|
||||||
|
POR(XMM0, R(XMM1));
|
||||||
|
#else
|
||||||
|
AND(32, R(ECX), Imm32(Memory::MEMVIEW32_MASK));
|
||||||
|
MOV(32, R(EAX), MDisp(ECX, (u32)Memory::base));
|
||||||
|
BSWAP(32, EAX);
|
||||||
|
MOV(32, M(&psTemp[0]), R(RAX));
|
||||||
|
MOV(32, R(EAX), MDisp(ECX, (u32)Memory::base + 4));
|
||||||
|
BSWAP(32, EAX);
|
||||||
|
MOV(32, M(((float *)&psTemp[0]) + 1), R(RAX));
|
||||||
|
MOVQ_xmm(XMM0, M(&psTemp[0]));
|
||||||
|
#endif
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
RET();
|
||||||
|
|
||||||
|
const u8* loadPairedU8 = AlignCode4();
|
||||||
|
#ifdef _M_X64
|
||||||
|
MOVZX(32, 16, ECX, MComplex(RBX, RCX, 1, 0));
|
||||||
|
#else
|
||||||
|
AND(32, R(ECX), Imm32(Memory::MEMVIEW32_MASK));
|
||||||
|
MOVZX(32, 16, ECX, MDisp(ECX, (u32)Memory::base));
|
||||||
|
#endif
|
||||||
|
MOVD_xmm(XMM0, R(ECX));
|
||||||
|
PXOR(XMM1, R(XMM1));
|
||||||
|
PUNPCKLBW(XMM0, R(XMM1));
|
||||||
|
PUNPCKLWD(XMM0, R(XMM1));
|
||||||
|
CVTDQ2PS(XMM0, R(XMM0));
|
||||||
|
SHR(32, R(EAX), Imm8(6));
|
||||||
|
MOVSS(XMM1, MDisp(EAX, (u32)m_dequantizeTableS));
|
||||||
|
PUNPCKLDQ(XMM1, R(XMM1));
|
||||||
|
MULPS(XMM0, R(XMM1));
|
||||||
|
RET();
|
||||||
|
|
||||||
|
const u8* loadPairedS8 = AlignCode4();
|
||||||
|
#ifdef _M_X64
|
||||||
|
MOVZX(32, 16, ECX, MComplex(RBX, RCX, 1, 0));
|
||||||
|
#else
|
||||||
|
AND(32, R(ECX), Imm32(Memory::MEMVIEW32_MASK));
|
||||||
|
MOVZX(32, 16, ECX, MDisp(ECX, (u32)Memory::base));
|
||||||
|
#endif
|
||||||
|
MOVD_xmm(XMM0, R(ECX));
|
||||||
|
PUNPCKLBW(XMM0, R(XMM0));
|
||||||
|
PUNPCKLWD(XMM0, R(XMM0));
|
||||||
|
PSRAD(XMM0, 24);
|
||||||
|
CVTDQ2PS(XMM0, R(XMM0));
|
||||||
|
SHR(32, R(EAX), Imm8(6));
|
||||||
|
MOVSS(XMM1, MDisp(EAX, (u32)m_dequantizeTableS));
|
||||||
|
PUNPCKLDQ(XMM1, R(XMM1));
|
||||||
|
MULPS(XMM0, R(XMM1));
|
||||||
|
RET();
|
||||||
|
|
||||||
|
const u8* loadPairedU16 = AlignCode4();
|
||||||
|
#ifdef _M_X64
|
||||||
|
MOV(32, R(ECX), MComplex(RBX, RCX, 1, 0));
|
||||||
|
#else
|
||||||
|
AND(32, R(ECX), Imm32(Memory::MEMVIEW32_MASK));
|
||||||
|
MOV(32, R(ECX), MDisp(ECX, (u32)Memory::base));
|
||||||
|
#endif
|
||||||
|
BSWAP(32, ECX);
|
||||||
|
ROL(32, R(ECX), Imm8(16));
|
||||||
|
MOVD_xmm(XMM0, R(ECX));
|
||||||
|
PXOR(XMM1, R(XMM1));
|
||||||
|
PUNPCKLWD(XMM0, R(XMM1));
|
||||||
|
CVTDQ2PS(XMM0, R(XMM0));
|
||||||
|
SHR(32, R(EAX), Imm8(6));
|
||||||
|
MOVSS(XMM1, MDisp(EAX, (u32)m_dequantizeTableS));
|
||||||
|
PUNPCKLDQ(XMM1, R(XMM1));
|
||||||
|
MULPS(XMM0, R(XMM1));
|
||||||
|
RET();
|
||||||
|
|
||||||
|
const u8* loadPairedS16 = AlignCode4();
|
||||||
|
#ifdef _M_X64
|
||||||
|
MOV(32, R(ECX), MComplex(RBX, RCX, 1, 0));
|
||||||
|
#else
|
||||||
|
AND(32, R(ECX), Imm32(Memory::MEMVIEW32_MASK));
|
||||||
|
MOV(32, R(ECX), MDisp(ECX, (u32)Memory::base));
|
||||||
|
#endif
|
||||||
|
BSWAP(32, ECX);
|
||||||
|
ROL(32, R(ECX), Imm8(16));
|
||||||
|
MOVD_xmm(XMM0, R(ECX));
|
||||||
|
PUNPCKLWD(XMM0, R(XMM0));
|
||||||
|
PSRAD(XMM0, 16);
|
||||||
|
CVTDQ2PS(XMM0, R(XMM0));
|
||||||
|
SHR(32, R(EAX), Imm8(6));
|
||||||
|
AND(32, R(EAX), Imm32(0xFC));
|
||||||
|
MOVSS(XMM1, MDisp(EAX, (u32)m_dequantizeTableS));
|
||||||
|
PUNPCKLDQ(XMM1, R(XMM1));
|
||||||
|
MULPS(XMM0, R(XMM1));
|
||||||
|
RET();
|
||||||
|
|
||||||
|
pairedLoadQuantized[0] = loadPairedFloat;
|
||||||
|
pairedLoadQuantized[1] = loadPairedIllegal;
|
||||||
|
pairedLoadQuantized[2] = loadPairedIllegal;
|
||||||
|
pairedLoadQuantized[3] = loadPairedIllegal;
|
||||||
|
pairedLoadQuantized[4] = loadPairedU8;
|
||||||
|
pairedLoadQuantized[5] = loadPairedU16;
|
||||||
|
pairedLoadQuantized[6] = loadPairedS8;
|
||||||
|
pairedLoadQuantized[7] = loadPairedS16;
|
||||||
|
}
|
||||||
|
|
||||||
void AsmRoutineManager::GenFifoWrite(int size)
|
void AsmRoutineManager::GenFifoWrite(int size)
|
||||||
{
|
{
|
||||||
|
@ -257,6 +428,8 @@ void AsmRoutineManager::GenerateCommon()
|
||||||
SUB(32, M(&CoreTiming::downcount), Imm8(0));
|
SUB(32, M(&CoreTiming::downcount), Imm8(0));
|
||||||
JMP(dispatcher, true);
|
JMP(dispatcher, true);
|
||||||
|
|
||||||
|
GenQuantizedLoads();
|
||||||
|
|
||||||
computeRcFp = AlignCode16();
|
computeRcFp = AlignCode16();
|
||||||
//CMPSD(R(XMM0), M(&zero),
|
//CMPSD(R(XMM0), M(&zero),
|
||||||
// TODO
|
// TODO
|
||||||
|
|
|
@ -42,6 +42,7 @@ private:
|
||||||
void GenFifoWrite(int size);
|
void GenFifoWrite(int size);
|
||||||
void GenFifoFloatWrite();
|
void GenFifoFloatWrite();
|
||||||
void GenFifoXmm64Write();
|
void GenFifoXmm64Write();
|
||||||
|
void GenQuantizedLoads();
|
||||||
|
|
||||||
public:
|
public:
|
||||||
void Init() {
|
void Init() {
|
||||||
|
@ -80,6 +81,8 @@ public:
|
||||||
|
|
||||||
const u8 *doReJit;
|
const u8 *doReJit;
|
||||||
|
|
||||||
|
const u8 *pairedLoadQuantized[8];
|
||||||
|
|
||||||
bool compareEnabled;
|
bool compareEnabled;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
|
@ -39,6 +39,9 @@
|
||||||
|
|
||||||
// Zelda and many more games seem to pass the Acid Test.
|
// Zelda and many more games seem to pass the Acid Test.
|
||||||
|
|
||||||
|
//#define NORMALBRANCH_START Default(inst); ibuild.EmitInterpreterBranch(); return;
|
||||||
|
#define NORMALBRANCH_START
|
||||||
|
|
||||||
using namespace Gen;
|
using namespace Gen;
|
||||||
|
|
||||||
void Jit64::sc(UGeckoInstruction inst)
|
void Jit64::sc(UGeckoInstruction inst)
|
||||||
|
@ -53,6 +56,7 @@ using namespace Gen;
|
||||||
|
|
||||||
void Jit64::bx(UGeckoInstruction inst)
|
void Jit64::bx(UGeckoInstruction inst)
|
||||||
{
|
{
|
||||||
|
NORMALBRANCH_START
|
||||||
if (inst.LK)
|
if (inst.LK)
|
||||||
ibuild.EmitStoreLink(ibuild.EmitIntConst(js.compilerPC + 4));
|
ibuild.EmitStoreLink(ibuild.EmitIntConst(js.compilerPC + 4));
|
||||||
|
|
||||||
|
@ -67,6 +71,7 @@ using namespace Gen;
|
||||||
|
|
||||||
void Jit64::bcx(UGeckoInstruction inst)
|
void Jit64::bcx(UGeckoInstruction inst)
|
||||||
{
|
{
|
||||||
|
NORMALBRANCH_START
|
||||||
if (inst.LK)
|
if (inst.LK)
|
||||||
ibuild.EmitStoreLink(
|
ibuild.EmitStoreLink(
|
||||||
ibuild.EmitIntConst(js.compilerPC + 4));
|
ibuild.EmitIntConst(js.compilerPC + 4));
|
||||||
|
@ -117,6 +122,7 @@ using namespace Gen;
|
||||||
|
|
||||||
void Jit64::bcctrx(UGeckoInstruction inst)
|
void Jit64::bcctrx(UGeckoInstruction inst)
|
||||||
{
|
{
|
||||||
|
NORMALBRANCH_START
|
||||||
Default(inst);
|
Default(inst);
|
||||||
ibuild.EmitInterpreterBranch();
|
ibuild.EmitInterpreterBranch();
|
||||||
return;
|
return;
|
||||||
|
@ -124,6 +130,7 @@ using namespace Gen;
|
||||||
|
|
||||||
void Jit64::bclrx(UGeckoInstruction inst)
|
void Jit64::bclrx(UGeckoInstruction inst)
|
||||||
{
|
{
|
||||||
|
NORMALBRANCH_START
|
||||||
if (inst.hex == 0x4e800020) {
|
if (inst.hex == 0x4e800020) {
|
||||||
ibuild.EmitBranchUncond(ibuild.EmitLoadLink());
|
ibuild.EmitBranchUncond(ibuild.EmitLoadLink());
|
||||||
return;
|
return;
|
||||||
|
|
|
@ -29,141 +29,54 @@
|
||||||
#define INSTRUCTION_START
|
#define INSTRUCTION_START
|
||||||
// #define INSTRUCTION_START Default(inst); return;
|
// #define INSTRUCTION_START Default(inst); return;
|
||||||
|
|
||||||
const u64 GC_ALIGNED16(psSignBits2[2]) = {0x8000000000000000ULL, 0x8000000000000000ULL};
|
|
||||||
const u64 GC_ALIGNED16(psAbsMask2[2]) = {0x7FFFFFFFFFFFFFFFULL, 0x7FFFFFFFFFFFFFFFULL};
|
|
||||||
const double GC_ALIGNED16(psOneOne2[2]) = {1.0, 1.0};
|
|
||||||
|
|
||||||
void Jit64::fp_tri_op(int d, int a, int b, bool reversible, bool dupe, void (XEmitter::*op)(Gen::X64Reg, Gen::OpArg))
|
|
||||||
{
|
|
||||||
fpr.Lock(d, a, b);
|
|
||||||
if (d == a)
|
|
||||||
{
|
|
||||||
fpr.LoadToX64(d, true);
|
|
||||||
(this->*op)(fpr.RX(d), fpr.R(b));
|
|
||||||
}
|
|
||||||
else if (d == b && reversible)
|
|
||||||
{
|
|
||||||
fpr.LoadToX64(d, true);
|
|
||||||
(this->*op)(fpr.RX(d), fpr.R(a));
|
|
||||||
}
|
|
||||||
else if (a != d && b != d)
|
|
||||||
{
|
|
||||||
// Sources different from d, can use rather quick solution
|
|
||||||
fpr.LoadToX64(d, !dupe);
|
|
||||||
MOVSD(fpr.RX(d), fpr.R(a));
|
|
||||||
(this->*op)(fpr.RX(d), fpr.R(b));
|
|
||||||
}
|
|
||||||
else if (b != d)
|
|
||||||
{
|
|
||||||
fpr.LoadToX64(d, !dupe);
|
|
||||||
MOVSD(XMM0, fpr.R(b));
|
|
||||||
MOVSD(fpr.RX(d), fpr.R(a));
|
|
||||||
(this->*op)(fpr.RX(d), Gen::R(XMM0));
|
|
||||||
}
|
|
||||||
else // Other combo, must use two temps :(
|
|
||||||
{
|
|
||||||
MOVSD(XMM0, fpr.R(a));
|
|
||||||
MOVSD(XMM1, fpr.R(b));
|
|
||||||
fpr.LoadToX64(d, !dupe);
|
|
||||||
(this->*op)(XMM0, Gen::R(XMM1));
|
|
||||||
MOVSD(fpr.RX(d), Gen::R(XMM0));
|
|
||||||
}
|
|
||||||
if (dupe) {
|
|
||||||
ForceSinglePrecisionS(fpr.RX(d));
|
|
||||||
MOVDDUP(fpr.RX(d), fpr.R(d));
|
|
||||||
}
|
|
||||||
fpr.UnlockAll();
|
|
||||||
}
|
|
||||||
|
|
||||||
void Jit64::fp_arith_s(UGeckoInstruction inst)
|
void Jit64::fp_arith_s(UGeckoInstruction inst)
|
||||||
{
|
{
|
||||||
if(Core::g_CoreStartupParameter.bJITOff || Core::g_CoreStartupParameter.bJITFloatingPointOff)
|
if (inst.Rc || inst.OPCD != 59 || inst.SUBOP5 != 25) {
|
||||||
{Default(inst); return;} // turn off from debugger
|
|
||||||
INSTRUCTION_START;
|
|
||||||
if (inst.Rc) {
|
|
||||||
Default(inst); return;
|
Default(inst); return;
|
||||||
}
|
}
|
||||||
|
IREmitter::InstLoc val = ibuild.EmitLoadFReg(inst.FA);
|
||||||
|
val = ibuild.EmitDoubleToSingle(val);
|
||||||
bool dupe = inst.OPCD == 59;
|
bool dupe = inst.OPCD == 59;
|
||||||
switch (inst.SUBOP5)
|
switch (inst.SUBOP5)
|
||||||
{
|
{
|
||||||
case 18: fp_tri_op(inst.FD, inst.FA, inst.FB, false, dupe, &XEmitter::DIVSD); break; //div
|
case 25: //mul
|
||||||
case 20: fp_tri_op(inst.FD, inst.FA, inst.FB, false, dupe, &XEmitter::SUBSD); break; //sub
|
val = ibuild.EmitFSMul(val, ibuild.EmitDoubleToSingle(ibuild.EmitLoadFReg(inst.FC)));
|
||||||
case 21: fp_tri_op(inst.FD, inst.FA, inst.FB, true, dupe, &XEmitter::ADDSD); break; //add
|
case 18: //div
|
||||||
|
case 20: //sub
|
||||||
|
case 21: //add
|
||||||
case 23: //sel
|
case 23: //sel
|
||||||
Default(inst);
|
|
||||||
break;
|
|
||||||
case 24: //res
|
case 24: //res
|
||||||
Default(inst);
|
|
||||||
break;
|
|
||||||
case 25: fp_tri_op(inst.FD, inst.FA, inst.FC, true, dupe, &XEmitter::MULSD); break; //mul
|
|
||||||
default:
|
default:
|
||||||
_assert_msg_(DYNA_REC, 0, "fp_arith_s WTF!!!");
|
_assert_msg_(DYNA_REC, 0, "fp_arith_s WTF!!!");
|
||||||
}
|
}
|
||||||
|
val = ibuild.EmitDupSingleToMReg(val);
|
||||||
|
ibuild.EmitStoreFReg(val, inst.FD);
|
||||||
}
|
}
|
||||||
|
|
||||||
void Jit64::fmaddXX(UGeckoInstruction inst)
|
void Jit64::fmaddXX(UGeckoInstruction inst)
|
||||||
{
|
{
|
||||||
if(Core::g_CoreStartupParameter.bJITOff || Core::g_CoreStartupParameter.bJITFloatingPointOff)
|
if (inst.Rc || inst.OPCD != 59 || inst.SUBOP5 != 29) {
|
||||||
{Default(inst); return;} // turn off from debugger
|
|
||||||
INSTRUCTION_START;
|
|
||||||
if (inst.Rc) {
|
|
||||||
Default(inst); return;
|
Default(inst); return;
|
||||||
}
|
}
|
||||||
|
|
||||||
bool single_precision = inst.OPCD == 59;
|
bool single_precision = inst.OPCD == 59;
|
||||||
|
|
||||||
int a = inst.FA;
|
IREmitter::InstLoc val = ibuild.EmitLoadFReg(inst.FA);
|
||||||
int b = inst.FB;
|
val = ibuild.EmitDoubleToSingle(val);
|
||||||
int c = inst.FC;
|
val = ibuild.EmitFSMul(val, ibuild.EmitDoubleToSingle(ibuild.EmitLoadFReg(inst.FC)));
|
||||||
int d = inst.FD;
|
val = ibuild.EmitFSAdd(val, ibuild.EmitDoubleToSingle(ibuild.EmitLoadFReg(inst.FB)));
|
||||||
|
val = ibuild.EmitDupSingleToMReg(val);
|
||||||
fpr.Lock(a, b, c, d);
|
ibuild.EmitStoreFReg(val, inst.FD);
|
||||||
MOVSD(XMM0, fpr.R(a));
|
|
||||||
switch (inst.SUBOP5)
|
|
||||||
{
|
|
||||||
case 28: //msub
|
|
||||||
MULSD(XMM0, fpr.R(c));
|
|
||||||
SUBSD(XMM0, fpr.R(b));
|
|
||||||
break;
|
|
||||||
case 29: //madd
|
|
||||||
MULSD(XMM0, fpr.R(c));
|
|
||||||
ADDSD(XMM0, fpr.R(b));
|
|
||||||
break;
|
|
||||||
case 30: //nmsub
|
|
||||||
MULSD(XMM0, fpr.R(c));
|
|
||||||
SUBSD(XMM0, fpr.R(b));
|
|
||||||
XORPD(XMM0, M((void*)&psSignBits2));
|
|
||||||
break;
|
|
||||||
case 31: //nmadd
|
|
||||||
MULSD(XMM0, fpr.R(c));
|
|
||||||
ADDSD(XMM0, fpr.R(b));
|
|
||||||
XORPD(XMM0, M((void*)&psSignBits2));
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
fpr.LoadToX64(d, false);
|
|
||||||
//YES it is necessary to dupe the result :(
|
|
||||||
//TODO : analysis - does the top reg get used? If so, dupe, if not, don't.
|
|
||||||
if (single_precision) {
|
|
||||||
ForceSinglePrecisionS(XMM0);
|
|
||||||
MOVDDUP(fpr.RX(d), R(XMM0));
|
|
||||||
} else {
|
|
||||||
MOVSD(fpr.RX(d), R(XMM0));
|
|
||||||
}
|
|
||||||
fpr.UnlockAll();
|
|
||||||
}
|
}
|
||||||
|
|
||||||
void Jit64::fmrx(UGeckoInstruction inst)
|
void Jit64::fmrx(UGeckoInstruction inst)
|
||||||
{
|
{
|
||||||
if(Core::g_CoreStartupParameter.bJITOff || Core::g_CoreStartupParameter.bJITFloatingPointOff)
|
|
||||||
{Default(inst); return;} // turn off from debugger
|
|
||||||
INSTRUCTION_START;
|
|
||||||
if (inst.Rc) {
|
if (inst.Rc) {
|
||||||
Default(inst); return;
|
Default(inst); return;
|
||||||
}
|
}
|
||||||
int d = inst.FD;
|
IREmitter::InstLoc val = ibuild.EmitLoadFReg(inst.FB);
|
||||||
int b = inst.FB;
|
val = ibuild.EmitInsertDoubleInMReg(val, ibuild.EmitLoadFReg(inst.FD));
|
||||||
fpr.LoadToX64(d, true); // we don't want to destroy the high bit
|
ibuild.EmitStoreFReg(val, inst.FD);
|
||||||
MOVSD(fpr.RX(d), fpr.R(b));
|
|
||||||
}
|
}
|
||||||
|
|
||||||
void Jit64::fcmpx(UGeckoInstruction inst)
|
void Jit64::fcmpx(UGeckoInstruction inst)
|
||||||
|
|
|
@ -71,6 +71,20 @@ void Jit64::lhax(UGeckoInstruction inst)
|
||||||
void Jit64::lXz(UGeckoInstruction inst)
|
void Jit64::lXz(UGeckoInstruction inst)
|
||||||
{
|
{
|
||||||
INSTRUCTION_START
|
INSTRUCTION_START
|
||||||
|
|
||||||
|
if (Core::GetStartupParameter().bSkipIdle &&
|
||||||
|
inst.OPCD == 32 &&
|
||||||
|
(inst.hex & 0xFFFF0000) == 0x800D0000 &&
|
||||||
|
(Memory::ReadUnchecked_U32(js.compilerPC + 4) == 0x28000000 ||
|
||||||
|
(Core::GetStartupParameter().bWii && Memory::ReadUnchecked_U32(js.compilerPC + 4) == 0x2C000000)) &&
|
||||||
|
Memory::ReadUnchecked_U32(js.compilerPC + 8) == 0x4182fff8)
|
||||||
|
{
|
||||||
|
ibuild.EmitIdleLoop(ibuild.EmitIntConst(PowerPC::ppcState.gpr[inst.RA] + (s32)(s16)inst.SIMM_16),
|
||||||
|
ibuild.EmitIntConst(js.compilerPC));
|
||||||
|
js.compilerPC += 8;
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
IREmitter::InstLoc addr = ibuild.EmitIntConst(inst.SIMM_16);
|
IREmitter::InstLoc addr = ibuild.EmitIntConst(inst.SIMM_16);
|
||||||
if (inst.RA)
|
if (inst.RA)
|
||||||
addr = ibuild.EmitAdd(addr, ibuild.EmitLoadGReg(inst.RA));
|
addr = ibuild.EmitAdd(addr, ibuild.EmitLoadGReg(inst.RA));
|
||||||
|
|
|
@ -57,38 +57,12 @@ u32 GC_ALIGNED16(temp32);
|
||||||
|
|
||||||
void Jit64::lfs(UGeckoInstruction inst)
|
void Jit64::lfs(UGeckoInstruction inst)
|
||||||
{
|
{
|
||||||
if (Core::g_CoreStartupParameter.bJITOff || Core::g_CoreStartupParameter.bJITLoadStoreFloatingOff)
|
IREmitter::InstLoc addr = ibuild.EmitIntConst(inst.SIMM_16), val;
|
||||||
{Default(inst); return;} // turn off from debugger
|
if (inst.RA)
|
||||||
INSTRUCTION_START;
|
addr = ibuild.EmitAdd(addr, ibuild.EmitLoadGReg(inst.RA));
|
||||||
|
val = ibuild.EmitDupSingleToMReg(ibuild.EmitLoadSingle(addr));
|
||||||
int d = inst.RD;
|
ibuild.EmitStoreFReg(val, inst.RD);
|
||||||
int a = inst.RA;
|
return;
|
||||||
if (!a)
|
|
||||||
{
|
|
||||||
Default(inst);
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
s32 offset = (s32)(s16)inst.SIMM_16;
|
|
||||||
gpr.FlushLockX(ABI_PARAM1);
|
|
||||||
gpr.Lock(a);
|
|
||||||
MOV(32, R(ABI_PARAM1), gpr.R(a));
|
|
||||||
if (jo.assumeFPLoadFromMem)
|
|
||||||
{
|
|
||||||
UnsafeLoadRegToReg(ABI_PARAM1, EAX, 32, offset, false);
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
SafeLoadRegToEAX(ABI_PARAM1, 32, offset);
|
|
||||||
}
|
|
||||||
|
|
||||||
MOV(32, M(&temp32), R(EAX));
|
|
||||||
fpr.Lock(d);
|
|
||||||
fpr.LoadToX64(d, false);
|
|
||||||
CVTSS2SD(fpr.RX(d), M(&temp32));
|
|
||||||
MOVDDUP(fpr.RX(d), fpr.R(d));
|
|
||||||
gpr.UnlockAll();
|
|
||||||
gpr.UnlockAllX();
|
|
||||||
fpr.UnlockAll();
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@ -291,32 +265,10 @@ void Jit64::stfsx(UGeckoInstruction inst)
|
||||||
|
|
||||||
void Jit64::lfsx(UGeckoInstruction inst)
|
void Jit64::lfsx(UGeckoInstruction inst)
|
||||||
{
|
{
|
||||||
if (Core::g_CoreStartupParameter.bJITOff || Core::g_CoreStartupParameter.bJITLoadStoreFloatingOff)
|
IREmitter::InstLoc addr = ibuild.EmitLoadGReg(inst.RB), val;
|
||||||
{Default(inst); return;} // turn off from debugger
|
|
||||||
INSTRUCTION_START;
|
|
||||||
|
|
||||||
fpr.Lock(inst.RS);
|
|
||||||
fpr.LoadToX64(inst.RS, false, true);
|
|
||||||
MOV(32, R(EAX), gpr.R(inst.RB));
|
|
||||||
if (inst.RA)
|
if (inst.RA)
|
||||||
ADD(32, R(EAX), gpr.R(inst.RA));
|
addr = ibuild.EmitAdd(addr, ibuild.EmitLoadGReg(inst.RA));
|
||||||
if (cpu_info.bSSSE3) {
|
val = ibuild.EmitDupSingleToMReg(ibuild.EmitLoadSingle(addr));
|
||||||
X64Reg r = fpr.R(inst.RS).GetSimpleReg();
|
ibuild.EmitStoreFReg(val, inst.RD);
|
||||||
#ifdef _M_IX86
|
|
||||||
AND(32, R(EAX), Imm32(Memory::MEMVIEW32_MASK));
|
|
||||||
MOVD_xmm(r, MDisp(EAX, (u32)Memory::base));
|
|
||||||
#else
|
|
||||||
MOVD_xmm(r, MComplex(RBX, EAX, SCALE_1, 0));
|
|
||||||
#endif
|
|
||||||
PSHUFB(r, M((void *)bswapShuffle1x4));
|
|
||||||
CVTSS2SD(r, R(r));
|
|
||||||
MOVDDUP(r, R(r));
|
|
||||||
} else {
|
|
||||||
UnsafeLoadRegToReg(EAX, EAX, 32, false);
|
|
||||||
MOV(32, M(&temp32), R(EAX));
|
|
||||||
CVTSS2SD(XMM0, M(&temp32));
|
|
||||||
MOVDDUP(fpr.R(inst.RS).GetSimpleReg(), R(XMM0));
|
|
||||||
}
|
|
||||||
fpr.UnlockAll();
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -40,419 +40,20 @@
|
||||||
#define INSTRUCTION_START
|
#define INSTRUCTION_START
|
||||||
// #define INSTRUCTION_START Default(inst); return;
|
// #define INSTRUCTION_START Default(inst); return;
|
||||||
|
|
||||||
const u8 GC_ALIGNED16(pbswapShuffle2x4[16]) = {3, 2, 1, 0, 7, 6, 5, 4, 8, 9, 10, 11, 12, 13, 14, 15};
|
|
||||||
const u8 GC_ALIGNED16(pbswapShuffleNoop[16]) = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
|
|
||||||
|
|
||||||
static double GC_ALIGNED16(psTemp[2]) = {1.0, 1.0};
|
|
||||||
static u64 GC_ALIGNED16(temp64);
|
|
||||||
|
|
||||||
// TODO(ector): Improve 64-bit version
|
|
||||||
static void WriteDual32(u64 value, u32 address)
|
|
||||||
{
|
|
||||||
Memory::Write_U32((u32)(value >> 32), address);
|
|
||||||
Memory::Write_U32((u32)value, address + 4);
|
|
||||||
}
|
|
||||||
|
|
||||||
const double GC_ALIGNED16(m_quantizeTableD[]) =
|
|
||||||
{
|
|
||||||
(1 << 0), (1 << 1), (1 << 2), (1 << 3),
|
|
||||||
(1 << 4), (1 << 5), (1 << 6), (1 << 7),
|
|
||||||
(1 << 8), (1 << 9), (1 << 10), (1 << 11),
|
|
||||||
(1 << 12), (1 << 13), (1 << 14), (1 << 15),
|
|
||||||
(1 << 16), (1 << 17), (1 << 18), (1 << 19),
|
|
||||||
(1 << 20), (1 << 21), (1 << 22), (1 << 23),
|
|
||||||
(1 << 24), (1 << 25), (1 << 26), (1 << 27),
|
|
||||||
(1 << 28), (1 << 29), (1 << 30), (1 << 31),
|
|
||||||
1.0 / (1ULL << 32), 1.0 / (1 << 31), 1.0 / (1 << 30), 1.0 / (1 << 29),
|
|
||||||
1.0 / (1 << 28), 1.0 / (1 << 27), 1.0 / (1 << 26), 1.0 / (1 << 25),
|
|
||||||
1.0 / (1 << 24), 1.0 / (1 << 23), 1.0 / (1 << 22), 1.0 / (1 << 21),
|
|
||||||
1.0 / (1 << 20), 1.0 / (1 << 19), 1.0 / (1 << 18), 1.0 / (1 << 17),
|
|
||||||
1.0 / (1 << 16), 1.0 / (1 << 15), 1.0 / (1 << 14), 1.0 / (1 << 13),
|
|
||||||
1.0 / (1 << 12), 1.0 / (1 << 11), 1.0 / (1 << 10), 1.0 / (1 << 9),
|
|
||||||
1.0 / (1 << 8), 1.0 / (1 << 7), 1.0 / (1 << 6), 1.0 / (1 << 5),
|
|
||||||
1.0 / (1 << 4), 1.0 / (1 << 3), 1.0 / (1 << 2), 1.0 / (1 << 1),
|
|
||||||
};
|
|
||||||
|
|
||||||
const double GC_ALIGNED16(m_dequantizeTableD[]) =
|
|
||||||
{
|
|
||||||
1.0 / (1 << 0), 1.0 / (1 << 1), 1.0 / (1 << 2), 1.0 / (1 << 3),
|
|
||||||
1.0 / (1 << 4), 1.0 / (1 << 5), 1.0 / (1 << 6), 1.0 / (1 << 7),
|
|
||||||
1.0 / (1 << 8), 1.0 / (1 << 9), 1.0 / (1 << 10), 1.0 / (1 << 11),
|
|
||||||
1.0 / (1 << 12), 1.0 / (1 << 13), 1.0 / (1 << 14), 1.0 / (1 << 15),
|
|
||||||
1.0 / (1 << 16), 1.0 / (1 << 17), 1.0 / (1 << 18), 1.0 / (1 << 19),
|
|
||||||
1.0 / (1 << 20), 1.0 / (1 << 21), 1.0 / (1 << 22), 1.0 / (1 << 23),
|
|
||||||
1.0 / (1 << 24), 1.0 / (1 << 25), 1.0 / (1 << 26), 1.0 / (1 << 27),
|
|
||||||
1.0 / (1 << 28), 1.0 / (1 << 29), 1.0 / (1 << 30), 1.0 / (1 << 31),
|
|
||||||
(1ULL << 32), (1 << 31), (1 << 30), (1 << 29),
|
|
||||||
(1 << 28), (1 << 27), (1 << 26), (1 << 25),
|
|
||||||
(1 << 24), (1 << 23), (1 << 22), (1 << 21),
|
|
||||||
(1 << 20), (1 << 19), (1 << 18), (1 << 17),
|
|
||||||
(1 << 16), (1 << 15), (1 << 14), (1 << 13),
|
|
||||||
(1 << 12), (1 << 11), (1 << 10), (1 << 9),
|
|
||||||
(1 << 8), (1 << 7), (1 << 6), (1 << 5),
|
|
||||||
(1 << 4), (1 << 3), (1 << 2), (1 << 1),
|
|
||||||
};
|
|
||||||
|
|
||||||
// The big problem is likely instructions that set the quantizers in the same block.
|
// The big problem is likely instructions that set the quantizers in the same block.
|
||||||
// We will have to break block after quantizers are written to.
|
// We will have to break block after quantizers are written to.
|
||||||
void Jit64::psq_st(UGeckoInstruction inst)
|
void Jit64::psq_st(UGeckoInstruction inst)
|
||||||
{
|
{
|
||||||
if(Core::g_CoreStartupParameter.bJITOff || Core::g_CoreStartupParameter.bJITLoadStorePairedOff)
|
Default(inst); return;
|
||||||
{Default(inst); return;} // turn off from debugger
|
|
||||||
INSTRUCTION_START;
|
|
||||||
js.block_flags |= BLOCK_USE_GQR0 << inst.I;
|
|
||||||
|
|
||||||
if (js.blockSetsQuantizers || !Core::GetStartupParameter().bOptimizeQuantizers)
|
|
||||||
{
|
|
||||||
Default(inst);
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
if (!inst.RA)
|
|
||||||
{
|
|
||||||
// This really should never happen. Unless we change this to also support stwux
|
|
||||||
Default(inst);
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
const UGQR gqr(rSPR(SPR_GQR0 + inst.I));
|
|
||||||
const EQuantizeType stType = static_cast<EQuantizeType>(gqr.ST_TYPE);
|
|
||||||
int stScale = gqr.ST_SCALE;
|
|
||||||
bool update = inst.OPCD == 61;
|
|
||||||
|
|
||||||
int offset = inst.SIMM_12;
|
|
||||||
int a = inst.RA;
|
|
||||||
int s = inst.RS; // Fp numbers
|
|
||||||
|
|
||||||
if (inst.W) {
|
|
||||||
// PanicAlert("W=1: stType %i stScale %i update %i", (int)stType, (int)stScale, (int)update);
|
|
||||||
// It's fairly common that games write stuff to the pipe using this. Then, it's pretty much only
|
|
||||||
// floats so that's what we'll work on.
|
|
||||||
switch (stType)
|
|
||||||
{
|
|
||||||
case QUANTIZE_FLOAT:
|
|
||||||
{
|
|
||||||
// This one has quite a bit of optimization potential.
|
|
||||||
if (gpr.R(a).IsImm())
|
|
||||||
{
|
|
||||||
PanicAlert("Imm: %08x", gpr.R(a).offset);
|
|
||||||
}
|
|
||||||
gpr.FlushLockX(ABI_PARAM1, ABI_PARAM2);
|
|
||||||
gpr.Lock(a);
|
|
||||||
fpr.Lock(s);
|
|
||||||
if (update)
|
|
||||||
gpr.LoadToX64(a, true, true);
|
|
||||||
MOV(32, R(ABI_PARAM2), gpr.R(a));
|
|
||||||
if (offset)
|
|
||||||
ADD(32, R(ABI_PARAM2), Imm32((u32)offset));
|
|
||||||
TEST(32, R(ABI_PARAM2), Imm32(0x0C000000));
|
|
||||||
if (update && offset)
|
|
||||||
MOV(32, gpr.R(a), R(ABI_PARAM2));
|
|
||||||
CVTSD2SS(XMM0, fpr.R(s));
|
|
||||||
MOVD_xmm(M(&temp64), XMM0);
|
|
||||||
MOV(32, R(ABI_PARAM1), M(&temp64));
|
|
||||||
FixupBranch argh = J_CC(CC_NZ);
|
|
||||||
BSWAP(32, ABI_PARAM1);
|
|
||||||
#ifdef _M_X64
|
|
||||||
MOV(32, MComplex(RBX, ABI_PARAM2, SCALE_1, 0), R(ABI_PARAM1));
|
|
||||||
#else
|
|
||||||
MOV(32, R(EAX), R(ABI_PARAM2));
|
|
||||||
AND(32, R(EAX), Imm32(Memory::MEMVIEW32_MASK));
|
|
||||||
MOV(32, MDisp(EAX, (u32)Memory::base), R(ABI_PARAM1));
|
|
||||||
#endif
|
|
||||||
FixupBranch skip_call = J();
|
|
||||||
SetJumpTarget(argh);
|
|
||||||
ABI_CallFunctionRR(thunks.ProtectFunction((void *)&Memory::Write_U32, 2), ABI_PARAM1, ABI_PARAM2);
|
|
||||||
SetJumpTarget(skip_call);
|
|
||||||
gpr.UnlockAll();
|
|
||||||
gpr.UnlockAllX();
|
|
||||||
fpr.UnlockAll();
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
default:
|
|
||||||
Default(inst);
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (stType == QUANTIZE_FLOAT)
|
|
||||||
{
|
|
||||||
if (gpr.R(a).IsImm() && !update && cpu_info.bSSSE3)
|
|
||||||
{
|
|
||||||
u32 addr = (u32)(gpr.R(a).offset + offset);
|
|
||||||
if (addr == 0xCC008000) {
|
|
||||||
// Writing to FIFO. Let's do fast method.
|
|
||||||
CVTPD2PS(XMM0, fpr.R(s));
|
|
||||||
PSHUFB(XMM0, M((void*)&pbswapShuffle2x4));
|
|
||||||
CALL((void*)asm_routines.fifoDirectWriteXmm64);
|
|
||||||
js.fifoBytesThisBlock += 8;
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
gpr.FlushLockX(ABI_PARAM1, ABI_PARAM2);
|
|
||||||
gpr.Lock(a);
|
|
||||||
fpr.Lock(s);
|
|
||||||
if (update)
|
|
||||||
gpr.LoadToX64(a, true, true);
|
|
||||||
MOV(32, R(ABI_PARAM2), gpr.R(a));
|
|
||||||
if (offset)
|
|
||||||
ADD(32, R(ABI_PARAM2), Imm32((u32)offset));
|
|
||||||
TEST(32, R(ABI_PARAM2), Imm32(0x0C000000));
|
|
||||||
if (update && offset)
|
|
||||||
MOV(32, gpr.R(a), R(ABI_PARAM2));
|
|
||||||
CVTPD2PS(XMM0, fpr.R(s));
|
|
||||||
SHUFPS(XMM0, R(XMM0), 1);
|
|
||||||
MOVQ_xmm(M(&temp64), XMM0);
|
|
||||||
#ifdef _M_X64
|
|
||||||
MOV(64, R(ABI_PARAM1), M(&temp64));
|
|
||||||
FixupBranch argh = J_CC(CC_NZ);
|
|
||||||
BSWAP(64, ABI_PARAM1);
|
|
||||||
MOV(64, MComplex(RBX, ABI_PARAM2, SCALE_1, 0), R(ABI_PARAM1));
|
|
||||||
FixupBranch arg2 = J();
|
|
||||||
SetJumpTarget(argh);
|
|
||||||
CALL(thunks.ProtectFunction((void *)&WriteDual32, 0));
|
|
||||||
#else
|
|
||||||
FixupBranch argh = J_CC(CC_NZ);
|
|
||||||
MOV(32, R(ABI_PARAM1), M(((char*)&temp64) + 4));
|
|
||||||
BSWAP(32, ABI_PARAM1);
|
|
||||||
AND(32, R(ABI_PARAM2), Imm32(Memory::MEMVIEW32_MASK));
|
|
||||||
MOV(32, MDisp(ABI_PARAM2, (u32)Memory::base), R(ABI_PARAM1));
|
|
||||||
MOV(32, R(ABI_PARAM1), M(&temp64));
|
|
||||||
BSWAP(32, ABI_PARAM1);
|
|
||||||
MOV(32, MDisp(ABI_PARAM2, 4+(u32)Memory::base), R(ABI_PARAM1));
|
|
||||||
FixupBranch arg2 = J();
|
|
||||||
SetJumpTarget(argh);
|
|
||||||
MOV(32, R(ABI_PARAM1), M(((char*)&temp64) + 4));
|
|
||||||
ABI_CallFunctionRR(thunks.ProtectFunction((void *)&Memory::Write_U32, 2), ABI_PARAM1, ABI_PARAM2);
|
|
||||||
MOV(32, R(ABI_PARAM1), M(((char*)&temp64)));
|
|
||||||
ADD(32, R(ABI_PARAM2), Imm32(4));
|
|
||||||
ABI_CallFunctionRR(thunks.ProtectFunction((void *)&Memory::Write_U32, 2), ABI_PARAM1, ABI_PARAM2);
|
|
||||||
#endif
|
|
||||||
SetJumpTarget(arg2);
|
|
||||||
gpr.UnlockAll();
|
|
||||||
gpr.UnlockAllX();
|
|
||||||
fpr.UnlockAll();
|
|
||||||
}
|
|
||||||
else if (stType == QUANTIZE_U8)
|
|
||||||
{
|
|
||||||
gpr.FlushLockX(ABI_PARAM1, ABI_PARAM2);
|
|
||||||
gpr.Lock(a);
|
|
||||||
fpr.Lock(s);
|
|
||||||
if (update)
|
|
||||||
gpr.LoadToX64(a, true, update);
|
|
||||||
MOV(32, R(ABI_PARAM2), gpr.R(a));
|
|
||||||
if (offset)
|
|
||||||
ADD(32, R(ABI_PARAM2), Imm32((u32)offset));
|
|
||||||
if (update && offset)
|
|
||||||
MOV(32, gpr.R(a), R(ABI_PARAM2));
|
|
||||||
MOVAPD(XMM0, fpr.R(s));
|
|
||||||
MOVDDUP(XMM1, M((void*)&m_quantizeTableD[stScale]));
|
|
||||||
MULPD(XMM0, R(XMM1));
|
|
||||||
CVTPD2DQ(XMM0, R(XMM0));
|
|
||||||
PACKSSDW(XMM0, R(XMM0));
|
|
||||||
PACKUSWB(XMM0, R(XMM0));
|
|
||||||
MOVD_xmm(M(&temp64), XMM0);
|
|
||||||
MOV(16, R(ABI_PARAM1), M(&temp64));
|
|
||||||
#ifdef _M_X64
|
|
||||||
MOV(16, MComplex(RBX, ABI_PARAM2, SCALE_1, 0), R(ABI_PARAM1));
|
|
||||||
#else
|
|
||||||
MOV(32, R(EAX), R(ABI_PARAM2));
|
|
||||||
AND(32, R(EAX), Imm32(Memory::MEMVIEW32_MASK));
|
|
||||||
MOV(16, MDisp(EAX, (u32)Memory::base), R(ABI_PARAM1));
|
|
||||||
#endif
|
|
||||||
if (update)
|
|
||||||
MOV(32, gpr.R(a), R(ABI_PARAM2));
|
|
||||||
gpr.UnlockAll();
|
|
||||||
gpr.UnlockAllX();
|
|
||||||
fpr.UnlockAll();
|
|
||||||
}
|
|
||||||
else if (stType == QUANTIZE_S16)
|
|
||||||
{
|
|
||||||
gpr.FlushLockX(ABI_PARAM1, ABI_PARAM2);
|
|
||||||
gpr.Lock(a);
|
|
||||||
fpr.Lock(s);
|
|
||||||
if (update)
|
|
||||||
gpr.LoadToX64(a, true, update);
|
|
||||||
MOV(32, R(ABI_PARAM2), gpr.R(a));
|
|
||||||
if (offset)
|
|
||||||
ADD(32, R(ABI_PARAM2), Imm32((u32)offset));
|
|
||||||
if (update)
|
|
||||||
MOV(32, gpr.R(a), R(ABI_PARAM2));
|
|
||||||
MOVAPD(XMM0, fpr.R(s));
|
|
||||||
MOVDDUP(XMM1, M((void*)&m_quantizeTableD[stScale]));
|
|
||||||
MULPD(XMM0, R(XMM1));
|
|
||||||
SHUFPD(XMM0, R(XMM0), 1);
|
|
||||||
CVTPD2DQ(XMM0, R(XMM0));
|
|
||||||
PACKSSDW(XMM0, R(XMM0));
|
|
||||||
MOVD_xmm(M(&temp64), XMM0);
|
|
||||||
MOV(32, R(ABI_PARAM1), M(&temp64));
|
|
||||||
BSWAP(32, ABI_PARAM1);
|
|
||||||
#ifdef _M_X64
|
|
||||||
MOV(32, MComplex(RBX, ABI_PARAM2, SCALE_1, 0), R(ABI_PARAM1));
|
|
||||||
#else
|
|
||||||
MOV(32, R(EAX), R(ABI_PARAM2));
|
|
||||||
AND(32, R(EAX), Imm32(Memory::MEMVIEW32_MASK));
|
|
||||||
MOV(32, MDisp(EAX, (u32)Memory::base), R(ABI_PARAM1));
|
|
||||||
#endif
|
|
||||||
gpr.UnlockAll();
|
|
||||||
gpr.UnlockAllX();
|
|
||||||
fpr.UnlockAll();
|
|
||||||
}
|
|
||||||
else {
|
|
||||||
// Dodger uses this.
|
|
||||||
// mario tennis
|
|
||||||
//PanicAlert("st %i:%i", stType, inst.W);
|
|
||||||
Default(inst);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
void Jit64::psq_l(UGeckoInstruction inst)
|
void Jit64::psq_l(UGeckoInstruction inst)
|
||||||
{
|
{
|
||||||
if(Core::g_CoreStartupParameter.bJITOff || Core::g_CoreStartupParameter.bJITLoadStorePairedOff)
|
if (inst.W) {Default(inst); return;}
|
||||||
{Default(inst); return;} // turn off from debugger
|
IREmitter::InstLoc addr = ibuild.EmitIntConst(inst.SIMM_12), val;
|
||||||
INSTRUCTION_START;
|
if (inst.RA)
|
||||||
|
addr = ibuild.EmitAdd(addr, ibuild.EmitLoadGReg(inst.RA));
|
||||||
js.block_flags |= BLOCK_USE_GQR0 << inst.I;
|
val = ibuild.EmitLoadPaired(addr, inst.I);
|
||||||
|
val = ibuild.EmitExpandPackedToMReg(val);
|
||||||
if (js.blockSetsQuantizers || !Core::GetStartupParameter().bOptimizeQuantizers)
|
ibuild.EmitStoreFReg(val, inst.RD);
|
||||||
{
|
|
||||||
Default(inst);
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
const UGQR gqr(rSPR(SPR_GQR0 + inst.I));
|
|
||||||
const EQuantizeType ldType = static_cast<EQuantizeType>(gqr.LD_TYPE);
|
|
||||||
int ldScale = gqr.LD_SCALE;
|
|
||||||
bool update = inst.OPCD == 57;
|
|
||||||
if (!inst.RA || inst.W)
|
|
||||||
{
|
|
||||||
// 0 1 during load
|
|
||||||
//PanicAlert("ld:%i %i", ldType, (int)inst.W);
|
|
||||||
Default(inst);
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
int offset = inst.SIMM_12;
|
|
||||||
switch (ldType) {
|
|
||||||
case QUANTIZE_FLOAT: // We know this is from RAM, so we don't need to check the address.
|
|
||||||
{
|
|
||||||
#ifdef _M_X64
|
|
||||||
gpr.LoadToX64(inst.RA, true, update);
|
|
||||||
fpr.LoadToX64(inst.RS, false);
|
|
||||||
if (cpu_info.bSSSE3) {
|
|
||||||
X64Reg xd = fpr.R(inst.RS).GetSimpleReg();
|
|
||||||
MOVQ_xmm(xd, MComplex(RBX, gpr.R(inst.RA).GetSimpleReg(), 1, offset));
|
|
||||||
PSHUFB(xd, M((void *)pbswapShuffle2x4));
|
|
||||||
CVTPS2PD(xd, R(xd));
|
|
||||||
} else {
|
|
||||||
MOV(64, R(RAX), MComplex(RBX, gpr.R(inst.RA).GetSimpleReg(), 1, offset));
|
|
||||||
BSWAP(64, RAX);
|
|
||||||
MOV(64, M(&psTemp[0]), R(RAX));
|
|
||||||
X64Reg r = fpr.R(inst.RS).GetSimpleReg();
|
|
||||||
CVTPS2PD(r, M(&psTemp[0]));
|
|
||||||
SHUFPD(r, R(r), 1);
|
|
||||||
}
|
|
||||||
if (update && offset != 0)
|
|
||||||
ADD(32, gpr.R(inst.RA), Imm32(offset));
|
|
||||||
break;
|
|
||||||
#else
|
|
||||||
if (cpu_info.bSSSE3) {
|
|
||||||
gpr.LoadToX64(inst.RA, true, update);
|
|
||||||
fpr.LoadToX64(inst.RS, false);
|
|
||||||
X64Reg xd = fpr.R(inst.RS).GetSimpleReg();
|
|
||||||
MOV(32, R(EAX), gpr.R(inst.RA));
|
|
||||||
AND(32, R(EAX), Imm32(Memory::MEMVIEW32_MASK));
|
|
||||||
MOVQ_xmm(xd, MDisp(EAX, (u32)Memory::base + offset));
|
|
||||||
PSHUFB(xd, M((void *)pbswapShuffle2x4));
|
|
||||||
CVTPS2PD(xd, R(xd));
|
|
||||||
} else {
|
|
||||||
gpr.FlushLockX(ECX);
|
|
||||||
gpr.LoadToX64(inst.RA, true, update);
|
|
||||||
// This can probably be optimized somewhat.
|
|
||||||
LEA(32, ECX, MDisp(gpr.R(inst.RA).GetSimpleReg(), offset));
|
|
||||||
AND(32, R(ECX), Imm32(Memory::MEMVIEW32_MASK));
|
|
||||||
MOV(32, R(EAX), MDisp(ECX, (u32)Memory::base));
|
|
||||||
BSWAP(32, RAX);
|
|
||||||
MOV(32, M(&psTemp[0]), R(RAX));
|
|
||||||
MOV(32, R(EAX), MDisp(ECX, (u32)Memory::base + 4));
|
|
||||||
BSWAP(32, RAX);
|
|
||||||
MOV(32, M(((float *)&psTemp[0]) + 1), R(RAX));
|
|
||||||
fpr.LoadToX64(inst.RS, false, true);
|
|
||||||
X64Reg r = fpr.R(inst.RS).GetSimpleReg();
|
|
||||||
CVTPS2PD(r, M(&psTemp[0]));
|
|
||||||
gpr.UnlockAllX();
|
|
||||||
}
|
|
||||||
if (update && offset != 0)
|
|
||||||
ADD(32, gpr.R(inst.RA), Imm32(offset));
|
|
||||||
break;
|
|
||||||
#endif
|
|
||||||
}
|
|
||||||
case QUANTIZE_U8:
|
|
||||||
{
|
|
||||||
gpr.LoadToX64(inst.RA, true, update);
|
|
||||||
#ifdef _M_X64
|
|
||||||
MOVZX(32, 16, EAX, MComplex(RBX, gpr.R(inst.RA).GetSimpleReg(), 1, offset));
|
|
||||||
#else
|
|
||||||
LEA(32, EAX, MDisp(gpr.R(inst.RA).GetSimpleReg(), offset));
|
|
||||||
AND(32, R(EAX), Imm32(Memory::MEMVIEW32_MASK));
|
|
||||||
MOVZX(32, 16, EAX, MDisp(EAX, (u32)Memory::base));
|
|
||||||
#endif
|
|
||||||
MOV(32, M(&temp64), R(EAX));
|
|
||||||
MOVD_xmm(XMM0, M(&temp64));
|
|
||||||
// SSE4 optimization opportunity here.
|
|
||||||
PXOR(XMM1, R(XMM1));
|
|
||||||
PUNPCKLBW(XMM0, R(XMM1));
|
|
||||||
PUNPCKLWD(XMM0, R(XMM1));
|
|
||||||
CVTDQ2PD(XMM0, R(XMM0));
|
|
||||||
fpr.LoadToX64(inst.RS, false, true);
|
|
||||||
X64Reg r = fpr.R(inst.RS).GetSimpleReg();
|
|
||||||
MOVDDUP(r, M((void *)&m_dequantizeTableD[ldScale]));
|
|
||||||
MULPD(r, R(XMM0));
|
|
||||||
if (update && offset != 0)
|
|
||||||
ADD(32, gpr.R(inst.RA), Imm32(offset));
|
|
||||||
}
|
|
||||||
break;
|
|
||||||
case QUANTIZE_S16:
|
|
||||||
{
|
|
||||||
gpr.LoadToX64(inst.RA, true, update);
|
|
||||||
#ifdef _M_X64
|
|
||||||
MOV(32, R(EAX), MComplex(RBX, gpr.R(inst.RA).GetSimpleReg(), 1, offset));
|
|
||||||
#else
|
|
||||||
LEA(32, EAX, MDisp(gpr.R(inst.RA).GetSimpleReg(), offset));
|
|
||||||
AND(32, R(EAX), Imm32(Memory::MEMVIEW32_MASK));
|
|
||||||
MOV(32, R(EAX), MDisp(EAX, (u32)Memory::base));
|
|
||||||
#endif
|
|
||||||
BSWAP(32, EAX);
|
|
||||||
MOV(32, M(&temp64), R(EAX));
|
|
||||||
fpr.LoadToX64(inst.RS, false, true);
|
|
||||||
X64Reg r = fpr.R(inst.RS).GetSimpleReg();
|
|
||||||
MOVD_xmm(XMM0, M(&temp64));
|
|
||||||
PUNPCKLWD(XMM0, R(XMM0)); // unpack to higher word in each dword..
|
|
||||||
PSRAD(XMM0, 16); // then use this signed shift to sign extend. clever eh? :P
|
|
||||||
CVTDQ2PD(XMM0, R(XMM0));
|
|
||||||
MOVDDUP(r, M((void*)&m_dequantizeTableD[ldScale]));
|
|
||||||
MULPD(r, R(XMM0));
|
|
||||||
SHUFPD(r, R(r), 1);
|
|
||||||
if (update && offset != 0)
|
|
||||||
ADD(32, gpr.R(inst.RA), Imm32(offset));
|
|
||||||
}
|
|
||||||
break;
|
|
||||||
|
|
||||||
/*
|
|
||||||
Dynamic quantizer. Todo when we have a test set.
|
|
||||||
MOVZX(32, 8, EAX, M(((char *)&PowerPC::ppcState.spr[SPR_GQR0 + inst.I]) + 3)); // it's in the high byte.
|
|
||||||
AND(32, R(EAX), Imm8(0x3F));
|
|
||||||
MOV(32, R(ECX), Imm32((u32)&m_dequantizeTableD));
|
|
||||||
MOVDDUP(r, MComplex(RCX, EAX, 8, 0));
|
|
||||||
*/
|
|
||||||
default:
|
|
||||||
// 4 0
|
|
||||||
// 6 0 //power tennis
|
|
||||||
// 5 0
|
|
||||||
// PanicAlert("ld:%i %i", ldType, (int)inst.W);
|
|
||||||
Default(inst);
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
//u32 EA = (m_GPR[_inst.RA] + _inst.SIMM_12) : _inst.SIMM_12;
|
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue