A bit more progress on my JIT WIP: the biggest change is some substantial work on floating-point.

git-svn-id: https://dolphin-emu.googlecode.com/svn/trunk@1743 8ced0084-cf51-0410-be5f-012b33b47a6e
magumagu9 2009-01-03 07:51:27 +00:00
parent 35128bb041
commit 29a033e1dd
11 changed files with 524 additions and 654 deletions


@ -97,8 +97,9 @@ Inter-block dead condition register elimination (Likely significant win
Optimize conditions for conditional branches.
General dead register elimination.
Inter-block inlining.
Track down a few correctness bugs (I think there's something wrong
with my branches, but I haven't been able to figure it out).
Track down issues with new JIT + dual-core mode (I think I'm going to
need help with this one; I'm not very familiar with the
dual-core code.)
Specialized slw/srw/sraw; I think there are some tricks that could
have a non-trivial effect, and there are significantly shorter
implementations for 64-bit involving abusing 64-bit shifts.
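A rough sketch of the 64-bit shift trick mentioned above (illustration only, not part of this commit): slw takes a 6-bit shift amount and must produce zero for amounts 32..63, which falls out naturally once the source is zero-extended to 64 bits, because every bit of the 32-bit value ends up above bit 31 and is discarded by the truncation.
#include <cstdint>
static uint32_t slw_reference(uint32_t rs, uint32_t rb)
{
	uint64_t wide = rs;                      // zero-extend the source
	return (uint32_t)(wide << (rb & 0x3F));  // counts >= 32 yield 0 after truncation
}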
@ -502,16 +503,21 @@ struct RegInfo {
InstLoc FirstI;
std::vector<unsigned> IInfo;
InstLoc regs[16];
InstLoc fregs[16];
unsigned numSpills;
unsigned numFSpills;
bool MakeProfile;
bool UseProfile;
unsigned numProfiledLoads;
unsigned exitNumber;
RegInfo(Jit64* j, InstLoc f, unsigned insts) : Jit(j), FirstI(f), IInfo(insts) {
for (unsigned i = 0; i < 16; i++)
for (unsigned i = 0; i < 16; i++) {
regs[i] = 0;
fregs[i] = 0;
}
numSpills = 0;
numFSpills = 0;
numProfiledLoads = 0;
exitNumber = 0;
MakeProfile = UseProfile = false;
@ -533,6 +539,7 @@ static unsigned regReadUse(RegInfo& R, InstLoc I) {
static unsigned SlotSet[1000];
static unsigned ProfiledLoads[1000];
static u8 GC_ALIGNED16(FSlotSet[16*1000]);
static OpArg regLocForSlot(RegInfo& RI, unsigned slot) {
return M(&SlotSet[slot - 1]);
@ -558,57 +565,86 @@ static void regSpill(RegInfo& RI, X64Reg reg) {
RI.regs[reg] = 0;
}
static OpArg fregLocForSlot(RegInfo& RI, unsigned slot) {
return M(&FSlotSet[slot*16]);
}
static unsigned fregCreateSpill(RegInfo& RI, InstLoc I) {
unsigned newSpill = ++RI.numFSpills;
RI.IInfo[I - RI.FirstI] |= newSpill << 16;
return newSpill;
}
static unsigned fregGetSpill(RegInfo& RI, InstLoc I) {
return RI.IInfo[I - RI.FirstI] >> 16;
}
static void fregSpill(RegInfo& RI, X64Reg reg) {
if (!RI.fregs[reg]) return;
unsigned slot = fregGetSpill(RI, RI.fregs[reg]);
if (!slot) {
slot = fregCreateSpill(RI, RI.fregs[reg]);
RI.Jit->MOVAPD(fregLocForSlot(RI, slot), reg);
}
RI.fregs[reg] = 0;
}
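// Note (inferred from the surrounding code, not stated in the original): each
// IInfo entry packs several facts about an instruction -- bit 2 (mask 4) means
// operand 1 dies here, bit 3 (mask 8) means operand 2 dies here, and the upper
// 16 bits hold the spill-slot index assigned on demand by regCreateSpill /
// fregCreateSpill (0 meaning the value has no spill slot yet).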
// ECX is scratch, so we don't allocate it
static X64Reg RegAllocOrder[] = {EDI, ESI, EBP, EBX, EDX, EAX};
static unsigned RegAllocSize = sizeof(RegAllocOrder) / sizeof(X64Reg);
static X64Reg FRegAllocOrder[] = {XMM2, XMM3, XMM4, XMM5, XMM6, XMM7};
static unsigned FRegAllocSize = sizeof(FRegAllocOrder) / sizeof(X64Reg);
static X64Reg regFindFreeReg(RegInfo& RI) {
if (RI.regs[EDI] == 0) return EDI;
if (RI.regs[ESI] == 0) return ESI;
if (RI.regs[EBP] == 0) return EBP;
if (RI.regs[EBX] == 0) return EBX;
if (RI.regs[EDX] == 0) return EDX;
if (RI.regs[EAX] == 0) return EAX;
// ECX is scratch, so we don't allocate it
static X64Reg regs[] = {EDI, ESI, EBP, EBX, EDX, EAX};
for (unsigned i = 0; i < RegAllocSize; i++)
if (RI.regs[RegAllocOrder[i]] == 0)
return RegAllocOrder[i];
static unsigned nextReg = 0;
X64Reg reg = regs[nextReg++ % 6];
X64Reg reg = RegAllocOrder[nextReg++ % RegAllocSize];
regSpill(RI, reg);
return reg;
}
static X64Reg fregFindFreeReg(RegInfo& RI) {
for (unsigned i = 0; i < FRegAllocSize; i++)
if (RI.fregs[FRegAllocOrder[i]] == 0)
return FRegAllocOrder[i];
// XMM0/1 are scratch, so we don't allocate them
fregSpill(RI, XMM7);
return XMM7;
}
static OpArg regLocForInst(RegInfo& RI, InstLoc I) {
if (RI.regs[EDI] == I) return R(EDI);
if (RI.regs[ESI] == I) return R(ESI);
if (RI.regs[EBP] == I) return R(EBP);
if (RI.regs[EBX] == I) return R(EBX);
if (RI.regs[EDX] == I) return R(EDX);
if (RI.regs[EAX] == I) return R(EAX);
if (RI.regs[ECX] == I) return R(ECX);
for (unsigned i = 0; i < RegAllocSize; i++)
if (RI.regs[RegAllocOrder[i]] == I)
return R(RegAllocOrder[i]);
if (regGetSpill(RI, I) == 0)
PanicAlert("Retrieving unknown spill slot?!");
return regLocForSlot(RI, regGetSpill(RI, I));
}
static OpArg fregLocForInst(RegInfo& RI, InstLoc I) {
for (unsigned i = 0; i < FRegAllocSize; i++)
if (RI.fregs[FRegAllocOrder[i]] == I)
return R(FRegAllocOrder[i]);
if (fregGetSpill(RI, I) == 0)
PanicAlert("Retrieving unknown spill slot?!");
return fregLocForSlot(RI, fregGetSpill(RI, I));
}
static void regClearInst(RegInfo& RI, InstLoc I) {
if (RI.regs[EDI] == I) {
RI.regs[EDI] = 0;
}
if (RI.regs[ESI] == I) {
RI.regs[ESI] = 0;
}
if (RI.regs[EBP] == I) {
RI.regs[EBP] = 0;
}
if (RI.regs[EBX] == I) {
RI.regs[EBX] = 0;
}
if (RI.regs[EDX] == I) {
RI.regs[EDX] = 0;
}
if (RI.regs[EAX] == I) {
RI.regs[EAX] = 0;
}
if (RI.regs[ECX] == I) {
RI.regs[ECX] = 0;
}
for (unsigned i = 0; i < RegAllocSize; i++)
if (RI.regs[RegAllocOrder[i]] == I)
RI.regs[RegAllocOrder[i]] = 0;
}
static void fregClearInst(RegInfo& RI, InstLoc I) {
for (unsigned i = 0; i < FRegAllocSize; i++)
if (RI.fregs[FRegAllocOrder[i]] == I)
RI.fregs[FRegAllocOrder[i]] = 0;
}
static X64Reg regEnsureInReg(RegInfo& RI, InstLoc I) {
@ -645,6 +681,20 @@ static X64Reg regBinLHSReg(RegInfo& RI, InstLoc I) {
return reg;
}
static void regNormalRegClear(RegInfo& RI, InstLoc I) {
if (RI.IInfo[I - RI.FirstI] & 4)
regClearInst(RI, getOp1(I));
if (RI.IInfo[I - RI.FirstI] & 8)
regClearInst(RI, getOp2(I));
}
static void fregNormalRegClear(RegInfo& RI, InstLoc I) {
if (RI.IInfo[I - RI.FirstI] & 4)
fregClearInst(RI, getOp1(I));
if (RI.IInfo[I - RI.FirstI] & 8)
fregClearInst(RI, getOp2(I));
}
static void regEmitBinInst(RegInfo& RI, InstLoc I,
void (Jit64::*op)(int, const OpArg&,
const OpArg&)) {
@ -660,11 +710,11 @@ static void regEmitBinInst(RegInfo& RI, InstLoc I,
(RI.Jit->*op)(32, R(reg), regLocForInst(RI, getOp2(I)));
}
RI.regs[reg] = I;
regNormalRegClear(RI, I);
}
// Mark and calculation routines for profiled load/store addresses
// Could be extended to unprofiled addresses.
// FIXME: Finish/activate!
static void regMarkMemAddress(RegInfo& RI, InstLoc I, InstLoc AI, unsigned OpNum) {
if (isImm(*AI)) {
unsigned addr = RI.Build->GetImmValue(AI);
@ -743,7 +793,6 @@ static OpArg regBuildMemAddress(RegInfo& RI, InstLoc I, InstLoc AI,
}
return MDisp(baseReg, offset);
}
// end FIXME
static void regEmitMemLoad(RegInfo& RI, InstLoc I, unsigned Size) {
if (RI.UseProfile) {
@ -844,7 +893,6 @@ static void regEmitMemStore(RegInfo& RI, InstLoc I, unsigned Size) {
RI.Jit->js.fifoBytesThisBlock += Size >> 3;
if (RI.IInfo[I - RI.FirstI] & 4)
regClearInst(RI, getOp1(I));
//regBuildMemAddress(RI, I, getOp2(I), 2, Size, 0, false);
regClearDeadMemAddress(RI, I, getOp2(I), 2);
return;
}
@ -878,6 +926,7 @@ static void regEmitShiftInst(RegInfo& RI, InstLoc I,
RI.Jit->MOV(32, R(ECX), regLocForInst(RI, getOp2(I)));
(RI.Jit->*op)(32, R(reg), R(ECX));
RI.regs[reg] = I;
regNormalRegClear(RI, I);
}
static void regStoreInstToConstLoc(RegInfo& RI, unsigned width, InstLoc I,
@ -930,7 +979,7 @@ static void DoWriteCode(IRBuilder* ibuild, Jit64* Jit, bool UseProfile) {
RegInfo RI(Jit, ibuild->getFirstInst(), ibuild->getNumInsts());
RI.Build = ibuild;
RI.UseProfile = UseProfile;
RI.MakeProfile = !RI.UseProfile;
RI.MakeProfile = false;//!RI.UseProfile;
// Pass to compute liveness
ibuild->StartBackPass();
for (unsigned index = RI.IInfo.size() - 1; index != -1U; --index) {
@ -949,12 +998,14 @@ static void DoWriteCode(IRBuilder* ibuild, Jit64* Jit, bool UseProfile) {
case LoadCarry:
case LoadCTR:
case LoadMSR:
case LoadFReg:
case BlockEnd:
case BlockStart:
case InterpreterFallback:
case SystemCall:
case RFIExit:
case InterpreterBranch:
case IdleLoop:
// No liveness effects
break;
case Tramp:
@ -965,6 +1016,9 @@ static void DoWriteCode(IRBuilder* ibuild, Jit64* Jit, bool UseProfile) {
case SExt16:
case BSwap32:
case BSwap16:
case DupSingleToMReg:
case DoubleToSingle:
case ExpandPackedToMReg:
if (thisUsed)
regMarkUse(RI, I, getOp1(I), 1);
break;
@ -973,6 +1027,10 @@ static void DoWriteCode(IRBuilder* ibuild, Jit64* Jit, bool UseProfile) {
case Load32:
regMarkMemAddress(RI, I, getOp1(I), 1);
break;
case LoadSingle:
case LoadPaired:
regMarkUse(RI, I, getOp1(I), 1);
break;
case StoreCR:
case StoreCarry:
regMarkUse(RI, I, getOp1(I), 1);
@ -981,6 +1039,7 @@ static void DoWriteCode(IRBuilder* ibuild, Jit64* Jit, bool UseProfile) {
case StoreLink:
case StoreCTR:
case StoreMSR:
case StoreFReg:
if (!isImm(*getOp1(I)))
regMarkUse(RI, I, getOp1(I), 1);
break;
@ -1000,6 +1059,9 @@ static void DoWriteCode(IRBuilder* ibuild, Jit64* Jit, bool UseProfile) {
case ICmpUgt:
case ICmpSle:
case ICmpSgt:
case FSMul:
case FSAdd:
case InsertDoubleInMReg:
if (thisUsed) {
regMarkUse(RI, I, getOp1(I), 1);
if (!isImm(*getOp2(I)))
@ -1041,6 +1103,9 @@ static void DoWriteCode(IRBuilder* ibuild, Jit64* Jit, bool UseProfile) {
case InterpreterFallback: {
unsigned InstCode = ibuild->GetImmValue(getOp1(I));
unsigned InstLoc = ibuild->GetImmValue(getOp2(I));
// There really shouldn't be anything live across an
// interpreter call at the moment, but optimizing interpreter
// calls isn't completely out of the question...
regSpillCallerSaved(RI);
Jit->MOV(32, M(&PC), Imm32(InstLoc));
Jit->MOV(32, M(&NPC), Imm32(InstLoc+4));
@ -1089,6 +1154,7 @@ static void DoWriteCode(IRBuilder* ibuild, Jit64* Jit, bool UseProfile) {
unsigned ppcreg = *I >> 16;
regStoreInstToConstLoc(RI, 32, getOp1(I),
&PowerPC::ppcState.gpr[ppcreg]);
regNormalRegClear(RI, I);
break;
}
case StoreCR: {
@ -1096,18 +1162,22 @@ static void DoWriteCode(IRBuilder* ibuild, Jit64* Jit, bool UseProfile) {
unsigned ppcreg = *I >> 16;
// CAUTION: uses 8-bit reg!
Jit->MOV(8, M(&PowerPC::ppcState.cr_fast[ppcreg]), R(ECX));
regNormalRegClear(RI, I);
break;
}
case StoreLink: {
regStoreInstToConstLoc(RI, 32, getOp1(I), &LR);
regNormalRegClear(RI, I);
break;
}
case StoreCTR: {
regStoreInstToConstLoc(RI, 32, getOp1(I), &CTR);
regNormalRegClear(RI, I);
break;
}
case StoreMSR: {
regStoreInstToConstLoc(RI, 32, getOp1(I), &MSR);
regNormalRegClear(RI, I);
break;
}
case StoreCarry: {
@ -1118,6 +1188,7 @@ static void DoWriteCode(IRBuilder* ibuild, Jit64* Jit, bool UseProfile) {
Jit->SetJumpTarget(nocarry);
Jit->JitClearCA();
Jit->SetJumpTarget(cont);
regNormalRegClear(RI, I);
break;
}
case Load8: {
@ -1150,6 +1221,7 @@ static void DoWriteCode(IRBuilder* ibuild, Jit64* Jit, bool UseProfile) {
Jit->MOV(32, R(ECX), regLocForInst(RI, getOp1(I)));
Jit->MOVSX(32, 8, reg, R(ECX));
RI.regs[reg] = I;
regNormalRegClear(RI, I);
break;
}
case SExt16: {
@ -1157,6 +1229,7 @@ static void DoWriteCode(IRBuilder* ibuild, Jit64* Jit, bool UseProfile) {
X64Reg reg = regUReg(RI, I);
Jit->MOVSX(32, 16, reg, regLocForInst(RI, getOp1(I)));
RI.regs[reg] = I;
regNormalRegClear(RI, I);
break;
}
case And: {
@ -1199,6 +1272,7 @@ static void DoWriteCode(IRBuilder* ibuild, Jit64* Jit, bool UseProfile) {
Jit->IMUL(32, reg, regLocForInst(RI, getOp2(I)));
}
RI.regs[reg] = I;
regNormalRegClear(RI, I);
break;
}
case Rol: {
@ -1228,6 +1302,7 @@ static void DoWriteCode(IRBuilder* ibuild, Jit64* Jit, bool UseProfile) {
X64Reg reg = regFindFreeReg(RI);
Jit->MOVZX(32, 8, reg, R(ECX));
RI.regs[reg] = I;
regNormalRegClear(RI, I);
break;
}
case ICmpUgt: {
@ -1237,6 +1312,7 @@ static void DoWriteCode(IRBuilder* ibuild, Jit64* Jit, bool UseProfile) {
X64Reg reg = regFindFreeReg(RI);
Jit->MOVZX(32, 8, reg, R(ECX));
RI.regs[reg] = I;
regNormalRegClear(RI, I);
break;
}
case ICmpSle: {
@ -1246,6 +1322,7 @@ static void DoWriteCode(IRBuilder* ibuild, Jit64* Jit, bool UseProfile) {
X64Reg reg = regFindFreeReg(RI);
Jit->MOVZX(32, 8, reg, R(ECX));
RI.regs[reg] = I;
regNormalRegClear(RI, I);
break;
}
case ICmpCRUnsigned: {
@ -1264,6 +1341,7 @@ static void DoWriteCode(IRBuilder* ibuild, Jit64* Jit, bool UseProfile) {
Jit->SetJumpTarget(continue1);
Jit->SetJumpTarget(continue2);
RI.regs[reg] = I;
regNormalRegClear(RI, I);
break;
}
case ICmpCRSigned: {
@ -1282,6 +1360,102 @@ static void DoWriteCode(IRBuilder* ibuild, Jit64* Jit, bool UseProfile) {
Jit->SetJumpTarget(continue1);
Jit->SetJumpTarget(continue2);
RI.regs[reg] = I;
regNormalRegClear(RI, I);
break;
}
case LoadSingle: {
if (!thisUsed) break;
X64Reg reg = fregFindFreeReg(RI);
Jit->MOV(32, R(ECX), regLocForInst(RI, getOp1(I)));
RI.Jit->UnsafeLoadRegToReg(ECX, ECX, 32, 0, false);
Jit->MOVD_xmm(reg, R(ECX));
RI.fregs[reg] = I;
regNormalRegClear(RI, I);
break;
}
case LoadPaired: {
if (!thisUsed) break;
regSpill(RI, EAX);
regSpill(RI, EDX);
X64Reg reg = fregFindFreeReg(RI);
unsigned quantreg = *I >> 16;
Jit->MOVZX(32, 16, EAX, M(((char *)&PowerPC::ppcState.spr[SPR_GQR0 + quantreg]) + 2));
Jit->MOVZX(32, 8, EDX, R(AL));
// FIXME: Fix ModR/M encoding to allow [EDX*4+disp32]!
Jit->SHL(32, R(EDX), Imm8(2));
Jit->MOV(32, R(ECX), regLocForInst(RI, getOp1(I)));
Jit->CALLptr(MDisp(EDX, (u32)asm_routines.pairedLoadQuantized));
Jit->MOVAPD(reg, R(XMM0));
RI.fregs[reg] = I;
regNormalRegClear(RI, I);
break;
}
case DupSingleToMReg: {
if (!thisUsed) break;
X64Reg reg = fregFindFreeReg(RI);
Jit->CVTSS2SD(reg, fregLocForInst(RI, getOp1(I)));
Jit->MOVDDUP(reg, R(reg));
RI.fregs[reg] = I;
fregNormalRegClear(RI, I);
break;
}
case InsertDoubleInMReg: {
if (!thisUsed) break;
X64Reg reg = fregFindFreeReg(RI);
Jit->MOVAPD(reg, fregLocForInst(RI, getOp2(I)));
Jit->MOVAPD(XMM0, fregLocForInst(RI, getOp1(I)));
Jit->MOVSD(reg, R(XMM0));
RI.fregs[reg] = I;
fregNormalRegClear(RI, I);
break;
}
case ExpandPackedToMReg: {
if (!thisUsed) break;
X64Reg reg = fregFindFreeReg(RI);
Jit->CVTPS2PD(reg, fregLocForInst(RI, getOp1(I)));
RI.fregs[reg] = I;
fregNormalRegClear(RI, I);
break;
}
case LoadFReg: {
if (!thisUsed) break;
X64Reg reg = fregFindFreeReg(RI);
unsigned ppcreg = *I >> 8;
Jit->MOVAPD(reg, M(&PowerPC::ppcState.ps[ppcreg]));
RI.fregs[reg] = I;
break;
}
case StoreFReg: {
unsigned ppcreg = *I >> 16;
Jit->MOVAPD(XMM0, fregLocForInst(RI, getOp1(I)));
Jit->MOVAPD(M(&PowerPC::ppcState.ps[ppcreg]), XMM0);
fregNormalRegClear(RI, I);
break;
}
case DoubleToSingle: {
if (!thisUsed) break;
X64Reg reg = fregFindFreeReg(RI);
Jit->CVTSD2SS(reg, fregLocForInst(RI, getOp1(I)));
RI.fregs[reg] = I;
fregNormalRegClear(RI, I);
break;
}
case FSMul: {
if (!thisUsed) break;
X64Reg reg = fregFindFreeReg(RI);
Jit->MOVAPD(reg, fregLocForInst(RI, getOp1(I)));
Jit->MULSS(reg, fregLocForInst(RI, getOp2(I)));
RI.fregs[reg] = I;
fregNormalRegClear(RI, I);
break;
}
case FSAdd: {
if (!thisUsed) break;
X64Reg reg = fregFindFreeReg(RI);
Jit->MOVAPD(reg, fregLocForInst(RI, getOp1(I)));
Jit->ADDSS(reg, fregLocForInst(RI, getOp2(I)));
RI.fregs[reg] = I;
fregNormalRegClear(RI, I);
break;
}
case CInt32:
@ -1328,6 +1502,15 @@ static void DoWriteCode(IRBuilder* ibuild, Jit64* Jit, bool UseProfile) {
}
case BranchUncond: {
regWriteExit(RI, getOp1(I));
regNormalRegClear(RI, I);
break;
}
case IdleLoop: {
unsigned IdleParam = ibuild->GetImmValue(getOp1(I));
unsigned InstLoc = ibuild->GetImmValue(getOp2(I));
Jit->ABI_CallFunctionC((void *)&PowerPC::OnIdle, IdleParam);
Jit->MOV(32, M(&PowerPC::ppcState.pc), Imm32(InstLoc + 12));
Jit->JMP(asm_routines.testExceptions, true);
break;
}
case SystemCall: {
@ -1378,26 +1561,16 @@ static void DoWriteCode(IRBuilder* ibuild, Jit64* Jit, bool UseProfile) {
PanicAlert("Unknown JIT instruction; aborting!");
exit(1);
}
if (getOpcode(*I) != Tramp &&
getOpcode(*I) != BranchCond &&
getOpcode(*I) != Load8 &&
getOpcode(*I) != Load16 &&
getOpcode(*I) != Load32 &&
getOpcode(*I) != Store8 &&
getOpcode(*I) != Store16 &&
getOpcode(*I) != Store32 &&
1) {
if (RI.IInfo[I - RI.FirstI] & 4)
regClearInst(RI, getOp1(I));
if (RI.IInfo[I - RI.FirstI] & 8)
regClearInst(RI, getOp2(I));
}
}
for (unsigned i = 0; i < 8; i++) {
if (RI.regs[i]) {
PanicAlert("Incomplete cleanup!");
exit(1);
}
if (RI.fregs[i]) {
PanicAlert("Incomplete cleanup!");
exit(1);
}
}
if (UseProfile && RI.numSpills)
@ -1412,8 +1585,8 @@ void Jit64::WriteCode() {
void ProfiledReJit() {
u8* x = (u8*)jit.GetCodePtr();
jit.SetCodePtr(jit.js.normalEntry);
jit.SetCodePtr(jit.js.rewriteStart);
DoWriteCode(&jit.ibuild, &jit, true);
jit.js.curBlock->codeSize = jit.GetCodePtr() - jit.js.normalEntry;
jit.js.curBlock->codeSize = jit.GetCodePtr() - jit.js.rewriteStart;
jit.SetCodePtr(x);
}


@ -80,6 +80,7 @@ namespace IREmitter {
Store16,
Store32,
BranchCond,
#if 0
// Floating-point
// There are three floating-point formats: single, double,
// and packed. For any operation where the format of the
@ -141,8 +142,18 @@ namespace IREmitter {
ForceToSingle,
ForceToDouble,
ForceToMReg,
LoadFPReg,
StoreFPReg,
#endif
LoadSingle,
LoadDouble,
LoadPaired, // This handles quantizers itself
DoubleToSingle,
DupSingleToMReg,
InsertDoubleInMReg,
ExpandPackedToMReg,
LoadFReg,
StoreFReg,
FSMul,
FSAdd,
// "Trinary" operators
// FIXME: Need to change representation!
@ -156,6 +167,7 @@ namespace IREmitter {
SystemCall,
RFIExit,
InterpreterBranch,
IdleLoop,
// "Opcode" representing a register too far away to
// reference directly; this is a size optimization
@ -365,6 +377,42 @@ namespace IREmitter {
InstLoc EmitRFIExit() {
return FoldZeroOp(RFIExit, 0);
}
InstLoc EmitIdleLoop(InstLoc idleParam, InstLoc pc) {
return FoldBiOp(IdleLoop, idleParam, pc);
}
InstLoc EmitLoadSingle(InstLoc addr) {
return FoldUOp(LoadSingle, addr);
}
InstLoc EmitLoadDouble(InstLoc addr) {
return FoldUOp(LoadDouble, addr);
}
InstLoc EmitLoadPaired(InstLoc addr, unsigned quantReg) {
return FoldUOp(LoadPaired, addr, quantReg);
}
InstLoc EmitLoadFReg(unsigned freg) {
return FoldZeroOp(LoadFReg, freg);
}
InstLoc EmitStoreFReg(InstLoc val, unsigned freg) {
return FoldUOp(StoreFReg, val, freg);
}
InstLoc EmitDupSingleToMReg(InstLoc val) {
return FoldUOp(DupSingleToMReg, val);
}
InstLoc EmitInsertDoubleInMReg(InstLoc val, InstLoc reg) {
return FoldBiOp(InsertDoubleInMReg, val, reg);
}
InstLoc EmitExpandPackedToMReg(InstLoc val) {
return FoldUOp(ExpandPackedToMReg, val);
}
InstLoc EmitFSMul(InstLoc op1, InstLoc op2) {
return FoldBiOp(FSMul, op1, op2);
}
InstLoc EmitFSAdd(InstLoc op1, InstLoc op2) {
return FoldBiOp(FSAdd, op1, op2);
}
InstLoc EmitDoubleToSingle(InstLoc op1) {
return FoldUOp(DoubleToSingle, op1);
}
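// Illustrative composition only (it mirrors the Jit64::lfs lowering elsewhere
// in this commit, assuming addr already holds the effective address):
//   InstLoc val = EmitDupSingleToMReg(EmitLoadSingle(addr));
//   EmitStoreFReg(val, inst.RD);
// i.e. load the 32-bit float, widen it to double and duplicate it into both
// halves of the paired-single register, then store it to the guest FPR.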
void StartBackPass() { curReadPtr = &InstList[InstList.size()]; }
void StartForwardPass() { curReadPtr = &InstList[0]; }


@ -420,12 +420,11 @@ namespace CPUCompare
SetJumpTarget(skip);
const u8 *normalEntry = GetCodePtr();
js.normalEntry = (u8*)normalEntry;
if (ImHereDebug)
ABI_CallFunction((void *)&ImHere); //Used to get a trace of the last few blocks before a crash, sometimes VERY useful
if (false && js.fpa.any)
if (js.fpa.any)
{
//This block uses FPU - needs to add FP exception bailout
TEST(32, M(&PowerPC::ppcState.msr), Imm32(1 << 13)); //Test FP enabled bit
@ -445,24 +444,10 @@ namespace CPUCompare
SetJumpTarget(b1);
}
// Conditionally add profiling code.
if (Profiler::g_ProfileBlocks) {
ADD(32, M(&b->runCount), Imm8(1));
#ifdef _WIN32
b->ticCounter.QuadPart = 0;
b->ticStart.QuadPart = 0;
b->ticStop.QuadPart = 0;
#else
//TODO
#endif
// get start tic
PROFILER_QUERY_PERFORMACE_COUNTER(&b->ticStart);
}
js.rewriteStart = (u8*)GetCodePtr();
//Start up the register allocators
//They use the information in gpa/fpa to preload commonly used registers.
//gpr.Start(js.gpa);
//fpr.Start(js.fpa);
// Start up IR builder (structure that collects the
// instruction processed by the JIT routines)
ibuild.Reset();
js.downcountAmount = js.st.numCycles + PatchEngine::GetSpeedhackCycles(em_address);
@ -519,6 +504,7 @@ namespace CPUCompare
break;
}
// Perform actual code generation
WriteCode();
b->flags = js.block_flags;


@ -95,7 +95,7 @@ private:
PPCAnalyst::BlockRegStats gpa;
PPCAnalyst::BlockRegStats fpa;
PPCAnalyst::CodeOp *op;
u8* normalEntry;
u8* rewriteStart;
JitBlock *curBlock;
};


@ -23,6 +23,7 @@
#include "../PowerPC.h"
#include "../../CoreTiming.h"
#include "MemoryUtil.h"
#include "CPUDetect.h"
#include "ABI.h"
#include "Jit.h"
@ -168,6 +169,176 @@ void AsmRoutineManager::Generate()
GenerateCommon();
}
const u8 GC_ALIGNED16(pbswapShuffle2x4[16]) = {3, 2, 1, 0, 7, 6, 5, 4, 8, 9, 10, 11, 12, 13, 14, 15};
const float m_quantizeTableS[] =
{
(1 << 0), (1 << 1), (1 << 2), (1 << 3),
(1 << 4), (1 << 5), (1 << 6), (1 << 7),
(1 << 8), (1 << 9), (1 << 10), (1 << 11),
(1 << 12), (1 << 13), (1 << 14), (1 << 15),
(1 << 16), (1 << 17), (1 << 18), (1 << 19),
(1 << 20), (1 << 21), (1 << 22), (1 << 23),
(1 << 24), (1 << 25), (1 << 26), (1 << 27),
(1 << 28), (1 << 29), (1 << 30), (1 << 31),
1.0 / (1ULL << 32), 1.0 / (1 << 31), 1.0 / (1 << 30), 1.0 / (1 << 29),
1.0 / (1 << 28), 1.0 / (1 << 27), 1.0 / (1 << 26), 1.0 / (1 << 25),
1.0 / (1 << 24), 1.0 / (1 << 23), 1.0 / (1 << 22), 1.0 / (1 << 21),
1.0 / (1 << 20), 1.0 / (1 << 19), 1.0 / (1 << 18), 1.0 / (1 << 17),
1.0 / (1 << 16), 1.0 / (1 << 15), 1.0 / (1 << 14), 1.0 / (1 << 13),
1.0 / (1 << 12), 1.0 / (1 << 11), 1.0 / (1 << 10), 1.0 / (1 << 9),
1.0 / (1 << 8), 1.0 / (1 << 7), 1.0 / (1 << 6), 1.0 / (1 << 5),
1.0 / (1 << 4), 1.0 / (1 << 3), 1.0 / (1 << 2), 1.0 / (1 << 1),
};
const float m_dequantizeTableS[] =
{
1.0 / (1 << 0), 1.0 / (1 << 1), 1.0 / (1 << 2), 1.0 / (1 << 3),
1.0 / (1 << 4), 1.0 / (1 << 5), 1.0 / (1 << 6), 1.0 / (1 << 7),
1.0 / (1 << 8), 1.0 / (1 << 9), 1.0 / (1 << 10), 1.0 / (1 << 11),
1.0 / (1 << 12), 1.0 / (1 << 13), 1.0 / (1 << 14), 1.0 / (1 << 15),
1.0 / (1 << 16), 1.0 / (1 << 17), 1.0 / (1 << 18), 1.0 / (1 << 19),
1.0 / (1 << 20), 1.0 / (1 << 21), 1.0 / (1 << 22), 1.0 / (1 << 23),
1.0 / (1 << 24), 1.0 / (1 << 25), 1.0 / (1 << 26), 1.0 / (1 << 27),
1.0 / (1 << 28), 1.0 / (1 << 29), 1.0 / (1 << 30), 1.0 / (1 << 31),
(1ULL << 32), (1 << 31), (1 << 30), (1 << 29),
(1 << 28), (1 << 27), (1 << 26), (1 << 25),
(1 << 24), (1 << 23), (1 << 22), (1 << 21),
(1 << 20), (1 << 19), (1 << 18), (1 << 17),
(1 << 16), (1 << 15), (1 << 14), (1 << 13),
(1 << 12), (1 << 11), (1 << 10), (1 << 9),
(1 << 8), (1 << 7), (1 << 6), (1 << 5),
(1 << 4), (1 << 3), (1 << 2), (1 << 1),
};
float psTemp[2];
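// Scalar sketch (illustration only, not part of this commit) of what the
// quantized-load stubs below compute per element: the GQR load-scale field
// selects a power-of-two factor from m_dequantizeTableS, and the integer
// read from memory is scaled by it.
static inline float DequantizeOne(int value, unsigned ldScale /* 0..63 */)
{
	return (float)value * m_dequantizeTableS[ldScale];
}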
void AsmRoutineManager::GenQuantizedLoads() {
const u8* loadPairedIllegal = AlignCode4();
UD2();
const u8* loadPairedFloat = AlignCode4();
if (cpu_info.bSSSE3) {
#ifdef _M_X64
MOVQ_xmm(XMM0, MComplex(RBX, RCX, 1, 0));
#else
AND(32, R(ECX), Imm32(Memory::MEMVIEW32_MASK));
MOVQ_xmm(XMM0, MDisp(ECX, (u32)Memory::base));
#endif
PSHUFB(XMM0, M((void *)pbswapShuffle2x4));
} else {
#ifdef _M_X64
MOV(64, R(RCX), MComplex(RBX, RCX, 1, 0));
BSWAP(64, RCX);
ROL(64, RCX, Imm8(32));
MOVQ_xmm(XMM0, R(RCX));
#else
#if 0
AND(32, R(ECX), Imm32(Memory::MEMVIEW32_MASK));
MOVQ_xmm(XMM0, MDisp(ECX, (u32)Memory::base));
PXOR(XMM1, R(XMM1));
PSHUFLW(XMM0, R(XMM0), 0xB1);
MOVAPD(XMM1, R(XMM0));
PSRLW(XMM0, 8);
PSLLW(XMM1, 8);
POR(XMM0, R(XMM1));
#else
AND(32, R(ECX), Imm32(Memory::MEMVIEW32_MASK));
MOV(32, R(EAX), MDisp(ECX, (u32)Memory::base));
BSWAP(32, EAX);
MOV(32, M(&psTemp[0]), R(RAX));
MOV(32, R(EAX), MDisp(ECX, (u32)Memory::base + 4));
BSWAP(32, EAX);
MOV(32, M(((float *)&psTemp[0]) + 1), R(RAX));
MOVQ_xmm(XMM0, M(&psTemp[0]));
#endif
#endif
}
RET();
const u8* loadPairedU8 = AlignCode4();
#ifdef _M_X64
MOVZX(32, 16, ECX, MComplex(RBX, RCX, 1, 0));
#else
AND(32, R(ECX), Imm32(Memory::MEMVIEW32_MASK));
MOVZX(32, 16, ECX, MDisp(ECX, (u32)Memory::base));
#endif
MOVD_xmm(XMM0, R(ECX));
PXOR(XMM1, R(XMM1));
PUNPCKLBW(XMM0, R(XMM1));
PUNPCKLWD(XMM0, R(XMM1));
CVTDQ2PS(XMM0, R(XMM0));
SHR(32, R(EAX), Imm8(6));
MOVSS(XMM1, MDisp(EAX, (u32)m_dequantizeTableS));
PUNPCKLDQ(XMM1, R(XMM1));
MULPS(XMM0, R(XMM1));
RET();
const u8* loadPairedS8 = AlignCode4();
#ifdef _M_X64
MOVZX(32, 16, ECX, MComplex(RBX, RCX, 1, 0));
#else
AND(32, R(ECX), Imm32(Memory::MEMVIEW32_MASK));
MOVZX(32, 16, ECX, MDisp(ECX, (u32)Memory::base));
#endif
MOVD_xmm(XMM0, R(ECX));
PUNPCKLBW(XMM0, R(XMM0));
PUNPCKLWD(XMM0, R(XMM0));
PSRAD(XMM0, 24);
CVTDQ2PS(XMM0, R(XMM0));
SHR(32, R(EAX), Imm8(6));
MOVSS(XMM1, MDisp(EAX, (u32)m_dequantizeTableS));
PUNPCKLDQ(XMM1, R(XMM1));
MULPS(XMM0, R(XMM1));
RET();
const u8* loadPairedU16 = AlignCode4();
#ifdef _M_X64
MOV(32, R(ECX), MComplex(RBX, RCX, 1, 0));
#else
AND(32, R(ECX), Imm32(Memory::MEMVIEW32_MASK));
MOV(32, R(ECX), MDisp(ECX, (u32)Memory::base));
#endif
BSWAP(32, ECX);
ROL(32, R(ECX), Imm8(16));
MOVD_xmm(XMM0, R(ECX));
PXOR(XMM1, R(XMM1));
PUNPCKLWD(XMM0, R(XMM1));
CVTDQ2PS(XMM0, R(XMM0));
SHR(32, R(EAX), Imm8(6));
MOVSS(XMM1, MDisp(EAX, (u32)m_dequantizeTableS));
PUNPCKLDQ(XMM1, R(XMM1));
MULPS(XMM0, R(XMM1));
RET();
const u8* loadPairedS16 = AlignCode4();
#ifdef _M_X64
MOV(32, R(ECX), MComplex(RBX, RCX, 1, 0));
#else
AND(32, R(ECX), Imm32(Memory::MEMVIEW32_MASK));
MOV(32, R(ECX), MDisp(ECX, (u32)Memory::base));
#endif
BSWAP(32, ECX);
ROL(32, R(ECX), Imm8(16));
MOVD_xmm(XMM0, R(ECX));
PUNPCKLWD(XMM0, R(XMM0));
PSRAD(XMM0, 16);
CVTDQ2PS(XMM0, R(XMM0));
SHR(32, R(EAX), Imm8(6));
AND(32, R(EAX), Imm32(0xFC));
MOVSS(XMM1, MDisp(EAX, (u32)m_dequantizeTableS));
PUNPCKLDQ(XMM1, R(XMM1));
MULPS(XMM0, R(XMM1));
RET();
pairedLoadQuantized[0] = loadPairedFloat;
pairedLoadQuantized[1] = loadPairedIllegal;
pairedLoadQuantized[2] = loadPairedIllegal;
pairedLoadQuantized[3] = loadPairedIllegal;
pairedLoadQuantized[4] = loadPairedU8;
pairedLoadQuantized[5] = loadPairedU16;
pairedLoadQuantized[6] = loadPairedS8;
pairedLoadQuantized[7] = loadPairedS16;
}
void AsmRoutineManager::GenFifoWrite(int size)
{
@ -257,6 +428,8 @@ void AsmRoutineManager::GenerateCommon()
SUB(32, M(&CoreTiming::downcount), Imm8(0));
JMP(dispatcher, true);
GenQuantizedLoads();
computeRcFp = AlignCode16();
//CMPSD(R(XMM0), M(&zero),
// TODO


@ -42,6 +42,7 @@ private:
void GenFifoWrite(int size);
void GenFifoFloatWrite();
void GenFifoXmm64Write();
void GenQuantizedLoads();
public:
void Init() {
@ -80,6 +81,8 @@ public:
const u8 *doReJit;
const u8 *pairedLoadQuantized[8];
bool compareEnabled;
};


@ -39,6 +39,9 @@
// Zelda and many more games seem to pass the Acid Test.
//#define NORMALBRANCH_START Default(inst); ibuild.EmitInterpreterBranch(); return;
#define NORMALBRANCH_START
using namespace Gen;
void Jit64::sc(UGeckoInstruction inst)
@ -53,6 +56,7 @@ using namespace Gen;
void Jit64::bx(UGeckoInstruction inst)
{
NORMALBRANCH_START
if (inst.LK)
ibuild.EmitStoreLink(ibuild.EmitIntConst(js.compilerPC + 4));
@ -67,6 +71,7 @@ using namespace Gen;
void Jit64::bcx(UGeckoInstruction inst)
{
NORMALBRANCH_START
if (inst.LK)
ibuild.EmitStoreLink(
ibuild.EmitIntConst(js.compilerPC + 4));
@ -117,6 +122,7 @@ using namespace Gen;
void Jit64::bcctrx(UGeckoInstruction inst)
{
NORMALBRANCH_START
Default(inst);
ibuild.EmitInterpreterBranch();
return;
@ -124,6 +130,7 @@ using namespace Gen;
void Jit64::bclrx(UGeckoInstruction inst)
{
NORMALBRANCH_START
if (inst.hex == 0x4e800020) {
ibuild.EmitBranchUncond(ibuild.EmitLoadLink());
return;


@ -29,141 +29,54 @@
#define INSTRUCTION_START
// #define INSTRUCTION_START Default(inst); return;
const u64 GC_ALIGNED16(psSignBits2[2]) = {0x8000000000000000ULL, 0x8000000000000000ULL};
const u64 GC_ALIGNED16(psAbsMask2[2]) = {0x7FFFFFFFFFFFFFFFULL, 0x7FFFFFFFFFFFFFFFULL};
const double GC_ALIGNED16(psOneOne2[2]) = {1.0, 1.0};
void Jit64::fp_tri_op(int d, int a, int b, bool reversible, bool dupe, void (XEmitter::*op)(Gen::X64Reg, Gen::OpArg))
{
fpr.Lock(d, a, b);
if (d == a)
{
fpr.LoadToX64(d, true);
(this->*op)(fpr.RX(d), fpr.R(b));
}
else if (d == b && reversible)
{
fpr.LoadToX64(d, true);
(this->*op)(fpr.RX(d), fpr.R(a));
}
else if (a != d && b != d)
{
// Sources different from d, can use rather quick solution
fpr.LoadToX64(d, !dupe);
MOVSD(fpr.RX(d), fpr.R(a));
(this->*op)(fpr.RX(d), fpr.R(b));
}
else if (b != d)
{
fpr.LoadToX64(d, !dupe);
MOVSD(XMM0, fpr.R(b));
MOVSD(fpr.RX(d), fpr.R(a));
(this->*op)(fpr.RX(d), Gen::R(XMM0));
}
else // Other combo, must use two temps :(
{
MOVSD(XMM0, fpr.R(a));
MOVSD(XMM1, fpr.R(b));
fpr.LoadToX64(d, !dupe);
(this->*op)(XMM0, Gen::R(XMM1));
MOVSD(fpr.RX(d), Gen::R(XMM0));
}
if (dupe) {
ForceSinglePrecisionS(fpr.RX(d));
MOVDDUP(fpr.RX(d), fpr.R(d));
}
fpr.UnlockAll();
}
void Jit64::fp_arith_s(UGeckoInstruction inst)
{
if(Core::g_CoreStartupParameter.bJITOff || Core::g_CoreStartupParameter.bJITFloatingPointOff)
{Default(inst); return;} // turn off from debugger
INSTRUCTION_START;
if (inst.Rc) {
if (inst.Rc || inst.OPCD != 59 || inst.SUBOP5 != 25) {
Default(inst); return;
}
IREmitter::InstLoc val = ibuild.EmitLoadFReg(inst.FA);
val = ibuild.EmitDoubleToSingle(val);
bool dupe = inst.OPCD == 59;
switch (inst.SUBOP5)
{
case 18: fp_tri_op(inst.FD, inst.FA, inst.FB, false, dupe, &XEmitter::DIVSD); break; //div
case 20: fp_tri_op(inst.FD, inst.FA, inst.FB, false, dupe, &XEmitter::SUBSD); break; //sub
case 21: fp_tri_op(inst.FD, inst.FA, inst.FB, true, dupe, &XEmitter::ADDSD); break; //add
case 25: //mul
val = ibuild.EmitFSMul(val, ibuild.EmitDoubleToSingle(ibuild.EmitLoadFReg(inst.FC)));
case 18: //div
case 20: //sub
case 21: //add
case 23: //sel
Default(inst);
break;
case 24: //res
Default(inst);
break;
case 25: fp_tri_op(inst.FD, inst.FA, inst.FC, true, dupe, &XEmitter::MULSD); break; //mul
default:
_assert_msg_(DYNA_REC, 0, "fp_arith_s WTF!!!");
}
val = ibuild.EmitDupSingleToMReg(val);
ibuild.EmitStoreFReg(val, inst.FD);
}
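// Reference sketch (illustration only, not Dolphin code) of the value the IR
// sequence above computes for fmuls: both inputs are rounded to single
// precision, multiplied, and the single result is written to both halves of
// the destination paired-single register.
static void fmuls_reference(double fd[2], const double fa[2], const double fc[2])
{
	float prod = (float)fa[0] * (float)fc[0]; // DoubleToSingle + FSMul
	fd[0] = prod;                             // DupSingleToMReg duplicates the
	fd[1] = prod;                             // single into ps0 and ps1
}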
void Jit64::fmaddXX(UGeckoInstruction inst)
{
if(Core::g_CoreStartupParameter.bJITOff || Core::g_CoreStartupParameter.bJITFloatingPointOff)
{Default(inst); return;} // turn off from debugger
INSTRUCTION_START;
if (inst.Rc) {
if (inst.Rc || inst.OPCD != 59 || inst.SUBOP5 != 29) {
Default(inst); return;
}
bool single_precision = inst.OPCD == 59;
int a = inst.FA;
int b = inst.FB;
int c = inst.FC;
int d = inst.FD;
fpr.Lock(a, b, c, d);
MOVSD(XMM0, fpr.R(a));
switch (inst.SUBOP5)
{
case 28: //msub
MULSD(XMM0, fpr.R(c));
SUBSD(XMM0, fpr.R(b));
break;
case 29: //madd
MULSD(XMM0, fpr.R(c));
ADDSD(XMM0, fpr.R(b));
break;
case 30: //nmsub
MULSD(XMM0, fpr.R(c));
SUBSD(XMM0, fpr.R(b));
XORPD(XMM0, M((void*)&psSignBits2));
break;
case 31: //nmadd
MULSD(XMM0, fpr.R(c));
ADDSD(XMM0, fpr.R(b));
XORPD(XMM0, M((void*)&psSignBits2));
break;
}
fpr.LoadToX64(d, false);
//YES it is necessary to dupe the result :(
//TODO : analysis - does the top reg get used? If so, dupe, if not, don't.
if (single_precision) {
ForceSinglePrecisionS(XMM0);
MOVDDUP(fpr.RX(d), R(XMM0));
} else {
MOVSD(fpr.RX(d), R(XMM0));
}
fpr.UnlockAll();
IREmitter::InstLoc val = ibuild.EmitLoadFReg(inst.FA);
val = ibuild.EmitDoubleToSingle(val);
val = ibuild.EmitFSMul(val, ibuild.EmitDoubleToSingle(ibuild.EmitLoadFReg(inst.FC)));
val = ibuild.EmitFSAdd(val, ibuild.EmitDoubleToSingle(ibuild.EmitLoadFReg(inst.FB)));
val = ibuild.EmitDupSingleToMReg(val);
ibuild.EmitStoreFReg(val, inst.FD);
}
void Jit64::fmrx(UGeckoInstruction inst)
{
if(Core::g_CoreStartupParameter.bJITOff || Core::g_CoreStartupParameter.bJITFloatingPointOff)
{Default(inst); return;} // turn off from debugger
INSTRUCTION_START;
if (inst.Rc) {
Default(inst); return;
}
int d = inst.FD;
int b = inst.FB;
fpr.LoadToX64(d, true); // we don't want to destroy the high bit
MOVSD(fpr.RX(d), fpr.R(b));
IREmitter::InstLoc val = ibuild.EmitLoadFReg(inst.FB);
val = ibuild.EmitInsertDoubleInMReg(val, ibuild.EmitLoadFReg(inst.FD));
ibuild.EmitStoreFReg(val, inst.FD);
}
void Jit64::fcmpx(UGeckoInstruction inst)


@ -71,6 +71,20 @@ void Jit64::lhax(UGeckoInstruction inst)
void Jit64::lXz(UGeckoInstruction inst)
{
INSTRUCTION_START
if (Core::GetStartupParameter().bSkipIdle &&
inst.OPCD == 32 &&
(inst.hex & 0xFFFF0000) == 0x800D0000 &&
(Memory::ReadUnchecked_U32(js.compilerPC + 4) == 0x28000000 ||
(Core::GetStartupParameter().bWii && Memory::ReadUnchecked_U32(js.compilerPC + 4) == 0x2C000000)) &&
Memory::ReadUnchecked_U32(js.compilerPC + 8) == 0x4182fff8)
{
ibuild.EmitIdleLoop(ibuild.EmitIntConst(PowerPC::ppcState.gpr[inst.RA] + (s32)(s16)inst.SIMM_16),
ibuild.EmitIntConst(js.compilerPC));
js.compilerPC += 8;
return;
}
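// For reference, an illustrative restatement of the pattern matched above
// (the helper name is hypothetical, not a Dolphin API): the three guest words
// form the classic idle loop "lwz r0, simm(r13); cmplwi r0, 0; beq -8".
static bool LooksLikeIdleLoop(u32 load, u32 compare, u32 branch, bool wii)
{
	return (load & 0xFFFF0000) == 0x800D0000 &&   // lwz r0, simm(r13)
	       (compare == 0x28000000 ||               // cmplwi r0, 0
	        (wii && compare == 0x2C000000)) &&     // cmpwi r0, 0 (Wii only)
	       branch == 0x4182fff8;                   // beq -8, back to the lwz
}
// When the pattern matches, the block emits IdleLoop and advances compilerPC
// past the compare and branch instead of compiling the spin loop.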
IREmitter::InstLoc addr = ibuild.EmitIntConst(inst.SIMM_16);
if (inst.RA)
addr = ibuild.EmitAdd(addr, ibuild.EmitLoadGReg(inst.RA));

View File

@ -57,38 +57,12 @@ u32 GC_ALIGNED16(temp32);
void Jit64::lfs(UGeckoInstruction inst)
{
if (Core::g_CoreStartupParameter.bJITOff || Core::g_CoreStartupParameter.bJITLoadStoreFloatingOff)
{Default(inst); return;} // turn off from debugger
INSTRUCTION_START;
int d = inst.RD;
int a = inst.RA;
if (!a)
{
Default(inst);
return;
}
s32 offset = (s32)(s16)inst.SIMM_16;
gpr.FlushLockX(ABI_PARAM1);
gpr.Lock(a);
MOV(32, R(ABI_PARAM1), gpr.R(a));
if (jo.assumeFPLoadFromMem)
{
UnsafeLoadRegToReg(ABI_PARAM1, EAX, 32, offset, false);
}
else
{
SafeLoadRegToEAX(ABI_PARAM1, 32, offset);
}
MOV(32, M(&temp32), R(EAX));
fpr.Lock(d);
fpr.LoadToX64(d, false);
CVTSS2SD(fpr.RX(d), M(&temp32));
MOVDDUP(fpr.RX(d), fpr.R(d));
gpr.UnlockAll();
gpr.UnlockAllX();
fpr.UnlockAll();
IREmitter::InstLoc addr = ibuild.EmitIntConst(inst.SIMM_16), val;
if (inst.RA)
addr = ibuild.EmitAdd(addr, ibuild.EmitLoadGReg(inst.RA));
val = ibuild.EmitDupSingleToMReg(ibuild.EmitLoadSingle(addr));
ibuild.EmitStoreFReg(val, inst.RD);
return;
}
@ -291,32 +265,10 @@ void Jit64::stfsx(UGeckoInstruction inst)
void Jit64::lfsx(UGeckoInstruction inst)
{
if (Core::g_CoreStartupParameter.bJITOff || Core::g_CoreStartupParameter.bJITLoadStoreFloatingOff)
{Default(inst); return;} // turn off from debugger
INSTRUCTION_START;
fpr.Lock(inst.RS);
fpr.LoadToX64(inst.RS, false, true);
MOV(32, R(EAX), gpr.R(inst.RB));
IREmitter::InstLoc addr = ibuild.EmitLoadGReg(inst.RB), val;
if (inst.RA)
ADD(32, R(EAX), gpr.R(inst.RA));
if (cpu_info.bSSSE3) {
X64Reg r = fpr.R(inst.RS).GetSimpleReg();
#ifdef _M_IX86
AND(32, R(EAX), Imm32(Memory::MEMVIEW32_MASK));
MOVD_xmm(r, MDisp(EAX, (u32)Memory::base));
#else
MOVD_xmm(r, MComplex(RBX, EAX, SCALE_1, 0));
#endif
PSHUFB(r, M((void *)bswapShuffle1x4));
CVTSS2SD(r, R(r));
MOVDDUP(r, R(r));
} else {
UnsafeLoadRegToReg(EAX, EAX, 32, false);
MOV(32, M(&temp32), R(EAX));
CVTSS2SD(XMM0, M(&temp32));
MOVDDUP(fpr.R(inst.RS).GetSimpleReg(), R(XMM0));
}
fpr.UnlockAll();
addr = ibuild.EmitAdd(addr, ibuild.EmitLoadGReg(inst.RA));
val = ibuild.EmitDupSingleToMReg(ibuild.EmitLoadSingle(addr));
ibuild.EmitStoreFReg(val, inst.RD);
}


@ -40,419 +40,20 @@
#define INSTRUCTION_START
// #define INSTRUCTION_START Default(inst); return;
const u8 GC_ALIGNED16(pbswapShuffle2x4[16]) = {3, 2, 1, 0, 7, 6, 5, 4, 8, 9, 10, 11, 12, 13, 14, 15};
const u8 GC_ALIGNED16(pbswapShuffleNoop[16]) = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
static double GC_ALIGNED16(psTemp[2]) = {1.0, 1.0};
static u64 GC_ALIGNED16(temp64);
// TODO(ector): Improve 64-bit version
static void WriteDual32(u64 value, u32 address)
{
Memory::Write_U32((u32)(value >> 32), address);
Memory::Write_U32((u32)value, address + 4);
}
const double GC_ALIGNED16(m_quantizeTableD[]) =
{
(1 << 0), (1 << 1), (1 << 2), (1 << 3),
(1 << 4), (1 << 5), (1 << 6), (1 << 7),
(1 << 8), (1 << 9), (1 << 10), (1 << 11),
(1 << 12), (1 << 13), (1 << 14), (1 << 15),
(1 << 16), (1 << 17), (1 << 18), (1 << 19),
(1 << 20), (1 << 21), (1 << 22), (1 << 23),
(1 << 24), (1 << 25), (1 << 26), (1 << 27),
(1 << 28), (1 << 29), (1 << 30), (1 << 31),
1.0 / (1ULL << 32), 1.0 / (1 << 31), 1.0 / (1 << 30), 1.0 / (1 << 29),
1.0 / (1 << 28), 1.0 / (1 << 27), 1.0 / (1 << 26), 1.0 / (1 << 25),
1.0 / (1 << 24), 1.0 / (1 << 23), 1.0 / (1 << 22), 1.0 / (1 << 21),
1.0 / (1 << 20), 1.0 / (1 << 19), 1.0 / (1 << 18), 1.0 / (1 << 17),
1.0 / (1 << 16), 1.0 / (1 << 15), 1.0 / (1 << 14), 1.0 / (1 << 13),
1.0 / (1 << 12), 1.0 / (1 << 11), 1.0 / (1 << 10), 1.0 / (1 << 9),
1.0 / (1 << 8), 1.0 / (1 << 7), 1.0 / (1 << 6), 1.0 / (1 << 5),
1.0 / (1 << 4), 1.0 / (1 << 3), 1.0 / (1 << 2), 1.0 / (1 << 1),
};
const double GC_ALIGNED16(m_dequantizeTableD[]) =
{
1.0 / (1 << 0), 1.0 / (1 << 1), 1.0 / (1 << 2), 1.0 / (1 << 3),
1.0 / (1 << 4), 1.0 / (1 << 5), 1.0 / (1 << 6), 1.0 / (1 << 7),
1.0 / (1 << 8), 1.0 / (1 << 9), 1.0 / (1 << 10), 1.0 / (1 << 11),
1.0 / (1 << 12), 1.0 / (1 << 13), 1.0 / (1 << 14), 1.0 / (1 << 15),
1.0 / (1 << 16), 1.0 / (1 << 17), 1.0 / (1 << 18), 1.0 / (1 << 19),
1.0 / (1 << 20), 1.0 / (1 << 21), 1.0 / (1 << 22), 1.0 / (1 << 23),
1.0 / (1 << 24), 1.0 / (1 << 25), 1.0 / (1 << 26), 1.0 / (1 << 27),
1.0 / (1 << 28), 1.0 / (1 << 29), 1.0 / (1 << 30), 1.0 / (1 << 31),
(1ULL << 32), (1 << 31), (1 << 30), (1 << 29),
(1 << 28), (1 << 27), (1 << 26), (1 << 25),
(1 << 24), (1 << 23), (1 << 22), (1 << 21),
(1 << 20), (1 << 19), (1 << 18), (1 << 17),
(1 << 16), (1 << 15), (1 << 14), (1 << 13),
(1 << 12), (1 << 11), (1 << 10), (1 << 9),
(1 << 8), (1 << 7), (1 << 6), (1 << 5),
(1 << 4), (1 << 3), (1 << 2), (1 << 1),
};
// The big problem is likely instructions that set the quantizers in the same block.
// We will have to break block after quantizers are written to.
void Jit64::psq_st(UGeckoInstruction inst)
{
if(Core::g_CoreStartupParameter.bJITOff || Core::g_CoreStartupParameter.bJITLoadStorePairedOff)
{Default(inst); return;} // turn off from debugger
INSTRUCTION_START;
js.block_flags |= BLOCK_USE_GQR0 << inst.I;
if (js.blockSetsQuantizers || !Core::GetStartupParameter().bOptimizeQuantizers)
{
Default(inst);
return;
}
if (!inst.RA)
{
// This really should never happen. Unless we change this to also support stwux
Default(inst);
return;
}
const UGQR gqr(rSPR(SPR_GQR0 + inst.I));
const EQuantizeType stType = static_cast<EQuantizeType>(gqr.ST_TYPE);
int stScale = gqr.ST_SCALE;
bool update = inst.OPCD == 61;
int offset = inst.SIMM_12;
int a = inst.RA;
int s = inst.RS; // Fp numbers
if (inst.W) {
// PanicAlert("W=1: stType %i stScale %i update %i", (int)stType, (int)stScale, (int)update);
// It's fairly common that games write stuff to the pipe using this. Then, it's pretty much only
// floats so that's what we'll work on.
switch (stType)
{
case QUANTIZE_FLOAT:
{
// This one has quite a bit of optimization potential.
if (gpr.R(a).IsImm())
{
PanicAlert("Imm: %08x", gpr.R(a).offset);
}
gpr.FlushLockX(ABI_PARAM1, ABI_PARAM2);
gpr.Lock(a);
fpr.Lock(s);
if (update)
gpr.LoadToX64(a, true, true);
MOV(32, R(ABI_PARAM2), gpr.R(a));
if (offset)
ADD(32, R(ABI_PARAM2), Imm32((u32)offset));
TEST(32, R(ABI_PARAM2), Imm32(0x0C000000));
if (update && offset)
MOV(32, gpr.R(a), R(ABI_PARAM2));
CVTSD2SS(XMM0, fpr.R(s));
MOVD_xmm(M(&temp64), XMM0);
MOV(32, R(ABI_PARAM1), M(&temp64));
FixupBranch argh = J_CC(CC_NZ);
BSWAP(32, ABI_PARAM1);
#ifdef _M_X64
MOV(32, MComplex(RBX, ABI_PARAM2, SCALE_1, 0), R(ABI_PARAM1));
#else
MOV(32, R(EAX), R(ABI_PARAM2));
AND(32, R(EAX), Imm32(Memory::MEMVIEW32_MASK));
MOV(32, MDisp(EAX, (u32)Memory::base), R(ABI_PARAM1));
#endif
FixupBranch skip_call = J();
SetJumpTarget(argh);
ABI_CallFunctionRR(thunks.ProtectFunction((void *)&Memory::Write_U32, 2), ABI_PARAM1, ABI_PARAM2);
SetJumpTarget(skip_call);
gpr.UnlockAll();
gpr.UnlockAllX();
fpr.UnlockAll();
return;
}
default:
Default(inst);
return;
}
return;
}
if (stType == QUANTIZE_FLOAT)
{
if (gpr.R(a).IsImm() && !update && cpu_info.bSSSE3)
{
u32 addr = (u32)(gpr.R(a).offset + offset);
if (addr == 0xCC008000) {
// Writing to FIFO. Let's do fast method.
CVTPD2PS(XMM0, fpr.R(s));
PSHUFB(XMM0, M((void*)&pbswapShuffle2x4));
CALL((void*)asm_routines.fifoDirectWriteXmm64);
js.fifoBytesThisBlock += 8;
return;
}
}
gpr.FlushLockX(ABI_PARAM1, ABI_PARAM2);
gpr.Lock(a);
fpr.Lock(s);
if (update)
gpr.LoadToX64(a, true, true);
MOV(32, R(ABI_PARAM2), gpr.R(a));
if (offset)
ADD(32, R(ABI_PARAM2), Imm32((u32)offset));
TEST(32, R(ABI_PARAM2), Imm32(0x0C000000));
if (update && offset)
MOV(32, gpr.R(a), R(ABI_PARAM2));
CVTPD2PS(XMM0, fpr.R(s));
SHUFPS(XMM0, R(XMM0), 1);
MOVQ_xmm(M(&temp64), XMM0);
#ifdef _M_X64
MOV(64, R(ABI_PARAM1), M(&temp64));
FixupBranch argh = J_CC(CC_NZ);
BSWAP(64, ABI_PARAM1);
MOV(64, MComplex(RBX, ABI_PARAM2, SCALE_1, 0), R(ABI_PARAM1));
FixupBranch arg2 = J();
SetJumpTarget(argh);
CALL(thunks.ProtectFunction((void *)&WriteDual32, 0));
#else
FixupBranch argh = J_CC(CC_NZ);
MOV(32, R(ABI_PARAM1), M(((char*)&temp64) + 4));
BSWAP(32, ABI_PARAM1);
AND(32, R(ABI_PARAM2), Imm32(Memory::MEMVIEW32_MASK));
MOV(32, MDisp(ABI_PARAM2, (u32)Memory::base), R(ABI_PARAM1));
MOV(32, R(ABI_PARAM1), M(&temp64));
BSWAP(32, ABI_PARAM1);
MOV(32, MDisp(ABI_PARAM2, 4+(u32)Memory::base), R(ABI_PARAM1));
FixupBranch arg2 = J();
SetJumpTarget(argh);
MOV(32, R(ABI_PARAM1), M(((char*)&temp64) + 4));
ABI_CallFunctionRR(thunks.ProtectFunction((void *)&Memory::Write_U32, 2), ABI_PARAM1, ABI_PARAM2);
MOV(32, R(ABI_PARAM1), M(((char*)&temp64)));
ADD(32, R(ABI_PARAM2), Imm32(4));
ABI_CallFunctionRR(thunks.ProtectFunction((void *)&Memory::Write_U32, 2), ABI_PARAM1, ABI_PARAM2);
#endif
SetJumpTarget(arg2);
gpr.UnlockAll();
gpr.UnlockAllX();
fpr.UnlockAll();
}
else if (stType == QUANTIZE_U8)
{
gpr.FlushLockX(ABI_PARAM1, ABI_PARAM2);
gpr.Lock(a);
fpr.Lock(s);
if (update)
gpr.LoadToX64(a, true, update);
MOV(32, R(ABI_PARAM2), gpr.R(a));
if (offset)
ADD(32, R(ABI_PARAM2), Imm32((u32)offset));
if (update && offset)
MOV(32, gpr.R(a), R(ABI_PARAM2));
MOVAPD(XMM0, fpr.R(s));
MOVDDUP(XMM1, M((void*)&m_quantizeTableD[stScale]));
MULPD(XMM0, R(XMM1));
CVTPD2DQ(XMM0, R(XMM0));
PACKSSDW(XMM0, R(XMM0));
PACKUSWB(XMM0, R(XMM0));
MOVD_xmm(M(&temp64), XMM0);
MOV(16, R(ABI_PARAM1), M(&temp64));
#ifdef _M_X64
MOV(16, MComplex(RBX, ABI_PARAM2, SCALE_1, 0), R(ABI_PARAM1));
#else
MOV(32, R(EAX), R(ABI_PARAM2));
AND(32, R(EAX), Imm32(Memory::MEMVIEW32_MASK));
MOV(16, MDisp(EAX, (u32)Memory::base), R(ABI_PARAM1));
#endif
if (update)
MOV(32, gpr.R(a), R(ABI_PARAM2));
gpr.UnlockAll();
gpr.UnlockAllX();
fpr.UnlockAll();
}
else if (stType == QUANTIZE_S16)
{
gpr.FlushLockX(ABI_PARAM1, ABI_PARAM2);
gpr.Lock(a);
fpr.Lock(s);
if (update)
gpr.LoadToX64(a, true, update);
MOV(32, R(ABI_PARAM2), gpr.R(a));
if (offset)
ADD(32, R(ABI_PARAM2), Imm32((u32)offset));
if (update)
MOV(32, gpr.R(a), R(ABI_PARAM2));
MOVAPD(XMM0, fpr.R(s));
MOVDDUP(XMM1, M((void*)&m_quantizeTableD[stScale]));
MULPD(XMM0, R(XMM1));
SHUFPD(XMM0, R(XMM0), 1);
CVTPD2DQ(XMM0, R(XMM0));
PACKSSDW(XMM0, R(XMM0));
MOVD_xmm(M(&temp64), XMM0);
MOV(32, R(ABI_PARAM1), M(&temp64));
BSWAP(32, ABI_PARAM1);
#ifdef _M_X64
MOV(32, MComplex(RBX, ABI_PARAM2, SCALE_1, 0), R(ABI_PARAM1));
#else
MOV(32, R(EAX), R(ABI_PARAM2));
AND(32, R(EAX), Imm32(Memory::MEMVIEW32_MASK));
MOV(32, MDisp(EAX, (u32)Memory::base), R(ABI_PARAM1));
#endif
gpr.UnlockAll();
gpr.UnlockAllX();
fpr.UnlockAll();
}
else {
// Dodger uses this.
// mario tennis
//PanicAlert("st %i:%i", stType, inst.W);
Default(inst);
}
Default(inst); return;
}
void Jit64::psq_l(UGeckoInstruction inst)
{
if(Core::g_CoreStartupParameter.bJITOff || Core::g_CoreStartupParameter.bJITLoadStorePairedOff)
{Default(inst); return;} // turn off from debugger
INSTRUCTION_START;
js.block_flags |= BLOCK_USE_GQR0 << inst.I;
if (js.blockSetsQuantizers || !Core::GetStartupParameter().bOptimizeQuantizers)
{
Default(inst);
return;
}
const UGQR gqr(rSPR(SPR_GQR0 + inst.I));
const EQuantizeType ldType = static_cast<EQuantizeType>(gqr.LD_TYPE);
int ldScale = gqr.LD_SCALE;
bool update = inst.OPCD == 57;
if (!inst.RA || inst.W)
{
// 0 1 during load
//PanicAlert("ld:%i %i", ldType, (int)inst.W);
Default(inst);
return;
}
int offset = inst.SIMM_12;
switch (ldType) {
case QUANTIZE_FLOAT: // We know this is from RAM, so we don't need to check the address.
{
#ifdef _M_X64
gpr.LoadToX64(inst.RA, true, update);
fpr.LoadToX64(inst.RS, false);
if (cpu_info.bSSSE3) {
X64Reg xd = fpr.R(inst.RS).GetSimpleReg();
MOVQ_xmm(xd, MComplex(RBX, gpr.R(inst.RA).GetSimpleReg(), 1, offset));
PSHUFB(xd, M((void *)pbswapShuffle2x4));
CVTPS2PD(xd, R(xd));
} else {
MOV(64, R(RAX), MComplex(RBX, gpr.R(inst.RA).GetSimpleReg(), 1, offset));
BSWAP(64, RAX);
MOV(64, M(&psTemp[0]), R(RAX));
X64Reg r = fpr.R(inst.RS).GetSimpleReg();
CVTPS2PD(r, M(&psTemp[0]));
SHUFPD(r, R(r), 1);
}
if (update && offset != 0)
ADD(32, gpr.R(inst.RA), Imm32(offset));
break;
#else
if (cpu_info.bSSSE3) {
gpr.LoadToX64(inst.RA, true, update);
fpr.LoadToX64(inst.RS, false);
X64Reg xd = fpr.R(inst.RS).GetSimpleReg();
MOV(32, R(EAX), gpr.R(inst.RA));
AND(32, R(EAX), Imm32(Memory::MEMVIEW32_MASK));
MOVQ_xmm(xd, MDisp(EAX, (u32)Memory::base + offset));
PSHUFB(xd, M((void *)pbswapShuffle2x4));
CVTPS2PD(xd, R(xd));
} else {
gpr.FlushLockX(ECX);
gpr.LoadToX64(inst.RA, true, update);
// This can probably be optimized somewhat.
LEA(32, ECX, MDisp(gpr.R(inst.RA).GetSimpleReg(), offset));
AND(32, R(ECX), Imm32(Memory::MEMVIEW32_MASK));
MOV(32, R(EAX), MDisp(ECX, (u32)Memory::base));
BSWAP(32, RAX);
MOV(32, M(&psTemp[0]), R(RAX));
MOV(32, R(EAX), MDisp(ECX, (u32)Memory::base + 4));
BSWAP(32, RAX);
MOV(32, M(((float *)&psTemp[0]) + 1), R(RAX));
fpr.LoadToX64(inst.RS, false, true);
X64Reg r = fpr.R(inst.RS).GetSimpleReg();
CVTPS2PD(r, M(&psTemp[0]));
gpr.UnlockAllX();
}
if (update && offset != 0)
ADD(32, gpr.R(inst.RA), Imm32(offset));
break;
#endif
}
case QUANTIZE_U8:
{
gpr.LoadToX64(inst.RA, true, update);
#ifdef _M_X64
MOVZX(32, 16, EAX, MComplex(RBX, gpr.R(inst.RA).GetSimpleReg(), 1, offset));
#else
LEA(32, EAX, MDisp(gpr.R(inst.RA).GetSimpleReg(), offset));
AND(32, R(EAX), Imm32(Memory::MEMVIEW32_MASK));
MOVZX(32, 16, EAX, MDisp(EAX, (u32)Memory::base));
#endif
MOV(32, M(&temp64), R(EAX));
MOVD_xmm(XMM0, M(&temp64));
// SSE4 optimization opportunity here.
PXOR(XMM1, R(XMM1));
PUNPCKLBW(XMM0, R(XMM1));
PUNPCKLWD(XMM0, R(XMM1));
CVTDQ2PD(XMM0, R(XMM0));
fpr.LoadToX64(inst.RS, false, true);
X64Reg r = fpr.R(inst.RS).GetSimpleReg();
MOVDDUP(r, M((void *)&m_dequantizeTableD[ldScale]));
MULPD(r, R(XMM0));
if (update && offset != 0)
ADD(32, gpr.R(inst.RA), Imm32(offset));
}
break;
case QUANTIZE_S16:
{
gpr.LoadToX64(inst.RA, true, update);
#ifdef _M_X64
MOV(32, R(EAX), MComplex(RBX, gpr.R(inst.RA).GetSimpleReg(), 1, offset));
#else
LEA(32, EAX, MDisp(gpr.R(inst.RA).GetSimpleReg(), offset));
AND(32, R(EAX), Imm32(Memory::MEMVIEW32_MASK));
MOV(32, R(EAX), MDisp(EAX, (u32)Memory::base));
#endif
BSWAP(32, EAX);
MOV(32, M(&temp64), R(EAX));
fpr.LoadToX64(inst.RS, false, true);
X64Reg r = fpr.R(inst.RS).GetSimpleReg();
MOVD_xmm(XMM0, M(&temp64));
PUNPCKLWD(XMM0, R(XMM0)); // unpack to higher word in each dword..
PSRAD(XMM0, 16); // then use this signed shift to sign extend. clever eh? :P
CVTDQ2PD(XMM0, R(XMM0));
MOVDDUP(r, M((void*)&m_dequantizeTableD[ldScale]));
MULPD(r, R(XMM0));
SHUFPD(r, R(r), 1);
if (update && offset != 0)
ADD(32, gpr.R(inst.RA), Imm32(offset));
}
break;
/*
Dynamic quantizer. Todo when we have a test set.
MOVZX(32, 8, EAX, M(((char *)&PowerPC::ppcState.spr[SPR_GQR0 + inst.I]) + 3)); // it's in the high byte.
AND(32, R(EAX), Imm8(0x3F));
MOV(32, R(ECX), Imm32((u32)&m_dequantizeTableD));
MOVDDUP(r, MComplex(RCX, EAX, 8, 0));
*/
default:
// 4 0
// 6 0 //power tennis
// 5 0
// PanicAlert("ld:%i %i", ldType, (int)inst.W);
Default(inst);
return;
}
//u32 EA = (m_GPR[_inst.RA] + _inst.SIMM_12) : _inst.SIMM_12;
if (inst.W) {Default(inst); return;}
IREmitter::InstLoc addr = ibuild.EmitIntConst(inst.SIMM_12), val;
if (inst.RA)
addr = ibuild.EmitAdd(addr, ibuild.EmitLoadGReg(inst.RA));
val = ibuild.EmitLoadPaired(addr, inst.I);
val = ibuild.EmitExpandPackedToMReg(val);
ibuild.EmitStoreFReg(val, inst.RD);
}