A bit more progress on my JIT WIP: the biggest change is some substantial work on floating-point.

git-svn-id: https://dolphin-emu.googlecode.com/svn/trunk@1743 8ced0084-cf51-0410-be5f-012b33b47a6e
magumagu9 2009-01-03 07:51:27 +00:00
parent 35128bb041
commit 29a033e1dd
11 changed files with 524 additions and 654 deletions


@ -97,8 +97,9 @@ Inter-block dead condition register elimination (Likely significant win
Optimize conditions for conditional branches.
General dead register elimination.
Inter-block inlining.
Track down a few correctness bugs (I think there's something wrong
with my branches, but I haven't been able to figure it out).
Track down issues with new JIT + dual-core mode (I think I'm going to
need help with this one; I'm not very familiar with the
dual-core code.)
Specialized slw/srw/sraw; I think there are some tricks that could
have a non-trivial effect, and there are significantly shorter
implementations for 64-bit involving abusing 64-bit shifts.
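A rough sketch of the 64-bit shift trick mentioned above (illustration only, not part of this commit): slw takes a 6-bit shift amount and must produce zero for amounts 32..63, which falls out naturally once the source is zero-extended to 64 bits, because every bit of the 32-bit value ends up above bit 31 and is discarded by the truncation.
#include <cstdint>
static uint32_t slw_reference(uint32_t rs, uint32_t rb)
{
	uint64_t wide = rs;                      // zero-extend the source
	return (uint32_t)(wide << (rb & 0x3F));  // counts >= 32 yield 0 after truncation
}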
@ -502,16 +503,21 @@ struct RegInfo {
InstLoc FirstI;
std::vector<unsigned> IInfo;
InstLoc regs[16];
InstLoc fregs[16];
unsigned numSpills;
unsigned numFSpills;
bool MakeProfile;
bool UseProfile;
unsigned numProfiledLoads;
unsigned exitNumber;
RegInfo(Jit64* j, InstLoc f, unsigned insts) : Jit(j), FirstI(f), IInfo(insts) {
for (unsigned i = 0; i < 16; i++)
for (unsigned i = 0; i < 16; i++) {
regs[i] = 0;
fregs[i] = 0;
}
numSpills = 0;
numFSpills = 0;
numProfiledLoads = 0;
exitNumber = 0;
MakeProfile = UseProfile = false;
@ -533,6 +539,7 @@ static unsigned regReadUse(RegInfo& R, InstLoc I) {
static unsigned SlotSet[1000];
static unsigned ProfiledLoads[1000];
static u8 GC_ALIGNED16(FSlotSet[16*1000]);
static OpArg regLocForSlot(RegInfo& RI, unsigned slot) {
return M(&SlotSet[slot - 1]);
@ -558,57 +565,86 @@ static void regSpill(RegInfo& RI, X64Reg reg) {
RI.regs[reg] = 0;
}
static OpArg fregLocForSlot(RegInfo& RI, unsigned slot) {
return M(&FSlotSet[slot*16]);
}
static unsigned fregCreateSpill(RegInfo& RI, InstLoc I) {
unsigned newSpill = ++RI.numFSpills;
RI.IInfo[I - RI.FirstI] |= newSpill << 16;
return newSpill;
}
static unsigned fregGetSpill(RegInfo& RI, InstLoc I) {
return RI.IInfo[I - RI.FirstI] >> 16;
}
static void fregSpill(RegInfo& RI, X64Reg reg) {
if (!RI.fregs[reg]) return;
unsigned slot = fregGetSpill(RI, RI.fregs[reg]);
if (!slot) {
slot = fregCreateSpill(RI, RI.fregs[reg]);
RI.Jit->MOVAPD(fregLocForSlot(RI, slot), reg);
}
RI.fregs[reg] = 0;
}
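// Note (inferred from the surrounding code, not stated in the original): each
// IInfo entry packs several facts about an instruction -- bit 2 (mask 4) means
// operand 1 dies here, bit 3 (mask 8) means operand 2 dies here, and the upper
// 16 bits hold the spill-slot index assigned on demand by regCreateSpill /
// fregCreateSpill (0 meaning the value has no spill slot yet).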
// ECX is scratch, so we don't allocate it
static X64Reg RegAllocOrder[] = {EDI, ESI, EBP, EBX, EDX, EAX};
static unsigned RegAllocSize = sizeof(RegAllocOrder) / sizeof(X64Reg);
static X64Reg FRegAllocOrder[] = {XMM2, XMM3, XMM4, XMM5, XMM6, XMM7};
static unsigned FRegAllocSize = sizeof(FRegAllocOrder) / sizeof(X64Reg);
static X64Reg regFindFreeReg(RegInfo& RI) {
if (RI.regs[EDI] == 0) return EDI;
if (RI.regs[ESI] == 0) return ESI;
if (RI.regs[EBP] == 0) return EBP;
if (RI.regs[EBX] == 0) return EBX;
if (RI.regs[EDX] == 0) return EDX;
if (RI.regs[EAX] == 0) return EAX;
// ECX is scratch, so we don't allocate it
static X64Reg regs[] = {EDI, ESI, EBP, EBX, EDX, EAX};
for (unsigned i = 0; i < RegAllocSize; i++)
if (RI.regs[RegAllocOrder[i]] == 0)
return RegAllocOrder[i];
static unsigned nextReg = 0;
X64Reg reg = regs[nextReg++ % 6];
X64Reg reg = RegAllocOrder[nextReg++ % RegAllocSize];
regSpill(RI, reg);
return reg;
}
static X64Reg fregFindFreeReg(RegInfo& RI) {
for (unsigned i = 0; i < FRegAllocSize; i++)
if (RI.fregs[FRegAllocOrder[i]] == 0)
return FRegAllocOrder[i];
// XMM0/1 are scratch, so we don't allocate them
fregSpill(RI, XMM7);
return XMM7;
}
static OpArg regLocForInst(RegInfo& RI, InstLoc I) {
if (RI.regs[EDI] == I) return R(EDI);
if (RI.regs[ESI] == I) return R(ESI);
if (RI.regs[EBP] == I) return R(EBP);
if (RI.regs[EBX] == I) return R(EBX);
if (RI.regs[EDX] == I) return R(EDX);
if (RI.regs[EAX] == I) return R(EAX);
if (RI.regs[ECX] == I) return R(ECX);
for (unsigned i = 0; i < RegAllocSize; i++)
if (RI.regs[RegAllocOrder[i]] == I)
return R(RegAllocOrder[i]);
if (regGetSpill(RI, I) == 0)
PanicAlert("Retrieving unknown spill slot?!");
return regLocForSlot(RI, regGetSpill(RI, I));
}
static OpArg fregLocForInst(RegInfo& RI, InstLoc I) {
for (unsigned i = 0; i < FRegAllocSize; i++)
if (RI.fregs[FRegAllocOrder[i]] == I)
return R(FRegAllocOrder[i]);
if (fregGetSpill(RI, I) == 0)
PanicAlert("Retrieving unknown spill slot?!");
return fregLocForSlot(RI, fregGetSpill(RI, I));
}
static void regClearInst(RegInfo& RI, InstLoc I) {
if (RI.regs[EDI] == I) {
RI.regs[EDI] = 0;
}
if (RI.regs[ESI] == I) {
RI.regs[ESI] = 0;
}
if (RI.regs[EBP] == I) {
RI.regs[EBP] = 0;
}
if (RI.regs[EBX] == I) {
RI.regs[EBX] = 0;
}
if (RI.regs[EDX] == I) {
RI.regs[EDX] = 0;
}
if (RI.regs[EAX] == I) {
RI.regs[EAX] = 0;
}
if (RI.regs[ECX] == I) {
RI.regs[ECX] = 0;
}
for (unsigned i = 0; i < RegAllocSize; i++)
if (RI.regs[RegAllocOrder[i]] == I)
RI.regs[RegAllocOrder[i]] = 0;
}
static void fregClearInst(RegInfo& RI, InstLoc I) {
for (unsigned i = 0; i < FRegAllocSize; i++)
if (RI.fregs[FRegAllocOrder[i]] == I)
RI.fregs[FRegAllocOrder[i]] = 0;
}
static X64Reg regEnsureInReg(RegInfo& RI, InstLoc I) {
@ -645,6 +681,20 @@ static X64Reg regBinLHSReg(RegInfo& RI, InstLoc I) {
return reg;
}
static void regNormalRegClear(RegInfo& RI, InstLoc I) {
if (RI.IInfo[I - RI.FirstI] & 4)
regClearInst(RI, getOp1(I));
if (RI.IInfo[I - RI.FirstI] & 8)
regClearInst(RI, getOp2(I));
}
static void fregNormalRegClear(RegInfo& RI, InstLoc I) {
if (RI.IInfo[I - RI.FirstI] & 4)
fregClearInst(RI, getOp1(I));
if (RI.IInfo[I - RI.FirstI] & 8)
fregClearInst(RI, getOp2(I));
}
static void regEmitBinInst(RegInfo& RI, InstLoc I,
void (Jit64::*op)(int, const OpArg&,
const OpArg&)) {
@ -660,11 +710,11 @@ static void regEmitBinInst(RegInfo& RI, InstLoc I,
(RI.Jit->*op)(32, R(reg), regLocForInst(RI, getOp2(I)));
}
RI.regs[reg] = I;
regNormalRegClear(RI, I);
}
// Mark and calculation routines for profiled load/store addresses
// Could be extended to unprofiled addresses.
// FIXME: Finish/activate!
static void regMarkMemAddress(RegInfo& RI, InstLoc I, InstLoc AI, unsigned OpNum) {
if (isImm(*AI)) {
unsigned addr = RI.Build->GetImmValue(AI);
@ -743,7 +793,6 @@ static OpArg regBuildMemAddress(RegInfo& RI, InstLoc I, InstLoc AI,
}
return MDisp(baseReg, offset);
}
// end FIXME
static void regEmitMemLoad(RegInfo& RI, InstLoc I, unsigned Size) {
if (RI.UseProfile) {
@ -844,7 +893,6 @@ static void regEmitMemStore(RegInfo& RI, InstLoc I, unsigned Size) {
RI.Jit->js.fifoBytesThisBlock += Size >> 3;
if (RI.IInfo[I - RI.FirstI] & 4)
regClearInst(RI, getOp1(I));
//regBuildMemAddress(RI, I, getOp2(I), 2, Size, 0, false);
regClearDeadMemAddress(RI, I, getOp2(I), 2);
return;
}
@ -878,6 +926,7 @@ static void regEmitShiftInst(RegInfo& RI, InstLoc I,
RI.Jit->MOV(32, R(ECX), regLocForInst(RI, getOp2(I)));
(RI.Jit->*op)(32, R(reg), R(ECX));
RI.regs[reg] = I;
regNormalRegClear(RI, I);
}
static void regStoreInstToConstLoc(RegInfo& RI, unsigned width, InstLoc I,
@ -930,7 +979,7 @@ static void DoWriteCode(IRBuilder* ibuild, Jit64* Jit, bool UseProfile) {
RegInfo RI(Jit, ibuild->getFirstInst(), ibuild->getNumInsts());
RI.Build = ibuild;
RI.UseProfile = UseProfile;
RI.MakeProfile = !RI.UseProfile;
RI.MakeProfile = false;//!RI.UseProfile;
// Pass to compute liveness
ibuild->StartBackPass();
for (unsigned index = RI.IInfo.size() - 1; index != -1U; --index) {
@ -949,12 +998,14 @@ static void DoWriteCode(IRBuilder* ibuild, Jit64* Jit, bool UseProfile) {
case LoadCarry:
case LoadCTR:
case LoadMSR:
case LoadFReg:
case BlockEnd:
case BlockStart:
case InterpreterFallback:
case SystemCall:
case RFIExit:
case InterpreterBranch:
case IdleLoop:
// No liveness effects
break;
case Tramp:
@ -965,6 +1016,9 @@ static void DoWriteCode(IRBuilder* ibuild, Jit64* Jit, bool UseProfile) {
case SExt16:
case BSwap32:
case BSwap16:
case DupSingleToMReg:
case DoubleToSingle:
case ExpandPackedToMReg:
if (thisUsed)
regMarkUse(RI, I, getOp1(I), 1);
break;
@ -973,6 +1027,10 @@ static void DoWriteCode(IRBuilder* ibuild, Jit64* Jit, bool UseProfile) {
case Load32:
regMarkMemAddress(RI, I, getOp1(I), 1);
break;
case LoadSingle:
case LoadPaired:
regMarkUse(RI, I, getOp1(I), 1);
break;
case StoreCR:
case StoreCarry:
regMarkUse(RI, I, getOp1(I), 1);
@ -981,6 +1039,7 @@ static void DoWriteCode(IRBuilder* ibuild, Jit64* Jit, bool UseProfile) {
case StoreLink:
case StoreCTR:
case StoreMSR:
case StoreFReg:
if (!isImm(*getOp1(I)))
regMarkUse(RI, I, getOp1(I), 1);
break;
@ -1000,6 +1059,9 @@ static void DoWriteCode(IRBuilder* ibuild, Jit64* Jit, bool UseProfile) {
case ICmpUgt:
case ICmpSle:
case ICmpSgt:
case FSMul:
case FSAdd:
case InsertDoubleInMReg:
if (thisUsed) {
regMarkUse(RI, I, getOp1(I), 1);
if (!isImm(*getOp2(I)))
@ -1041,6 +1103,9 @@ static void DoWriteCode(IRBuilder* ibuild, Jit64* Jit, bool UseProfile) {
case InterpreterFallback: {
unsigned InstCode = ibuild->GetImmValue(getOp1(I));
unsigned InstLoc = ibuild->GetImmValue(getOp2(I));
// There really shouldn't be anything live across an
// interpreter call at the moment, but optimizing interpreter
// calls isn't completely out of the question...
regSpillCallerSaved(RI);
Jit->MOV(32, M(&PC), Imm32(InstLoc));
Jit->MOV(32, M(&NPC), Imm32(InstLoc+4));
@ -1089,6 +1154,7 @@ static void DoWriteCode(IRBuilder* ibuild, Jit64* Jit, bool UseProfile) {
unsigned ppcreg = *I >> 16;
regStoreInstToConstLoc(RI, 32, getOp1(I),
&PowerPC::ppcState.gpr[ppcreg]);
regNormalRegClear(RI, I);
break;
}
case StoreCR: {
@ -1096,18 +1162,22 @@ static void DoWriteCode(IRBuilder* ibuild, Jit64* Jit, bool UseProfile) {
unsigned ppcreg = *I >> 16;
// CAUTION: uses 8-bit reg!
Jit->MOV(8, M(&PowerPC::ppcState.cr_fast[ppcreg]), R(ECX));
regNormalRegClear(RI, I);
break;
}
case StoreLink: {
regStoreInstToConstLoc(RI, 32, getOp1(I), &LR);
regNormalRegClear(RI, I);
break;
}
case StoreCTR: {
regStoreInstToConstLoc(RI, 32, getOp1(I), &CTR);
regNormalRegClear(RI, I);
break;
}
case StoreMSR: {
regStoreInstToConstLoc(RI, 32, getOp1(I), &MSR);
regNormalRegClear(RI, I);
break;
}
case StoreCarry: {
@ -1118,6 +1188,7 @@ static void DoWriteCode(IRBuilder* ibuild, Jit64* Jit, bool UseProfile) {
Jit->SetJumpTarget(nocarry);
Jit->JitClearCA();
Jit->SetJumpTarget(cont);
regNormalRegClear(RI, I);
break;
}
case Load8: {
@ -1150,6 +1221,7 @@ static void DoWriteCode(IRBuilder* ibuild, Jit64* Jit, bool UseProfile) {
Jit->MOV(32, R(ECX), regLocForInst(RI, getOp1(I)));
Jit->MOVSX(32, 8, reg, R(ECX));
RI.regs[reg] = I;
regNormalRegClear(RI, I);
break;
}
case SExt16: {
@ -1157,6 +1229,7 @@ static void DoWriteCode(IRBuilder* ibuild, Jit64* Jit, bool UseProfile) {
X64Reg reg = regUReg(RI, I);
Jit->MOVSX(32, 16, reg, regLocForInst(RI, getOp1(I)));
RI.regs[reg] = I;
regNormalRegClear(RI, I);
break;
}
case And: {
@ -1199,6 +1272,7 @@ static void DoWriteCode(IRBuilder* ibuild, Jit64* Jit, bool UseProfile) {
Jit->IMUL(32, reg, regLocForInst(RI, getOp2(I)));
}
RI.regs[reg] = I;
regNormalRegClear(RI, I);
break;
}
case Rol: {
@ -1228,6 +1302,7 @@ static void DoWriteCode(IRBuilder* ibuild, Jit64* Jit, bool UseProfile) {
X64Reg reg = regFindFreeReg(RI);
Jit->MOVZX(32, 8, reg, R(ECX));
RI.regs[reg] = I;
regNormalRegClear(RI, I);
break;
}
case ICmpUgt: {
@ -1237,6 +1312,7 @@ static void DoWriteCode(IRBuilder* ibuild, Jit64* Jit, bool UseProfile) {
X64Reg reg = regFindFreeReg(RI);
Jit->MOVZX(32, 8, reg, R(ECX));
RI.regs[reg] = I;
regNormalRegClear(RI, I);
break;
}
case ICmpSle: {
@ -1246,6 +1322,7 @@ static void DoWriteCode(IRBuilder* ibuild, Jit64* Jit, bool UseProfile) {
X64Reg reg = regFindFreeReg(RI);
Jit->MOVZX(32, 8, reg, R(ECX));
RI.regs[reg] = I;
regNormalRegClear(RI, I);
break;
}
case ICmpCRUnsigned: {
@ -1264,6 +1341,7 @@ static void DoWriteCode(IRBuilder* ibuild, Jit64* Jit, bool UseProfile) {
Jit->SetJumpTarget(continue1);
Jit->SetJumpTarget(continue2);
RI.regs[reg] = I;
regNormalRegClear(RI, I);
break;
}
case ICmpCRSigned: {
@ -1282,6 +1360,102 @@ static void DoWriteCode(IRBuilder* ibuild, Jit64* Jit, bool UseProfile) {
Jit->SetJumpTarget(continue1);
Jit->SetJumpTarget(continue2);
RI.regs[reg] = I;
regNormalRegClear(RI, I);
break;
}
case LoadSingle: {
if (!thisUsed) break;
X64Reg reg = fregFindFreeReg(RI);
Jit->MOV(32, R(ECX), regLocForInst(RI, getOp1(I)));
RI.Jit->UnsafeLoadRegToReg(ECX, ECX, 32, 0, false);
Jit->MOVD_xmm(reg, R(ECX));
RI.fregs[reg] = I;
regNormalRegClear(RI, I);
break;
}
case LoadPaired: {
if (!thisUsed) break;
regSpill(RI, EAX);
regSpill(RI, EDX);
X64Reg reg = fregFindFreeReg(RI);
unsigned quantreg = *I >> 16;
Jit->MOVZX(32, 16, EAX, M(((char *)&PowerPC::ppcState.spr[SPR_GQR0 + quantreg]) + 2));
Jit->MOVZX(32, 8, EDX, R(AL));
// FIXME: Fix ModR/M encoding to allow [EDX*4+disp32]!
Jit->SHL(32, R(EDX), Imm8(2));
Jit->MOV(32, R(ECX), regLocForInst(RI, getOp1(I)));
Jit->CALLptr(MDisp(EDX, (u32)asm_routines.pairedLoadQuantized));
Jit->MOVAPD(reg, R(XMM0));
RI.fregs[reg] = I;
regNormalRegClear(RI, I);
break;
}
case DupSingleToMReg: {
if (!thisUsed) break;
X64Reg reg = fregFindFreeReg(RI);
Jit->CVTSS2SD(reg, fregLocForInst(RI, getOp1(I)));
Jit->MOVDDUP(reg, R(reg));
RI.fregs[reg] = I;
fregNormalRegClear(RI, I);
break;
}
case InsertDoubleInMReg: {
if (!thisUsed) break;
X64Reg reg = fregFindFreeReg(RI);
Jit->MOVAPD(reg, fregLocForInst(RI, getOp2(I)));
Jit->MOVAPD(XMM0, fregLocForInst(RI, getOp1(I)));
Jit->MOVSD(reg, R(XMM0));
RI.fregs[reg] = I;
fregNormalRegClear(RI, I);
break;
}
case ExpandPackedToMReg: {
if (!thisUsed) break;
X64Reg reg = fregFindFreeReg(RI);
Jit->CVTPS2PD(reg, fregLocForInst(RI, getOp1(I)));
RI.fregs[reg] = I;
fregNormalRegClear(RI, I);
break;
}
case LoadFReg: {
if (!thisUsed) break;
X64Reg reg = fregFindFreeReg(RI);
unsigned ppcreg = *I >> 8;
Jit->MOVAPD(reg, M(&PowerPC::ppcState.ps[ppcreg]));
RI.fregs[reg] = I;
break;
}
case StoreFReg: {
unsigned ppcreg = *I >> 16;
Jit->MOVAPD(XMM0, fregLocForInst(RI, getOp1(I)));
Jit->MOVAPD(M(&PowerPC::ppcState.ps[ppcreg]), XMM0);
fregNormalRegClear(RI, I);
break;
}
case DoubleToSingle: {
if (!thisUsed) break;
X64Reg reg = fregFindFreeReg(RI);
Jit->CVTSD2SS(reg, fregLocForInst(RI, getOp1(I)));
RI.fregs[reg] = I;
fregNormalRegClear(RI, I);
break;
}
case FSMul: {
if (!thisUsed) break;
X64Reg reg = fregFindFreeReg(RI);
Jit->MOVAPD(reg, fregLocForInst(RI, getOp1(I)));
Jit->MULSS(reg, fregLocForInst(RI, getOp2(I)));
RI.fregs[reg] = I;
fregNormalRegClear(RI, I);
break;
}
case FSAdd: {
if (!thisUsed) break;
X64Reg reg = fregFindFreeReg(RI);
Jit->MOVAPD(reg, fregLocForInst(RI, getOp1(I)));
Jit->ADDSS(reg, fregLocForInst(RI, getOp2(I)));
RI.fregs[reg] = I;
fregNormalRegClear(RI, I);
break;
}
case CInt32:
@ -1328,6 +1502,15 @@ static void DoWriteCode(IRBuilder* ibuild, Jit64* Jit, bool UseProfile) {
}
case BranchUncond: {
regWriteExit(RI, getOp1(I));
regNormalRegClear(RI, I);
break;
}
case IdleLoop: {
unsigned IdleParam = ibuild->GetImmValue(getOp1(I));
unsigned InstLoc = ibuild->GetImmValue(getOp2(I));
Jit->ABI_CallFunctionC((void *)&PowerPC::OnIdle, IdleParam);
Jit->MOV(32, M(&PowerPC::ppcState.pc), Imm32(InstLoc + 12));
Jit->JMP(asm_routines.testExceptions, true);
break;
}
case SystemCall: {
@ -1378,26 +1561,16 @@ static void DoWriteCode(IRBuilder* ibuild, Jit64* Jit, bool UseProfile) {
PanicAlert("Unknown JIT instruction; aborting!");
exit(1);
}
if (getOpcode(*I) != Tramp &&
getOpcode(*I) != BranchCond &&
getOpcode(*I) != Load8 &&
getOpcode(*I) != Load16 &&
getOpcode(*I) != Load32 &&
getOpcode(*I) != Store8 &&
getOpcode(*I) != Store16 &&
getOpcode(*I) != Store32 &&
1) {
if (RI.IInfo[I - RI.FirstI] & 4)
regClearInst(RI, getOp1(I));
if (RI.IInfo[I - RI.FirstI] & 8)
regClearInst(RI, getOp2(I));
}
}
for (unsigned i = 0; i < 8; i++) {
if (RI.regs[i]) {
PanicAlert("Incomplete cleanup!");
exit(1);
}
if (RI.fregs[i]) {
PanicAlert("Incomplete cleanup!");
exit(1);
}
}
if (UseProfile && RI.numSpills)
@ -1412,8 +1585,8 @@ void Jit64::WriteCode() {
void ProfiledReJit() {
u8* x = (u8*)jit.GetCodePtr();
jit.SetCodePtr(jit.js.normalEntry);
jit.SetCodePtr(jit.js.rewriteStart);
DoWriteCode(&jit.ibuild, &jit, true);
jit.js.curBlock->codeSize = jit.GetCodePtr() - jit.js.normalEntry;
jit.js.curBlock->codeSize = jit.GetCodePtr() - jit.js.rewriteStart;
jit.SetCodePtr(x);
}


@ -80,6 +80,7 @@ namespace IREmitter {
Store16,
Store32,
BranchCond,
#if 0
// Floating-point
// There are three floating-point formats: single, double,
// and packed. For any operation where the format of the
@ -141,8 +142,18 @@ namespace IREmitter {
ForceToSingle,
ForceToDouble,
ForceToMReg,
LoadFPReg,
StoreFPReg,
#endif
LoadSingle,
LoadDouble,
LoadPaired, // This handles quantizers itself
DoubleToSingle,
DupSingleToMReg,
InsertDoubleInMReg,
ExpandPackedToMReg,
LoadFReg,
StoreFReg,
FSMul,
FSAdd,
// "Trinary" operators
// FIXME: Need to change representation!
@ -156,6 +167,7 @@ namespace IREmitter {
SystemCall,
RFIExit,
InterpreterBranch,
IdleLoop,
// "Opcode" representing a register too far away to
// reference directly; this is a size optimization
@ -365,6 +377,42 @@ namespace IREmitter {
InstLoc EmitRFIExit() {
return FoldZeroOp(RFIExit, 0);
}
InstLoc EmitIdleLoop(InstLoc idleParam, InstLoc pc) {
return FoldBiOp(IdleLoop, idleParam, pc);
}
InstLoc EmitLoadSingle(InstLoc addr) {
return FoldUOp(LoadSingle, addr);
}
InstLoc EmitLoadDouble(InstLoc addr) {
return FoldUOp(LoadDouble, addr);
}
InstLoc EmitLoadPaired(InstLoc addr, unsigned quantReg) {
return FoldUOp(LoadPaired, addr, quantReg);
}
InstLoc EmitLoadFReg(unsigned freg) {
return FoldZeroOp(LoadFReg, freg);
}
InstLoc EmitStoreFReg(InstLoc val, unsigned freg) {
return FoldUOp(StoreFReg, val, freg);
}
InstLoc EmitDupSingleToMReg(InstLoc val) {
return FoldUOp(DupSingleToMReg, val);
}
InstLoc EmitInsertDoubleInMReg(InstLoc val, InstLoc reg) {
return FoldBiOp(InsertDoubleInMReg, val, reg);
}
InstLoc EmitExpandPackedToMReg(InstLoc val) {
return FoldUOp(ExpandPackedToMReg, val);
}
InstLoc EmitFSMul(InstLoc op1, InstLoc op2) {
return FoldBiOp(FSMul, op1, op2);
}
InstLoc EmitFSAdd(InstLoc op1, InstLoc op2) {
return FoldBiOp(FSAdd, op1, op2);
}
InstLoc EmitDoubleToSingle(InstLoc op1) {
return FoldUOp(DoubleToSingle, op1);
}
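// Illustrative composition only (it mirrors the Jit64::lfs lowering elsewhere
// in this commit, assuming addr already holds the effective address):
//   InstLoc val = EmitDupSingleToMReg(EmitLoadSingle(addr));
//   EmitStoreFReg(val, inst.RD);
// i.e. load the 32-bit float, widen it to double and duplicate it into both
// halves of the paired-single register, then store it to the guest FPR.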
void StartBackPass() { curReadPtr = &InstList[InstList.size()]; }
void StartForwardPass() { curReadPtr = &InstList[0]; }


@ -420,12 +420,11 @@ namespace CPUCompare
SetJumpTarget(skip);
const u8 *normalEntry = GetCodePtr();
js.normalEntry = (u8*)normalEntry;
if (ImHereDebug)
ABI_CallFunction((void *)&ImHere); //Used to get a trace of the last few blocks before a crash, sometimes VERY useful
if (false && js.fpa.any)
if (js.fpa.any)
{
//This block uses FPU - needs to add FP exception bailout
TEST(32, M(&PowerPC::ppcState.msr), Imm32(1 << 13)); //Test FP enabled bit
@ -445,24 +444,10 @@ namespace CPUCompare
SetJumpTarget(b1);
}
// Conditionally add profiling code.
if (Profiler::g_ProfileBlocks) {
ADD(32, M(&b->runCount), Imm8(1));
#ifdef _WIN32
b->ticCounter.QuadPart = 0;
b->ticStart.QuadPart = 0;
b->ticStop.QuadPart = 0;
#else
//TODO
#endif
// get start tic
PROFILER_QUERY_PERFORMACE_COUNTER(&b->ticStart);
}
js.rewriteStart = (u8*)GetCodePtr();
//Start up the register allocators
//They use the information in gpa/fpa to preload commonly used registers.
//gpr.Start(js.gpa);
//fpr.Start(js.fpa);
// Start up IR builder (structure that collects the
// instruction processed by the JIT routines)
ibuild.Reset();
js.downcountAmount = js.st.numCycles + PatchEngine::GetSpeedhackCycles(em_address);
@ -519,6 +504,7 @@ namespace CPUCompare
break;
}
// Perform actual code generation
WriteCode();
b->flags = js.block_flags;


@ -95,7 +95,7 @@ private:
PPCAnalyst::BlockRegStats gpa;
PPCAnalyst::BlockRegStats fpa;
PPCAnalyst::CodeOp *op;
u8* normalEntry;
u8* rewriteStart;
JitBlock *curBlock;
};


@ -23,6 +23,7 @@
#include "../PowerPC.h"
#include "../../CoreTiming.h"
#include "MemoryUtil.h"
#include "CPUDetect.h"
#include "ABI.h"
#include "Jit.h"
@ -168,6 +169,176 @@ void AsmRoutineManager::Generate()
GenerateCommon();
}
const u8 GC_ALIGNED16(pbswapShuffle2x4[16]) = {3, 2, 1, 0, 7, 6, 5, 4, 8, 9, 10, 11, 12, 13, 14, 15};
const float m_quantizeTableS[] =
{
(1 << 0), (1 << 1), (1 << 2), (1 << 3),
(1 << 4), (1 << 5), (1 << 6), (1 << 7),
(1 << 8), (1 << 9), (1 << 10), (1 << 11),
(1 << 12), (1 << 13), (1 << 14), (1 << 15),
(1 << 16), (1 << 17), (1 << 18), (1 << 19),
(1 << 20), (1 << 21), (1 << 22), (1 << 23),
(1 << 24), (1 << 25), (1 << 26), (1 << 27),
(1 << 28), (1 << 29), (1 << 30), (1 << 31),
1.0 / (1ULL << 32), 1.0 / (1 << 31), 1.0 / (1 << 30), 1.0 / (1 << 29),
1.0 / (1 << 28), 1.0 / (1 << 27), 1.0 / (1 << 26), 1.0 / (1 << 25),
1.0 / (1 << 24), 1.0 / (1 << 23), 1.0 / (1 << 22), 1.0 / (1 << 21),
1.0 / (1 << 20), 1.0 / (1 << 19), 1.0 / (1 << 18), 1.0 / (1 << 17),
1.0 / (1 << 16), 1.0 / (1 << 15), 1.0 / (1 << 14), 1.0 / (1 << 13),
1.0 / (1 << 12), 1.0 / (1 << 11), 1.0 / (1 << 10), 1.0 / (1 << 9),
1.0 / (1 << 8), 1.0 / (1 << 7), 1.0 / (1 << 6), 1.0 / (1 << 5),
1.0 / (1 << 4), 1.0 / (1 << 3), 1.0 / (1 << 2), 1.0 / (1 << 1),
};
const float m_dequantizeTableS[] =
{
1.0 / (1 << 0), 1.0 / (1 << 1), 1.0 / (1 << 2), 1.0 / (1 << 3),
1.0 / (1 << 4), 1.0 / (1 << 5), 1.0 / (1 << 6), 1.0 / (1 << 7),
1.0 / (1 << 8), 1.0 / (1 << 9), 1.0 / (1 << 10), 1.0 / (1 << 11),
1.0 / (1 << 12), 1.0 / (1 << 13), 1.0 / (1 << 14), 1.0 / (1 << 15),
1.0 / (1 << 16), 1.0 / (1 << 17), 1.0 / (1 << 18), 1.0 / (1 << 19),
1.0 / (1 << 20), 1.0 / (1 << 21), 1.0 / (1 << 22), 1.0 / (1 << 23),
1.0 / (1 << 24), 1.0 / (1 << 25), 1.0 / (1 << 26), 1.0 / (1 << 27),
1.0 / (1 << 28), 1.0 / (1 << 29), 1.0 / (1 << 30), 1.0 / (1 << 31),
(1ULL << 32), (1 << 31), (1 << 30), (1 << 29),
(1 << 28), (1 << 27), (1 << 26), (1 << 25),
(1 << 24), (1 << 23), (1 << 22), (1 << 21),
(1 << 20), (1 << 19), (1 << 18), (1 << 17),
(1 << 16), (1 << 15), (1 << 14), (1 << 13),
(1 << 12), (1 << 11), (1 << 10), (1 << 9),
(1 << 8), (1 << 7), (1 << 6), (1 << 5),
(1 << 4), (1 << 3), (1 << 2), (1 << 1),
};
float psTemp[2];
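// Scalar sketch (illustration only, not part of this commit) of what the
// quantized-load stubs below compute per element: the GQR load-scale field
// selects a power-of-two factor from m_dequantizeTableS, and the integer
// read from memory is scaled by it.
static inline float DequantizeOne(int value, unsigned ldScale /* 0..63 */)
{
	return (float)value * m_dequantizeTableS[ldScale];
}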
void AsmRoutineManager::GenQuantizedLoads() {
const u8* loadPairedIllegal = AlignCode4();
UD2();
const u8* loadPairedFloat = AlignCode4();
if (cpu_info.bSSSE3) {
#ifdef _M_X64
MOVQ_xmm(XMM0, MComplex(RBX, RCX, 1, 0));
#else
AND(32, R(ECX), Imm32(Memory::MEMVIEW32_MASK));
MOVQ_xmm(XMM0, MDisp(ECX, (u32)Memory::base));
#endif
PSHUFB(XMM0, M((void *)pbswapShuffle2x4));
} else {
#ifdef _M_X64
MOV(64, R(RCX), MComplex(RBX, RCX, 1, 0));
BSWAP(64, RCX);
ROL(64, RCX, Imm8(32));
MOVQ_xmm(XMM0, R(RCX));
#else
#if 0
AND(32, R(ECX), Imm32(Memory::MEMVIEW32_MASK));
MOVQ_xmm(XMM0, MDisp(ECX, (u32)Memory::base));
PXOR(XMM1, R(XMM1));
PSHUFLW(XMM0, R(XMM0), 0xB1);
MOVAPD(XMM1, R(XMM0));
PSRLW(XMM0, 8);
PSLLW(XMM1, 8);
POR(XMM0, R(XMM1));
#else
AND(32, R(ECX), Imm32(Memory::MEMVIEW32_MASK));
MOV(32, R(EAX), MDisp(ECX, (u32)Memory::base));
BSWAP(32, EAX);
MOV(32, M(&psTemp[0]), R(RAX));
MOV(32, R(EAX), MDisp(ECX, (u32)Memory::base + 4));
BSWAP(32, EAX);
MOV(32, M(((float *)&psTemp[0]) + 1), R(RAX));
MOVQ_xmm(XMM0, M(&psTemp[0]));
#endif
#endif
}
RET();
const u8* loadPairedU8 = AlignCode4();
#ifdef _M_X64
MOVZX(32, 16, ECX, MComplex(RBX, RCX, 1, 0));
#else
AND(32, R(ECX), Imm32(Memory::MEMVIEW32_MASK));
MOVZX(32, 16, ECX, MDisp(ECX, (u32)Memory::base));
#endif
MOVD_xmm(XMM0, R(ECX));
PXOR(XMM1, R(XMM1));
PUNPCKLBW(XMM0, R(XMM1));
PUNPCKLWD(XMM0, R(XMM1));
CVTDQ2PS(XMM0, R(XMM0));
SHR(32, R(EAX), Imm8(6));
MOVSS(XMM1, MDisp(EAX, (u32)m_dequantizeTableS));
PUNPCKLDQ(XMM1, R(XMM1));
MULPS(XMM0, R(XMM1));
RET();
const u8* loadPairedS8 = AlignCode4();
#ifdef _M_X64
MOVZX(32, 16, ECX, MComplex(RBX, RCX, 1, 0));
#else
AND(32, R(ECX), Imm32(Memory::MEMVIEW32_MASK));
MOVZX(32, 16, ECX, MDisp(ECX, (u32)Memory::base));
#endif
MOVD_xmm(XMM0, R(ECX));
PUNPCKLBW(XMM0, R(XMM0));
PUNPCKLWD(XMM0, R(XMM0));
PSRAD(XMM0, 24);
CVTDQ2PS(XMM0, R(XMM0));
SHR(32, R(EAX), Imm8(6));
MOVSS(XMM1, MDisp(EAX, (u32)m_dequantizeTableS));
PUNPCKLDQ(XMM1, R(XMM1));
MULPS(XMM0, R(XMM1));
RET();
const u8* loadPairedU16 = AlignCode4();
#ifdef _M_X64
MOV(32, R(ECX), MComplex(RBX, RCX, 1, 0));
#else
AND(32, R(ECX), Imm32(Memory::MEMVIEW32_MASK));
MOV(32, R(ECX), MDisp(ECX, (u32)Memory::base));
#endif
BSWAP(32, ECX);
ROL(32, R(ECX), Imm8(16));
MOVD_xmm(XMM0, R(ECX));
PXOR(XMM1, R(XMM1));
PUNPCKLWD(XMM0, R(XMM1));
CVTDQ2PS(XMM0, R(XMM0));
SHR(32, R(EAX), Imm8(6));
MOVSS(XMM1, MDisp(EAX, (u32)m_dequantizeTableS));
PUNPCKLDQ(XMM1, R(XMM1));
MULPS(XMM0, R(XMM1));
RET();
const u8* loadPairedS16 = AlignCode4();
#ifdef _M_X64
MOV(32, R(ECX), MComplex(RBX, RCX, 1, 0));
#else
AND(32, R(ECX), Imm32(Memory::MEMVIEW32_MASK));
MOV(32, R(ECX), MDisp(ECX, (u32)Memory::base));
#endif
BSWAP(32, ECX);
ROL(32, R(ECX), Imm8(16));
MOVD_xmm(XMM0, R(ECX));
PUNPCKLWD(XMM0, R(XMM0));
PSRAD(XMM0, 16);
CVTDQ2PS(XMM0, R(XMM0));
SHR(32, R(EAX), Imm8(6));
AND(32, R(EAX), Imm32(0xFC));
MOVSS(XMM1, MDisp(EAX, (u32)m_dequantizeTableS));
PUNPCKLDQ(XMM1, R(XMM1));
MULPS(XMM0, R(XMM1));
RET();
pairedLoadQuantized[0] = loadPairedFloat;
pairedLoadQuantized[1] = loadPairedIllegal;
pairedLoadQuantized[2] = loadPairedIllegal;
pairedLoadQuantized[3] = loadPairedIllegal;
pairedLoadQuantized[4] = loadPairedU8;
pairedLoadQuantized[5] = loadPairedU16;
pairedLoadQuantized[6] = loadPairedS8;
pairedLoadQuantized[7] = loadPairedS16;
}
void AsmRoutineManager::GenFifoWrite(int size)
{
@ -257,6 +428,8 @@ void AsmRoutineManager::GenerateCommon()
SUB(32, M(&CoreTiming::downcount), Imm8(0));
JMP(dispatcher, true);
GenQuantizedLoads();
computeRcFp = AlignCode16();
//CMPSD(R(XMM0), M(&zero),
// TODO


@ -42,6 +42,7 @@ private:
void GenFifoWrite(int size);
void GenFifoFloatWrite();
void GenFifoXmm64Write();
void GenQuantizedLoads();
public:
void Init() {
@ -80,6 +81,8 @@ public:
const u8 *doReJit;
const u8 *pairedLoadQuantized[8];
bool compareEnabled;
};


@ -39,6 +39,9 @@
// Zelda and many more games seem to pass the Acid Test.
//#define NORMALBRANCH_START Default(inst); ibuild.EmitInterpreterBranch(); return;
#define NORMALBRANCH_START
using namespace Gen;
void Jit64::sc(UGeckoInstruction inst)
@ -53,6 +56,7 @@ using namespace Gen;
void Jit64::bx(UGeckoInstruction inst)
{
NORMALBRANCH_START
if (inst.LK)
ibuild.EmitStoreLink(ibuild.EmitIntConst(js.compilerPC + 4));
@ -67,6 +71,7 @@ using namespace Gen;
void Jit64::bcx(UGeckoInstruction inst)
{
NORMALBRANCH_START
if (inst.LK)
ibuild.EmitStoreLink(
ibuild.EmitIntConst(js.compilerPC + 4));
@ -117,6 +122,7 @@ using namespace Gen;
void Jit64::bcctrx(UGeckoInstruction inst)
{
NORMALBRANCH_START
Default(inst);
ibuild.EmitInterpreterBranch();
return;
@ -124,6 +130,7 @@ using namespace Gen;
void Jit64::bclrx(UGeckoInstruction inst)
{
NORMALBRANCH_START
if (inst.hex == 0x4e800020) {
ibuild.EmitBranchUncond(ibuild.EmitLoadLink());
return;


@ -29,141 +29,54 @@
#define INSTRUCTION_START
// #define INSTRUCTION_START Default(inst); return;
const u64 GC_ALIGNED16(psSignBits2[2]) = {0x8000000000000000ULL, 0x8000000000000000ULL};
const u64 GC_ALIGNED16(psAbsMask2[2]) = {0x7FFFFFFFFFFFFFFFULL, 0x7FFFFFFFFFFFFFFFULL};
const double GC_ALIGNED16(psOneOne2[2]) = {1.0, 1.0};
void Jit64::fp_tri_op(int d, int a, int b, bool reversible, bool dupe, void (XEmitter::*op)(Gen::X64Reg, Gen::OpArg))
{
fpr.Lock(d, a, b);
if (d == a)
{
fpr.LoadToX64(d, true);
(this->*op)(fpr.RX(d), fpr.R(b));
}
else if (d == b && reversible)
{
fpr.LoadToX64(d, true);
(this->*op)(fpr.RX(d), fpr.R(a));
}
else if (a != d && b != d)
{
// Sources different from d, can use rather quick solution
fpr.LoadToX64(d, !dupe);
MOVSD(fpr.RX(d), fpr.R(a));
(this->*op)(fpr.RX(d), fpr.R(b));
}
else if (b != d)
{
fpr.LoadToX64(d, !dupe);
MOVSD(XMM0, fpr.R(b));
MOVSD(fpr.RX(d), fpr.R(a));
(this->*op)(fpr.RX(d), Gen::R(XMM0));
}
else // Other combo, must use two temps :(
{
MOVSD(XMM0, fpr.R(a));
MOVSD(XMM1, fpr.R(b));
fpr.LoadToX64(d, !dupe);
(this->*op)(XMM0, Gen::R(XMM1));
MOVSD(fpr.RX(d), Gen::R(XMM0));
}
if (dupe) {
ForceSinglePrecisionS(fpr.RX(d));
MOVDDUP(fpr.RX(d), fpr.R(d));
}
fpr.UnlockAll();
}
void Jit64::fp_arith_s(UGeckoInstruction inst)
{
if(Core::g_CoreStartupParameter.bJITOff || Core::g_CoreStartupParameter.bJITFloatingPointOff)
{Default(inst); return;} // turn off from debugger
INSTRUCTION_START;
if (inst.Rc) {
if (inst.Rc || inst.OPCD != 59 || inst.SUBOP5 != 25) {
Default(inst); return;
}
IREmitter::InstLoc val = ibuild.EmitLoadFReg(inst.FA);
val = ibuild.EmitDoubleToSingle(val);
bool dupe = inst.OPCD == 59;
switch (inst.SUBOP5)
{
case 18: fp_tri_op(inst.FD, inst.FA, inst.FB, false, dupe, &XEmitter::DIVSD); break; //div
case 20: fp_tri_op(inst.FD, inst.FA, inst.FB, false, dupe, &XEmitter::SUBSD); break; //sub
case 21: fp_tri_op(inst.FD, inst.FA, inst.FB, true, dupe, &XEmitter::ADDSD); break; //add
case 25: //mul
val = ibuild.EmitFSMul(val, ibuild.EmitDoubleToSingle(ibuild.EmitLoadFReg(inst.FC)));
case 18: //div
case 20: //sub
case 21: //add
case 23: //sel
Default(inst);
break;
case 24: //res
Default(inst);
break;
case 25: fp_tri_op(inst.FD, inst.FA, inst.FC, true, dupe, &XEmitter::MULSD); break; //mul
default:
_assert_msg_(DYNA_REC, 0, "fp_arith_s WTF!!!");
}
val = ibuild.EmitDupSingleToMReg(val);
ibuild.EmitStoreFReg(val, inst.FD);
}
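// Reference sketch (illustration only, not Dolphin code) of the value the IR
// sequence above computes for fmuls: both inputs are rounded to single
// precision, multiplied, and the single result is written to both halves of
// the destination paired-single register.
static void fmuls_reference(double fd[2], const double fa[2], const double fc[2])
{
	float prod = (float)fa[0] * (float)fc[0]; // DoubleToSingle + FSMul
	fd[0] = prod;                             // DupSingleToMReg duplicates the
	fd[1] = prod;                             // single into ps0 and ps1
}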
void Jit64::fmaddXX(UGeckoInstruction inst)
{
if(Core::g_CoreStartupParameter.bJITOff || Core::g_CoreStartupParameter.bJITFloatingPointOff)
{Default(inst); return;} // turn off from debugger
INSTRUCTION_START;
if (inst.Rc) {
if (inst.Rc || inst.OPCD != 59 || inst.SUBOP5 != 29) {
Default(inst); return;
}
bool single_precision = inst.OPCD == 59;
int a = inst.FA;
int b = inst.FB;
int c = inst.FC;
int d = inst.FD;
fpr.Lock(a, b, c, d);
MOVSD(XMM0, fpr.R(a));
switch (inst.SUBOP5)
{
case 28: //msub
MULSD(XMM0, fpr.R(c));
SUBSD(XMM0, fpr.R(b));
break;
case 29: //madd
MULSD(XMM0, fpr.R(c));
ADDSD(XMM0, fpr.R(b));
break;
case 30: //nmsub
MULSD(XMM0, fpr.R(c));
SUBSD(XMM0, fpr.R(b));
XORPD(XMM0, M((void*)&psSignBits2));
break;
case 31: //nmadd
MULSD(XMM0, fpr.R(c));
ADDSD(XMM0, fpr.R(b));
XORPD(XMM0, M((void*)&psSignBits2));
break;
}
fpr.LoadToX64(d, false);
//YES it is necessary to dupe the result :(
//TODO : analysis - does the top reg get used? If so, dupe, if not, don't.
if (single_precision) {
ForceSinglePrecisionS(XMM0);
MOVDDUP(fpr.RX(d), R(XMM0));
} else {
MOVSD(fpr.RX(d), R(XMM0));
}
fpr.UnlockAll();
IREmitter::InstLoc val = ibuild.EmitLoadFReg(inst.FA);
val = ibuild.EmitDoubleToSingle(val);
val = ibuild.EmitFSMul(val, ibuild.EmitDoubleToSingle(ibuild.EmitLoadFReg(inst.FC)));
val = ibuild.EmitFSAdd(val, ibuild.EmitDoubleToSingle(ibuild.EmitLoadFReg(inst.FB)));
val = ibuild.EmitDupSingleToMReg(val);
ibuild.EmitStoreFReg(val, inst.FD);
}
void Jit64::fmrx(UGeckoInstruction inst)
{
if(Core::g_CoreStartupParameter.bJITOff || Core::g_CoreStartupParameter.bJITFloatingPointOff)
{Default(inst); return;} // turn off from debugger
INSTRUCTION_START;
if (inst.Rc) {
Default(inst); return;
}
int d = inst.FD;
int b = inst.FB;
fpr.LoadToX64(d, true); // we don't want to destroy the high bit
MOVSD(fpr.RX(d), fpr.R(b));
IREmitter::InstLoc val = ibuild.EmitLoadFReg(inst.FB);
val = ibuild.EmitInsertDoubleInMReg(val, ibuild.EmitLoadFReg(inst.FD));
ibuild.EmitStoreFReg(val, inst.FD);
}
void Jit64::fcmpx(UGeckoInstruction inst)


@ -71,6 +71,20 @@ void Jit64::lhax(UGeckoInstruction inst)
void Jit64::lXz(UGeckoInstruction inst)
{
INSTRUCTION_START
if (Core::GetStartupParameter().bSkipIdle &&
inst.OPCD == 32 &&
(inst.hex & 0xFFFF0000) == 0x800D0000 &&
(Memory::ReadUnchecked_U32(js.compilerPC + 4) == 0x28000000 ||
(Core::GetStartupParameter().bWii && Memory::ReadUnchecked_U32(js.compilerPC + 4) == 0x2C000000)) &&
Memory::ReadUnchecked_U32(js.compilerPC + 8) == 0x4182fff8)
{
ibuild.EmitIdleLoop(ibuild.EmitIntConst(PowerPC::ppcState.gpr[inst.RA] + (s32)(s16)inst.SIMM_16),
ibuild.EmitIntConst(js.compilerPC));
js.compilerPC += 8;
return;
}
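// For reference, an illustrative restatement of the pattern matched above
// (the helper name is hypothetical, not a Dolphin API): the three guest words
// form the classic idle loop "lwz r0, simm(r13); cmplwi r0, 0; beq -8".
static bool LooksLikeIdleLoop(u32 load, u32 compare, u32 branch, bool wii)
{
	return (load & 0xFFFF0000) == 0x800D0000 &&   // lwz r0, simm(r13)
	       (compare == 0x28000000 ||               // cmplwi r0, 0
	        (wii && compare == 0x2C000000)) &&     // cmpwi r0, 0 (Wii only)
	       branch == 0x4182fff8;                   // beq -8, back to the lwz
}
// When the pattern matches, the block emits IdleLoop and advances compilerPC
// past the compare and branch instead of compiling the spin loop.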
IREmitter::InstLoc addr = ibuild.EmitIntConst(inst.SIMM_16);
if (inst.RA)
addr = ibuild.EmitAdd(addr, ibuild.EmitLoadGReg(inst.RA));

View File

@ -57,38 +57,12 @@ u32 GC_ALIGNED16(temp32);
void Jit64::lfs(UGeckoInstruction inst)
{
if (Core::g_CoreStartupParameter.bJITOff || Core::g_CoreStartupParameter.bJITLoadStoreFloatingOff)
{Default(inst); return;} // turn off from debugger
INSTRUCTION_START;
int d = inst.RD;
int a = inst.RA;
if (!a)
{
Default(inst);
return;
}
s32 offset = (s32)(s16)inst.SIMM_16;
gpr.FlushLockX(ABI_PARAM1);
gpr.Lock(a);
MOV(32, R(ABI_PARAM1), gpr.R(a));
if (jo.assumeFPLoadFromMem)
{
UnsafeLoadRegToReg(ABI_PARAM1, EAX, 32, offset, false);
}
else
{
SafeLoadRegToEAX(ABI_PARAM1, 32, offset);
}
MOV(32, M(&temp32), R(EAX));
fpr.Lock(d);
fpr.LoadToX64(d, false);
CVTSS2SD(fpr.RX(d), M(&temp32));
MOVDDUP(fpr.RX(d), fpr.R(d));
gpr.UnlockAll();
gpr.UnlockAllX();
fpr.UnlockAll();
IREmitter::InstLoc addr = ibuild.EmitIntConst(inst.SIMM_16), val;
if (inst.RA)
addr = ibuild.EmitAdd(addr, ibuild.EmitLoadGReg(inst.RA));
val = ibuild.EmitDupSingleToMReg(ibuild.EmitLoadSingle(addr));
ibuild.EmitStoreFReg(val, inst.RD);
return;
}
@ -291,32 +265,10 @@ void Jit64::stfsx(UGeckoInstruction inst)
void Jit64::lfsx(UGeckoInstruction inst)
{
if (Core::g_CoreStartupParameter.bJITOff || Core::g_CoreStartupParameter.bJITLoadStoreFloatingOff)
{Default(inst); return;} // turn off from debugger
INSTRUCTION_START;
fpr.Lock(inst.RS);
fpr.LoadToX64(inst.RS, false, true);
MOV(32, R(EAX), gpr.R(inst.RB));
IREmitter::InstLoc addr = ibuild.EmitLoadGReg(inst.RB), val;
if (inst.RA)
ADD(32, R(EAX), gpr.R(inst.RA));
if (cpu_info.bSSSE3) {
X64Reg r = fpr.R(inst.RS).GetSimpleReg();
#ifdef _M_IX86
AND(32, R(EAX), Imm32(Memory::MEMVIEW32_MASK));
MOVD_xmm(r, MDisp(EAX, (u32)Memory::base));
#else
MOVD_xmm(r, MComplex(RBX, EAX, SCALE_1, 0));
#endif
PSHUFB(r, M((void *)bswapShuffle1x4));
CVTSS2SD(r, R(r));
MOVDDUP(r, R(r));
} else {
UnsafeLoadRegToReg(EAX, EAX, 32, false);
MOV(32, M(&temp32), R(EAX));
CVTSS2SD(XMM0, M(&temp32));
MOVDDUP(fpr.R(inst.RS).GetSimpleReg(), R(XMM0));
}
fpr.UnlockAll();
addr = ibuild.EmitAdd(addr, ibuild.EmitLoadGReg(inst.RA));
val = ibuild.EmitDupSingleToMReg(ibuild.EmitLoadSingle(addr));
ibuild.EmitStoreFReg(val, inst.RD);
}


@ -40,419 +40,20 @@
#define INSTRUCTION_START
// #define INSTRUCTION_START Default(inst); return;
const u8 GC_ALIGNED16(pbswapShuffle2x4[16]) = {3, 2, 1, 0, 7, 6, 5, 4, 8, 9, 10, 11, 12, 13, 14, 15};
const u8 GC_ALIGNED16(pbswapShuffleNoop[16]) = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
static double GC_ALIGNED16(psTemp[2]) = {1.0, 1.0};
static u64 GC_ALIGNED16(temp64);
// TODO(ector): Improve 64-bit version
static void WriteDual32(u64 value, u32 address)
{
Memory::Write_U32((u32)(value >> 32), address);
Memory::Write_U32((u32)value, address + 4);
}
const double GC_ALIGNED16(m_quantizeTableD[]) =
{
(1 << 0), (1 << 1), (1 << 2), (1 << 3),
(1 << 4), (1 << 5), (1 << 6), (1 << 7),
(1 << 8), (1 << 9), (1 << 10), (1 << 11),
(1 << 12), (1 << 13), (1 << 14), (1 << 15),
(1 << 16), (1 << 17), (1 << 18), (1 << 19),
(1 << 20), (1 << 21), (1 << 22), (1 << 23),
(1 << 24), (1 << 25), (1 << 26), (1 << 27),
(1 << 28), (1 << 29), (1 << 30), (1 << 31),
1.0 / (1ULL << 32), 1.0 / (1 << 31), 1.0 / (1 << 30), 1.0 / (1 << 29),
1.0 / (1 << 28), 1.0 / (1 << 27), 1.0 / (1 << 26), 1.0 / (1 << 25),
1.0 / (1 << 24), 1.0 / (1 << 23), 1.0 / (1 << 22), 1.0 / (1 << 21),
1.0 / (1 << 20), 1.0 / (1 << 19), 1.0 / (1 << 18), 1.0 / (1 << 17),
1.0 / (1 << 16), 1.0 / (1 << 15), 1.0 / (1 << 14), 1.0 / (1 << 13),
1.0 / (1 << 12), 1.0 / (1 << 11), 1.0 / (1 << 10), 1.0 / (1 << 9),
1.0 / (1 << 8), 1.0 / (1 << 7), 1.0 / (1 << 6), 1.0 / (1 << 5),
1.0 / (1 << 4), 1.0 / (1 << 3), 1.0 / (1 << 2), 1.0 / (1 << 1),
};
const double GC_ALIGNED16(m_dequantizeTableD[]) =
{
1.0 / (1 << 0), 1.0 / (1 << 1), 1.0 / (1 << 2), 1.0 / (1 << 3),
1.0 / (1 << 4), 1.0 / (1 << 5), 1.0 / (1 << 6), 1.0 / (1 << 7),
1.0 / (1 << 8), 1.0 / (1 << 9), 1.0 / (1 << 10), 1.0 / (1 << 11),
1.0 / (1 << 12), 1.0 / (1 << 13), 1.0 / (1 << 14), 1.0 / (1 << 15),
1.0 / (1 << 16), 1.0 / (1 << 17), 1.0 / (1 << 18), 1.0 / (1 << 19),
1.0 / (1 << 20), 1.0 / (1 << 21), 1.0 / (1 << 22), 1.0 / (1 << 23),
1.0 / (1 << 24), 1.0 / (1 << 25), 1.0 / (1 << 26), 1.0 / (1 << 27),
1.0 / (1 << 28), 1.0 / (1 << 29), 1.0 / (1 << 30), 1.0 / (1 << 31),
(1ULL << 32), (1 << 31), (1 << 30), (1 << 29),
(1 << 28), (1 << 27), (1 << 26), (1 << 25),
(1 << 24), (1 << 23), (1 << 22), (1 << 21),
(1 << 20), (1 << 19), (1 << 18), (1 << 17),
(1 << 16), (1 << 15), (1 << 14), (1 << 13),
(1 << 12), (1 << 11), (1 << 10), (1 << 9),
(1 << 8), (1 << 7), (1 << 6), (1 << 5),
(1 << 4), (1 << 3), (1 << 2), (1 << 1),
};
// The big problem is likely instructions that set the quantizers in the same block.
// We will have to break block after quantizers are written to.
void Jit64::psq_st(UGeckoInstruction inst)
{
if(Core::g_CoreStartupParameter.bJITOff || Core::g_CoreStartupParameter.bJITLoadStorePairedOff)
{Default(inst); return;} // turn off from debugger
INSTRUCTION_START;
js.block_flags |= BLOCK_USE_GQR0 << inst.I;
if (js.blockSetsQuantizers || !Core::GetStartupParameter().bOptimizeQuantizers)
{
Default(inst);
return;
}
if (!inst.RA)
{
// This really should never happen. Unless we change this to also support stwux
Default(inst);
return;
}
const UGQR gqr(rSPR(SPR_GQR0 + inst.I));
const EQuantizeType stType = static_cast<EQuantizeType>(gqr.ST_TYPE);
int stScale = gqr.ST_SCALE;
bool update = inst.OPCD == 61;
int offset = inst.SIMM_12;
int a = inst.RA;
int s = inst.RS; // Fp numbers
if (inst.W) {
// PanicAlert("W=1: stType %i stScale %i update %i", (int)stType, (int)stScale, (int)update);
// It's fairly common that games write stuff to the pipe using this. Then, it's pretty much only
// floats so that's what we'll work on.
switch (stType)
{
case QUANTIZE_FLOAT:
{
// This one has quite a bit of optimization potential.
if (gpr.R(a).IsImm())
{
PanicAlert("Imm: %08x", gpr.R(a).offset);
}
gpr.FlushLockX(ABI_PARAM1, ABI_PARAM2);
gpr.Lock(a);
fpr.Lock(s);
if (update)
gpr.LoadToX64(a, true, true);
MOV(32, R(ABI_PARAM2), gpr.R(a));
if (offset)
ADD(32, R(ABI_PARAM2), Imm32((u32)offset));
TEST(32, R(ABI_PARAM2), Imm32(0x0C000000));
if (update && offset)
MOV(32, gpr.R(a), R(ABI_PARAM2));
CVTSD2SS(XMM0, fpr.R(s));
MOVD_xmm(M(&temp64), XMM0);
MOV(32, R(ABI_PARAM1), M(&temp64));
FixupBranch argh = J_CC(CC_NZ);
BSWAP(32, ABI_PARAM1);
#ifdef _M_X64
MOV(32, MComplex(RBX, ABI_PARAM2, SCALE_1, 0), R(ABI_PARAM1));
#else
MOV(32, R(EAX), R(ABI_PARAM2));
AND(32, R(EAX), Imm32(Memory::MEMVIEW32_MASK));
MOV(32, MDisp(EAX, (u32)Memory::base), R(ABI_PARAM1));
#endif
FixupBranch skip_call = J();
SetJumpTarget(argh);
ABI_CallFunctionRR(thunks.ProtectFunction((void *)&Memory::Write_U32, 2), ABI_PARAM1, ABI_PARAM2);
SetJumpTarget(skip_call);
gpr.UnlockAll();
gpr.UnlockAllX();
fpr.UnlockAll();
return;
}
default:
Default(inst);
return;
}
return;
}
if (stType == QUANTIZE_FLOAT)
{
if (gpr.R(a).IsImm() && !update && cpu_info.bSSSE3)
{
u32 addr = (u32)(gpr.R(a).offset + offset);
if (addr == 0xCC008000) {
// Writing to FIFO. Let's do fast method.
CVTPD2PS(XMM0, fpr.R(s));
PSHUFB(XMM0, M((void*)&pbswapShuffle2x4));
CALL((void*)asm_routines.fifoDirectWriteXmm64);
js.fifoBytesThisBlock += 8;
return;
}
}
gpr.FlushLockX(ABI_PARAM1, ABI_PARAM2);
gpr.Lock(a);
fpr.Lock(s);
if (update)
gpr.LoadToX64(a, true, true);
MOV(32, R(ABI_PARAM2), gpr.R(a));
if (offset)
ADD(32, R(ABI_PARAM2), Imm32((u32)offset));
TEST(32, R(ABI_PARAM2), Imm32(0x0C000000));
if (update && offset)
MOV(32, gpr.R(a), R(ABI_PARAM2));
CVTPD2PS(XMM0, fpr.R(s));
SHUFPS(XMM0, R(XMM0), 1);
MOVQ_xmm(M(&temp64), XMM0);
#ifdef _M_X64
MOV(64, R(ABI_PARAM1), M(&temp64));
FixupBranch argh = J_CC(CC_NZ);
BSWAP(64, ABI_PARAM1);
MOV(64, MComplex(RBX, ABI_PARAM2, SCALE_1, 0), R(ABI_PARAM1));
FixupBranch arg2 = J();
SetJumpTarget(argh);
CALL(thunks.ProtectFunction((void *)&WriteDual32, 0));
#else
FixupBranch argh = J_CC(CC_NZ);
MOV(32, R(ABI_PARAM1), M(((char*)&temp64) + 4));
BSWAP(32, ABI_PARAM1);
AND(32, R(ABI_PARAM2), Imm32(Memory::MEMVIEW32_MASK));
MOV(32, MDisp(ABI_PARAM2, (u32)Memory::base), R(ABI_PARAM1));
MOV(32, R(ABI_PARAM1), M(&temp64));
BSWAP(32, ABI_PARAM1);
MOV(32, MDisp(ABI_PARAM2, 4+(u32)Memory::base), R(ABI_PARAM1));
FixupBranch arg2 = J();
SetJumpTarget(argh);
MOV(32, R(ABI_PARAM1), M(((char*)&temp64) + 4));
ABI_CallFunctionRR(thunks.ProtectFunction((void *)&Memory::Write_U32, 2), ABI_PARAM1, ABI_PARAM2);
MOV(32, R(ABI_PARAM1), M(((char*)&temp64)));
ADD(32, R(ABI_PARAM2), Imm32(4));
ABI_CallFunctionRR(thunks.ProtectFunction((void *)&Memory::Write_U32, 2), ABI_PARAM1, ABI_PARAM2);
#endif
SetJumpTarget(arg2);
gpr.UnlockAll();
gpr.UnlockAllX();
fpr.UnlockAll();
}
else if (stType == QUANTIZE_U8)
{
gpr.FlushLockX(ABI_PARAM1, ABI_PARAM2);
gpr.Lock(a);
fpr.Lock(s);
if (update)
gpr.LoadToX64(a, true, update);
MOV(32, R(ABI_PARAM2), gpr.R(a));
if (offset)
ADD(32, R(ABI_PARAM2), Imm32((u32)offset));
if (update && offset)
MOV(32, gpr.R(a), R(ABI_PARAM2));
MOVAPD(XMM0, fpr.R(s));
MOVDDUP(XMM1, M((void*)&m_quantizeTableD[stScale]));
MULPD(XMM0, R(XMM1));
CVTPD2DQ(XMM0, R(XMM0));
PACKSSDW(XMM0, R(XMM0));
PACKUSWB(XMM0, R(XMM0));
MOVD_xmm(M(&temp64), XMM0);
MOV(16, R(ABI_PARAM1), M(&temp64));
#ifdef _M_X64
MOV(16, MComplex(RBX, ABI_PARAM2, SCALE_1, 0), R(ABI_PARAM1));
#else
MOV(32, R(EAX), R(ABI_PARAM2));
AND(32, R(EAX), Imm32(Memory::MEMVIEW32_MASK));
MOV(16, MDisp(EAX, (u32)Memory::base), R(ABI_PARAM1));
#endif
if (update)
MOV(32, gpr.R(a), R(ABI_PARAM2));
gpr.UnlockAll();
gpr.UnlockAllX();
fpr.UnlockAll();
}
else if (stType == QUANTIZE_S16)
{
gpr.FlushLockX(ABI_PARAM1, ABI_PARAM2);
gpr.Lock(a);
fpr.Lock(s);
if (update)
gpr.LoadToX64(a, true, update);
MOV(32, R(ABI_PARAM2), gpr.R(a));
if (offset)
ADD(32, R(ABI_PARAM2), Imm32((u32)offset));
if (update)
MOV(32, gpr.R(a), R(ABI_PARAM2));
MOVAPD(XMM0, fpr.R(s));
MOVDDUP(XMM1, M((void*)&m_quantizeTableD[stScale]));
MULPD(XMM0, R(XMM1));
SHUFPD(XMM0, R(XMM0), 1);
CVTPD2DQ(XMM0, R(XMM0));
PACKSSDW(XMM0, R(XMM0));
MOVD_xmm(M(&temp64), XMM0);
MOV(32, R(ABI_PARAM1), M(&temp64));
BSWAP(32, ABI_PARAM1);
#ifdef _M_X64
MOV(32, MComplex(RBX, ABI_PARAM2, SCALE_1, 0), R(ABI_PARAM1));
#else
MOV(32, R(EAX), R(ABI_PARAM2));
AND(32, R(EAX), Imm32(Memory::MEMVIEW32_MASK));
MOV(32, MDisp(EAX, (u32)Memory::base), R(ABI_PARAM1));
#endif
gpr.UnlockAll();
gpr.UnlockAllX();
fpr.UnlockAll();
}
else {
// Dodger uses this.
// mario tennis
//PanicAlert("st %i:%i", stType, inst.W);
Default(inst);
}
Default(inst); return;
}
void Jit64::psq_l(UGeckoInstruction inst)
{
if(Core::g_CoreStartupParameter.bJITOff || Core::g_CoreStartupParameter.bJITLoadStorePairedOff)
{Default(inst); return;} // turn off from debugger
INSTRUCTION_START;
js.block_flags |= BLOCK_USE_GQR0 << inst.I;
if (js.blockSetsQuantizers || !Core::GetStartupParameter().bOptimizeQuantizers)
{
Default(inst);
return;
}
const UGQR gqr(rSPR(SPR_GQR0 + inst.I));
const EQuantizeType ldType = static_cast<EQuantizeType>(gqr.LD_TYPE);
int ldScale = gqr.LD_SCALE;
bool update = inst.OPCD == 57;
if (!inst.RA || inst.W)
{
// 0 1 during load
//PanicAlert("ld:%i %i", ldType, (int)inst.W);
Default(inst);
return;
}
int offset = inst.SIMM_12;
switch (ldType) {
case QUANTIZE_FLOAT: // We know this is from RAM, so we don't need to check the address.
{
#ifdef _M_X64
gpr.LoadToX64(inst.RA, true, update);
fpr.LoadToX64(inst.RS, false);
if (cpu_info.bSSSE3) {
X64Reg xd = fpr.R(inst.RS).GetSimpleReg();
MOVQ_xmm(xd, MComplex(RBX, gpr.R(inst.RA).GetSimpleReg(), 1, offset));
PSHUFB(xd, M((void *)pbswapShuffle2x4));
CVTPS2PD(xd, R(xd));
} else {
MOV(64, R(RAX), MComplex(RBX, gpr.R(inst.RA).GetSimpleReg(), 1, offset));
BSWAP(64, RAX);
MOV(64, M(&psTemp[0]), R(RAX));
X64Reg r = fpr.R(inst.RS).GetSimpleReg();
CVTPS2PD(r, M(&psTemp[0]));
SHUFPD(r, R(r), 1);
}
if (update && offset != 0)
ADD(32, gpr.R(inst.RA), Imm32(offset));
break;
#else
if (cpu_info.bSSSE3) {
gpr.LoadToX64(inst.RA, true, update);
fpr.LoadToX64(inst.RS, false);
X64Reg xd = fpr.R(inst.RS).GetSimpleReg();
MOV(32, R(EAX), gpr.R(inst.RA));
AND(32, R(EAX), Imm32(Memory::MEMVIEW32_MASK));
MOVQ_xmm(xd, MDisp(EAX, (u32)Memory::base + offset));
PSHUFB(xd, M((void *)pbswapShuffle2x4));
CVTPS2PD(xd, R(xd));
} else {
gpr.FlushLockX(ECX);
gpr.LoadToX64(inst.RA, true, update);
// This can probably be optimized somewhat.
LEA(32, ECX, MDisp(gpr.R(inst.RA).GetSimpleReg(), offset));
AND(32, R(ECX), Imm32(Memory::MEMVIEW32_MASK));
MOV(32, R(EAX), MDisp(ECX, (u32)Memory::base));
BSWAP(32, RAX);
MOV(32, M(&psTemp[0]), R(RAX));
MOV(32, R(EAX), MDisp(ECX, (u32)Memory::base + 4));
BSWAP(32, RAX);
MOV(32, M(((float *)&psTemp[0]) + 1), R(RAX));
fpr.LoadToX64(inst.RS, false, true);
X64Reg r = fpr.R(inst.RS).GetSimpleReg();
CVTPS2PD(r, M(&psTemp[0]));
gpr.UnlockAllX();
}
if (update && offset != 0)
ADD(32, gpr.R(inst.RA), Imm32(offset));
break;
#endif
}
case QUANTIZE_U8:
{
gpr.LoadToX64(inst.RA, true, update);
#ifdef _M_X64
MOVZX(32, 16, EAX, MComplex(RBX, gpr.R(inst.RA).GetSimpleReg(), 1, offset));
#else
LEA(32, EAX, MDisp(gpr.R(inst.RA).GetSimpleReg(), offset));
AND(32, R(EAX), Imm32(Memory::MEMVIEW32_MASK));
MOVZX(32, 16, EAX, MDisp(EAX, (u32)Memory::base));
#endif
MOV(32, M(&temp64), R(EAX));
MOVD_xmm(XMM0, M(&temp64));
// SSE4 optimization opportunity here.
PXOR(XMM1, R(XMM1));
PUNPCKLBW(XMM0, R(XMM1));
PUNPCKLWD(XMM0, R(XMM1));
CVTDQ2PD(XMM0, R(XMM0));
fpr.LoadToX64(inst.RS, false, true);
X64Reg r = fpr.R(inst.RS).GetSimpleReg();
MOVDDUP(r, M((void *)&m_dequantizeTableD[ldScale]));
MULPD(r, R(XMM0));
if (update && offset != 0)
ADD(32, gpr.R(inst.RA), Imm32(offset));
}
break;
case QUANTIZE_S16:
{
gpr.LoadToX64(inst.RA, true, update);
#ifdef _M_X64
MOV(32, R(EAX), MComplex(RBX, gpr.R(inst.RA).GetSimpleReg(), 1, offset));
#else
LEA(32, EAX, MDisp(gpr.R(inst.RA).GetSimpleReg(), offset));
AND(32, R(EAX), Imm32(Memory::MEMVIEW32_MASK));
MOV(32, R(EAX), MDisp(EAX, (u32)Memory::base));
#endif
BSWAP(32, EAX);
MOV(32, M(&temp64), R(EAX));
fpr.LoadToX64(inst.RS, false, true);
X64Reg r = fpr.R(inst.RS).GetSimpleReg();
MOVD_xmm(XMM0, M(&temp64));
PUNPCKLWD(XMM0, R(XMM0)); // unpack to higher word in each dword..
PSRAD(XMM0, 16); // then use this signed shift to sign extend. clever eh? :P
CVTDQ2PD(XMM0, R(XMM0));
MOVDDUP(r, M((void*)&m_dequantizeTableD[ldScale]));
MULPD(r, R(XMM0));
SHUFPD(r, R(r), 1);
if (update && offset != 0)
ADD(32, gpr.R(inst.RA), Imm32(offset));
}
break;
/*
Dynamic quantizer. Todo when we have a test set.
MOVZX(32, 8, EAX, M(((char *)&PowerPC::ppcState.spr[SPR_GQR0 + inst.I]) + 3)); // it's in the high byte.
AND(32, R(EAX), Imm8(0x3F));
MOV(32, R(ECX), Imm32((u32)&m_dequantizeTableD));
MOVDDUP(r, MComplex(RCX, EAX, 8, 0));
*/
default:
// 4 0
// 6 0 //power tennis
// 5 0
// PanicAlert("ld:%i %i", ldType, (int)inst.W);
Default(inst);
return;
}
//u32 EA = (m_GPR[_inst.RA] + _inst.SIMM_12) : _inst.SIMM_12;
if (inst.W) {Default(inst); return;}
IREmitter::InstLoc addr = ibuild.EmitIntConst(inst.SIMM_12), val;
if (inst.RA)
addr = ibuild.EmitAdd(addr, ibuild.EmitLoadGReg(inst.RA));
val = ibuild.EmitLoadPaired(addr, inst.I);
val = ibuild.EmitExpandPackedToMReg(val);
ibuild.EmitStoreFReg(val, inst.RD);
}