[ARM] Push new ArmEmitter changes from PPSSPP. Mostly Fixes a few VFP/NEON instruction encodings.

2013-08-02 23:18:44 +00:00 · 2013-08-02 23:18:44 +00:00 · ab0f42636d
parent c1baed086d
commit ab0f42636d
3 changed files with 262 additions and 94 deletions
--- a/Source/Core/Common/Src/ArmCPUDetect.cpp
+++ b/Source/Core/Common/Src/ArmCPUDetect.cpp
@ -17,9 +17,10 @@

 #include "Common.h"
 #include "CPUDetect.h"
-#include "StringUtil.h"
 #include "FileUtil.h"

+// Only Linux platforms have /proc/cpuinfo
+#if !defined(BLACKBERRY) && !defined(IOS) && !defined(__SYMBIAN32__)
 const char procfile[] = "/proc/cpuinfo";

 char *GetCPUString()
@ -33,7 +34,7 @@ char *GetCPUString()
 	auto const fp = file.GetHandle();
 	if (!fp)
 		return 0;
-
+	
 	while (fgets(buf, sizeof(buf), fp))
 	{
 		if (strncmp(buf, marker, sizeof(marker) - 1))
@ -42,6 +43,7 @@ char *GetCPUString()
 		cpu_string = strndup(cpu_string, strlen(cpu_string) - 1); // Strip the newline
 		break;
 	}
+	
 	return cpu_string;
 }

@ -91,14 +93,13 @@ unsigned short GetCPUPart()
 		break;
 	}
 	return part;
-
 }

 bool CheckCPUFeature(const char *feature)
 {
 	const char marker[] = "Features\t: ";
 	char buf[1024];
-	
+
 	File::IOFile file(procfile, "r");
 	auto const fp = file.GetHandle();
 	if (!fp)
@ -117,10 +118,18 @@ bool CheckCPUFeature(const char *feature)
 			token = strtok(NULL, " ");
 		}
 	}
+	
 	return false;
 }
+#endif
+
 int GetCoreCount()
 {
+#ifdef __SYMBIAN32__
+	return 1;
+#elif defined(BLACKBERRY) || defined(IOS)
+	return 2;
+#else
 	const char marker[] = "processor\t: ";
 	int cores = 0;
 	char buf[1024];
@ -129,14 +138,16 @@ int GetCoreCount()
 	auto const fp = file.GetHandle();
 	if (!fp)
 		return 0;
-
+	
 	while (fgets(buf, sizeof(buf), fp))
 	{
 		if (strncmp(buf, marker, sizeof(marker) - 1))
 			continue;
 		++cores;
 	}
+	
 	return cores;
+#endif
 }

 CPUInfo cpu_info;
@ -153,12 +164,58 @@ void CPUInfo::Detect()
 	HTT = false;
 	OS64bit = false;
 	CPU64bit = false;
-	Mode64bit = false;
+	Mode64bit = false;				 
 	vendor = VENDOR_ARM;
-
+	
 	// Get the information about the CPU 
-	strncpy(cpu_string, GetCPUString(), sizeof(cpu_string));
 	num_cores = GetCoreCount();
+#if defined(__SYMBIAN32__) || defined(BLACKBERRY) || defined(IOS)
+	bool isVFP3 = false;
+	bool isVFP4 = false;
+#ifdef IOS
+	isVFP3 = true;
+    // Check for swift arch (VFP4`)
+    #ifdef __ARM_ARCH_7S__
+        isVFP4 = true;
+    #endif // #ifdef __ARM_ARCH_7S__
+#elif defined(BLACKBERRY)
+	isVFP3 = true;
+	const char cpuInfoPath[] = "/pps/services/hw_info/inventory";
+	const char marker[] = "Processor_Name::";
+	const char qcCPU[] = "MSM";
+	char buf[1024];
+	FILE* fp;
+	if (fp = fopen(cpuInfoPath, "r"))
+	{
+		while (fgets(buf, sizeof(buf), fp))
+		{
+			if (strncmp(buf, marker, sizeof(marker) - 1))
+				continue;
+			if (strncmp(buf + sizeof(marker) - 1, qcCPU, sizeof(qcCPU) - 1) == 0)
+				isVFP4 = true;
+			break;
+		}
+		fclose(fp);
+	}
+#endif
+	// Hardcode this for now
+	bSwp = true;
+	bHalf = true;
+	bThumb = false;
+	bFastMult = true;
+	bVFP = true;
+	bEDSP = true;
+	bThumbEE = isVFP3;
+	bNEON = isVFP3;
+	bVFPv3 = isVFP3;
+	bTLS = true;
+	bVFPv4 = isVFP4;
+	bIDIVa = isVFP4;
+	bIDIVt = isVFP4;
+	bFP = false;
+	bASIMD = false;
+#else
+	strncpy(cpu_string, GetCPUString(), sizeof(cpu_string));
 	bSwp = CheckCPUFeature("swp");
 	bHalf = CheckCPUFeature("half");
 	bThumb = CheckCPUFeature("thumb");
@ -172,16 +229,15 @@ void CPUInfo::Detect()
 	bVFPv4 = CheckCPUFeature("vfpv4");
 	bIDIVa = CheckCPUFeature("idiva");
 	bIDIVt = CheckCPUFeature("idivt");
-
 	// Qualcomm Krait supports IDIVA but it doesn't report it. Check for krait.
 	if (GetCPUImplementer() == 0x51 && GetCPUPart() == 0x6F) // Krait(300) is 0x6F, Scorpion is 0x4D
-		bIDIVa = bIDIVt = true; 
-
-	// These two are ARMv8 specific.
+		bIDIVa = bIDIVt = true;
+	// These two require ARMv8 or higher
 	bFP = CheckCPUFeature("fp");
 	bASIMD = CheckCPUFeature("asimd");
-
-
+#endif
+// On android, we build a separate library for ARMv7 so this is fine.
+// TODO: Check for ARMv7 on other platforms.
 #if defined(__ARM_ARCH_7A__)
 	bArmV7 = true;
 #else
@ -193,11 +249,14 @@ void CPUInfo::Detect()
 std::string CPUInfo::Summarize()
 {
 	std::string sum;
+#if defined(BLACKBERRY) || defined(IOS) || defined(__SYMBIAN32__)
+	sum = StringFromFormat("%i cores", num_cores);
+#else
 	if (num_cores == 1)
 		sum = StringFromFormat("%s, %i core", cpu_string, num_cores);
 	else
 		sum = StringFromFormat("%s, %i cores", cpu_string, num_cores);
-
+#endif
 	if (bSwp) sum += ", SWP";
 	if (bHalf) sum += ", Half";
 	if (bThumb) sum += ", Thumb";
--- a/Source/Core/Common/Src/ArmEmitter.cpp
+++ b/Source/Core/Common/Src/ArmEmitter.cpp
@ -86,7 +86,7 @@ bool TryMakeOperand2_AllowNegation(s32 imm, Operand2 &op2, bool *negated)
 Operand2 AssumeMakeOperand2(u32 imm) {
 	Operand2 op2;
 	bool result = TryMakeOperand2(imm, op2);
-	_assert_msg_(DYNA_REC, result, "Could not make assumed Operand2.");
+	_dbg_assert_msg_(DYNA_REC, result, "Could not make assumed Operand2.");
 	return op2;
 }

@ -117,14 +117,33 @@ bool ARMXEmitter::TrySetValue_TwoOp(ARMReg reg, u32 val)
 	return true;
 }

-void ARMXEmitter::MOVI2F(ARMReg dest, float val, ARMReg tempReg)
+void ARMXEmitter::MOVI2F(ARMReg dest, float val, ARMReg tempReg, bool negate)
 {
 	union {float f; u32 u;} conv;
-	conv.f = val;
+	conv.f = negate ? -val : val;
+	// Try moving directly first if mantisse is empty
+	if (cpu_info.bVFPv3 && ((conv.u & 0x7FFFF) == 0))
+	{
+		// VFP Encoding for Imms: <7> Not(<6>) Repeat(<6>,5) <5:0> Zeros(19)
+		bool bit6 = (conv.u & 0x40000000) == 0x40000000;
+		bool canEncode = true;
+		for (u32 mask = 0x20000000; mask >= 0x2000000; mask >>= 1)
+		{
+			if (((conv.u & mask) == mask) == bit6)
+				canEncode = false;
+		}
+		if (canEncode)
+		{
+			u32 imm8 = (conv.u & 0x80000000) >> 24; // sign bit
+			imm8 |= (!bit6 << 6);
+			imm8 |= (conv.u & 0x1F80000) >> 19;
+			VMOV(dest, IMM(imm8));
+			return;
+		}
+	}
 	MOVI2R(tempReg, conv.u);
 	VMOV(dest, tempReg);
-	// TODO: VMOV an IMM directly if possible
-	// Otherwise, use a literal pool and VLDR directly (+- 1020)
+	// Otherwise, possible to use a literal pool and VLDR directly (+- 1020)
 }

 void ARMXEmitter::ADDI2R(ARMReg rd, ARMReg rs, u32 val, ARMReg scratch)
@ -246,8 +265,12 @@ void ARMXEmitter::MOVI2R(ARMReg reg, u32 val, bool optimize)
 }

 void ARMXEmitter::QuickCallFunction(ARMReg reg, void *func) {
-	MOVI2R(reg, (u32)(func));
-	BL(reg);
+	if (BLInRange(func)) {
+		BL(func);
+	} else {
+		MOVI2R(reg, (u32)(func));
+		BL(reg);
+	}
 }

 void ARMXEmitter::SetCodePtr(u8 *ptr)
@ -369,7 +392,7 @@ FixupBranch ARMXEmitter::B_CC(CCFlags Cond)
 void ARMXEmitter::B_CC(CCFlags Cond, const void *fnptr)
 {
 	s32 distance = (s32)fnptr - (s32(code) + 8);
-        _assert_msg_(DYNA_REC, distance > -33554432
+        _dbg_assert_msg_(DYNA_REC, distance > -33554432
                     && distance <=  33554432,
                     "B_CC out of range (%p calls %p)", code, fnptr);

@ -388,7 +411,7 @@ FixupBranch ARMXEmitter::BL_CC(CCFlags Cond)
 void ARMXEmitter::SetJumpTarget(FixupBranch const &branch)
 {
 	s32 distance =  (s32(code) - 8)  - (s32)branch.ptr;
-     _assert_msg_(DYNA_REC, distance > -33554432
+     _dbg_assert_msg_(DYNA_REC, distance > -33554432
                     && distance <=  33554432,
                     "SetJumpTarget out of range (%p calls %p)", code,
 					 branch.ptr);
@ -402,7 +425,7 @@ void ARMXEmitter::SetJumpTarget(FixupBranch const &branch)
 void ARMXEmitter::B (const void *fnptr)
 {
 	s32 distance = (s32)fnptr - (s32(code) + 8);
-        _assert_msg_(DYNA_REC, distance > -33554432
+        _dbg_assert_msg_(DYNA_REC, distance > -33554432
                     && distance <=  33554432,
                     "B out of range (%p calls %p)", code, fnptr);

@ -414,10 +437,18 @@ void ARMXEmitter::B(ARMReg src)
 	Write32(condition | 0x12FFF10 | src);
 }

+bool ARMXEmitter::BLInRange(const void *fnptr) {
+	s32 distance = (s32)fnptr - (s32(code) + 8);
+	if (distance <= -33554432 || distance > 33554432)
+		return false;
+	else
+		return true;
+}
+
 void ARMXEmitter::BL(const void *fnptr)
 {
 	s32 distance = (s32)fnptr - (s32(code) + 8);
-        _assert_msg_(DYNA_REC, distance > -33554432
+        _dbg_assert_msg_(DYNA_REC, distance > -33554432
                     && distance <=  33554432,
                     "BL out of range (%p calls %p)", code, fnptr);
 	Write32(condition | 0x0B000000 | ((distance >> 2) & 0x00FFFFFF));
@ -555,7 +586,7 @@ void ARMXEmitter::WriteInstruction (u32 Op, ARMReg Rd, ARMReg Rn, Operand2 Rm, b
 		}
 	}
 	if (op == -1)
-		_assert_msg_(DYNA_REC, false, "%s not yet support %d", InstNames[Op], Rm.GetType()); 
+		_dbg_assert_msg_(DYNA_REC, false, "%s not yet support %d", InstNames[Op], Rm.GetType()); 
 	Write32(condition | (op << 21) | (SetFlags ? (1 << 20) : 0) | Rn << 16 | Rd << 12 | Data);
 }

@ -652,11 +683,11 @@ void ARMXEmitter::RBIT(ARMReg dest, ARMReg src)
 }
 void ARMXEmitter::REV (ARMReg dest, ARMReg src) 
 {
-	Write32(condition | (0x6B << 20) | (0xF << 16) | (dest << 12) | (0xF3 << 4) | src);
+	Write32(condition | (0x6BF << 16) | (dest << 12) | (0xF3 << 4) | src);
 }
 void ARMXEmitter::REV16(ARMReg dest, ARMReg src)
 {
-	Write32(condition | (0x3DF << 16) | (dest << 12) | (0xFD << 4) | src);
+	Write32(condition | (0x6BF << 16) | (dest << 12) | (0xFB << 4) | src);
 }

 void ARMXEmitter::_MSR (bool write_nzcvq, bool write_g,		Operand2 op2)
@ -677,7 +708,7 @@ void ARMXEmitter::LDREX(ARMReg dest, ARMReg base)
 }
 void ARMXEmitter::STREX(ARMReg result, ARMReg base, ARMReg op)
 {
-	_assert_msg_(DYNA_REC, (result != base && result != op), "STREX dest can't be other two registers");
+	_dbg_assert_msg_(DYNA_REC, (result != base && result != op), "STREX dest can't be other two registers");
 	Write32(condition | (24 << 20) | (base << 16) | (result << 12) | (0xF9 << 4) | op);
 }
 void ARMXEmitter::DMB ()
@ -730,7 +761,7 @@ void ARMXEmitter::WriteStoreOp(u32 Op, ARMReg Rt, ARMReg Rn, Operand2 Rm, bool R
 	bool SignedLoad = false;

 	if (op == -1)
-		_assert_msg_(DYNA_REC, false, "%s does not support %d", LoadStoreNames[Op], Rm.GetType()); 
+		_dbg_assert_msg_(DYNA_REC, false, "%s does not support %d", LoadStoreNames[Op], Rm.GetType()); 

 	switch (Op)
 	{
@ -854,10 +885,25 @@ ARMReg ARMXEmitter::SubBase(ARMReg Reg)
 }

 // NEON Specific
+void ARMXEmitter::VABD(IntegerSize Size, ARMReg Vd, ARMReg Vn, ARMReg Vm)
+{
+	_dbg_assert_msg_(DYNA_REC, Vd >= D0, "Pass invalid register to VABD(float)");
+	_dbg_assert_msg_(DYNA_REC, cpu_info.bNEON, "Can't use VABD(float) when CPU doesn't support it");
+	bool register_quad = Vd >= Q0;
+
+	// Gets encoded as a double register
+	Vd = SubBase(Vd);
+	Vn = SubBase(Vn);
+	Vm = SubBase(Vm);
+
+	Write32((0xF3 << 24) | ((Vd & 0x10) << 18) | (Size << 20) | ((Vn & 0xF) << 16) \
+		| ((Vd & 0xF) << 12) | (0xD << 8) | ((Vn & 0x10) << 3) | (register_quad << 6) \
+		| ((Vm & 0x10) << 2) | (Vm & 0xF));
+}
 void ARMXEmitter::VADD(IntegerSize Size, ARMReg Vd, ARMReg Vn, ARMReg Vm)
 {
-	_assert_msg_(DYNA_REC, Vd >= D0, "Pass invalid register to VADD(integer)");
-	_assert_msg_(DYNA_REC, cpu_info.bNEON, "Can't use VADD(integer) when CPU doesn't support it");
+	_dbg_assert_msg_(DYNA_REC, Vd >= D0, "Pass invalid register to VADD(integer)");
+	_dbg_assert_msg_(DYNA_REC, cpu_info.bNEON, "Can't use VADD(integer) when CPU doesn't support it");

 	bool register_quad = Vd >= Q0;

@ -868,13 +914,13 @@ void ARMXEmitter::VADD(IntegerSize Size, ARMReg Vd, ARMReg Vn, ARMReg Vm)

 	Write32((0xF2 << 24) | ((Vd & 0x10) << 18) | (Size << 20) | ((Vn & 0xF) << 16) \
 		| ((Vd & 0xF) << 12) | (0x8 << 8) | ((Vn & 0x10) << 3) | (register_quad << 6) \
-		| ((Vm & 0x10) << 1) | (Vm & 0xF)); 
+		| ((Vm & 0x10) << 1) | (Vm & 0xF));

 }
 void ARMXEmitter::VSUB(IntegerSize Size, ARMReg Vd, ARMReg Vn, ARMReg Vm)
 {
-	_assert_msg_(DYNA_REC, Vd >= Q0, "Pass invalid register to VSUB(integer)");
-	_assert_msg_(DYNA_REC, cpu_info.bNEON, "Can't use VSUB(integer) when CPU doesn't support it");
+	_dbg_assert_msg_(DYNA_REC, Vd >= Q0, "Pass invalid register to VSUB(integer)");
+	_dbg_assert_msg_(DYNA_REC, cpu_info.bNEON, "Can't use VSUB(integer) when CPU doesn't support it");

 	// Gets encoded as a double register
 	Vd = SubBase(Vd);
@ -883,36 +929,37 @@ void ARMXEmitter::VSUB(IntegerSize Size, ARMReg Vd, ARMReg Vn, ARMReg Vm)

 	Write32((0xF3 << 24) | ((Vd & 0x10) << 18) | (Size << 20) | ((Vn & 0xF) << 16) \
 		| ((Vd & 0xF) << 12) | (0x8 << 8) | ((Vn & 0x10) << 3) | (1 << 6) \
-		| ((Vm & 0x10) << 2) | (Vm & 0xF)); 
+		| ((Vm & 0x10) << 2) | (Vm & 0xF));
 }

-// VFP Specific
-struct VFPEnc
-{
-	s16 opc1;
-	s16 opc2;
-};
 // Double/single, Neon
-const VFPEnc VFPOps[][2] = {
+extern const VFPEnc VFPOps[16][2] = {
 	{{0xE0, 0xA0}, {0x20, 0xD1}}, // 0: VMLA
-	{{0xE0, 0xA4}, {0x22, 0xD1}}, // 1: VMLS
-	{{0xE3, 0xA0}, {0x20, 0xD0}}, // 2: VADD
-	{{0xE3, 0xA4}, {0x22, 0xD0}}, // 3: VSUB
-	{{0xE2, 0xA0}, {0x30, 0xD1}}, // 4: VMUL
-	{{0xEB, 0xAC}, {  -1 /* 0x3B */,  -1 /* 0x70 */}}, // 5: VABS(Vn(0x0) used for encoding)
-	{{0xE8, 0xA0}, {  -1,   -1}}, // 6: VDIV
-	{{0xEB, 0xA4}, {  -1 /* 0x3B */,   -1 /* 0x78 */}}, // 7: VNEG(Vn(0x1) used for encoding)
-	{{0xEB, 0xAC}, {  -1,   -1}}, // 8: VSQRT (Vn(0x1) used for encoding)
-	{{0xEB, 0xA4}, {  -1,   -1}}, // 9: VCMP (Vn(0x4 | #0 ? 1 : 0) used for encoding)
-	{{0xEB, 0xAC}, {  -1,   -1}}, // 10: VCMPE (Vn(0x4 | #0 ? 1 : 0) used for encoding)
-	{{  -1,   -1}, {0x3B, 0x30}}, // 11: VABSi
+	{{0xE1, 0xA4}, {  -1,   -1}}, // 1: VNMLA
+	{{0xE0, 0xA4}, {0x22, 0xD1}}, // 2: VMLS
+	{{0xE1, 0xA0}, {  -1,   -1}}, // 3: VNMLS
+	{{0xE3, 0xA0}, {0x20, 0xD0}}, // 4: VADD
+	{{0xE3, 0xA4}, {0x22, 0xD0}}, // 5: VSUB
+	{{0xE2, 0xA0}, {0x30, 0xD1}}, // 6: VMUL
+	{{0xE2, 0xA4}, {  -1,   -1}}, // 7: VNMUL
+	{{0xEB, 0xAC}, {  -1 /* 0x3B */,  -1 /* 0x70 */}}, // 8: VABS(Vn(0x0) used for encoding)
+	{{0xE8, 0xA0}, {  -1,   -1}}, // 9: VDIV
+	{{0xEB, 0xA4}, {  -1 /* 0x3B */,   -1 /* 0x78 */}}, // 10: VNEG(Vn(0x1) used for encoding)
+	{{0xEB, 0xAC}, {  -1,   -1}}, // 11: VSQRT (Vn(0x1) used for encoding)
+	{{0xEB, 0xA4}, {  -1,   -1}}, // 12: VCMP (Vn(0x4 | #0 ? 1 : 0) used for encoding)
+	{{0xEB, 0xAC}, {  -1,   -1}}, // 13: VCMPE (Vn(0x4 | #0 ? 1 : 0) used for encoding)
+	{{  -1,   -1}, {0x3B, 0x30}}, // 14: VABSi
 	};
-const char *VFPOpNames[] = {
+
+extern const char *VFPOpNames[16] = {
 	"VMLA",
+	"VNMLA",
 	"VMLS",
+	"VNMLS",
 	"VADD",
 	"VSUB",
 	"VMUL",
+	"VNMUL",
 	"VABS",
 	"VDIV",
 	"VNEG",
@ -966,6 +1013,7 @@ u32 ARMXEmitter::EncodeVm(ARMReg Vm)
 		else
 			return ((Reg & 0x1) << 5) | (Reg >> 1);
 }
+
 void ARMXEmitter::WriteVFPDataOp(u32 Op, ARMReg Vd, ARMReg Vn, ARMReg Vm)
 {
 	bool quad_reg = Vd >= Q0;
@ -973,39 +1021,42 @@ void ARMXEmitter::WriteVFPDataOp(u32 Op, ARMReg Vd, ARMReg Vn, ARMReg Vm)

 	VFPEnc enc = VFPOps[Op][quad_reg];
 	if (enc.opc1 == -1 && enc.opc2 == -1)
-		_assert_msg_(DYNA_REC, false, "%s does not support %s", VFPOpNames[Op], quad_reg ? "NEON" : "VFP"); 
+		_dbg_assert_msg_(DYNA_REC, false, "%s does not support %s", VFPOpNames[Op], quad_reg ? "NEON" : "VFP"); 
 	u32 VdEnc = EncodeVd(Vd);
 	u32 VnEnc = EncodeVn(Vn);
 	u32 VmEnc = EncodeVm(Vm);
 	u32 cond = quad_reg ? (0xF << 28) : condition;
-	
+
 	Write32(cond | (enc.opc1 << 20) | VnEnc | VdEnc | (enc.opc2 << 4) | (quad_reg << 6) | (double_reg << 8) | VmEnc);
 }
 void ARMXEmitter::VMLA(ARMReg Vd, ARMReg Vn, ARMReg Vm){ WriteVFPDataOp(0, Vd, Vn, Vm); }
-void ARMXEmitter::VMLS(ARMReg Vd, ARMReg Vn, ARMReg Vm){ WriteVFPDataOp(1, Vd, Vn, Vm); }
-void ARMXEmitter::VADD(ARMReg Vd, ARMReg Vn, ARMReg Vm){ WriteVFPDataOp(2, Vd, Vn, Vm); }
-void ARMXEmitter::VSUB(ARMReg Vd, ARMReg Vn, ARMReg Vm){ WriteVFPDataOp(3, Vd, Vn, Vm); }
-void ARMXEmitter::VMUL(ARMReg Vd, ARMReg Vn, ARMReg Vm){ WriteVFPDataOp(4, Vd, Vn, Vm); }
-void ARMXEmitter::VABS(ARMReg Vd, ARMReg Vm){ WriteVFPDataOp(5, Vd, D0, Vm); }
-void ARMXEmitter::VDIV(ARMReg Vd, ARMReg Vn, ARMReg Vm){ WriteVFPDataOp(6, Vd, Vn, Vm); }
-void ARMXEmitter::VNEG(ARMReg Vd, ARMReg Vm){ WriteVFPDataOp(7, Vd, D1, Vm); }
-void ARMXEmitter::VSQRT(ARMReg Vd, ARMReg Vm){ WriteVFPDataOp(8, Vd, D1, Vm); }
-void ARMXEmitter::VCMP(ARMReg Vd, ARMReg Vm){ WriteVFPDataOp(9, Vd, D4, Vm); }
-void ARMXEmitter::VCMPE(ARMReg Vd, ARMReg Vm){ WriteVFPDataOp(10, Vd, D4, Vm); }
-void ARMXEmitter::VCMP(ARMReg Vd){ WriteVFPDataOp(9, Vd, D5, D0); }
-void ARMXEmitter::VCMPE(ARMReg Vd){ WriteVFPDataOp(10, Vd, D5, D0); }
+void ARMXEmitter::VNMLA(ARMReg Vd, ARMReg Vn, ARMReg Vm){ WriteVFPDataOp(1, Vd, Vn, Vm); }
+void ARMXEmitter::VMLS(ARMReg Vd, ARMReg Vn, ARMReg Vm){ WriteVFPDataOp(2, Vd, Vn, Vm); }
+void ARMXEmitter::VNMLS(ARMReg Vd, ARMReg Vn, ARMReg Vm){ WriteVFPDataOp(3, Vd, Vn, Vm); }
+void ARMXEmitter::VADD(ARMReg Vd, ARMReg Vn, ARMReg Vm){ WriteVFPDataOp(4, Vd, Vn, Vm); }
+void ARMXEmitter::VSUB(ARMReg Vd, ARMReg Vn, ARMReg Vm){ WriteVFPDataOp(5, Vd, Vn, Vm); }
+void ARMXEmitter::VMUL(ARMReg Vd, ARMReg Vn, ARMReg Vm){ WriteVFPDataOp(6, Vd, Vn, Vm); }
+void ARMXEmitter::VNMUL(ARMReg Vd, ARMReg Vn, ARMReg Vm){ WriteVFPDataOp(7, Vd, Vn, Vm); }
+void ARMXEmitter::VABS(ARMReg Vd, ARMReg Vm){ WriteVFPDataOp(8, Vd, D0, Vm); }
+void ARMXEmitter::VDIV(ARMReg Vd, ARMReg Vn, ARMReg Vm){ WriteVFPDataOp(9, Vd, Vn, Vm); }
+void ARMXEmitter::VNEG(ARMReg Vd, ARMReg Vm){ WriteVFPDataOp(10, Vd, D1, Vm); }
+void ARMXEmitter::VSQRT(ARMReg Vd, ARMReg Vm){ WriteVFPDataOp(11, Vd, D1, Vm); }
+void ARMXEmitter::VCMP(ARMReg Vd, ARMReg Vm){ WriteVFPDataOp(12, Vd, D4, Vm); }
+void ARMXEmitter::VCMPE(ARMReg Vd, ARMReg Vm){ WriteVFPDataOp(13, Vd, D4, Vm); }
+void ARMXEmitter::VCMP(ARMReg Vd){ WriteVFPDataOp(12, Vd, D5, D0); }
+void ARMXEmitter::VCMPE(ARMReg Vd){ WriteVFPDataOp(13, Vd, D5, D0); }

 void ARMXEmitter::VLDR(ARMReg Dest, ARMReg Base, s16 offset)
 {
-	_assert_msg_(DYNA_REC, Dest >= S0 && Dest <= D31, "Passed Invalid dest register to VLDR"); 
-	_assert_msg_(DYNA_REC, Base <= R15, "Passed invalid Base register to VLDR");
+	_dbg_assert_msg_(DYNA_REC, Dest >= S0 && Dest <= D31, "Passed Invalid dest register to VLDR");
+	_dbg_assert_msg_(DYNA_REC, Base <= R15, "Passed invalid Base register to VLDR");

 	bool Add = offset >= 0 ? true : false;
 	u32 imm = abs(offset);

-	_assert_msg_(DYNA_REC, (imm & 0xC03) == 0, "VLDR: Offset needs to be word aligned and small enough");
+	_dbg_assert_msg_(DYNA_REC, (imm & 0xC03) == 0, "VLDR: Offset needs to be word aligned and small enough");

-	if (imm & 0xC03) 
+	if (imm & 0xC03)
 		ERROR_LOG(DYNA_REC, "VLDR: Bad offset %08x", imm);

 	bool single_reg = Dest < D0;
@ -1015,25 +1066,25 @@ void ARMXEmitter::VLDR(ARMReg Dest, ARMReg Base, s16 offset)
 	if (single_reg)
 	{
 		Write32(condition | (0xD << 24) | (Add << 23) | ((Dest & 0x1) << 22) | (1 << 20) | (Base << 16) \
-			| ((Dest & 0x1E) << 11) | (10 << 8) | (imm >> 2));	
+			| ((Dest & 0x1E) << 11) | (10 << 8) | (imm >> 2));
 	}
 	else
 	{
 		Write32(condition | (0xD << 24) | (Add << 23) | ((Dest & 0x10) << 18) | (1 << 20) | (Base << 16) \
-			| ((Dest & 0xF) << 12) | (11 << 8) | (imm >> 2));	
+			| ((Dest & 0xF) << 12) | (11 << 8) | (imm >> 2));
 	}
 }
 void ARMXEmitter::VSTR(ARMReg Src, ARMReg Base, s16 offset)
 {
-	_assert_msg_(DYNA_REC, Src >= S0 && Src <= D31, "Passed invalid src register to VSTR");
-	_assert_msg_(DYNA_REC, Base <= R15, "Passed invalid base register to VSTR");
+	_dbg_assert_msg_(DYNA_REC, Src >= S0 && Src <= D31, "Passed invalid src register to VSTR");
+	_dbg_assert_msg_(DYNA_REC, Base <= R15, "Passed invalid base register to VSTR");

 	bool Add = offset >= 0 ? true : false;
 	u32 imm = abs(offset);

-	_assert_msg_(DYNA_REC, (imm & 0xC03) == 0, "VSTR: Offset needs to be word aligned and small enough");
+	_dbg_assert_msg_(DYNA_REC, (imm & 0xC03) == 0, "VSTR: Offset needs to be word aligned and small enough");

-	if (imm & 0xC03) 
+	if (imm & 0xC03)
 		ERROR_LOG(DYNA_REC, "VSTR: Bad offset %08x", imm);

 	bool single_reg = Src < D0;
@ -1043,12 +1094,12 @@ void ARMXEmitter::VSTR(ARMReg Src, ARMReg Base, s16 offset)
 	if (single_reg)
 	{
 		Write32(condition | (0xD << 24) | (Add << 23) | ((Src & 0x1) << 22) | (Base << 16) \
-			| ((Src & 0x1E) << 11) | (10 << 8) | (imm >> 2));	
+			| ((Src & 0x1E) << 11) | (10 << 8) | (imm >> 2));
 	}
 	else
 	{
 		Write32(condition | (0xD << 24) | (Add << 23) | ((Src & 0x10) << 18) | (Base << 16) \
-			| ((Src & 0xF) << 12) | (11 << 8) | (imm >> 2));	
+			| ((Src & 0xF) << 12) | (11 << 8) | (imm >> 2));
 	}
 }

@ -1063,15 +1114,20 @@ void ARMXEmitter::VMSR(ARMReg Rt) {
 }

 // VFP and ASIMD
+void ARMXEmitter::VMOV(ARMReg Dest, Operand2 op2)
+{
+	_dbg_assert_msg_(DYNA_REC, cpu_info.bVFPv3, "VMOV #imm requires VFPv3");
+	Write32(condition | (0xEB << 20) | EncodeVd(Dest) | (0xA << 8) | op2.Imm8VFP());
+}
 void ARMXEmitter::VMOV(ARMReg Dest, ARMReg Src, bool high)
 {
-	_assert_msg_(DYNA_REC, Src < S0, "This VMOV doesn't support SRC other than ARM Reg");
-	_assert_msg_(DYNA_REC, Dest >= D0, "This VMOV doesn't support DEST other than VFP");
+	_dbg_assert_msg_(DYNA_REC, Src < S0, "This VMOV doesn't support SRC other than ARM Reg");
+	_dbg_assert_msg_(DYNA_REC, Dest >= D0, "This VMOV doesn't support DEST other than VFP");

 	Dest = SubBase(Dest);

 	Write32(condition | (0xE << 24) | (high << 21) | ((Dest & 0xF) << 16) | (Src << 12) \
-		| (11 << 8) | ((Dest & 0x10) << 3) | (1 << 4));
+		| (0xB << 8) | ((Dest & 0x10) << 3) | (1 << 4));
 }

 void ARMXEmitter::VMOV(ARMReg Dest, ARMReg Src)
@ -1091,7 +1147,7 @@ void ARMXEmitter::VMOV(ARMReg Dest, ARMReg Src)
 			else
 			{
 				// Move 64bit from Arm reg
-				_assert_msg_(DYNA_REC, false, "This VMOV doesn't support moving 64bit ARM to NEON");
+				_dbg_assert_msg_(DYNA_REC, false, "This VMOV doesn't support moving 64bit ARM to NEON");
 				return;
 			}
 		}
@ -1111,14 +1167,14 @@ void ARMXEmitter::VMOV(ARMReg Dest, ARMReg Src)
 			else
 			{
 				// Move 64bit To Arm reg
-				_assert_msg_(DYNA_REC, false, "This VMOV doesn't support moving 64bit ARM From NEON");
+				_dbg_assert_msg_(DYNA_REC, false, "This VMOV doesn't support moving 64bit ARM From NEON");
 				return;
 			}
 		}
 		else
 		{
 			// Move Arm reg to Arm reg
-			_assert_msg_(DYNA_REC, false, "VMOV doesn't support moving ARM registers");
+			_dbg_assert_msg_(DYNA_REC, false, "VMOV doesn't support moving ARM registers");
 		}
 	}
 	// Moving NEON registers
@ -1127,7 +1183,7 @@ void ARMXEmitter::VMOV(ARMReg Dest, ARMReg Src)
 	bool Single = DestSize == 1;
 	bool Quad = DestSize == 4;

-	_assert_msg_(DYNA_REC, SrcSize == DestSize, "VMOV doesn't support moving different register sizes");
+	_dbg_assert_msg_(DYNA_REC, SrcSize == DestSize, "VMOV doesn't support moving different register sizes");

 	Dest = SubBase(Dest);
 	Src = SubBase(Src);
@ -1142,7 +1198,7 @@ void ARMXEmitter::VMOV(ARMReg Dest, ARMReg Src)
 		// Double and quad
 		if (Quad)
 		{
-			_assert_msg_(DYNA_REC, cpu_info.bNEON, "Trying to use quad registers when you don't support ASIMD."); 
+			_dbg_assert_msg_(DYNA_REC, cpu_info.bNEON, "Trying to use quad registers when you don't support ASIMD."); 
 			// Gets encoded as a Double register
 			Write32((0xF2 << 24) | ((Dest & 0x10) << 18) | (2 << 20) | ((Src & 0xF) << 16) \
 				| ((Dest & 0xF) << 12) | (1 << 8) | ((Src & 0x10) << 3) | (1 << 6) \
@ -1160,13 +1216,39 @@ void ARMXEmitter::VMOV(ARMReg Dest, ARMReg Src)
 void ARMXEmitter::VCVT(ARMReg Dest, ARMReg Source, int flags)
 {
 	bool single_reg = (Dest < D0) && (Source < D0);
+	bool single_double = !single_reg && (Source < D0 || Dest < D0);
+	bool single_to_double = Source < D0;
 	int op  = ((flags & TO_INT) ? (flags & ROUND_TO_ZERO) : (flags & IS_SIGNED)) ? 1 : 0;
 	int op2 = ((flags & TO_INT) ? (flags & IS_SIGNED) : 0) ? 1 : 0;
 	Dest = SubBase(Dest);
 	Source = SubBase(Source);

-	if (single_reg)
+	if (single_double)
 	{
+		// S32<->F64
+		if ((flags & TO_INT) || (flags & TO_FLOAT))
+		{
+			if (single_to_double)
+			{
+				Write32(condition | (0x1D << 23) | ((Dest & 0x10) << 18) | (0x7 << 19) \
+					| ((Dest & 0xF) << 12) | (op << 7) | (0x2D << 6) | ((Source & 0x1) << 5) | (Source >> 1));
+			} else {
+				Write32(condition | (0x1D << 23) | ((Dest & 0x1) << 22) | (0x7 << 19) | ((flags & TO_INT) << 18) | (op2 << 16) \
+					| ((Dest & 0x1E) << 11) | (op << 7) | (0x2D << 6) | ((Source & 0x10) << 1) | (Source & 0xF));
+			}
+		}
+		// F32<->F64
+		else {
+			if (single_to_double)
+			{
+				Write32(condition | (0x1D << 23) | ((Dest & 0x10) << 18) | (0x3 << 20) | (0x7 << 16) \
+					| ((Dest & 0xF) << 12) | (0x2F << 6) | ((Source & 0x1) << 5) | (Source >> 1));
+			} else {
+				Write32(condition | (0x1D << 23) | ((Dest & 0x1) << 22) | (0x3 << 20) | (0x7 << 16) \
+					| ((Dest & 0x1E) << 11) | (0x2B << 6) | ((Source & 0x10) << 1) | (Source & 0xF));
+			}
+		}
+	} else if (single_reg) {
 		Write32(condition | (0x1D << 23) | ((Dest & 0x1) << 22) | (0x7 << 19) | ((flags & TO_INT) << 18) | (op2 << 16) \
 			| ((Dest & 0x1E) << 11) | (op << 7) | (0x29 << 6) | ((Source & 0x1) << 5) | (Source >> 1));
 	} else {
--- a/Source/Core/Common/Src/ArmEmitter.h
+++ b/Source/Core/Common/Src/ArmEmitter.h
@ -365,7 +365,7 @@ private:
 	u32 EncodeVn(ARMReg Vn);
 	u32 EncodeVm(ARMReg Vm);
 	void WriteVFPDataOp(u32 Op, ARMReg Vd, ARMReg Vn, ARMReg Vm);
-	
+
 	void Write4OpMultiply(u32 op, ARMReg destLo, ARMReg destHi, ARMReg rn, ARMReg rm);

 	// New Ops
@ -431,6 +431,7 @@ public:
 	void B (ARMReg src);
 	void BL(const void *fnptr);
 	void BL(ARMReg src);
+	bool BLInRange(const void *fnptr);

 	void PUSH(const int num, ...);
 	void POP(const int num, ...);
@ -530,6 +531,7 @@ public:
 	// Subtracts the base from the register to give us the real one
 	ARMReg SubBase(ARMReg Reg);	
 	// NEON Only
+	void VABD(IntegerSize Size, ARMReg Vd, ARMReg Vn, ARMReg Vm);
 	void VADD(IntegerSize Size, ARMReg Vd, ARMReg Vn, ARMReg Vm);
 	void VSUB(IntegerSize Size, ARMReg Vd, ARMReg Vn, ARMReg Vm);

@ -541,6 +543,10 @@ public:
 	// Compares against zero
 	void VCMP(ARMReg Vd);
 	void VCMPE(ARMReg Vd);
+
+	void VNMLA(ARMReg Vd, ARMReg Vn, ARMReg Vm);
+	void VNMLS(ARMReg Vd, ARMReg Vn, ARMReg Vm);
+	void VNMUL(ARMReg Vd, ARMReg Vn, ARMReg Vm);
 	void VDIV(ARMReg Vd, ARMReg Vn, ARMReg Vm);
 	void VSQRT(ARMReg Vd, ARMReg Vm);

@ -552,6 +558,7 @@ public:
 	void VMUL(ARMReg Vd, ARMReg Vn, ARMReg Vm);
 	void VMLA(ARMReg Vd, ARMReg Vn, ARMReg Vm);
 	void VMLS(ARMReg Vd, ARMReg Vn, ARMReg Vm);
+	void VMOV(ARMReg Dest, Operand2 op2);
 	void VMOV(ARMReg Dest, ARMReg Src, bool high);
 	void VMOV(ARMReg Dest, ARMReg Src);
 	void VCVT(ARMReg Dest, ARMReg Src, int flags);
@ -564,7 +571,7 @@ public:

 	// Wrapper around MOVT/MOVW with fallbacks.
 	void MOVI2R(ARMReg reg, u32 val, bool optimize = true);
-	void MOVI2F(ARMReg dest, float val, ARMReg tempReg);
+	void MOVI2F(ARMReg dest, float val, ARMReg tempReg, bool negate = false);

 	void ADDI2R(ARMReg rd, ARMReg rs, u32 val, ARMReg scratch);
 	void ANDI2R(ARMReg rd, ARMReg rs, u32 val, ARMReg scratch);
@ -624,7 +631,11 @@ public:
 	// Start over if you need to change the code (call FreeCodeSpace(), AllocCodeSpace()).
 	void WriteProtect()
 	{
-		WriteProtectMemory(region, region_size, true);		
+		WriteProtectMemory(region, region_size, true);
+	}
+	void UnWriteProtect()
+	{
+		UnWriteProtectMemory(region, region_size, false);
 	}

 	void ResetCodePtr()
@ -636,8 +647,24 @@ public:
 	{
 		return region_size - (GetCodePtr() - region);
 	}
+
+	u8 *GetBasePtr() {
+		return region;
+	}
+
+	size_t GetOffset(u8 *ptr) {
+		return ptr - region;
+	}
 };

+// VFP Specific
+struct VFPEnc {
+	s16 opc1;
+	s16 opc2;
+};
+extern const VFPEnc VFPOps[16][2];
+extern const char *VFPOpNames[16];
+
 }  // namespace

 #endif // _DOLPHIN_INTEL_CODEGEN_