A few vcmp*'s, float_16_to_32 bit of vupkd3d128 (untested).

This commit is contained in:
Ben Vanik 2013-10-02 23:25:05 -07:00
parent 21d273e85b
commit 6d46b51ed4
2 changed files with 221 additions and 94 deletions


@ -348,7 +348,7 @@ typedef struct {
uint32_t VD128l : 5;
uint32_t : 6;
} VX128_P;
// kXEPPCInstrFormatVX128
// kXEPPCInstrFormatVX128_R
struct {
// VD128 = VD128l | (VD128h << 5)
// VA128 = VA128l | (VA128h << 5) | (VA128H << 6)


@ -48,6 +48,9 @@ namespace x64 {
#define VX128_3_VD128 (i.VX128_3.VD128l | (i.VX128_3.VD128h << 5))
#define VX128_3_VB128 (i.VX128_3.VB128l | (i.VX128_3.VB128h << 5))
#define VX128_3_IMM (i.VX128_3.IMM)
#define VX128_R_VD128 (i.VX128_R.VD128l | (i.VX128_R.VD128h << 5))
#define VX128_R_VA128 (i.VX128_R.VA128l | (i.VX128_R.VA128h << 5) | (i.VX128_R.VA128H << 6))
#define VX128_R_VB128 (i.VX128_R.VB128l | (i.VX128_R.VB128h << 5))
XEEMITTER(dst, 0x7C0002AC, XDSS)(X64Emitter& e, X86Compiler& c, InstrData& i) {
@ -425,94 +428,202 @@ XEEMITTER(vcfpuxws128, VX128_3(6, 624), VX128_3)(X64Emitter& e, X86Compiler&
return 1;
}
XEEMITTER(vcmpbfp, 0x100003C6, VXR )(X64Emitter& e, X86Compiler& c, InstrData& i) {
int InstrEmit_vcmpbfp_(X64Emitter& e, X86Compiler& c, InstrData& i, uint32_t vd, uint32_t va, uint32_t vb, uint32_t rc) {
XEINSTRNOTIMPLEMENTED();
return 1;
}
XEEMITTER(vcmpbfp, 0x100003C6, VXR )(X64Emitter& e, X86Compiler& c, InstrData& i) {
return InstrEmit_vcmpbfp_(e, c, i, i.VXR.VD, i.VXR.VA, i.VXR.VB, i.VXR.Rc);
}
XEEMITTER(vcmpbfp128, VX128(6, 384), VX128_R)(X64Emitter& e, X86Compiler& c, InstrData& i) {
XEINSTRNOTIMPLEMENTED();
return 1;
return InstrEmit_vcmpbfp_(e, c, i, VX128_R_VD128, VX128_R_VA128, VX128_R_VB128, i.VX128_R.Rc);
}
void InstrEmit_vcmp_cr6_(X64Emitter& e, X86Compiler& c, XmmVar& v) {
// Testing for all 1's and all 0's.
// if (Rc) CR6 = all_equal | 0 | none_equal | 0
// Since none_equal and all_equal are mutually exclusive we optimize
// a bit here. This is still terrible.
GpVar lo(c.newGpVar());
GpVar hi(c.newGpVar());
c.pextrq(hi.m64(), v, imm(1));
c.movq(lo.m64(), v);
GpVar gt(c.newGpVar());
GpVar cr(c.newGpVar());
c.xor_(cr, cr);
Label skip(c.newLabel());
// cmp with 0xFF... and set all_equal
c.mov(gt, lo);
c.and_(gt, hi);
c.cmp(gt, imm(-1));
// eq = all_equal
// all_equal = 0b1000
c.mov(gt, imm(0x8)); // 0b1000
c.cmove(cr, gt);
c.je(skip);
// cmp with 0 and set none_equal
c.mov(gt, lo);
c.or_(gt, hi);
c.test(gt, gt);
// eq = none_equal
// none_equal = 0b0010
c.mov(gt, imm(0x2)); // 0b0010
c.cmove(cr, gt);
c.bind(skip);
e.update_cr_value(6, cr);
}
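// Reference-only sketch (not part of this commit, name hypothetical): the
// CR6 value produced above, written as a plain scalar check over the two
// 64-bit halves of the compare mask. Assumes <cstdint>.
static inline uint32_t ComputeVcmpCr6(uint64_t lo, uint64_t hi) {
  if ((lo & hi) == UINT64_MAX) return 0x8;  // all lanes 1s -> all_equal
  if ((lo | hi) == 0)          return 0x2;  // all lanes 0s -> none_equal
  return 0;
}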
// http://x86.renejeschke.de/html/file_module_x86_id_37.html
// These line up to the cmpps ops, except at the end where we have our own
// emulated ops for gt/etc that don't exist in the instruction set.
enum vcmpxxfp_op {
vcmpxxfp_eq = 0,
vcmpxxfp_lt = 1,
vcmpxxfp_le = 2,
vcmpxxfp_unord = 3,
vcmpxxfp_neq = 4,
vcmpxxfp_nlt = 5,
vcmpxxfp_nle = 6,
vcmpxxfp_ord = 7,
// Emulated ops:
vcmpxxfp_gt = 8,
vcmpxxfp_ge = 9,
};
int InstrEmit_vcmpxxfp_(X64Emitter& e, X86Compiler& c, InstrData& i, vcmpxxfp_op cmpop, uint32_t vd, uint32_t va, uint32_t vb, uint32_t rc) {
// (VD.xyzw) = (VA.xyzw) OP (VB.xyzw) ? 0xFFFFFFFF : 0x00000000
// if (Rc) CR6 = all_equal | 0 | none_equal | 0
// If an element in either VA or VB is NaN the result will be 0x00000000
XmmVar v(c.newXmmVar());
switch (cmpop) {
// Supported ops:
default:
c.movaps(v, e.vr_value(va));
c.cmpps(v, e.vr_value(vb), imm(cmpop));
break;
// Emulated ops:
case vcmpxxfp_gt:
c.movaps(v, e.vr_value(vb));
c.cmpps(v, e.vr_value(va), imm(vcmpxxfp_lt));
break;
case vcmpxxfp_ge:
c.movaps(v, e.vr_value(vb));
c.cmpps(v, e.vr_value(va), imm(vcmpxxfp_le));
break;
}
e.update_vr_value(vd, v);
if (rc) {
InstrEmit_vcmp_cr6_(e, c, v);
}
e.TraceVR(vd, va, vb);
return 0;
}
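// Reference-only sketch (not part of this commit): the operand-swap identity
// used by the emulated gt/ge cases above, written with SSE intrinsics
// (<xmmintrin.h>) instead of the asmjit compiler.
__m128 emulated_cmpgt_ps(__m128 a, __m128 b) {
  return _mm_cmplt_ps(b, a);   // a > b  ==  b < a; NaN lanes compare to 0s
}
__m128 emulated_cmpge_ps(__m128 a, __m128 b) {
  return _mm_cmple_ps(b, a);   // a >= b ==  b <= a
}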
XEEMITTER(vcmpeqfp, 0x100000C6, VXR )(X64Emitter& e, X86Compiler& c, InstrData& i) {
XEINSTRNOTIMPLEMENTED();
return 1;
return InstrEmit_vcmpxxfp_(e, c, i, vcmpxxfp_eq, i.VXR.VD, i.VXR.VA, i.VXR.VB, i.VXR.Rc);
}
XEEMITTER(vcmpeqfp128, VX128(6, 0), VX128_R)(X64Emitter& e, X86Compiler& c, InstrData& i) {
XEINSTRNOTIMPLEMENTED();
return 1;
return InstrEmit_vcmpxxfp_(e, c, i, vcmpxxfp_eq, VX128_R_VD128, VX128_R_VA128, VX128_R_VB128, i.VX128_R.Rc);
}
XEEMITTER(vcmpequb, 0x10000006, VXR )(X64Emitter& e, X86Compiler& c, InstrData& i) {
XEINSTRNOTIMPLEMENTED();
return 1;
}
XEEMITTER(vcmpequh, 0x10000046, VXR )(X64Emitter& e, X86Compiler& c, InstrData& i) {
XEINSTRNOTIMPLEMENTED();
return 1;
}
XEEMITTER(vcmpequw, 0x10000086, VXR )(X64Emitter& e, X86Compiler& c, InstrData& i) {
XEINSTRNOTIMPLEMENTED();
return 1;
}
XEEMITTER(vcmpequw128, VX128(6, 512), VX128_R)(X64Emitter& e, X86Compiler& c, InstrData& i) {
XEINSTRNOTIMPLEMENTED();
return 1;
}
XEEMITTER(vcmpgefp, 0x100001C6, VXR )(X64Emitter& e, X86Compiler& c, InstrData& i) {
XEINSTRNOTIMPLEMENTED();
return 1;
return InstrEmit_vcmpxxfp_(e, c, i, vcmpxxfp_ge, i.VXR.VD, i.VXR.VA, i.VXR.VB, i.VXR.Rc);
}
XEEMITTER(vcmpgefp128, VX128(6, 128), VX128_R)(X64Emitter& e, X86Compiler& c, InstrData& i) {
XEINSTRNOTIMPLEMENTED();
return 1;
return InstrEmit_vcmpxxfp_(e, c, i, vcmpxxfp_ge, VX128_R_VD128, VX128_R_VA128, VX128_R_VB128, i.VX128_R.Rc);
}
XEEMITTER(vcmpgtfp, 0x100002C6, VXR )(X64Emitter& e, X86Compiler& c, InstrData& i) {
XEINSTRNOTIMPLEMENTED();
return 1;
return InstrEmit_vcmpxxfp_(e, c, i, vcmpxxfp_gt, i.VXR.VD, i.VXR.VA, i.VXR.VB, i.VXR.Rc);
}
XEEMITTER(vcmpgtfp128, VX128(6, 256), VX128_R)(X64Emitter& e, X86Compiler& c, InstrData& i) {
XEINSTRNOTIMPLEMENTED();
return 1;
return InstrEmit_vcmpxxfp_(e, c, i, vcmpxxfp_gt, VX128_R_VD128, VX128_R_VA128, VX128_R_VB128, i.VX128_R.Rc);
}
enum vcmpxxi_op {
vcmpxxi_eq = 0,
vcmpxxi_gt_signed = 1,
vcmpxxi_gt_unsigned = 2,
};
int InstrEmit_vcmpxxi_(X64Emitter& e, X86Compiler& c, InstrData& i, vcmpxxi_op cmpop, uint32_t width, uint32_t vd, uint32_t va, uint32_t vb, uint32_t rc) {
// (VD) = (VA) OP (VB) ? 0xFF.. : 0x00.. per element
// if (Rc) CR6 = all_equal | 0 | none_equal | 0
// Element width is 1 (byte), 2 (halfword), or 4 (word) bytes.
XmmVar v(c.newXmmVar());
c.movaps(v, e.vr_value(va));
switch (cmpop) {
case vcmpxxi_eq:
switch (width) {
case 1:
c.pcmpeqb(v, e.vr_value(vb));
break;
case 2:
c.pcmpeqw(v, e.vr_value(vb));
break;
case 4:
c.pcmpeqd(v, e.vr_value(vb));
break;
default: XEASSERTALWAYS(); return 1;
}
break;
case vcmpxxi_gt_signed:
switch (width) {
case 1:
c.pcmpgtb(v, e.vr_value(vb));
break;
case 2:
c.pcmpgtw(v, e.vr_value(vb));
break;
case 4:
c.pcmpgtd(v, e.vr_value(vb));
break;
default: XEASSERTALWAYS(); return 1;
}
break;
case vcmpxxi_gt_unsigned:
// Nasty, as there is no unsigned variant.
c.int3();
XEINSTRNOTIMPLEMENTED();
return 1;
default: XEASSERTALWAYS(); return 1;
}
e.update_vr_value(vd, v);
if (rc) {
InstrEmit_vcmp_cr6_(e, c, v);
}
e.TraceVR(vd, va, vb);
return 0;
}
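// Reference-only sketch (not part of this commit): vcmpxxi_gt_unsigned is
// left unimplemented above because SSE2 has no unsigned pcmpgt. One common
// workaround is to flip the sign bit of both operands and reuse the signed
// compare (shown for 32-bit lanes; bytes/halfwords work the same with
// _mm_set1_epi8(0x80) / _mm_set1_epi16(0x8000)). Assumes <emmintrin.h>.
static inline __m128i cmpgt_epu32(__m128i a, __m128i b) {
  const __m128i sign = _mm_set1_epi32((int)0x80000000);
  return _mm_cmpgt_epi32(_mm_xor_si128(a, sign), _mm_xor_si128(b, sign));
}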
XEEMITTER(vcmpequb, 0x10000006, VXR )(X64Emitter& e, X86Compiler& c, InstrData& i) {
return InstrEmit_vcmpxxi_(e, c, i, vcmpxxi_eq, 1, i.VXR.VD, i.VXR.VA, i.VXR.VB, i.VXR.Rc);
}
XEEMITTER(vcmpequh, 0x10000046, VXR )(X64Emitter& e, X86Compiler& c, InstrData& i) {
return InstrEmit_vcmpxxi_(e, c, i, vcmpxxi_eq, 2, i.VXR.VD, i.VXR.VA, i.VXR.VB, i.VXR.Rc);
}
XEEMITTER(vcmpequw, 0x10000086, VXR )(X64Emitter& e, X86Compiler& c, InstrData& i) {
return InstrEmit_vcmpxxi_(e, c, i, vcmpxxi_eq, 4, i.VXR.VD, i.VXR.VA, i.VXR.VB, i.VXR.Rc);
}
XEEMITTER(vcmpequw128, VX128(6, 512), VX128_R)(X64Emitter& e, X86Compiler& c, InstrData& i) {
return InstrEmit_vcmpxxi_(e, c, i, vcmpxxi_eq, 4, VX128_R_VD128, VX128_R_VA128, VX128_R_VB128, i.VX128_R.Rc);
}
XEEMITTER(vcmpgtsb, 0x10000306, VXR )(X64Emitter& e, X86Compiler& c, InstrData& i) {
XEINSTRNOTIMPLEMENTED();
return 1;
return InstrEmit_vcmpxxi_(e, c, i, vcmpxxi_gt_signed, 1, i.VXR.VD, i.VXR.VA, i.VXR.VB, i.VXR.Rc);
}
XEEMITTER(vcmpgtsh, 0x10000346, VXR )(X64Emitter& e, X86Compiler& c, InstrData& i) {
XEINSTRNOTIMPLEMENTED();
return 1;
return InstrEmit_vcmpxxi_(e, c, i, vcmpxxi_gt_signed, 2, i.VXR.VD, i.VXR.VA, i.VXR.VB, i.VXR.Rc);
}
XEEMITTER(vcmpgtsw, 0x10000386, VXR )(X64Emitter& e, X86Compiler& c, InstrData& i) {
XEINSTRNOTIMPLEMENTED();
return 1;
return InstrEmit_vcmpxxi_(e, c, i, vcmpxxi_gt_signed, 4, i.VXR.VD, i.VXR.VA, i.VXR.VB, i.VXR.Rc);
}
XEEMITTER(vcmpgtub, 0x10000206, VXR )(X64Emitter& e, X86Compiler& c, InstrData& i) {
XEINSTRNOTIMPLEMENTED();
return 1;
return InstrEmit_vcmpxxi_(e, c, i, vcmpxxi_gt_unsigned, 1, i.VXR.VD, i.VXR.VA, i.VXR.VB, i.VXR.Rc);
}
XEEMITTER(vcmpgtuh, 0x10000246, VXR )(X64Emitter& e, X86Compiler& c, InstrData& i) {
XEINSTRNOTIMPLEMENTED();
return 1;
return InstrEmit_vcmpxxi_(e, c, i, vcmpxxi_gt_unsigned, 2, i.VXR.VD, i.VXR.VA, i.VXR.VB, i.VXR.Rc);
}
XEEMITTER(vcmpgtuw, 0x10000286, VXR )(X64Emitter& e, X86Compiler& c, InstrData& i) {
XEINSTRNOTIMPLEMENTED();
return 1;
return InstrEmit_vcmpxxi_(e, c, i, vcmpxxi_gt_unsigned, 4, i.VXR.VD, i.VXR.VA, i.VXR.VB, i.VXR.Rc);
}
XEEMITTER(vctsxs, 0x100003CA, VX )(X64Emitter& e, X86Compiler& c, InstrData& i) {
@ -529,7 +640,6 @@ XEEMITTER(vexptefp, 0x1000018A, VX )(X64Emitter& e, X86Compiler& c, Instr
XEINSTRNOTIMPLEMENTED();
return 1;
}
XEEMITTER(vexptefp128, VX128_3(6, 1712), VX128_3)(X64Emitter& e, X86Compiler& c, InstrData& i) {
XEINSTRNOTIMPLEMENTED();
return 1;
@ -539,7 +649,6 @@ XEEMITTER(vlogefp, 0x100001CA, VX )(X64Emitter& e, X86Compiler& c, Instr
XEINSTRNOTIMPLEMENTED();
return 1;
}
XEEMITTER(vlogefp128, VX128_3(6, 1776), VX128_3)(X64Emitter& e, X86Compiler& c, InstrData& i) {
XEINSTRNOTIMPLEMENTED();
return 1;
@ -581,7 +690,6 @@ XEEMITTER(vmaxfp, 0x1000040A, VX )(X64Emitter& e, X86Compiler& c, Instr
XEINSTRNOTIMPLEMENTED();
return 1;
}
XEEMITTER(vmaxfp128, VX128(6, 640), VX128 )(X64Emitter& e, X86Compiler& c, InstrData& i) {
XEINSTRNOTIMPLEMENTED();
return 1;
@ -631,7 +739,6 @@ XEEMITTER(vminfp, 0x1000044A, VX )(X64Emitter& e, X86Compiler& c, Instr
XEINSTRNOTIMPLEMENTED();
return 1;
}
XEEMITTER(vminfp128, VX128(6, 704), VX128 )(X64Emitter& e, X86Compiler& c, InstrData& i) {
XEINSTRNOTIMPLEMENTED();
return 1;
@ -992,7 +1099,6 @@ XEEMITTER(vpkshss, 0x1000018E, VX )(X64Emitter& e, X86Compiler& c, Instr
XEINSTRNOTIMPLEMENTED();
return 1;
}
XEEMITTER(vpkshss128, VX128(5, 512), VX128 )(X64Emitter& e, X86Compiler& c, InstrData& i) {
XEINSTRNOTIMPLEMENTED();
return 1;
@ -1002,7 +1108,6 @@ XEEMITTER(vpkswss, 0x100001CE, VX )(X64Emitter& e, X86Compiler& c, Instr
XEINSTRNOTIMPLEMENTED();
return 1;
}
XEEMITTER(vpkswss128, VX128(5, 640), VX128 )(X64Emitter& e, X86Compiler& c, InstrData& i) {
XEINSTRNOTIMPLEMENTED();
return 1;
@ -1012,7 +1117,6 @@ XEEMITTER(vpkswus, 0x1000014E, VX )(X64Emitter& e, X86Compiler& c, Instr
XEINSTRNOTIMPLEMENTED();
return 1;
}
XEEMITTER(vpkswus128, VX128(5, 704), VX128 )(X64Emitter& e, X86Compiler& c, InstrData& i) {
XEINSTRNOTIMPLEMENTED();
return 1;
@ -1022,7 +1126,6 @@ XEEMITTER(vpkuhum, 0x1000000E, VX )(X64Emitter& e, X86Compiler& c, Instr
XEINSTRNOTIMPLEMENTED();
return 1;
}
XEEMITTER(vpkuhum128, VX128(5, 768), VX128 )(X64Emitter& e, X86Compiler& c, InstrData& i) {
XEINSTRNOTIMPLEMENTED();
return 1;
@ -1032,7 +1135,6 @@ XEEMITTER(vpkuhus, 0x1000008E, VX )(X64Emitter& e, X86Compiler& c, Instr
XEINSTRNOTIMPLEMENTED();
return 1;
}
XEEMITTER(vpkuhus128, VX128(5, 832), VX128 )(X64Emitter& e, X86Compiler& c, InstrData& i) {
XEINSTRNOTIMPLEMENTED();
return 1;
@ -1042,7 +1144,6 @@ XEEMITTER(vpkshus, 0x1000010E, VX )(X64Emitter& e, X86Compiler& c, Instr
XEINSTRNOTIMPLEMENTED();
return 1;
}
XEEMITTER(vpkshus128, VX128(5, 576), VX128 )(X64Emitter& e, X86Compiler& c, InstrData& i) {
XEINSTRNOTIMPLEMENTED();
return 1;
@ -1052,7 +1153,6 @@ XEEMITTER(vpkuwum, 0x1000004E, VX )(X64Emitter& e, X86Compiler& c, Instr
XEINSTRNOTIMPLEMENTED();
return 1;
}
XEEMITTER(vpkuwum128, VX128(5, 896), VX128 )(X64Emitter& e, X86Compiler& c, InstrData& i) {
XEINSTRNOTIMPLEMENTED();
return 1;
@ -1062,7 +1162,6 @@ XEEMITTER(vpkuwus, 0x100000CE, VX )(X64Emitter& e, X86Compiler& c, Instr
XEINSTRNOTIMPLEMENTED();
return 1;
}
XEEMITTER(vpkuwus128, VX128(5, 960), VX128 )(X64Emitter& e, X86Compiler& c, InstrData& i) {
XEINSTRNOTIMPLEMENTED();
return 1;
@ -1077,7 +1176,6 @@ XEEMITTER(vrefp, 0x1000010A, VX )(X64Emitter& e, X86Compiler& c, Instr
XEINSTRNOTIMPLEMENTED();
return 1;
}
XEEMITTER(vrefp128, VX128_3(6, 1584), VX128_3)(X64Emitter& e, X86Compiler& c, InstrData& i) {
XEINSTRNOTIMPLEMENTED();
return 1;
@ -1087,7 +1185,6 @@ XEEMITTER(vrfim, 0x100002CA, VX )(X64Emitter& e, X86Compiler& c, Instr
XEINSTRNOTIMPLEMENTED();
return 1;
}
XEEMITTER(vrfim128, VX128_3(6, 816), VX128_3)(X64Emitter& e, X86Compiler& c, InstrData& i) {
XEINSTRNOTIMPLEMENTED();
return 1;
@ -1097,7 +1194,6 @@ XEEMITTER(vrfin, 0x1000020A, VX )(X64Emitter& e, X86Compiler& c, Instr
XEINSTRNOTIMPLEMENTED();
return 1;
}
XEEMITTER(vrfin128, VX128_3(6, 880), VX128_3)(X64Emitter& e, X86Compiler& c, InstrData& i) {
XEINSTRNOTIMPLEMENTED();
return 1;
@ -1107,7 +1203,6 @@ XEEMITTER(vrfip, 0x1000028A, VX )(X64Emitter& e, X86Compiler& c, Instr
XEINSTRNOTIMPLEMENTED();
return 1;
}
XEEMITTER(vrfip128, VX128_3(6, 944), VX128_3)(X64Emitter& e, X86Compiler& c, InstrData& i) {
XEINSTRNOTIMPLEMENTED();
return 1;
@ -1117,7 +1212,6 @@ XEEMITTER(vrfiz, 0x1000024A, VX )(X64Emitter& e, X86Compiler& c, Instr
XEINSTRNOTIMPLEMENTED();
return 1;
}
XEEMITTER(vrfiz128, VX128_3(6, 1008), VX128_3)(X64Emitter& e, X86Compiler& c, InstrData& i) {
XEINSTRNOTIMPLEMENTED();
return 1;
@ -1137,7 +1231,6 @@ XEEMITTER(vrlw, 0x10000084, VX )(X64Emitter& e, X86Compiler& c, Instr
XEINSTRNOTIMPLEMENTED();
return 1;
}
XEEMITTER(vrlw128, VX128(6, 80), VX128 )(X64Emitter& e, X86Compiler& c, InstrData& i) {
XEINSTRNOTIMPLEMENTED();
return 1;
@ -1208,7 +1301,6 @@ XEEMITTER(vsel, 0x1000002A, VXA )(X64Emitter& e, X86Compiler& c, Instr
XEINSTRNOTIMPLEMENTED();
return 1;
}
XEEMITTER(vsel128, VX128(5, 848), VX128 )(X64Emitter& e, X86Compiler& c, InstrData& i) {
XEINSTRNOTIMPLEMENTED();
return 1;
@ -1228,7 +1320,6 @@ XEEMITTER(vsldoi, 0x1000002C, VXA )(X64Emitter& e, X86Compiler& c, Instr
XEINSTRNOTIMPLEMENTED();
return 1;
}
XEEMITTER(vsldoi128, VX128_5(4, 16), VX128_5)(X64Emitter& e, X86Compiler& c, InstrData& i) {
XEINSTRNOTIMPLEMENTED();
return 1;
@ -1243,7 +1334,6 @@ XEEMITTER(vslo, 0x1000040C, VX )(X64Emitter& e, X86Compiler& c, Instr
XEINSTRNOTIMPLEMENTED();
return 1;
}
XEEMITTER(vslo128, VX128(5, 912), VX128 )(X64Emitter& e, X86Compiler& c, InstrData& i) {
XEINSTRNOTIMPLEMENTED();
return 1;
@ -1386,7 +1476,6 @@ XEEMITTER(vsraw, 0x10000384, VX )(X64Emitter& e, X86Compiler& c, Instr
XEINSTRNOTIMPLEMENTED();
return 1;
}
XEEMITTER(vsraw128, VX128(6, 336), VX128 )(X64Emitter& e, X86Compiler& c, InstrData& i) {
XEINSTRNOTIMPLEMENTED();
return 1;
@ -1406,7 +1495,6 @@ XEEMITTER(vsro, 0x1000044C, VX )(X64Emitter& e, X86Compiler& c, Instr
XEINSTRNOTIMPLEMENTED();
return 1;
}
XEEMITTER(vsro128, VX128(5, 976), VX128 )(X64Emitter& e, X86Compiler& c, InstrData& i) {
XEINSTRNOTIMPLEMENTED();
return 1;
@ -1416,7 +1504,6 @@ XEEMITTER(vsrw, 0x10000284, VX )(X64Emitter& e, X86Compiler& c, Instr
XEINSTRNOTIMPLEMENTED();
return 1;
}
XEEMITTER(vsrw128, VX128(6, 464), VX128 )(X64Emitter& e, X86Compiler& c, InstrData& i) {
XEINSTRNOTIMPLEMENTED();
return 1;
@ -1522,7 +1609,6 @@ XEEMITTER(vupkhsb, 0x1000020E, VX )(X64Emitter& e, X86Compiler& c, Instr
XEINSTRNOTIMPLEMENTED();
return 1;
}
XEEMITTER(vupkhsb128, VX128(6, 896), VX128 )(X64Emitter& e, X86Compiler& c, InstrData& i) {
XEINSTRNOTIMPLEMENTED();
return 1;
@ -1542,7 +1628,6 @@ XEEMITTER(vupklsb, 0x1000028E, VX )(X64Emitter& e, X86Compiler& c, Instr
XEINSTRNOTIMPLEMENTED();
return 1;
}
XEEMITTER(vupklsb128, VX128(6, 960), VX128 )(X64Emitter& e, X86Compiler& c, InstrData& i) {
XEINSTRNOTIMPLEMENTED();
return 1;
@ -1553,6 +1638,32 @@ XEEMITTER(vupklsh, 0x100002CE, VX )(X64Emitter& e, X86Compiler& c, Instr
return 1;
}
__m128 half_to_float5_SSE2(__m128i h) {
#define SSE_CONST4(name, val) static const __declspec(align(16)) uint32_t name[4] = { (val), (val), (val), (val) }
#define SSE_CONST(name) *(const __m128i *)&name
#define SSE_CONSTF(name) *(const __m128 *)&name
SSE_CONST4(mask_nosign, 0x7fff);
SSE_CONST4(magic, (254 - 15) << 23);
SSE_CONST4(was_infnan, 0x7bff);
SSE_CONST4(exp_infnan, 255 << 23);
__m128i mnosign = SSE_CONST(mask_nosign);
__m128i expmant = _mm_and_si128(mnosign, h);
__m128i justsign = _mm_xor_si128(h, expmant);
__m128i expmant2 = expmant; // copy (just here for counting purposes)
__m128i shifted = _mm_slli_epi32(expmant, 13);
__m128 scaled = _mm_mul_ps(_mm_castsi128_ps(shifted), *(const __m128 *)&magic);
__m128i b_wasinfnan = _mm_cmpgt_epi32(expmant2, SSE_CONST(was_infnan));
__m128i sign = _mm_slli_epi32(justsign, 16);
__m128 infnanexp = _mm_and_ps(_mm_castsi128_ps(b_wasinfnan), SSE_CONSTF(exp_infnan));
__m128 sign_inf = _mm_or_ps(_mm_castsi128_ps(sign), infnanexp);
__m128 final = _mm_or_ps(scaled, sign_inf);
// ~11 SSE2 ops.
return final;
#undef SSE_CONST4
#undef SSE_CONST
#undef SSE_CONSTF
}
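// Reference-only scalar version (not part of this commit) of the same
// binary16 -> binary32 decode as half_to_float5_SSE2 above, useful for
// cross-checking a single value. Assumes <cstdint> and <cstring>.
static inline float half_to_float_scalar(uint16_t h) {
  const uint32_t magic = (254u - 15u) << 23;       // 2^(127-15) as float bits
  uint32_t bits = (uint32_t)(h & 0x7fffu) << 13;   // align exponent/mantissa
  float f;
  memcpy(&f, &bits, sizeof(f));
  float scale;
  memcpy(&scale, &magic, sizeof(scale));
  f *= scale;                                      // rebias the exponent
  memcpy(&bits, &f, sizeof(bits));
  if ((h & 0x7fffu) > 0x7bffu) bits |= 255u << 23; // was Inf/NaN: force exp 255
  bits |= (uint32_t)(h & 0x8000u) << 16;           // restore sign
  memcpy(&f, &bits, sizeof(f));
  return f;
}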
XEEMITTER(vupkd3d128, VX128_3(6, 2032), VX128_3)(X64Emitter& e, X86Compiler& c, InstrData& i) {
// Can't find many docs on this. Best reference is
// http://worldcraft.googlecode.com/svn/trunk/src/qylib/math/xmmatrix.inl,
@ -1592,18 +1703,34 @@ XEEMITTER(vupkd3d128, VX128_3(6, 2032), VX128_3)(X64Emitter& e, X86Compiler&
break;
case 3: // VPACK_... 2 FLOAT16s
{
// (VD.x) = fixed_16_to_32(VB.x)
// (VD.y) = fixed_16_to_32(VB.y)
// (VD.x) = float_16_to_32(VB.x (low))
// (VD.y) = float_16_to_32(VB.x (high))
// (VD.z) = 0.0
// (VD.w) = 1.0
// 1 bit sign, 5 bit exponent, 10 bit mantissa
// D3D10 half float format
// TODO(benvanik): fixed_16_to_32 in SSE?
// TODO(benvanik): http://blogs.msdn.com/b/chuckw/archive/2012/09/11/directxmath-f16c-and-fma.aspx
// Use _mm_cvtph_ps -- requires the F16C extension, so only very modern processors
// Unpacking half floats: http://fgiesen.wordpress.com/2012/03/28/half-to-float-done-quic/
// Packing half floats: https://gist.github.com/rygorous/2156668
// Load source, move from tight pack of X16Y16.... to X16...Y16...
// Also zero out the high end.
c.int3();
c.movaps(vt, e.vr_value(vb));
c.save(vt);
c.lea(gt, vt.m128());
X86CompilerFuncCall* call = c.call(half_to_float5_SSE2);
uint32_t args[] = {kX86VarTypeGpq};
call->setPrototype(kX86FuncConvDefault, kX86VarTypeXmm, args, XECOUNT(args));
call->setArgument(0, gt);
call->setReturn(v);
// Select XY00.
c.xorps(vt, vt);
c.shufps(v, vt, imm(0x04));
// {0.0, 0.0, 0.0, 1.0}
c.mov(gt, imm(0x3F800000));
c.pinsrd(vt, gt.r32(), imm(3));
c.movaps(v, vt);
c.int3();
c.pinsrd(v, gt.r32(), imm(3));
}
break;
default:
@ -1712,10 +1839,6 @@ void X64RegisterEmitCategoryAltivec() {
XEREGISTERINSTR(vcmpbfp128, VX128(6, 384));
XEREGISTERINSTR(vcmpeqfp, 0x100000C6);
XEREGISTERINSTR(vcmpeqfp128, VX128(6, 0));
XEREGISTERINSTR(vcmpequb, 0x10000006);
XEREGISTERINSTR(vcmpequh, 0x10000046);
XEREGISTERINSTR(vcmpequw, 0x10000086);
XEREGISTERINSTR(vcmpequw128, VX128(6, 512));
XEREGISTERINSTR(vcmpgefp, 0x100001C6);
XEREGISTERINSTR(vcmpgefp128, VX128(6, 128));
XEREGISTERINSTR(vcmpgtfp, 0x100002C6);
@ -1723,8 +1846,12 @@ void X64RegisterEmitCategoryAltivec() {
XEREGISTERINSTR(vcmpgtsb, 0x10000306);
XEREGISTERINSTR(vcmpgtsh, 0x10000346);
XEREGISTERINSTR(vcmpgtsw, 0x10000386);
XEREGISTERINSTR(vcmpequb, 0x10000006);
XEREGISTERINSTR(vcmpgtub, 0x10000206);
XEREGISTERINSTR(vcmpequh, 0x10000046);
XEREGISTERINSTR(vcmpgtuh, 0x10000246);
XEREGISTERINSTR(vcmpequw, 0x10000086);
XEREGISTERINSTR(vcmpequw128, VX128(6, 512));
XEREGISTERINSTR(vcmpgtuw, 0x10000286);
XEREGISTERINSTR(vctsxs, 0x100003CA);
XEREGISTERINSTR(vctuxs, 0x1000038A);