diff --git a/src/xenia/cpu/ppc/instr.h b/src/xenia/cpu/ppc/instr.h
index 7f957b2ee..b07d1c6d6 100644
--- a/src/xenia/cpu/ppc/instr.h
+++ b/src/xenia/cpu/ppc/instr.h
@@ -348,7 +348,7 @@ typedef struct {
       uint32_t VD128l : 5;
       uint32_t : 6;
     } VX128_P;
-    // kXEPPCInstrFormatVX128
+    // kXEPPCInstrFormatVX128_R
     struct {
       // VD128 = VD128l | (VD128h << 5)
       // VA128 = VA128l | (VA128h << 5) | (VA128H << 6)
diff --git a/src/xenia/cpu/x64/x64_emit_altivec.cc b/src/xenia/cpu/x64/x64_emit_altivec.cc
index dfa644099..4fadd4740 100644
--- a/src/xenia/cpu/x64/x64_emit_altivec.cc
+++ b/src/xenia/cpu/x64/x64_emit_altivec.cc
@@ -48,6 +48,9 @@ namespace x64 {
 #define VX128_3_VD128 (i.VX128_3.VD128l | (i.VX128_3.VD128h << 5))
 #define VX128_3_VB128 (i.VX128_3.VB128l | (i.VX128_3.VB128h << 5))
 #define VX128_3_IMM (i.VX128_3.IMM)
+#define VX128_R_VD128 (i.VX128_R.VD128l | (i.VX128_R.VD128h << 5))
+#define VX128_R_VA128 (i.VX128_R.VA128l | (i.VX128_R.VA128h << 5) | (i.VX128_R.VA128H << 6))
+#define VX128_R_VB128 (i.VX128_R.VB128l | (i.VX128_R.VB128h << 5))
 
 
 XEEMITTER(dst, 0x7C0002AC, XDSS)(X64Emitter& e, X86Compiler& c, InstrData& i) {
@@ -425,94 +428,202 @@ XEEMITTER(vcfpuxws128, VX128_3(6, 624), VX128_3)(X64Emitter& e, X86Compiler&
   return 1;
 }
 
-XEEMITTER(vcmpbfp, 0x100003C6, VXR )(X64Emitter& e, X86Compiler& c, InstrData& i) {
+int InstrEmit_vcmpbfp_(X64Emitter& e, X86Compiler& c, InstrData& i, uint32_t vd, uint32_t va, uint32_t vb, uint32_t rc) {
   XEINSTRNOTIMPLEMENTED();
   return 1;
 }
-
+XEEMITTER(vcmpbfp, 0x100003C6, VXR )(X64Emitter& e, X86Compiler& c, InstrData& i) {
+  return InstrEmit_vcmpbfp_(e, c, i, i.VXR.VD, i.VXR.VA, i.VXR.VB, i.VXR.Rc);
+}
 XEEMITTER(vcmpbfp128, VX128(6, 384), VX128_R)(X64Emitter& e, X86Compiler& c, InstrData& i) {
-  XEINSTRNOTIMPLEMENTED();
-  return 1;
+  return InstrEmit_vcmpbfp_(e, c, i, VX128_R_VD128, VX128_R_VA128, VX128_R_VB128, i.VX128_R.Rc);
+}
+
+void InstrEmit_vcmp_cr6_(X64Emitter& e, X86Compiler& c, XmmVar& v) {
+  // Testing the compare mask in v for all 1's and all 0's.
+  // if (Rc) CR6 = all_equal | 0 | none_equal | 0
+  // Since none_equal and all_equal are mutually exclusive we skip the second
+  // test once all_equal has been set. This is still terrible.
+  GpVar lo(c.newGpVar());
+  GpVar hi(c.newGpVar());
+  c.pextrq(hi.m64(), v, imm(1));
+  c.movq(lo.m64(), v);
+
+  GpVar gt(c.newGpVar());
+  GpVar cr(c.newGpVar());
+  c.xor_(cr, cr);
+  Label skip(c.newLabel());
+
+  // cmp with 0xFF... and set all_equal
+  c.mov(gt, lo);
+  c.and_(gt, hi);
+  c.cmp(gt, imm(-1));
+  // eq = all_equal; test against imm(0) would always set ZF (x & 0 == 0)
+  // all_equal= 0b1000
+  c.mov(gt, imm(0x8)); // 0b1000
+  c.cmove(cr, gt);
+  c.je(skip);
+
+  // cmp with 0 and set none_equal
+  c.mov(gt, lo);
+  c.or_(gt, hi);
+  c.test(gt, gt);
+  // eq = none_equal
+  // none_equal= 0b0010
+  c.mov(gt, imm(0x2)); // 0b0010
+  c.cmove(cr, gt);
+
+  c.bind(skip);
+  e.update_cr_value(6, cr);
+}
+
+// http://x86.renejeschke.de/html/file_module_x86_id_37.html
+// These line up to the cmpps ops, except at the end where we have our own
+// emulated ops for gt/etc that don't exist in the instruction set.
+enum vcmpxxfp_op {
+  vcmpxxfp_eq = 0,
+  vcmpxxfp_lt = 1,
+  vcmpxxfp_le = 2,
+  vcmpxxfp_unord = 3,
+  vcmpxxfp_neq = 4,
+  vcmpxxfp_nlt = 5,
+  vcmpxxfp_nle = 6,
+  vcmpxxfp_ord = 7,
+  // Emulated ops:
+  vcmpxxfp_gt = 8,
+  vcmpxxfp_ge = 9,
+};
+int InstrEmit_vcmpxxfp_(X64Emitter& e, X86Compiler& c, InstrData& i, vcmpxxfp_op cmpop, uint32_t vd, uint32_t va, uint32_t vb, uint32_t rc) {
+  // (VD.xyzw) = (VA.xyzw) OP (VB.xyzw) ? 0xFFFFFFFF : 0x00000000
+  // if (Rc) CR6 = all_equal | 0 | none_equal | 0
+  // If an element in either VA or VB is NaN the result will be 0x00000000
+  XmmVar v(c.newXmmVar());
+  switch (cmpop) {
+  // Supported ops: the enum value is passed straight through as the imm.
+  default:
+    c.movaps(v, e.vr_value(va));
+    c.cmpps(v, e.vr_value(vb), imm(cmpop));
+    break;
+  // Emulated ops: a > b is b < a (likewise >=), so swap the operands.
+  case vcmpxxfp_gt:
+    c.movaps(v, e.vr_value(vb));
+    c.cmpps(v, e.vr_value(va), imm(vcmpxxfp_lt));
+    break;
+  case vcmpxxfp_ge:
+    c.movaps(v, e.vr_value(vb));
+    c.cmpps(v, e.vr_value(va), imm(vcmpxxfp_le));
+    break;
+  }
+  e.update_vr_value(vd, v);
+  if (rc) {
+    InstrEmit_vcmp_cr6_(e, c, v);
+  }
+  e.TraceVR(vd, va, vb);
+  return 0;
 }
 
 XEEMITTER(vcmpeqfp, 0x100000C6, VXR )(X64Emitter& e, X86Compiler& c, InstrData& i) {
-  XEINSTRNOTIMPLEMENTED();
-  return 1;
+  return InstrEmit_vcmpxxfp_(e, c, i, vcmpxxfp_eq, i.VXR.VD, i.VXR.VA, i.VXR.VB, i.VXR.Rc);
 }
-
 XEEMITTER(vcmpeqfp128, VX128(6, 0), VX128_R)(X64Emitter& e, X86Compiler& c, InstrData& i) {
-  XEINSTRNOTIMPLEMENTED();
-  return 1;
+  return InstrEmit_vcmpxxfp_(e, c, i, vcmpxxfp_eq, VX128_R_VD128, VX128_R_VA128, VX128_R_VB128, i.VX128_R.Rc);
 }
-
-XEEMITTER(vcmpequb, 0x10000006, VXR )(X64Emitter& e, X86Compiler& c, InstrData& i) {
-  XEINSTRNOTIMPLEMENTED();
-  return 1;
-}
-
-XEEMITTER(vcmpequh, 0x10000046, VXR )(X64Emitter& e, X86Compiler& c, InstrData& i) {
-  XEINSTRNOTIMPLEMENTED();
-  return 1;
-}
-
-XEEMITTER(vcmpequw, 0x10000086, VXR )(X64Emitter& e, X86Compiler& c, InstrData& i) {
-  XEINSTRNOTIMPLEMENTED();
-  return 1;
-}
-
-XEEMITTER(vcmpequw128, VX128(6, 512), VX128_R)(X64Emitter& e, X86Compiler& c, InstrData& i) {
-  XEINSTRNOTIMPLEMENTED();
-  return 1;
-}
-
 XEEMITTER(vcmpgefp, 0x100001C6, VXR )(X64Emitter& e, X86Compiler& c, InstrData& i) {
-  XEINSTRNOTIMPLEMENTED();
-  return 1;
+  return InstrEmit_vcmpxxfp_(e, c, i, vcmpxxfp_ge, i.VXR.VD, i.VXR.VA, i.VXR.VB, i.VXR.Rc);
 }
-
 XEEMITTER(vcmpgefp128, VX128(6, 128), VX128_R)(X64Emitter& e, X86Compiler& c, InstrData& i) {
-  XEINSTRNOTIMPLEMENTED();
-  return 1;
+  return InstrEmit_vcmpxxfp_(e, c, i, vcmpxxfp_ge, VX128_R_VD128, VX128_R_VA128, VX128_R_VB128, i.VX128_R.Rc);
 }
-
 XEEMITTER(vcmpgtfp, 0x100002C6, VXR )(X64Emitter& e, X86Compiler& c, InstrData& i) {
-  XEINSTRNOTIMPLEMENTED();
-  return 1;
+  return InstrEmit_vcmpxxfp_(e, c, i, vcmpxxfp_gt, i.VXR.VD, i.VXR.VA, i.VXR.VB, i.VXR.Rc);
 }
-
 XEEMITTER(vcmpgtfp128, VX128(6, 256), VX128_R)(X64Emitter& e, X86Compiler& c, InstrData& i) {
-  XEINSTRNOTIMPLEMENTED();
-  return 1;
+  return InstrEmit_vcmpxxfp_(e, c, i, vcmpxxfp_gt, VX128_R_VD128, VX128_R_VA128, VX128_R_VB128, i.VX128_R.Rc);
 }
 
+enum vcmpxxi_op {
+  vcmpxxi_eq = 0,
+  vcmpxxi_gt_signed = 1,
+  vcmpxxi_gt_unsigned = 2,
+};
+int InstrEmit_vcmpxxi_(X64Emitter& e, X86Compiler& c, InstrData& i, vcmpxxi_op cmpop, uint32_t width, uint32_t vd, uint32_t va, uint32_t vb, uint32_t rc) {
+  // (VD.elem) = (VA.elem) OP (VB.elem) ? all 1's : all 0's
+  // if (Rc) CR6 = all_equal | 0 | none_equal | 0
+  // width is the integer element size in bytes (1, 2, or 4).
+  XmmVar v(c.newXmmVar());
+  c.movaps(v, e.vr_value(va));
+  switch (cmpop) {
+  case vcmpxxi_eq:
+    switch (width) {
+    case 1:
+      c.pcmpeqb(v, e.vr_value(vb));
+      break;
+    case 2:
+      c.pcmpeqw(v, e.vr_value(vb));
+      break;
+    case 4:
+      c.pcmpeqd(v, e.vr_value(vb));
+      break;
+    default: XEASSERTALWAYS(); return 1;
+    }
+    break;
+  case vcmpxxi_gt_signed:
+    switch (width) {
+    case 1:
+      c.pcmpgtb(v, e.vr_value(vb));
+      break;
+    case 2:
+      c.pcmpgtw(v, e.vr_value(vb));
+      break;
+    case 4:
+      c.pcmpgtd(v, e.vr_value(vb));
+      break;
+    default: XEASSERTALWAYS(); return 1;
+    }
+    break;
+  case vcmpxxi_gt_unsigned:
+    // Nasty, as there is no unsigned variant.
+    c.int3();
+    XEINSTRNOTIMPLEMENTED();
+    return 1;
+  default: XEASSERTALWAYS(); return 1;
+  }
+  e.update_vr_value(vd, v);
+  if (rc) {
+    InstrEmit_vcmp_cr6_(e, c, v);
+  }
+  e.TraceVR(vd, va, vb);
+  return 0;
+}
+XEEMITTER(vcmpequb, 0x10000006, VXR )(X64Emitter& e, X86Compiler& c, InstrData& i) {
+  return InstrEmit_vcmpxxi_(e, c, i, vcmpxxi_eq, 1, i.VXR.VD, i.VXR.VA, i.VXR.VB, i.VXR.Rc);
+}
+XEEMITTER(vcmpequh, 0x10000046, VXR )(X64Emitter& e, X86Compiler& c, InstrData& i) {
+  return InstrEmit_vcmpxxi_(e, c, i, vcmpxxi_eq, 2, i.VXR.VD, i.VXR.VA, i.VXR.VB, i.VXR.Rc);
+}
+XEEMITTER(vcmpequw, 0x10000086, VXR )(X64Emitter& e, X86Compiler& c, InstrData& i) {
+  return InstrEmit_vcmpxxi_(e, c, i, vcmpxxi_eq, 4, i.VXR.VD, i.VXR.VA, i.VXR.VB, i.VXR.Rc);
+}
+XEEMITTER(vcmpequw128, VX128(6, 512), VX128_R)(X64Emitter& e, X86Compiler& c, InstrData& i) {
+  return InstrEmit_vcmpxxi_(e, c, i, vcmpxxi_eq, 4, VX128_R_VD128, VX128_R_VA128, VX128_R_VB128, i.VX128_R.Rc);
+}
 XEEMITTER(vcmpgtsb, 0x10000306, VXR )(X64Emitter& e, X86Compiler& c, InstrData& i) {
-  XEINSTRNOTIMPLEMENTED();
-  return 1;
+  return InstrEmit_vcmpxxi_(e, c, i, vcmpxxi_gt_signed, 1, i.VXR.VD, i.VXR.VA, i.VXR.VB, i.VXR.Rc);
 }
-
 XEEMITTER(vcmpgtsh, 0x10000346, VXR )(X64Emitter& e, X86Compiler& c, InstrData& i) {
-  XEINSTRNOTIMPLEMENTED();
-  return 1;
+  return InstrEmit_vcmpxxi_(e, c, i, vcmpxxi_gt_signed, 2, i.VXR.VD, i.VXR.VA, i.VXR.VB, i.VXR.Rc);
 }
-
 XEEMITTER(vcmpgtsw, 0x10000386, VXR )(X64Emitter& e, X86Compiler& c, InstrData& i) {
-  XEINSTRNOTIMPLEMENTED();
-  return 1;
+  return InstrEmit_vcmpxxi_(e, c, i, vcmpxxi_gt_signed, 4, i.VXR.VD, i.VXR.VA, i.VXR.VB, i.VXR.Rc);
 }
-
 XEEMITTER(vcmpgtub, 0x10000206, VXR )(X64Emitter& e, X86Compiler& c, InstrData& i) {
-  XEINSTRNOTIMPLEMENTED();
-  return 1;
+  return InstrEmit_vcmpxxi_(e, c, i, vcmpxxi_gt_unsigned, 1, i.VXR.VD, i.VXR.VA, i.VXR.VB, i.VXR.Rc);
 }
-
 XEEMITTER(vcmpgtuh, 0x10000246, VXR )(X64Emitter& e, X86Compiler& c, InstrData& i) {
-  XEINSTRNOTIMPLEMENTED();
-  return 1;
+  return InstrEmit_vcmpxxi_(e, c, i, vcmpxxi_gt_unsigned, 2, i.VXR.VD, i.VXR.VA, i.VXR.VB, i.VXR.Rc);
 }
-
 XEEMITTER(vcmpgtuw, 0x10000286, VXR )(X64Emitter& e, X86Compiler& c, InstrData& i) {
-  XEINSTRNOTIMPLEMENTED();
-  return 1;
+  return InstrEmit_vcmpxxi_(e, c, i, vcmpxxi_gt_unsigned, 4, i.VXR.VD, i.VXR.VA, i.VXR.VB, i.VXR.Rc);
 }
 
 XEEMITTER(vctsxs, 0x100003CA, VX )(X64Emitter& e, X86Compiler& c, InstrData& i) {
@@ -529,7 +640,6 @@ XEEMITTER(vexptefp, 0x1000018A, VX )(X64Emitter& e, X86Compiler& c, Instr
   XEINSTRNOTIMPLEMENTED();
   return 1;
 }
-
 XEEMITTER(vexptefp128, VX128_3(6, 1712), VX128_3)(X64Emitter& e, X86Compiler& c, InstrData& i) {
   XEINSTRNOTIMPLEMENTED();
   return 1;
@@ -539,7 +649,6 @@ XEEMITTER(vlogefp, 0x100001CA, VX )(X64Emitter& e, X86Compiler& c, Instr
   XEINSTRNOTIMPLEMENTED();
   return 1;
 }
-
 XEEMITTER(vlogefp128, VX128_3(6, 1776), VX128_3)(X64Emitter& e, X86Compiler& c, InstrData& i) {
   XEINSTRNOTIMPLEMENTED();
   return 1;
@@ -581,7 +690,6 @@ XEEMITTER(vmaxfp, 0x1000040A, VX )(X64Emitter& e, X86Compiler& c, Instr
   XEINSTRNOTIMPLEMENTED();
   return 1;
 }
-
 XEEMITTER(vmaxfp128, VX128(6, 640), VX128 )(X64Emitter& e, X86Compiler& c, InstrData& i) {
   XEINSTRNOTIMPLEMENTED();
   return 1;
@@ -631,7 +739,6 @@ XEEMITTER(vminfp, 0x1000044A, VX )(X64Emitter& e, X86Compiler& c, Instr
   XEINSTRNOTIMPLEMENTED();
   return 1;
 }
-
 XEEMITTER(vminfp128, VX128(6, 704), VX128 )(X64Emitter& e, X86Compiler& c, InstrData& i) {
   XEINSTRNOTIMPLEMENTED();
   return 1;
@@ -992,7 +1099,6 @@ XEEMITTER(vpkshss, 0x1000018E, VX )(X64Emitter& e, X86Compiler& c, Instr
   XEINSTRNOTIMPLEMENTED();
   return 1;
 }
-
 XEEMITTER(vpkshss128, VX128(5, 512), VX128 )(X64Emitter& e, X86Compiler& c, InstrData& i) {
   XEINSTRNOTIMPLEMENTED();
   return 1;
@@ -1002,7 +1108,6 @@ XEEMITTER(vpkswss, 0x100001CE, VX )(X64Emitter& e, X86Compiler& c, Instr
   XEINSTRNOTIMPLEMENTED();
   return 1;
 }
-
 XEEMITTER(vpkswss128, VX128(5, 640), VX128 )(X64Emitter& e, X86Compiler& c, InstrData& i) {
   XEINSTRNOTIMPLEMENTED();
   return 1;
@@ -1012,7 +1117,6 @@ XEEMITTER(vpkswus, 0x1000014E, VX )(X64Emitter& e, X86Compiler& c, Instr
   XEINSTRNOTIMPLEMENTED();
   return 1;
 }
-
 XEEMITTER(vpkswus128, VX128(5, 704), VX128 )(X64Emitter& e, X86Compiler& c, InstrData& i) {
   XEINSTRNOTIMPLEMENTED();
   return 1;
@@ -1022,7 +1126,6 @@ XEEMITTER(vpkuhum, 0x1000000E, VX )(X64Emitter& e, X86Compiler& c, Instr
   XEINSTRNOTIMPLEMENTED();
   return 1;
 }
-
 XEEMITTER(vpkuhum128, VX128(5, 768), VX128 )(X64Emitter& e, X86Compiler& c, InstrData& i) {
   XEINSTRNOTIMPLEMENTED();
   return 1;
@@ -1032,7 +1135,6 @@ XEEMITTER(vpkuhus, 0x1000008E, VX )(X64Emitter& e, X86Compiler& c, Instr
   XEINSTRNOTIMPLEMENTED();
   return 1;
 }
-
 XEEMITTER(vpkuhus128, VX128(5, 832), VX128 )(X64Emitter& e, X86Compiler& c, InstrData& i) {
   XEINSTRNOTIMPLEMENTED();
   return 1;
@@ -1042,7 +1144,6 @@ XEEMITTER(vpkshus, 0x1000010E, VX )(X64Emitter& e, X86Compiler& c, Instr
   XEINSTRNOTIMPLEMENTED();
   return 1;
 }
-
 XEEMITTER(vpkshus128, VX128(5, 576), VX128 )(X64Emitter& e, X86Compiler& c, InstrData& i) {
   XEINSTRNOTIMPLEMENTED();
   return 1;
@@ -1052,7 +1153,6 @@ XEEMITTER(vpkuwum, 0x1000004E, VX )(X64Emitter& e, X86Compiler& c, Instr
   XEINSTRNOTIMPLEMENTED();
   return 1;
 }
-
 XEEMITTER(vpkuwum128, VX128(5, 896), VX128 )(X64Emitter& e, X86Compiler& c, InstrData& i) {
   XEINSTRNOTIMPLEMENTED();
   return 1;
@@ -1062,7 +1162,6 @@ XEEMITTER(vpkuwus, 0x100000CE, VX )(X64Emitter& e, X86Compiler& c, Instr
   XEINSTRNOTIMPLEMENTED();
   return 1;
 }
-
 XEEMITTER(vpkuwus128, VX128(5, 960), VX128 )(X64Emitter& e, X86Compiler& c, InstrData& i) {
   XEINSTRNOTIMPLEMENTED();
   return 1;
@@ -1077,7 +1176,6 @@ XEEMITTER(vrefp, 0x1000010A, VX )(X64Emitter& e, X86Compiler& c, Instr
   XEINSTRNOTIMPLEMENTED();
   return 1;
 }
-
 XEEMITTER(vrefp128, VX128_3(6, 1584), VX128_3)(X64Emitter& e, X86Compiler& c, InstrData& i) {
   XEINSTRNOTIMPLEMENTED();
   return 1;
@@ -1087,7 +1185,6 @@ XEEMITTER(vrfim, 0x100002CA, VX )(X64Emitter& e, X86Compiler& c, Instr
   XEINSTRNOTIMPLEMENTED();
   return 1;
 }
-
 XEEMITTER(vrfim128, VX128_3(6, 816), VX128_3)(X64Emitter& e, X86Compiler& c, InstrData& i) {
   XEINSTRNOTIMPLEMENTED();
   return 1;
@@ -1097,7 +1194,6 @@ XEEMITTER(vrfin, 0x1000020A, VX )(X64Emitter& e, X86Compiler& c, Instr
   XEINSTRNOTIMPLEMENTED();
   return 1;
 }
-
 XEEMITTER(vrfin128, VX128_3(6, 880), VX128_3)(X64Emitter& e, X86Compiler& c, InstrData& i) {
   XEINSTRNOTIMPLEMENTED();
   return 1;
@@ -1107,7 +1203,6 @@ XEEMITTER(vrfip, 0x1000028A, VX )(X64Emitter& e, X86Compiler& c, Instr
   XEINSTRNOTIMPLEMENTED();
   return 1;
 }
-
 XEEMITTER(vrfip128, VX128_3(6, 944), VX128_3)(X64Emitter& e, X86Compiler& c, InstrData& i) {
   XEINSTRNOTIMPLEMENTED();
   return 1;
@@ -1117,7 +1212,6 @@ XEEMITTER(vrfiz, 0x1000024A, VX )(X64Emitter& e, X86Compiler& c, Instr
   XEINSTRNOTIMPLEMENTED();
   return 1;
 }
-
 XEEMITTER(vrfiz128, VX128_3(6, 1008), VX128_3)(X64Emitter& e, X86Compiler& c, InstrData& i) {
   XEINSTRNOTIMPLEMENTED();
   return 1;
@@ -1137,7 +1231,6 @@ XEEMITTER(vrlw, 0x10000084, VX )(X64Emitter& e, X86Compiler& c, Instr
   XEINSTRNOTIMPLEMENTED();
   return 1;
 }
-
 XEEMITTER(vrlw128, VX128(6, 80), VX128 )(X64Emitter& e, X86Compiler& c, InstrData& i) {
   XEINSTRNOTIMPLEMENTED();
   return 1;
@@ -1208,7 +1301,6 @@ XEEMITTER(vsel, 0x1000002A, VXA )(X64Emitter& e, X86Compiler& c, Instr
   XEINSTRNOTIMPLEMENTED();
   return 1;
 }
-
 XEEMITTER(vsel128, VX128(5, 848), VX128 )(X64Emitter& e, X86Compiler& c, InstrData& i) {
   XEINSTRNOTIMPLEMENTED();
   return 1;
@@ -1228,7 +1320,6 @@ XEEMITTER(vsldoi, 0x1000002C, VXA )(X64Emitter& e, X86Compiler& c, Instr
   XEINSTRNOTIMPLEMENTED();
   return 1;
 }
-
 XEEMITTER(vsldoi128, VX128_5(4, 16), VX128_5)(X64Emitter& e, X86Compiler& c, InstrData& i) {
   XEINSTRNOTIMPLEMENTED();
   return 1;
@@ -1243,7 +1334,6 @@ XEEMITTER(vslo, 0x1000040C, VX )(X64Emitter& e, X86Compiler& c, Instr
   XEINSTRNOTIMPLEMENTED();
   return 1;
 }
-
 XEEMITTER(vslo128, VX128(5, 912), VX128 )(X64Emitter& e, X86Compiler& c, InstrData& i) {
   XEINSTRNOTIMPLEMENTED();
   return 1;
@@ -1386,7 +1476,6 @@ XEEMITTER(vsraw, 0x10000384, VX )(X64Emitter& e, X86Compiler& c, Instr
   XEINSTRNOTIMPLEMENTED();
   return 1;
 }
-
 XEEMITTER(vsraw128, VX128(6, 336), VX128 )(X64Emitter& e, X86Compiler& c, InstrData& i) {
   XEINSTRNOTIMPLEMENTED();
   return 1;
@@ -1406,7 +1495,6 @@ XEEMITTER(vsro, 0x1000044C, VX )(X64Emitter& e, X86Compiler& c, Instr
   XEINSTRNOTIMPLEMENTED();
   return 1;
 }
-
 XEEMITTER(vsro128, VX128(5, 976), VX128 )(X64Emitter& e, X86Compiler& c, InstrData& i) {
   XEINSTRNOTIMPLEMENTED();
   return 1;
@@ -1416,7 +1504,6 @@ XEEMITTER(vsrw, 0x10000284, VX )(X64Emitter& e, X86Compiler& c, Instr
   XEINSTRNOTIMPLEMENTED();
   return 1;
 }
-
 XEEMITTER(vsrw128, VX128(6, 464), VX128 )(X64Emitter& e, X86Compiler& c, InstrData& i) {
   XEINSTRNOTIMPLEMENTED();
   return 1;
@@ -1522,7 +1609,6 @@ XEEMITTER(vupkhsb, 0x1000020E, VX )(X64Emitter& e, X86Compiler& c, Instr
   XEINSTRNOTIMPLEMENTED();
   return 1;
 }
-
 XEEMITTER(vupkhsb128, VX128(6, 896), VX128 )(X64Emitter& e, X86Compiler& c, InstrData& i) {
   XEINSTRNOTIMPLEMENTED();
   return 1;
@@ -1542,7 +1628,6 @@ XEEMITTER(vupklsb, 0x1000028E, VX )(X64Emitter& e, X86Compiler& c, Instr
   XEINSTRNOTIMPLEMENTED();
   return 1;
 }
-
 XEEMITTER(vupklsb128, VX128(6, 960), VX128 )(X64Emitter& e, X86Compiler& c, InstrData& i) {
   XEINSTRNOTIMPLEMENTED();
   return 1;
@@ -1553,6 +1638,35 @@ XEEMITTER(vupklsh, 0x100002CE, VX )(X64Emitter& e, X86Compiler& c, Instr
   return 1;
 }
 
+// Converts four half floats, one per 32-bit lane of h (sign in bit 15,
+// exponent+mantissa in bits 0-14), to four single precision floats.
+// Lanes above 0x7bff (inf/NaN) get an all 1's float exponent OR'd in.
+__m128 half_to_float5_SSE2(__m128i h) {
+#define SSE_CONST4(name, val) static const __declspec(align(16)) uint32_t name[4] = { (val), (val), (val), (val) }
+#define SSE_CONST(name) *(const __m128i *)&name
+#define SSE_CONSTF(name) *(const __m128 *)&name
+  SSE_CONST4(mask_nosign, 0x7fff);
+  SSE_CONST4(magic, (254 - 15) << 23);
+  SSE_CONST4(was_infnan, 0x7bff);
+  SSE_CONST4(exp_infnan, 255 << 23);
+  __m128i mnosign = SSE_CONST(mask_nosign);
+  __m128i expmant = _mm_and_si128(mnosign, h);
+  __m128i justsign = _mm_xor_si128(h, expmant);
+  __m128i expmant2 = expmant; // copy (just here for counting purposes)
+  __m128i shifted = _mm_slli_epi32(expmant, 13);
+  __m128 scaled = _mm_mul_ps(_mm_castsi128_ps(shifted), SSE_CONSTF(magic));
+  __m128i b_wasinfnan = _mm_cmpgt_epi32(expmant2, SSE_CONST(was_infnan));
+  __m128i sign = _mm_slli_epi32(justsign, 16);
+  __m128 infnanexp = _mm_and_ps(_mm_castsi128_ps(b_wasinfnan), SSE_CONSTF(exp_infnan));
+  __m128 sign_inf = _mm_or_ps(_mm_castsi128_ps(sign), infnanexp);
+  __m128 final = _mm_or_ps(scaled, sign_inf);
+  // ~11 SSE2 ops.
+  return final;
+#undef SSE_CONST4
+#undef SSE_CONST
+#undef SSE_CONSTF
+}
+
 XEEMITTER(vupkd3d128, VX128_3(6, 2032), VX128_3)(X64Emitter& e, X86Compiler& c, InstrData& i) {
   // Can't find many docs on this. Best reference is
   // http://worldcraft.googlecode.com/svn/trunk/src/qylib/math/xmmatrix.inl,
@@ -1592,18 +1706,34 @@ XEEMITTER(vupkd3d128, VX128_3(6, 2032), VX128_3)(X64Emitter& e, X86Compiler&
     break;
   case 3: // VPACK_... 2 FLOAT16s
     {
-      // (VD.x) = fixed_16_to_32(VB.x)
-      // (VD.y) = fixed_16_to_32(VB.y)
+      // (VD.x) = fixed_16_to_32(VB.x (low))
+      // (VD.y) = fixed_16_to_32(VB.x (high))
       // (VD.z) = 0.0
       // (VD.w) = 1.0
       // 1 bit sign, 5 bit exponent, 10 bit mantissa
       // D3D10 half float format
       // TODO(benvanik): fixed_16_to_32 in SSE?
+      // TODO(benvanik): http://blogs.msdn.com/b/chuckw/archive/2012/09/11/directxmath-f16c-and-fma.aspx
+      // Use _mm_cvtph_ps -- requires very modern processors (F16C)
+      // Unpacking half floats: http://fgiesen.wordpress.com/2012/03/28/half-to-float-done-quic/
+      // Packing half floats: https://gist.github.com/rygorous/2156668
+      // Load source, move from tight pack of X16Y16.... to X16...Y16...
+      // Also zero out the high end.
+      c.int3();
+      c.movaps(vt, e.vr_value(vb));
+      c.save(vt);
+      c.lea(gt, vt.m128());
+      X86CompilerFuncCall* call = c.call(half_to_float5_SSE2);
+      uint32_t args[] = {kX86VarTypeGpq};
+      call->setPrototype(kX86FuncConvDefault, kX86VarTypeXmm, args, XECOUNT(args));
+      call->setArgument(0, gt);
+      call->setReturn(v);
+      // Select XY00.
+      c.xorps(vt, vt);
+      c.shufps(v, vt, imm(0x04));
       // {0.0, 0.0, 0.0, 1.0}
       c.mov(gt, imm(0x3F800000));
-      c.pinsrd(vt, gt.r32(), imm(3));
-      c.movaps(v, vt);
-      c.int3();
+      c.pinsrd(v, gt.r32(), imm(3));
     }
     break;
   default:
@@ -1712,10 +1842,6 @@ void X64RegisterEmitCategoryAltivec() {
   XEREGISTERINSTR(vcmpbfp128, VX128(6, 384));
   XEREGISTERINSTR(vcmpeqfp, 0x100000C6);
   XEREGISTERINSTR(vcmpeqfp128, VX128(6, 0));
-  XEREGISTERINSTR(vcmpequb, 0x10000006);
-  XEREGISTERINSTR(vcmpequh, 0x10000046);
-  XEREGISTERINSTR(vcmpequw, 0x10000086);
-  XEREGISTERINSTR(vcmpequw128, VX128(6, 512));
   XEREGISTERINSTR(vcmpgefp, 0x100001C6);
   XEREGISTERINSTR(vcmpgefp128, VX128(6, 128));
   XEREGISTERINSTR(vcmpgtfp, 0x100002C6);
@@ -1723,8 +1849,12 @@ void X64RegisterEmitCategoryAltivec() {
   XEREGISTERINSTR(vcmpgtsb, 0x10000306);
   XEREGISTERINSTR(vcmpgtsh, 0x10000346);
   XEREGISTERINSTR(vcmpgtsw, 0x10000386);
+  XEREGISTERINSTR(vcmpequb, 0x10000006);
   XEREGISTERINSTR(vcmpgtub, 0x10000206);
+  XEREGISTERINSTR(vcmpequh, 0x10000046);
   XEREGISTERINSTR(vcmpgtuh, 0x10000246);
+  XEREGISTERINSTR(vcmpequw, 0x10000086);
+  XEREGISTERINSTR(vcmpequw128, VX128(6, 512));
   XEREGISTERINSTR(vcmpgtuw, 0x10000286);
   XEREGISTERINSTR(vctsxs, 0x100003CA);
   XEREGISTERINSTR(vctuxs, 0x1000038A);