Fixing terrible vsldoi implementation. Still terrible.

This commit is contained in:
Ben Vanik 2013-10-06 17:45:59 -07:00
parent 87481f5f4c
commit 99bde2d67e
1 changed files with 53 additions and 6 deletions

View File

@ -1555,6 +1555,42 @@ XEEMITTER(vslb, 0x10000104, VX )(X64Emitter& e, X86Compiler& c, Instr
return 1;
}
// Shuffle-control vectors, one per shift amount, indexed by `sh`
// (&__shift_table_out[sh] is loaded and used as the PSHUFB mask for the copy
// of VA in InstrEmit_vsldoi_).
//
// _mm_set_epi8 lists bytes from element 15 down to element 0, so reading each
// row left to right gives the source index for destination bytes 15..0.
// For entry N:
//   - destination bytes 0..(15-N) take source bytes N..15;
//   - destination bytes (16-N)..15 take source byte 0.
// No index has bit 7 set, so PSHUFB itself never zeroes a lane. Index 0 is
// the filler here.
// NOTE(review): this presumably relies on the emitter clearing byte 0 of the
// source register (pinsrb with a zeroed GPR) before the shuffle, so the filler
// lanes come out as zero -- confirm against the emitter code.
static __m128i __shift_table_out[16] = {
_mm_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0), // unused: identity shuffle; sh == 0 is handled before the table lookup
_mm_set_epi8( 0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1),
_mm_set_epi8( 0, 0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2),
_mm_set_epi8( 0, 0, 0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3),
_mm_set_epi8( 0, 0, 0, 0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4),
_mm_set_epi8( 0, 0, 0, 0, 0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5),
_mm_set_epi8( 0, 0, 0, 0, 0, 0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6),
_mm_set_epi8( 0, 0, 0, 0, 0, 0, 0, 15, 14, 13, 12, 11, 10, 9, 8, 7),
_mm_set_epi8( 0, 0, 0, 0, 0, 0, 0, 0, 15, 14, 13, 12, 11, 10, 9, 8),
_mm_set_epi8( 0, 0, 0, 0, 0, 0, 0, 0, 0, 15, 14, 13, 12, 11, 10, 9),
_mm_set_epi8( 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 15, 14, 13, 12, 11, 10),
_mm_set_epi8( 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 15, 14, 13, 12, 11),
_mm_set_epi8( 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 15, 14, 13, 12),
_mm_set_epi8( 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 15, 14, 13),
_mm_set_epi8( 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 15, 14),
_mm_set_epi8( 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 15),
};
// Shuffle-control vectors, one per shift amount, indexed by `sh`
// (&__shift_table_in[sh] is loaded and used as the PSHUFB mask for the second
// source register, v_r, in InstrEmit_vsldoi_).
//
// _mm_set_epi8 lists bytes from element 15 down to element 0, so reading each
// row left to right gives the source index for destination bytes 15..0.
// For entry N:
//   - destination bytes (16-N)..15 take source bytes 0..(N-1);
//   - destination bytes 0..(15-N) take source byte 15.
// No index has bit 7 set, so PSHUFB itself never zeroes a lane. Index 15 is
// the filler here; no entry other than the unused entry 0 needs source byte 15
// as real data.
// NOTE(review): this presumably relies on the emitter clearing byte 15 of the
// source register (pinsrb with a zeroed GPR) before the shuffle, so the filler
// lanes come out as zero and the result can be OR-ed with the
// __shift_table_out half -- confirm against the emitter code.
static __m128i __shift_table_in[16] = {
_mm_set_epi8(15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15), // unused: every lane is filler; sh == 0 is handled before the table lookup
_mm_set_epi8( 0, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15),
_mm_set_epi8( 1, 0, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15),
_mm_set_epi8( 2, 1, 0, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15),
_mm_set_epi8( 3, 2, 1, 0, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15),
_mm_set_epi8( 4, 3, 2, 1, 0, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15),
_mm_set_epi8( 5, 4, 3, 2, 1, 0, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15),
_mm_set_epi8( 6, 5, 4, 3, 2, 1, 0, 15, 15, 15, 15, 15, 15, 15, 15, 15),
_mm_set_epi8( 7, 6, 5, 4, 3, 2, 1, 0, 15, 15, 15, 15, 15, 15, 15, 15),
_mm_set_epi8( 8, 7, 6, 5, 4, 3, 2, 1, 0, 15, 15, 15, 15, 15, 15, 15),
_mm_set_epi8( 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, 15, 15, 15, 15, 15, 15),
_mm_set_epi8(10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, 15, 15, 15, 15, 15),
_mm_set_epi8(11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, 15, 15, 15, 15),
_mm_set_epi8(12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, 15, 15, 15),
_mm_set_epi8(13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, 15, 15),
_mm_set_epi8(14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, 15),
};
int InstrEmit_vsldoi_(X64Emitter& e, X86Compiler& c, uint32_t vd, uint32_t va, uint32_t vb, uint32_t sh) {
// (VD) <- ((VA) || (VB)) << (SH << 3)
if (!sh) {
@ -1562,7 +1598,15 @@ int InstrEmit_vsldoi_(X64Emitter& e, X86Compiler& c, uint32_t vd, uint32_t va, u
e.update_vr_value(vd, e.vr_value(va));
e.TraceVR(vd, va, vb);
return 0;
} else if (sh == 16) {
e.update_vr_value(vd, e.vr_value(vb));
e.TraceVR(vd, va, vb);
return 0;
}
// TODO(benvanik): optimize for the rotation case:
// vsldoi128 vr63,vr63,vr63,4
// (ABCD ABCD) << 4b = (BCDA)
// TODO(benvanik): rewrite this piece of shit.
XmmVar v(c.newXmmVar());
c.movaps(v, e.vr_value(va));
XmmVar v_r(c.newXmmVar());
@ -1570,12 +1614,15 @@ int InstrEmit_vsldoi_(X64Emitter& e, X86Compiler& c, uint32_t vd, uint32_t va, u
// (VA << SH) OR (VB >> (16 - SH))
GpVar gt(c.newGpVar());
c.xor_(gt, gt);
c.pinsrb(v, gt.r8(), imm(15));
c.pinsrb(v_r, gt.r8(), imm(0));
c.mov(gt, imm((sysint_t)&__shift_table_left[sh]));
c.pshufb(v, xmmword_ptr(gt));
c.mov(gt, imm((sysint_t)&__shift_table_right[sh]));
c.pshufb(v_r, xmmword_ptr(gt));
c.pinsrb(v, gt.r8(), imm(0));
c.pinsrb(v_r, gt.r8(), imm(15));
c.mov(gt, imm((sysint_t)&__shift_table_out[sh]));
XmmVar shuf(c.newXmmVar());
c.movaps(shuf, xmmword_ptr(gt));
c.pshufb(v, shuf);
c.mov(gt, imm((sysint_t)&__shift_table_in[sh]));
c.movaps(shuf, xmmword_ptr(gt));
c.pshufb(v_r, shuf);
c.por(v, v_r);
e.update_vr_value(vd, v);
e.TraceVR(vd, va, vb);