Fixing terrible vsldoi implementation. Still terrible.
This commit is contained in:
parent
87481f5f4c
commit
99bde2d67e
|
@ -1555,6 +1555,42 @@ XEEMITTER(vslb, 0x10000104, VX )(X64Emitter& e, X86Compiler& c, Instr
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// PSHUFB control masks applied to the VA copy in InstrEmit_vsldoi_, indexed by
// the immediate byte shift sh (the emitter loads &__shift_table_out[sh]).
//
// _mm_set_epi8 lists lanes from byte 15 down to byte 0, so in entry sh the
// destination bytes 0..(15 - sh) take source bytes sh..15 and the top sh bytes
// take source byte 0. The emitter clears byte 0 of the VA copy with pinsrb
// before shuffling, so selecting index 0 yields a zero byte: the net effect is
// a zero-filling move of the register contents down by sh byte lanes.
//
// NOTE(review): in x86 lane numbering this is a right shift, while the caller's
// comment describes (VA << SH); presumably guest vectors are held byte-reversed
// in the XMM register - confirm against the register load/store code.
// NOTE(review): identifiers containing a double underscore are reserved for the
// implementation; consider renaming together with the use site.
static __m128i __shift_table_out[16] = {
|
||||||
|
_mm_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0), // unused: sh == 0 returns before the table lookup
|
||||||
|
_mm_set_epi8( 0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1),
|
||||||
|
_mm_set_epi8( 0, 0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2),
|
||||||
|
_mm_set_epi8( 0, 0, 0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3),
|
||||||
|
_mm_set_epi8( 0, 0, 0, 0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4),
|
||||||
|
_mm_set_epi8( 0, 0, 0, 0, 0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5),
|
||||||
|
_mm_set_epi8( 0, 0, 0, 0, 0, 0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6),
|
||||||
|
_mm_set_epi8( 0, 0, 0, 0, 0, 0, 0, 15, 14, 13, 12, 11, 10, 9, 8, 7),
|
||||||
|
_mm_set_epi8( 0, 0, 0, 0, 0, 0, 0, 0, 15, 14, 13, 12, 11, 10, 9, 8),
|
||||||
|
_mm_set_epi8( 0, 0, 0, 0, 0, 0, 0, 0, 0, 15, 14, 13, 12, 11, 10, 9),
|
||||||
|
_mm_set_epi8( 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 15, 14, 13, 12, 11, 10),
|
||||||
|
_mm_set_epi8( 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 15, 14, 13, 12, 11),
|
||||||
|
_mm_set_epi8( 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 15, 14, 13, 12),
|
||||||
|
_mm_set_epi8( 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 15, 14, 13),
|
||||||
|
_mm_set_epi8( 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 15, 14),
|
||||||
|
_mm_set_epi8( 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 15),
|
||||||
|
};
|
||||||
|
// PSHUFB control masks applied to the second operand (v_r) in
// InstrEmit_vsldoi_, indexed by the immediate byte shift sh (the emitter loads
// &__shift_table_in[sh]). v_r presumably holds VB; the instruction that loads
// it is not visible in this excerpt - confirm.
//
// _mm_set_epi8 lists lanes from byte 15 down to byte 0, so in entry sh the top
// sh destination bytes take source bytes (sh - 1)..0 and every remaining byte
// takes source byte 15. The emitter clears byte 15 of v_r with pinsrb before
// shuffling, so selecting index 15 yields a zero byte: the net effect is that
// the lowest sh byte lanes are moved to the top of the register and the rest is
// zero-filled. The result is OR-ed with the __shift_table_out half by the caller.
//
// NOTE(review): identifiers containing a double underscore are reserved for the
// implementation; consider renaming together with the use site.
static __m128i __shift_table_in[16] = {
|
||||||
|
_mm_set_epi8(15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15), // unused: sh == 0 returns before the table lookup
|
||||||
|
_mm_set_epi8( 0, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15),
|
||||||
|
_mm_set_epi8( 1, 0, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15),
|
||||||
|
_mm_set_epi8( 2, 1, 0, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15),
|
||||||
|
_mm_set_epi8( 3, 2, 1, 0, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15),
|
||||||
|
_mm_set_epi8( 4, 3, 2, 1, 0, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15),
|
||||||
|
_mm_set_epi8( 5, 4, 3, 2, 1, 0, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15),
|
||||||
|
_mm_set_epi8( 6, 5, 4, 3, 2, 1, 0, 15, 15, 15, 15, 15, 15, 15, 15, 15),
|
||||||
|
_mm_set_epi8( 7, 6, 5, 4, 3, 2, 1, 0, 15, 15, 15, 15, 15, 15, 15, 15),
|
||||||
|
_mm_set_epi8( 8, 7, 6, 5, 4, 3, 2, 1, 0, 15, 15, 15, 15, 15, 15, 15),
|
||||||
|
_mm_set_epi8( 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, 15, 15, 15, 15, 15, 15),
|
||||||
|
_mm_set_epi8(10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, 15, 15, 15, 15, 15),
|
||||||
|
_mm_set_epi8(11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, 15, 15, 15, 15),
|
||||||
|
_mm_set_epi8(12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, 15, 15, 15),
|
||||||
|
_mm_set_epi8(13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, 15, 15),
|
||||||
|
_mm_set_epi8(14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, 15),
|
||||||
|
};
|
||||||
int InstrEmit_vsldoi_(X64Emitter& e, X86Compiler& c, uint32_t vd, uint32_t va, uint32_t vb, uint32_t sh) {
|
int InstrEmit_vsldoi_(X64Emitter& e, X86Compiler& c, uint32_t vd, uint32_t va, uint32_t vb, uint32_t sh) {
|
||||||
// (VD) <- ((VA) || (VB)) << (SH << 3)
|
// (VD) <- ((VA) || (VB)) << (SH << 3)
|
||||||
if (!sh) {
|
if (!sh) {
|
||||||
|
@ -1562,7 +1598,15 @@ int InstrEmit_vsldoi_(X64Emitter& e, X86Compiler& c, uint32_t vd, uint32_t va, u
|
||||||
e.update_vr_value(vd, e.vr_value(va));
|
e.update_vr_value(vd, e.vr_value(va));
|
||||||
e.TraceVR(vd, va, vb);
|
e.TraceVR(vd, va, vb);
|
||||||
return 0;
|
return 0;
|
||||||
|
} else if (sh == 16) {
|
||||||
|
e.update_vr_value(vd, e.vr_value(vb));
|
||||||
|
e.TraceVR(vd, va, vb);
|
||||||
|
return 0;
|
||||||
}
|
}
|
||||||
|
// TODO(benvanik): optimize for the rotation case:
|
||||||
|
// vsldoi128 vr63,vr63,vr63,4
|
||||||
|
// (ABCD ABCD) << 4b = (BCDA)
|
||||||
|
// TODO(benvanik): rewrite this piece of shit.
|
||||||
XmmVar v(c.newXmmVar());
|
XmmVar v(c.newXmmVar());
|
||||||
c.movaps(v, e.vr_value(va));
|
c.movaps(v, e.vr_value(va));
|
||||||
XmmVar v_r(c.newXmmVar());
|
XmmVar v_r(c.newXmmVar());
|
||||||
|
@ -1570,12 +1614,15 @@ int InstrEmit_vsldoi_(X64Emitter& e, X86Compiler& c, uint32_t vd, uint32_t va, u
|
||||||
// (VA << SH) OR (VB >> (16 - SH))
|
// (VA << SH) OR (VB >> (16 - SH))
|
||||||
GpVar gt(c.newGpVar());
|
GpVar gt(c.newGpVar());
|
||||||
c.xor_(gt, gt);
|
c.xor_(gt, gt);
|
||||||
c.pinsrb(v, gt.r8(), imm(15));
|
c.pinsrb(v, gt.r8(), imm(0));
|
||||||
c.pinsrb(v_r, gt.r8(), imm(0));
|
c.pinsrb(v_r, gt.r8(), imm(15));
|
||||||
c.mov(gt, imm((sysint_t)&__shift_table_left[sh]));
|
c.mov(gt, imm((sysint_t)&__shift_table_out[sh]));
|
||||||
c.pshufb(v, xmmword_ptr(gt));
|
XmmVar shuf(c.newXmmVar());
|
||||||
c.mov(gt, imm((sysint_t)&__shift_table_right[sh]));
|
c.movaps(shuf, xmmword_ptr(gt));
|
||||||
c.pshufb(v_r, xmmword_ptr(gt));
|
c.pshufb(v, shuf);
|
||||||
|
c.mov(gt, imm((sysint_t)&__shift_table_in[sh]));
|
||||||
|
c.movaps(shuf, xmmword_ptr(gt));
|
||||||
|
c.pshufb(v_r, shuf);
|
||||||
c.por(v, v_r);
|
c.por(v, v_r);
|
||||||
e.update_vr_value(vd, v);
|
e.update_vr_value(vd, v);
|
||||||
e.TraceVR(vd, va, vb);
|
e.TraceVR(vd, va, vb);
|
||||||
|
|
Loading…
Reference in New Issue