Fixing lvrx.
parent d46069cd47
commit 83d7523da1
@@ -155,17 +155,17 @@ static __m128i __lvsl_table[16] = {
  _mm_set_epi8(15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30),
};
int InstrEmit_lvsl_(X64Emitter& e, X86Compiler& c, InstrData& i, uint32_t vd, uint32_t ra, uint32_t rb) {
  GpVar ea(c.newGpVar());
  c.mov(ea, e.gpr_value(rb));
  if (ra) {
    c.add(ea, e.gpr_value(ra));
  }
  c.and_(ea, imm(0xF));
  c.shl(ea, imm(4)); // table offset = (16b * sh)
  GpVar gt(c.newGpVar());
  c.mov(gt, imm((sysint_t)__lvsl_table));
  XmmVar v(c.newXmmVar());
  c.movaps(v, xmmword_ptr(gt, ea));
  c.shufps(v, v, imm(0x1B));
  e.update_vr_value(vd, v);
  e.TraceVR(vd);
@@ -197,17 +197,17 @@ static __m128i __lvsr_table[16] = {
  _mm_set_epi8( 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16),
};
int InstrEmit_lvsr_(X64Emitter& e, X86Compiler& c, InstrData& i, uint32_t vd, uint32_t ra, uint32_t rb) {
  GpVar ea(c.newGpVar());
  c.mov(ea, e.gpr_value(rb));
  if (ra) {
    c.add(ea, e.gpr_value(ra));
  }
  c.and_(ea, imm(0xF));
  c.shl(ea, imm(4)); // table offset = (16b * sh)
  GpVar gt(c.newGpVar());
  c.mov(gt, imm((sysint_t)__lvsr_table));
  XmmVar v(c.newXmmVar());
  c.movaps(v, xmmword_ptr(gt, ea));
  c.shufps(v, v, imm(0x1B));
  e.update_vr_value(vd, v);
  e.TraceVR(vd);
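Note: the two tables above (__lvsl_table, __lvsr_table) simply pre-build the VMX permute-control vectors for each possible alignment. As a reference only, here is a minimal scalar sketch of what lvsl/lvsr produce for a given effective address; the lvsl_model/lvsr_model names are illustrative, not emitter code.

#include <cstdint>

// lvsl: byte i of the result is sh + i; lvsr: byte i is (16 - sh) + i,
// where sh = EA & 0xF. The table entries are exactly these 16-byte patterns.
void lvsl_model(uint64_t ea, uint8_t out[16]) {
  uint8_t sh = ea & 0xF;
  for (int i = 0; i < 16; ++i) out[i] = static_cast<uint8_t>(sh + i);
}
void lvsr_model(uint64_t ea, uint8_t out[16]) {
  uint8_t sh = ea & 0xF;
  for (int i = 0; i < 16; ++i) out[i] = static_cast<uint8_t>(16 - sh + i);
}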
@@ -221,13 +221,13 @@ XEEMITTER(lvsr128, VX128_1(4, 67), VX128_1)(X64Emitter& e, X86Compiler&
}

int InstrEmit_lvx_(X64Emitter& e, X86Compiler& c, InstrData& i, uint32_t vd, uint32_t ra, uint32_t rb) {
  GpVar ea(c.newGpVar());
  c.mov(ea, e.gpr_value(rb));
  if (ra) {
    c.add(ea, e.gpr_value(ra));
  }
  XmmVar v = e.ReadMemoryXmm(i.address, ea, 4);
  c.shufps(v, v, imm(0x1B));
  e.update_vr_value(vd, v);
  e.TraceVR(vd);
  return 0;
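Note: the recurring c.shufps(v, v, imm(0x1B)) reverses the four 32-bit lanes of the vector; it is how these emitters swap between the guest's big-endian register layout and the layout kept in host registers. A standalone check of that immediate (SSE2 intrinsics, illustrative only; the emitter itself applies shufps to the register directly):

#include <emmintrin.h>
#include <cstdio>

int main() {
  __m128i v = _mm_set_epi32(3, 2, 1, 0);   // lanes (low to high): 0, 1, 2, 3
  // Immediate 0x1B selects lanes 3,2,1,0, i.e. reverses lane order.
  __m128i r = _mm_shuffle_epi32(v, 0x1B);
  int out[4];
  _mm_storeu_si128(reinterpret_cast<__m128i*>(out), r);
  std::printf("%d %d %d %d\n", out[0], out[1], out[2], out[3]);  // prints 3 2 1 0
  return 0;
}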
@@ -268,13 +268,13 @@ XEEMITTER(stvewx128, VX128_1(4, 387), VX128_1)(X64Emitter& e, X86Compiler&
}

int InstrEmit_stvx_(X64Emitter& e, X86Compiler& c, InstrData& i, uint32_t vd, uint32_t ra, uint32_t rb) {
  GpVar ea(c.newGpVar());
  c.mov(ea, e.gpr_value(rb));
  if (ra) {
    c.add(ea, e.gpr_value(ra));
  }
  XmmVar v = e.vr_value(vd);
  c.shufps(v, v, imm(0x1B));
  e.WriteMemoryXmm(i.address, ea, 4, v);
  e.TraceVR(vd);
  return 0;
@@ -295,28 +295,28 @@ XEEMITTER(stvxl128, VX128_1(4, 963), VX128_1)(X64Emitter& e, X86Compiler&
// The lvlx/lvrx/etc instructions are in Cell docs only:
// https://www-01.ibm.com/chips/techlib/techlib.nsf/techdocs/C40E4C6133B31EE8872570B500791108/$file/vector_simd_pem_v_2.07c_26Oct2006_cell.pdf
int InstrEmit_lvlx_(X64Emitter& e, X86Compiler& c, InstrData& i, uint32_t vd, uint32_t ra, uint32_t rb) {
  GpVar ea(c.newGpVar());
  c.mov(ea, e.gpr_value(rb));
  if (ra) {
    c.add(ea, e.gpr_value(ra));
  }
  GpVar sh(c.newGpVar());
  c.mov(sh, ea);
  c.and_(sh, imm(0xF));
  XmmVar v = e.ReadMemoryXmm(i.address, ea, 4);
  // If fully aligned skip complex work.
  Label done(c.newLabel());
  c.test(sh, sh);
  c.jz(done);
  {
    // Shift left by the number of bytes offset and fill with zeros.
    // We reuse the lvsl table here, as it does that for us.
    GpVar gt(c.newGpVar());
    c.xor_(gt, gt);
    c.pinsrb(v, gt.r8(), imm(15));
    c.shl(sh, imm(4)); // table offset = (16b * sh)
    c.mov(gt, imm((sysint_t)__shift_table_left));
    c.pshufb(v, xmmword_ptr(gt, sh));
  }
  c.bind(done);
  c.shufps(v, v, imm(0x1B));
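Note: a scalar model of the lvlx behavior the block above reproduces, per the Cell docs linked in the comment: the bytes from EA through the end of its 16-byte block load left-justified and the rest of the register is zeroed. The function name and raw memory pointer are illustrative only:

#include <cstdint>
#include <cstring>

void lvlx_model(const uint8_t* mem, uint64_t ea, uint8_t vr[16]) {
  unsigned eb = ea & 0xF;              // byte offset inside the aligned block
  std::memset(vr, 0, 16);              // trailing bytes end up zero
  std::memcpy(vr, mem + ea, 16 - eb);  // vr[0..15-eb] = mem[ea..end of block]
}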
@@ -338,31 +338,33 @@ XEEMITTER(lvlxl128, VX128_1(4, 1539), VX128_1)(X64Emitter& e, X86Compiler&
 }

 int InstrEmit_lvrx_(X64Emitter& e, X86Compiler& c, InstrData& i, uint32_t vd, uint32_t ra, uint32_t rb) {
-  GpVar ea(c.newGpVar());
-  c.mov(ea, e.gpr_value(rb));
-  if (ra) {
-    c.add(ea, e.gpr_value(ra));
-  }
-  GpVar sh(c.newGpVar());
-  c.mov(sh, ea);
-  c.and_(sh, imm(0xF));
-  XmmVar v = e.ReadMemoryXmm(i.address, ea, 4);
-  // If fully aligned skip complex work.
-  Label done(c.newLabel());
-  c.test(sh, sh);
-  c.jz(done);
-  {
-    // Shift left by the number of bytes offset and fill with zeros.
-    // We reuse the lvsl table here, as it does that for us.
-    GpVar gt(c.newGpVar());
-    c.xor_(gt, gt);
-    c.pinsrb(v, gt.r8(), imm(0));
-    c.shl(sh, imm(4)); // table offset = (16b * sh)
-    c.mov(gt, imm((sysint_t)__shift_table_right));
-    c.pshufb(v, xmmword_ptr(gt, sh));
+  GpVar ea(c.newGpVar());
+  c.mov(ea, e.gpr_value(rb));
+  if (ra) {
+    c.add(ea, e.gpr_value(ra));
+  }
+  GpVar sh(c.newGpVar());
+  c.mov(sh, ea);
+  c.and_(sh, imm(0xF));
+  // If fully aligned skip complex work.
+  XmmVar v(c.newXmmVar());
+  c.pxor(v, v);
+  Label done(c.newLabel());
+  c.test(sh, sh);
+  c.jz(done);
+  {
+    // Shift left by the number of bytes offset and fill with zeros.
+    // We reuse the lvsl table here, as it does that for us.
+    c.movaps(v, e.ReadMemoryXmm(i.address, ea, 4));
+    GpVar gt(c.newGpVar());
+    c.xor_(gt, gt);
+    c.pinsrb(v, gt.r8(), imm(0));
+    c.shl(sh, imm(4)); // table offset = (16b * sh)
+    c.mov(gt, imm((sysint_t)__shift_table_right));
+    c.pshufb(v, xmmword_ptr(gt, sh));
+    c.shufps(v, v, imm(0x1B));
   }
   c.bind(done);
-  c.shufps(v, v, imm(0x1B));
   e.update_vr_value(vd, v);
   e.TraceVR(vd);
   return 0;
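Note: the change matches the lvrx definition: only the eb = EA & 0xF bytes below EA are loaded, right-justified, and when EA is 16-byte aligned nothing is loaded at all, so the result must be all zeros — hence the new pxor + early-out instead of an unconditional ReadMemoryXmm. A scalar model of that behavior (names and raw memory pointer are illustrative, not emitter code):

#include <cstdint>
#include <cstring>

void lvrx_model(const uint8_t* mem, uint64_t ea, uint8_t vr[16]) {
  uint64_t base = ea & ~0xFull;    // start of the aligned 16-byte block
  unsigned eb = ea & 0xF;
  std::memset(vr, 0, 16);          // eb == 0: the result is entirely zero
  std::memcpy(vr + (16 - eb), mem + base, eb);  // vr[16-eb..15] = mem[base..ea-1]
}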
@@ -391,21 +393,21 @@ static void __emulated_stvlx(uint64_t addr, __m128i vd) {
  }
}
int InstrEmit_stvlx_(X64Emitter& e, X86Compiler& c, InstrData& i, uint32_t vd, uint32_t ra, uint32_t rb) {
  GpVar ea(c.newGpVar());
  c.mov(ea, e.gpr_value(rb));
  if (ra) {
    c.add(ea, e.gpr_value(ra));
  }
  ea = e.TouchMemoryAddress(i.address, ea);
  XmmVar tvd(c.newXmmVar());
  c.movaps(tvd, e.vr_value(vd));
  c.shufps(tvd, tvd, imm(0x1B));
  c.save(tvd);
  GpVar pvd(c.newGpVar());
  c.lea(pvd, tvd.m128());
  X86CompilerFuncCall* call = c.call(__emulated_stvlx);
  uint32_t args[] = {kX86VarTypeGpq, kX86VarTypeGpq};
  call->setPrototype(kX86FuncConvDefault, kX86VarTypeGpq, args, XECOUNT(args));
  call->setArgument(0, ea);
  call->setArgument(1, pvd);
  e.TraceVR(vd);
@@ -436,21 +438,21 @@ static void __emulated_stvrx(uint64_t addr, __m128i vd) {
  }
}
int InstrEmit_stvrx_(X64Emitter& e, X86Compiler& c, InstrData& i, uint32_t vd, uint32_t ra, uint32_t rb) {
  GpVar ea(c.newGpVar());
  c.mov(ea, e.gpr_value(rb));
  if (ra) {
    c.add(ea, e.gpr_value(ra));
  }
  ea = e.TouchMemoryAddress(i.address, ea);
  XmmVar tvd(c.newXmmVar());
  c.movaps(tvd, e.vr_value(vd));
  c.shufps(tvd, tvd, imm(0x1B));
  c.save(tvd);
  GpVar pvd(c.newGpVar());
  c.lea(pvd, tvd.m128());
  X86CompilerFuncCall* call = c.call(__emulated_stvrx);
  uint32_t args[] = {kX86VarTypeGpq, kX86VarTypeGpq};
  call->setPrototype(kX86FuncConvDefault, kX86VarTypeGpq, args, XECOUNT(args));
  call->setArgument(0, ea);
  call->setArgument(1, pvd);
  e.TraceVR(vd);
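Note: the __emulated_stvlx/__emulated_stvrx helpers called above perform the partial stores on the host side, mirroring lvlx/lvrx. Their intended effect as a scalar sketch in big-endian byte order (names and raw memory pointer are illustrative):

#include <cstdint>
#include <cstring>

// stvlx: store the left part of the register from EA to the end of its block.
void stvlx_model(uint8_t* mem, uint64_t ea, const uint8_t vr[16]) {
  unsigned eb = ea & 0xF;
  std::memcpy(mem + ea, vr, 16 - eb);            // vr[0..15-eb]
}
// stvrx: store the right part of the register into the bytes below EA.
void stvrx_model(uint8_t* mem, uint64_t ea, const uint8_t vr[16]) {
  uint64_t base = ea & ~0xFull;
  unsigned eb = ea & 0xF;
  std::memcpy(mem + base, vr + (16 - eb), eb);   // vr[16-eb..15]
}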
@@ -1281,20 +1283,20 @@ __m128i __emulated_vperm(uint8_t* va, uint8_t* vb, uint8_t* vc) {
int InstrEmit_vperm_(X64Emitter& e, X86Compiler& c, uint32_t vd, uint32_t va, uint32_t vb, uint32_t vc) {
  // Call emulation function.
  XmmVar tva(c.newXmmVar()), tvb(c.newXmmVar()), tvc(c.newXmmVar());
  GpVar pva(c.newGpVar()), pvb(c.newGpVar()), pvc(c.newGpVar());
  c.movaps(tva, e.vr_value(va));
  c.movaps(tvb, e.vr_value(vb));
  c.movaps(tvc, e.vr_value(vc));
  c.save(tva);
  c.save(tvb);
  c.save(tvc);
  c.lea(pva, tva.m128());
  c.lea(pvb, tvb.m128());
  c.lea(pvc, tvc.m128());
  XmmVar v(c.newXmmVar());
  X86CompilerFuncCall* call = c.call(__emulated_vperm);
  uint32_t args[] = {kX86VarTypeGpq, kX86VarTypeGpq, kX86VarTypeGpq};
  call->setPrototype(kX86FuncConvDefault, kX86VarTypeXmm, args, XECOUNT(args));
  call->setArgument(0, pva);
  call->setArgument(1, pvb);
  call->setArgument(2, pvc);
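Note: __emulated_vperm (defined earlier in the file, not shown in this hunk) receives pointers to the three saved registers. The vperm operation it has to implement is, in scalar form with big-endian byte numbering (the model function is illustrative only):

#include <cstdint>

// Each result byte picks one of the 32 bytes of va||vb using the low five
// bits of the matching control byte in vc.
void vperm_model(const uint8_t va[16], const uint8_t vb[16],
                 const uint8_t vc[16], uint8_t vd[16]) {
  for (int i = 0; i < 16; ++i) {
    unsigned sel = vc[i] & 0x1F;
    vd[i] = sel < 16 ? va[sel] : vb[sel - 16];
  }
}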
@@ -1612,17 +1614,17 @@ int InstrEmit_vsldoi_(X64Emitter& e, X86Compiler& c, uint32_t vd, uint32_t va, u
  XmmVar v_r(c.newXmmVar());
  c.movaps(v_r, e.vr_value(vb));
  // (VA << SH) OR (VB >> (16 - SH))
  GpVar gt(c.newGpVar());
  c.xor_(gt, gt);
  c.pinsrb(v, gt.r8(), imm(0));
  c.pinsrb(v_r, gt.r8(), imm(15));
  c.mov(gt, imm((sysint_t)&__shift_table_out[sh]));
  XmmVar shuf(c.newXmmVar());
  c.movaps(shuf, xmmword_ptr(gt));
  c.pshufb(v, shuf);
  c.mov(gt, imm((sysint_t)&__shift_table_in[sh]));
  c.movaps(shuf, xmmword_ptr(gt));
  c.pshufb(v_r, shuf);
  c.por(v, v_r);
  e.update_vr_value(vd, v);
  e.TraceVR(vd, va, vb);
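Note: the pshufb pair above builds (VA << SH) | (VB >> (16 - SH)) via the shift tables; the vsldoi result being reproduced is the 16-byte window starting SH bytes into VA||VB. Scalar sketch (illustrative only):

#include <cstdint>

void vsldoi_model(const uint8_t va[16], const uint8_t vb[16],
                  unsigned sh, uint8_t vd[16]) {
  // 0 <= sh <= 15: take bytes sh..sh+15 of the 32-byte concatenation va||vb.
  for (int i = 0; i < 16; ++i) {
    unsigned j = i + sh;
    vd[i] = j < 16 ? va[j] : vb[j - 16];
  }
}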
@@ -2068,12 +2070,12 @@ XEEMITTER(vupkd3d128, VX128_3(6, 2032), VX128_3)(X64Emitter& e, X86Compiler&
  // Load source, move from tight pack of X16Y16.... to X16...Y16...
  // Also zero out the high end.
  c.int3();
  c.movaps(vt, e.vr_value(vb));
  c.save(vt);
  c.lea(gt, vt.m128());
  X86CompilerFuncCall* call = c.call(half_to_float5_SSE2);
  uint32_t args[] = {kX86VarTypeGpq};
  call->setPrototype(kX86FuncConvDefault, kX86VarTypeXmm, args, XECOUNT(args));
  call->setArgument(0, gt);
  call->setReturn(v);
  // Select XY00.
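Note: half_to_float5_SSE2 expands packed 16-bit floats to 32-bit floats several at a time. For reference, a plain scalar equivalent of that conversion (standard IEEE 754 half-precision; this is a sketch, not the vectorized routine the emitter calls):

#include <cstdint>
#include <cstring>

float half_to_float_model(uint16_t h) {
  uint32_t sign = static_cast<uint32_t>(h & 0x8000u) << 16;
  uint32_t exp  = (h >> 10) & 0x1F;
  uint32_t man  = h & 0x3FFu;
  uint32_t bits;
  if (exp == 0) {
    if (man == 0) {
      bits = sign;                               // signed zero
    } else {
      int e = -1;                                // renormalize a subnormal half
      do { ++e; man <<= 1; } while ((man & 0x400u) == 0);
      bits = sign | ((127 - 15 - e) << 23) | ((man & 0x3FFu) << 13);
    }
  } else if (exp == 0x1F) {
    bits = sign | 0x7F800000u | (man << 13);     // infinity / NaN
  } else {
    bits = sign | ((exp - 15 + 127) << 23) | (man << 13);
  }
  float f;
  std::memcpy(&f, &bits, sizeof(f));
  return f;
}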