diff --git a/src/xenia/cpu/x64/x64_emit_altivec.cc b/src/xenia/cpu/x64/x64_emit_altivec.cc
index 1096e998c..e16d1105b 100644
--- a/src/xenia/cpu/x64/x64_emit_altivec.cc
+++ b/src/xenia/cpu/x64/x64_emit_altivec.cc
@@ -155,17 +155,17 @@ static __m128i __lvsl_table[16] = {
   _mm_set_epi8(15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30),
 };
 int InstrEmit_lvsl_(X64Emitter& e, X86Compiler& c, InstrData& i, uint32_t vd, uint32_t ra, uint32_t rb) {
-  GpVar ea(c.newGpVar());
-  c.mov(ea, e.gpr_value(rb));
-  if (ra) {
-    c.add(ea, e.gpr_value(ra));
-  }
-  c.and_(ea, imm(0xF));
-  c.shl(ea, imm(4));  // table offset = (16b * sh)
-  GpVar gt(c.newGpVar());
-  c.mov(gt, imm((sysint_t)__lvsl_table));
-  XmmVar v(c.newXmmVar());
-  c.movaps(v, xmmword_ptr(gt, ea));
+  GpVar ea(c.newGpVar());
+  c.mov(ea, e.gpr_value(rb));
+  if (ra) {
+    c.add(ea, e.gpr_value(ra));
+  }
+  c.and_(ea, imm(0xF));
+  c.shl(ea, imm(4));  // table offset = (16b * sh)
+  GpVar gt(c.newGpVar());
+  c.mov(gt, imm((sysint_t)__lvsl_table));
+  XmmVar v(c.newXmmVar());
+  c.movaps(v, xmmword_ptr(gt, ea));
   c.shufps(v, v, imm(0x1B));
   e.update_vr_value(vd, v);
   e.TraceVR(vd);
@@ -197,17 +197,17 @@ static __m128i __lvsr_table[16] = {
   _mm_set_epi8( 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16),
 };
 int InstrEmit_lvsr_(X64Emitter& e, X86Compiler& c, InstrData& i, uint32_t vd, uint32_t ra, uint32_t rb) {
-  GpVar ea(c.newGpVar());
-  c.mov(ea, e.gpr_value(rb));
-  if (ra) {
-    c.add(ea, e.gpr_value(ra));
-  }
-  c.and_(ea, imm(0xF));
-  c.shl(ea, imm(4));  // table offset = (16b * sh)
-  GpVar gt(c.newGpVar());
-  c.mov(gt, imm((sysint_t)__lvsr_table));
-  XmmVar v(c.newXmmVar());
-  c.movaps(v, xmmword_ptr(gt, ea));
+  GpVar ea(c.newGpVar());
+  c.mov(ea, e.gpr_value(rb));
+  if (ra) {
+    c.add(ea, e.gpr_value(ra));
+  }
+  c.and_(ea, imm(0xF));
+  c.shl(ea, imm(4));  // table offset = (16b * sh)
+  GpVar gt(c.newGpVar());
+  c.mov(gt, imm((sysint_t)__lvsr_table));
+  XmmVar v(c.newXmmVar());
+  c.movaps(v, xmmword_ptr(gt, ea));
   c.shufps(v, v, imm(0x1B));
   e.update_vr_value(vd, v);
   e.TraceVR(vd);
@@ -221,13 +221,13 @@ XEEMITTER(lvsr128, VX128_1(4, 67), VX128_1)(X64Emitter& e, X86Compiler&
 }
 
 int InstrEmit_lvx_(X64Emitter& e, X86Compiler& c, InstrData& i, uint32_t vd, uint32_t ra, uint32_t rb) {
-  GpVar ea(c.newGpVar());
-  c.mov(ea, e.gpr_value(rb));
-  if (ra) {
-    c.add(ea, e.gpr_value(ra));
-  }
-  XmmVar v = e.ReadMemoryXmm(i.address, ea, 4);
-  c.shufps(v, v, imm(0x1B));
+  GpVar ea(c.newGpVar());
+  c.mov(ea, e.gpr_value(rb));
+  if (ra) {
+    c.add(ea, e.gpr_value(ra));
+  }
+  XmmVar v = e.ReadMemoryXmm(i.address, ea, 4);
+  c.shufps(v, v, imm(0x1B));
   e.update_vr_value(vd, v);
   e.TraceVR(vd);
   return 0;
@@ -268,13 +268,13 @@ XEEMITTER(stvewx128, VX128_1(4, 387), VX128_1)(X64Emitter& e, X86Compiler&
 }
 
 int InstrEmit_stvx_(X64Emitter& e, X86Compiler& c, InstrData& i, uint32_t vd, uint32_t ra, uint32_t rb) {
-  GpVar ea(c.newGpVar());
-  c.mov(ea, e.gpr_value(rb));
-  if (ra) {
-    c.add(ea, e.gpr_value(ra));
+  GpVar ea(c.newGpVar());
+  c.mov(ea, e.gpr_value(rb));
+  if (ra) {
+    c.add(ea, e.gpr_value(ra));
   }
-  XmmVar v = e.vr_value(vd);
-  c.shufps(v, v, imm(0x1B));
+  XmmVar v = e.vr_value(vd);
+  c.shufps(v, v, imm(0x1B));
   e.WriteMemoryXmm(i.address, ea, 4, v);
   e.TraceVR(vd);
   return 0;
@@ -295,28 +295,28 @@ XEEMITTER(stvxl128, VX128_1(4, 963), VX128_1)(X64Emitter& e, X86Compiler&
 // The lvlx/lvrx/etc instructions are in Cell docs only:
 // https://www-01.ibm.com/chips/techlib/techlib.nsf/techdocs/C40E4C6133B31EE8872570B500791108/$file/vector_simd_pem_v_2.07c_26Oct2006_cell.pdf
 int InstrEmit_lvlx_(X64Emitter& e, X86Compiler& c, InstrData& i, uint32_t vd, uint32_t ra, uint32_t rb) {
-  GpVar ea(c.newGpVar());
-  c.mov(ea, e.gpr_value(rb));
-  if (ra) {
-    c.add(ea, e.gpr_value(ra));
-  }
-  GpVar sh(c.newGpVar());
-  c.mov(sh, ea);
-  c.and_(sh, imm(0xF));
-  XmmVar v = e.ReadMemoryXmm(i.address, ea, 4);
-  // If fully aligned skip complex work.
-  Label done(c.newLabel());
-  c.test(sh, sh);
-  c.jz(done);
-  {
-    // Shift left by the number of bytes offset and fill with zeros.
-    // We reuse the lvsl table here, as it does that for us.
-    GpVar gt(c.newGpVar());
-    c.xor_(gt, gt);
-    c.pinsrb(v, gt.r8(), imm(15));
-    c.shl(sh, imm(4));  // table offset = (16b * sh)
-    c.mov(gt, imm((sysint_t)__shift_table_left));
-    c.pshufb(v, xmmword_ptr(gt, sh));
+  GpVar ea(c.newGpVar());
+  c.mov(ea, e.gpr_value(rb));
+  if (ra) {
+    c.add(ea, e.gpr_value(ra));
+  }
+  GpVar sh(c.newGpVar());
+  c.mov(sh, ea);
+  c.and_(sh, imm(0xF));
+  XmmVar v = e.ReadMemoryXmm(i.address, ea, 4);
+  // If fully aligned skip complex work.
+  Label done(c.newLabel());
+  c.test(sh, sh);
+  c.jz(done);
+  {
+    // Shift left by the number of bytes offset and fill with zeros.
+    // We reuse the lvsl table here, as it does that for us.
+    GpVar gt(c.newGpVar());
+    c.xor_(gt, gt);
+    c.pinsrb(v, gt.r8(), imm(15));
+    c.shl(sh, imm(4));  // table offset = (16b * sh)
+    c.mov(gt, imm((sysint_t)__shift_table_left));
+    c.pshufb(v, xmmword_ptr(gt, sh));
   }
   c.bind(done);
   c.shufps(v, v, imm(0x1B));
@@ -338,31 +338,33 @@ XEEMITTER(lvlxl128, VX128_1(4, 1539), VX128_1)(X64Emitter& e, X86Compiler&
 }
 
 int InstrEmit_lvrx_(X64Emitter& e, X86Compiler& c, InstrData& i, uint32_t vd, uint32_t ra, uint32_t rb) {
-  GpVar ea(c.newGpVar());
-  c.mov(ea, e.gpr_value(rb));
-  if (ra) {
-    c.add(ea, e.gpr_value(ra));
-  }
-  GpVar sh(c.newGpVar());
-  c.mov(sh, ea);
-  c.and_(sh, imm(0xF));
-  XmmVar v = e.ReadMemoryXmm(i.address, ea, 4);
-  // If fully aligned skip complex work.
-  Label done(c.newLabel());
-  c.test(sh, sh);
-  c.jz(done);
-  {
-    // Shift left by the number of bytes offset and fill with zeros.
-    // We reuse the lvsl table here, as it does that for us.
-    GpVar gt(c.newGpVar());
-    c.xor_(gt, gt);
-    c.pinsrb(v, gt.r8(), imm(0));
-    c.shl(sh, imm(4));  // table offset = (16b * sh)
-    c.mov(gt, imm((sysint_t)__shift_table_right));
-    c.pshufb(v, xmmword_ptr(gt, sh));
+  GpVar ea(c.newGpVar());
+  c.mov(ea, e.gpr_value(rb));
+  if (ra) {
+    c.add(ea, e.gpr_value(ra));
+  }
+  GpVar sh(c.newGpVar());
+  c.mov(sh, ea);
+  c.and_(sh, imm(0xF));
+  // If fully aligned skip complex work.
+  XmmVar v(c.newXmmVar());
+  c.pxor(v, v);
+  Label done(c.newLabel());
+  c.test(sh, sh);
+  c.jz(done);
+  {
+    // Shift left by the number of bytes offset and fill with zeros.
+    // We reuse the lvsl table here, as it does that for us.
+    c.movaps(v, e.ReadMemoryXmm(i.address, ea, 4));
+    GpVar gt(c.newGpVar());
+    c.xor_(gt, gt);
+    c.pinsrb(v, gt.r8(), imm(0));
+    c.shl(sh, imm(4));  // table offset = (16b * sh)
+    c.mov(gt, imm((sysint_t)__shift_table_right));
+    c.pshufb(v, xmmword_ptr(gt, sh));
+    c.shufps(v, v, imm(0x1B));
   }
   c.bind(done);
-  c.shufps(v, v, imm(0x1B));
   e.update_vr_value(vd, v);
   e.TraceVR(vd);
   return 0;
@@ -391,21 +393,21 @@ static void __emulated_stvlx(uint64_t addr, __m128i vd) {
   }
 }
 int InstrEmit_stvlx_(X64Emitter& e, X86Compiler& c, InstrData& i, uint32_t vd, uint32_t ra, uint32_t rb) {
-  GpVar ea(c.newGpVar());
-  c.mov(ea, e.gpr_value(rb));
-  if (ra) {
-    c.add(ea, e.gpr_value(ra));
+  GpVar ea(c.newGpVar());
+  c.mov(ea, e.gpr_value(rb));
+  if (ra) {
+    c.add(ea, e.gpr_value(ra));
   }
   ea = e.TouchMemoryAddress(i.address, ea);
   XmmVar tvd(c.newXmmVar());
   c.movaps(tvd, e.vr_value(vd));
   c.shufps(tvd, tvd, imm(0x1B));
   c.save(tvd);
-  GpVar pvd(c.newGpVar());
-  c.lea(pvd, tvd.m128());
-  X86CompilerFuncCall* call = c.call(__emulated_stvlx);
-  uint32_t args[] = {kX86VarTypeGpq, kX86VarTypeGpq};
-  call->setPrototype(kX86FuncConvDefault, kX86VarTypeGpq, args, XECOUNT(args));
+  GpVar pvd(c.newGpVar());
+  c.lea(pvd, tvd.m128());
+  X86CompilerFuncCall* call = c.call(__emulated_stvlx);
+  uint32_t args[] = {kX86VarTypeGpq, kX86VarTypeGpq};
+  call->setPrototype(kX86FuncConvDefault, kX86VarTypeGpq, args, XECOUNT(args));
   call->setArgument(0, ea);
   call->setArgument(1, pvd);
   e.TraceVR(vd);
@@ -436,21 +438,21 @@ static void __emulated_stvrx(uint64_t addr, __m128i vd) {
   }
 }
 int InstrEmit_stvrx_(X64Emitter& e, X86Compiler& c, InstrData& i, uint32_t vd, uint32_t ra, uint32_t rb) {
-  GpVar ea(c.newGpVar());
-  c.mov(ea, e.gpr_value(rb));
-  if (ra) {
-    c.add(ea, e.gpr_value(ra));
+  GpVar ea(c.newGpVar());
+  c.mov(ea, e.gpr_value(rb));
+  if (ra) {
+    c.add(ea, e.gpr_value(ra));
   }
   ea = e.TouchMemoryAddress(i.address, ea);
   XmmVar tvd(c.newXmmVar());
   c.movaps(tvd, e.vr_value(vd));
   c.shufps(tvd, tvd, imm(0x1B));
   c.save(tvd);
-  GpVar pvd(c.newGpVar());
-  c.lea(pvd, tvd.m128());
-  X86CompilerFuncCall* call = c.call(__emulated_stvrx);
-  uint32_t args[] = {kX86VarTypeGpq, kX86VarTypeGpq};
-  call->setPrototype(kX86FuncConvDefault, kX86VarTypeGpq, args, XECOUNT(args));
+  GpVar pvd(c.newGpVar());
+  c.lea(pvd, tvd.m128());
+  X86CompilerFuncCall* call = c.call(__emulated_stvrx);
+  uint32_t args[] = {kX86VarTypeGpq, kX86VarTypeGpq};
+  call->setPrototype(kX86FuncConvDefault, kX86VarTypeGpq, args, XECOUNT(args));
   call->setArgument(0, ea);
   call->setArgument(1, pvd);
   e.TraceVR(vd);
@@ -1281,20 +1283,20 @@ __m128i __emulated_vperm(uint8_t* va, uint8_t* vb, uint8_t* vc) {
 int InstrEmit_vperm_(X64Emitter& e, X86Compiler& c, uint32_t vd, uint32_t va, uint32_t vb, uint32_t vc) {
   // Call emulation function.
   XmmVar tva(c.newXmmVar()), tvb(c.newXmmVar()), tvc(c.newXmmVar());
-  GpVar pva(c.newGpVar()), pvb(c.newGpVar()), pvc(c.newGpVar());
-  c.movaps(tva, e.vr_value(va));
-  c.movaps(tvb, e.vr_value(vb));
-  c.movaps(tvc, e.vr_value(vc));
-  c.save(tva);
-  c.save(tvb);
-  c.save(tvc);
-  c.lea(pva, tva.m128());
-  c.lea(pvb, tvb.m128());
+  GpVar pva(c.newGpVar()), pvb(c.newGpVar()), pvc(c.newGpVar());
+  c.movaps(tva, e.vr_value(va));
+  c.movaps(tvb, e.vr_value(vb));
+  c.movaps(tvc, e.vr_value(vc));
+  c.save(tva);
+  c.save(tvb);
+  c.save(tvc);
+  c.lea(pva, tva.m128());
+  c.lea(pvb, tvb.m128());
   c.lea(pvc, tvc.m128());
   XmmVar v(c.newXmmVar());
-  X86CompilerFuncCall* call = c.call(__emulated_vperm);
-  uint32_t args[] = {kX86VarTypeGpq, kX86VarTypeGpq, kX86VarTypeGpq};
-  call->setPrototype(kX86FuncConvDefault, kX86VarTypeXmm, args, XECOUNT(args));
+  X86CompilerFuncCall* call = c.call(__emulated_vperm);
+  uint32_t args[] = {kX86VarTypeGpq, kX86VarTypeGpq, kX86VarTypeGpq};
+  call->setPrototype(kX86FuncConvDefault, kX86VarTypeXmm, args, XECOUNT(args));
   call->setArgument(0, pva);
   call->setArgument(1, pvb);
   call->setArgument(2, pvc);
@@ -1612,17 +1614,17 @@ int InstrEmit_vsldoi_(X64Emitter& e, X86Compiler& c, uint32_t vd, uint32_t va, u
   XmmVar v_r(c.newXmmVar());
   c.movaps(v_r, e.vr_value(vb));
   // (VA << SH) OR (VB >> (16 - SH))
-  GpVar gt(c.newGpVar());
-  c.xor_(gt, gt);
-  c.pinsrb(v, gt.r8(), imm(0));
-  c.pinsrb(v_r, gt.r8(), imm(15));
-  c.mov(gt, imm((sysint_t)&__shift_table_out[sh]));
-  XmmVar shuf(c.newXmmVar());
-  c.movaps(shuf, xmmword_ptr(gt));
-  c.pshufb(v, shuf);
-  c.mov(gt, imm((sysint_t)&__shift_table_in[sh]));
-  c.movaps(shuf, xmmword_ptr(gt));
-  c.pshufb(v_r, shuf);
+  GpVar gt(c.newGpVar());
+  c.xor_(gt, gt);
+  c.pinsrb(v, gt.r8(), imm(0));
+  c.pinsrb(v_r, gt.r8(), imm(15));
+  c.mov(gt, imm((sysint_t)&__shift_table_out[sh]));
+  XmmVar shuf(c.newXmmVar());
+  c.movaps(shuf, xmmword_ptr(gt));
+  c.pshufb(v, shuf);
+  c.mov(gt, imm((sysint_t)&__shift_table_in[sh]));
+  c.movaps(shuf, xmmword_ptr(gt));
+  c.pshufb(v_r, shuf);
   c.por(v, v_r);
   e.update_vr_value(vd, v);
   e.TraceVR(vd, va, vb);
@@ -2068,12 +2070,12 @@ XEEMITTER(vupkd3d128, VX128_3(6, 2032), VX128_3)(X64Emitter& e, X86Compiler&
   // Load source, move from tight pack of X16Y16.... to X16...Y16...
   // Also zero out the high end.
   c.int3();
-  c.movaps(vt, e.vr_value(vb));
-  c.save(vt);
+  c.movaps(vt, e.vr_value(vb));
+  c.save(vt);
   c.lea(gt, vt.m128());
-  X86CompilerFuncCall* call = c.call(half_to_float5_SSE2);
-  uint32_t args[] = {kX86VarTypeGpq};
-  call->setPrototype(kX86FuncConvDefault, kX86VarTypeXmm, args, XECOUNT(args));
+  X86CompilerFuncCall* call = c.call(half_to_float5_SSE2);
+  uint32_t args[] = {kX86VarTypeGpq};
+  call->setPrototype(kX86FuncConvDefault, kX86VarTypeXmm, args, XECOUNT(args));
   call->setArgument(0, gt);
   call->setReturn(v);
   // Select XY00.
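
Throughout these hunks, c.shufps(v, v, imm(0x1B)) is the endianness fixup applied after vector loads and before vector stores. The immediate 0x1B encodes the lane selectors (3, 2, 1, 0), so it reverses the order of the four 32-bit elements; it does not swap bytes within an element (any per-element byte swap presumably happens inside ReadMemoryXmm/WriteMemoryXmm). An intrinsics equivalent, for illustration only:

#include <xmmintrin.h>

// 0x1B == _MM_SHUFFLE(0, 1, 2, 3): element i of the result is
// element (3 - i) of the source, i.e. the 32-bit lanes are reversed.
static __m128 reverse_lanes(__m128 v) {
  return _mm_shuffle_ps(v, v, 0x1B);
}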
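
The one behavioral change above is in InstrEmit_lvrx_. Per the Cell docs linked in the comment, lvrx loads only the sh = EA & 0xF bytes to the left of EA, right-justified and zero-filled, so an aligned EA must yield an all-zero vector. The old code read memory first and skipped the shuffle when sh == 0, leaking the raw 16-byte load into VD; the new code zeroes VD with pxor and touches memory only when sh != 0, which also lets the shufps lane fixup move inside the unaligned path. A minimal plain-C++ model of the intended semantics, assuming a flat byte array standing in for guest memory (emulated_lvrx and mem are illustrative, not emitter code):

#include <cstdint>
#include <cstring>

// Model of lvrx in big-endian byte numbering: VD gets the sh bytes
// that precede EA within its aligned quadword, right-justified; all
// other bytes are zero. sh == 0 therefore yields all zeros -- the
// case the old emitter path got wrong.
static void emulated_lvrx(const uint8_t* mem, uint64_t ea, uint8_t vd[16]) {
  std::memset(vd, 0, 16);
  const uint64_t sh = ea & 0xF;
  if (sh) {
    std::memcpy(vd + (16 - sh), mem + (ea - sh), sh);  // bytes [EA-sh, EA)
  }
}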
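
stvlx/stvrx (like vperm and vupkd3d128) go through host helpers rather than emitted SIMD: the emitter lane-reverses VD, forces it to its stack home slot with c.save(tvd), takes that slot's address with lea, and calls __emulated_stvlx/__emulated_stvrx with the effective address and the pointer as two GP-register arguments. A rough model of the store semantics the stvlx helper implements (the name model_stvlx and the flat mem pointer are illustrative assumptions):

#include <cstdint>
#include <cstring>

// stvlx stores the leftmost (16 - sh) bytes of VS from EA to the end
// of the enclosing 16-byte block; bytes before EA are untouched.
// stvrx mirrors this, storing the rightmost sh bytes just below EA.
static void model_stvlx(uint8_t* mem, uint64_t ea, const uint8_t vs[16]) {
  const uint64_t sh = ea & 0xF;
  std::memcpy(mem + ea, vs, 16 - sh);
}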
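
The vsldoi hunk implements its "(VA << SH) OR (VB >> (16 - SH))" comment with two pshufb passes over the precomputed __shift_table_out/__shift_table_in entries, combined with por. In plain C++ the instruction is just a 16-byte window over the concatenation VA:VB; a sketch of that reading (model only, names are illustrative):

#include <cstdint>
#include <cstring>

// vsldoi: VD = bytes [sh, sh + 16) of the 32-byte value VA:VB,
// in big-endian byte numbering, with 0 <= sh < 16.
static void model_vsldoi(const uint8_t va[16], const uint8_t vb[16],
                         unsigned sh, uint8_t vd[16]) {
  uint8_t cat[32];
  std::memcpy(cat, va, 16);
  std::memcpy(cat + 16, vb, 16);
  std::memcpy(vd, cat + sh, 16);
}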