diff --git a/src/alloy/frontend/ppc/ppc_emit_altivec.cc b/src/alloy/frontend/ppc/ppc_emit_altivec.cc index 3982030c3..3adb4d7af 100644 --- a/src/alloy/frontend/ppc/ppc_emit_altivec.cc +++ b/src/alloy/frontend/ppc/ppc_emit_altivec.cc @@ -1249,85 +1249,51 @@ XEEMITTER(vslw128, VX128(6, 208), VX128 )(PPCFunctionBuilder& f, Inst return InstrEmit_vslw_(f, VX128_VD128, VX128_VA128, VX128_VB128); } -// static __m128i __shift_table_out[16] = { -// _mm_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0), // unused -// _mm_set_epi8( 0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1), -// _mm_set_epi8( 0, 0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2), -// _mm_set_epi8( 0, 0, 0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3), -// _mm_set_epi8( 0, 0, 0, 0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4), -// _mm_set_epi8( 0, 0, 0, 0, 0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5), -// _mm_set_epi8( 0, 0, 0, 0, 0, 0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6), -// _mm_set_epi8( 0, 0, 0, 0, 0, 0, 0, 15, 14, 13, 12, 11, 10, 9, 8, 7), -// _mm_set_epi8( 0, 0, 0, 0, 0, 0, 0, 0, 15, 14, 13, 12, 11, 10, 9, 8), -// _mm_set_epi8( 0, 0, 0, 0, 0, 0, 0, 0, 0, 15, 14, 13, 12, 11, 10, 9), -// _mm_set_epi8( 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 15, 14, 13, 12, 11, 10), -// _mm_set_epi8( 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 15, 14, 13, 12, 11), -// _mm_set_epi8( 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 15, 14, 13, 12), -// _mm_set_epi8( 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 15, 14, 13), -// _mm_set_epi8( 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 15, 14), -// _mm_set_epi8( 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 15), -// }; -// static __m128i __shift_table_in[16] = { -// _mm_set_epi8(15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15), // unused -// _mm_set_epi8( 0, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15), -// _mm_set_epi8( 1, 0, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15), -// _mm_set_epi8( 2, 1, 0, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15), -// 
_mm_set_epi8( 3, 2, 1, 0, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15), -// _mm_set_epi8( 4, 3, 2, 1, 0, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15), -// _mm_set_epi8( 5, 4, 3, 2, 1, 0, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15), -// _mm_set_epi8( 6, 5, 4, 3, 2, 1, 0, 15, 15, 15, 15, 15, 15, 15, 15, 15), -// _mm_set_epi8( 7, 6, 5, 4, 3, 2, 1, 0, 15, 15, 15, 15, 15, 15, 15, 15), -// _mm_set_epi8( 8, 7, 6, 5, 4, 3, 2, 1, 0, 15, 15, 15, 15, 15, 15, 15), -// _mm_set_epi8( 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, 15, 15, 15, 15, 15, 15), -// _mm_set_epi8(10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, 15, 15, 15, 15, 15), -// _mm_set_epi8(11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, 15, 15, 15, 15), -// _mm_set_epi8(12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, 15, 15, 15), -// _mm_set_epi8(13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, 15, 15), -// _mm_set_epi8(14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, 15), -// }; -// int InstrEmit_vsldoi_(PPCFunctionBuilder& f, uint32_t vd, uint32_t va, uint32_t vb, uint32_t sh) { -// // (VD) <- ((VA) || (VB)) << (SH << 3) -// if (!sh) { -// // No shift? -// f.StoreVR(vd, f.LoadVR(va)); -// e.TraceVR(vd, va, vb); -// return 0; -// } else if (sh == 16) { -// f.StoreVR(vd, f.LoadVR(vb)); -// e.TraceVR(vd, va, vb); -// return 0; -// } -// // TODO(benvanik): optimize for the rotation case: -// // vsldoi128 vr63,vr63,vr63,4 -// // (ABCD ABCD) << 4b = (BCDA) -// // TODO(benvanik): rewrite this piece of shit. 
-// XmmVar v(c.newXmmVar()); -// c.movaps(v, f.LoadVR(va)); -// XmmVar v_r(c.newXmmVar()); -// c.movaps(v_r, f.LoadVR(vb)); -// // (VA << SH) OR (VB >> (16 - SH)) -// GpVar gt(c.newGpVar()); -// c.xor_(gt, gt); -// c.pinsrb(v, gt.r8(), imm(0)); -// c.pinsrb(v_r, gt.r8(), imm(15)); -// c.mov(gt, imm((sysint_t)&__shift_table_out[sh])); -// XmmVar shuf(c.newXmmVar()); -// c.movaps(shuf, xmmword_ptr(gt)); -// c.pshufb(v, shuf); -// c.mov(gt, imm((sysint_t)&__shift_table_in[sh])); -// c.movaps(shuf, xmmword_ptr(gt)); -// c.pshufb(v_r, shuf); -// c.por(v, v_r); -// f.StoreVR(vd, v); -// e.TraceVR(vd, va, vb); -// return 0; -// } -// XEEMITTER(vsldoi, 0x1000002C, VXA )(PPCFunctionBuilder& f, InstrData& i) { -// return InstrEmit_vsldoi_(f, i.VXA.VD, i.VXA.VA, i.VXA.VB, i.VXA.VC & 0xF); -// } -// XEEMITTER(vsldoi128, VX128_5(4, 16), VX128_5)(PPCFunctionBuilder& f, InstrData& i) { -// return InstrEmit_vsldoi_(f, VX128_5_VD128, VX128_5_VA128, VX128_5_VB128, VX128_5_SH); -// } +static uint8_t __vsldoi_table[16][16] = { + {15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0}, // unused + {16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1}, + {17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2}, + {18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3}, + {19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4}, + {20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5}, + {21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6}, + {22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7}, + {23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8}, + {24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9}, + {25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10}, + {26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11}, + {27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12}, + {28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13}, + {29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14}, + 
{30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15}, +}; +int InstrEmit_vsldoi_(PPCFunctionBuilder& f, uint32_t vd, uint32_t va, uint32_t vb, uint32_t sh) { + // (VD) <- ((VA) || (VB)) << (SH << 3) + if (!sh) { + f.StoreVR(vd, f.LoadVR(va)); + return 0; + } else if (sh == 16) { + f.StoreVR(vd, f.LoadVR(vb)); + return 0; + } + // TODO(benvanik): optimize for the rotation case: + // vsldoi128 vr63,vr63,vr63,4 + // (ABCD ABCD) << 4b = (BCDA) + // (VA << SH) OR (VB >> (16 - SH)) + Value* control = f.LoadConstant(*((vec128_t*)(__vsldoi_table[sh]))); + Value* v = f.Permute( + control, + f.LoadVR(va), + f.LoadVR(vb), INT8_TYPE); + f.StoreVR(vd, v); + return 0; +} +XEEMITTER(vsldoi, 0x1000002C, VXA )(PPCFunctionBuilder& f, InstrData& i) { + return InstrEmit_vsldoi_(f, i.VXA.VD, i.VXA.VA, i.VXA.VB, i.VXA.VC & 0xF); +} +XEEMITTER(vsldoi128, VX128_5(4, 16), VX128_5)(PPCFunctionBuilder& f, InstrData& i) { + return InstrEmit_vsldoi_(f, VX128_5_VD128, VX128_5_VA128, VX128_5_VB128, VX128_5_SH); +} XEEMITTER(vslo, 0x1000040C, VX )(PPCFunctionBuilder& f, InstrData& i) { XEINSTRNOTIMPLEMENTED(); @@ -1637,45 +1603,27 @@ XEEMITTER(vupkd3d128, VX128_3(6, 2032), VX128_3)(PPCFunctionBuilder& f, Inst switch (type) { case 0: // VPACK_D3DCOLOR { - XEASSERTALWAYS(); - return 1; // http://hlssmod.net/he_code/public/pixelwriter.h // ARGB (WXYZ) -> RGBA (XYZW) // zzzzZZZZzzzzARGB - //c.movaps(vt, f.LoadVR(vb)); - //// zzzzZZZZzzzzARGB - //// 000R000G000B000A - //c.mov(gt, imm( - // ((1ull << 7) << 56) | - // ((1ull << 7) << 48) | - // ((1ull << 7) << 40) | - // ((0ull) << 32) | // B - // ((1ull << 7) << 24) | - // ((1ull << 7) << 16) | - // ((1ull << 7) << 8) | - // ((3ull) << 0)) // A - // ); // lo - //c.movq(v, gt); - //c.mov(gt, imm( - // ((1ull << 7) << 56) | - // ((1ull << 7) << 48) | - // ((1ull << 7) << 40) | - // ((2ull) << 32) | // R - // ((1ull << 7) << 24) | - // ((1ull << 7) << 16) | - // ((1ull << 7) << 8) | - // ((1ull) << 0)) // G - // ); // hi - //c.pinsrq(v, gt, 
imm(1)); - //c.pshufb(vt, v); - //// {256*R.0, 256*G.0, 256*B.0, 256*A.0} - //c.cvtdq2ps(v, vt); - //// {R.0, G.0, B.0 A.0} - //// 1/256 = 0.00390625 = 0x3B800000 - //c.mov(gt, imm(0x3B800000)); - //c.movd(vt, gt.r32()); - //c.shufps(vt, vt, imm(0)); - //c.mulps(v, vt); + v = f.LoadVR(vb); + // 0zzzZZZZzzzzARGB + v = f.Insert(v, 0, f.LoadConstant((int8_t)0)); + // 000R000G000B000A + vec128_t shuf_v = { 0 }; + shuf_v.b16[3] = 13; + shuf_v.b16[7] = 14; + shuf_v.b16[11] = 15; + shuf_v.b16[15] = 12; + Value* shuf = f.LoadConstant(shuf_v); + v = f.Permute(shuf, v, v, INT8_TYPE); + // {256*R.0, 256*G.0, 256*B.0, 256*A.0} + v = f.VectorConvertI2F(v); + // {R.0, G.0, B.0 A.0} + // 1/256 = 0.00390625 = 0x3B800000 + v = f.Mul( + v, + f.Splat(f.LoadConstant((uint32_t)0x3B800000), VEC128_TYPE)); } break; case 1: // VPACK_NORMSHORT2 @@ -1955,8 +1903,8 @@ void RegisterEmitCategoryAltivec() { XEREGISTERINSTR(vslo128, VX128(5, 912)); XEREGISTERINSTR(vslw, 0x10000184); XEREGISTERINSTR(vslw128, VX128(6, 208)); - // XEREGISTERINSTR(vsldoi, 0x1000002C); - // XEREGISTERINSTR(vsldoi128, VX128_5(4, 16)); + XEREGISTERINSTR(vsldoi, 0x1000002C); + XEREGISTERINSTR(vsldoi128, VX128_5(4, 16)); XEREGISTERINSTR(vspltb, 0x1000020C); XEREGISTERINSTR(vsplth, 0x1000024C); XEREGISTERINSTR(vspltw, 0x1000028C); diff --git a/src/alloy/frontend/ppc/ppc_emit_alu.cc b/src/alloy/frontend/ppc/ppc_emit_alu.cc index a2ddff3d4..9426ca957 100644 --- a/src/alloy/frontend/ppc/ppc_emit_alu.cc +++ b/src/alloy/frontend/ppc/ppc_emit_alu.cc @@ -1159,65 +1159,34 @@ XEEMITTER(sradix, 0x7C000674, XS )(PPCFunctionBuilder& f, InstrData& i) { return 0; } -// XEEMITTER(srawx, 0x7C000630, X )(PPCFunctionBuilder& f, InstrData& i) { -// // n <- rB[59-63] -// // r <- ROTL32((RS)[32:63], 64-n) -// // m <- MASK(n+32, 63) -// // s <- (RS)[32] -// // RA <- r&m | (i64.s)&¬m -// // CA <- s & ((r&¬m)[32:63]≠0) - -// // if n == 0: rA <- sign_extend(rS), XER[CA] = 0 -// // if n >= 32: rA <- 64 sign bits of rS, XER[CA] = sign bit of 
lo_32(rS) - -// GpVar v(c.newGpVar()); -// c.mov(v, f.LoadGPR(i.X.RT)); -// GpVar sh(c.newGpVar()); -// c.mov(sh, f.LoadGPR(i.X.RB)); -// c.and_(sh, imm(0x7F)); - -// GpVar ca(c.newGpVar()); -// Label skip(c.newLabel()); -// Label full(c.newLabel()); -// c.test(sh, sh); -// c.jnz(full); -// { -// // No shift, just a fancy sign extend and CA clearer. -// c.cdqe(v); -// c.mov(ca, imm(0)); -// } -// c.jmp(skip); -// c.bind(full); -// { -// // CA is set if any bits are shifted out of the right and if the result -// // is negative. Start tracking that here. -// c.mov(ca, v); -// c.and_(ca, imm(~XEMASK(32 + i.X.RB, 64))); -// c.cmp(ca, imm(0)); -// c.xor_(ca, ca); -// c.setnz(ca.r8()); - -// // Shift right and sign extend the 32bit part. -// c.sar(v.r32(), imm(i.X.RB)); -// c.cdqe(v); - -// // CA is set to 1 if the low-order 32 bits of (RS) contain a negative number -// // and any 1-bits are shifted out of position 63; otherwise CA is set to 0. -// // We already have ca set to indicate the shift bits, now just and in sign. -// GpVar ca_2(c.newGpVar()); -// c.mov(ca_2, v.r32()); -// c.shr(ca_2, imm(31)); -// c.and_(ca, ca_2); -// } -// c.bind(skip); - -// f.StoreGPR(i.X.RA, v); -// e.update_xer_with_carry(ca); -// if (i.X.Rc) { -// f.UpdateCR(0, v); -// } -// return 0; -// } +XEEMITTER(srawx, 0x7C000630, X )(PPCFunctionBuilder& f, InstrData& i) { + // n <- rB[59-63] + // r <- ROTL32((RS)[32:63], 64-n) + // m <- MASK(n+32, 63) + // s <- (RS)[32] + // RA <- r&m | (i64.s)&¬m + // CA <- s & ((r&¬m)[32:63]≠0) + // if n == 0: rA <- sign_extend(rS), XER[CA] = 0 + // if n >= 32: rA <- 64 sign bits of rS, XER[CA] = sign bit of lo_32(rS) + Value* v = f.Truncate(f.LoadGPR(i.X.RT), INT32_TYPE); + Value* sh = f.And( + f.Truncate(f.LoadGPR(i.X.RB), INT32_TYPE), + f.LoadConstant((int8_t)0x7F)); + // CA is set if any bits are shifted out of the right and if the result + // is negative. 
+  Value* mask = f.Not(f.Shl(f.LoadConstant(-1), sh)); +  Value* ca = f.And( +      f.Shr(v, 31), +      f.IsTrue(f.And(v, mask))); +  f.StoreCA(ca); +  v = f.Sha(v, sh); +  v = f.SignExtend(v, INT64_TYPE); +  f.StoreGPR(i.X.RA, v); +  if (i.X.Rc) { +    f.UpdateCR(0, v); +  } +  return 0; +} XEEMITTER(srawix, 0x7C000670, X )(PPCFunctionBuilder& f, InstrData& i) { // n <- SH @@ -1226,10 +1195,8 @@ XEEMITTER(srawix, 0x7C000670, X )(PPCFunctionBuilder& f, InstrData& i) { // s <- (RS)[32] // RA <- r&m | (i64.s)&¬m // CA <- s & ((r&¬m)[32:63]≠0) - // if n == 0: rA <- sign_extend(rS), XER[CA] = 0 // if n >= 32: rA <- 64 sign bits of rS, XER[CA] = sign bit of lo_32(rS) - Value* v = f.Truncate(f.LoadGPR(i.X.RT), INT32_TYPE); Value* ca; if (!i.X.RB) { @@ -1323,7 +1290,7 @@ void RegisterEmitCategoryALU() { XEREGISTERINSTR(srwx, 0x7C000430); // XEREGISTERINSTR(sradx, 0x7C000634); XEREGISTERINSTR(sradix, 0x7C000674); - // XEREGISTERINSTR(srawx, 0x7C000630); + XEREGISTERINSTR(srawx, 0x7C000630); XEREGISTERINSTR(srawix, 0x7C000670); }