srawx, vsldoi, vupkd3d of D3DCOLOR -- all untested

This commit is contained in:
Ben Vanik 2013-12-08 23:15:05 -08:00
parent 3ec930d9fc
commit eb2d596c27
2 changed files with 94 additions and 179 deletions

View File

@ -1249,85 +1249,51 @@ XEEMITTER(vslw128, VX128(6, 208), VX128 )(PPCFunctionBuilder& f, Inst
return InstrEmit_vslw_(f, VX128_VD128, VX128_VA128, VX128_VB128); return InstrEmit_vslw_(f, VX128_VD128, VX128_VA128, VX128_VB128);
} }
// static __m128i __shift_table_out[16] = { static uint8_t __vsldoi_table[16][16] = {
// _mm_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0), // unused {15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0}, // unused
// _mm_set_epi8( 0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1), {16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1},
// _mm_set_epi8( 0, 0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2), {17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2},
// _mm_set_epi8( 0, 0, 0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3), {18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3},
// _mm_set_epi8( 0, 0, 0, 0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4), {19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4},
// _mm_set_epi8( 0, 0, 0, 0, 0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5), {20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5},
// _mm_set_epi8( 0, 0, 0, 0, 0, 0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6), {21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6},
// _mm_set_epi8( 0, 0, 0, 0, 0, 0, 0, 15, 14, 13, 12, 11, 10, 9, 8, 7), {22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7},
// _mm_set_epi8( 0, 0, 0, 0, 0, 0, 0, 0, 15, 14, 13, 12, 11, 10, 9, 8), {23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8},
// _mm_set_epi8( 0, 0, 0, 0, 0, 0, 0, 0, 0, 15, 14, 13, 12, 11, 10, 9), {24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9},
// _mm_set_epi8( 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 15, 14, 13, 12, 11, 10), {25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10},
// _mm_set_epi8( 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 15, 14, 13, 12, 11), {26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11},
// _mm_set_epi8( 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 15, 14, 13, 12), {27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12},
// _mm_set_epi8( 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 15, 14, 13), {28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13},
// _mm_set_epi8( 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 15, 14), {29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14},
// _mm_set_epi8( 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 15), {30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15},
// }; };
// static __m128i __shift_table_in[16] = { int InstrEmit_vsldoi_(PPCFunctionBuilder& f, uint32_t vd, uint32_t va, uint32_t vb, uint32_t sh) {
// _mm_set_epi8(15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15), // unused // (VD) <- ((VA) || (VB)) << (SH << 3)
// _mm_set_epi8( 0, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15), if (!sh) {
// _mm_set_epi8( 1, 0, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15), f.StoreVR(vd, f.LoadVR(va));
// _mm_set_epi8( 2, 1, 0, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15), return 0;
// _mm_set_epi8( 3, 2, 1, 0, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15), } else if (sh == 16) {
// _mm_set_epi8( 4, 3, 2, 1, 0, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15), f.StoreVR(vd, f.LoadVR(vb));
// _mm_set_epi8( 5, 4, 3, 2, 1, 0, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15), return 0;
// _mm_set_epi8( 6, 5, 4, 3, 2, 1, 0, 15, 15, 15, 15, 15, 15, 15, 15, 15), }
// _mm_set_epi8( 7, 6, 5, 4, 3, 2, 1, 0, 15, 15, 15, 15, 15, 15, 15, 15), // TODO(benvanik): optimize for the rotation case:
// _mm_set_epi8( 8, 7, 6, 5, 4, 3, 2, 1, 0, 15, 15, 15, 15, 15, 15, 15), // vsldoi128 vr63,vr63,vr63,4
// _mm_set_epi8( 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, 15, 15, 15, 15, 15, 15), // (ABCD ABCD) << 4b = (BCDA)
// _mm_set_epi8(10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, 15, 15, 15, 15, 15), // (VA << SH) OR (VB >> (16 - SH))
// _mm_set_epi8(11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, 15, 15, 15, 15), Value* control = f.LoadConstant(*((vec128_t*)(__vsldoi_table[sh])));
// _mm_set_epi8(12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, 15, 15, 15), Value* v = f.Permute(
// _mm_set_epi8(13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, 15, 15), control,
// _mm_set_epi8(14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, 15), f.LoadVR(va),
// }; f.LoadVR(vb), INT8_TYPE);
// int InstrEmit_vsldoi_(PPCFunctionBuilder& f, uint32_t vd, uint32_t va, uint32_t vb, uint32_t sh) { f.StoreVR(vd, v);
// // (VD) <- ((VA) || (VB)) << (SH << 3) return 0;
// if (!sh) { }
// // No shift? XEEMITTER(vsldoi, 0x1000002C, VXA )(PPCFunctionBuilder& f, InstrData& i) {
// f.StoreVR(vd, f.LoadVR(va)); return InstrEmit_vsldoi_(f, i.VXA.VD, i.VXA.VA, i.VXA.VB, i.VXA.VC & 0xF);
// e.TraceVR(vd, va, vb); }
// return 0; XEEMITTER(vsldoi128, VX128_5(4, 16), VX128_5)(PPCFunctionBuilder& f, InstrData& i) {
// } else if (sh == 16) { return InstrEmit_vsldoi_(f, VX128_5_VD128, VX128_5_VA128, VX128_5_VB128, VX128_5_SH);
// f.StoreVR(vd, f.LoadVR(vb)); }
// e.TraceVR(vd, va, vb);
// return 0;
// }
// // TODO(benvanik): optimize for the rotation case:
// // vsldoi128 vr63,vr63,vr63,4
// // (ABCD ABCD) << 4b = (BCDA)
// // TODO(benvanik): rewrite this piece of shit.
// XmmVar v(c.newXmmVar());
// c.movaps(v, f.LoadVR(va));
// XmmVar v_r(c.newXmmVar());
// c.movaps(v_r, f.LoadVR(vb));
// // (VA << SH) OR (VB >> (16 - SH))
// GpVar gt(c.newGpVar());
// c.xor_(gt, gt);
// c.pinsrb(v, gt.r8(), imm(0));
// c.pinsrb(v_r, gt.r8(), imm(15));
// c.mov(gt, imm((sysint_t)&__shift_table_out[sh]));
// XmmVar shuf(c.newXmmVar());
// c.movaps(shuf, xmmword_ptr(gt));
// c.pshufb(v, shuf);
// c.mov(gt, imm((sysint_t)&__shift_table_in[sh]));
// c.movaps(shuf, xmmword_ptr(gt));
// c.pshufb(v_r, shuf);
// c.por(v, v_r);
// f.StoreVR(vd, v);
// e.TraceVR(vd, va, vb);
// return 0;
// }
// XEEMITTER(vsldoi, 0x1000002C, VXA )(PPCFunctionBuilder& f, InstrData& i) {
// return InstrEmit_vsldoi_(f, i.VXA.VD, i.VXA.VA, i.VXA.VB, i.VXA.VC & 0xF);
// }
// XEEMITTER(vsldoi128, VX128_5(4, 16), VX128_5)(PPCFunctionBuilder& f, InstrData& i) {
// return InstrEmit_vsldoi_(f, VX128_5_VD128, VX128_5_VA128, VX128_5_VB128, VX128_5_SH);
// }
XEEMITTER(vslo, 0x1000040C, VX )(PPCFunctionBuilder& f, InstrData& i) { XEEMITTER(vslo, 0x1000040C, VX )(PPCFunctionBuilder& f, InstrData& i) {
XEINSTRNOTIMPLEMENTED(); XEINSTRNOTIMPLEMENTED();
@ -1637,45 +1603,27 @@ XEEMITTER(vupkd3d128, VX128_3(6, 2032), VX128_3)(PPCFunctionBuilder& f, Inst
switch (type) { switch (type) {
case 0: // VPACK_D3DCOLOR case 0: // VPACK_D3DCOLOR
{ {
XEASSERTALWAYS();
return 1;
// http://hlssmod.net/he_code/public/pixelwriter.h // http://hlssmod.net/he_code/public/pixelwriter.h
// ARGB (WXYZ) -> RGBA (XYZW) // ARGB (WXYZ) -> RGBA (XYZW)
// zzzzZZZZzzzzARGB // zzzzZZZZzzzzARGB
//c.movaps(vt, f.LoadVR(vb)); v = f.LoadVR(vb);
//// zzzzZZZZzzzzARGB // 0zzzZZZZzzzzARGB
//// 000R000G000B000A v = f.Insert(v, 0, f.LoadConstant((int8_t)0));
//c.mov(gt, imm( // 000R000G000B000A
// ((1ull << 7) << 56) | vec128_t shuf_v = { 0 };
// ((1ull << 7) << 48) | shuf_v.b16[3] = 13;
// ((1ull << 7) << 40) | shuf_v.b16[7] = 14;
// ((0ull) << 32) | // B shuf_v.b16[11] = 15;
// ((1ull << 7) << 24) | shuf_v.b16[15] = 12;
// ((1ull << 7) << 16) | Value* shuf = f.LoadConstant(shuf_v);
// ((1ull << 7) << 8) | v = f.Permute(shuf, v, v, INT8_TYPE);
// ((3ull) << 0)) // A // {256*R.0, 256*G.0, 256*B.0, 256*A.0}
// ); // lo v = f.VectorConvertI2F(v);
//c.movq(v, gt); // {R.0, G.0, B.0 A.0}
//c.mov(gt, imm( // 1/256 = 0.00390625 = 0x3B800000
// ((1ull << 7) << 56) | v = f.Mul(
// ((1ull << 7) << 48) | v,
// ((1ull << 7) << 40) | f.Splat(f.LoadConstant((uint32_t)0x3B800000), VEC128_TYPE));
// ((2ull) << 32) | // R
// ((1ull << 7) << 24) |
// ((1ull << 7) << 16) |
// ((1ull << 7) << 8) |
// ((1ull) << 0)) // G
// ); // hi
//c.pinsrq(v, gt, imm(1));
//c.pshufb(vt, v);
//// {256*R.0, 256*G.0, 256*B.0, 256*A.0}
//c.cvtdq2ps(v, vt);
//// {R.0, G.0, B.0 A.0}
//// 1/256 = 0.00390625 = 0x3B800000
//c.mov(gt, imm(0x3B800000));
//c.movd(vt, gt.r32());
//c.shufps(vt, vt, imm(0));
//c.mulps(v, vt);
} }
break; break;
case 1: // VPACK_NORMSHORT2 case 1: // VPACK_NORMSHORT2
@ -1955,8 +1903,8 @@ void RegisterEmitCategoryAltivec() {
XEREGISTERINSTR(vslo128, VX128(5, 912)); XEREGISTERINSTR(vslo128, VX128(5, 912));
XEREGISTERINSTR(vslw, 0x10000184); XEREGISTERINSTR(vslw, 0x10000184);
XEREGISTERINSTR(vslw128, VX128(6, 208)); XEREGISTERINSTR(vslw128, VX128(6, 208));
// XEREGISTERINSTR(vsldoi, 0x1000002C); XEREGISTERINSTR(vsldoi, 0x1000002C);
// XEREGISTERINSTR(vsldoi128, VX128_5(4, 16)); XEREGISTERINSTR(vsldoi128, VX128_5(4, 16));
XEREGISTERINSTR(vspltb, 0x1000020C); XEREGISTERINSTR(vspltb, 0x1000020C);
XEREGISTERINSTR(vsplth, 0x1000024C); XEREGISTERINSTR(vsplth, 0x1000024C);
XEREGISTERINSTR(vspltw, 0x1000028C); XEREGISTERINSTR(vspltw, 0x1000028C);

View File

@ -1159,65 +1159,34 @@ XEEMITTER(sradix, 0x7C000674, XS )(PPCFunctionBuilder& f, InstrData& i) {
return 0; return 0;
} }
// XEEMITTER(srawx, 0x7C000630, X )(PPCFunctionBuilder& f, InstrData& i) { XEEMITTER(srawx, 0x7C000630, X )(PPCFunctionBuilder& f, InstrData& i) {
// // n <- rB[59-63] // n <- rB[59-63]
// // r <- ROTL32((RS)[32:63], 64-n) // r <- ROTL32((RS)[32:63], 64-n)
// // m <- MASK(n+32, 63) // m <- MASK(n+32, 63)
// // s <- (RS)[32] // s <- (RS)[32]
// // RA <- r&m | (i64.s)&¬m // RA <- r&m | (i64.s)&¬m
// // CA <- s & ((r&¬m)[32:63]≠0) // CA <- s & ((r&¬m)[32:63]≠0)
// if n == 0: rA <- sign_extend(rS), XER[CA] = 0
// // if n == 0: rA <- sign_extend(rS), XER[CA] = 0 // if n >= 32: rA <- 64 sign bits of rS, XER[CA] = sign bit of lo_32(rS)
// // if n >= 32: rA <- 64 sign bits of rS, XER[CA] = sign bit of lo_32(rS) Value* v = f.Truncate(f.LoadGPR(i.X.RT), INT32_TYPE);
Value* sh = f.And(
// GpVar v(c.newGpVar()); f.Truncate(f.LoadGPR(i.X.RB), INT32_TYPE),
// c.mov(v, f.LoadGPR(i.X.RT)); f.LoadConstant((int8_t)0x7F));
// GpVar sh(c.newGpVar()); // CA is set if any bits are shifted out of the right and if the result
// c.mov(sh, f.LoadGPR(i.X.RB)); // is negative.
// c.and_(sh, imm(0x7F)); Value* mask = f.Not(f.Shl(f.LoadConstant(-1), sh));
Value* ca = f.And(
// GpVar ca(c.newGpVar()); f.Shr(v, 31),
// Label skip(c.newLabel()); f.IsTrue(f.And(v, mask)));
// Label full(c.newLabel()); f.StoreCA(ca);
// c.test(sh, sh); v = f.Sha(v, sh),
// c.jnz(full); v = f.SignExtend(v, INT64_TYPE);
// { f.StoreGPR(i.X.RA, v);
// // No shift, just a fancy sign extend and CA clearer. if (i.X.Rc) {
// c.cdqe(v); f.UpdateCR(0, v);
// c.mov(ca, imm(0)); }
// } return 0;
// c.jmp(skip); }
// c.bind(full);
// {
// // CA is set if any bits are shifted out of the right and if the result
// // is negative. Start tracking that here.
// c.mov(ca, v);
// c.and_(ca, imm(~XEMASK(32 + i.X.RB, 64)));
// c.cmp(ca, imm(0));
// c.xor_(ca, ca);
// c.setnz(ca.r8());
// // Shift right and sign extend the 32bit part.
// c.sar(v.r32(), imm(i.X.RB));
// c.cdqe(v);
// // CA is set to 1 if the low-order 32 bits of (RS) contain a negative number
// // and any 1-bits are shifted out of position 63; otherwise CA is set to 0.
// // We already have ca set to indicate the shift bits, now just and in sign.
// GpVar ca_2(c.newGpVar());
// c.mov(ca_2, v.r32());
// c.shr(ca_2, imm(31));
// c.and_(ca, ca_2);
// }
// c.bind(skip);
// f.StoreGPR(i.X.RA, v);
// e.update_xer_with_carry(ca);
// if (i.X.Rc) {
// f.UpdateCR(0, v);
// }
// return 0;
// }
XEEMITTER(srawix, 0x7C000670, X )(PPCFunctionBuilder& f, InstrData& i) { XEEMITTER(srawix, 0x7C000670, X )(PPCFunctionBuilder& f, InstrData& i) {
// n <- SH // n <- SH
@ -1226,10 +1195,8 @@ XEEMITTER(srawix, 0x7C000670, X )(PPCFunctionBuilder& f, InstrData& i) {
// s <- (RS)[32] // s <- (RS)[32]
// RA <- r&m | (i64.s)&¬m // RA <- r&m | (i64.s)&¬m
// CA <- s & ((r&¬m)[32:63]≠0) // CA <- s & ((r&¬m)[32:63]≠0)
// if n == 0: rA <- sign_extend(rS), XER[CA] = 0 // if n == 0: rA <- sign_extend(rS), XER[CA] = 0
// if n >= 32: rA <- 64 sign bits of rS, XER[CA] = sign bit of lo_32(rS) // if n >= 32: rA <- 64 sign bits of rS, XER[CA] = sign bit of lo_32(rS)
Value* v = f.Truncate(f.LoadGPR(i.X.RT), INT32_TYPE); Value* v = f.Truncate(f.LoadGPR(i.X.RT), INT32_TYPE);
Value* ca; Value* ca;
if (!i.X.RB) { if (!i.X.RB) {
@ -1323,7 +1290,7 @@ void RegisterEmitCategoryALU() {
XEREGISTERINSTR(srwx, 0x7C000430); XEREGISTERINSTR(srwx, 0x7C000430);
// XEREGISTERINSTR(sradx, 0x7C000634); // XEREGISTERINSTR(sradx, 0x7C000634);
XEREGISTERINSTR(sradix, 0x7C000674); XEREGISTERINSTR(sradix, 0x7C000674);
// XEREGISTERINSTR(srawx, 0x7C000630); XEREGISTERINSTR(srawx, 0x7C000630);
XEREGISTERINSTR(srawix, 0x7C000670); XEREGISTERINSTR(srawix, 0x7C000670);
} }